From 7af5dbae357024f490eff27e2d732d112aed3c49 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:13:12 +0000
Subject: [PATCH 01/29] Add `IdString::unescape()` method

We've already talked about adding this as an alternative to `log_id()`, and we'll
need it later in this PR.
---
 kernel/rtlil.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/kernel/rtlil.h b/kernel/rtlil.h
index 6f26d0d39..6dec89dfe 100644
--- a/kernel/rtlil.h
+++ b/kernel/rtlil.h
@@ -275,6 +275,17 @@ struct RTLIL::IdString
 		*out += std::to_string(-index_);
 	}
 
+	std::string unescape() const {
+		if (index_ < 0) {
+			// Must start with "$auto$" so no unescaping required.
+			return str();
+		}
+		std::string_view str = global_id_storage_.at(index_).str_view();
+		if (str.size() < 2 || str[0] != '\\' || str[1] == '$' || str[1] == '\\' || (str[1] >= '0' && str[1] <= '9'))
+			return std::string(str);
+		return std::string(str.substr(1));
+	}
+
 	class Substrings {
 		std::string_view first_;
 		int suffix_number;
@@ -759,7 +770,7 @@ namespace RTLIL {
 	}
 
 	static inline std::string unescape_id(RTLIL::IdString str) {
-		return unescape_id(str.str());
+		return str.unescape();
 	}
 
 	static inline const char *id2cstr(RTLIL::IdString str) {

From bd7f2d9ba4088d455528ffcbd0b8b0fd567233b5 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:14:46 +0000
Subject: [PATCH 02/29] Make `log_error()` work in a `Multithreaded` context.

`log_error()` causes an exit so we don't have to try too hard here. The main
thing is to ensure that we normally are able to exit without causing a stack
overflow due to recursive asserts about not being in a `Multithreaded` context.
---
 kernel/log.cc | 14 ++++++++++++--
 kernel/log.h  |  7 +------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/kernel/log.cc b/kernel/log.cc
index 018a19081..b114f1eaf 100644
--- a/kernel/log.cc
+++ b/kernel/log.cc
@@ -324,6 +324,14 @@ void log_formatted_file_info(std::string_view filename, int lineno, std::string
 	log("%s:%d: Info: %s", filename, lineno, str);
 }
 
+void log_suppressed() {
+	if (log_debug_suppressed && !log_make_debug) {
+		constexpr const char* format = "<suppressed ~%d debug messages>\n";
+		logv_string(format, stringf(format, log_debug_suppressed));
+		log_debug_suppressed = 0;
+	}
+}
+
 [[noreturn]]
 static void log_error_with_prefix(std::string_view prefix, std::string str)
 {
@@ -345,7 +353,9 @@ static void log_error_with_prefix(std::string_view prefix, std::string str)
 	}
 
 	log_last_error = std::move(str);
-	log("%s%s", prefix, log_last_error);
+	std::string message(prefix);
+	message += log_last_error;
+	logv_string("%s%s", message);
 	log_flush();
 
 	log_make_debug = bak_log_make_debug;
@@ -355,7 +365,7 @@ static void log_error_with_prefix(std::string_view prefix, std::string str)
 			item.current_count++;
 
 	for (auto &[_, item] : log_expect_prefix_error)
-		if (std::regex_search(string(prefix) + string(log_last_error), item.pattern))
+		if (std::regex_search(message, item.pattern))
 			item.current_count++;
 
 	log_check_expected();
diff --git a/kernel/log.h b/kernel/log.h
index 63faf7091..d132ba1a0 100644
--- a/kernel/log.h
+++ b/kernel/log.h
@@ -206,12 +206,7 @@ template <typename... Args>
 	log_formatted_cmd_error(fmt.format(args...));
 }
 
-static inline void log_suppressed() {
-	if (log_debug_suppressed && !log_make_debug) {
-		log("<suppressed ~%d debug messages>\n", log_debug_suppressed);
-		log_debug_suppressed = 0;
-	}
-}
+void log_suppressed();
 
 struct LogMakeDebugHdl {
 	bool status = false;

From 13d9fffdb9c04ccc2a5502117b906c0b51a1339e Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:16:24 +0000
Subject: [PATCH 03/29] Work around `std::reverse` miscompilation with empty
 range

This causes problems when compiling with fuzzing instrumentation enabled.
---
 frontends/rtlil/rtlil_frontend.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/frontends/rtlil/rtlil_frontend.cc b/frontends/rtlil/rtlil_frontend.cc
index a1412d983..7e2ec5460 100644
--- a/frontends/rtlil/rtlil_frontend.cc
+++ b/frontends/rtlil/rtlil_frontend.cc
@@ -286,6 +286,7 @@ struct RTLILFrontendWorker {
 		if (width > MAX_CONST_WIDTH)
 			error("Constant width %lld out of range before `%s`.", width, error_token());
 		bits.reserve(width);
+		int start_idx = idx;
 		while (true) {
 			RTLIL::State bit;
 			switch (line[idx]) {
@@ -300,8 +301,9 @@ struct RTLILFrontendWorker {
 			bits.push_back(bit);
 			++idx;
 		}
-		done:
-		std::reverse(bits.begin(), bits.end());
+	done:
+		if (start_idx < idx)
+			std::reverse(bits.begin(), bits.end());
 
 		if (GetSize(bits) > width)
 			bits.resize(width);

From 898a288a99a3e8be45e8fbefe0fa7ebcbf4d6257 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:50:23 +0000
Subject: [PATCH 04/29] Add `work_pool_size`, `IntRange`,
 `item_range_for_worker`, and `ThreadIndex`

We'll use these later in this PR.
---
 kernel/threading.cc | 35 +++++++++++++++++++++++++++++++++++
 kernel/threading.h  | 16 ++++++++++++++++
 kernel/utils.h      | 15 +++++++++++++++
 3 files changed, 66 insertions(+)

diff --git a/kernel/threading.cc b/kernel/threading.cc
index dcc044c89..8c9bfb390 100644
--- a/kernel/threading.cc
+++ b/kernel/threading.cc
@@ -17,6 +17,20 @@ static int get_max_threads()
 	return max_threads;
 }
 
+static int init_work_units_per_thread_override()
+{
+	const char *v = getenv("YOSYS_WORK_UNITS_PER_THREAD");
+	if (v == nullptr)
+		return 0;
+	return atoi(v);
+}
+
+static int get_work_units_per_thread_override()
+{
+	static int work_units_per_thread = init_work_units_per_thread_override();
+	return work_units_per_thread;
+}
+
 void DeferredLogs::flush()
 {
 	for (auto &m : logs)
@@ -37,6 +51,14 @@ int ThreadPool::pool_size(int reserved_cores, int max_worker_threads)
 #endif
 }
 
+int ThreadPool::work_pool_size(int reserved_cores, int work_units, int work_units_per_thread)
+{
+	int work_units_per_thread_override = get_work_units_per_thread_override(); // from YOSYS_WORK_UNITS_PER_THREAD; 0 if unset
+	if (work_units_per_thread_override > 0)
+		work_units_per_thread = work_units_per_thread_override;
+	return pool_size(reserved_cores, work_units / std::max(1, work_units_per_thread)); // clamp the divisor so a value <= 0 cannot divide by zero
+}
+
 ThreadPool::ThreadPool(int pool_size, std::function<void(int)> b)
 	: body(std::move(b))
 {
@@ -57,4 +79,17 @@ ThreadPool::~ThreadPool()
 #endif
 }
 
+IntRange item_range_for_worker(int num_items, int thread_num, int num_threads)
+{
+	if (num_threads <= 1) {
+		return {0, num_items};
+	}
+	int items_per_thread = num_items / num_threads;
+	int extra_items = num_items % num_threads;
+	// The first `extra_items` threads get one extra item.
+	int start = thread_num * items_per_thread + std::min(thread_num, extra_items);
+	int end = (thread_num + 1) * items_per_thread + std::min(thread_num + 1, extra_items);
+	return {start, end};
+}
+
 YOSYS_NAMESPACE_END
diff --git a/kernel/threading.h b/kernel/threading.h
index b8cd62f87..971945abc 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -8,6 +8,7 @@
 
 #include "kernel/yosys_common.h"
 #include "kernel/log.h"
+#include "kernel/utils.h"
 
 #ifndef YOSYS_THREADING_H
 #define YOSYS_THREADING_H
@@ -131,6 +132,11 @@ public:
 	// The result may be 0.
 	static int pool_size(int reserved_cores, int max_worker_threads);
 
+	// Computes the number of worker threads to use, by dividing work_units among threads.
+	// For testing purposes you can set YOSYS_WORK_UNITS_PER_THREAD to override `work_units_per_thread`.
+	// The result may be 0.
+	static int work_pool_size(int reserved_cores, int work_units, int work_units_per_thread);
+
 	// Create a pool of threads running the given closure (parameterized by thread number).
 	// `pool_size` must be the result of a `pool_size()` call.
 	ThreadPool(int pool_size, std::function<void(int)> b);
@@ -154,6 +160,16 @@ private:
 #endif
 };
 
+// Divides some number of items into `num_threads` subranges and returns the
+// `thread_num`'th subrange. If `num_threads` is zero, returns the whole range.
+IntRange item_range_for_worker(int num_items, int thread_num, int num_threads);
+
+// A type that encapsulates the index of a thread in some list of threads. Useful for
+// stronger typechecking and code readability.
+struct ThreadIndex {
+	int thread_num;
+};
+
 template <class T>
 class ConcurrentStack
 {
diff --git a/kernel/utils.h b/kernel/utils.h
index e90ba09d8..68e00153f 100644
--- a/kernel/utils.h
+++ b/kernel/utils.h
@@ -299,6 +299,21 @@ auto reversed(const T& container) {
 	return reverse_view{container};
 }
 
+// A range of integers [start_, end_) that can be iterated over with a
+// C++ range-based for loop.
+struct IntRange {
+	int start_;
+	int end_;
+	struct Int {
+		int v;
+		int operator*() const { return v; }
+		Int &operator++() { ++v; return *this; }
+		bool operator!=(const Int &other) const { return v != other.v; }
+	};
+	Int begin() const { return {start_}; }
+	Int end() const { return {end_}; }
+};
+
 YOSYS_NAMESPACE_END
 
 #endif

From b079e5721c7ab585335bfcbcf9f9955c61829450 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:52:17 +0000
Subject: [PATCH 05/29] Add `ParallelDispatchThreadPool`

We'll use this later in the PR.
---
 kernel/threading.cc |  55 +++++++++++++++++++++++
 kernel/threading.h  | 107 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+)

diff --git a/kernel/threading.cc b/kernel/threading.cc
index 8c9bfb390..3766c4ddf 100644
--- a/kernel/threading.cc
+++ b/kernel/threading.cc
@@ -92,4 +92,59 @@ IntRange item_range_for_worker(int num_items, int thread_num, int num_threads)
 	return {start, end};
 }
 
+ParallelDispatchThreadPool::ParallelDispatchThreadPool(int pool_size)
+		: num_worker_threads_(std::max(1, pool_size) - 1)
+{
+#ifdef YOSYS_ENABLE_THREADS
+	main_to_workers_signal.resize(num_worker_threads_, 0);
+#endif
+	// Don't start the threads until we've constructed all our data members.
+	thread_pool = std::make_unique<ThreadPool>(num_worker_threads_, [this](int thread_num){
+		run_worker(thread_num);
+	});
+}
+
+ParallelDispatchThreadPool::~ParallelDispatchThreadPool()
+{
+#ifdef YOSYS_ENABLE_THREADS
+	if (num_worker_threads_ == 0)
+		return;
+	current_work = nullptr;
+	num_active_worker_threads_ = num_worker_threads_;
+	signal_workers_start();
+	wait_for_workers_done();
+#endif
+}
+
+void ParallelDispatchThreadPool::run(std::function<void(const RunCtx &)> work, int max_threads)
+{
+	Multithreading multithreading;
+	num_active_worker_threads_ = num_threads(max_threads) - 1;
+	if (num_active_worker_threads_ == 0) {
+		work({{0}, 1});
+		return;
+	}
+#ifdef YOSYS_ENABLE_THREADS
+	current_work = &work;
+	signal_workers_start();
+	work({{0}, num_active_worker_threads_ + 1});
+	wait_for_workers_done();
+#endif
+}
+
+void ParallelDispatchThreadPool::run_worker(int thread_num)
+{
+#ifdef YOSYS_ENABLE_THREADS
+	while (true)
+	{
+		worker_wait_for_start(thread_num);
+		if (current_work == nullptr)
+			break;
+		(*current_work)({{thread_num + 1}, num_active_worker_threads_ + 1});
+		signal_worker_done();
+	}
+	signal_worker_done();
+#endif
+}
+
 YOSYS_NAMESPACE_END
diff --git a/kernel/threading.h b/kernel/threading.h
index 971945abc..9c2b0e7f7 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -170,6 +170,113 @@ struct ThreadIndex {
 	int thread_num;
 };
 
+// A set of threads with a `run()` API that runs a closure on all of the threads
+// and waits for all those closures to complete. This is a convenient way to implement
+// parallel algorithms that use barrier synchronization.
+class ParallelDispatchThreadPool
+{
+public:
+	// Create a pool of threads. The closures to run are supplied later via `run()`.
+	// `pool_size` must be the result of a `pool_size()` call.
+	// `pool_size` can be zero, which we treat as 1.
+	ParallelDispatchThreadPool(int pool_size);
+	~ParallelDispatchThreadPool();
+
+	// For each thread running a closure, a `RunCtx` is passed to the closure. Currently
+	// it contains the thread index and the total number of threads. It can be passed
+	// directly to any APIs requiring a `ThreadIndex`.
+	struct RunCtx : public ThreadIndex {
+		int num_threads;
+		IntRange item_range(int num_items) const {
+			return item_range_for_worker(num_items, thread_num, num_threads);
+		}
+	};
+	// Sometimes we only want to activate a subset of the threads in the pool. This
+	// class provides a way to do that. It provides the same `num_threads()`
+	// and `run()` APIs as a `ParallelDispatchThreadPool`.
+	class Subpool {
+	public:
+		Subpool(ParallelDispatchThreadPool &parent, int max_threads)
+				: parent(parent), max_threads(max_threads) {}
+		// Returns the number of threads that will be used when calling `run()`.
+		int num_threads() const {
+			return parent.num_threads(max_threads);
+		}
+		void run(std::function<void(const RunCtx &)> work) {
+			parent.run(std::move(work), max_threads);
+		}
+		ParallelDispatchThreadPool &thread_pool() { return parent; }
+	private:
+		ParallelDispatchThreadPool &parent;
+		int max_threads;
+	};
+
+	// Run the `work` function in parallel on each thread in the pool (parameterized by
+	// thread number). Waits for all work functions to complete. Only one `run()` can be
+	// active at a time.
+	// Uses no more than `max_threads` threads (but at least one).
+	void run(std::function<void(const RunCtx &)> work) {
+		run(std::move(work), INT_MAX);
+	}
+
+	// Returns the number of threads that will be used when calling `run()`.
+	int num_threads() const {
+		return num_threads(INT_MAX);
+	}
+private:
+	friend class Subpool;
+
+	void run(std::function<void(const RunCtx &)> work, int max_threads);
+	int num_threads(int max_threads) const {
+		return std::min(num_worker_threads_ + 1, std::max(1, max_threads));
+	}
+	void run_worker(int thread_num);
+
+	std::unique_ptr<ThreadPool> thread_pool;
+	std::function<void(const RunCtx &)> *current_work = nullptr;
+	// Keeps a correct count even when threads are exiting.
+	int num_worker_threads_;
+	// The count of active worker threads for the current `run()`.
+	int num_active_worker_threads_ = 0;
+
+#ifdef YOSYS_ENABLE_THREADS
+	// Not especially efficient for large numbers of threads. Worker wakeup could scale
+	// better by conceptually organising workers into a tree and having workers wake
+	// up their children.
+	std::mutex main_to_workers_signal_mutex;
+	std::condition_variable main_to_workers_signal_cv;
+	std::vector<uint8_t> main_to_workers_signal;
+	void signal_workers_start() {
+		std::unique_lock lock(main_to_workers_signal_mutex);
+		std::fill(main_to_workers_signal.begin(), main_to_workers_signal.begin() + num_active_worker_threads_, 1);
+		// When `num_active_worker_threads_` is small compared to `num_worker_threads_`, we have a "thundering herd"
+		// problem here. Fixing that would add complexity so don't worry about it for now.
+		main_to_workers_signal_cv.notify_all();
+	}
+	void worker_wait_for_start(int thread_num) {
+		std::unique_lock lock(main_to_workers_signal_mutex);
+		main_to_workers_signal_cv.wait(lock, [this, thread_num] { return main_to_workers_signal[thread_num] > 0; });
+		main_to_workers_signal[thread_num] = 0;
+	}
+
+	std::atomic<int> done_workers = 0;
+	std::mutex workers_to_main_signal_mutex;
+	std::condition_variable workers_to_main_signal_cv;
+	void signal_worker_done() {
+		int d = done_workers.fetch_add(1, std::memory_order_release);
+		if (d + 1 == num_active_worker_threads_) {
+			std::unique_lock lock(workers_to_main_signal_mutex);
+			workers_to_main_signal_cv.notify_all();
+		}
+	}
+	void wait_for_workers_done() {
+		std::unique_lock lock(workers_to_main_signal_mutex);
+		workers_to_main_signal_cv.wait(lock, [this] { return done_workers.load(std::memory_order_acquire) == num_active_worker_threads_; });
+		done_workers.store(0, std::memory_order_relaxed);
+	}
+#endif
+};
+
 template <class T>
 class ConcurrentStack
 {

From 87521df5347e8c455c4c07a1c9df3534b5c87e6b Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:58:09 +0000
Subject: [PATCH 06/29] Add `ShardedVector`

We'll use this later in the PR.
---
 kernel/threading.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/kernel/threading.h b/kernel/threading.h
index 9c2b0e7f7..f48b14949 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -304,6 +304,80 @@ private:
 	std::vector<T> contents;
 };
 
+// A vector that is sharded into buckets, one per thread. This lets multiple threads write
+// efficiently to the vector without synchronization overhead. After all writers have
+// finished writing, the vector can be iterated over. The iteration order is deterministic:
+// all the elements written by thread 0 in the order it inserted them, followed by all elements
+// written by thread 1, etc.
+template <typename T>
+class ShardedVector {
+public:
+	ShardedVector(const ParallelDispatchThreadPool &thread_pool) {
+		init(thread_pool.num_threads());
+	}
+	ShardedVector(const ParallelDispatchThreadPool::Subpool &thread_pool) {
+		init(thread_pool.num_threads());
+	}
+
+	// Insert a value, passing the `ThreadIndex` of the writer thread.
+	// Parallel inserts with different `ThreadIndex` values are fine.
+	// Inserts must not run concurrently with any other methods (e.g.
+	// iteration or `empty()`.)
+	void insert(const ThreadIndex &thread, T value) {
+		buckets[thread.thread_num].emplace_back(std::move(value));
+	}
+
+	bool empty() const {
+		for (const std::vector<T> &bucket : buckets)
+			if (!bucket.empty())
+				return false;
+		return true;
+	}
+
+	using Buckets = std::vector<std::vector<T>>;
+	class iterator {
+	public:
+		iterator(typename Buckets::iterator bucket_it, typename Buckets::iterator bucket_end)
+			: bucket_it(std::move(bucket_it)), bucket_end(std::move(bucket_end)) {
+			if (bucket_it != bucket_end)
+				inner_it = bucket_it->begin();
+			normalize();
+		}
+		T& operator*() const { return *inner_it.value(); }
+		iterator &operator++() {
+			++*inner_it;
+			normalize();
+			return *this;
+		}
+		bool operator!=(const iterator &other) const {
+			return bucket_it != other.bucket_it || inner_it != other.inner_it;
+		}
+	private:
+		void normalize() {
+			if (bucket_it == bucket_end)
+				return;
+			while (inner_it == bucket_it->end()) {
+				++bucket_it;
+				if (bucket_it == bucket_end) {
+					inner_it.reset();
+					return;
+				}
+				inner_it = bucket_it->begin();
+			}
+		}
+		std::optional<typename std::vector<T>::iterator> inner_it;
+		typename Buckets::iterator bucket_it;
+		typename Buckets::iterator bucket_end;
+	};
+	iterator begin() { return iterator(buckets.begin(), buckets.end()); }
+	iterator end() { return iterator(buckets.end(), buckets.end()); }
+private:
+	void init(int num_threads) {
+		buckets.resize(num_threads);
+	}
+	Buckets buckets;
+};
+
 YOSYS_NAMESPACE_END
 
 #endif // YOSYS_THREADING_H

From ab238c3145195fe55b1bd6ca2820fa635dc8954d Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:59:35 +0000
Subject: [PATCH 07/29] Add `ShardedHashSet`

We'll use this later in the PR.
---
 kernel/threading.h | 180 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)

diff --git a/kernel/threading.h b/kernel/threading.h
index f48b14949..33aa5fab1 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -378,6 +378,186 @@ private:
 	Buckets buckets;
 };
 
+// This collision handler for `ShardedHashtable` resolves collisions by keeping
+// the current value and discarding the other. This is correct when all values with the
+// same key are interchangeable, i.e. when the hashtable is being used as a set instead
+// of a map.
+template <typename V>
+struct SetCollisionHandler {
+	void operator()(typename V::Accumulated &, typename V::Accumulated &) const {}
+};
+
+// A hashtable that can be efficiently built in parallel and then looked up concurrently.
+// `V` is the type of elements that will be added to the hashtable. It must have a
+// member type `Accumulated` representing the combination of multiple `V` elements. This
+// can be the same as `V`, but for example `V` could contain a Wire* and `V::Accumulated`
+// could contain a `pool<Wire*>`. `KeyEquality` is a class containing an `operator()` that
+// returns true if two `V` elements have equal keys.
+// `CollisionHandler` is used to reduce two `V::Accumulated` values into a single value.
+//
+// To use this, first construct a `Builder` and fill it in (in parallel), then construct
+// a `ShardedHashtable` from the `Builder`.
+template <typename V, typename KeyEquality, typename CollisionHandler>
+class ShardedHashtable {
+public:
+	// A combination of a `V` and its hash value.
+	struct Value {
+		Value(V value, unsigned int hash) : value(std::move(value)), hash(hash) {}
+		Value(Value &&) = default;
+		Value(const Value &) = delete;
+		Value &operator=(const Value &) = delete;
+		V value;
+		unsigned int hash;
+	};
+	// A combination of a `V::Accumulated` and its hash value.
+	struct AccumulatedValue {
+		AccumulatedValue(typename V::Accumulated value, unsigned int hash) : value(std::move(value)), hash(hash) {}
+		AccumulatedValue(AccumulatedValue &&) = default;
+#if defined(_MSC_VER)
+		AccumulatedValue(const AccumulatedValue &) {
+			log_error("Copy constructor called on AccumulatedValue");
+		}
+		AccumulatedValue &operator=(const AccumulatedValue &) {
+			log_error("Copy assignment called on AccumulatedValue");
+			return *this;
+		}
+#else
+		AccumulatedValue(const AccumulatedValue &) = delete;
+		AccumulatedValue &operator=(const AccumulatedValue &) = delete;
+#endif
+		typename V::Accumulated value;
+		unsigned int hash;
+	};
+	// A class containing an `operator()` that returns true if two `AccumulatedValue`
+	// elements have equal keys.
+	// Required to insert `AccumulatedValue`s into an `std::unordered_set`.
+	struct AccumulatedValueEquality {
+		KeyEquality inner;
+		AccumulatedValueEquality(const KeyEquality &inner) : inner(inner) {}
+		bool operator()(const AccumulatedValue &v1, const AccumulatedValue &v2) const {
+			return inner(v1.value, v2.value);
+		}
+	};
+	// A class containing an `operator()` that returns the hash value of an `AccumulatedValue`.
+	// Required to insert `AccumulatedValue`s into an `std::unordered_set`.
+	struct AccumulatedValueHashOp {
+		size_t operator()(const AccumulatedValue &v) const {
+			return static_cast<size_t>(v.hash);
+		}
+	};
+	using Shard = std::unordered_set<AccumulatedValue, AccumulatedValueHashOp, AccumulatedValueEquality>;
+
+	// First construct one of these. Then populate it in parallel by calling `insert()` from many threads.
+	// Then do another parallel phase calling `process()` from many threads.
+	class Builder {
+	public:
+		Builder(const ParallelDispatchThreadPool &thread_pool, KeyEquality equality = KeyEquality(), CollisionHandler collision_handler = CollisionHandler())
+				: collision_handler(std::move(collision_handler)) {
+			init(thread_pool.num_threads(), std::move(equality));
+		}
+		Builder(const ParallelDispatchThreadPool::Subpool &thread_pool, KeyEquality equality = KeyEquality(), CollisionHandler collision_handler = CollisionHandler())
+				: collision_handler(std::move(collision_handler)) {
+			init(thread_pool.num_threads(), std::move(equality));
+		}
+		// First call `insert` to insert all elements. All inserts must finish
+		// before calling any `process()`.
+		void insert(const ThreadIndex &thread, Value v) {
+			// You might think that for the single-threaded case, we can optimize by
+			// inserting directly into the `std::unordered_set` here. But that slows things down
+			// a lot and I never got around to figuring out why.
+			std::vector<std::vector<Value>> &buckets = all_buckets[thread.thread_num];
+			size_t bucket = static_cast<size_t>(v.hash) % buckets.size();
+			buckets[bucket].emplace_back(std::move(v));
+		}
+		// Then call `process` for each thread. All `process()`s must finish before using
+		// the `Builder` to construct a `ShardedHashtable`.
+		void process(const ThreadIndex &thread) {
+			int size = 0;
+			for (std::vector<std::vector<Value>> &buckets : all_buckets)
+				size += GetSize(buckets[thread.thread_num]);
+			Shard &shard = shards[thread.thread_num];
+			shard.reserve(size);
+			for (std::vector<std::vector<Value>> &buckets : all_buckets) {
+				for (Value &value : buckets[thread.thread_num])
+					accumulate(value, shard);
+				// Free as much memory as we can during the parallel phase.
+				std::vector<Value>().swap(buckets[thread.thread_num]);
+			}
+		}
+	private:
+		friend class ShardedHashtable<V, KeyEquality, CollisionHandler>;
+		void accumulate(Value &value, Shard &shard) {
+			// With C++20 we could make this more efficient using heterogeneous lookup
+			AccumulatedValue accumulated_value{std::move(value.value), value.hash};
+			auto [it, inserted] = shard.insert(std::move(accumulated_value));
+			if (!inserted)
+				collision_handler(const_cast<typename V::Accumulated &>(it->value), accumulated_value.value);
+		}
+		void init(int num_threads, KeyEquality equality) {
+			all_buckets.resize(num_threads);
+			for (std::vector<std::vector<Value>> &buckets : all_buckets)
+				buckets.resize(num_threads);
+			for (int i = 0; i < num_threads; ++i)
+				shards.emplace_back(0, AccumulatedValueHashOp(), AccumulatedValueEquality(equality));
+		}
+		const CollisionHandler collision_handler;
+		// A num_threads x num_threads matrix of buckets.
+		// In the first phase, each thread i generates elements and writes them to
+		// bucket [i][j] where j = hash(element) % num_threads.
+		// In the second phase, thread i reads from bucket [j][i] for all j, collecting
+		// all elements where i = hash(element) % num_threads.
+		std::vector<std::vector<std::vector<Value>>> all_buckets;
+		std::vector<Shard> shards;
+	};
+
+	// Then finally construct the hashtable:
+	ShardedHashtable(Builder &builder) : shards(std::move(builder.shards)) {
+		// Check that all necessary 'process()' calls were made.
+		for (std::vector<std::vector<Value>> &buckets : builder.all_buckets)
+			for (std::vector<Value> &bucket : buckets)
+				log_assert(bucket.empty());
+		// Free memory.
+		std::vector<std::vector<std::vector<Value>>>().swap(builder.all_buckets);
+	}
+	ShardedHashtable(ShardedHashtable &&other) = default;
+	ShardedHashtable() {}
+
+	ShardedHashtable &operator=(ShardedHashtable &&other) = default;
+
+	// Look up by `AccumulatedValue`. If we switch to C++20 then we could use
+	// heterogeneous lookup to support looking up by `Value` here. Returns nullptr
+	// if the key is not found.
+	const typename V::Accumulated *find(const AccumulatedValue &v) const {
+		size_t num_shards = shards.size();
+		if (num_shards == 0)
+			return nullptr;
+		size_t shard = static_cast<size_t>(v.hash) % num_shards;
+		auto it = shards[shard].find(v);
+		if (it == shards[shard].end())
+			return nullptr;
+		return &it->value;
+	}
+
+	// Insert an element into the table. The caller is responsible for ensuring this does not
+	// happen concurrently with any other method calls. Does nothing if the table has no shards.
+	void insert(AccumulatedValue v) {
+		size_t num_shards = shards.size();
+		if (num_shards == 0)
+			return;
+		size_t shard = static_cast<size_t>(v.hash) % num_shards; // read the hash before `v` is moved from
+		shards[shard].insert(std::move(v)); // `AccumulatedValue` is move-only, so it must be moved into the set
+	}
+
+	// Call this for each shard to implement parallel destruction. For very large `ShardedHashtable`s,
+	// deleting all elements of all shards on a single thread can be a performance bottleneck.
+	void clear(const ThreadIndex &shard) {
+		AccumulatedValueEquality equality = shards[shard.thread_num].key_eq(); // touch only our own shard: another thread may be replacing shard 0 right now
+		shards[shard.thread_num] = Shard(0, AccumulatedValueHashOp(), equality);
+	}
+private:
+	std::vector<Shard> shards;
+};
+
 YOSYS_NAMESPACE_END
 
 #endif // YOSYS_THREADING_H

From e71da963141581e1ef64e589002351e30ff09f78 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 19:00:47 +0000
Subject: [PATCH 08/29] Add `ConcurrentWorkQueue`

We'll use this later in the PR.
---
 kernel/threading.h | 106 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/kernel/threading.h b/kernel/threading.h
index 33aa5fab1..649b65bdd 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -558,6 +558,112 @@ private:
 	std::vector<Shard> shards;
 };
 
+// A concurrent work-queue that can share batches of work across threads.
+// Uses a naive implementation of work-stealing.
+template <typename T>
+class ConcurrentWorkQueue {
+public:
+	// Create a queue that supports the given number of threads and
+	// groups work into `batch_size` units.
+	ConcurrentWorkQueue(int num_threads, int batch_size = 100)
+		: batch_size(batch_size), thread_states(num_threads) {}
+	int num_threads() const { return GetSize(thread_states); }
+	// Push some work to do. Pushes and pops with the same `thread` must
+	// not happen concurrently.
+	void push(const ThreadIndex &thread, T work) {
+		ThreadState &thread_state = thread_states[thread.thread_num];
+		thread_state.next_batch.emplace_back(std::move(work));
+		if (GetSize(thread_state.next_batch) < batch_size)
+			return;
+		bool was_empty;
+		{
+			std::unique_lock lock(thread_state.batches_lock);
+			was_empty = thread_state.batches.empty();
+			thread_state.batches.push_back(std::move(thread_state.next_batch));
+		}
+		if (was_empty) {
+			std::unique_lock lock(waiters_lock);
+			if (num_waiters > 0) {
+				waiters_cv.notify_one();
+			}
+		}
+	}
+	// Grab some work to do.
+	// If all threads enter `pop_batch()`, then instead of deadlocking the
+	// queue will return no work. That is the only case in which it will
+	// return no work.
+	std::vector<T> pop_batch(const ThreadIndex &thread) {
+		ThreadState &thread_state = thread_states[thread.thread_num];
+		if (!thread_state.next_batch.empty())
+			return std::move(thread_state.next_batch);
+		// Empty our own work queue first.
+		{
+			std::unique_lock lock(thread_state.batches_lock);
+			if (!thread_state.batches.empty()) {
+				std::vector<T> batch = std::move(thread_state.batches.back());
+				thread_state.batches.pop_back();
+				return batch;
+			}
+		}
+		// From here on in this function, our work queue is empty.
+		while (true) {
+			std::vector<T> batch = try_steal(thread);
+			if (!batch.empty()) {
+				return batch;
+			}
+			// Termination: if all threads run out of work, then all of
+			// them will eventually enter this loop and there will be no further
+			// notifications on waiters_cv, so all will eventually increment
+			// num_waiters and wait, so num_waiters == num_threads()
+			// will become true.
+			std::unique_lock lock(waiters_lock);
+			++num_waiters;
+			if (num_waiters == num_threads()) {
+				waiters_cv.notify_all();
+				return {};
+			}
+			// As above, it's possible that we'll wait here even when there
+			// are work batches posted by other threads. That's OK.
+			waiters_cv.wait(lock);
+			if (num_waiters == num_threads())
+				return {};
+			--num_waiters;
+		}
+	}
+private:
+	std::vector<T> try_steal(const ThreadIndex &thread) {
+		for (int i = 1; i < num_threads(); i++) {
+			int other_thread_num = (thread.thread_num + i) % num_threads();
+			ThreadState &other_thread_state = thread_states[other_thread_num];
+			std::unique_lock lock(other_thread_state.batches_lock);
+			if (!other_thread_state.batches.empty()) {
+				std::vector<T> batch = std::move(other_thread_state.batches.front());
+				other_thread_state.batches.pop_front();
+				return batch;
+			}
+		}
+		return {};
+	}
+
+	int batch_size;
+
+	struct ThreadState {
+		// Entirely thread-local.
+		std::vector<T> next_batch;
+
+		std::mutex batches_lock;
+		// Only the associated thread ever adds to this, and only at the back.
+		// Other threads can remove elements from the front.
+		std::deque<std::vector<T>> batches;
+	};
+	std::vector<ThreadState> thread_states;
+
+	std::mutex waiters_lock;
+	std::condition_variable waiters_cv;
+	// Number of threads waiting for work. Their queues are empty.
+	int num_waiters = 0;
+};
+
 YOSYS_NAMESPACE_END
 
 #endif // YOSYS_THREADING_H

From fe329a0e14d7b7ac2253462d953e57a359a27a65 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 19:01:43 +0000
Subject: [PATCH 09/29] Add `MonotonicFlag`

We'll use this later in the PR.
---
 kernel/threading.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/kernel/threading.h b/kernel/threading.h
index 649b65bdd..fb1602cf5 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -664,6 +664,22 @@ private:
 	int num_waiters = 0;
 };
 
+// A monotonic (set-once) flag. Starts false and can be set to true in a thread-safe
+// way; there is no way to reset it, so once `load()` returns true it always will.
+// All operations use relaxed atomics, so there are no memory ordering guarantees:
+// do not use this to publish or guard access to other shared memory.
+class MonotonicFlag {
+public:
+	MonotonicFlag() : value(false) {}
+	bool load() const { return value.load(std::memory_order_relaxed); }
+	void set() { value.store(true, std::memory_order_relaxed); }
+	bool set_and_return_old() { // Atomically sets the flag and returns its previous value; at most one caller can observe false.
+		return value.exchange(true, std::memory_order_relaxed);
+	}
+private:
+	std::atomic<bool> value;
+};
+
 YOSYS_NAMESPACE_END
 
 #endif // YOSYS_THREADING_H

From 5ff7d344c93ee89052ef8ed517c358a5c2efc61f Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:20:13 +0000
Subject: [PATCH 10/29] Add `FfInitVals::set_parallel()` method

We'll use this later in the PR.
---
 kernel/ffinit.h | 64 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/kernel/ffinit.h b/kernel/ffinit.h
index 920fba307..8b4758f60 100644
--- a/kernel/ffinit.h
+++ b/kernel/ffinit.h
@@ -22,6 +22,7 @@
 
 #include "kernel/yosys.h"
 #include "kernel/sigtools.h"
+#include "kernel/threading.h"
 
 YOSYS_NAMESPACE_BEGIN
 
@@ -35,34 +36,55 @@ struct FfInitVals
 		sigmap = sigmap_;
 		initbits.clear();
 		for (auto wire : module->wires())
+			if (wire->attributes.count(ID::init))
+				process_wire(wire);
+	}
+
+	void process_wire(RTLIL::Wire *wire)
+	{
+		SigSpec wirebits = (*sigmap)(wire);
+		Const initval = wire->attributes.at(ID::init);
+
+		for (int i = 0; i < GetSize(wirebits) && i < GetSize(initval); i++)
 		{
-			if (wire->attributes.count(ID::init) == 0)
+			SigBit bit = wirebits[i];
+			State val = initval[i];
+
+			if (val != State::S0 && val != State::S1 && bit.wire != nullptr)
 				continue;
 
-			SigSpec wirebits = (*sigmap)(wire);
-			Const initval = wire->attributes.at(ID::init);
-
-			for (int i = 0; i < GetSize(wirebits) && i < GetSize(initval); i++)
-			{
-				SigBit bit = wirebits[i];
-				State val = initval[i];
-
-				if (val != State::S0 && val != State::S1 && bit.wire != nullptr)
-					continue;
-
-				if (initbits.count(bit)) {
-					if (initbits.at(bit).first != val)
-						log_error("Conflicting init values for signal %s (%s = %s != %s).\n",
-								log_signal(bit), log_signal(SigBit(wire, i)),
-								log_signal(val), log_signal(initbits.at(bit).first));
-					continue;
-				}
-
-				initbits[bit] = std::make_pair(val,SigBit(wire,i));
+			if (initbits.count(bit)) {
+				if (initbits.at(bit).first != val)
+					log_error("Conflicting init values for signal %s (%s = %s != %s).\n",
+							log_signal(bit), log_signal(SigBit(wire, i)),
+							log_signal(val), log_signal(initbits.at(bit).first));
+				continue;
 			}
+
+			initbits[bit] = std::make_pair(val,SigBit(wire,i));
 		}
 	}
 
+	void set_parallel(const SigMapView *sigmap_, ParallelDispatchThreadPool &thread_pool, RTLIL::Module *module) // Rebuilds initbits for `module`: wires are scanned in parallel, init values are applied serially.
+	{
+		sigmap = sigmap_;
+		initbits.clear();
+
+		const RTLIL::Module *const_module = module; // worker threads only go through this const view of the module
+		ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->wires_size(), 1000)); // NOTE(review): 1000 looks like a per-thread minimum work size -- confirm against work_pool_size()
+		ShardedVector<RTLIL::Wire*> init_wires(subpool);
+		subpool.run([const_module, &init_wires](const ParallelDispatchThreadPool::RunCtx &ctx) { // parallel phase: only collects wires, never touches initbits
+			for (int i : ctx.item_range(const_module->wires_size())) {
+				RTLIL::Wire *wire = const_module->wire_at(i);
+				if (wire->attributes.count(ID::init))
+					init_wires.insert(ctx, wire);
+			}
+		});
+
+		for (RTLIL::Wire *wire : init_wires) // serial phase on the calling thread: process_wire() writes initbits
+			process_wire(wire);
+	}
+
 	RTLIL::State operator()(RTLIL::SigBit bit) const
 	{
 		auto it = initbits.find((*sigmap)(bit));

From e2166c468406c1f5c90777e3e4505dfc174fa440 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 29 Jan 2026 22:16:46 +0000
Subject: [PATCH 11/29] Parallelize `collect_garbage()`

---
 kernel/rtlil.cc | 95 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 66 insertions(+), 29 deletions(-)

diff --git a/kernel/rtlil.cc b/kernel/rtlil.cc
index 66bf3b9f7..82587ec26 100644
--- a/kernel/rtlil.cc
+++ b/kernel/rtlil.cc
@@ -22,6 +22,7 @@
 #include "kernel/newcelltypes.h"
 #include "kernel/binding.h"
 #include "kernel/sigtools.h"
+#include "kernel/threading.h"
 #include "frontends/verilog/verilog_frontend.h"
 #include "frontends/verilog/preproc.h"
 #include "backends/rtlil/rtlil_backend.h"
@@ -142,9 +143,17 @@ static constexpr bool check_well_known_id_order()
 // and in sorted ascii order, as required by the ID macro.
 static_assert(check_well_known_id_order());
 
+constexpr int STATIC_ID_END = static_cast<int>(RTLIL::StaticId::STATIC_ID_END);
+
 struct IdStringCollector {
+	IdStringCollector(std::vector<MonotonicFlag> &live_ids)
+			: live_ids(live_ids) {}
+
 	void trace(IdString id) {
-		live.insert(id.index_);
+		if (id.index_ >= STATIC_ID_END)
+			live_ids[id.index_ - STATIC_ID_END].set();
+		else if (id.index_ < 0)
+			live_autoidx_ids.push_back(id.index_);
 	}
 	template <typename T> void trace(const T* v) {
 		trace(*v);
@@ -178,10 +187,6 @@ struct IdStringCollector {
 			trace(element);
 	}
 
-	void trace(const RTLIL::Design &design) {
-		trace_values(design.modules_);
-		trace(design.selection_vars);
-	}
 	void trace(const RTLIL::Selection &selection_var) {
 		trace(selection_var.selected_modules);
 		trace(selection_var.selected_members);
@@ -190,15 +195,6 @@ struct IdStringCollector {
 		trace_keys(named.attributes);
 		trace(named.name);
 	}
-	void trace(const RTLIL::Module &module) {
-		trace_named(module);
-		trace_values(module.wires_);
-		trace_values(module.cells_);
-		trace(module.avail_parameters);
-		trace_keys(module.parameter_default_values);
-		trace_values(module.memories);
-		trace_values(module.processes);
-	}
 	void trace(const RTLIL::Wire &wire) {
 		trace_named(wire);
 		if (wire.known_driver())
@@ -234,7 +230,8 @@ struct IdStringCollector {
 		trace(action.memid);
 	}
 
-	std::unordered_set<int> live;
+	std::vector<MonotonicFlag> &live_ids;
+	std::vector<int> live_autoidx_ids;
 };
 
 int64_t RTLIL::OwningIdString::gc_ns;
@@ -243,20 +240,55 @@ int RTLIL::OwningIdString::gc_count;
 void RTLIL::OwningIdString::collect_garbage()
 {
 	int64_t start = PerformanceTimer::query();
-	IdStringCollector collector;
-	for (auto &[idx, design] : *RTLIL::Design::get_all_designs()) {
-		collector.trace(*design);
-	}
-	int size = GetSize(global_id_storage_);
-	for (int i = static_cast<int>(StaticId::STATIC_ID_END); i < size; ++i) {
-		RTLIL::IdString::Storage &storage = global_id_storage_.at(i);
-		if (storage.buf == nullptr)
-			continue;
-		if (collector.live.find(i) != collector.live.end())
-			continue;
-		if (global_refcount_storage_.find(i) != global_refcount_storage_.end())
-			continue;
 
+	int pool_size = 0;
+	for (auto &[idx, design] : *RTLIL::Design::get_all_designs())
+		for (RTLIL::Module *module : design->modules())
+			pool_size = std::max(pool_size, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+	ParallelDispatchThreadPool thread_pool(pool_size);
+
+	int size = GetSize(global_id_storage_);
+	std::vector<MonotonicFlag> live_ids(size - STATIC_ID_END);
+	std::vector<IdStringCollector> collectors;
+	int num_threads = thread_pool.num_threads();
+	collectors.reserve(num_threads);
+	for (int i = 0; i < num_threads; ++i)
+		collectors.emplace_back(live_ids);
+
+	for (auto &[idx, design] : *RTLIL::Design::get_all_designs()) {
+		for (RTLIL::Module *module : design->modules()) {
+			collectors[0].trace_named(*module);
+			ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+			subpool.run([&collectors, module](const ParallelDispatchThreadPool::RunCtx &ctx) {
+				for (int i : ctx.item_range(module->cells_size()))
+					collectors[ctx.thread_num].trace(module->cell_at(i));
+				for (int i : ctx.item_range(module->wires_size()))
+					collectors[ctx.thread_num].trace(module->wire_at(i));
+			});
+			collectors[0].trace(module->avail_parameters);
+			collectors[0].trace_keys(module->parameter_default_values);
+			collectors[0].trace_values(module->memories);
+			collectors[0].trace_values(module->processes);
+		}
+		collectors[0].trace(design->selection_vars);
+	}
+
+	ShardedVector<int> free_ids(thread_pool);
+	thread_pool.run([&live_ids, size, &free_ids](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(size - STATIC_ID_END)) {
+			int index = i + STATIC_ID_END;
+			RTLIL::IdString::Storage &storage = global_id_storage_.at(index);
+			if (storage.buf == nullptr)
+				continue;
+			if (live_ids[i].load())
+				continue;
+			if (global_refcount_storage_.find(index) != global_refcount_storage_.end())
+				continue;
+			free_ids.insert(ctx, index);
+		}
+	});
+	for (int i : free_ids) {
+		RTLIL::IdString::Storage &storage = global_id_storage_.at(i);
 		if (yosys_xtrace) {
 			log("#X# Removed IdString '%s' with index %d.\n", storage.buf, i);
 			log_backtrace("-X- ", yosys_xtrace-1);
@@ -268,8 +300,13 @@ void RTLIL::OwningIdString::collect_garbage()
 		global_free_idx_list_.push_back(i);
 	}
 
+	std::unordered_set<int> live_autoidx_ids;
+	for (IdStringCollector &collector : collectors)
+		for (int id : collector.live_autoidx_ids)
+			live_autoidx_ids.insert(id);
+
 	for (auto it = global_autoidx_id_storage_.begin(); it != global_autoidx_id_storage_.end();) {
-		if (collector.live.find(it->first) != collector.live.end()) {
+		if (live_autoidx_ids.find(it->first) != live_autoidx_ids.end()) {
 			++it;
 			continue;
 		}

From b42bb05b63fdcce70c8b69d657757a77a5021a31 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 29 Jan 2026 22:45:10 +0000
Subject: [PATCH 12/29] Parallelize `Design::check()`

---
 kernel/rtlil.cc | 183 +++++++++++++++++++++++++++---------------------
 kernel/rtlil.h  |   4 +-
 2 files changed, 104 insertions(+), 83 deletions(-)

diff --git a/kernel/rtlil.cc b/kernel/rtlil.cc
index 82587ec26..279c9b0e6 100644
--- a/kernel/rtlil.cc
+++ b/kernel/rtlil.cc
@@ -1361,15 +1361,21 @@ void RTLIL::Design::sort_modules()
 	modules_.sort(sort_by_id_str());
 }
 
+void check_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool);
+
 void RTLIL::Design::check()
 {
 #ifndef NDEBUG
 	log_assert(!selection_stack.empty());
+	int pool_size = 0;
+	for (auto &it : modules_)
+		pool_size = std::max(pool_size, ThreadPool::work_pool_size(0, it.second->cells_size(), 1000));
+	ParallelDispatchThreadPool thread_pool(pool_size);
 	for (auto &it : modules_) {
 		log_assert(this == it.second->design);
 		log_assert(it.first == it.second->name);
 		log_assert(!it.first.empty());
-		it.second->check();
+		check_module(it.second, thread_pool);
 	}
 #endif
 }
@@ -1605,11 +1611,11 @@ size_t RTLIL::Module::count_id(RTLIL::IdString id)
 namespace {
 	struct InternalCellChecker
 	{
-		RTLIL::Module *module;
+		const RTLIL::Module *module;
 		RTLIL::Cell *cell;
 		pool<RTLIL::IdString> expected_params, expected_ports;
 
-		InternalCellChecker(RTLIL::Module *module, RTLIL::Cell *cell) : module(module), cell(cell) { }
+		InternalCellChecker(const RTLIL::Module *module, RTLIL::Cell *cell) : module(module), cell(cell) { }
 
 		void error(int linenr)
 		{
@@ -2585,88 +2591,96 @@ void RTLIL::Module::sort()
 		it.second->attributes.sort(sort_by_id_str());
 }
 
-void RTLIL::Module::check()
+void check_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool)
 {
 #ifndef NDEBUG
-	std::vector<bool> ports_declared;
-	for (auto &it : wires_) {
-		log_assert(this == it.second->module);
-		log_assert(it.first == it.second->name);
-		log_assert(!it.first.empty());
-		log_assert(it.second->width >= 0);
-		log_assert(it.second->port_id >= 0);
-		for (auto &it2 : it.second->attributes)
-			log_assert(!it2.first.empty());
-		if (it.second->port_id) {
-			log_assert(GetSize(ports) >= it.second->port_id);
-			log_assert(ports.at(it.second->port_id-1) == it.first);
-			log_assert(it.second->port_input || it.second->port_output);
-			if (GetSize(ports_declared) < it.second->port_id)
-				ports_declared.resize(it.second->port_id);
-			log_assert(ports_declared[it.second->port_id-1] == false);
-			ports_declared[it.second->port_id-1] = true;
-		} else
-			log_assert(!it.second->port_input && !it.second->port_output);
-	}
-	for (auto port_declared : ports_declared)
-		log_assert(port_declared == true);
-	log_assert(GetSize(ports) == GetSize(ports_declared));
+	ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+	const RTLIL::Module *const_module = module;
 
-	for (auto &it : memories) {
+	pool<std::string> memory_strings;
+	for (auto &it : module->memories) {
 		log_assert(it.first == it.second->name);
 		log_assert(!it.first.empty());
 		log_assert(it.second->width >= 0);
 		log_assert(it.second->size >= 0);
 		for (auto &it2 : it.second->attributes)
 			log_assert(!it2.first.empty());
+		memory_strings.insert(it.second->name.str());
 	}
 
-	pool<IdString> packed_memids;
+	std::vector<MonotonicFlag> ports_declared(GetSize(module->ports));
+	ShardedVector<std::string> memids(subpool);
+	subpool.run([const_module, &ports_declared, &memory_strings, &memids](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			auto it = *const_module->cells_.element(i);
+			log_assert(const_module == it.second->module);
+			log_assert(it.first == it.second->name);
+			log_assert(!it.first.empty());
+			log_assert(!it.second->type.empty());
+			for (auto &it2 : it.second->connections()) {
+				log_assert(!it2.first.empty());
+				it2.second.check(const_module);
+			}
+			for (auto &it2 : it.second->attributes)
+				log_assert(!it2.first.empty());
+			for (auto &it2 : it.second->parameters)
+				log_assert(!it2.first.empty());
+			InternalCellChecker checker(const_module, it.second);
+			checker.check();
+			if (it.second->has_memid()) {
+				log_assert(memory_strings.count(it.second->parameters.at(ID::MEMID).decode_string()));
+			} else if (it.second->is_mem_cell()) {
+				std::string memid = it.second->parameters.at(ID::MEMID).decode_string();
+				log_assert(!memory_strings.count(memid));
+				memids.insert(ctx, std::move(memid));
+			}
+			auto cell_mod = const_module->design->module(it.first);
+			if (cell_mod != nullptr) {
+				// assertion check below to make sure that there are no
+				// cases where a cell has a blackbox attribute since
+				// that is deprecated
+				#ifdef __GNUC__
+				#pragma GCC diagnostic push
+				#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+				#endif
+				log_assert(!it.second->get_blackbox_attribute());
+				#ifdef __GNUC__
+				#pragma GCC diagnostic pop
+				#endif
+			}
+		}
 
-	for (auto &it : cells_) {
-		log_assert(this == it.second->module);
-		log_assert(it.first == it.second->name);
-		log_assert(!it.first.empty());
-		log_assert(!it.second->type.empty());
-		for (auto &it2 : it.second->connections()) {
-			log_assert(!it2.first.empty());
-			it2.second.check(this);
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			auto it = *const_module->wires_.element(i);
+			log_assert(const_module == it.second->module);
+			log_assert(it.first == it.second->name);
+			log_assert(!it.first.empty());
+			log_assert(it.second->width >= 0);
+			log_assert(it.second->port_id >= 0);
+			for (auto &it2 : it.second->attributes)
+				log_assert(!it2.first.empty());
+			if (it.second->port_id) {
+				log_assert(GetSize(const_module->ports) >= it.second->port_id);
+				log_assert(const_module->ports.at(it.second->port_id-1) == it.first);
+				log_assert(it.second->port_input || it.second->port_output);
+				log_assert(it.second->port_id <= GetSize(ports_declared));
+				bool previously_declared = ports_declared[it.second->port_id-1].set_and_return_old();
+				log_assert(previously_declared == false);
+			} else
+				log_assert(!it.second->port_input && !it.second->port_output);
 		}
-		for (auto &it2 : it.second->attributes)
-			log_assert(!it2.first.empty());
-		for (auto &it2 : it.second->parameters)
-			log_assert(!it2.first.empty());
-		InternalCellChecker checker(this, it.second);
-		checker.check();
-		if (it.second->has_memid()) {
-			log_assert(memories.count(it.second->parameters.at(ID::MEMID).decode_string()));
-		} else if (it.second->is_mem_cell()) {
-			IdString memid = it.second->parameters.at(ID::MEMID).decode_string();
-			log_assert(!memories.count(memid));
-			log_assert(!packed_memids.count(memid));
-			packed_memids.insert(memid);
-		}
-		auto cell_mod = design->module(it.first);
-		if (cell_mod != nullptr) {
-			// assertion check below to make sure that there are no
-			// cases where a cell has a blackbox attribute since
-			// that is deprecated
-			#ifdef __GNUC__
-			#pragma GCC diagnostic push
-			#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-			#endif
-			log_assert(!it.second->get_blackbox_attribute());
-			#ifdef __GNUC__
-			#pragma GCC diagnostic pop
-			#endif
-		}
-	}
+	});
+	for (const MonotonicFlag &port_declared : ports_declared)
+		log_assert(port_declared.load() == true);
+	pool<std::string> memids_pool;
+	for (std::string &memid : memids)
+		log_assert(memids_pool.insert(memid).second);
 
-	for (auto &it : processes) {
+	for (auto &it : module->processes) {
 		log_assert(it.first == it.second->name);
 		log_assert(!it.first.empty());
 		log_assert(it.second->root_case.compare.empty());
-		std::vector<CaseRule*> all_cases = {&it.second->root_case};
+		std::vector<RTLIL::CaseRule*> all_cases = {&it.second->root_case};
 		for (size_t i = 0; i < all_cases.size(); i++) {
 			for (auto &switch_it : all_cases[i]->switches) {
 				for (auto &case_it : switch_it->cases) {
@@ -2679,34 +2693,41 @@ void RTLIL::Module::check()
 		}
 		for (auto &sync_it : it.second->syncs) {
 			switch (sync_it->type) {
-				case SyncType::ST0:
-				case SyncType::ST1:
-				case SyncType::STp:
-				case SyncType::STn:
-				case SyncType::STe:
+				case RTLIL::SyncType::ST0:
+				case RTLIL::SyncType::ST1:
+				case RTLIL::SyncType::STp:
+				case RTLIL::SyncType::STn:
+				case RTLIL::SyncType::STe:
 					log_assert(!sync_it->signal.empty());
 					break;
-				case SyncType::STa:
-				case SyncType::STg:
-				case SyncType::STi:
+				case RTLIL::SyncType::STa:
+				case RTLIL::SyncType::STg:
+				case RTLIL::SyncType::STi:
 					log_assert(sync_it->signal.empty());
 					break;
 			}
 		}
 	}
 
-	for (auto &it : connections_) {
+	for (auto &it : module->connections_) {
 		log_assert(it.first.size() == it.second.size());
 		log_assert(!it.first.has_const());
-		it.first.check(this);
-		it.second.check(this);
+		it.first.check(module);
+		it.second.check(module);
 	}
 
-	for (auto &it : attributes)
+	for (auto &it : module->attributes)
 		log_assert(!it.first.empty());
 #endif
 }
 
+void RTLIL::Module::check() // Checks this module on its own, using a temporary thread pool sized from its cell count.
+{
+	int pool_size = ThreadPool::work_pool_size(0, cells_size(), 1000);
+	ParallelDispatchThreadPool thread_pool(pool_size); // NOTE(review): still constructed when NDEBUG compiles check_module()'s body out
+	check_module(this, thread_pool);
+}
+
 void RTLIL::Module::optimize()
 {
 }
@@ -5365,7 +5386,7 @@ RTLIL::SigSpec RTLIL::SigSpec::repeat(int num) const
 }
 
 #ifndef NDEBUG
-void RTLIL::SigSpec::check(Module *mod) const
+void RTLIL::SigSpec::check(const Module *mod) const
 {
 	if (rep_ == CHUNK)
 	{
diff --git a/kernel/rtlil.h b/kernel/rtlil.h
index 6dec89dfe..b9d86b91c 100644
--- a/kernel/rtlil.h
+++ b/kernel/rtlil.h
@@ -1760,9 +1760,9 @@ public:
 	}
 
 #ifndef NDEBUG
-	void check(Module *mod = nullptr) const;
+	void check(const Module *mod = nullptr) const;
 #else
-	void check(Module *mod = nullptr) const { (void)mod; }
+	void check(const Module *mod = nullptr) const { (void)mod; }
 #endif
 };
 

From c2bb7d6a827ab4a026b376cc98b42360f06ed2e1 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 19:14:09 +0000
Subject: [PATCH 13/29] Make `keep_cache_t` process all modules up-front
 instead of on-demand

We will want to query `keep_cache` from parallel threads. If we compute
the results on-demand, that means we need synchronization for cache
access in those queries, which adds complexity and overhead. Instead, prefill
the cache with the status of all relevant modules. Note that this doesn't
actually do more work --- we always consult `keep_cache` for all cells of
all selected modules, so scanning all those cells and determining the kept
status of all dependency modules is always required.

Later in this PR we're going to parallelize `scan_module` itself, and that's also
much easier to do when no other parallel threads are running.
---
 passes/opt/opt_clean.cc | 162 +++++++++++++++++++++++++---------------
 1 file changed, 103 insertions(+), 59 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index 72d22ddd3..8c9cfd13b 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -34,47 +34,95 @@ using RTLIL::id2cstr;
 
 struct keep_cache_t
 {
-	Design *design;
-	dict<Module*, bool> cache;
-	bool purge_mode = false;
+	dict<Module*, bool> keep_modules;
+	bool purge_mode;
 
-	void reset(Design *design = nullptr, bool purge_mode = false)
-	{
-		this->design = design;
-		this->purge_mode = purge_mode;
-		cache.clear();
-	}
-
-	bool query(Module *module)
-	{
-		log_assert(design != nullptr);
-
-		if (module == nullptr)
-			return false;
-
-		if (cache.count(module))
-			return cache.at(module);
-
-		cache[module] = true;
-		if (!module->get_bool_attribute(ID::keep)) {
-		    bool found_keep = false;
-		    for (auto cell : module->cells())
-			if (query(cell, true /* ignore_specify */)) {
-			    found_keep = true;
-			    break;
-			}
-		    for (auto wire : module->wires())
-			if (wire->get_bool_attribute(ID::keep)) {
-			    found_keep = true;
-			    break;
-			}
-		    cache[module] = found_keep;
+	keep_cache_t(bool purge_mode, const std::vector<RTLIL::Module *> &selected_modules)
+			: purge_mode(purge_mode) {
+		std::vector<RTLIL::Module *> scan_modules_worklist;
+		dict<RTLIL::Module *, std::vector<RTLIL::Module*>> dependents;
+		std::vector<RTLIL::Module *> propagate_kept_modules_worklist;
+		for (RTLIL::Module *module : selected_modules) {
+			if (keep_modules.count(module))
+				continue;
+			bool keep = scan_module(module, dependents, true, scan_modules_worklist);
+			keep_modules[module] = keep;
+			if (keep)
+				propagate_kept_modules_worklist.push_back(module);
 		}
 
-		return cache[module];
+		while (!scan_modules_worklist.empty()) {
+			RTLIL::Module *module = scan_modules_worklist.back();
+			scan_modules_worklist.pop_back();
+			if (keep_modules.count(module))
+				continue;
+			bool keep = scan_module(module, dependents, false, scan_modules_worklist);
+			keep_modules[module] = keep;
+			if (keep)
+				propagate_kept_modules_worklist.push_back(module);
+		}
+
+		while (!propagate_kept_modules_worklist.empty()) {
+			RTLIL::Module *module = propagate_kept_modules_worklist.back();
+			propagate_kept_modules_worklist.pop_back();
+			for (RTLIL::Module *dependent : dependents[module]) {
+				if (keep_modules[dependent])
+					continue;
+				keep_modules[dependent] = true;
+				propagate_kept_modules_worklist.push_back(dependent);
+			}
+		}
 	}
 
-	bool query(Cell *cell, bool ignore_specify = false)
+	bool query(Cell *cell) const
+	{
+		if (keep_cell(cell, purge_mode))
+			return true;
+		if (cell->type.in(ID($specify2), ID($specify3), ID($specrule)))
+			return true;
+		if (cell->module && cell->module->design) {
+			RTLIL::Module *cell_module = cell->module->design->module(cell->type);
+			return cell_module != nullptr && keep_modules.at(cell_module);
+		}
+		return false;
+	}
+
+private:
+	bool scan_module(Module *module, dict<RTLIL::Module *, std::vector<RTLIL::Module*>> &dependents,
+			bool scan_all_cells, std::vector<Module*> &worklist) const
+	{
+		bool keep = false;
+		if (module->get_bool_attribute(ID::keep)) {
+			if (!scan_all_cells)
+				return true;
+			keep = true;
+		}
+
+		for (Cell *cell : module->cells()) {
+			if (keep_cell(cell, purge_mode)) {
+				if (!scan_all_cells)
+					return true;
+				keep = true;
+			}
+			if (module->design) {
+				RTLIL::Module *cell_module = module->design->module(cell->type);
+				if (cell_module != nullptr) {
+					dependents[cell_module].push_back(module);
+					worklist.push_back(cell_module);
+				}
+			}
+		}
+		if (!scan_all_cells && keep)
+			return true;
+		for (Wire *wire : module->wires()) {
+			if (wire->get_bool_attribute(ID::keep)) {
+				return true;
+			}
+		}
+		return keep;
+	}
+
+	static bool keep_cell(Cell *cell, bool purge_mode)
 	{
 		if (cell->type.in(ID($assert), ID($assume), ID($live), ID($fair), ID($cover)))
 			return true;
@@ -82,9 +130,6 @@ struct keep_cache_t
 		if (cell->type.in(ID($overwrite_tag)))
 			return true;
 
-		if (!ignore_specify && cell->type.in(ID($specify2), ID($specify3), ID($specrule)))
-			return true;
-
 		if (cell->type == ID($print) || cell->type == ID($check))
 			return true;
 
@@ -93,22 +138,17 @@ struct keep_cache_t
 
 		if (!purge_mode && cell->type == ID($scopeinfo))
 			return true;
-
-		if (cell->module && cell->module->design)
-			return query(cell->module->design->module(cell->type));
-
 		return false;
 	}
 };
 
-keep_cache_t keep_cache;
 static constexpr auto ct_reg = StaticCellTypes::Categories::join(
 	StaticCellTypes::Compat::mem_ff,
 	StaticCellTypes::categories.is_anyinit);
 NewCellTypes ct_all;
 int count_rm_cells, count_rm_wires;
 
-void rmunused_module_cells(Module *module, bool verbose)
+void rmunused_module_cells(Module *module, bool verbose, keep_cache_t &keep_cache)
 {
 	SigMap sigmap(module);
 	dict<IdString, pool<Cell*>> mem2cells;
@@ -597,7 +637,7 @@ bool rmunused_module_init(RTLIL::Module *module, bool verbose)
 	return did_something;
 }
 
-void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit)
+void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit, keep_cache_t &keep_cache)
 {
 	if (verbose)
 		log("Finding unused cells or wires in module %s..\n", module->name);
@@ -654,7 +694,7 @@ void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool
 	if (!delcells.empty())
 		module->design->scratchpad_set_bool("opt.did_something", true);
 
-	rmunused_module_cells(module, verbose);
+	rmunused_module_cells(module, verbose, keep_cache);
 	while (rmunused_module_signals(module, purge_mode, verbose)) { }
 
 	if (rminit && rmunused_module_init(module, verbose))
@@ -697,17 +737,20 @@ struct OptCleanPass : public Pass {
 		}
 		extra_args(args, argidx, design);
 
-		keep_cache.reset(design, purge_mode);
+		std::vector<RTLIL::Module*> selected_modules;
+		for (auto module : design->selected_whole_modules_warn()) {
+			if (!module->has_processes_warn())
+				selected_modules.push_back(module);
+		}
+		keep_cache_t keep_cache(purge_mode, selected_modules);
 
 		ct_all.setup(design);
 
 		count_rm_cells = 0;
 		count_rm_wires = 0;
 
-		for (auto module : design->selected_whole_modules_warn()) {
-			if (module->has_processes_warn())
-				continue;
-			rmunused_module(module, purge_mode, true, true);
+		for (auto module : selected_modules) {
+			rmunused_module(module, purge_mode, true, true, keep_cache);
 		}
 
 		if (count_rm_cells > 0 || count_rm_wires > 0)
@@ -716,7 +759,6 @@ struct OptCleanPass : public Pass {
 		design->optimize();
 		design->check();
 
-		keep_cache.reset();
 		ct_all.clear();
 		log_pop();
 
@@ -755,17 +797,20 @@ struct CleanPass : public Pass {
 		}
 		extra_args(args, argidx, design);
 
-		keep_cache.reset(design);
+		std::vector<RTLIL::Module*> selected_modules;
+		for (auto module : design->selected_unboxed_whole_modules()) {
+			if (!module->has_processes())
+				selected_modules.push_back(module);
+		}
+		keep_cache_t keep_cache(purge_mode, selected_modules);
 
 		ct_all.setup(design);
 
 		count_rm_cells = 0;
 		count_rm_wires = 0;
 
-		for (auto module : design->selected_unboxed_whole_modules()) {
-			if (module->has_processes())
-				continue;
-			rmunused_module(module, purge_mode, ys_debug(), true);
+		for (auto module : selected_modules) {
+			rmunused_module(module, purge_mode, ys_debug(), true, keep_cache);
 		}
 
 		log_suppressed();
@@ -775,7 +820,6 @@ struct CleanPass : public Pass {
 		design->optimize();
 		design->check();
 
-		keep_cache.reset();
 		ct_all.clear();
 
 		request_garbage_collection();

From 72a21fe01d332cec72fbe77857ab16a1bcc14f53 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 19:27:09 +0000
Subject: [PATCH 14/29] Introduce `RmStats` struct to encapsulate removal
 statistics

Turns out this is not strictly necessary for this PR but it's
still a good thing to do and makes it clearer that the stats
are not modified in a possibly racy way.
---
 passes/opt/opt_clean.cc | 47 ++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index 8c9cfd13b..278c00e1b 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -146,9 +146,19 @@ static constexpr auto ct_reg = StaticCellTypes::Categories::join(
 	StaticCellTypes::Compat::mem_ff,
 	StaticCellTypes::categories.is_anyinit);
 NewCellTypes ct_all;
-int count_rm_cells, count_rm_wires;
 
-void rmunused_module_cells(Module *module, bool verbose, keep_cache_t &keep_cache)
+struct RmStats { // Running totals of what opt_clean/clean removed, threaded through the rmunused_* helpers by reference.
+	int count_rm_cells = 0; // incremented once per cell removed
+	int count_rm_wires = 0; // increased by the number of wires deleted in each signal-cleanup round
+
+	void log() // Prints the summary line, but only if at least one cell or wire was removed.
+	{
+		if (count_rm_cells > 0 || count_rm_wires > 0)
+			YOSYS_NAMESPACE_PREFIX log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires); // qualified: a bare `log` here would name this member function
+	}
+};
+
+void rmunused_module_cells(Module *module, bool verbose, RmStats &stats, keep_cache_t &keep_cache)
 {
 	SigMap sigmap(module);
 	dict<IdString, pool<Cell*>> mem2cells;
@@ -253,7 +263,7 @@ void rmunused_module_cells(Module *module, bool verbose, keep_cache_t &keep_cach
 		if (cell->is_builtin_ff())
 			ffinit.remove_init(cell->getPort(ID::Q));
 		module->remove(cell);
-		count_rm_cells++;
+		stats.count_rm_cells++;
 	}
 
 	for (auto it : mem_unused)
@@ -345,7 +355,7 @@ bool check_public_name(RTLIL::IdString id)
 	return true;
 }
 
-bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbose)
+bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbose, RmStats &stats)
 {
 	// `register_signals` and `connected_signals` will help us decide later on
 	// on picking representatives out of groups of connected signals
@@ -547,7 +557,7 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos
 	}
 
 	module->remove(del_wires_queue);
-	count_rm_wires += GetSize(del_wires_queue);
+	stats.count_rm_wires += GetSize(del_wires_queue);
 
 	if (verbose && del_temp_wires_count)
 		log_debug("  removed %d unused temporary wires.\n", del_temp_wires_count);
@@ -637,7 +647,7 @@ bool rmunused_module_init(RTLIL::Module *module, bool verbose)
 	return did_something;
 }
 
-void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit, keep_cache_t &keep_cache)
+void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit, RmStats &stats, keep_cache_t &keep_cache)
 {
 	if (verbose)
 		log("Finding unused cells or wires in module %s..\n", module->name);
@@ -694,11 +704,11 @@ void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool
 	if (!delcells.empty())
 		module->design->scratchpad_set_bool("opt.did_something", true);
 
-	rmunused_module_cells(module, verbose, keep_cache);
-	while (rmunused_module_signals(module, purge_mode, verbose)) { }
+	rmunused_module_cells(module, verbose, stats, keep_cache);
+	while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 
 	if (rminit && rmunused_module_init(module, verbose))
-		while (rmunused_module_signals(module, purge_mode, verbose)) { }
+		while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 }
 
 struct OptCleanPass : public Pass {
@@ -746,15 +756,11 @@ struct OptCleanPass : public Pass {
 
 		ct_all.setup(design);
 
-		count_rm_cells = 0;
-		count_rm_wires = 0;
-
+		RmStats stats;
 		for (auto module : selected_modules) {
-			rmunused_module(module, purge_mode, true, true, keep_cache);
+			rmunused_module(module, purge_mode, true, true, stats, keep_cache);
 		}
-
-		if (count_rm_cells > 0 || count_rm_wires > 0)
-			log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires);
+		stats.log();
 
 		design->optimize();
 		design->check();
@@ -806,16 +812,13 @@ struct CleanPass : public Pass {
 
 		ct_all.setup(design);
 
-		count_rm_cells = 0;
-		count_rm_wires = 0;
-
+		RmStats stats;
 		for (auto module : selected_modules) {
-			rmunused_module(module, purge_mode, ys_debug(), true, keep_cache);
+			rmunused_module(module, purge_mode, ys_debug(), true, stats, keep_cache);
 		}
 
 		log_suppressed();
-		if (count_rm_cells > 0 || count_rm_wires > 0)
-			log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires);
+		stats.log();
 
 		design->optimize();
 		design->check();

From 887c32cb54bc839eca788f81f427aca35eb59a0f Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 21:58:37 +0000
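Subject: [PATCH 15/29] Create a toplevel `ParallelDispatchThreadPool` and
 parallelize `keep_cache_t::scan_module()` with it

`scan_module()` now takes a `ScanCells` enum in place of the `scan_all_cells`
bool. Each scan runs on a `Subpool` of the toplevel pool: the keep decision is
recorded in a `MonotonicFlag`, and the modules instantiated by cells are
collected in a `ShardedVector` and merged into `dependents` and the worklist
after the `subpool.run()` call.

`OptCleanPass` and `CleanPass` size the toplevel pool as the maximum of
`ThreadPool::work_pool_size(0, cells_size, 1000)` over the selected modules.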

---
 passes/opt/opt_clean.cc | 90 ++++++++++++++++++++++++++++-------------
 1 file changed, 62 insertions(+), 28 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index 278c00e1b..8281f18e1 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -23,6 +23,7 @@
 #include "kernel/celltypes.h"
 #include "kernel/newcelltypes.h"
 #include "kernel/ffinit.h"
+#include "kernel/threading.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include <set>
@@ -37,15 +38,16 @@ struct keep_cache_t
 	dict<Module*, bool> keep_modules;
 	bool purge_mode;
 
-	keep_cache_t(bool purge_mode, const std::vector<RTLIL::Module *> &selected_modules)
+	keep_cache_t(bool purge_mode, ParallelDispatchThreadPool &thread_pool, const std::vector<RTLIL::Module *> &selected_modules)
 			: purge_mode(purge_mode) {
+
 		std::vector<RTLIL::Module *> scan_modules_worklist;
 		dict<RTLIL::Module *, std::vector<RTLIL::Module*>> dependents;
 		std::vector<RTLIL::Module *> propagate_kept_modules_worklist;
 		for (RTLIL::Module *module : selected_modules) {
 			if (keep_modules.count(module))
 				continue;
-			bool keep = scan_module(module, dependents, true, scan_modules_worklist);
+			bool keep = scan_module(module, thread_pool, dependents, ALL_CELLS, scan_modules_worklist);
 			keep_modules[module] = keep;
 			if (keep)
 				propagate_kept_modules_worklist.push_back(module);
@@ -56,7 +58,7 @@ struct keep_cache_t
 			scan_modules_worklist.pop_back();
 			if (keep_modules.count(module))
 				continue;
-			bool keep = scan_module(module, dependents, false, scan_modules_worklist);
+			bool keep = scan_module(module, thread_pool, dependents, MINIMUM_CELLS, scan_modules_worklist);
 			keep_modules[module] = keep;
 			if (keep)
 				propagate_kept_modules_worklist.push_back(module);
@@ -88,38 +90,62 @@ struct keep_cache_t
 	}
 
 private:
-	bool scan_module(Module *module, dict<RTLIL::Module *, std::vector<RTLIL::Module*>> &dependents,
-			bool scan_all_cells, std::vector<Module*> &worklist) const
+	enum ScanCells {
+		// Scan every cell to see if it uses a module that is kept.
+		ALL_CELLS,
+		// Stop scanning cells if we determine early that this module is kept.
+		MINIMUM_CELLS,
+	};
+	bool scan_module(Module *module, ParallelDispatchThreadPool &thread_pool, dict<RTLIL::Module *, std::vector<RTLIL::Module*>> &dependents,
+			ScanCells scan_cells, std::vector<Module*> &worklist) const
 	{
-		bool keep = false;
+		MonotonicFlag keep_module;
 		if (module->get_bool_attribute(ID::keep)) {
-			if (!scan_all_cells)
+			if (scan_cells == MINIMUM_CELLS)
 				return true;
-			keep = true;
+			keep_module.set();
 		}
 
-		for (Cell *cell : module->cells()) {
-			if (keep_cell(cell, purge_mode)) {
-				if (!scan_all_cells)
-					return true;
-				keep = true;
-			}
-			if (module->design) {
-				RTLIL::Module *cell_module = module->design->module(cell->type);
-				if (cell_module != nullptr) {
-					dependents[cell_module].push_back(module);
-					worklist.push_back(cell_module);
+		ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+		ShardedVector<Module*> deps(subpool);
+		const RTLIL::Module *const_module = module;
+		bool purge_mode = this->purge_mode;
+		subpool.run([purge_mode, const_module, scan_cells, &deps, &keep_module](const ParallelDispatchThreadPool::RunCtx &ctx) {
+			bool keep = false;
+			for (int i : ctx.item_range(const_module->cells_size())) {
+				Cell *cell = const_module->cell_at(i);
+				if (keep_cell(cell, purge_mode)) {
+					if (scan_cells == MINIMUM_CELLS) {
+						keep_module.set();
+						return;
+					}
+					keep = true;
+				}
+				if (const_module->design) {
+					RTLIL::Module *cell_module = const_module->design->module(cell->type);
+					if (cell_module != nullptr)
+						deps.insert(ctx, cell_module);
 				}
 			}
-		}
-		if (!scan_all_cells && keep)
-			return true;
-		for (Wire *wire : module->wires()) {
-			if (wire->get_bool_attribute(ID::keep)) {
-				return true;
+			if (keep) {
+				keep_module.set();
+				return;
 			}
+			for (int i : ctx.item_range(const_module->wires_size())) {
+				Wire *wire = const_module->wire_at(i);
+				if (wire->get_bool_attribute(ID::keep)) {
+					keep_module.set();
+					return;
+				}
+			}
+		});
+		if (scan_cells == MINIMUM_CELLS && keep_module.load())
+			return true;
+		for (Module *dep : deps) {
+			dependents[dep].push_back(module);
+			worklist.push_back(dep);
 		}
-		return keep;
+		return keep_module.load();
 	}
 
 	static bool keep_cell(Cell *cell, bool purge_mode)
@@ -752,7 +778,11 @@ struct OptCleanPass : public Pass {
 			if (!module->has_processes_warn())
 				selected_modules.push_back(module);
 		}
-		keep_cache_t keep_cache(purge_mode, selected_modules);
+		int thread_pool_size = 0;
+		for (RTLIL::Module *m : selected_modules)
+			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
+		ParallelDispatchThreadPool thread_pool(thread_pool_size);
+		keep_cache_t keep_cache(purge_mode, thread_pool, selected_modules);
 
 		ct_all.setup(design);
 
@@ -808,7 +838,11 @@ struct CleanPass : public Pass {
 			if (!module->has_processes())
 				selected_modules.push_back(module);
 		}
-		keep_cache_t keep_cache(purge_mode, selected_modules);
+		int thread_pool_size = 0;
+		for (RTLIL::Module *m : selected_modules)
+			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
+		ParallelDispatchThreadPool thread_pool(thread_pool_size);
+		keep_cache_t keep_cache(purge_mode, thread_pool, selected_modules);
 
 		ct_all.setup(design);
 

From a7437c636db575ed1981ab459ba3e24e48c127c2 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 22:06:19 +0000
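Subject: [PATCH 16/29] Pass the toplevel thread pool to `rmunused_module`,
 create a `Subpool`, and parallelize `remove_temporary_cells`

The handling of `$pos`, `$_BUF_`, `$buf`, `$connect` and `$input_port` cells
moves out of `rmunused_module` into the new `remove_temporary_cells`. The
connections to add and the cells to delete are collected in `ShardedVector`s
inside `subpool.run()` and are applied to the module after that call.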

---
 passes/opt/opt_clean.cc | 94 ++++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 38 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index 8281f18e1..c438684a5 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -673,47 +673,53 @@ bool rmunused_module_init(RTLIL::Module *module, bool verbose)
 	return did_something;
 }
 
-void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit, RmStats &stats, keep_cache_t &keep_cache)
+void remove_temporary_cells(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose)
 {
-	if (verbose)
-		log("Finding unused cells or wires in module %s..\n", module->name);
+	ShardedVector<RTLIL::Cell*> delcells(subpool);
+	ShardedVector<RTLIL::SigSig> new_connections(subpool);
+	const RTLIL::Module *const_module = module;
+	subpool.run([const_module, &delcells, &new_connections](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			if (cell->type.in(ID($pos), ID($_BUF_), ID($buf)) && !cell->has_keep_attr()) {
+				bool is_signed = cell->type == ID($pos) && cell->getParam(ID::A_SIGNED).as_bool();
+				RTLIL::SigSpec a = cell->getPort(ID::A);
+				RTLIL::SigSpec y = cell->getPort(ID::Y);
+				a.extend_u0(GetSize(y), is_signed);
 
-	std::vector<RTLIL::Cell*> delcells;
-	for (auto cell : module->cells()) {
-		if (cell->type.in(ID($pos), ID($_BUF_), ID($buf)) && !cell->has_keep_attr()) {
-			bool is_signed = cell->type == ID($pos) && cell->getParam(ID::A_SIGNED).as_bool();
-			RTLIL::SigSpec a = cell->getPort(ID::A);
-			RTLIL::SigSpec y = cell->getPort(ID::Y);
-			a.extend_u0(GetSize(y), is_signed);
-
-			if (a.has_const(State::Sz)) {
-				SigSpec new_a;
-				SigSpec new_y;
-				for (int i = 0; i < GetSize(a); ++i) {
-					SigBit b = a[i];
-					if (b == State::Sz)
-						continue;
-					new_a.append(b);
-					new_y.append(y[i]);
+				if (a.has_const(State::Sz)) {
+					RTLIL::SigSpec new_a;
+					RTLIL::SigSpec new_y;
+					for (int i = 0; i < GetSize(a); ++i) {
+						RTLIL::SigBit b = a[i];
+						if (b == State::Sz)
+							continue;
+						new_a.append(b);
+						new_y.append(y[i]);
+					}
+					a = std::move(new_a);
+					y = std::move(new_y);
 				}
-				a = std::move(new_a);
-				y = std::move(new_y);
+				if (!y.empty())
+					new_connections.insert(ctx, {y, a});
+				delcells.insert(ctx, cell);
+			} else if (cell->type.in(ID($connect)) && !cell->has_keep_attr()) {
+				RTLIL::SigSpec a = cell->getPort(ID::A);
+				RTLIL::SigSpec b = cell->getPort(ID::B);
+				if (a.has_const() && !b.has_const())
+					std::swap(a, b);
+				new_connections.insert(ctx, {a, b});
+				delcells.insert(ctx, cell);
+			} else if (cell->type.in(ID($input_port)) && !cell->has_keep_attr()) {
+				delcells.insert(ctx, cell);
 			}
-			if (!y.empty())
-				module->connect(y, a);
-			delcells.push_back(cell);
-		} else if (cell->type.in(ID($connect)) && !cell->has_keep_attr()) {
-			RTLIL::SigSpec a = cell->getPort(ID::A);
-			RTLIL::SigSpec b = cell->getPort(ID::B);
-			if (a.has_const() && !b.has_const())
-				std::swap(a, b);
-			module->connect(a, b);
-			delcells.push_back(cell);
-		} else if (cell->type.in(ID($input_port)) && !cell->has_keep_attr()) {
-			delcells.push_back(cell);
 		}
+	});
+	bool did_something = false;
+	for (RTLIL::SigSig &connection : new_connections) {
+		module->connect(connection);
 	}
-	for (auto cell : delcells) {
+	for (RTLIL::Cell *cell : delcells) {
 		if (verbose) {
 			if (cell->type == ID($connect))
 				log_debug("  removing connect cell `%s': %s <-> %s\n", cell->name,
@@ -726,10 +732,22 @@ void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool
 						log_signal(cell->getPort(ID::Y)), log_signal(cell->getPort(ID::A)));
 		}
 		module->remove(cell);
+		did_something = true;
 	}
-	if (!delcells.empty())
+	if (did_something)
 		module->design->scratchpad_set_bool("opt.did_something", true);
+}
 
+void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool, bool purge_mode, bool verbose, bool rminit, RmStats &stats, keep_cache_t &keep_cache)
+{
+	if (verbose)
+		log("Finding unused cells or wires in module %s..\n", module->name);
+
+	// Use no more than one worker per thousand cells, rounded down, so
+	// we only start multithreading with at least 2000 cells.
+	int num_worker_threads = ThreadPool::work_pool_size(0, module->cells_size(), 1000);
+	ParallelDispatchThreadPool::Subpool subpool(thread_pool, num_worker_threads);
+	remove_temporary_cells(module, subpool, verbose);
 	rmunused_module_cells(module, verbose, stats, keep_cache);
 	while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 
@@ -788,7 +806,7 @@ struct OptCleanPass : public Pass {
 
 		RmStats stats;
 		for (auto module : selected_modules) {
-			rmunused_module(module, purge_mode, true, true, stats, keep_cache);
+			rmunused_module(module, thread_pool, purge_mode, true, true, stats, keep_cache);
 		}
 		stats.log();
 
@@ -848,7 +866,7 @@ struct CleanPass : public Pass {
 
 		RmStats stats;
 		for (auto module : selected_modules) {
-			rmunused_module(module, purge_mode, ys_debug(), true, stats, keep_cache);
+			rmunused_module(module, thread_pool, purge_mode, ys_debug(), true, stats, keep_cache);
 		}
 
 		log_suppressed();

From 8e044d1045cce928f2828de2578ca5e1660d3c82 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 22:46:10 +0000
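Subject: [PATCH 17/29] Pass the module `Subpool` to `rmunused_module_init` and
 parallelize that function

The scan of cell `Q` ports and the scan of wire `init` attributes each run in
their own `subpool.run()` call and collect their results in `ShardedVector`s.
The `sigmap`/`qbits` updates and the erasure of redundant `init` attributes are
done after the corresponding call.

Also drops the braces around single-statement loops in `OptCleanPass` and
`CleanPass`.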

---
 passes/opt/opt_clean.cc | 134 +++++++++++++++++++++-------------------
 1 file changed, 72 insertions(+), 62 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index c438684a5..488da1531 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -594,77 +594,91 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos
 	return !del_wires_queue.empty();
 }
 
-bool rmunused_module_init(RTLIL::Module *module, bool verbose)
+bool rmunused_module_init(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose)
 {
-	bool did_something = false;
 
 	SigMap sigmap(module);
-	dict<SigBit, State> qbits;
 
-	for (auto cell : module->cells())
-		if (StaticCellTypes::Compat::internals_mem_ff(cell->type) && cell->hasPort(ID::Q))
-		{
-			SigSpec sig = cell->getPort(ID::Q);
-
-			for (int i = 0; i < GetSize(sig); i++)
+	const Module *const_module = module;
+	ShardedVector<std::pair<SigBit, State>> results(subpool);
+	subpool.run([const_module, &results](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			if (StaticCellTypes::Compat::internals_mem_ff(cell->type) && cell->hasPort(ID::Q))
 			{
-				SigBit bit = sig[i];
+				SigSpec sig = cell->getPort(ID::Q);
 
-				if (bit.wire == nullptr || bit.wire->attributes.count(ID::init) == 0)
-					continue;
+				for (int i = 0; i < GetSize(sig); i++)
+				{
+					SigBit bit = sig[i];
 
-				Const init = bit.wire->attributes.at(ID::init);
+					if (bit.wire == nullptr || bit.wire->attributes.count(ID::init) == 0)
+						continue;
 
-				if (i >= GetSize(init) || init[i] == State::Sx || init[i] == State::Sz)
-					continue;
+					Const init = bit.wire->attributes.at(ID::init);
 
-				sigmap.add(bit);
-				qbits[bit] = init[i];
-			}
-		}
+					if (i >= GetSize(init) || init[i] == State::Sx || init[i] == State::Sz)
+						continue;
 
-	for (auto wire : module->wires())
-	{
-		if (wire->attributes.count(ID::init) == 0)
-			continue;
-
-		Const init = wire->attributes.at(ID::init);
-
-		for (int i = 0; i < GetSize(wire) && i < GetSize(init); i++)
-		{
-			if (init[i] == State::Sx || init[i] == State::Sz)
-				continue;
-
-			SigBit wire_bit = SigBit(wire, i);
-			SigBit mapped_wire_bit = sigmap(wire_bit);
-
-			if (wire_bit == mapped_wire_bit)
-				goto next_wire;
-
-			if (mapped_wire_bit.wire) {
-				if (qbits.count(mapped_wire_bit) == 0)
-					goto next_wire;
-
-				if (qbits.at(mapped_wire_bit) != init[i])
-					goto next_wire;
-			}
-			else {
-				if (mapped_wire_bit == State::Sx || mapped_wire_bit == State::Sz)
-					goto next_wire;
-
-				if (mapped_wire_bit != init[i]) {
-					log_warning("Initial value conflict for %s resolving to %s but with init %s.\n", log_signal(wire_bit), log_signal(mapped_wire_bit), log_signal(init[i]));
-					goto next_wire;
+					results.insert(ctx, {bit, init[i]});
 				}
 			}
 		}
+	});
+	dict<SigBit, State> qbits;
+	for (std::pair<SigBit, State> &p : results) {
+		sigmap.add(p.first);
+		qbits[p.first] = p.second;
+	}
 
+	ShardedVector<RTLIL::Wire*> wire_results(subpool);
+	subpool.run([const_module, &sigmap, &qbits, &wire_results](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int j : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(j);
+			if (wire->attributes.count(ID::init) == 0)
+				continue;
+			Const init = wire->attributes.at(ID::init);
+
+			for (int i = 0; i < GetSize(wire) && i < GetSize(init); i++)
+			{
+				if (init[i] == State::Sx || init[i] == State::Sz)
+					continue;
+
+				SigBit wire_bit = SigBit(wire, i);
+				SigBit mapped_wire_bit = sigmap(wire_bit);
+
+				if (wire_bit == mapped_wire_bit)
+					goto next_wire;
+
+				if (mapped_wire_bit.wire) {
+					if (qbits.count(mapped_wire_bit) == 0)
+						goto next_wire;
+
+					if (qbits.at(mapped_wire_bit) != init[i])
+						goto next_wire;
+				}
+				else {
+					if (mapped_wire_bit == State::Sx || mapped_wire_bit == State::Sz)
+						goto next_wire;
+
+					if (mapped_wire_bit != init[i]) {
+						log_warning("Initial value conflict for %s resolving to %s but with init %s.\n", log_signal(wire_bit), log_signal(mapped_wire_bit), log_signal(init[i]));
+						goto next_wire;
+					}
+				}
+			}
+			wire_results.insert(ctx, wire);
+
+			next_wire:;
+		}
+	});
+
+	bool did_something = false;
+	for (RTLIL::Wire *wire : wire_results) {
 		if (verbose)
 			log_debug("  removing redundant init attribute on %s.\n", log_id(wire));
-
 		wire->attributes.erase(ID::init);
 		did_something = true;
-	next_wire:;
 	}
 
 	if (did_something)
@@ -751,7 +765,7 @@ void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_p
 	rmunused_module_cells(module, verbose, stats, keep_cache);
 	while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 
-	if (rminit && rmunused_module_init(module, verbose))
+	if (rminit && rmunused_module_init(module, subpool, verbose))
 		while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 }
 
@@ -792,10 +806,9 @@ struct OptCleanPass : public Pass {
 		extra_args(args, argidx, design);
 
 		std::vector<RTLIL::Module*> selected_modules;
-		for (auto module : design->selected_whole_modules_warn()) {
+		for (auto module : design->selected_whole_modules_warn())
 			if (!module->has_processes_warn())
 				selected_modules.push_back(module);
-		}
 		int thread_pool_size = 0;
 		for (RTLIL::Module *m : selected_modules)
 			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
@@ -805,9 +818,8 @@ struct OptCleanPass : public Pass {
 		ct_all.setup(design);
 
 		RmStats stats;
-		for (auto module : selected_modules) {
+		for (auto module : selected_modules)
 			rmunused_module(module, thread_pool, purge_mode, true, true, stats, keep_cache);
-		}
 		stats.log();
 
 		design->optimize();
@@ -852,10 +864,9 @@ struct CleanPass : public Pass {
 		extra_args(args, argidx, design);
 
 		std::vector<RTLIL::Module*> selected_modules;
-		for (auto module : design->selected_unboxed_whole_modules()) {
+		for (auto module : design->selected_unboxed_whole_modules())
 			if (!module->has_processes())
 				selected_modules.push_back(module);
-		}
 		int thread_pool_size = 0;
 		for (RTLIL::Module *m : selected_modules)
 			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
@@ -865,9 +876,8 @@ struct CleanPass : public Pass {
 		ct_all.setup(design);
 
 		RmStats stats;
-		for (auto module : selected_modules) {
+		for (auto module : selected_modules)
 			rmunused_module(module, thread_pool, purge_mode, ys_debug(), true, stats, keep_cache);
-		}
 
 		log_suppressed();
 		stats.log();

From 19a7c8fcf352c5dcbd41ec13418ce3f5d91cfd5d Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 22:59:04 +0000
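Subject: [PATCH 18/29] Pass the module `Subpool` to `rmunused_module_cells`
 and parallelize that function

Cells are now referred to by index. The bit-to-driver map is built with a
`ShardedHashtable` (additional drivers of the same bit go into an
`extra_driver_cells` pool), the worklist is a `ConcurrentWorkQueue<int>`, and
`unused` and `mem_unused` become vectors of `std::atomic<bool>`.

The pass over cell inputs that fills `used_raw_bits` now only runs when at
least one driver-driver conflict message was recorded.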

---
 passes/opt/opt_clean.cc | 321 ++++++++++++++++++++++++++++------------
 1 file changed, 224 insertions(+), 97 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index 488da1531..ab7d78571 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -184,16 +184,15 @@ struct RmStats {
 	}
 };
 
-void rmunused_module_cells(Module *module, bool verbose, RmStats &stats, keep_cache_t &keep_cache)
+unsigned int hash_bit(const SigBit &bit) {
+	return static_cast<unsigned int>(hash_ops<SigBit>::hash(bit).yield());
+}
+
+void rmunused_module_cells(Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose, RmStats &stats, keep_cache_t &keep_cache)
 {
 	SigMap sigmap(module);
-	dict<IdString, pool<Cell*>> mem2cells;
-	pool<IdString> mem_unused;
-	pool<Cell*> queue, unused;
-	pool<SigBit> used_raw_bits;
-	dict<SigBit, pool<Cell*>> wire2driver;
-	dict<SigBit, vector<string>> driver_driver_logs;
-	FfInitVals ffinit(&sigmap, module);
+	FfInitVals ffinit;
+	ffinit.set_parallel(&sigmap, subpool.thread_pool(), module);
 
 	SigMap raw_sigmap;
 	for (auto &it : module->connections_) {
@@ -203,86 +202,211 @@ void rmunused_module_cells(Module *module, bool verbose, RmStats &stats, keep_ca
 		}
 	}
 
-	for (auto &it : module->memories) {
-		mem_unused.insert(it.first);
-	}
+	struct WireDrivers;
+	// Maps from a SigBit to a unique driver cell.
+	struct WireDriver {
+		using Accumulated = WireDrivers;
+		SigBit bit;
+		int driver_cell;
+	};
+	// Maps from a SigBit to one or more driver cells.
+	struct WireDrivers {
+		WireDrivers() : driver_cell(0) {}
+		WireDrivers(WireDriver driver) : bit(driver.bit), driver_cell(driver.driver_cell) {}
+		WireDrivers(SigBit bit) : bit(bit), driver_cell(0) {}
+		WireDrivers(WireDrivers &&other) = default;
 
-	for (Cell *cell : module->cells()) {
-		if (cell->type.in(ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2))) {
-			IdString mem_id = cell->getParam(ID::MEMID).decode_string();
-			mem2cells[mem_id].insert(cell);
-		}
-	}
-
-	for (auto &it : module->cells_) {
-		Cell *cell = it.second;
-		for (auto &it2 : cell->connections()) {
-			if (ct_all.cell_known(cell->type) && !ct_all.cell_output(cell->type, it2.first))
-				continue;
-			for (auto raw_bit : it2.second) {
-				if (raw_bit.wire == nullptr)
-					continue;
-				auto bit = sigmap(raw_bit);
-				if (bit.wire == nullptr && ct_all.cell_known(cell->type))
-					driver_driver_logs[raw_sigmap(raw_bit)].push_back(stringf("Driver-driver conflict "
-							"for %s between cell %s.%s and constant %s in %s: Resolved using constant.",
-							log_signal(raw_bit), log_id(cell), log_id(it2.first), log_signal(bit), log_id(module)));
-				if (bit.wire != nullptr)
-					wire2driver[bit].insert(cell);
-			}
-		}
-		if (keep_cache.query(cell))
-			queue.insert(cell);
-		else
-			unused.insert(cell);
-	}
-
-	for (auto &it : module->wires_) {
-		Wire *wire = it.second;
-		if (wire->port_output || wire->get_bool_attribute(ID::keep)) {
-			for (auto bit : sigmap(wire))
-			for (auto c : wire2driver[bit])
-				queue.insert(c), unused.erase(c);
-			for (auto raw_bit : SigSpec(wire))
-				used_raw_bits.insert(raw_sigmap(raw_bit));
-		}
-	}
-
-	while (!queue.empty())
-	{
-		pool<SigBit> bits;
-		pool<IdString> mems;
-		for (auto cell : queue) {
-			for (auto &it : cell->connections())
-				if (!ct_all.cell_known(cell->type) || ct_all.cell_input(cell->type, it.first))
-					for (auto bit : sigmap(it.second))
-						bits.insert(bit);
-
-			if (cell->type.in(ID($memrd), ID($memrd_v2))) {
-				IdString mem_id = cell->getParam(ID::MEMID).decode_string();
-				if (mem_unused.count(mem_id)) {
-					mem_unused.erase(mem_id);
-					mems.insert(mem_id);
+		class const_iterator {
+		public:
+			const_iterator(const WireDrivers &drivers, bool end)
+					: driver_cell(drivers.driver_cell), in_extra_cells(end) {
+				if (drivers.extra_driver_cells) {
+					if (end) {
+						extra_it = drivers.extra_driver_cells->end();
+					} else {
+						extra_it = drivers.extra_driver_cells->begin();
+					}
 				}
 			}
+			int operator*() const {
+				if (in_extra_cells)
+					return **extra_it;
+				return driver_cell;
+			}
+			const_iterator& operator++() {
+				if (in_extra_cells)
+					++*extra_it;
+				else
+					in_extra_cells = true;
+				return *this;
+			}
+			bool operator!=(const const_iterator &other) const {
+				return !(*this == other);
+			}
+			bool operator==(const const_iterator &other) const {
+				return in_extra_cells == other.in_extra_cells &&
+					extra_it == other.extra_it;
+			}
+		private:
+			std::optional<pool<int>::iterator> extra_it;
+			int driver_cell;
+			bool in_extra_cells;
+		};
+
+		const_iterator begin() const { return const_iterator(*this, false); }
+		const_iterator end() const { return const_iterator(*this, true); }
+
+		SigBit bit;
+		int driver_cell;
+		std::unique_ptr<pool<int>> extra_driver_cells;
+	};
+	struct WireDriversKeyEquality {
+		bool operator()(const WireDrivers &a, const WireDrivers &b) const {
+			return a.bit == b.bit;
 		}
+	};
+	struct WireDriversCollisionHandler {
+		void operator()(WireDrivers &incumbent, WireDrivers &new_value) const {
+			log_assert(new_value.extra_driver_cells == nullptr);
+			if (!incumbent.extra_driver_cells)
+				incumbent.extra_driver_cells.reset(new pool<int>());
+			incumbent.extra_driver_cells->insert(new_value.driver_cell);
+		}
+	};
+	using Wire2Drivers = ShardedHashtable<WireDriver, WireDriversKeyEquality, WireDriversCollisionHandler>;
 
-		queue.clear();
+	Wire2Drivers::Builder wire2driver_builder(subpool);
+	ShardedVector<std::pair<std::string, int>> mem2cells_vector(subpool);
+	ShardedVector<std::pair<SigBit, std::string>> driver_driver_logs(subpool);
+	ShardedVector<Wire*> keep_wires(subpool);
+	const RTLIL::Module *const_module = module;
+	int num_threads = subpool.num_threads();
+	ConcurrentWorkQueue<int> cell_queue(num_threads);
+	std::vector<std::atomic<bool>> unused(const_module->cells_size());
+	subpool.run([&sigmap, &raw_sigmap, &keep_cache, const_module, &mem2cells_vector, &driver_driver_logs, &keep_wires, &cell_queue, &wire2driver_builder, &unused](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			Cell *cell = const_module->cell_at(i);
+			if (cell->type.in(ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2)))
+				mem2cells_vector.insert(ctx, {cell->getParam(ID::MEMID).decode_string(), i});
 
-		for (auto bit : bits)
-		for (auto c : wire2driver[bit])
-			if (unused.count(c))
-				queue.insert(c), unused.erase(c);
+			for (auto &it2 : cell->connections()) {
+				if (ct_all.cell_known(cell->type) && !ct_all.cell_output(cell->type, it2.first))
+					continue;
+				for (auto raw_bit : it2.second) {
+					if (raw_bit.wire == nullptr)
+						continue;
+					auto bit = sigmap(raw_bit);
+					if (bit.wire == nullptr && ct_all.cell_known(cell->type)) {
+						std::string msg = stringf("Driver-driver conflict "
+								"for %s between cell %s.%s and constant %s in %s: Resolved using constant.",
+								log_signal(raw_bit), cell->name.unescape(), it2.first.unescape(), log_signal(bit), const_module->name.unescape());
+						driver_driver_logs.insert(ctx, {raw_sigmap(raw_bit), msg});
+					}
+					if (bit.wire != nullptr)
+						wire2driver_builder.insert(ctx, {{bit, i}, hash_bit(bit)});
+				}
+			}
+			bool keep = keep_cache.query(cell);
+			unused[i].store(!keep, std::memory_order_relaxed);
+			if (keep)
+				cell_queue.push(ctx, i);
+		}
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			Wire *wire = const_module->wire_at(i);
+			if (wire->port_output || wire->get_bool_attribute(ID::keep))
+				keep_wires.insert(ctx, wire);
+		}
+	});
+	subpool.run([&wire2driver_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		wire2driver_builder.process(ctx);
+	});
+	Wire2Drivers wire2driver(wire2driver_builder);
 
-		for (auto mem : mems)
-		for (auto c : mem2cells[mem])
-			if (unused.count(c))
-				queue.insert(c), unused.erase(c);
+	dict<std::string, pool<int>> mem2cells;
+	for (std::pair<std::string, int> &mem2cell : mem2cells_vector)
+		mem2cells[mem2cell.first].insert(mem2cell.second);
+
+	pool<SigBit> used_raw_bits;
+	int i = 0;
+	for (Wire *wire : keep_wires) {
+		for (auto bit : sigmap(wire)) {
+			const WireDrivers *drivers = wire2driver.find({{bit}, hash_bit(bit)});
+			if (drivers != nullptr)
+				for (int cell_index : *drivers)
+					if (unused[cell_index].exchange(false, std::memory_order_relaxed)) {
+						ThreadIndex fake_thread_index = {i++ % num_threads};
+						cell_queue.push(fake_thread_index, cell_index);
+					}
+		}
+		for (auto raw_bit : SigSpec(wire))
+			used_raw_bits.insert(raw_sigmap(raw_bit));
 	}
 
-	unused.sort(RTLIL::sort_by_name_id<RTLIL::Cell>());
+	std::vector<std::atomic<bool>> mem_unused(module->memories.size());
+	dict<std::string, int> mem_indices;
+	for (int i = 0; i < GetSize(module->memories); ++i) {
+		mem_indices[module->memories.element(i)->first.str()] = i;
+		mem_unused[i].store(true, std::memory_order_relaxed);
+	}
 
-	for (auto cell : unused) {
+	subpool.run([const_module, &sigmap, &wire2driver, &mem2cells, &unused, &cell_queue, &mem_indices, &mem_unused](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		pool<SigBit> bits;
+		pool<std::string> mems;
+		while (true) {
+			std::vector<int> cell_indices = cell_queue.pop_batch(ctx);
+			if (cell_indices.empty())
+				return;
+			for (auto cell_index : cell_indices) {
+				Cell *cell = const_module->cell_at(cell_index);
+				for (auto &it : cell->connections())
+					if (!ct_all.cell_known(cell->type) || ct_all.cell_input(cell->type, it.first))
+						for (auto bit : sigmap(it.second))
+							bits.insert(bit);
+
+				if (cell->type.in(ID($memrd), ID($memrd_v2))) {
+					std::string mem_id = cell->getParam(ID::MEMID).decode_string();
+					if (mem_indices.count(mem_id)) {
+						int mem_index = mem_indices[mem_id];
+						if (mem_unused[mem_index].exchange(false, std::memory_order_relaxed))
+							mems.insert(mem_id);
+					}
+				}
+			}
+
+			for (auto bit : bits) {
+				const WireDrivers *drivers = wire2driver.find({{bit}, hash_bit(bit)});
+				if (drivers != nullptr)
+					for (int cell_index : *drivers)
+						if (unused[cell_index].exchange(false, std::memory_order_relaxed))
+							cell_queue.push(ctx, cell_index);
+			}
+			bits.clear();
+
+			for (auto mem : mems) {
+				if (mem2cells.count(mem) == 0)
+					continue;
+				for (int cell_index : mem2cells.at(mem))
+					if (unused[cell_index].exchange(false, std::memory_order_relaxed))
+						cell_queue.push(ctx, cell_index);
+			}
+			mems.clear();
+		}
+	});
+
+	ShardedVector<int> sharded_unused_cells(subpool);
+	subpool.run([const_module, &unused, &sharded_unused_cells, &wire2driver](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Parallel destruction of `wire2driver`
+		wire2driver.clear(ctx);
+		for (int i : ctx.item_range(const_module->cells_size()))
+			if (unused[i].load(std::memory_order_relaxed))
+				sharded_unused_cells.insert(ctx, i);
+	});
+	pool<Cell*> unused_cells;
+	for (int cell_index : sharded_unused_cells)
+		unused_cells.insert(const_module->cell_at(cell_index));
+	unused_cells.sort(RTLIL::sort_by_name_id<RTLIL::Cell>());
+
+	for (auto cell : unused_cells) {
 		if (verbose)
 			log_debug("  removing unused `%s' cell `%s'.\n", cell->type, cell->name);
 		module->design->scratchpad_set_bool("opt.did_something", true);
@@ -292,28 +416,31 @@ void rmunused_module_cells(Module *module, bool verbose, RmStats &stats, keep_ca
 		stats.count_rm_cells++;
 	}
 
-	for (auto it : mem_unused)
-	{
+	for (const auto &it : mem_indices) {
+		if (!mem_unused[it.second].load(std::memory_order_relaxed))
+			continue;
+		RTLIL::IdString id(it.first);
 		if (verbose)
-			log_debug("  removing unused memory `%s'.\n", it);
-		delete module->memories.at(it);
-		module->memories.erase(it);
+			log_debug("  removing unused memory `%s'.\n", id.unescape());
+		delete module->memories.at(id);
+		module->memories.erase(id);
 	}
 
-	for (auto &it : module->cells_) {
-		Cell *cell = it.second;
-		for (auto &it2 : cell->connections()) {
-			if (ct_all.cell_known(cell->type) && !ct_all.cell_input(cell->type, it2.first))
-				continue;
-			for (auto raw_bit : raw_sigmap(it2.second))
-				used_raw_bits.insert(raw_bit);
+	if (!driver_driver_logs.empty()) {
+		// We could do this in parallel but hopefully this is rare.
+		for (auto &it : module->cells_) {
+			Cell *cell = it.second;
+			for (auto &it2 : cell->connections()) {
+				if (ct_all.cell_known(cell->type) && !ct_all.cell_input(cell->type, it2.first))
+					continue;
+				for (auto raw_bit : raw_sigmap(it2.second))
+					used_raw_bits.insert(raw_bit);
+			}
+		}
+		for (std::pair<SigBit, std::string> &it : driver_driver_logs) {
+			if (used_raw_bits.count(it.first))
+				log_warning("%s\n", it.second);
 		}
-	}
-
-	for (auto it : driver_driver_logs) {
-		if (used_raw_bits.count(it.first))
-			for (auto msg : it.second)
-				log_warning("%s\n", msg);
 	}
 }
 
@@ -762,7 +889,7 @@ void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_p
 	int num_worker_threads = ThreadPool::work_pool_size(0, module->cells_size(), 1000);
 	ParallelDispatchThreadPool::Subpool subpool(thread_pool, num_worker_threads);
 	remove_temporary_cells(module, subpool, verbose);
-	rmunused_module_cells(module, verbose, stats, keep_cache);
+	rmunused_module_cells(module, subpool, verbose, stats, keep_cache);
 	while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 
 	if (rminit && rmunused_module_init(module, subpool, verbose))

From 7f3b11e56b130bcefe712831bedb31cb0b01984a Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 5 Feb 2026 19:23:10 +0000
Subject: [PATCH 19/29] Add test that connects a wire with `init` to a constant

---
 tests/opt/opt_clean_init_const.ys | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 tests/opt/opt_clean_init_const.ys

diff --git a/tests/opt/opt_clean_init_const.ys b/tests/opt/opt_clean_init_const.ys
new file mode 100644
index 000000000..1b3d5db63
--- /dev/null
+++ b/tests/opt/opt_clean_init_const.ys
@@ -0,0 +1,9 @@
+read_rtlil << EOT
+module \top
+  attribute \init 1'0
+  wire \w
+
+  connect \w 1'0
+end
+EOT
+opt_clean

From 3603cd52a0b8a34bd13953b3d4db261117649b60 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 22:59:44 +0000
Subject: [PATCH 20/29] Pass the module `Subpool` to `rmunused_module_signals`
 and parallelize that function

---
 passes/opt/opt_clean.cc | 482 +++++++++++++++++++++++++++-------------
 1 file changed, 327 insertions(+), 155 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index ab7d78571..c0c6b2c53 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -454,9 +454,62 @@ int count_nontrivial_wire_attrs(RTLIL::Wire *w)
 	return count;
 }
 
+struct ShardedSigBit { // Element type of `ShardedSigPool`: a thin wrapper around a single `SigBit`.
+	using Accumulated = ShardedSigBit; // NOTE(review): presumably the stored-entry type that `ShardedHashtable` expects; confirm against its definition
+	RTLIL::SigBit bit;
+	ShardedSigBit() = default;
+	ShardedSigBit(const RTLIL::SigBit &bit) : bit(bit) {} // non-explicit: call sites in this file pass a plain `SigBit` as `{bit, hash}`
+};
+struct ShardedSigBitEquality { // Equality functor for `ShardedSigPool`: two entries are equal iff their wrapped bits are equal.
+	bool operator()(const ShardedSigBit &b1, const ShardedSigBit &b2) const {
+		return b1.bit == b2.bit;
+	}
+};
+using ShardedSigPool = ShardedHashtable<ShardedSigBit, ShardedSigBitEquality, SetCollisionHandler<ShardedSigBit>>; // used in this file as a set of signal bits: filled through `Builder::insert`, queried with `find`
+
+struct ShardedSigSpec { // Element type of `ShardedSigSpecPool`: a thin wrapper around one whole `SigSpec`.
+	using Accumulated = ShardedSigSpec; // NOTE(review): presumably the stored-entry type that `ShardedHashtable` expects; confirm against its definition
+	RTLIL::SigSpec spec;
+	ShardedSigSpec() = default;
+	ShardedSigSpec(RTLIL::SigSpec spec) : spec(std::move(spec)) {} // by-value sink parameter, so a caller can move its SigSpec in
+	ShardedSigSpec(ShardedSigSpec &&) = default; // user-declared move constructor: the implicit copy operations are deleted, so entries are move-only
+};
+struct ShardedSigSpecEquality { // Equality functor for `ShardedSigSpecPool`: two entries are equal iff their wrapped specs compare equal.
+	bool operator()(const ShardedSigSpec &s1, const ShardedSigSpec &s2) const {
+		return s1.spec == s2.spec;
+	}
+};
+using ShardedSigSpecPool = ShardedHashtable<ShardedSigSpec, ShardedSigSpecEquality, SetCollisionHandler<ShardedSigSpec>>; // used in this file as a set of SigSpecs: filled through `Builder::insert`, queried with `find`
+
+struct DirectWires { // Answers "is this wire direct?" on demand and memoizes the answer per wire.
+	const SigMap &assign_map;
+	const ShardedSigSpecPool &direct_sigs;
+	dict<RTLIL::Wire *, bool> cache; // memoized `is_direct` results for non-input wires; a plain `dict` with no locking in this struct
+
+	DirectWires(const SigMap &assign_map, const ShardedSigSpecPool &direct_sigs) : assign_map(assign_map), direct_sigs(direct_sigs) {}
+	void cache_result_for_bit(const SigBit &bit) { // Pre-populates the cache for the wire behind `bit`; constant bits (null wire) are ignored.
+		if (bit.wire != nullptr)
+			is_direct(bit.wire);
+	}
+	bool is_direct(RTLIL::Wire *wire) { // True if `wire` is an input port, or if `assign_map(wire)` is present in `direct_sigs`.
+		if (wire->port_input)
+			return true; // input ports short-circuit and are never written to the cache
+		auto it = cache.find(wire);
+		if (it != cache.end())
+			return it->second; // a cached answer is returned as-is, even if `assign_map` has changed since it was computed
+		SigSpec direct_sig = assign_map(wire);
+		bool direct = direct_sigs.find({direct_sig, direct_sig.hash_into(Hasher()).yield()}) != nullptr;
+		cache.insert({wire, direct});
+		return direct;
+	}
+};
+
 // Should we pick `s2` over `s1` to represent a signal?
-bool compare_signals(RTLIL::SigBit &s1, RTLIL::SigBit &s2, SigPool &regs, SigPool &conns, pool<RTLIL::Wire*> &direct_wires)
+bool compare_signals(const RTLIL::SigBit &s1, const RTLIL::SigBit &s2, const ShardedSigPool &regs, const ShardedSigPool &conns, DirectWires &direct_wires)
 {
+	if (s1 == s2)
+		return false;
+
 	RTLIL::Wire *w1 = s1.wire;
 	RTLIL::Wire *w2 = s2.wire;
 
@@ -470,12 +523,20 @@ bool compare_signals(RTLIL::SigBit &s1, RTLIL::SigBit &s2, SigPool &regs, SigPoo
 		return !(w2->port_input && w2->port_output);
 
 	if (w1->name.isPublic() && w2->name.isPublic()) {
-		if (regs.check(s1) != regs.check(s2))
-			return regs.check(s2);
-		if (direct_wires.count(w1) != direct_wires.count(w2))
-			return direct_wires.count(w2) != 0;
-		if (conns.check_any(s1) != conns.check_any(s2))
-			return conns.check_any(s2);
+		ShardedSigPool::AccumulatedValue s1_val = {s1, s1.hash_top().yield()};
+		ShardedSigPool::AccumulatedValue s2_val = {s2, s2.hash_top().yield()};
+		bool regs1 = regs.find(s1_val) != nullptr;
+		bool regs2 = regs.find(s2_val) != nullptr;
+		if (regs1 != regs2)
+			return regs2;
+		bool w1_direct = direct_wires.is_direct(w1);
+		bool w2_direct = direct_wires.is_direct(w2);
+		if (w1_direct != w2_direct)
+			return w2_direct;
+		bool conns1 = conns.find(s1_val) != nullptr;
+		bool conns2 = conns.find(s2_val) != nullptr;
+		if (conns1 != conns2)
+			return conns2;
 	}
 
 	if (w1 == w2)
@@ -508,109 +569,185 @@ bool check_public_name(RTLIL::IdString id)
 	return true;
 }
 
-bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbose, RmStats &stats)
-{
-	// `register_signals` and `connected_signals` will help us decide later on
-	// on picking representatives out of groups of connected signals
-	SigPool register_signals;
-	SigPool connected_signals;
-	if (!purge_mode)
-		for (auto &it : module->cells_) {
-			RTLIL::Cell *cell = it.second;
-			if (ct_reg(cell->type)) {
-				bool clk2fflogic = cell->get_bool_attribute(ID(clk2fflogic));
-				for (auto &it2 : cell->connections())
-					if (clk2fflogic ? it2.first == ID::D : ct_all.cell_output(cell->type, it2.first))
-						register_signals.add(it2.second);
-			}
-			for (auto &it2 : cell->connections())
-				connected_signals.add(it2.second);
-		}
+void add_spec(ShardedSigPool::Builder &builder, const ThreadIndex &thread, const RTLIL::SigSpec &spec) { // Inserts every wire bit of `spec` into `builder` on behalf of `thread`.
+	for (SigBit bit : spec)
+		if (bit.wire != nullptr) // constant bits (no wire) are never inserted
+			builder.insert(thread, {bit, bit.hash_top().yield()});
+}
 
+bool check_any(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) { // True iff at least one bit of `spec` is found in `sigs`; an empty `spec` yields false.
+	for (SigBit b : spec)
+		if (sigs.find({b, b.hash_top().yield()}) != nullptr) // constant bits are looked up too, not filtered out here
+			return true;
+	return false;
+}
+
+bool check_all(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) { // True iff every wire bit of `spec` is found in `sigs`; an empty `spec` yields true.
+	for (SigBit b : spec)
+		if (b.wire != nullptr && sigs.find({b, b.hash_top().yield()}) == nullptr) // constant bits are skipped: add_spec() never inserts them, so they could never be found
+			return false;
+	return true;
+}
+
+bool rmunused_module_signals(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool purge_mode, bool verbose, RmStats &stats)
+{
 	SigMap assign_map(module);
 
+	const RTLIL::Module *const_module = module;
+	// `register_signals` and `connected_signals` will help us decide later on
+	// on picking representatives out of groups of connected signals
+	ShardedSigPool::Builder register_signals_builder(subpool);
+	ShardedSigPool::Builder connected_signals_builder(subpool);
 	// construct a pool of wires which are directly driven by a known celltype,
 	// this will influence our choice of representatives
-	pool<RTLIL::Wire*> direct_wires;
-	{
-		pool<RTLIL::SigSpec> direct_sigs;
-		for (auto &it : module->cells_) {
-			RTLIL::Cell *cell = it.second;
+	ShardedSigSpecPool::Builder direct_sigs_builder(subpool);
+	subpool.run([const_module, purge_mode, &assign_map, &direct_sigs_builder, &register_signals_builder, &connected_signals_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			if (!purge_mode) {
+				if (ct_reg(cell->type)) {
+					bool clk2fflogic = cell->get_bool_attribute(ID(clk2fflogic));
+					for (auto &it2 : cell->connections())
+						if (clk2fflogic ? it2.first == ID::D : ct_all.cell_output(cell->type, it2.first))
+							add_spec(register_signals_builder, ctx, it2.second);
+				}
+				for (auto &it2 : cell->connections())
+					add_spec(connected_signals_builder, ctx, it2.second);
+			}
 			if (ct_all.cell_known(cell->type))
 				for (auto &it2 : cell->connections())
-					if (ct_all.cell_output(cell->type, it2.first))
-						direct_sigs.insert(assign_map(it2.second));
+					if (ct_all.cell_output(cell->type, it2.first)) {
+						RTLIL::SigSpec spec = assign_map(it2.second);
+						unsigned int hash = spec.hash_into(Hasher()).yield();
+						direct_sigs_builder.insert(ctx, {std::move(spec), hash});
+					}
 		}
-		for (auto &it : module->wires_) {
-			if (direct_sigs.count(assign_map(it.second)) || it.second->port_input)
-				direct_wires.insert(it.second);
-		}
-	}
+	});
+	subpool.run([&register_signals_builder, &connected_signals_builder, &direct_sigs_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		register_signals_builder.process(ctx);
+		connected_signals_builder.process(ctx);
+		direct_sigs_builder.process(ctx);
+	});
+	ShardedSigPool register_signals(register_signals_builder);
+	ShardedSigPool connected_signals(connected_signals_builder);
+	ShardedSigSpecPool direct_sigs(direct_sigs_builder);
 
-	// weight all options for representatives with `compare_signals`,
-	// the one that wins will be what `assign_map` maps to
-	for (auto &it : module->wires_) {
-		RTLIL::Wire *wire = it.second;
-		for (int i = 0; i < wire->width; i++) {
-			RTLIL::SigBit s1 = RTLIL::SigBit(wire, i), s2 = assign_map(s1);
-			if (compare_signals(s2, s1, register_signals, connected_signals, direct_wires))
-				assign_map.add(s1);
+	ShardedVector<RTLIL::SigBit> sigmap_canonical_candidates(subpool);
+	DirectWires direct_wires(assign_map, direct_sigs);
+	subpool.run([const_module, &assign_map, &register_signals, &connected_signals, &sigmap_canonical_candidates, &direct_sigs, &direct_wires](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		std::optional<DirectWires> local_direct_wires;
+		DirectWires *this_thread_direct_wires = &direct_wires;
+		if (ctx.thread_num > 0) {
+			local_direct_wires.emplace(assign_map, direct_sigs);
+			this_thread_direct_wires = &local_direct_wires.value();
 		}
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(i);
+			for (int j = 0; j < wire->width; ++j) {
+				RTLIL::SigBit s1(wire, j);
+				RTLIL::SigBit s2 = assign_map(s1);
+				if (compare_signals(s2, s1, register_signals, connected_signals, *this_thread_direct_wires))
+					sigmap_canonical_candidates.insert(ctx, s1);
+			}
+		}
+	});
+	// Cache all the direct_wires results that we might possibly need. This avoids the results
+	// changing when we update `assign_map` below.
+	for (RTLIL::SigBit candidate : sigmap_canonical_candidates) {
+		direct_wires.cache_result_for_bit(candidate);
+		direct_wires.cache_result_for_bit(assign_map(candidate));
+	}
+	for (RTLIL::SigBit candidate : sigmap_canonical_candidates) {
+		RTLIL::SigBit current_canonical = assign_map(candidate);
+		if (compare_signals(current_canonical, candidate, register_signals, connected_signals, direct_wires))
+			assign_map.add(candidate);
 	}
 
 	// we are removing all connections
 	module->connections_.clear();
 
 	// used signals sigmapped
-	SigPool used_signals;
+	ShardedSigPool::Builder used_signals_builder(subpool);
 	// used signals pre-sigmapped
-	SigPool raw_used_signals;
+	ShardedSigPool::Builder raw_used_signals_builder(subpool);
 	// used signals sigmapped, ignoring drivers (we keep track of this to set `unused_bits`)
-	SigPool used_signals_nodrivers;
-
-	// gather the usage information for cells
-	for (auto &it : module->cells_) {
-		RTLIL::Cell *cell = it.second;
-		for (auto &it2 : cell->connections_) {
-			assign_map.apply(it2.second); // modify the cell connection in place
-			raw_used_signals.add(it2.second);
-			used_signals.add(it2.second);
-			if (!ct_all.cell_output(cell->type, it2.first))
-				used_signals_nodrivers.add(it2.second);
-		}
-	}
-
-	// gather the usage information for ports, wires with `keep`,
+	ShardedSigPool::Builder used_signals_nodrivers_builder(subpool);
+	struct UpdateConnection {
+		RTLIL::Cell *cell;
+		RTLIL::IdString port;
+		RTLIL::SigSpec spec;
+	};
+	ShardedVector<UpdateConnection> update_connections(subpool);
+	ShardedVector<RTLIL::Wire*> initialized_wires(subpool);
+	// gather the usage information for cells and update cell connections
+	// also gather the usage information for ports, wires with `keep`
 	// also gather init bits
+	subpool.run([const_module, &register_signals, &connected_signals, &direct_sigs, &assign_map, &used_signals_builder, &raw_used_signals_builder, &used_signals_nodrivers_builder, &update_connections, &initialized_wires](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Parallel destruction of these sharded structures
+		register_signals.clear(ctx);
+		connected_signals.clear(ctx);
+		direct_sigs.clear(ctx);
+
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			for (const auto &it2 : cell->connections_) {
+				SigSpec spec = assign_map(it2.second);
+				if (spec != it2.second)
+					update_connections.insert(ctx, {cell, it2.first, spec});
+				add_spec(raw_used_signals_builder, ctx, spec);
+				add_spec(used_signals_builder, ctx, spec);
+				if (!ct_all.cell_output(cell->type, it2.first))
+					add_spec(used_signals_nodrivers_builder, ctx, spec);
+			}
+		}
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(i);
+			if (wire->port_id > 0) {
+				RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
+				add_spec(raw_used_signals_builder, ctx, sig);
+				assign_map.apply(sig);
+				add_spec(used_signals_builder, ctx, sig);
+				if (!wire->port_input)
+					add_spec(used_signals_nodrivers_builder, ctx, sig);
+			}
+			if (wire->get_bool_attribute(ID::keep)) {
+				RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
+				assign_map.apply(sig);
+				add_spec(used_signals_builder, ctx, sig);
+			}
+			auto it2 = wire->attributes.find(ID::init);
+			if (it2 != wire->attributes.end())
+				initialized_wires.insert(ctx, wire);
+		}
+	});
+	subpool.run([&used_signals_builder, &raw_used_signals_builder, &used_signals_nodrivers_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		used_signals_builder.process(ctx);
+		raw_used_signals_builder.process(ctx);
+		used_signals_nodrivers_builder.process(ctx);
+	});
+	ShardedSigPool used_signals(used_signals_builder);
+	ShardedSigPool raw_used_signals(raw_used_signals_builder);
+	ShardedSigPool used_signals_nodrivers(used_signals_nodrivers_builder);
+
 	dict<RTLIL::SigBit, RTLIL::State> init_bits;
-	for (auto &it : module->wires_) {
-		RTLIL::Wire *wire = it.second;
-		if (wire->port_id > 0) {
-			RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
-			raw_used_signals.add(sig);
-			assign_map.apply(sig);
-			used_signals.add(sig);
-			if (!wire->port_input)
-				used_signals_nodrivers.add(sig);
-		}
-		if (wire->get_bool_attribute(ID::keep)) {
-			RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
-			assign_map.apply(sig);
-			used_signals.add(sig);
-		}
-		auto it2 = wire->attributes.find(ID::init);
-		if (it2 != wire->attributes.end()) {
-			RTLIL::Const &val = it2->second;
-			SigSpec sig = assign_map(wire);
-			for (int i = 0; i < GetSize(val) && i < GetSize(sig); i++)
-				if (val[i] != State::Sx)
-					init_bits[sig[i]] = val[i];
-			wire->attributes.erase(it2);
-		}
+	// The wires that appear in the keys of `init_bits`
+	pool<Wire*> init_bits_wires;
+	for (const UpdateConnection &update : update_connections)
+		update.cell->connections_.at(update.port) = std::move(update.spec);
+	for (RTLIL::Wire *intialized_wire : initialized_wires) {
+		auto it = intialized_wire->attributes.find(ID::init);
+		RTLIL::Const &val = it->second;
+		SigSpec sig = assign_map(intialized_wire);
+		for (int i = 0; i < GetSize(val) && i < GetSize(sig); i++)
+			if (val[i] != State::Sx && sig[i].wire != nullptr) {
+				init_bits[sig[i]] = val[i];
+				init_bits_wires.insert(sig[i].wire);
+			}
+		intialized_wire->attributes.erase(it);
 	}
 
 	// set init attributes on all wires of a connected group
-	for (auto wire : module->wires()) {
+	for (RTLIL::Wire *wire : init_bits_wires) {
 		bool found = false;
 		Const val(State::Sx, wire->width);
 		for (int i = 0; i < wire->width; i++) {
@@ -625,81 +762,117 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos
 	}
 
 	// now decide for each wire if we should be deleting it
-	pool<RTLIL::Wire*> del_wires_queue;
-	for (auto wire : module->wires())
-	{
-		SigSpec s1 = SigSpec(wire), s2 = assign_map(s1);
-		log_assert(GetSize(s1) == GetSize(s2));
+	ShardedVector<RTLIL::Wire*> del_wires(subpool);
+	ShardedVector<RTLIL::Wire*> remove_init(subpool);
+	ShardedVector<std::pair<RTLIL::Wire*, RTLIL::Const>> set_init(subpool);
+	ShardedVector<RTLIL::SigSig> connections(subpool);
+	ShardedVector<RTLIL::Wire*> remove_unused_bits(subpool);
+	ShardedVector<std::pair<RTLIL::Wire*, RTLIL::Const>> set_unused_bits(subpool);
+	subpool.run([const_module, purge_mode, &assign_map, &used_signals, &raw_used_signals, &used_signals_nodrivers, &del_wires, &remove_init, &set_init, &connections, &remove_unused_bits, &set_unused_bits](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(i);
+			SigSpec s1 = SigSpec(wire), s2 = assign_map(s1);
+			log_assert(GetSize(s1) == GetSize(s2));
 
-		Const initval;
-		if (wire->attributes.count(ID::init))
-			initval = wire->attributes.at(ID::init);
-		if (GetSize(initval) != GetSize(wire))
-			initval.resize(GetSize(wire), State::Sx);
-		if (initval.is_fully_undef())
-			wire->attributes.erase(ID::init);
+			Const initval;
+			bool has_init_attribute = wire->attributes.count(ID::init);
+			bool init_changed = false;
+			if (has_init_attribute)
+				initval = wire->attributes.at(ID::init);
+			if (GetSize(initval) != GetSize(wire)) {
+				initval.resize(GetSize(wire), State::Sx);
+				init_changed = true;
+			}
 
-		if (GetSize(wire) == 0) {
-			// delete zero-width wires, unless they are module ports
-			if (wire->port_id == 0)
+			if (GetSize(wire) == 0) {
+				// delete zero-width wires, unless they are module ports
+				if (wire->port_id == 0)
+					goto delete_this_wire;
+			} else
+			if (wire->port_id != 0 || wire->get_bool_attribute(ID::keep) || !initval.is_fully_undef()) {
+				// do not delete anything with "keep" or module ports or initialized wires
+			} else
+			if (!purge_mode && check_public_name(wire->name) && (check_any(raw_used_signals, s1) || check_any(used_signals, s2) || s1 != s2)) {
+				// do not get rid of public names unless in purge mode or if the wire is entirely unused, not even aliased
+			} else
+			if (!check_any(raw_used_signals, s1)) {
+				// delete wires that aren't used by anything directly
 				goto delete_this_wire;
-		} else
-		if (wire->port_id != 0 || wire->get_bool_attribute(ID::keep) || !initval.is_fully_undef()) {
-			// do not delete anything with "keep" or module ports or initialized wires
-		} else
-		if (!purge_mode && check_public_name(wire->name) && (raw_used_signals.check_any(s1) || used_signals.check_any(s2) || s1 != s2)) {
-			// do not get rid of public names unless in purge mode or if the wire is entirely unused, not even aliased
-		} else
-		if (!raw_used_signals.check_any(s1)) {
-			// delete wires that aren't used by anything directly
-			goto delete_this_wire;
-		}
-
-		if (0)
-		{
-	delete_this_wire:
-			del_wires_queue.insert(wire);
-		}
-		else
-		{
-			RTLIL::SigSig new_conn;
-			for (int i = 0; i < GetSize(s1); i++)
-				if (s1[i] != s2[i]) {
-					if (s2[i] == State::Sx && (initval[i] == State::S0 || initval[i] == State::S1)) {
-						s2[i] = initval[i];
-						initval.set(i, State::Sx);
-					}
-					new_conn.first.append(s1[i]);
-					new_conn.second.append(s2[i]);
-				}
-			if (new_conn.first.size() > 0) {
-				if (initval.is_fully_undef())
-					wire->attributes.erase(ID::init);
-				else
-					wire->attributes.at(ID::init) = initval;
-				module->connect(new_conn);
 			}
 
-			if (!used_signals_nodrivers.check_all(s2)) {
+			if (0)
+			{
+		delete_this_wire:
+				del_wires.insert(ctx, wire);
+			}
+			else
+			{
+				RTLIL::SigSig new_conn;
+				for (int i = 0; i < GetSize(s1); i++)
+					if (s1[i] != s2[i]) {
+						if (s2[i] == State::Sx && (initval[i] == State::S0 || initval[i] == State::S1)) {
+							s2[i] = initval[i];
+							initval.set(i, State::Sx);
+							init_changed = true;
+						}
+						new_conn.first.append(s1[i]);
+						new_conn.second.append(s2[i]);
+					}
+				if (new_conn.first.size() > 0)
+					connections.insert(ctx, std::move(new_conn));
+				if (initval.is_fully_undef()) {
+					if (has_init_attribute)
+						remove_init.insert(ctx, wire);
+				} else
+					if (init_changed)
+						set_init.insert(ctx, {wire, std::move(initval)});
+
 				std::string unused_bits;
-				for (int i = 0; i < GetSize(s2); i++) {
-					if (s2[i].wire == NULL)
-						continue;
-					if (!used_signals_nodrivers.check(s2[i])) {
-						if (!unused_bits.empty())
-							unused_bits += " ";
-						unused_bits += stringf("%d", i);
+				if (!check_all(used_signals_nodrivers, s2)) {
+					for (int i = 0; i < GetSize(s2); i++) {
+						if (s2[i].wire == NULL)
+							continue;
+						SigBit b = s2[i];
+						if (used_signals_nodrivers.find({b, b.hash_top().yield()}) == nullptr) {
+							if (!unused_bits.empty())
+								unused_bits += " ";
+							unused_bits += stringf("%d", i);
+						}
 					}
 				}
-				if (unused_bits.empty() || wire->port_id != 0)
-					wire->attributes.erase(ID::unused_bits);
-				else
-					wire->attributes[ID::unused_bits] = RTLIL::Const(unused_bits);
-			} else {
-				wire->attributes.erase(ID::unused_bits);
+				if (unused_bits.empty() || wire->port_id != 0) {
+					if (wire->attributes.count(ID::unused_bits))
+						remove_unused_bits.insert(ctx, wire);
+				} else {
+					RTLIL::Const unused_bits_const(std::move(unused_bits));
+					if (wire->attributes.count(ID::unused_bits)) {
+						RTLIL::Const &unused_bits_attr = wire->attributes.at(ID::unused_bits);
+						if (unused_bits_attr != unused_bits_const)
+							set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)});
+					} else
+						set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)});
+				}
 			}
 		}
-	}
+	});
+	pool<RTLIL::Wire*> del_wires_queue;
+	del_wires_queue.insert(del_wires.begin(), del_wires.end());
+	for (RTLIL::Wire *wire : remove_init)
+		wire->attributes.erase(ID::init);
+	for (auto &p : set_init)
+		p.first->attributes[ID::init] = std::move(p.second);
+	for (auto &conn : connections)
+		module->connect(std::move(conn));
+	for (RTLIL::Wire *wire : remove_unused_bits)
+		wire->attributes.erase(ID::unused_bits);
+	for (auto &p : set_unused_bits)
+		p.first->attributes[ID::unused_bits] = std::move(p.second);
+
+	subpool.run([&used_signals, &raw_used_signals, &used_signals_nodrivers](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		used_signals.clear(ctx);
+		raw_used_signals.clear(ctx);
+		used_signals_nodrivers.clear(ctx);
+	});
 
 	int del_temp_wires_count = 0;
 	for (auto wire : del_wires_queue) {
@@ -890,12 +1063,11 @@ void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_p
 	ParallelDispatchThreadPool::Subpool subpool(thread_pool, num_worker_threads);
 	remove_temporary_cells(module, subpool, verbose);
 	rmunused_module_cells(module, subpool, verbose, stats, keep_cache);
-	while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
+	while (rmunused_module_signals(module, subpool, purge_mode, verbose, stats)) { }
 
 	if (rminit && rmunused_module_init(module, subpool, verbose))
-		while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
+		while (rmunused_module_signals(module, subpool, purge_mode, verbose, stats)) { }
 }
-
 struct OptCleanPass : public Pass {
 	OptCleanPass() : Pass("opt_clean", "remove unused cells and wires") { }
 	void help() override

From ac55935a685f8aa66e06154ac426cec479b4728d Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Mon, 2 Feb 2026 23:04:34 +0000
Subject: [PATCH 21/29] Add unit-tests for `ParallelDispatchThreadPool` and friends

---
 kernel/utils.h                     |   3 +
 tests/unit/Makefile                |   4 +-
 tests/unit/kernel/threadingTest.cc | 164 +++++++++++++++++++++++++++++
 3 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit/kernel/threadingTest.cc

diff --git a/kernel/utils.h b/kernel/utils.h
index 68e00153f..46a196789 100644
--- a/kernel/utils.h
+++ b/kernel/utils.h
@@ -312,6 +312,9 @@ struct IntRange {
 	};
 	Int begin() const { return {start_}; }
 	Int end() const { return {end_}; }
+
+	bool operator==(const IntRange &other) const { return start_ == other.start_ && end_ == other.end_; } // equal iff both `start_` and `end_` match
+	bool operator!=(const IntRange &other) const { return !(*this == other); } // defined as the negation of operator==
 };
 
 YOSYS_NAMESPACE_END
diff --git a/tests/unit/Makefile b/tests/unit/Makefile
index e8f76cba9..88f449bf8 100644
--- a/tests/unit/Makefile
+++ b/tests/unit/Makefile
@@ -4,10 +4,10 @@ UNAME_S := $(shell uname -s)
 GTEST_PREFIX := $(shell brew --prefix googletest 2>/dev/null)
 ifeq ($(GTEST_PREFIX),)
   GTEST_CXXFLAGS :=
-  GTEST_LDFLAGS := -lgtest -lgtest_main
+  GTEST_LDFLAGS := -lgtest -lgmock -lgtest_main
 else
   GTEST_CXXFLAGS := -I$(GTEST_PREFIX)/include
-  GTEST_LDFLAGS := -L$(GTEST_PREFIX)/lib -lgtest -lgtest_main
+  GTEST_LDFLAGS := -L$(GTEST_PREFIX)/lib -lgtest -lgmock -lgtest_main
 endif
 
 ifeq ($(UNAME_S),Darwin)
diff --git a/tests/unit/kernel/threadingTest.cc b/tests/unit/kernel/threadingTest.cc
new file mode 100644
index 000000000..b26a08fcc
--- /dev/null
+++ b/tests/unit/kernel/threadingTest.cc
@@ -0,0 +1,164 @@
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include "kernel/threading.h"
+
+YOSYS_NAMESPACE_BEGIN
+
+class ThreadingTest : public testing::Test { // Shared fixture for the threading unit tests in this file.
+protected:
+	ThreadingTest() {
+		if (log_files.empty()) // register stdout as a log sink when none is set; presumably so code under test can log — confirm
+			log_files.emplace_back(stdout);
+	}
+};
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolCreate) { // Checks num_threads() for requested pool sizes 0, 1 and 2.
+	// Test creating a pool with 0 threads (treated as 1)
+	ParallelDispatchThreadPool pool0(0);
+	EXPECT_EQ(pool0.num_threads(), 1);
+
+	// Test creating a pool with 1 thread
+	ParallelDispatchThreadPool pool1(1);
+	EXPECT_EQ(pool1.num_threads(), 1);
+
+	// Test creating a pool with 2 threads
+	ParallelDispatchThreadPool pool2(2);
+	// YOSYS_MAX_THREADS or system configuration could mean we
+	// decide to only use one thread.
+	EXPECT_GE(pool2.num_threads(), 1);
+	EXPECT_LE(pool2.num_threads(), 2);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunSimple) { // Expects one run() call to invoke the callback exactly num_threads() times.
+	ParallelDispatchThreadPool pool(2);
+
+	std::atomic<int> counter{0};
+	pool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) {
+		counter.fetch_add(1, std::memory_order_relaxed);
+	});
+
+	EXPECT_EQ(counter.load(), pool.num_threads());
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunMultiple) { // Expects five consecutive run() calls to total num_threads() * 5 callback invocations.
+	ParallelDispatchThreadPool pool(2);
+
+	std::atomic<int> counter{0};
+	// Run multiple times to verify the pool can be reused
+	for (int i = 0; i < 5; ++i)
+		pool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) {
+			counter.fetch_add(1, std::memory_order_relaxed);
+		});
+
+	EXPECT_EQ(counter.load(), pool.num_threads() * 5);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunCtxThreadNums) { // Expects the callbacks of one run() to see each thread_num in [0, num_threads()).
+	ParallelDispatchThreadPool pool(4);
+
+	std::vector<int> thread_nums(pool.num_threads(), -1);
+	pool.run([&thread_nums](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		thread_nums[ctx.thread_num] = ctx.thread_num; // each callback writes only the slot indexed by its own thread_num
+	});
+
+	// Every thread should have recorded its own thread number
+	for (int i = 0; i < pool.num_threads(); ++i)
+		EXPECT_EQ(thread_nums[i], i);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolItemRange) { // Expects the per-thread item_range() slices to cover 100 items with no gaps and no overlap.
+	ParallelDispatchThreadPool pool(3);
+
+	const int num_items = 100;
+	std::vector<std::atomic<int>> item_counts(num_items);
+	for (std::atomic<int> &c : item_counts)
+		c.store(0);
+
+	pool.run([&item_counts](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(num_items)) // `num_items` is not in the capture list; it is a constant-initialized const int
+			item_counts[i].fetch_add(1);
+	});
+
+	// Each item should have been processed exactly once
+	for (int i = 0; i < num_items; ++i)
+		EXPECT_EQ(item_counts[i].load(), 1);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolSubpool) { // Expects a Subpool to respect its thread limit and to run the callback once per subpool thread.
+	ParallelDispatchThreadPool pool(4);
+
+	// Subpool limited to 2 threads
+	ParallelDispatchThreadPool::Subpool subpool(pool, 2);
+	EXPECT_LE(subpool.num_threads(), 2);
+
+	std::atomic<int> counter{0};
+	subpool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) {
+		counter.fetch_add(1, std::memory_order_relaxed);
+	});
+
+	EXPECT_EQ(counter.load(), subpool.num_threads());
+}
+
+TEST_F(ThreadingTest, IntRangeIteration) { // Expects IntRange{3, 7} to iterate 3, 4, 5, 6: start included, end excluded.
+	IntRange range{3, 7};
+	std::vector<int> values;
+	for (int i : range)
+		values.push_back(i);
+	EXPECT_THAT(values, testing::ElementsAre(3, 4, 5, 6));
+}
+
+TEST_F(ThreadingTest, IntRangeEmpty) { // Expects a range whose start equals its end to yield no iterations.
+	IntRange range{5, 5};
+	for ([[maybe_unused]] int _ : range) // the loop variable is never read; the attribute avoids an unused-variable warning
+		FAIL();
+}
+
+TEST_F(ThreadingTest, ItemRangeForWorker) { // Expects 10 items over 3 workers to split into contiguous slices of 4, 3 and 3 items.
+	EXPECT_EQ(item_range_for_worker(10, 0, 3), (IntRange{0, 4}));
+	EXPECT_EQ(item_range_for_worker(10, 1, 3), (IntRange{4, 7}));
+	EXPECT_EQ(item_range_for_worker(10, 2, 3), (IntRange{7, 10}));
+}
+
+TEST_F(ThreadingTest, ItemRangeForWorkerZeroThreads) { // Expects a thread count of 0 to give worker 0 the whole range.
+	EXPECT_EQ(item_range_for_worker(10, 0, 0), (IntRange{0, 10}));
+}
+
+TEST_F(ThreadingTest, ShardedVectorBasic) { // Expects iteration to yield thread 0's elements before thread 1's, each in insertion order.
+	ParallelDispatchThreadPool pool(2);
+	ShardedVector<int> vec(pool);
+	pool.run([&vec](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		vec.insert(ctx, ctx.thread_num * 10);
+		vec.insert(ctx, ctx.thread_num * 10 + 1);
+	});
+
+	EXPECT_FALSE(vec.empty());
+
+	// Count elements
+	std::vector<int> elements;
+	for (int v : vec) {
+		elements.push_back(v);
+	}
+
+	if (pool.num_threads() == 2) // the pool may have fewer threads than requested, so both outcomes are accepted
+		EXPECT_THAT(elements, testing::ElementsAre(0, 1, 10, 11));
+	else
+		EXPECT_THAT(elements, testing::ElementsAre(0, 1));
+}
+
+TEST_F(ThreadingTest, MonotonicFlagBasic) { // Expects a new flag to load false, and to load true after set(), including after a second set().
+	MonotonicFlag flag;
+	EXPECT_FALSE(flag.load());
+	flag.set();
+	EXPECT_TRUE(flag.load());
+	flag.set();
+	EXPECT_TRUE(flag.load());
+}
+
+TEST_F(ThreadingTest, MonotonicFlagSetAndReturnOld) { // Expects set_and_return_old() to return false on the first call and true afterwards.
+	MonotonicFlag flag;
+	EXPECT_FALSE(flag.set_and_return_old());
+	EXPECT_TRUE(flag.load());
+	EXPECT_TRUE(flag.set_and_return_old());
+}
+
+YOSYS_NAMESPACE_END

From 3910d569da21cd92208df54314ef7df3d466dcc2 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Mon, 2 Feb 2026 23:16:20 +0000
Subject: [PATCH 22/29] Add unit tests for `ConcurrentQueue` and `ThreadPool`

---
 tests/unit/kernel/threadingTest.cc | 75 ++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/tests/unit/kernel/threadingTest.cc b/tests/unit/kernel/threadingTest.cc
index b26a08fcc..a3c19edf7 100644
--- a/tests/unit/kernel/threadingTest.cc
+++ b/tests/unit/kernel/threadingTest.cc
@@ -161,4 +161,79 @@ TEST_F(ThreadingTest, MonotonicFlagSetAndReturnOld) {
 	EXPECT_TRUE(flag.set_and_return_old());
 }
 
+TEST_F(ThreadingTest, ConcurrentQueueBasic) {
+	ConcurrentQueue<int> queue;
+	queue.push_back(1);
+	queue.push_back(2);
+	queue.push_back(3);
+
+	auto v1 = queue.pop_front();
+	auto v2 = queue.pop_front();
+	auto v3 = queue.pop_front();
+
+	ASSERT_TRUE(v1.has_value());
+	ASSERT_TRUE(v2.has_value());
+	ASSERT_TRUE(v3.has_value());
+	EXPECT_EQ(*v1, 1);
+	EXPECT_EQ(*v2, 2);
+	EXPECT_EQ(*v3, 3);
+}
+
+TEST_F(ThreadingTest, ConcurrentQueueTryPopEmpty) {
+	ConcurrentQueue<int> queue;
+	auto v = queue.try_pop_front();
+	EXPECT_FALSE(v.has_value());
+}
+
+TEST_F(ThreadingTest, ConcurrentQueueClose) {
+	ConcurrentQueue<int> queue;
+	queue.push_back(42);
+	queue.close();
+
+	// Can still pop existing elements
+	auto v1 = queue.pop_front();
+	ASSERT_TRUE(v1.has_value());
+	EXPECT_EQ(*v1, 42);
+
+	// After close and empty, pop_front returns nullopt
+	auto v2 = queue.pop_front();
+	EXPECT_FALSE(v2.has_value());
+}
+
+TEST_F(ThreadingTest, ThreadPoolCreate) {
+	// pool_size of 0 means no worker threads
+	ThreadPool pool0(0, [](int) {});
+	EXPECT_EQ(pool0.num_threads(), 0);
+
+	// pool_size of 1 means 1 worker thread
+	std::atomic<int> counter{0};
+	{
+		ThreadPool pool1(1, [&counter](int thread_num) {
+			EXPECT_EQ(thread_num, 0);
+			counter.fetch_add(1);
+		});
+	}
+#ifdef YOSYS_ENABLE_THREADS
+	EXPECT_EQ(counter.load(), 1);
+#else
+	EXPECT_EQ(counter.load(), 0);
+#endif
+}
+
+TEST_F(ThreadingTest, ThreadPoolMultipleThreads) {
+	std::atomic<int> counter{0};
+	{
+		ThreadPool pool(2, [&counter](int) {
+			counter.fetch_add(1);
+		});
+		EXPECT_LE(pool.num_threads(), 2);
+	}
+#ifdef YOSYS_ENABLE_THREADS
+	EXPECT_GE(counter.load(), 1);
+	EXPECT_LE(counter.load(), 2);
+#else
+	EXPECT_EQ(counter.load(), 0);
+#endif
+}
+
 YOSYS_NAMESPACE_END

From 1e96328ede8fb5f98600e30786d02613ae9acf84 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Mon, 2 Feb 2026 23:36:41 +0000
Subject: [PATCH 23/29] Add some tests for `ShardedHashtable`

---
 tests/unit/kernel/threadingTest.cc | 133 +++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/tests/unit/kernel/threadingTest.cc b/tests/unit/kernel/threadingTest.cc
index a3c19edf7..3de71389f 100644
--- a/tests/unit/kernel/threadingTest.cc
+++ b/tests/unit/kernel/threadingTest.cc
@@ -236,4 +236,137 @@ TEST_F(ThreadingTest, ThreadPoolMultipleThreads) {
 #endif
 }
 
+// Helper types for ShardedHashtable tests
+struct IntValue {
+	using Accumulated = IntValue;
+	int value;
+	operator int() const { return value; }
+};
+
+struct IntValueEquality {
+	bool operator()(int a, int b) const { return a == b; }
+};
+
+TEST_F(ThreadingTest, ShardedHashtableBasic) {
+	ParallelDispatchThreadPool pool(1);
+
+	using HashSet = ShardedHashtable<IntValue, IntValueEquality, SetCollisionHandler<IntValue>>;
+	HashSet::Builder builder(pool);
+
+	// Insert some values
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.insert(ctx, {{10}, 10});
+		builder.insert(ctx, {{20}, 20});
+		builder.insert(ctx, {{30}, 30});
+	});
+
+	// Process
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	// Build and lookup
+	HashSet set(builder);
+	const IntValue *found10 = set.find({{10}, 10});
+	const IntValue *found20 = set.find({{20}, 20});
+	const IntValue *found99 = set.find({{99}, 99});
+
+	ASSERT_NE(found10, nullptr);
+	ASSERT_NE(found20, nullptr);
+	EXPECT_EQ(found99, nullptr);
+	EXPECT_EQ(*found10, 10);
+	EXPECT_EQ(*found20, 20);
+}
+
+TEST_F(ThreadingTest, ShardedHashtableParallelInsert) {
+	ParallelDispatchThreadPool pool(3);
+
+	using HashSet = ShardedHashtable<IntValue, IntValueEquality, SetCollisionHandler<IntValue>>;
+	HashSet::Builder builder(pool);
+
+	// Insert values from multiple threads
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i = 0; i < 10; ++i) {
+			int val = ctx.thread_num * 100 + i;
+			builder.insert(ctx, {{val}, static_cast<unsigned>(val)});
+		}
+	});
+
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	HashSet set(builder);
+
+	// Verify all values can be found
+	for (int t = 0; t < pool.num_threads(); ++t) {
+		for (int i = 0; i < 10; ++i) {
+			int val = t * 100 + i;
+			const IntValue *found = set.find({{val}, static_cast<unsigned>(val)});
+			ASSERT_NE(found, nullptr) << "Value " << val << " not found";
+			EXPECT_EQ(*found, val);
+		}
+	}
+}
+
+// Helper types for the ShardedHashtable collision test: dict-style key/value entries
+struct IntDictValue {
+	using Accumulated = IntDictValue;
+	int key;
+	int value;
+	bool operator==(const IntDictValue &other) const { return key == other.key && value == other.value; }
+	bool operator!=(const IntDictValue &other) const { return !(*this == other); }
+};
+
+struct IntDictKeyEquality {
+	bool operator()(const IntDictValue &a, const IntDictValue &b) const { return a.key == b.key; }
+};
+
+// Collision handler that sums values
+struct SumCollisionHandler {
+	void operator()(IntDictValue &existing, IntDictValue &incoming) const {
+		existing.value += incoming.value;
+	}
+};
+
+TEST_F(ThreadingTest, ShardedHashtableCollision) {
+	ParallelDispatchThreadPool pool(1);
+
+	using HashSet = ShardedHashtable<IntDictValue, IntDictKeyEquality, SumCollisionHandler>;
+	HashSet::Builder builder(pool);
+
+	// Insert three entries with the same key and hash - they should collapse into one entry
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.insert(ctx, {{5, 10}, 5});
+		builder.insert(ctx, {{5, 12}, 5});  // Duplicate key/hash, different value
+		builder.insert(ctx, {{5, 14}, 5});  // Another duplicate key/hash
+	});
+
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	HashSet set(builder);
+	const IntDictValue *found = set.find({{5, 0}, 5});
+	ASSERT_NE(found, nullptr);
+	// SumCollisionHandler adds up the values of colliding entries: 10 + 12 + 14 = 36
+	EXPECT_EQ(*found, (IntDictValue{5, 36}));
+}
+
+TEST_F(ThreadingTest, ShardedHashtableEmpty) {
+	ParallelDispatchThreadPool pool(1);
+
+	using HashSet = ShardedHashtable<IntValue, IntValueEquality, SetCollisionHandler<IntValue>>;
+	HashSet::Builder builder(pool);
+
+	// Don't insert anything, just process
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	HashSet set(builder);
+	const IntValue *found = set.find({{42}, 42});
+	EXPECT_EQ(found, nullptr);
+}
+
 YOSYS_NAMESPACE_END

From cdfc586f18eedfe772335af64d07aef9adbb005c Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Tue, 3 Feb 2026 00:02:00 +0000
Subject: [PATCH 24/29] Add unit tests for `ConcurrentWorkQueue`

---
 tests/unit/kernel/threadingTest.cc | 70 ++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/tests/unit/kernel/threadingTest.cc b/tests/unit/kernel/threadingTest.cc
index 3de71389f..cbab4d118 100644
--- a/tests/unit/kernel/threadingTest.cc
+++ b/tests/unit/kernel/threadingTest.cc
@@ -369,4 +369,74 @@ TEST_F(ThreadingTest, ShardedHashtableEmpty) {
 	EXPECT_EQ(found, nullptr);
 }
 
+TEST_F(ThreadingTest, ConcurrentWorkQueueSingleThread) {
+	ConcurrentWorkQueue<int> queue(1, 10);  // 1 thread, batch size 10
+	EXPECT_EQ(queue.num_threads(), 1);
+
+	ThreadIndex thread{0};
+
+	// Push some items (less than batch size)
+	for (int i = 0; i < 5; ++i)
+		queue.push(thread, i);
+
+	// Pop should return those items
+	std::vector<int> batch = queue.pop_batch(thread);
+	EXPECT_THAT(batch, testing::UnorderedElementsAre(0, 1, 2, 3, 4));
+
+	// Next pop should return empty (all threads "waiting")
+	std::vector<int> empty_batch = queue.pop_batch(thread);
+	EXPECT_TRUE(empty_batch.empty());
+}
+
+TEST_F(ThreadingTest, ConcurrentWorkQueueBatching) {
+	ConcurrentWorkQueue<int> queue(1, 3);  // batch size 3
+	ThreadIndex thread{0};
+
+	queue.push(thread, 10);
+	queue.push(thread, 20);
+	queue.push(thread, 30);
+	queue.push(thread, 40);
+	queue.push(thread, 50);
+
+	std::vector<int> popped;
+	while (true) {
+		std::vector<int> batch = queue.pop_batch(thread);
+		if (batch.empty())
+			break;
+		popped.insert(popped.end(), batch.begin(), batch.end());
+	}
+	EXPECT_THAT(popped, testing::UnorderedElementsAre(10, 20, 30, 40, 50));
+}
+
+TEST_F(ThreadingTest, ConcurrentWorkQueueParallel) {
+	ParallelDispatchThreadPool pool(2);
+	if (pool.num_threads() < 2) {
+		// Skip test if we don't have multiple threads
+		return;
+	}
+
+	ConcurrentWorkQueue<int> queue(2, 3);
+	std::atomic<int> sum{0};
+
+	pool.run([&queue, &sum](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Each thread pushes some work
+		for (int i = 0; i < 10; ++i)
+			queue.push(ctx, ctx.thread_num * 100 + i);
+
+		// Each thread processes work until done
+		while (true) {
+			std::vector<int> batch = queue.pop_batch(ctx);
+			if (batch.empty())
+				break;
+			for (int v : batch)
+				sum.fetch_add(v);
+		}
+	});
+
+	// Thread 0 pushes: 0+1+2+...+9 = 45
+	// Thread 1 pushes: 100+101+...+109 = 1045
+	// Total = 45 + 1045 = 1090
+	EXPECT_EQ(sum.load(), 1090);
+}
+
 YOSYS_NAMESPACE_END

From 1260fda83a4f54ee5853af2610f5648e82d40aa5 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 5 Feb 2026 18:20:31 +0000
Subject: [PATCH 25/29] Add 'init' attributes to RTLIL fuzzing

---
 tests/tools/rtlil-fuzz-grammar.json | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/tools/rtlil-fuzz-grammar.json b/tests/tools/rtlil-fuzz-grammar.json
index c27b160f4..96af9bde3 100644
--- a/tests/tools/rtlil-fuzz-grammar.json
+++ b/tests/tools/rtlil-fuzz-grammar.json
@@ -8,7 +8,7 @@
 			"end\n"
 		]
 	],
-	"<WIRE>": [ [ "  wire width ", "<WIDTH>", " ", "<WIRE_MODE>", " ", "<WIRE_ID>", "\n" ] ],
+	"<WIRE>": [ [ "<WIRE_ATTRIBUTES>", "  wire width ", "<WIDTH>", " ", "<WIRE_MODE>", " ", "<WIRE_ID>", "\n" ] ],
 	"<WIDTH>": [ [ "1" ], [ "2" ], [ "3" ], [ "4" ], [ "32" ], [ "128" ] ],
 	"<WIRE_MODE>": [ [ "input ", "<PORT_ID>" ], [ "output ", "<PORT_ID>" ], [ "inout ", "<PORT_ID>" ], [] ],
 	"<CELL>": [
@@ -71,6 +71,7 @@
 			"  end\n"
 		]
 	],
+	"<WIRE_ATTRIBUTE>": [ [ "  attribute \\init ", "<CONST>", "\n" ] ],
 	"<WIRE_ID>": [ [ "\\wire_a" ], [ "\\wire_b" ], [ "\\wire_c" ], [ "\\wire_d" ], [ "\\wire_e" ], [ "\\wire_f" ], [ "\\wire_g" ], [ "\\wire_h" ], [ "\\wire_i" ], [ "\\wire_j" ] ],
 	"<CELL_ID>": [ [ "\\cell_a" ], [ "\\cell_b" ], [ "\\cell_c" ], [ "\\cell_d" ], [ "\\cell_e" ], [ "\\cell_f" ], [ "\\cell_g" ], [ "\\cell_h" ], [ "\\cell_i" ], [ "\\cell_j" ] ],
 	"<BLACKBOX_CELL>": [ [ "\\bb1" ], [ "\\bb2" ] ],
@@ -97,6 +98,7 @@
 	"<CONNECT>": [ [ "  connect ", "<SIGSPEC>", " ", "<SIGSPEC>", "\n" ] ],
 
 	"<WIRES>": [ [ ], [ "<WIRE>", "<WIRES>" ] ],
+	"<WIRE_ATTRIBUTES>": [ [ ], [ "<WIRE_ATTRIBUTE>", "<WIRE_ATTRIBUTES>" ] ],
 	"<CELLS>": [ [ ], [ "<CELL>", "<CELLS>" ] ],
 	"<BITS>": [ [ ], [ "<BIT>", "<BITS>" ] ],
 	"<CONNECTS>": [ [ ], [ "<CONNECT>", "<CONNECTS>" ] ],

From 70cc2d67fd5f579678772c90eb2250c47460f2ec Mon Sep 17 00:00:00 2001
From: "Emil J. Tywoniak" <emil@tywoniak.eu>
Date: Mon, 16 Feb 2026 10:57:45 +0100
Subject: [PATCH 26/29] opt_clean: refactor

---
 .github/actions/setup-build-env/action.yml |    2 +-
 passes/opt/Makefile.inc                    |    3 +-
 passes/opt/opt_clean.cc                    | 1193 --------------------
 passes/opt/opt_clean/Makefile.inc          |   10 +
 passes/opt/opt_clean/cells_all.cc          |  373 ++++++
 passes/opt/opt_clean/cells_temp.cc         |  104 ++
 passes/opt/opt_clean/inits.cc              |  137 +++
 passes/opt/opt_clean/keep_cache.h          |  167 +++
 passes/opt/opt_clean/opt_clean.cc          |  152 +++
 passes/opt/opt_clean/opt_clean.h           |  103 ++
 passes/opt/opt_clean/wires.cc              |  585 ++++++++++
 11 files changed, 1634 insertions(+), 1195 deletions(-)
 delete mode 100644 passes/opt/opt_clean.cc
 create mode 100644 passes/opt/opt_clean/Makefile.inc
 create mode 100644 passes/opt/opt_clean/cells_all.cc
 create mode 100644 passes/opt/opt_clean/cells_temp.cc
 create mode 100644 passes/opt/opt_clean/inits.cc
 create mode 100644 passes/opt/opt_clean/keep_cache.h
 create mode 100644 passes/opt/opt_clean/opt_clean.cc
 create mode 100644 passes/opt/opt_clean/opt_clean.h
 create mode 100644 passes/opt/opt_clean/wires.cc

diff --git a/.github/actions/setup-build-env/action.yml b/.github/actions/setup-build-env/action.yml
index c9dc5fc22..c1d3e2b01 100644
--- a/.github/actions/setup-build-env/action.yml
+++ b/.github/actions/setup-build-env/action.yml
@@ -42,7 +42,7 @@ runs:
       if: runner.os == 'Linux' && inputs.get-build-deps == 'true'
       uses: awalsh128/cache-apt-pkgs-action@v1.6.0
       with:
-        packages: bison clang flex libffi-dev libfl-dev libreadline-dev pkg-config tcl-dev zlib1g-dev libgtest-dev
+        packages: bison clang flex libffi-dev libfl-dev libreadline-dev pkg-config tcl-dev zlib1g-dev libgtest-dev libgmock-dev
         version: ${{ inputs.runs-on }}-buildys
 
     - name: Linux docs dependencies
diff --git a/passes/opt/Makefile.inc b/passes/opt/Makefile.inc
index 5dee824ff..e7b62fc6a 100644
--- a/passes/opt/Makefile.inc
+++ b/passes/opt/Makefile.inc
@@ -9,7 +9,6 @@ OBJS += passes/opt/opt_muxtree.o
 OBJS += passes/opt/opt_reduce.o
 OBJS += passes/opt/opt_dff.o
 OBJS += passes/opt/opt_share.o
-OBJS += passes/opt/opt_clean.o
 OBJS += passes/opt/opt_expr.o
 OBJS += passes/opt/opt_hier.o
 
@@ -40,3 +39,5 @@ PEEPOPT_PATTERN += passes/opt/peepopt_formal_clockgateff.pmg
 passes/opt/peepopt_pm.h: passes/pmgen/pmgen.py $(PEEPOPT_PATTERN)
 	$(P) mkdir -p $(dir $@) && $(PYTHON_EXECUTABLE) $< -o $@ -p peepopt $(filter-out $<,$^)
 endif
+
+include $(YOSYS_SRC)/passes/opt/opt_clean/Makefile.inc
\ No newline at end of file
diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
deleted file mode 100644
index c0c6b2c53..000000000
--- a/passes/opt/opt_clean.cc
+++ /dev/null
@@ -1,1193 +0,0 @@
-/*
- *  yosys -- Yosys Open SYnthesis Suite
- *
- *  Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
- *
- *  Permission to use, copy, modify, and/or distribute this software for any
- *  purpose with or without fee is hereby granted, provided that the above
- *  copyright notice and this permission notice appear in all copies.
- *
- *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- */
-
-#include "kernel/register.h"
-#include "kernel/sigtools.h"
-#include "kernel/log.h"
-#include "kernel/celltypes.h"
-#include "kernel/newcelltypes.h"
-#include "kernel/ffinit.h"
-#include "kernel/threading.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include <set>
-
-USING_YOSYS_NAMESPACE
-PRIVATE_NAMESPACE_BEGIN
-
-using RTLIL::id2cstr;
-
-struct keep_cache_t
-{
-	dict<Module*, bool> keep_modules;
-	bool purge_mode;
-
-	keep_cache_t(bool purge_mode, ParallelDispatchThreadPool &thread_pool, const std::vector<RTLIL::Module *> &selected_modules)
-			: purge_mode(purge_mode) {
-
-		std::vector<RTLIL::Module *> scan_modules_worklist;
-		dict<RTLIL::Module *, std::vector<RTLIL::Module*>> dependents;
-		std::vector<RTLIL::Module *> propagate_kept_modules_worklist;
-		for (RTLIL::Module *module : selected_modules) {
-			if (keep_modules.count(module))
-				continue;
-			bool keep = scan_module(module, thread_pool, dependents, ALL_CELLS, scan_modules_worklist);
-			keep_modules[module] = keep;
-			if (keep)
-				propagate_kept_modules_worklist.push_back(module);
-		}
-
-		while (!scan_modules_worklist.empty()) {
-			RTLIL::Module *module = scan_modules_worklist.back();
-			scan_modules_worklist.pop_back();
-			if (keep_modules.count(module))
-				continue;
-			bool keep = scan_module(module, thread_pool, dependents, MINIMUM_CELLS, scan_modules_worklist);
-			keep_modules[module] = keep;
-			if (keep)
-				propagate_kept_modules_worklist.push_back(module);
-		}
-
-		while (!propagate_kept_modules_worklist.empty()) {
-			RTLIL::Module *module = propagate_kept_modules_worklist.back();
-			propagate_kept_modules_worklist.pop_back();
-			for (RTLIL::Module *dependent : dependents[module]) {
-				if (keep_modules[dependent])
-					continue;
-				keep_modules[dependent] = true;
-				propagate_kept_modules_worklist.push_back(dependent);
-			}
-		}
-	}
-
-	bool query(Cell *cell) const
-	{
-		if (keep_cell(cell, purge_mode))
-			return true;
-		if (cell->type.in(ID($specify2), ID($specify3), ID($specrule)))
-			return true;
-		if (cell->module && cell->module->design) {
-			RTLIL::Module *cell_module = cell->module->design->module(cell->type);
-			return cell_module != nullptr && keep_modules.at(cell_module);
-		}
-		return false;
-	}
-
-private:
-	enum ScanCells {
-		// Scan every cell to see if it uses a module that is kept.
-		ALL_CELLS,
-		// Stop scanning cells if we determine early that this module is kept.
-		MINIMUM_CELLS,
-	};
-	bool scan_module(Module *module, ParallelDispatchThreadPool &thread_pool, dict<RTLIL::Module *, std::vector<RTLIL::Module*>> &dependents,
-			ScanCells scan_cells, std::vector<Module*> &worklist) const
-	{
-		MonotonicFlag keep_module;
-		if (module->get_bool_attribute(ID::keep)) {
-			if (scan_cells == MINIMUM_CELLS)
-				return true;
-			keep_module.set();
-		}
-
-		ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
-		ShardedVector<Module*> deps(subpool);
-		const RTLIL::Module *const_module = module;
-		bool purge_mode = this->purge_mode;
-		subpool.run([purge_mode, const_module, scan_cells, &deps, &keep_module](const ParallelDispatchThreadPool::RunCtx &ctx) {
-			bool keep = false;
-			for (int i : ctx.item_range(const_module->cells_size())) {
-				Cell *cell = const_module->cell_at(i);
-				if (keep_cell(cell, purge_mode)) {
-					if (scan_cells == MINIMUM_CELLS) {
-						keep_module.set();
-						return;
-					}
-					keep = true;
-				}
-				if (const_module->design) {
-					RTLIL::Module *cell_module = const_module->design->module(cell->type);
-					if (cell_module != nullptr)
-						deps.insert(ctx, cell_module);
-				}
-			}
-			if (keep) {
-				keep_module.set();
-				return;
-			}
-			for (int i : ctx.item_range(const_module->wires_size())) {
-				Wire *wire = const_module->wire_at(i);
-				if (wire->get_bool_attribute(ID::keep)) {
-					keep_module.set();
-					return;
-				}
-			}
-		});
-		if (scan_cells == MINIMUM_CELLS && keep_module.load())
-			return true;
-		for (Module *dep : deps) {
-			dependents[dep].push_back(module);
-			worklist.push_back(dep);
-		}
-		return keep_module.load();
-	}
-
-	static bool keep_cell(Cell *cell, bool purge_mode)
-	{
-		if (cell->type.in(ID($assert), ID($assume), ID($live), ID($fair), ID($cover)))
-			return true;
-
-		if (cell->type.in(ID($overwrite_tag)))
-			return true;
-
-		if (cell->type == ID($print) || cell->type == ID($check))
-			return true;
-
-		if (cell->has_keep_attr())
-			return true;
-
-		if (!purge_mode && cell->type == ID($scopeinfo))
-			return true;
-		return false;
-	}
-};
-
-static constexpr auto ct_reg = StaticCellTypes::Categories::join(
-	StaticCellTypes::Compat::mem_ff,
-	StaticCellTypes::categories.is_anyinit);
-NewCellTypes ct_all;
-
-struct RmStats {
-	int count_rm_cells = 0;
-	int count_rm_wires = 0;
-
-	void log()
-	{
-		if (count_rm_cells > 0 || count_rm_wires > 0)
-			YOSYS_NAMESPACE_PREFIX log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires);
-	}
-};
-
-unsigned int hash_bit(const SigBit &bit) {
-	return static_cast<unsigned int>(hash_ops<SigBit>::hash(bit).yield());
-}
-
-void rmunused_module_cells(Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose, RmStats &stats, keep_cache_t &keep_cache)
-{
-	SigMap sigmap(module);
-	FfInitVals ffinit;
-	ffinit.set_parallel(&sigmap, subpool.thread_pool(), module);
-
-	SigMap raw_sigmap;
-	for (auto &it : module->connections_) {
-		for (int i = 0; i < GetSize(it.second); i++) {
-			if (it.second[i].wire != nullptr)
-				raw_sigmap.add(it.first[i], it.second[i]);
-		}
-	}
-
-	struct WireDrivers;
-	// Maps from a SigBit to a unique driver cell.
-	struct WireDriver {
-		using Accumulated = WireDrivers;
-		SigBit bit;
-		int driver_cell;
-	};
-	// Maps from a SigBit to one or more driver cells.
-	struct WireDrivers {
-		WireDrivers() : driver_cell(0) {}
-		WireDrivers(WireDriver driver) : bit(driver.bit), driver_cell(driver.driver_cell) {}
-		WireDrivers(SigBit bit) : bit(bit), driver_cell(0) {}
-		WireDrivers(WireDrivers &&other) = default;
-
-		class const_iterator {
-		public:
-			const_iterator(const WireDrivers &drivers, bool end)
-					: driver_cell(drivers.driver_cell), in_extra_cells(end) {
-				if (drivers.extra_driver_cells) {
-					if (end) {
-						extra_it = drivers.extra_driver_cells->end();
-					} else {
-						extra_it = drivers.extra_driver_cells->begin();
-					}
-				}
-			}
-			int operator*() const {
-				if (in_extra_cells)
-					return **extra_it;
-				return driver_cell;
-			}
-			const_iterator& operator++() {
-				if (in_extra_cells)
-					++*extra_it;
-				else
-					in_extra_cells = true;
-				return *this;
-			}
-			bool operator!=(const const_iterator &other) const {
-				return !(*this == other);
-			}
-			bool operator==(const const_iterator &other) const {
-				return in_extra_cells == other.in_extra_cells &&
-					extra_it == other.extra_it;
-			}
-		private:
-			std::optional<pool<int>::iterator> extra_it;
-			int driver_cell;
-			bool in_extra_cells;
-		};
-
-		const_iterator begin() const { return const_iterator(*this, false); }
-		const_iterator end() const { return const_iterator(*this, true); }
-
-		SigBit bit;
-		int driver_cell;
-		std::unique_ptr<pool<int>> extra_driver_cells;
-	};
-	struct WireDriversKeyEquality {
-		bool operator()(const WireDrivers &a, const WireDrivers &b) const {
-			return a.bit == b.bit;
-		}
-	};
-	struct WireDriversCollisionHandler {
-		void operator()(WireDrivers &incumbent, WireDrivers &new_value) const {
-			log_assert(new_value.extra_driver_cells == nullptr);
-			if (!incumbent.extra_driver_cells)
-				incumbent.extra_driver_cells.reset(new pool<int>());
-			incumbent.extra_driver_cells->insert(new_value.driver_cell);
-		}
-	};
-	using Wire2Drivers = ShardedHashtable<WireDriver, WireDriversKeyEquality, WireDriversCollisionHandler>;
-
-	Wire2Drivers::Builder wire2driver_builder(subpool);
-	ShardedVector<std::pair<std::string, int>> mem2cells_vector(subpool);
-	ShardedVector<std::pair<SigBit, std::string>> driver_driver_logs(subpool);
-	ShardedVector<Wire*> keep_wires(subpool);
-	const RTLIL::Module *const_module = module;
-	int num_threads = subpool.num_threads();
-	ConcurrentWorkQueue<int> cell_queue(num_threads);
-	std::vector<std::atomic<bool>> unused(const_module->cells_size());
-	subpool.run([&sigmap, &raw_sigmap, &keep_cache, const_module, &mem2cells_vector, &driver_driver_logs, &keep_wires, &cell_queue, &wire2driver_builder, &unused](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		for (int i : ctx.item_range(const_module->cells_size())) {
-			Cell *cell = const_module->cell_at(i);
-			if (cell->type.in(ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2)))
-				mem2cells_vector.insert(ctx, {cell->getParam(ID::MEMID).decode_string(), i});
-
-			for (auto &it2 : cell->connections()) {
-				if (ct_all.cell_known(cell->type) && !ct_all.cell_output(cell->type, it2.first))
-					continue;
-				for (auto raw_bit : it2.second) {
-					if (raw_bit.wire == nullptr)
-						continue;
-					auto bit = sigmap(raw_bit);
-					if (bit.wire == nullptr && ct_all.cell_known(cell->type)) {
-						std::string msg = stringf("Driver-driver conflict "
-								"for %s between cell %s.%s and constant %s in %s: Resolved using constant.",
-								log_signal(raw_bit), cell->name.unescape(), it2.first.unescape(), log_signal(bit), const_module->name.unescape());
-						driver_driver_logs.insert(ctx, {raw_sigmap(raw_bit), msg});
-					}
-					if (bit.wire != nullptr)
-						wire2driver_builder.insert(ctx, {{bit, i}, hash_bit(bit)});
-				}
-			}
-			bool keep = keep_cache.query(cell);
-			unused[i].store(!keep, std::memory_order_relaxed);
-			if (keep)
-				cell_queue.push(ctx, i);
-		}
-		for (int i : ctx.item_range(const_module->wires_size())) {
-			Wire *wire = const_module->wire_at(i);
-			if (wire->port_output || wire->get_bool_attribute(ID::keep))
-				keep_wires.insert(ctx, wire);
-		}
-	});
-	subpool.run([&wire2driver_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		wire2driver_builder.process(ctx);
-	});
-	Wire2Drivers wire2driver(wire2driver_builder);
-
-	dict<std::string, pool<int>> mem2cells;
-	for (std::pair<std::string, int> &mem2cell : mem2cells_vector)
-		mem2cells[mem2cell.first].insert(mem2cell.second);
-
-	pool<SigBit> used_raw_bits;
-	int i = 0;
-	for (Wire *wire : keep_wires) {
-		for (auto bit : sigmap(wire)) {
-			const WireDrivers *drivers = wire2driver.find({{bit}, hash_bit(bit)});
-			if (drivers != nullptr)
-				for (int cell_index : *drivers)
-					if (unused[cell_index].exchange(false, std::memory_order_relaxed)) {
-						ThreadIndex fake_thread_index = {i++ % num_threads};
-						cell_queue.push(fake_thread_index, cell_index);
-					}
-		}
-		for (auto raw_bit : SigSpec(wire))
-			used_raw_bits.insert(raw_sigmap(raw_bit));
-	}
-
-	std::vector<std::atomic<bool>> mem_unused(module->memories.size());
-	dict<std::string, int> mem_indices;
-	for (int i = 0; i < GetSize(module->memories); ++i) {
-		mem_indices[module->memories.element(i)->first.str()] = i;
-		mem_unused[i].store(true, std::memory_order_relaxed);
-	}
-
-	subpool.run([const_module, &sigmap, &wire2driver, &mem2cells, &unused, &cell_queue, &mem_indices, &mem_unused](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		pool<SigBit> bits;
-		pool<std::string> mems;
-		while (true) {
-			std::vector<int> cell_indices = cell_queue.pop_batch(ctx);
-			if (cell_indices.empty())
-				return;
-			for (auto cell_index : cell_indices) {
-				Cell *cell = const_module->cell_at(cell_index);
-				for (auto &it : cell->connections())
-					if (!ct_all.cell_known(cell->type) || ct_all.cell_input(cell->type, it.first))
-						for (auto bit : sigmap(it.second))
-							bits.insert(bit);
-
-				if (cell->type.in(ID($memrd), ID($memrd_v2))) {
-					std::string mem_id = cell->getParam(ID::MEMID).decode_string();
-					if (mem_indices.count(mem_id)) {
-						int mem_index = mem_indices[mem_id];
-						if (mem_unused[mem_index].exchange(false, std::memory_order_relaxed))
-							mems.insert(mem_id);
-					}
-				}
-			}
-
-			for (auto bit : bits) {
-				const WireDrivers *drivers = wire2driver.find({{bit}, hash_bit(bit)});
-				if (drivers != nullptr)
-					for (int cell_index : *drivers)
-						if (unused[cell_index].exchange(false, std::memory_order_relaxed))
-							cell_queue.push(ctx, cell_index);
-			}
-			bits.clear();
-
-			for (auto mem : mems) {
-				if (mem2cells.count(mem) == 0)
-					continue;
-				for (int cell_index : mem2cells.at(mem))
-					if (unused[cell_index].exchange(false, std::memory_order_relaxed))
-						cell_queue.push(ctx, cell_index);
-			}
-			mems.clear();
-		}
-	});
-
-	ShardedVector<int> sharded_unused_cells(subpool);
-	subpool.run([const_module, &unused, &sharded_unused_cells, &wire2driver](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		// Parallel destruction of `wire2driver`
-		wire2driver.clear(ctx);
-		for (int i : ctx.item_range(const_module->cells_size()))
-			if (unused[i].load(std::memory_order_relaxed))
-				sharded_unused_cells.insert(ctx, i);
-	});
-	pool<Cell*> unused_cells;
-	for (int cell_index : sharded_unused_cells)
-		unused_cells.insert(const_module->cell_at(cell_index));
-	unused_cells.sort(RTLIL::sort_by_name_id<RTLIL::Cell>());
-
-	for (auto cell : unused_cells) {
-		if (verbose)
-			log_debug("  removing unused `%s' cell `%s'.\n", cell->type, cell->name);
-		module->design->scratchpad_set_bool("opt.did_something", true);
-		if (cell->is_builtin_ff())
-			ffinit.remove_init(cell->getPort(ID::Q));
-		module->remove(cell);
-		stats.count_rm_cells++;
-	}
-
-	for (const auto &it : mem_indices) {
-		if (!mem_unused[it.second].load(std::memory_order_relaxed))
-			continue;
-		RTLIL::IdString id(it.first);
-		if (verbose)
-			log_debug("  removing unused memory `%s'.\n", id.unescape());
-		delete module->memories.at(id);
-		module->memories.erase(id);
-	}
-
-	if (!driver_driver_logs.empty()) {
-		// We could do this in parallel but hopefully this is rare.
-		for (auto &it : module->cells_) {
-			Cell *cell = it.second;
-			for (auto &it2 : cell->connections()) {
-				if (ct_all.cell_known(cell->type) && !ct_all.cell_input(cell->type, it2.first))
-					continue;
-				for (auto raw_bit : raw_sigmap(it2.second))
-					used_raw_bits.insert(raw_bit);
-			}
-		}
-		for (std::pair<SigBit, std::string> &it : driver_driver_logs) {
-			if (used_raw_bits.count(it.first))
-				log_warning("%s\n", it.second);
-		}
-	}
-}
-
-int count_nontrivial_wire_attrs(RTLIL::Wire *w)
-{
-	int count = w->attributes.size();
-	count -= w->attributes.count(ID::src);
-	count -= w->attributes.count(ID::hdlname);
-	count -= w->attributes.count(ID(scopename));
-	count -= w->attributes.count(ID::unused_bits);
-	return count;
-}
-
-struct ShardedSigBit {
-	using Accumulated = ShardedSigBit;
-	RTLIL::SigBit bit;
-	ShardedSigBit() = default;
-	ShardedSigBit(const RTLIL::SigBit &bit) : bit(bit) {}
-};
-struct ShardedSigBitEquality {
-	bool operator()(const ShardedSigBit &b1, const ShardedSigBit &b2) const {
-		return b1.bit == b2.bit;
-	}
-};
-using ShardedSigPool = ShardedHashtable<ShardedSigBit, ShardedSigBitEquality, SetCollisionHandler<ShardedSigBit>>;
-
-struct ShardedSigSpec {
-	using Accumulated = ShardedSigSpec;
-	RTLIL::SigSpec spec;
-	ShardedSigSpec() = default;
-	ShardedSigSpec(RTLIL::SigSpec spec) : spec(std::move(spec)) {}
-	ShardedSigSpec(ShardedSigSpec &&) = default;
-};
-struct ShardedSigSpecEquality {
-	bool operator()(const ShardedSigSpec &s1, const ShardedSigSpec &s2) const {
-		return s1.spec == s2.spec;
-	}
-};
-using ShardedSigSpecPool = ShardedHashtable<ShardedSigSpec, ShardedSigSpecEquality, SetCollisionHandler<ShardedSigSpec>>;
-
-struct DirectWires {
-	const SigMap &assign_map;
-	const ShardedSigSpecPool &direct_sigs;
-	dict<RTLIL::Wire *, bool> cache;
-
-	DirectWires(const SigMap &assign_map, const ShardedSigSpecPool &direct_sigs) : assign_map(assign_map), direct_sigs(direct_sigs) {}
-	void cache_result_for_bit(const SigBit &bit) {
-		if (bit.wire != nullptr)
-			is_direct(bit.wire);
-	}
-	bool is_direct(RTLIL::Wire *wire) {
-		if (wire->port_input)
-			return true;
-		auto it = cache.find(wire);
-		if (it != cache.end())
-			return it->second;
-		SigSpec direct_sig = assign_map(wire);
-		bool direct = direct_sigs.find({direct_sig, direct_sig.hash_into(Hasher()).yield()}) != nullptr;
-		cache.insert({wire, direct});
-		return direct;
-	}
-};
-
-// Should we pick `s2` over `s1` to represent a signal?
-bool compare_signals(const RTLIL::SigBit &s1, const RTLIL::SigBit &s2, const ShardedSigPool &regs, const ShardedSigPool &conns, DirectWires &direct_wires)
-{
-	if (s1 == s2)
-		return false;
-
-	RTLIL::Wire *w1 = s1.wire;
-	RTLIL::Wire *w2 = s2.wire;
-
-	if (w1 == NULL || w2 == NULL)
-		return w2 == NULL;
-
-	if (w1->port_input != w2->port_input)
-		return w2->port_input;
-
-	if ((w1->port_input && w1->port_output) != (w2->port_input && w2->port_output))
-		return !(w2->port_input && w2->port_output);
-
-	if (w1->name.isPublic() && w2->name.isPublic()) {
-		ShardedSigPool::AccumulatedValue s1_val = {s1, s1.hash_top().yield()};
-		ShardedSigPool::AccumulatedValue s2_val = {s2, s2.hash_top().yield()};
-		bool regs1 = regs.find(s1_val) != nullptr;
-		bool regs2 = regs.find(s2_val) != nullptr;
-		if (regs1 != regs2)
-			return regs2;
-		bool w1_direct = direct_wires.is_direct(w1);
-		bool w2_direct = direct_wires.is_direct(w2);
-		if (w1_direct != w2_direct)
-			return w2_direct;
-		bool conns1 = conns.find(s1_val) != nullptr;
-		bool conns2 = conns.find(s2_val) != nullptr;
-		if (conns1 != conns2)
-			return conns2;
-	}
-
-	if (w1 == w2)
-		return s2.offset < s1.offset;
-
-	if (w1->port_output != w2->port_output)
-		return w2->port_output;
-
-	if (w1->name[0] != w2->name[0])
-		return w2->name.isPublic();
-
-	int attrs1 = count_nontrivial_wire_attrs(w1);
-	int attrs2 = count_nontrivial_wire_attrs(w2);
-
-	if (attrs1 != attrs2)
-		return attrs2 > attrs1;
-
-	return w2->name.lt_by_name(w1->name);
-}
-
-bool check_public_name(RTLIL::IdString id)
-{
-	if (id.begins_with("$"))
-		return false;
-	const std::string &id_str = id.str();
-	if (id.begins_with("\\_") && (id.ends_with("_") || id_str.find("_[") != std::string::npos))
-		return false;
-	if (id_str.find(".$") != std::string::npos)
-		return false;
-	return true;
-}
-
-void add_spec(ShardedSigPool::Builder &builder, const ThreadIndex &thread, const RTLIL::SigSpec &spec) {
-	for (SigBit bit : spec)
-		if (bit.wire != nullptr)
-			builder.insert(thread, {bit, bit.hash_top().yield()});
-}
-
-bool check_any(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) {
-	for (SigBit b : spec)
-		if (sigs.find({b, b.hash_top().yield()}) != nullptr)
-			return true;
-	return false;
-}
-
-bool check_all(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) {
-	for (SigBit b : spec)
-		if (sigs.find({b, b.hash_top().yield()}) == nullptr)
-			return false;
-	return true;
-}
-
-bool rmunused_module_signals(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool purge_mode, bool verbose, RmStats &stats)
-{
-	SigMap assign_map(module);
-
-	const RTLIL::Module *const_module = module;
-	// `register_signals` and `connected_signals` will help us decide later on
-	// on picking representatives out of groups of connected signals
-	ShardedSigPool::Builder register_signals_builder(subpool);
-	ShardedSigPool::Builder connected_signals_builder(subpool);
-	// construct a pool of wires which are directly driven by a known celltype,
-	// this will influence our choice of representatives
-	ShardedSigSpecPool::Builder direct_sigs_builder(subpool);
-	subpool.run([const_module, purge_mode, &assign_map, &direct_sigs_builder, &register_signals_builder, &connected_signals_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		for (int i : ctx.item_range(const_module->cells_size())) {
-			RTLIL::Cell *cell = const_module->cell_at(i);
-			if (!purge_mode) {
-				if (ct_reg(cell->type)) {
-					bool clk2fflogic = cell->get_bool_attribute(ID(clk2fflogic));
-					for (auto &it2 : cell->connections())
-						if (clk2fflogic ? it2.first == ID::D : ct_all.cell_output(cell->type, it2.first))
-							add_spec(register_signals_builder, ctx, it2.second);
-				}
-				for (auto &it2 : cell->connections())
-					add_spec(connected_signals_builder, ctx, it2.second);
-			}
-			if (ct_all.cell_known(cell->type))
-				for (auto &it2 : cell->connections())
-					if (ct_all.cell_output(cell->type, it2.first)) {
-						RTLIL::SigSpec spec = assign_map(it2.second);
-						unsigned int hash = spec.hash_into(Hasher()).yield();
-						direct_sigs_builder.insert(ctx, {std::move(spec), hash});
-					}
-		}
-	});
-	subpool.run([&register_signals_builder, &connected_signals_builder, &direct_sigs_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		register_signals_builder.process(ctx);
-		connected_signals_builder.process(ctx);
-		direct_sigs_builder.process(ctx);
-	});
-	ShardedSigPool register_signals(register_signals_builder);
-	ShardedSigPool connected_signals(connected_signals_builder);
-	ShardedSigSpecPool direct_sigs(direct_sigs_builder);
-
-	ShardedVector<RTLIL::SigBit> sigmap_canonical_candidates(subpool);
-	DirectWires direct_wires(assign_map, direct_sigs);
-	subpool.run([const_module, &assign_map, &register_signals, &connected_signals, &sigmap_canonical_candidates, &direct_sigs, &direct_wires](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		std::optional<DirectWires> local_direct_wires;
-		DirectWires *this_thread_direct_wires = &direct_wires;
-		if (ctx.thread_num > 0) {
-			local_direct_wires.emplace(assign_map, direct_sigs);
-			this_thread_direct_wires = &local_direct_wires.value();
-		}
-		for (int i : ctx.item_range(const_module->wires_size())) {
-			RTLIL::Wire *wire = const_module->wire_at(i);
-			for (int j = 0; j < wire->width; ++j) {
-				RTLIL::SigBit s1(wire, j);
-				RTLIL::SigBit s2 = assign_map(s1);
-				if (compare_signals(s2, s1, register_signals, connected_signals, *this_thread_direct_wires))
-					sigmap_canonical_candidates.insert(ctx, s1);
-			}
-		}
-	});
-	// Cache all the direct_wires results that we might possible need. This avoids the results
-	// changing when we update `assign_map` below.
-	for (RTLIL::SigBit candidate : sigmap_canonical_candidates) {
-		direct_wires.cache_result_for_bit(candidate);
-		direct_wires.cache_result_for_bit(assign_map(candidate));
-	}
-	for (RTLIL::SigBit candidate : sigmap_canonical_candidates) {
-		RTLIL::SigBit current_canonical = assign_map(candidate);
-		if (compare_signals(current_canonical, candidate, register_signals, connected_signals, direct_wires))
-			assign_map.add(candidate);
-	}
-
-	// we are removing all connections
-	module->connections_.clear();
-
-	// used signals sigmapped
-	ShardedSigPool::Builder used_signals_builder(subpool);
-	// used signals pre-sigmapped
-	ShardedSigPool::Builder raw_used_signals_builder(subpool);
-	// used signals sigmapped, ignoring drivers (we keep track of this to set `unused_bits`)
-	ShardedSigPool::Builder used_signals_nodrivers_builder(subpool);
-	struct UpdateConnection {
-		RTLIL::Cell *cell;
-		RTLIL::IdString port;
-		RTLIL::SigSpec spec;
-	};
-	ShardedVector<UpdateConnection> update_connections(subpool);
-	ShardedVector<RTLIL::Wire*> initialized_wires(subpool);
-	// gather the usage information for cells and update cell connections
-	// also gather the usage information for ports, wires with `keep`
-	// also gather init bits
-	subpool.run([const_module, &register_signals, &connected_signals, &direct_sigs, &assign_map, &used_signals_builder, &raw_used_signals_builder, &used_signals_nodrivers_builder, &update_connections, &initialized_wires](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		// Parallel destruction of these sharded structures
-		register_signals.clear(ctx);
-		connected_signals.clear(ctx);
-		direct_sigs.clear(ctx);
-
-		for (int i : ctx.item_range(const_module->cells_size())) {
-			RTLIL::Cell *cell = const_module->cell_at(i);
-			for (const auto &it2 : cell->connections_) {
-				SigSpec spec = assign_map(it2.second);
-				if (spec != it2.second)
-					update_connections.insert(ctx, {cell, it2.first, spec});
-				add_spec(raw_used_signals_builder, ctx, spec);
-				add_spec(used_signals_builder, ctx, spec);
-				if (!ct_all.cell_output(cell->type, it2.first))
-					add_spec(used_signals_nodrivers_builder, ctx, spec);
-			}
-		}
-		for (int i : ctx.item_range(const_module->wires_size())) {
-			RTLIL::Wire *wire = const_module->wire_at(i);
-			if (wire->port_id > 0) {
-				RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
-				add_spec(raw_used_signals_builder, ctx, sig);
-				assign_map.apply(sig);
-				add_spec(used_signals_builder, ctx, sig);
-				if (!wire->port_input)
-					add_spec(used_signals_nodrivers_builder, ctx, sig);
-			}
-			if (wire->get_bool_attribute(ID::keep)) {
-				RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
-				assign_map.apply(sig);
-				add_spec(used_signals_builder, ctx, sig);
-			}
-			auto it2 = wire->attributes.find(ID::init);
-			if (it2 != wire->attributes.end())
-				initialized_wires.insert(ctx, wire);
-		}
-	});
-	subpool.run([&used_signals_builder, &raw_used_signals_builder, &used_signals_nodrivers_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		used_signals_builder.process(ctx);
-		raw_used_signals_builder.process(ctx);
-		used_signals_nodrivers_builder.process(ctx);
-	});
-	ShardedSigPool used_signals(used_signals_builder);
-	ShardedSigPool raw_used_signals(raw_used_signals_builder);
-	ShardedSigPool used_signals_nodrivers(used_signals_nodrivers_builder);
-
-	dict<RTLIL::SigBit, RTLIL::State> init_bits;
-	// The wires that appear in the keys of `init_bits`
-	pool<Wire*> init_bits_wires;
-	for (const UpdateConnection &update : update_connections)
-		update.cell->connections_.at(update.port) = std::move(update.spec);
-	for (RTLIL::Wire *intialized_wire : initialized_wires) {
-		auto it = intialized_wire->attributes.find(ID::init);
-		RTLIL::Const &val = it->second;
-		SigSpec sig = assign_map(intialized_wire);
-		for (int i = 0; i < GetSize(val) && i < GetSize(sig); i++)
-			if (val[i] != State::Sx && sig[i].wire != nullptr) {
-				init_bits[sig[i]] = val[i];
-				init_bits_wires.insert(sig[i].wire);
-			}
-		intialized_wire->attributes.erase(it);
-	}
-
-	// set init attributes on all wires of a connected group
-	for (RTLIL::Wire *wire : init_bits_wires) {
-		bool found = false;
-		Const val(State::Sx, wire->width);
-		for (int i = 0; i < wire->width; i++) {
-			auto it = init_bits.find(RTLIL::SigBit(wire, i));
-			if (it != init_bits.end()) {
-				val.set(i, it->second);
-				found = true;
-			}
-		}
-		if (found)
-			wire->attributes[ID::init] = val;
-	}
-
-	// now decide for each wire if we should be deleting it
-	ShardedVector<RTLIL::Wire*> del_wires(subpool);
-	ShardedVector<RTLIL::Wire*> remove_init(subpool);
-	ShardedVector<std::pair<RTLIL::Wire*, RTLIL::Const>> set_init(subpool);
-	ShardedVector<RTLIL::SigSig> connections(subpool);
-	ShardedVector<RTLIL::Wire*> remove_unused_bits(subpool);
-	ShardedVector<std::pair<RTLIL::Wire*, RTLIL::Const>> set_unused_bits(subpool);
-	subpool.run([const_module, purge_mode, &assign_map, &used_signals, &raw_used_signals, &used_signals_nodrivers, &del_wires, &remove_init, &set_init, &connections, &remove_unused_bits, &set_unused_bits](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		for (int i : ctx.item_range(const_module->wires_size())) {
-			RTLIL::Wire *wire = const_module->wire_at(i);
-			SigSpec s1 = SigSpec(wire), s2 = assign_map(s1);
-			log_assert(GetSize(s1) == GetSize(s2));
-
-			Const initval;
-			bool has_init_attribute = wire->attributes.count(ID::init);
-			bool init_changed = false;
-			if (has_init_attribute)
-				initval = wire->attributes.at(ID::init);
-			if (GetSize(initval) != GetSize(wire)) {
-				initval.resize(GetSize(wire), State::Sx);
-				init_changed = true;
-			}
-
-			if (GetSize(wire) == 0) {
-				// delete zero-width wires, unless they are module ports
-				if (wire->port_id == 0)
-					goto delete_this_wire;
-			} else
-			if (wire->port_id != 0 || wire->get_bool_attribute(ID::keep) || !initval.is_fully_undef()) {
-				// do not delete anything with "keep" or module ports or initialized wires
-			} else
-			if (!purge_mode && check_public_name(wire->name) && (check_any(raw_used_signals, s1) || check_any(used_signals, s2) || s1 != s2)) {
-				// do not get rid of public names unless in purge mode or if the wire is entirely unused, not even aliased
-			} else
-			if (!check_any(raw_used_signals, s1)) {
-				// delete wires that aren't used by anything directly
-				goto delete_this_wire;
-			}
-
-			if (0)
-			{
-		delete_this_wire:
-				del_wires.insert(ctx, wire);
-			}
-			else
-			{
-				RTLIL::SigSig new_conn;
-				for (int i = 0; i < GetSize(s1); i++)
-					if (s1[i] != s2[i]) {
-						if (s2[i] == State::Sx && (initval[i] == State::S0 || initval[i] == State::S1)) {
-							s2[i] = initval[i];
-							initval.set(i, State::Sx);
-							init_changed = true;
-						}
-						new_conn.first.append(s1[i]);
-						new_conn.second.append(s2[i]);
-					}
-				if (new_conn.first.size() > 0)
-					connections.insert(ctx, std::move(new_conn));
-				if (initval.is_fully_undef()) {
-					if (has_init_attribute)
-						remove_init.insert(ctx, wire);
-				} else
-					if (init_changed)
-						set_init.insert(ctx, {wire, std::move(initval)});
-
-				std::string unused_bits;
-				if (!check_all(used_signals_nodrivers, s2)) {
-					for (int i = 0; i < GetSize(s2); i++) {
-						if (s2[i].wire == NULL)
-							continue;
-						SigBit b = s2[i];
-						if (used_signals_nodrivers.find({b, b.hash_top().yield()}) == nullptr) {
-							if (!unused_bits.empty())
-								unused_bits += " ";
-							unused_bits += stringf("%d", i);
-						}
-					}
-				}
-				if (unused_bits.empty() || wire->port_id != 0) {
-					if (wire->attributes.count(ID::unused_bits))
-						remove_unused_bits.insert(ctx, wire);
-				} else {
-					RTLIL::Const unused_bits_const(std::move(unused_bits));
-					if (wire->attributes.count(ID::unused_bits)) {
-						RTLIL::Const &unused_bits_attr = wire->attributes.at(ID::unused_bits);
-						if (unused_bits_attr != unused_bits_const)
-							set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)});
-					} else
-						set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)});
-				}
-			}
-		}
-	});
-	pool<RTLIL::Wire*> del_wires_queue;
-	del_wires_queue.insert(del_wires.begin(), del_wires.end());
-	for (RTLIL::Wire *wire : remove_init)
-		wire->attributes.erase(ID::init);
-	for (auto &p : set_init)
-		p.first->attributes[ID::init] = std::move(p.second);
-	for (auto &conn : connections)
-		module->connect(std::move(conn));
-	for (RTLIL::Wire *wire : remove_unused_bits)
-		wire->attributes.erase(ID::unused_bits);
-	for (auto &p : set_unused_bits)
-		p.first->attributes[ID::unused_bits] = std::move(p.second);
-
-	subpool.run([&used_signals, &raw_used_signals, &used_signals_nodrivers](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		used_signals.clear(ctx);
-		raw_used_signals.clear(ctx);
-		used_signals_nodrivers.clear(ctx);
-	});
-
-	int del_temp_wires_count = 0;
-	for (auto wire : del_wires_queue) {
-		if (ys_debug() || (check_public_name(wire->name) && verbose))
-			log_debug("  removing unused non-port wire %s.\n", wire->name);
-		else
-			del_temp_wires_count++;
-	}
-
-	module->remove(del_wires_queue);
-	stats.count_rm_wires += GetSize(del_wires_queue);
-
-	if (verbose && del_temp_wires_count)
-		log_debug("  removed %d unused temporary wires.\n", del_temp_wires_count);
-
-	if (!del_wires_queue.empty())
-		module->design->scratchpad_set_bool("opt.did_something", true);
-
-	return !del_wires_queue.empty();
-}
-
-bool rmunused_module_init(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose)
-{
-
-	SigMap sigmap(module);
-
-	const Module *const_module = module;
-	ShardedVector<std::pair<SigBit, State>> results(subpool);
-	subpool.run([const_module, &results](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		for (int i : ctx.item_range(const_module->cells_size())) {
-			RTLIL::Cell *cell = const_module->cell_at(i);
-			if (StaticCellTypes::Compat::internals_mem_ff(cell->type) && cell->hasPort(ID::Q))
-			{
-				SigSpec sig = cell->getPort(ID::Q);
-
-				for (int i = 0; i < GetSize(sig); i++)
-				{
-					SigBit bit = sig[i];
-
-					if (bit.wire == nullptr || bit.wire->attributes.count(ID::init) == 0)
-						continue;
-
-					Const init = bit.wire->attributes.at(ID::init);
-
-					if (i >= GetSize(init) || init[i] == State::Sx || init[i] == State::Sz)
-						continue;
-
-					results.insert(ctx, {bit, init[i]});
-				}
-			}
-		}
-	});
-	dict<SigBit, State> qbits;
-	for (std::pair<SigBit, State> &p : results) {
-		sigmap.add(p.first);
-		qbits[p.first] = p.second;
-	}
-
-	ShardedVector<RTLIL::Wire*> wire_results(subpool);
-	subpool.run([const_module, &sigmap, &qbits, &wire_results](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		for (int j : ctx.item_range(const_module->wires_size())) {
-			RTLIL::Wire *wire = const_module->wire_at(j);
-			if (wire->attributes.count(ID::init) == 0)
-				continue;
-			Const init = wire->attributes.at(ID::init);
-
-			for (int i = 0; i < GetSize(wire) && i < GetSize(init); i++)
-			{
-				if (init[i] == State::Sx || init[i] == State::Sz)
-					continue;
-
-				SigBit wire_bit = SigBit(wire, i);
-				SigBit mapped_wire_bit = sigmap(wire_bit);
-
-				if (wire_bit == mapped_wire_bit)
-					goto next_wire;
-
-				if (mapped_wire_bit.wire) {
-					if (qbits.count(mapped_wire_bit) == 0)
-						goto next_wire;
-
-					if (qbits.at(mapped_wire_bit) != init[i])
-						goto next_wire;
-				}
-				else {
-					if (mapped_wire_bit == State::Sx || mapped_wire_bit == State::Sz)
-						goto next_wire;
-
-					if (mapped_wire_bit != init[i]) {
-						log_warning("Initial value conflict for %s resolving to %s but with init %s.\n", log_signal(wire_bit), log_signal(mapped_wire_bit), log_signal(init[i]));
-						goto next_wire;
-					}
-				}
-			}
-			wire_results.insert(ctx, wire);
-
-			next_wire:;
-		}
-	});
-
-	bool did_something = false;
-	for (RTLIL::Wire *wire : wire_results) {
-		if (verbose)
-			log_debug("  removing redundant init attribute on %s.\n", log_id(wire));
-		wire->attributes.erase(ID::init);
-		did_something = true;
-	}
-
-	if (did_something)
-		module->design->scratchpad_set_bool("opt.did_something", true);
-
-	return did_something;
-}
-
-void remove_temporary_cells(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose)
-{
-	ShardedVector<RTLIL::Cell*> delcells(subpool);
-	ShardedVector<RTLIL::SigSig> new_connections(subpool);
-	const RTLIL::Module *const_module = module;
-	subpool.run([const_module, &delcells, &new_connections](const ParallelDispatchThreadPool::RunCtx &ctx) {
-		for (int i : ctx.item_range(const_module->cells_size())) {
-			RTLIL::Cell *cell = const_module->cell_at(i);
-			if (cell->type.in(ID($pos), ID($_BUF_), ID($buf)) && !cell->has_keep_attr()) {
-				bool is_signed = cell->type == ID($pos) && cell->getParam(ID::A_SIGNED).as_bool();
-				RTLIL::SigSpec a = cell->getPort(ID::A);
-				RTLIL::SigSpec y = cell->getPort(ID::Y);
-				a.extend_u0(GetSize(y), is_signed);
-
-				if (a.has_const(State::Sz)) {
-					RTLIL::SigSpec new_a;
-					RTLIL::SigSpec new_y;
-					for (int i = 0; i < GetSize(a); ++i) {
-						RTLIL::SigBit b = a[i];
-						if (b == State::Sz)
-							continue;
-						new_a.append(b);
-						new_y.append(y[i]);
-					}
-					a = std::move(new_a);
-					y = std::move(new_y);
-				}
-				if (!y.empty())
-					new_connections.insert(ctx, {y, a});
-				delcells.insert(ctx, cell);
-			} else if (cell->type.in(ID($connect)) && !cell->has_keep_attr()) {
-				RTLIL::SigSpec a = cell->getPort(ID::A);
-				RTLIL::SigSpec b = cell->getPort(ID::B);
-				if (a.has_const() && !b.has_const())
-					std::swap(a, b);
-				new_connections.insert(ctx, {a, b});
-				delcells.insert(ctx, cell);
-			} else if (cell->type.in(ID($input_port)) && !cell->has_keep_attr()) {
-				delcells.insert(ctx, cell);
-			}
-		}
-	});
-	bool did_something = false;
-	for (RTLIL::SigSig &connection : new_connections) {
-		module->connect(connection);
-	}
-	for (RTLIL::Cell *cell : delcells) {
-		if (verbose) {
-			if (cell->type == ID($connect))
-				log_debug("  removing connect cell `%s': %s <-> %s\n", cell->name,
-						log_signal(cell->getPort(ID::A)), log_signal(cell->getPort(ID::B)));
-			else if (cell->type == ID($input_port))
-				log_debug("  removing input port marker cell `%s': %s\n", cell->name,
-						log_signal(cell->getPort(ID::Y)));
-			else
-				log_debug("  removing buffer cell `%s': %s = %s\n", cell->name,
-						log_signal(cell->getPort(ID::Y)), log_signal(cell->getPort(ID::A)));
-		}
-		module->remove(cell);
-		did_something = true;
-	}
-	if (did_something)
-		module->design->scratchpad_set_bool("opt.did_something", true);
-}
-
-void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool, bool purge_mode, bool verbose, bool rminit, RmStats &stats, keep_cache_t &keep_cache)
-{
-	if (verbose)
-		log("Finding unused cells or wires in module %s..\n", module->name);
-
-	// Use no more than one worker per thousand cells, rounded down, so
-	// we only start multithreading with at least 2000 cells.
-	int num_worker_threads = ThreadPool::work_pool_size(0, module->cells_size(), 1000);
-	ParallelDispatchThreadPool::Subpool subpool(thread_pool, num_worker_threads);
-	remove_temporary_cells(module, subpool, verbose);
-	rmunused_module_cells(module, subpool, verbose, stats, keep_cache);
-	while (rmunused_module_signals(module, subpool, purge_mode, verbose, stats)) { }
-
-	if (rminit && rmunused_module_init(module, subpool, verbose))
-		while (rmunused_module_signals(module, subpool, purge_mode, verbose, stats)) { }
-}
-struct OptCleanPass : public Pass {
-	OptCleanPass() : Pass("opt_clean", "remove unused cells and wires") { }
-	void help() override
-	{
-		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
-		log("\n");
-		log("    opt_clean [options] [selection]\n");
-		log("\n");
-		log("This pass identifies wires and cells that are unused and removes them. Other\n");
-		log("passes often remove cells but leave the wires in the design or reconnect the\n");
-		log("wires but leave the old cells in the design. This pass can be used to clean up\n");
-		log("after the passes that do the actual work.\n");
-		log("\n");
-		log("This pass only operates on completely selected modules without processes.\n");
-		log("\n");
-		log("    -purge\n");
-		log("        also remove internal nets if they have a public name\n");
-		log("\n");
-	}
-	void execute(std::vector<std::string> args, RTLIL::Design *design) override
-	{
-		bool purge_mode = false;
-
-		log_header(design, "Executing OPT_CLEAN pass (remove unused cells and wires).\n");
-		log_push();
-
-		size_t argidx;
-		for (argidx = 1; argidx < args.size(); argidx++) {
-			if (args[argidx] == "-purge") {
-				purge_mode = true;
-				continue;
-			}
-			break;
-		}
-		extra_args(args, argidx, design);
-
-		std::vector<RTLIL::Module*> selected_modules;
-		for (auto module : design->selected_whole_modules_warn())
-			if (!module->has_processes_warn())
-				selected_modules.push_back(module);
-		int thread_pool_size = 0;
-		for (RTLIL::Module *m : selected_modules)
-			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
-		ParallelDispatchThreadPool thread_pool(thread_pool_size);
-		keep_cache_t keep_cache(purge_mode, thread_pool, selected_modules);
-
-		ct_all.setup(design);
-
-		RmStats stats;
-		for (auto module : selected_modules)
-			rmunused_module(module, thread_pool, purge_mode, true, true, stats, keep_cache);
-		stats.log();
-
-		design->optimize();
-		design->check();
-
-		ct_all.clear();
-		log_pop();
-
-		request_garbage_collection();
-	}
-} OptCleanPass;
-
-struct CleanPass : public Pass {
-	CleanPass() : Pass("clean", "remove unused cells and wires") { }
-	void help() override
-	{
-		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
-		log("\n");
-		log("    clean [options] [selection]\n");
-		log("\n");
-		log("This is identical to 'opt_clean', but less verbose.\n");
-		log("\n");
-		log("When commands are separated using the ';;' token, this command will be executed\n");
-		log("between the commands.\n");
-		log("\n");
-		log("When commands are separated using the ';;;' token, this command will be executed\n");
-		log("in -purge mode between the commands.\n");
-		log("\n");
-	}
-	void execute(std::vector<std::string> args, RTLIL::Design *design) override
-	{
-		bool purge_mode = false;
-
-		size_t argidx;
-		for (argidx = 1; argidx < args.size(); argidx++) {
-			if (args[argidx] == "-purge") {
-				purge_mode = true;
-				continue;
-			}
-			break;
-		}
-		extra_args(args, argidx, design);
-
-		std::vector<RTLIL::Module*> selected_modules;
-		for (auto module : design->selected_unboxed_whole_modules())
-			if (!module->has_processes())
-				selected_modules.push_back(module);
-		int thread_pool_size = 0;
-		for (RTLIL::Module *m : selected_modules)
-			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
-		ParallelDispatchThreadPool thread_pool(thread_pool_size);
-		keep_cache_t keep_cache(purge_mode, thread_pool, selected_modules);
-
-		ct_all.setup(design);
-
-		RmStats stats;
-		for (auto module : selected_modules)
-			rmunused_module(module, thread_pool, purge_mode, ys_debug(), true, stats, keep_cache);
-
-		log_suppressed();
-		stats.log();
-
-		design->optimize();
-		design->check();
-
-		ct_all.clear();
-
-		request_garbage_collection();
-	}
-} CleanPass;
-
-PRIVATE_NAMESPACE_END
diff --git a/passes/opt/opt_clean/Makefile.inc b/passes/opt/opt_clean/Makefile.inc
new file mode 100644
index 000000000..b4bb5b070
--- /dev/null
+++ b/passes/opt/opt_clean/Makefile.inc
@@ -0,0 +1,10 @@
+OPT_CLEAN_OBJS =
+OPT_CLEAN_OBJS += passes/opt/opt_clean/cells_all.o
+OPT_CLEAN_OBJS += passes/opt/opt_clean/cells_temp.o
+OPT_CLEAN_OBJS += passes/opt/opt_clean/wires.o
+OPT_CLEAN_OBJS += passes/opt/opt_clean/inits.o
+OPT_CLEAN_OBJS += passes/opt/opt_clean/opt_clean.o
+# Every object listed above depends on the shared opt_clean.h header.
+$(OPT_CLEAN_OBJS): passes/opt/opt_clean/opt_clean.h
+
+OBJS += $(OPT_CLEAN_OBJS)
diff --git a/passes/opt/opt_clean/cells_all.cc b/passes/opt/opt_clean/cells_all.cc
new file mode 100644
index 000000000..aa97851e3
--- /dev/null
+++ b/passes/opt/opt_clean/cells_all.cc
@@ -0,0 +1,373 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "kernel/ffinit.h"
+#include "kernel/yosys_common.h"
+#include "passes/opt/opt_clean/opt_clean.h"
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+// Hash a SigBit with hash_ops<SigBit> and cast the result to unsigned int.
+unsigned int hash_bit(const SigBit &bit) {
+	return static_cast<unsigned int>(hash_ops<SigBit>::hash(bit).yield());
+}
+// Build a SigMap from the module's connections, adding only the bit pairs whose second-side bit has a wire.
+SigMap wire_sigmap(const RTLIL::Module* mod) {
+	SigMap map;
+	for (auto &it : mod->connections_) {
+		for (int i = 0; i < GetSize(it.second); i++) {
+			if (it.second[i].wire != nullptr) // bits without a wire on the second side are skipped
+				map.add(it.first[i], it.second[i]);
+		}
+	}
+	return map;
+}
+// Forward declaration: WireDriver names WireDrivers as its `Accumulated` type.
+struct WireDrivers;
+// Maps from a SigBit to a unique driver cell.
+struct WireDriver {
+	using Accumulated = WireDrivers;
+	SigBit bit;
+	int driver_cell; // explore() stores the cell's index (as used with Module::cell_at) here
+};
+// Maps from a SigBit to one or more driver cells: the first in `driver_cell`, any others in `extra_driver_cells`.
+struct WireDrivers {
+	WireDrivers() : driver_cell(0) {}
+	WireDrivers(WireDriver driver) : bit(driver.bit), driver_cell(driver.driver_cell) {}
+	WireDrivers(SigBit bit) : bit(bit), driver_cell(0) {} // bit-only value; presumably for building lookup keys -- TODO confirm
+	WireDrivers(WireDrivers &&other) = default;
+	// Forward iterator over cell indices: yields `driver_cell` first, then each entry of `extra_driver_cells` (if any).
+	class const_iterator {
+	public:
+		const_iterator(const WireDrivers &drivers, bool end)
+				: driver_cell(drivers.driver_cell), in_extra_cells(end) {
+			if (drivers.extra_driver_cells) { // extra_it stays empty when there is no extra pool
+				if (end) {
+					extra_it = drivers.extra_driver_cells->end();
+				} else {
+					extra_it = drivers.extra_driver_cells->begin();
+				}
+			}
+		}
+		int operator*() const {
+			if (in_extra_cells)
+				return **extra_it; // requires extra_it to hold a dereferenceable iterator (i.e. not the end position)
+			return driver_cell;
+		}
+		const_iterator& operator++() {
+			if (in_extra_cells)
+				++*extra_it; // advance inside the extra pool
+			else
+				in_extra_cells = true; // step from `driver_cell` to the start of the extra pool (or to end if none)
+			return *this;
+		}
+		bool operator!=(const const_iterator &other) const {
+			return !(*this == other);
+		}
+		bool operator==(const const_iterator &other) const {
+			return in_extra_cells == other.in_extra_cells && // `driver_cell` is not part of the comparison
+				extra_it == other.extra_it;
+		}
+	private:
+		std::optional<pool<int>::iterator> extra_it; // position in the extra pool; empty if the pool does not exist
+		int driver_cell; // copy of the owner's first driver
+		bool in_extra_cells; // false while positioned on `driver_cell`
+	};
+
+	const_iterator begin() const { return const_iterator(*this, false); }
+	const_iterator end() const { return const_iterator(*this, true); }
+
+	SigBit bit;
+	int driver_cell;
+	std::unique_ptr<pool<int>> extra_driver_cells; // null until WireDriversCollisionHandler adds a second driver
+};
+struct WireDriversKeyEquality { // equality on the SigBit key only; driver cells are not compared
+	bool operator()(const WireDrivers &a, const WireDrivers &b) const {
+		return a.bit == b.bit;
+	}
+};
+struct WireDriversCollisionHandler { // folds a colliding entry into the incumbent by recording its driver cell
+	void operator()(WireDrivers &incumbent, WireDrivers &new_value) const {
+		log_assert(new_value.extra_driver_cells == nullptr); // the new value must carry only its single `driver_cell`
+		if (!incumbent.extra_driver_cells)
+			incumbent.extra_driver_cells.reset(new pool<int>()); // allocate the extra pool on first collision
+		incumbent.extra_driver_cells->insert(new_value.driver_cell);
+	}
+};
+using Wire2Drivers = ShardedHashtable<WireDriver, WireDriversKeyEquality, WireDriversCollisionHandler>; // SigBit -> driver cell indices
+// Holds (bit, message) pairs; print_warnings() adds to `used_raw_bits` and warns for each logged bit found in that set.
+struct ConflictLogs {
+	ShardedVector<std::pair<SigBit, std::string>> logs;
+	ConflictLogs(ParallelDispatchThreadPool::Subpool &subpool) : logs(subpool) {}
+	void print_warnings(pool<SigBit>& used_raw_bits, const SigMap& wire_map, const RTLIL::Module* mod, CleanRunContext &clean_ctx) {
+		if (!logs.empty()) { // the scan over all cells below only happens when something was logged
+			// We could do this in parallel but hopefully this is rare.
+			for (auto [_, cell] : mod->cells_) {
+				for (auto &[port, sig] : cell->connections()) {
+					if (clean_ctx.ct_all.cell_known(cell->type) && !clean_ctx.ct_all.cell_input(cell->type, port))
+						continue; // known cell type and not an input port: not counted as a use
+					for (auto raw_bit : wire_map(sig))
+						used_raw_bits.insert(raw_bit);
+				}
+			}
+			for (std::pair<SigBit, std::string> &it : logs) {
+				if (used_raw_bits.count(it.first)) // messages for bits outside the used set are dropped
+					log_warning("%s\n", it.second);
+			}
+		}
+	}
+};
+// Traversal state: a work queue of cell indices plus the lookup tables that explore() prepares.
+struct CellTraversal {
+	ConcurrentWorkQueue<int> queue; // cell indices waiting to be processed
+	Wire2Drivers wire2driver; // assigned by explore() from its builder
+	dict<std::string, pool<int>> mem2cells; // presumably memory id -> indices of its $memwr/$meminit cells -- TODO confirm
+	CellTraversal(int num_threads) : queue(num_threads), wire2driver(), mem2cells() {}
+};
+// Per-module analysis results: the wires that must be kept and one "unused" flag per cell index.
+struct CellAnalysis {
+	ShardedVector<Wire*> keep_wires; // explore() adds output-port wires and wires with the `keep` attribute
+	std::vector<std::atomic<bool>> unused; // indexed by cell index; explore() stores !keep for every cell
+
+	CellAnalysis(AnalysisContext& actx)
+	: keep_wires(actx.subpool), unused(actx.mod->cells_size()) {}
+	// For every kept wire: enqueue its not-yet-used driver cells, and return the wire_map images of all its bits.
+	pool<SigBit> analyze_kept_wires(CellTraversal& traversal, const SigMap& sigmap, const SigMap& wire_map, int num_threads) {
+		// Enqueue cells that drive kept wires into traversal.queue,
+		// mark those cells as used (exchange() makes sure each is pushed at most once),
+		// and record all bits of those wires (mapped through wire_map) as used.
+		pool<SigBit> used_raw_bits;
+		int i = 0;
+		for (Wire *wire : keep_wires) {
+			for (auto bit : sigmap(wire)) {
+				const WireDrivers *drivers = traversal.wire2driver.find({{bit}, hash_bit(bit)});
+				if (drivers != nullptr)
+					for (int cell_index : *drivers)
+						if (unused[cell_index].exchange(false, std::memory_order_relaxed)) {
+							ThreadIndex fake_thread_index = {i++ % num_threads}; // no RunCtx here: spread pushes round-robin over the thread indices
+							traversal.queue.push(fake_thread_index, cell_index);
+						}
+			}
+			for (auto raw_bit : SigSpec(wire))
+				used_raw_bits.insert(wire_map(raw_bit));
+		}
+		return used_raw_bits;
+	}
+	// Clear the cell's unused flag; push it onto `queue` only if the flag was previously set.
+	void mark_used_and_enqueue(int cell_idx, ConcurrentWorkQueue<int>& queue, const ParallelDispatchThreadPool::RunCtx &ctx) {
+		if (unused[cell_idx].exchange(false, std::memory_order_relaxed))
+			queue.push(ctx, cell_idx);
+	}
+};
+
+ConflictLogs explore(CellAnalysis& analysis, CellTraversal& traversal, const SigMap& wire_map, AnalysisContext& actx, CleanRunContext &clean_ctx) {
+	ConflictLogs logs(actx.subpool);
+	Wire2Drivers::Builder wire2driver_builder(actx.subpool);
+	ShardedVector<std::pair<std::string, int>> mem2cells_vector(actx.subpool);
+	// One parallel pass over all cells and wires of the module:
+	//  - push kept cells onto traversal.queue and set analysis.unused[i] = !keep (refined later)
+	//  - collect (driven bit, cell index) pairs, merged below into traversal.wire2driver
+	//  - collect (MEMID, cell index) of memory write/init cells, merged below into traversal.mem2cells
+	//  - record a conflict message for every known-cell output bit that assign_map maps to a constant
+	//  - collect output-port and `keep` wires into analysis.keep_wires
+	actx.subpool.run([&analysis, &traversal, &logs, &wire_map, &mem2cells_vector, &wire2driver_builder, &actx, &clean_ctx](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(actx.mod->cells_size())) {
+			Cell *cell = actx.mod->cell_at(i);
+			if (cell->type.in(ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2)))
+				mem2cells_vector.insert(ctx, {cell->getParam(ID::MEMID).decode_string(), i});
+			const bool known = clean_ctx.ct_all.cell_known(cell->type); // same for every port and bit of this cell
+			for (auto &it2 : cell->connections()) {
+				if (known && !clean_ctx.ct_all.cell_output(cell->type, it2.first))
+					continue;
+				for (auto raw_bit : it2.second) {
+					if (raw_bit.wire == nullptr)
+						continue;
+					auto bit = actx.assign_map(raw_bit);
+					if (bit.wire != nullptr) {
+						wire2driver_builder.insert(ctx, {{bit, i}, hash_bit(bit)});
+					} else if (known) {
+						std::string msg = stringf("Driver-driver conflict "
+							"for %s between cell %s.%s and constant %s in %s: Resolved using constant.",
+							log_signal(raw_bit), cell->name.unescape(), it2.first.unescape(), log_signal(bit), actx.mod->name.unescape());
+						logs.logs.insert(ctx, {wire_map(raw_bit), msg});
+					}
+				}
+			}
+			bool keep = clean_ctx.keep_cache.query(cell);
+			analysis.unused[i].store(!keep, std::memory_order_relaxed);
+			if (keep)
+				traversal.queue.push(ctx, i);
+		}
+		for (int i : ctx.item_range(actx.mod->wires_size())) {
+			Wire *wire = actx.mod->wire_at(i);
+			if (wire->port_output || wire->get_bool_attribute(ID::keep))
+				analysis.keep_wires.insert(ctx, wire);
+		}
+	});
+	// Finish by merging per-thread collected data
+	actx.subpool.run([&wire2driver_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		wire2driver_builder.process(ctx);
+	});
+	traversal.wire2driver = wire2driver_builder;
+
+	for (std::pair<std::string, int> &mem2cell : mem2cells_vector)
+		traversal.mem2cells[mem2cell.first].insert(mem2cell.second);
+
+	return logs;
+}
+
+struct MemAnalysis { // `indices`: memory name -> slot in `unused`; every slot starts out true
+	std::vector<std::atomic<bool>> unused;
+	dict<std::string, int> indices;
+	MemAnalysis(const RTLIL::Module* mod) : unused(mod->memories.size()) {
+		for (int slot = 0; slot < GetSize(mod->memories); ++slot) {
+			unused[slot].store(true, std::memory_order_relaxed);
+			indices[mod->memories.element(slot)->first.str()] = slot;
+		}
+	}
+};
+
+void fixup_unused_cells_and_mems(CellAnalysis& analysis, MemAnalysis& mem_analysis, CellTraversal& traversal, AnalysisContext& actx, CleanRunContext &clean_ctx) {
+	// Drains traversal.queue in batches on the subpool. For every popped cell, the drivers of its input
+	// bits and, for memory read ports, the memory's write/init cells are marked used and enqueued.
+	actx.subpool.run([&analysis, &mem_analysis, &traversal, &actx, &clean_ctx](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		pool<SigBit> bits;
+		pool<std::string> mems;
+		while (true) {
+			std::vector<int> cell_indices = traversal.queue.pop_batch(ctx);
+			if (cell_indices.empty())
+				return;
+			for (auto cell_index : cell_indices) {
+				Cell *cell = actx.mod->cell_at(cell_index);
+				for (auto &it : cell->connections())
+					if (!clean_ctx.ct_all.cell_known(cell->type) || clean_ctx.ct_all.cell_input(cell->type, it.first))
+						for (auto bit : actx.assign_map(it.second))
+							bits.insert(bit);
+
+				if (cell->type.in(ID($memrd), ID($memrd_v2))) {
+					std::string mem_id = cell->getParam(ID::MEMID).decode_string();
+					// One find() instead of count() + operator[]: single lookup, and it can never insert
+					auto mem_it = mem_analysis.indices.find(mem_id);
+					// Memory fixup: only the thread that flips the flag schedules the memory's write/init cells
+					if (mem_it != mem_analysis.indices.end() && mem_analysis.unused[mem_it->second].exchange(false, std::memory_order_relaxed))
+						mems.insert(mem_id);
+				}
+			}
+
+			for (auto bit : bits) {
+				// Cells fixup
+				const WireDrivers *drivers = traversal.wire2driver.find({{bit}, hash_bit(bit)});
+				if (drivers != nullptr)
+					for (int cell_idx : *drivers)
+						analysis.mark_used_and_enqueue(cell_idx, traversal.queue, ctx);
+			}
+			bits.clear();
+
+			for (const std::string &mem : mems) {
+				auto writers = traversal.mem2cells.find(mem);
+				if (writers == traversal.mem2cells.end())
+					continue;
+				// Cells fixup
+				for (int cell_idx : writers->second)
+					analysis.mark_used_and_enqueue(cell_idx, traversal.queue, ctx);
+			}
+			mems.clear();
+		}
+	});
+}
+
+pool<Cell*> all_unused_cells(const Module *mod, const CellAnalysis& analysis, Wire2Drivers& wire2driver, ParallelDispatchThreadPool::Subpool &subpool) {
+	ShardedVector<int> unused_indices(subpool);
+	subpool.run([mod, &analysis, &wire2driver, &unused_indices](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Parallel destruction of `wire2driver`, done in the same run that gathers the cells still flagged unused
+		wire2driver.clear(ctx);
+		for (int cell_idx : ctx.item_range(mod->cells_size()))
+			if (analysis.unused[cell_idx].load(std::memory_order_relaxed))
+				unused_indices.insert(ctx, cell_idx);
+	});
+	pool<Cell*> unused_cells;
+	for (int cell_idx : unused_indices)
+		unused_cells.insert(mod->cell_at(cell_idx));
+	unused_cells.sort(RTLIL::sort_by_name_id<RTLIL::Cell>());
+	return unused_cells;
+}
+
+void remove_cells(RTLIL::Module* mod, FfInitVals& ffinit, const pool<Cell*>& cells, bool verbose, RmStats& stats) {
+	if (!cells.empty()) mod->design->scratchpad_set_bool("opt.did_something", true); // loop-invariant: set once, not per cell
+	for (auto cell : cells) {
+		if (verbose)
+			log_debug("  removing unused `%s' cell `%s'.\n", cell->type, cell->name);
+		if (cell->is_builtin_ff()) // also drop what `ffinit` holds for the flip-flop's Q port
+			ffinit.remove_init(cell->getPort(ID::Q));
+		mod->remove(cell);
+		stats.count_rm_cells++;
+	}
+}
+
+void remove_mems(RTLIL::Module* mod, const MemAnalysis& mem_analysis, bool verbose) {
+	// Deletes every memory whose unused flag is still set and unregisters it from `mod`.
+	for (const auto &[mem_name, mem_index] : mem_analysis.indices)
+		if (mem_analysis.unused[mem_index].load(std::memory_order_relaxed)) {
+			RTLIL::IdString id(mem_name);
+			if (verbose)
+				log_debug("  removing unused memory `%s'.\n", id.unescape());
+			delete mod->memories.at(id);
+			mod->memories.erase(id);
+		}
+}
+
+PRIVATE_NAMESPACE_END
+
+YOSYS_NAMESPACE_BEGIN
+
+void rmunused_module_cells(Module *module, ParallelDispatchThreadPool::Subpool &subpool, CleanRunContext &clean_ctx)
+{
+	AnalysisContext actx(module, subpool);
+
+	// Used for logging warnings only
+	SigMap wire_map = wire_sigmap(module);
+
+	CellAnalysis analysis(actx);
+	CellTraversal traversal(subpool.num_threads());
+	// Mark all unkept cells as unused initially
+	// and queue up cell traversal from the kept cells
+	auto logs = explore(analysis, traversal, wire_map, actx, clean_ctx);
+	// Mark cells that drive kept wires as used, collect the bits of those wires,
+	// and queue up cell traversal from those cells
+	pool<SigBit> used_raw_bits = analysis.analyze_kept_wires(traversal, actx.assign_map, wire_map, subpool.num_threads());
+
+	// Mark all memories as unused initially
+	MemAnalysis mem_analysis(module);
+	// Mark all used cells and mems as used by traversing with the cell queue
+	fixup_unused_cells_and_mems(analysis, mem_analysis, traversal, actx, clean_ctx);
+	// Analyses are now fully correct
+
+	// unused_cells contains a cell iff analysis.unused[its index] == true
+	// wire2driver is passed in only to destroy it
+	pool<Cell*> unused_cells = all_unused_cells(module, analysis, traversal.wire2driver, subpool);
+
+	FfInitVals ffinit;
+	ffinit.set_parallel(&actx.assign_map, subpool.thread_pool(), module);
+	// Now we know what to kill
+	remove_cells(module, ffinit, unused_cells, clean_ctx.flags.verbose, clean_ctx.stats);
+	remove_mems(module, mem_analysis, clean_ctx.flags.verbose);
+	logs.print_warnings(used_raw_bits, wire_map, module, clean_ctx);
+}
+
+YOSYS_NAMESPACE_END
diff --git a/passes/opt/opt_clean/cells_temp.cc b/passes/opt/opt_clean/cells_temp.cc
new file mode 100644
index 000000000..b325b68d9
--- /dev/null
+++ b/passes/opt/opt_clean/cells_temp.cc
@@ -0,0 +1,104 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "passes/opt/opt_clean/opt_clean.h"
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+bool is_signed(RTLIL::Cell* cell) { // true only for a $pos cell whose A_SIGNED parameter is set
+	return cell->type == ID($pos) && cell->getParam(ID::A_SIGNED).as_bool();
+}
+
+// Turns one buffer-like cell into a connection request instead of a cell.
+// A is extended to the width of Y (extend_u0, with is_signed(cell) as the signedness flag) and
+// the pair {Y, A} is queued in `new_connections` unless Y is empty.
+// Returns true if the caller may delete the cell. Returns false, queuing nothing, if the
+// extended A contains a constant z bit: such a cell is left in place.
+bool trim_buf(RTLIL::Cell* cell, ShardedVector<RTLIL::SigSig>& new_connections, const ParallelDispatchThreadPool::RunCtx &ctx) {
+	RTLIL::SigSpec a = cell->getPort(ID::A);
+	RTLIL::SigSpec y = cell->getPort(ID::Y);
+	a.extend_u0(GetSize(y), is_signed(cell));
+
+	// NOTE(review): this check replaces a loop that copied the non-z bits of A and Y but returned
+	// false at the first z bit it met, so the copies could never be used. Observable behaviour is
+	// the same. If z bits were meant to be dropped from the connection rather than to veto the
+	// removal, that would be a behaviour change to make here -- confirm the intent first.
+	if (a.has_const(State::Sz))
+		return false;
+
+	if (!y.empty())
+		new_connections.insert(ctx, {y, a});
+
+	return true;
+}
+
+bool remove(ShardedVector<RTLIL::Cell*>& cells, RTLIL::Module* mod, bool verbose) {
+	// Deletes every listed cell from `mod`; with `verbose`, each one is first described via
+	// log_debug in a format that depends on its type. Returns whether any cell was removed.
+	bool removed_any = false;
+	for (RTLIL::Cell *cell : cells) {
+		if (verbose && cell->type == ID($connect))
+			log_debug("  removing connect cell `%s': %s <-> %s\n", cell->name,
+					log_signal(cell->getPort(ID::A)), log_signal(cell->getPort(ID::B)));
+		else if (verbose && cell->type == ID($input_port))
+			log_debug("  removing input port marker cell `%s': %s\n", cell->name,
+					log_signal(cell->getPort(ID::Y)));
+		else if (verbose)
+			log_debug("  removing buffer cell `%s': %s = %s\n", cell->name,
+					log_signal(cell->getPort(ID::Y)), log_signal(cell->getPort(ID::A)));
+		mod->remove(cell);
+		removed_any = true;
+	}
+	return removed_any;
+}
+PRIVATE_NAMESPACE_END
+YOSYS_NAMESPACE_BEGIN
+
+void remove_temporary_cells(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose)
+{ // A parallel scan queues removable cells and their replacement connections; both are applied serially below.
+	ShardedVector<RTLIL::Cell*> delcells(subpool);
+	ShardedVector<RTLIL::SigSig> new_connections(subpool);
+	const RTLIL::Module *const_module = module;
+	subpool.run([const_module, &delcells, &new_connections](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			const bool is_buffer = cell->type.in(ID($pos), ID($_BUF_), ID($buf));
+			const bool is_connect = !is_buffer && cell->type.in(ID($connect));
+			if (!(is_buffer || is_connect || cell->type.in(ID($input_port))) || cell->has_keep_attr())
+				continue; // not one of the temporary cell types, or it has the keep attribute
+			if (is_connect) {
+				RTLIL::SigSpec a = cell->getPort(ID::A);
+				RTLIL::SigSpec b = cell->getPort(ID::B);
+				if (a.has_const() && !b.has_const())
+					std::swap(a, b);
+				new_connections.insert(ctx, {a, b});
+			} else if (is_buffer && !trim_buf(cell, new_connections, ctx)) {
+				continue; // trim_buf declined, so the buffer stays in the module
+			}
+			delcells.insert(ctx, cell);
+		}
+	});
+	for (RTLIL::SigSig &connection : new_connections)
+		module->connect(connection);
+	if (remove(delcells, module, verbose))
+		module->design->scratchpad_set_bool("opt.did_something", true);
+}
+
+YOSYS_NAMESPACE_END
diff --git a/passes/opt/opt_clean/inits.cc b/passes/opt/opt_clean/inits.cc
new file mode 100644
index 000000000..70c2ef9e2
--- /dev/null
+++ b/passes/opt/opt_clean/inits.cc
@@ -0,0 +1,137 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "passes/opt/opt_clean/opt_clean.h"
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+// Collects, in parallel over all cells, one (bit, init state) pair for every Q-port bit of a cell
+// accepted by StaticCellTypes::Compat::internals_mem_ff() whose wire carries an `init` attribute
+// with a defined (neither x nor z) value at that position.
+ShardedVector<std::pair<SigBit, State>> build_inits(AnalysisContext& actx) {
+	ShardedVector<std::pair<SigBit, State>> results(actx.subpool);
+	actx.subpool.run([&results, &actx](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int cell_idx : ctx.item_range(actx.mod->cells_size())) {
+			RTLIL::Cell *cell = actx.mod->cell_at(cell_idx);
+			if (!StaticCellTypes::Compat::internals_mem_ff(cell->type) || !cell->hasPort(ID::Q))
+				continue;
+
+			SigSpec sig = cell->getPort(ID::Q);
+			for (int pos = 0; pos < GetSize(sig); pos++) {
+				SigBit bit = sig[pos];
+				if (bit.wire == nullptr || bit.wire->attributes.count(ID::init) == 0)
+					continue;
+
+				// NOTE(review): `init` is indexed by the position within the Q port, not by bit.offset -- confirm this is intended.
+				Const init = bit.wire->attributes.at(ID::init);
+				if (pos >= GetSize(init) || init[pos] == State::Sx || init[pos] == State::Sz)
+					continue;
+
+				results.insert(ctx, {bit, init[pos]});
+			}
+		}
+	});
+	return results;
+}
+
+dict<SigBit, State> qbits_from_inits(ShardedVector<std::pair<SigBit, State>>& inits, SigMap& assign_map) {
+	dict<SigBit, State> qbits; // bit -> init state, one entry per collected pair
+	for (auto &[q_bit, init_state] : inits) {
+		assign_map.add(q_bit); // every collected bit is also add()ed to `assign_map`
+		qbits[q_bit] = init_state;
+	}
+	return qbits;
+}
+
+ShardedVector<RTLIL::Wire*> deferred_init_transfer(const dict<SigBit, State>& qbits, AnalysisContext& actx) {
+	ShardedVector<RTLIL::Wire*> wire_results(actx.subpool); // wires whose `init` attribute the caller goes on to erase
+	actx.subpool.run([&actx, &qbits, &wire_results](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int j : ctx.item_range(actx.mod->wires_size())) {
+			RTLIL::Wire *wire = actx.mod->wire_at(j);
+			if (wire->attributes.count(ID::init) == 0)
+				continue;
+			Const init = wire->attributes.at(ID::init);
+			// Every defined init bit has to pass the checks below; the first one that fails rejects the wire.
+			for (int i = 0; i < GetSize(wire) && i < GetSize(init); i++)
+			{
+				if (init[i] == State::Sx || init[i] == State::Sz)
+					continue;
+
+				SigBit wire_bit = SigBit(wire, i);
+				SigBit mapped_wire_bit = actx.assign_map(wire_bit);
+				// Unchanged by assign_map: the bit is its own representative, so the wire is rejected.
+				if (wire_bit == mapped_wire_bit)
+					goto next_wire;
+
+				if (mapped_wire_bit.wire) {
+					if (qbits.count(mapped_wire_bit) == 0)
+						goto next_wire;
+
+					if (qbits.at(mapped_wire_bit) != init[i])
+						goto next_wire;
+				}
+				else {
+					if (mapped_wire_bit == State::Sx || mapped_wire_bit == State::Sz)
+						goto next_wire;
+
+					if (mapped_wire_bit != init[i]) {
+						log_warning("Initial value conflict for %s resolving to %s but with init %s.\n", log_signal(wire_bit), log_signal(mapped_wire_bit), log_signal(init[i]));
+						goto next_wire;
+					}
+				}
+			}
+			wire_results.insert(ctx, wire);
+			// Target of the rejections above: it skips the insert, so a rejected wire is not reported.
+			next_wire:;
+		}
+	});
+	return wire_results;
+}
+
+bool remove_redundant_inits(ShardedVector<RTLIL::Wire*>& wires, bool verbose) { // by reference: the list is only iterated, no copy needed
+	bool did_something = false; // becomes true once at least one `init` attribute was erased
+	for (RTLIL::Wire *wire : wires) {
+		if (verbose)
+			log_debug("  removing redundant init attribute on %s.\n", log_id(wire));
+		wire->attributes.erase(ID::init);
+		did_something = true;
+	}
+	return did_something;
+}
+
+PRIVATE_NAMESPACE_END
+YOSYS_NAMESPACE_BEGIN
+
+bool rmunused_module_init(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose)
+{
+	// Collects the wires whose `init` attribute is redundant and erases it; returns whether any was erased.
+	AnalysisContext actx(module, subpool);
+	ShardedVector<std::pair<SigBit, State>> q_inits = build_inits(actx);
+	dict<SigBit, State> qbits = qbits_from_inits(q_inits, actx.assign_map);
+	ShardedVector<RTLIL::Wire*> redundant_wires = deferred_init_transfer(qbits, actx);
+
+	if (!remove_redundant_inits(redundant_wires, verbose))
+		return false;
+	// Record in the design scratchpad that this pass changed something.
+	module->design->scratchpad_set_bool("opt.did_something", true);
+	return true;
+}
+
+YOSYS_NAMESPACE_END
diff --git a/passes/opt/opt_clean/keep_cache.h b/passes/opt/opt_clean/keep_cache.h
new file mode 100644
index 000000000..7405f24fc
--- /dev/null
+++ b/passes/opt/opt_clean/keep_cache.h
@@ -0,0 +1,167 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "kernel/rtlil.h"
+#include "kernel/sigtools.h"
+#include "kernel/threading.h"
+#include "kernel/celltypes.h"
+#include "kernel/yosys_common.h"
+
+#ifndef OPT_CLEAN_KEEP_CACHE_H
+#define OPT_CLEAN_KEEP_CACHE_H
+
+YOSYS_NAMESPACE_BEGIN
+
+struct KeepCache
+{
+	dict<Module*, bool> keep_modules; // scanned module -> whether cells instantiating it must be kept (see query())
+	bool purge_mode; // passed to keep_cell(): when set, $scopeinfo cells are not kept for their own sake
+
+	KeepCache(bool purge_mode, ParallelDispatchThreadPool &thread_pool, const std::vector<RTLIL::Module *> &selected_modules)
+			: purge_mode(purge_mode) {
+		// Phase 1: scan every selected module completely, queueing the modules its cells instantiate.
+		std::vector<RTLIL::Module *> scan_modules_worklist;
+		dict<RTLIL::Module *, std::vector<RTLIL::Module*>> dependents;
+		std::vector<RTLIL::Module *> propagate_kept_modules_worklist;
+		for (RTLIL::Module *module : selected_modules) {
+			if (keep_modules.count(module))
+				continue;
+			bool keep = scan_module(module, thread_pool, dependents, ALL_CELLS, scan_modules_worklist);
+			keep_modules[module] = keep;
+			if (keep)
+				propagate_kept_modules_worklist.push_back(module);
+		}
+		// Phase 2: scan the queued (transitively instantiated) modules; these scans may stop early.
+		while (!scan_modules_worklist.empty()) {
+			RTLIL::Module *module = scan_modules_worklist.back();
+			scan_modules_worklist.pop_back();
+			if (keep_modules.count(module))
+				continue;
+			bool keep = scan_module(module, thread_pool, dependents, MINIMUM_CELLS, scan_modules_worklist);
+			keep_modules[module] = keep;
+			if (keep)
+				propagate_kept_modules_worklist.push_back(module);
+		}
+		// Phase 3: dependents[m] lists the modules that instantiate m; a kept m makes all of them kept too.
+		while (!propagate_kept_modules_worklist.empty()) {
+			RTLIL::Module *module = propagate_kept_modules_worklist.back();
+			propagate_kept_modules_worklist.pop_back();
+			for (RTLIL::Module *dependent : dependents[module]) {
+				if (keep_modules[dependent])
+					continue;
+				keep_modules[dependent] = true;
+				propagate_kept_modules_worklist.push_back(dependent);
+			}
+		}
+	}
+	// True if `cell` must not be removed: keep_cell() says so, it is a specify cell, or it instantiates a kept module.
+	bool query(Cell *cell) const
+	{
+		if (keep_cell(cell, purge_mode))
+			return true;
+		if (cell->type.in(ID($specify2), ID($specify3), ID($specrule)))
+			return true;
+		if (cell->module && cell->module->design) {
+			RTLIL::Module *cell_module = cell->module->design->module(cell->type);
+			return cell_module != nullptr && keep_modules.at(cell_module);
+		}
+		return false;
+	}
+
+private:
+	enum ScanCells {
+		// Scan every cell to see if it uses a module that is kept.
+		ALL_CELLS,
+		// Stop scanning cells if we determine early that this module is kept.
+		MINIMUM_CELLS,
+	};
+	bool scan_module(Module *module, ParallelDispatchThreadPool &thread_pool, dict<RTLIL::Module *, std::vector<RTLIL::Module*>> &dependents,
+			ScanCells scan_cells, std::vector<Module*> &worklist) const
+	{
+		MonotonicFlag keep_module;
+		if (module->get_bool_attribute(ID::keep)) {
+			if (scan_cells == MINIMUM_CELLS)
+				return true;
+			keep_module.set();
+		}
+		// Cells and wires are scanned on a subpool sized from the module's cell count.
+		ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+		ShardedVector<Module*> deps(subpool);
+		const RTLIL::Module *const_module = module;
+		bool purge_mode = this->purge_mode;
+		subpool.run([purge_mode, const_module, scan_cells, &deps, &keep_module](const ParallelDispatchThreadPool::RunCtx &ctx) {
+			bool keep = false;
+			for (int i : ctx.item_range(const_module->cells_size())) {
+				Cell *cell = const_module->cell_at(i);
+				if (keep_cell(cell, purge_mode)) {
+					if (scan_cells == MINIMUM_CELLS) {
+						keep_module.set();
+						return;
+					}
+					keep = true;
+				}
+				if (const_module->design) {
+					RTLIL::Module *cell_module = const_module->design->module(cell->type);
+					if (cell_module != nullptr)
+						deps.insert(ctx, cell_module);
+				}
+			}
+			if (keep) {
+				keep_module.set();
+				return;
+			}
+			for (int i : ctx.item_range(const_module->wires_size())) {
+				Wire *wire = const_module->wire_at(i);
+				if (wire->get_bool_attribute(ID::keep)) {
+					keep_module.set();
+					return;
+				}
+			}
+		});
+		if (scan_cells == MINIMUM_CELLS && keep_module.load())
+			return true;
+		for (Module *dep : deps) {
+			dependents[dep].push_back(module);
+			worklist.push_back(dep);
+		}
+		return keep_module.load();
+	}
+	// Keep criteria that depend on the cell alone (its type and its keep attribute).
+	static bool keep_cell(Cell *cell, bool purge_mode)
+	{
+		if (cell->type.in(ID($assert), ID($assume), ID($live), ID($fair), ID($cover)))
+			return true;
+
+		if (cell->type.in(ID($overwrite_tag)))
+			return true;
+
+		if (cell->type == ID($print) || cell->type == ID($check))
+			return true;
+
+		if (cell->has_keep_attr())
+			return true;
+
+		if (!purge_mode && cell->type == ID($scopeinfo))
+			return true;
+		return false;
+	}
+};
+
+YOSYS_NAMESPACE_END
+#endif /* OPT_CLEAN_KEEP_CACHE_H */
diff --git a/passes/opt/opt_clean/opt_clean.cc b/passes/opt/opt_clean/opt_clean.cc
new file mode 100644
index 000000000..256257f6b
--- /dev/null
+++ b/passes/opt/opt_clean/opt_clean.cc
@@ -0,0 +1,152 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "kernel/register.h"
+#include "kernel/log.h"
+#include "passes/opt/opt_clean/opt_clean.h"
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+void rmunused_module(RTLIL::Module *module, bool rminit, CleanRunContext &clean_ctx)
+{
+	if (clean_ctx.flags.verbose)
+		log("Finding unused cells or wires in module %s..\n", module->name);
+	// Use no more than one worker per thousand cells, rounded down, so
+	// we only start multithreading with at least 2000 cells.
+	ParallelDispatchThreadPool::Subpool subpool(clean_ctx.thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+	// Signal cleanup is rerun for as long as rmunused_module_signals() returns true.
+	auto sweep_signals = [&] { while (rmunused_module_signals(module, subpool, clean_ctx)) { } };
+
+	remove_temporary_cells(module, subpool, clean_ctx.flags.verbose);
+	rmunused_module_cells(module, subpool, clean_ctx);
+	sweep_signals();
+	if (rminit && rmunused_module_init(module, subpool, clean_ctx.flags.verbose))
+		sweep_signals();
+}
+struct OptCleanPass : public Pass {
+	OptCleanPass() : Pass("opt_clean", "remove unused cells and wires") { }
+	void help() override
+	{
+		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
+		log("\n");
+		log("    opt_clean [options] [selection]\n");
+		log("\n");
+		log("This pass identifies wires and cells that are unused and removes them. Other\n");
+		log("passes often remove cells but leave the wires in the design or reconnect the\n");
+		log("wires but leave the old cells in the design. This pass can be used to clean up\n");
+		log("after the passes that do the actual work.\n");
+		log("\n");
+		log("This pass only operates on completely selected modules without processes.\n");
+		log("\n");
+		log("    -purge\n");
+		log("        also remove internal nets if they have a public name\n");
+		log("\n");
+	}
+	void execute(std::vector<std::string> args, RTLIL::Design *design) override
+	{
+		bool purge_mode = false;
+
+		log_header(design, "Executing OPT_CLEAN pass (remove unused cells and wires).\n");
+		log_push();
+
+		size_t argidx;
+		for (argidx = 1; argidx < args.size(); argidx++) {
+			if (args[argidx] == "-purge") {
+				purge_mode = true;
+				continue;
+			}
+			break;
+		}
+		extra_args(args, argidx, design);
+
+		// The module list is collected here with the *_warn query variants instead of
+		// being taken from clean_ctx.selected_modules, which is built without them.
+		std::vector<RTLIL::Module*> selected_modules;
+		for (auto module : design->selected_whole_modules_warn())
+			if (!module->has_processes_warn())
+				selected_modules.push_back(module);
+
+		// CleanRunContext creates its own thread pool and KeepCache, and those are the
+		// ones every callee uses. No separate pool or cache is built here: doing so
+		// would only repeat the keep scan and start a second set of worker threads.
+		{
+			CleanRunContext clean_ctx(design, {purge_mode, true});
+			for (auto module : selected_modules)
+				rmunused_module(module, true, clean_ctx);
+			clean_ctx.stats.log();
+
+			design->optimize();
+			design->check();
+		}
+
+		log_pop();
+
+		request_garbage_collection();
+	}
+} OptCleanPass;
+
+// The `clean` command: runs the same per-module cleanup as opt_clean, with the verbose flag
+// taken from ys_debug() and without the log header and log_push()/log_pop() bracket.
+struct CleanPass : public Pass {
+	CleanPass() : Pass("clean", "remove unused cells and wires") { }
+	void help() override
+	{
+		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
+		log("\n");
+		log("    clean [options] [selection]\n");
+		log("\n");
+		log("This is identical to 'opt_clean', but less verbose.\n");
+		log("\n");
+		log("When commands are separated using the ';;' token, this command will be executed\n");
+		log("between the commands.\n");
+		log("\n");
+		log("When commands are separated using the ';;;' token, this command will be executed\n");
+		log("in -purge mode between the commands.\n");
+		log("\n");
+	}
+	void execute(std::vector<std::string> args, RTLIL::Design *design) override
+	{
+		// Consume leading "-purge" flags; whatever follows is handed to extra_args().
+		bool purge_mode = false;
+		size_t argidx = 1;
+		for (; argidx < args.size() && args[argidx] == "-purge"; argidx++)
+			purge_mode = true;
+		extra_args(args, argidx, design);
+
+		// The context lives in its own scope, so it is destroyed before request_garbage_collection().
+		{
+			CleanRunContext clean_ctx(design, {purge_mode, ys_debug()});
+			for (auto module : clean_ctx.selected_modules)
+				rmunused_module(module, true, clean_ctx);
+
+			log_suppressed();
+			clean_ctx.stats.log();
+
+			design->optimize();
+			design->check();
+		}
+
+		request_garbage_collection();
+	}
+} CleanPass;
+
+PRIVATE_NAMESPACE_END
diff --git a/passes/opt/opt_clean/opt_clean.h b/passes/opt/opt_clean/opt_clean.h
new file mode 100644
index 000000000..7599f25e8
--- /dev/null
+++ b/passes/opt/opt_clean/opt_clean.h
@@ -0,0 +1,103 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "kernel/rtlil.h"
+#include "kernel/threading.h"
+#include "passes/opt/opt_clean/keep_cache.h"
+
+#ifndef OPT_CLEAN_SHARED_H
+#define OPT_CLEAN_SHARED_H
+
+YOSYS_NAMESPACE_BEGIN
+
+struct AnalysisContext { // per-module bundle handed to the opt_clean analysis helpers
+	SigMap assign_map; // constructed from the module passed to the constructor
+	const RTLIL::Module *mod; // the same module, held through a pointer-to-const
+	ParallelDispatchThreadPool::Subpool &subpool; // stored by reference: the subpool must outlive this context
+	AnalysisContext(RTLIL::Module* m, ParallelDispatchThreadPool::Subpool &p) : assign_map(m), mod(m), subpool(p) {}
+};
+
+struct RmStats {
+	int count_rm_cells = 0;
+	int count_rm_wires = 0;
+	// Prints the summary line, but only when at least one of the counters is positive.
+	void log() {
+		if (count_rm_cells <= 0 && count_rm_wires <= 0)
+			return;
+		YOSYS_NAMESPACE_PREFIX log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires);
+	}
+};
+
+struct Flags { // options of one clean run; aggregate-initialised as {purge, verbose}
+	bool purge = false; // handed to KeepCache as its purge_mode
+	bool verbose = false; // enables the per-item log output of the helpers
+};
+struct CleanRunContext {
+	static constexpr auto ct_reg = StaticCellTypes::Categories::join(
+		StaticCellTypes::Compat::mem_ff,
+		StaticCellTypes::categories.is_anyinit);
+	NewCellTypes ct_all;
+	RmStats stats;
+	// Declaration order matters: keep_cache is initialized from thread_pool and selected_modules.
+	ParallelDispatchThreadPool thread_pool;
+	std::vector<RTLIL::Module*> selected_modules;
+	KeepCache keep_cache;
+	Flags flags;
+
+private:
+	// Modules from design->selected_unboxed_whole_modules() that have no processes.
+	static std::vector<RTLIL::Module*> get_selected_modules(RTLIL::Design* design) {
+		std::vector<RTLIL::Module*> modules;
+		for (auto module : design->selected_unboxed_whole_modules()) {
+			if (module->has_processes())
+				continue;
+			modules.push_back(module);
+		}
+		return modules;
+	}
+
+	// Largest ThreadPool::work_pool_size(0, cells_size, 1000) result over those modules.
+	static int compute_thread_pool_size(RTLIL::Design* design) {
+		int thread_pool_size = 0;
+		for (RTLIL::Module *module : get_selected_modules(design))
+			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+		return thread_pool_size;
+	}
+
+public:
+	CleanRunContext(RTLIL::Design* design, Flags f)
+		: thread_pool(compute_thread_pool_size(design)),
+		selected_modules(get_selected_modules(design)),
+		keep_cache(f.purge, thread_pool, selected_modules),
+		flags(f)
+	{
+		ct_all.setup(design);
+	}
+
+	~CleanRunContext() { ct_all.clear(); }
+};
+
+void remove_temporary_cells(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose);
+void rmunused_module_cells(Module *module, ParallelDispatchThreadPool::Subpool &subpool, CleanRunContext &clean_ctx);
+bool rmunused_module_signals(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, CleanRunContext &clean_ctx);
+bool rmunused_module_init(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose);
+
+YOSYS_NAMESPACE_END
+
+#endif /* OPT_CLEAN_SHARED_H */
diff --git a/passes/opt/opt_clean/wires.cc b/passes/opt/opt_clean/wires.cc
new file mode 100644
index 000000000..d7ca215c4
--- /dev/null
+++ b/passes/opt/opt_clean/wires.cc
@@ -0,0 +1,585 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "passes/opt/opt_clean/opt_clean.h"
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+// No collision handler for these, since we will use them such that collisions don't happen
+struct ShardedSigBit {
+	using Accumulated = ShardedSigBit;
+	RTLIL::SigBit bit;
+	ShardedSigBit() = default;
+	ShardedSigBit(const RTLIL::SigBit &bit) : bit(bit) {}
+};
+struct ShardedSigBitEquality {
+	bool operator()(const ShardedSigBit &b1, const ShardedSigBit &b2) const {
+		return b1.bit == b2.bit;
+	}
+};
+using ShardedSigPool = ShardedHashtable<ShardedSigBit, ShardedSigBitEquality, SetCollisionHandler<ShardedSigBit>>;
+
+struct ShardedSigSpec {
+	using Accumulated = ShardedSigSpec;
+	RTLIL::SigSpec spec;
+	ShardedSigSpec() = default;
+	ShardedSigSpec(RTLIL::SigSpec spec) : spec(std::move(spec)) {}
+	ShardedSigSpec(ShardedSigSpec &&) = default;
+};
+struct ShardedSigSpecEquality {
+	bool operator()(const ShardedSigSpec &s1, const ShardedSigSpec &s2) const {
+		return s1.spec == s2.spec;
+	}
+};
+using ShardedSigSpecPool = ShardedHashtable<ShardedSigSpec, ShardedSigSpecEquality, SetCollisionHandler<ShardedSigSpec>>;
+
+struct ExactCellWires {
+	const ShardedSigSpecPool &exact_cells;
+	const SigMap &assign_map;
+	dict<RTLIL::Wire *, bool> cache;
+
+	ExactCellWires(const ShardedSigSpecPool &exact_cells, const SigMap &assign_map) : exact_cells(exact_cells), assign_map(assign_map) {}
+	void cache_result_for_bit(const SigBit &bit) {
+		if (bit.wire != nullptr)
+			(void)is_exactly_cell_driven(bit.wire);
+	}
+	bool is_exactly_cell_driven(RTLIL::Wire *wire) {
+		if (wire->port_input)
+			return true;
+		auto it = cache.find(wire);
+		if (it != cache.end())
+			return it->second;
+		SigSpec sig = assign_map(wire);
+		bool direct = exact_cells.find({sig, sig.hash_into(Hasher()).yield()}) != nullptr;
+		cache.insert({wire, direct});
+		return direct;
+	}
+	void cache_all(ShardedVector<RTLIL::SigBit> &bits) {
+		for (RTLIL::SigBit candidate : bits) {
+			cache_result_for_bit(candidate);
+			cache_result_for_bit(assign_map(candidate));
+		}
+
+	}
+};
+
+int count_nontrivial_wire_attrs(RTLIL::Wire *w)
+{
+	int count = w->attributes.size();
+	count -= w->attributes.count(ID::src);
+	count -= w->attributes.count(ID::hdlname);
+	count -= w->attributes.count(ID::scopename);
+	count -= w->attributes.count(ID::unused_bits);
+	return count;
+}
+
+// Should we pick `s2` over `s1` to represent a signal?
+bool compare_signals(const RTLIL::SigBit &s1, const RTLIL::SigBit &s2, const ShardedSigPool &regs, const ShardedSigPool &conns, ExactCellWires &cell_wires)
+{
+	if (s1 == s2)
+		return false;
+
+	RTLIL::Wire *w1 = s1.wire;
+	RTLIL::Wire *w2 = s2.wire;
+
+	if (w1 == NULL || w2 == NULL)
+		return w2 == NULL;
+
+	if (w1->port_input != w2->port_input)
+		return w2->port_input;
+
+	if ((w1->port_input && w1->port_output) != (w2->port_input && w2->port_output))
+		return !(w2->port_input && w2->port_output);
+
+	if (w1->name.isPublic() && w2->name.isPublic()) {
+		ShardedSigPool::AccumulatedValue s1_val = {s1, s1.hash_top().yield()};
+		ShardedSigPool::AccumulatedValue s2_val = {s2, s2.hash_top().yield()};
+		bool regs1 = regs.find(s1_val) != nullptr;
+		bool regs2 = regs.find(s2_val) != nullptr;
+		if (regs1 != regs2)
+			return regs2;
+		bool w1_exact = cell_wires.is_exactly_cell_driven(w1);
+		bool w2_exact = cell_wires.is_exactly_cell_driven(w2);
+		if (w1_exact != w2_exact)
+			return w2_exact;
+		bool conns1 = conns.find(s1_val) != nullptr;
+		bool conns2 = conns.find(s2_val) != nullptr;
+		if (conns1 != conns2)
+			return conns2;
+	}
+
+	if (w1 == w2)
+		return s2.offset < s1.offset;
+
+	if (w1->port_output != w2->port_output)
+		return w2->port_output;
+
+	if (w1->name[0] != w2->name[0])
+		return w2->name.isPublic();
+
+	int attrs1 = count_nontrivial_wire_attrs(w1);
+	int attrs2 = count_nontrivial_wire_attrs(w2);
+
+	if (attrs1 != attrs2)
+		return attrs2 > attrs1;
+
+	return w2->name.lt_by_name(w1->name);
+}
+
+bool check_public_name(RTLIL::IdString id)
+{
+	if (id.begins_with("$"))
+		return false;
+	const std::string &id_str = id.str();
+	if (id.begins_with("\\_") && (id.ends_with("_") || id_str.find("_[") != std::string::npos))
+		return false;
+	if (id_str.find(".$") != std::string::npos)
+		return false;
+	return true;
+}
+
+void add_spec(ShardedSigPool::Builder &builder, const ThreadIndex &thread, const RTLIL::SigSpec &spec) {
+	for (SigBit bit : spec)
+		if (bit.wire != nullptr)
+			builder.insert(thread, {bit, bit.hash_top().yield()});
+}
+
+bool check_any(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) {
+	for (SigBit b : spec)
+		if (sigs.find({b, b.hash_top().yield()}) != nullptr)
+			return true;
+	return false;
+}
+
+bool check_all(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) {
+	for (SigBit b : spec)
+		if (sigs.find({b, b.hash_top().yield()}) == nullptr)
+			return false;
+	return true;
+}
+
+struct UpdateConnection {
+	RTLIL::Cell *cell;
+	RTLIL::IdString port;
+	RTLIL::SigSpec spec;
+};
+void fixup_cell_ports(ShardedVector<UpdateConnection> &update_connections)
+{
+	for (UpdateConnection &update : update_connections)
+		update.cell->connections_.at(update.port) = std::move(update.spec);
+}
+
+struct InitBits {
+	dict<SigBit, RTLIL::State> values;
+	// Wires that appear in the keys of the `values` dict
+	pool<Wire*> wires;
+
+	// Set init attributes on all wires of a connected group
+	void apply_normalised_inits() {
+		for (RTLIL::Wire *wire : wires) {
+			bool found = false;
+			Const val(State::Sx, wire->width);
+			for (int i = 0; i < wire->width; i++) {
+				auto it = values.find(RTLIL::SigBit(wire, i));
+				if (it != values.end()) {
+					val.set(i, it->second);
+					found = true;
+				}
+			}
+			if (found)
+				wire->attributes[ID::init] = val;
+		}
+	}
+};
+static InitBits consume_inits(ShardedVector<RTLIL::Wire*> &initialized_wires, const SigMap &assign_map)
+{
+	InitBits init_bits;
+	for (RTLIL::Wire *initialized_wire : initialized_wires) {
+		auto it = initialized_wire->attributes.find(ID::init);
+		RTLIL::Const &val = it->second;
+		SigSpec sig = assign_map(initialized_wire);
+		for (int i = 0; i < GetSize(val) && i < GetSize(sig); i++)
+			if (val[i] != State::Sx && sig[i].wire != nullptr) {
+				init_bits.values[sig[i]] = val[i];
+				init_bits.wires.insert(sig[i].wire);
+			}
+		initialized_wire->attributes.erase(it);
+	}
+	return init_bits;
+}
+
+/**
+ * What kinds of things are signals connected to?
+ * Helps pick representatives out of groups of connected signals */
+struct SigConnKinds {
+	// Wire bits directly driven by registers (with clk2fflogic exception)
+	ShardedSigPool raw_registers;
+	// Wire bits directly connected to any cell port
+	ShardedSigPool raw_cell_connected;
+
+	// Signals exactly driven by a known cell output,
+	// this will influence only our choice of representatives.
+	// A signal is exactly driven by a cell output iff all its bits are driven by this output
+	// and all bits of this output drive a bit of this signal.
+	// Additionally, all signals that sigmap to this signal are exactly driven by the port, too
+	ShardedSigSpecPool exact_cells;
+
+	SigConnKinds(bool purge_mode, const AnalysisContext& actx, CleanRunContext& clean_ctx) {
+		ShardedSigPool::Builder raw_register_builder(actx.subpool);
+		ShardedSigPool::Builder raw_cell_connected_builder(actx.subpool);
+		ShardedSigSpecPool::Builder exact_cell_output_builder(actx.subpool);
+		actx.subpool.run([&exact_cell_output_builder, &raw_register_builder, &raw_cell_connected_builder, purge_mode, &actx, &clean_ctx](const ParallelDispatchThreadPool::RunCtx &ctx) {
+
+			for (int i : ctx.item_range(actx.mod->cells_size())) {
+				RTLIL::Cell *cell = actx.mod->cell_at(i);
+				if (!purge_mode) {
+					if (clean_ctx.ct_reg(cell->type)) {
+						// Improve witness signal naming when clk2fflogic used
+						// see commit message e36c71b5
+						bool clk2fflogic = cell->get_bool_attribute(ID::clk2fflogic);
+						for (auto &[port, sig] : cell->connections())
+							if (clk2fflogic ? port == ID::D : clean_ctx.ct_all.cell_output(cell->type, port))
+								add_spec(raw_register_builder, ctx, sig);
+					}
+					for (auto &[_, sig] : cell->connections())
+						add_spec(raw_cell_connected_builder, ctx, sig);
+				}
+				if (clean_ctx.ct_all.cell_known(cell->type))
+					for (auto &[port, sig] : cell->connections())
+						if (clean_ctx.ct_all.cell_output(cell->type, port)) {
+							RTLIL::SigSpec spec = actx.assign_map(sig);
+							unsigned int hash = spec.hash_into(Hasher()).yield();
+							exact_cell_output_builder.insert(ctx, {std::move(spec), hash});
+						}
+			}
+		});
+		actx.subpool.run([&raw_register_builder, &raw_cell_connected_builder, &exact_cell_output_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+			raw_register_builder.process(ctx);
+			raw_cell_connected_builder.process(ctx);
+			exact_cell_output_builder.process(ctx);
+		});
+		raw_registers = raw_register_builder;
+		raw_cell_connected = raw_cell_connected_builder;
+		exact_cells = exact_cell_output_builder;
+	}
+	void clear(const ParallelDispatchThreadPool::RunCtx &ctx) {
+		raw_registers.clear(ctx);
+		raw_cell_connected.clear(ctx);
+		exact_cells.clear(ctx);
+	}
+};
+
+ShardedVector<RTLIL::SigBit> build_candidates(ExactCellWires& cell_wires, const SigConnKinds& sig_analysis, const AnalysisContext& actx) {
+	ShardedVector<RTLIL::SigBit> candidates(actx.subpool);
+	actx.subpool.run([&actx, &sig_analysis, &candidates, &cell_wires](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		std::optional<ExactCellWires> local_cell_wires;
+		ExactCellWires *this_thread_cell_wires = &cell_wires;
+		if (ctx.thread_num > 0) {
+			local_cell_wires.emplace(sig_analysis.exact_cells, actx.assign_map);
+			this_thread_cell_wires = &local_cell_wires.value();
+		}
+		for (int i : ctx.item_range(actx.mod->wires_size())) {
+			RTLIL::Wire *wire = actx.mod->wire_at(i);
+			for (int j = 0; j < wire->width; ++j) {
+				RTLIL::SigBit s1(wire, j);
+				RTLIL::SigBit s2 = actx.assign_map(s1);
+				if (compare_signals(s2, s1, sig_analysis.raw_registers, sig_analysis.raw_cell_connected, *this_thread_cell_wires))
+					candidates.insert(ctx, s1);
+			}
+		}
+	});
+	return candidates;
+}
+
+void update_assign_map(SigMap& assign_map, ShardedVector<RTLIL::SigBit>& sigmap_canonical_candidates, ExactCellWires& cell_wires, const SigConnKinds& sig_analysis) {
+	for (RTLIL::SigBit candidate : sigmap_canonical_candidates) {
+		RTLIL::SigBit current_canonical = assign_map(candidate);
+		// Resolves if two threads in build_candidates found different candidates
+		// for the same set
+		// TODO adds effort for single-threaded?
+		if (compare_signals(current_canonical, candidate, sig_analysis.raw_registers, sig_analysis.raw_cell_connected, cell_wires))
+			assign_map.add(candidate);
+	}
+}
+
+struct DeferredUpdates {
+	// Deferred updates to cell port connections, re-mapped through the assign_map
+	ShardedVector<UpdateConnection> update_connections;
+	// Wires we should remove init from
+	ShardedVector<RTLIL::Wire*> initialized_wires;
+	DeferredUpdates(ParallelDispatchThreadPool::Subpool &subpool) : update_connections(subpool), initialized_wires(subpool) {}
+};
+struct UsedSignals {
+	// here, "connected" means "driven or driving something"
+	// meanwhile, "used" means "driving something"
+	// sigmapped
+	ShardedSigPool connected;
+	// pre-sigmapped
+	ShardedSigPool raw_connected;
+	// sigmapped
+	ShardedSigPool used;
+
+	void clear(ParallelDispatchThreadPool::Subpool &subpool) {
+		subpool.run([this](const ParallelDispatchThreadPool::RunCtx &ctx) {
+			connected.clear(ctx);
+			raw_connected.clear(ctx);
+			used.clear(ctx);
+		});
+	}
+};
+
+DeferredUpdates analyse_connectivity(UsedSignals& used, SigConnKinds& sig_analysis, const AnalysisContext& actx, CleanRunContext &clean_ctx) {
+	DeferredUpdates deferred(actx.subpool);
+	ShardedSigPool::Builder conn_builder(actx.subpool);
+	ShardedSigPool::Builder raw_conn_builder(actx.subpool);
+	ShardedSigPool::Builder used_builder(actx.subpool);
+
+	// gather the usage information for cells and update cell connections with the altered sigmap
+	// also gather the usage information for ports, wires with `keep`
+	// also gather init bits
+	actx.subpool.run([&deferred, &conn_builder, &raw_conn_builder, &used_builder, &sig_analysis, &actx, &clean_ctx](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Parallel destruction of these sharded structures
+		sig_analysis.clear(ctx);
+
+		for (int i : ctx.item_range(actx.mod->cells_size())) {
+			RTLIL::Cell *cell = actx.mod->cell_at(i);
+			for (const auto &[port, sig] : cell->connections_) {
+				SigSpec spec = actx.assign_map(sig);
+				if (spec != sig)
+					deferred.update_connections.insert(ctx, {cell, port, spec});
+				add_spec(raw_conn_builder, ctx, spec);
+				add_spec(conn_builder, ctx, spec);
+				if (!clean_ctx.ct_all.cell_output(cell->type, port))
+					add_spec(used_builder, ctx, spec);
+			}
+		}
+		for (int i : ctx.item_range(actx.mod->wires_size())) {
+			RTLIL::Wire *wire = actx.mod->wire_at(i);
+			if (wire->port_id > 0) {
+				RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
+				add_spec(raw_conn_builder, ctx, sig);
+				actx.assign_map.apply(sig);
+				add_spec(conn_builder, ctx, sig);
+				if (!wire->port_input)
+					add_spec(used_builder, ctx, sig);
+			}
+			if (wire->get_bool_attribute(ID::keep)) {
+				RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
+				actx.assign_map.apply(sig);
+				add_spec(conn_builder, ctx, sig);
+			}
+			auto it = wire->attributes.find(ID::init);
+			if (it != wire->attributes.end())
+				deferred.initialized_wires.insert(ctx, wire);
+		}
+	});
+	actx.subpool.run([&conn_builder, &raw_conn_builder, &used_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		conn_builder.process(ctx);
+		raw_conn_builder.process(ctx);
+		used_builder.process(ctx);
+	});
+	used = {conn_builder, raw_conn_builder, used_builder};
+	return deferred;
+}
+
+struct WireDeleter {
+	pool<RTLIL::Wire*> del_wires_queue;
+	ShardedVector<RTLIL::Wire*> remove_init;
+	ShardedVector<std::pair<RTLIL::Wire*, RTLIL::Const>> set_init;
+	ShardedVector<RTLIL::SigSig> new_connections;
+	ShardedVector<RTLIL::Wire*> remove_unused_bits;
+	ShardedVector<std::pair<RTLIL::Wire*, RTLIL::Const>> set_unused_bits;
+	WireDeleter(UsedSignals& used_sig_analysis, bool purge_mode, const AnalysisContext& actx) :
+		remove_init(actx.subpool),
+		set_init(actx.subpool),
+		new_connections(actx.subpool),
+		remove_unused_bits(actx.subpool),
+		set_unused_bits(actx.subpool) {
+		ShardedVector<RTLIL::Wire*> del_wires(actx.subpool);
+		actx.subpool.run([&actx, purge_mode, &del_wires, &used_sig_analysis, this](const ParallelDispatchThreadPool::RunCtx &ctx) {
+			for (int i : ctx.item_range(actx.mod->wires_size())) {
+				RTLIL::Wire *wire = actx.mod->wire_at(i);
+				SigSpec s1 = SigSpec(wire), s2 = actx.assign_map(s1);
+				log_assert(GetSize(s1) == GetSize(s2));
+
+				Const initval;
+				bool has_init_attribute = wire->attributes.count(ID::init);
+				bool init_changed = false;
+				if (has_init_attribute)
+					initval = wire->attributes.at(ID::init);
+				if (GetSize(initval) != GetSize(wire)) {
+					initval.resize(GetSize(wire), State::Sx);
+					init_changed = true;
+				}
+
+				if (GetSize(wire) == 0) {
+					// delete zero-width wires, unless they are module ports
+					if (wire->port_id == 0)
+						goto delete_this_wire;
+				} else
+				if (wire->port_id != 0 || wire->get_bool_attribute(ID::keep) || !initval.is_fully_undef()) {
+					// do not delete anything with "keep" or module ports or initialized wires
+				} else
+				if (!purge_mode && check_public_name(wire->name) && (check_any(used_sig_analysis.raw_connected, s1) || check_any(used_sig_analysis.connected, s2) || s1 != s2)) {
+					// do not get rid of public names unless in purge mode or if the wire is entirely unused, not even aliased
+				} else
+				if (!check_any(used_sig_analysis.raw_connected, s1)) {
+					// delete wires that aren't used by anything directly
+					goto delete_this_wire;
+				}
+
+				if (0)
+				{
+			delete_this_wire:
+					del_wires.insert(ctx, wire);
+				}
+				else
+				{
+					RTLIL::SigSig new_conn;
+					for (int i = 0; i < GetSize(s1); i++)
+						if (s1[i] != s2[i]) {
+							if (s2[i] == State::Sx && (initval[i] == State::S0 || initval[i] == State::S1)) {
+								s2[i] = initval[i];
+								initval.set(i, State::Sx);
+								init_changed = true;
+							}
+							new_conn.first.append(s1[i]);
+							new_conn.second.append(s2[i]);
+						}
+					if (new_conn.first.size() > 0)
+						new_connections.insert(ctx, std::move(new_conn));
+					if (initval.is_fully_undef()) {
+						if (has_init_attribute)
+							remove_init.insert(ctx, wire);
+					} else
+						if (init_changed)
+							set_init.insert(ctx, {wire, std::move(initval)});
+
+					std::string unused_bits;
+					if (!check_all(used_sig_analysis.used, s2)) {
+						for (int i = 0; i < GetSize(s2); i++) {
+							if (s2[i].wire == NULL)
+								continue;
+							SigBit b = s2[i];
+							if (used_sig_analysis.used.find({b, b.hash_top().yield()}) == nullptr) {
+								if (!unused_bits.empty())
+									unused_bits += " ";
+								unused_bits += stringf("%d", i);
+							}
+						}
+					}
+					if (unused_bits.empty() || wire->port_id != 0) {
+						if (wire->attributes.count(ID::unused_bits))
+							remove_unused_bits.insert(ctx, wire);
+					} else {
+						RTLIL::Const unused_bits_const(std::move(unused_bits));
+						if (wire->attributes.count(ID::unused_bits)) {
+							RTLIL::Const &unused_bits_attr = wire->attributes.at(ID::unused_bits);
+							if (unused_bits_attr != unused_bits_const)
+								set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)});
+						} else
+							set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)});
+					}
+				}
+			}
+		});
+		del_wires_queue.insert(del_wires.begin(), del_wires.end());
+	}
+	// The constructor decides for each wire if we should be deleting it and queues attribute fix-ups;
+	// commit_changes applies the queued attribute fix-ups and new connections
+	void commit_changes(RTLIL::Module* mod) {
+		for (RTLIL::Wire *wire : remove_init)
+			wire->attributes.erase(ID::init);
+		for (auto &p : set_init)
+			p.first->attributes[ID::init] = std::move(p.second);
+		for (auto &conn : new_connections)
+			mod->connect(std::move(conn));
+		for (RTLIL::Wire *wire : remove_unused_bits)
+			wire->attributes.erase(ID::unused_bits);
+		for (auto &p : set_unused_bits)
+			p.first->attributes[ID::unused_bits] = std::move(p.second);
+	}
+	int delete_wires(RTLIL::Module* mod, bool verbose) {
+		int deleted_and_unreported = 0;
+		for (auto wire : del_wires_queue) {
+			if (ys_debug() || (check_public_name(wire->name) && verbose))
+				log_debug("  removing unused non-port wire %s.\n", wire->name);
+			else
+				deleted_and_unreported++;
+		}
+		mod->remove(del_wires_queue);
+		return deleted_and_unreported;
+	}
+};
+
+PRIVATE_NAMESPACE_END
+
+YOSYS_NAMESPACE_BEGIN
+
+bool rmunused_module_signals(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, CleanRunContext &clean_ctx)
+{
+	// Passing actx to function == function does parallel work
+	// Not passing module as function argument == function does not modify module
+	// TODO the above sentence is false due to constness laundering in wire_at / cell_at
+	AnalysisContext actx(module, subpool);
+	SigConnKinds conn_kinds(clean_ctx.flags.purge, actx, clean_ctx);
+
+	ExactCellWires cell_wires(conn_kinds.exact_cells, actx.assign_map);
+	// Collect sigmap representative candidates as built in parallel
+	// With parallel runs, this creates redundant candidates that have to resolve in update_assign_map
+	ShardedVector<RTLIL::SigBit> new_sigmap_rep_candidates = build_candidates(cell_wires, conn_kinds, actx);
+
+	// Cache all the cell_wires results that we might possibly need. This avoids the results
+	// changing when we update `assign_map` below.
+	cell_wires.cache_all(new_sigmap_rep_candidates);
+	// Modify assign_map to reflect the connectivity we want, not the one we have
+	// this changes representative selection in assign_map
+	update_assign_map(actx.assign_map, new_sigmap_rep_candidates, cell_wires, conn_kinds);
+
+	// Remove all wire-wire connections
+	module->connections_.clear();
+
+	UsedSignals used;
+	DeferredUpdates deferred = analyse_connectivity(used, conn_kinds, actx, clean_ctx);
+	fixup_cell_ports(deferred.update_connections);
+	// Rip up and re-apply init attributes onto representative wires with x-bits
+	// in place of unset init bits
+	consume_inits(deferred.initialized_wires, actx.assign_map).apply_normalised_inits();
+
+	WireDeleter deleter(used, clean_ctx.flags.purge, actx);
+
+	used.clear(subpool);
+
+	deleter.commit_changes(module);
+	int deleted_and_unreported = deleter.delete_wires(module, clean_ctx.flags.verbose);
+	int deleted_total = GetSize(deleter.del_wires_queue);
+
+	clean_ctx.stats.count_rm_wires += deleted_total;
+
+	if (clean_ctx.flags.verbose && deleted_and_unreported)
+		log_debug("  removed %d unused temporary wires.\n", deleted_and_unreported);
+
+	if (deleted_total)
+		module->design->scratchpad_set_bool("opt.did_something", true);
+
+	return deleted_total != 0;
+}
+
+YOSYS_NAMESPACE_END

From 32f5044eaf46e11b3b13c35644c7ad3cd1250c31 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 4 Mar 2026 02:18:48 +0000
Subject: [PATCH 27/29] Clarify "Not passing module as function argument"
 comment

This is correct in terms of intent; it's just not fully enforced due to const laundering.
---
 passes/opt/opt_clean/wires.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passes/opt/opt_clean/wires.cc b/passes/opt/opt_clean/wires.cc
index d7ca215c4..28c792936 100644
--- a/passes/opt/opt_clean/wires.cc
+++ b/passes/opt/opt_clean/wires.cc
@@ -537,7 +537,7 @@ bool rmunused_module_signals(RTLIL::Module *module, ParallelDispatchThreadPool::
 {
 	// Passing actx to function == function does parallel work
 	// Not passing module as function argument == function does not modify module
-	// TODO the above sentence is false due to constness laundering in wire_at / cell_at
+	// The above sentence signals intent; it's not enforced due to constness laundering in wire_at / cell_at
 	AnalysisContext actx(module, subpool);
 	SigConnKinds conn_kinds(clean_ctx.flags.purge, actx, clean_ctx);
 

From 8d8c05b338f640b0b9fbb5f4e2e917f8a5245f13 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 5 Mar 2026 02:20:58 +0000
Subject: [PATCH 28/29] Fix `OptCleanPass` usage of `CleanRunContext` to avoid
 constructing extra `KeepCache` and `ParallelDispatchThreadPool`

---
 passes/opt/opt_clean/opt_clean.cc | 25 ++++++++++++-------------
 passes/opt/opt_clean/opt_clean.h  | 23 ++++++-----------------
 2 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/passes/opt/opt_clean/opt_clean.cc b/passes/opt/opt_clean/opt_clean.cc
index 256257f6b..37d58861d 100644
--- a/passes/opt/opt_clean/opt_clean.cc
+++ b/passes/opt/opt_clean/opt_clean.cc
@@ -40,6 +40,7 @@ void rmunused_module(RTLIL::Module *module, bool rminit, CleanRunContext &clean_
 	if (rminit && rmunused_module_init(module, subpool, clean_ctx.flags.verbose))
 		while (rmunused_module_signals(module, subpool, clean_ctx)) { }
 }
+
 struct OptCleanPass : public Pass {
 	OptCleanPass() : Pass("opt_clean", "remove unused cells and wires") { }
 	void help() override
@@ -76,18 +77,12 @@ struct OptCleanPass : public Pass {
 		}
 		extra_args(args, argidx, design);
 
-		std::vector<RTLIL::Module*> selected_modules;
-		for (auto module : design->selected_whole_modules_warn())
-			if (!module->has_processes_warn())
-				selected_modules.push_back(module);
-		int thread_pool_size = 0;
-		for (RTLIL::Module *m : selected_modules)
-			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
-		ParallelDispatchThreadPool thread_pool(thread_pool_size);
-		KeepCache keep_cache(purge_mode, thread_pool, selected_modules);
-
 		{
-			CleanRunContext clean_ctx(design, {purge_mode, true});
+			std::vector<RTLIL::Module*> selected_modules;
+			for (auto module : design->selected_whole_modules_warn())
+				if (!module->has_processes_warn())
+					selected_modules.push_back(module);
+			CleanRunContext clean_ctx(design, selected_modules, {purge_mode, true});
 			for (auto module : selected_modules)
 				rmunused_module(module, true, clean_ctx);
 			clean_ctx.stats.log();
@@ -134,8 +129,12 @@ struct CleanPass : public Pass {
 		extra_args(args, argidx, design);
 
 		{
-			CleanRunContext clean_ctx(design, {purge_mode, ys_debug()});
-			for (auto module : clean_ctx.selected_modules)
+			std::vector<RTLIL::Module*> selected_modules;
+			for (auto module : design->selected_unboxed_whole_modules())
+				if (!module->has_processes())
+					selected_modules.push_back(module);
+			CleanRunContext clean_ctx(design, selected_modules, {purge_mode, ys_debug()});
+			for (auto module : selected_modules)
 				rmunused_module(module, true, clean_ctx);
 
 			log_suppressed();
diff --git a/passes/opt/opt_clean/opt_clean.h b/passes/opt/opt_clean/opt_clean.h
index 7599f25e8..affb217a1 100644
--- a/passes/opt/opt_clean/opt_clean.h
+++ b/passes/opt/opt_clean/opt_clean.h
@@ -55,33 +55,22 @@ struct CleanRunContext {
 	NewCellTypes ct_all;
 	RmStats stats;
 	ParallelDispatchThreadPool thread_pool;
-	std::vector<RTLIL::Module*> selected_modules;
 	KeepCache keep_cache;
 	Flags flags;
 
 private:
 	// Helper to compute thread pool size
-	static int compute_thread_pool_size(RTLIL::Design* design) {
+	static int compute_thread_pool_size(const std::vector<RTLIL::Module*>& selected_modules) {
 		int thread_pool_size = 0;
-		for (auto module : design->selected_unboxed_whole_modules())
-			if (!module->has_processes())
-				thread_pool_size = std::max(thread_pool_size,
-					ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+		for (auto module : selected_modules)
+			thread_pool_size = std::max(thread_pool_size,
+				ThreadPool::work_pool_size(0, module->cells_size(), 1000));
 		return thread_pool_size;
 	}
 
-	static std::vector<RTLIL::Module*> get_selected_modules(RTLIL::Design* design) {
-		std::vector<RTLIL::Module*> modules;
-		for (auto module : design->selected_unboxed_whole_modules())
-			if (!module->has_processes())
-				modules.push_back(module);
-		return modules;
-	}
-
 public:
-	CleanRunContext(RTLIL::Design* design, Flags f)
-		: thread_pool(compute_thread_pool_size(design)),
-		selected_modules(get_selected_modules(design)),
+	CleanRunContext(RTLIL::Design* design, const std::vector<RTLIL::Module*>& selected_modules, Flags f)
+		: thread_pool(compute_thread_pool_size(selected_modules)),
 		keep_cache(f.purge, thread_pool, selected_modules),
 		flags(f)
 	{

From 9c51ba1b097932bc0e18b7ca3148bd1fbc0f48a5 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 5 Mar 2026 20:04:49 +0000
Subject: [PATCH 29/29] Reduce opt_clean parallelism

---
 passes/opt/opt_clean/opt_clean.cc | 2 +-
 passes/opt/opt_clean/opt_clean.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/passes/opt/opt_clean/opt_clean.cc b/passes/opt/opt_clean/opt_clean.cc
index 37d58861d..87597d721 100644
--- a/passes/opt/opt_clean/opt_clean.cc
+++ b/passes/opt/opt_clean/opt_clean.cc
@@ -31,7 +31,7 @@ void rmunused_module(RTLIL::Module *module, bool rminit, CleanRunContext &clean_
 
 	// Use no more than one worker per thousand cells, rounded down, so
 	// we only start multithreading with at least 2000 cells.
-	int num_worker_threads = ThreadPool::work_pool_size(0, module->cells_size(), 1000);
+	int num_worker_threads = ThreadPool::work_pool_size(0, module->cells_size(), 10000);
 	ParallelDispatchThreadPool::Subpool subpool(clean_ctx.thread_pool, num_worker_threads);
 	remove_temporary_cells(module, subpool, clean_ctx.flags.verbose);
 	rmunused_module_cells(module, subpool, clean_ctx);
diff --git a/passes/opt/opt_clean/opt_clean.h b/passes/opt/opt_clean/opt_clean.h
index affb217a1..c48a8188a 100644
--- a/passes/opt/opt_clean/opt_clean.h
+++ b/passes/opt/opt_clean/opt_clean.h
@@ -64,7 +64,7 @@ private:
 		int thread_pool_size = 0;
 		for (auto module : selected_modules)
 			thread_pool_size = std::max(thread_pool_size,
-				ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+				ThreadPool::work_pool_size(0, module->cells_size(), 10000));
 		return thread_pool_size;
 	}