From 5b509e69f66fd749cd1776afe8e21d74f275c3ab Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Sat, 23 Dec 2023 15:37:27 -0800
Subject: [PATCH] vvp: concat: Defer update to end of the current simulation
 cycle

A concat typically has multiple inputs. Whenever one of the input values
changes, the output value of the concat is updated and propagated to its
downstream consumers.

When multiple inputs change within the same cycle, each input will cause an
update propagation. Depending on the overall structure of the design, this
can cause a significant performance penalty.

E.g. the following synthetic structure has an exponential runtime increase
based on the value of N.

```
reg [N-1:0] x;
generate for (genvar i = 0; i < N - 1; i++)
  assign x[i+1] = ^{x[i],x[i]};
endgenerate
```

To improve this, defer the value propagation of the concat to the end of the
current cycle. This allows multiple input updates to be included in a
single output update.

For the example in report #1052 this reduced the runtime from 2 minutes to
essentially 0.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
---
 vvp/concat.cc   |  28 +++++++++++--
 vvp/concat.h    | 102 ++++++++++++++++++++++++++++++++++++++++++++++++
 vvp/vpi_priv.cc |   1 +
 vvp/vvp_net.h   |  73 ----------------------------------
 4 files changed, 128 insertions(+), 76 deletions(-)
 create mode 100644 vvp/concat.h
diff --git a/vvp/concat.cc b/vvp/concat.cc
index 6379ccb9b..4d83b80f2 100644
--- a/vvp/concat.cc
+++ b/vvp/concat.cc
@@ -18,7 +18,7 @@
  */
 
 # include  "compile.h"
-# include  "vvp_net.h"
+# include  "concat.h"
 # include  <cstdlib>
 # include  <iostream>
 # include  <cassert>
@@ -64,7 +64,18 @@ void vvp_fun_concat::recv_vec4_pv(vvp_net_ptr_t port, const vvp_vector4_t&bit,
       if (!val_.set_vec(off, bit))
 	    return;
 
-      port.ptr()->send_vec4(val_, 0);
+      if (net_)
+	    return;
+
+      net_ = port.ptr();
+      schedule_functor(this);
+}
+
+void vvp_fun_concat::run_run()  // Performs the output update that recv_vec4_pv deferred via schedule_functor(this).
+{
+      vvp_net_t *ptr = net_;  // Net recorded by the first input change since the last run.
+      net_ = nullptr;  // Clear the pending marker before sending so recv_vec4_pv can schedule a new update.
+      ptr->send_vec4(val_, 0);  // One send carries every input change accumulated in val_.
 }
 
 void compile_concat(char*label, unsigned w0, unsigned w1,
@@ -135,7 +146,18 @@ void vvp_fun_concat8::recv_vec8_pv(vvp_net_ptr_t port, const vvp_vector8_t&bit,
 
       val_.set_vec(off, bit);
 
-      port.ptr()->send_vec8(val_);
+      if (net_)
+	    return;
+
+      net_ = port.ptr();
+      schedule_functor(this);
+}
+
+void vvp_fun_concat8::run_run()  // Performs the output update that recv_vec8_pv deferred via schedule_functor(this).
+{
+      vvp_net_t *ptr = net_;  // Net recorded by the first input change since the last run.
+      net_ = nullptr;  // Clear the pending marker before sending so recv_vec8_pv can schedule a new update.
+      ptr->send_vec8(val_);  // One send carries every input change accumulated in val_.
 }
 
 void compile_concat8(char*label, unsigned w0, unsigned w1,
diff --git a/vvp/concat.h b/vvp/concat.h
new file mode 100644
index 000000000..1edc9dc00
--- /dev/null
+++ b/vvp/concat.h
@@ -0,0 +1,102 @@
+#ifndef IVL_concat_H
+#define IVL_concat_H
+/*
+ * Copyright (c) 2004-2024 Stephen Williams (steve@icarus.com)
+ *
+ *    This source code is free software; you can redistribute it
+ *    and/or modify it in source code form under the terms of the GNU
+ *    General Public License as published by the Free Software
+ *    Foundation; either version 2 of the License, or (at your option)
+ *    any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+# include "vvp_net.h"
+
+/* vvp_fun_concat
+ * This node function creates vectors (vvp_vector4_t) from the
+ * concatenation of the inputs. The inputs (4) may be vector or
+ * vector8 objects, but they are reduced to vector4 values and
+ * strength information is lost.
+ *
+ * The expected widths of the input vectors must be given up front so
+ * that the positions in the output vector (and also the size of the
+ * output vector) can be worked out. The input vectors must match the
+ * expected width.
+ */
+class vvp_fun_concat  : public vvp_net_fun_t, protected vvp_gen_event_s  {
+
+    public:
+      vvp_fun_concat(unsigned w0, unsigned w1,
+		     unsigned w2, unsigned w3);
+      ~vvp_fun_concat();
+
+      void recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
+                     vvp_context_t context) final;
+
+      void recv_vec4_pv(vvp_net_ptr_t port, const vvp_vector4_t&bit,
+			unsigned base, unsigned vwid, vvp_context_t) final;
+    private:
+      void run_run() final;  // Sends val_ on net_ and clears net_; the deferred output update.
+
+      unsigned wid_[4];  // Presumably the expected input widths w0..w3 (constructor body not shown here).
+      vvp_vector4_t val_;  // Concatenated value; updated per input change, sent by run_run().
+      vvp_net_t *net_ = nullptr;  // Non-null while a deferred update is pending; the net run_run() sends from.
+};
+
+class vvp_fun_concat8  : public vvp_net_fun_t, protected vvp_gen_event_s {  // Concat node whose stored value is a vvp_vector8_t.
+
+    public:
+      vvp_fun_concat8(unsigned w0, unsigned w1,
+		     unsigned w2, unsigned w3);
+      ~vvp_fun_concat8();
+
+      void recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
+                     vvp_context_t context) final;
+      void recv_vec8(vvp_net_ptr_t port, const vvp_vector8_t&bit) final;
+
+      void recv_vec4_pv(vvp_net_ptr_t port, const vvp_vector4_t&bit,
+			unsigned base, unsigned vwid, vvp_context_t) final;
+      void recv_vec8_pv(vvp_net_ptr_t p, const vvp_vector8_t&bit,
+			unsigned base, unsigned vwid) final;
+
+    private:
+      void run_run() final;  // Sends val_ on net_ and clears net_; the deferred output update.
+
+      unsigned wid_[4];  // Presumably the expected input widths w0..w3 (constructor body not shown here).
+      vvp_vector8_t val_;  // Concatenated value; updated per input change, sent by run_run().
+      vvp_net_t *net_ = nullptr;  // Non-null while a deferred update is pending; the net run_run() sends from.
+};
+
+/* vvp_fun_repeat
+ * This node function creates vectors by repeating the input. The width
+ * is the width of the output vector, and the repeat is the number of
+ * times to repeat the input. The width of the input vector is
+ * implicit from these values.
+ */
+class vvp_fun_repeat  : public vvp_net_fun_t {
+
+    public:
+      vvp_fun_repeat(unsigned width, unsigned repeat);
+      ~vvp_fun_repeat();
+
+      void recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
+                     vvp_context_t context);
+      void recv_vec4_pv(vvp_net_ptr_t port, const vvp_vector4_t&bit,
+			unsigned int base, unsigned int vwid,
+			vvp_context_t context) final;
+
+    private:
+      unsigned wid_;  // Presumably the output vector width from the constructor -- TODO confirm.
+      unsigned rep_;  // Presumably the repeat count from the constructor -- TODO confirm.
+};
+
+#endif
diff --git a/vvp/vpi_priv.cc b/vvp/vpi_priv.cc
index eddc0d8fe..26d59ac55 100644
--- a/vvp/vpi_priv.cc
+++ b/vvp/vpi_priv.cc
@@ -23,6 +23,7 @@
 # include  "schedule.h"
 # include  "logic.h"
 # include  "part.h"
+# include  "concat.h"
 #ifdef CHECK_WITH_VALGRIND
 # include  "vvp_cleanup.h"
 #endif
diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h
index 925bc3deb..511154a7b 100644
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@@ -52,7 +52,6 @@ class  vvp_net_fun_t;
 class  vvp_net_fil_t;
 
 /* Core net function types. */
-class  vvp_fun_concat;
 class  vvp_fun_drive;
 class  vvp_fun_part;
 
@@ -1366,55 +1365,6 @@ class vvp_net_fil_t  : public vvp_vpi_callback {
 
 /* **** Some core net functions **** */
 
-/* vvp_fun_concat
- * This node function creates vectors (vvp_vector4_t) from the
- * concatenation of the inputs. The inputs (4) may be vector or
- * vector8 objects, but they are reduced to vector4 values and
- * strength information lost.
- *
- * The expected widths of the input vectors must be given up front so
- * that the positions in the output vector (and also the size of the
- * output vector) can be worked out. The input vectors must match the
- * expected width.
- */
-class vvp_fun_concat  : public vvp_net_fun_t {
-
-    public:
-      vvp_fun_concat(unsigned w0, unsigned w1,
-		     unsigned w2, unsigned w3);
-      ~vvp_fun_concat();
-
-      void recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
-                     vvp_context_t context) final;
-
-      void recv_vec4_pv(vvp_net_ptr_t port, const vvp_vector4_t&bit,
-			unsigned base, unsigned vwid, vvp_context_t) final;
-    private:
-      unsigned wid_[4];
-      vvp_vector4_t val_;
-};
-
-class vvp_fun_concat8  : public vvp_net_fun_t {
-
-    public:
-      vvp_fun_concat8(unsigned w0, unsigned w1,
-		     unsigned w2, unsigned w3);
-      ~vvp_fun_concat8();
-
-      void recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
-                     vvp_context_t context) final;
-      void recv_vec8(vvp_net_ptr_t port, const vvp_vector8_t&bit) final;
-
-      void recv_vec4_pv(vvp_net_ptr_t port, const vvp_vector4_t&bit,
-			unsigned base, unsigned vwid, vvp_context_t) final;
-      void recv_vec8_pv(vvp_net_ptr_t p, const vvp_vector8_t&bit,
-			unsigned base, unsigned vwid) final;
-
-    private:
-      unsigned wid_[4];
-      vvp_vector8_t val_;
-};
-
 /*
  * The vvp_fun_force class objects are net functors that use their input
  * to force the associated filter. They do not actually  have an
@@ -1436,29 +1386,6 @@ class vvp_fun_force : public vvp_net_fun_t {
       void recv_real(vvp_net_ptr_t port, double bit, vvp_context_t);
 };
 
-/* vvp_fun_repeat
- * This node function create vectors by repeating the input. The width
- * is the width of the output vector, and the repeat is the number of
- * times to repeat the input. The width of the input vector is
- * implicit from these values.
- */
-class vvp_fun_repeat  : public vvp_net_fun_t {
-
-    public:
-      vvp_fun_repeat(unsigned width, unsigned repeat);
-      ~vvp_fun_repeat();
-
-      void recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
-                     vvp_context_t context);
-      void recv_vec4_pv(vvp_net_ptr_t port, const vvp_vector4_t&bit,
-			unsigned int base, unsigned int vwid,
-			vvp_context_t context) final;
-
-    private:
-      unsigned wid_;
-      unsigned rep_;
-};
-
 /* vvp_fun_drive
  * This node function takes an input vvp_vector4_t as input, and
  * repeats that value as a vvp_vector8_t with all the bits given the