diff --git a/CMakeLists.txt b/CMakeLists.txt
index 961cc8e0..de2a90c7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,6 +80,7 @@ set(STA_SOURCE
   dcalc/LumpedCapDelayCalc.cc
   dcalc/NetCaps.cc
   dcalc/ParallelDelayCalc.cc
+  dcalc/PrimaDelayCalc.cc
   dcalc/UnitDelayCalc.cc
   
   graph/DelayFloat.cc
diff --git a/dcalc/DelayCalc.cc b/dcalc/DelayCalc.cc
index 61b3f007..b6ce0b50 100644
--- a/dcalc/DelayCalc.cc
+++ b/dcalc/DelayCalc.cc
@@ -24,6 +24,7 @@
 #include "ArnoldiDelayCalc.hh"
 #include "CcsCeffDelayCalc.hh"
 #include "CcsSimDelayCalc.hh"
+#include "PrimaDelayCalc.hh"
 
 namespace sta {
 
@@ -41,6 +42,7 @@ registerDelayCalcs()
   registerDelayCalc("arnoldi", makeArnoldiDelayCalc);
   registerDelayCalc("ccs_ceff", makeCcsCeffDelayCalc);
   registerDelayCalc("ccs_sim", makeCcsSimDelayCalc);
+  registerDelayCalc("prima", makePrimaDelayCalc);
 }
 
 void
diff --git a/dcalc/DelayCalc.i b/dcalc/DelayCalc.i
index d94565d4..efd55ae0 100644
--- a/dcalc/DelayCalc.i
+++ b/dcalc/DelayCalc.i
@@ -21,6 +21,7 @@
 #include "Sta.hh"
 #include "ArcDelayCalc.hh"
 #include "dcalc/ArcDcalcWaveforms.hh"
+#include "dcalc/PrimaDelayCalc.hh"
 
 %}
 
@@ -133,4 +134,16 @@ ccs_load_waveform(const Pin *in_pin,
     return Table1();
 }
 
+void
+set_prima_reduce_order(size_t order)
+{ // SWIG-exposed setter; does nothing unless the active calculator is a PrimaDelayCalc.
+  cmdLinkedNetwork();
+  Sta *sta = Sta::sta();
+  PrimaDelayCalc *dcalc = dynamic_cast<PrimaDelayCalc*>(sta->arcDelayCalc());
+  if (dcalc) {
+    dcalc->setPrimaReduceOrder(order);
+    sta->delaysInvalid(); // NOTE(review): presumably so delays are recomputed with the new order.
+  }
+}
+
 %} // inline
diff --git a/dcalc/PrimaDelayCalc.cc b/dcalc/PrimaDelayCalc.cc
new file mode 100644
index 00000000..b1d580a8
--- /dev/null
+++ b/dcalc/PrimaDelayCalc.cc
@@ -0,0 +1,1131 @@
+// OpenSTA, Static Timing Analyzer
+// Copyright (c) 2024, Parallax Software, Inc.
+// 
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+#include "PrimaDelayCalc.hh"
+
+#include <cmath> // abs
+
+#include "Debug.hh"
+#include "Units.hh"
+#include "TimingArc.hh"
+#include "Liberty.hh"
+#include "PortDirection.hh"
+#include "Network.hh"
+#include "Sdc.hh"
+#include "DcalcAnalysisPt.hh"
+#include "Corner.hh"
+#include "Graph.hh"
+#include "Parasitics.hh"
+#include "GraphDelayCalc.hh"
+#include "DmpDelayCalc.hh"
+
+#include <Eigen/LU>
+#include <Eigen/QR>
+
+namespace sta {
+
+using std::abs;
+using std::make_shared;
+using Eigen::SparseLU;
+using Eigen::HouseholderQR;
+using Eigen::ColPivHouseholderQR;
+
+// Lawrence Pillage - “Electronic Circuit & System Simulation Methods” 1998
+// McGraw-Hill, Inc. New York, NY.
+
+ArcDelayCalc *
+makePrimaDelayCalc(StaState *sta)
+{ // Factory; registerDelayCalcs() in DelayCalc.cc registers it under the name "prima".
+  return new PrimaDelayCalc(sta);
+}
+
+PrimaDelayCalc::PrimaDelayCalc(StaState *sta) :
+  DelayCalcBase(sta),
+  dcalc_args_(nullptr),
+  load_pin_index_map_(nullptr),
+  pin_node_map_(network_),
+  prima_order_(3), // Default reduced order; changed with setPrimaReduceOrder().
+  make_waveforms_(false),
+  waveform_drvr_pin_(nullptr),
+  waveform_load_pin_(nullptr),
+  watch_pin_values_(network_),
+  table_dcalc_(makeDmpCeffElmoreDelayCalc(sta)) // Fallback used by tableDcalcResults(); deleted in the destructor.
+{
+}
+
+PrimaDelayCalc::PrimaDelayCalc(const PrimaDelayCalc &dcalc) :
+  DelayCalcBase(dcalc),
+  dcalc_args_(nullptr),
+  load_pin_index_map_(nullptr),
+  pin_node_map_(network_),
+  prima_order_(dcalc.prima_order_), // The only setting taken from dcalc; other members start fresh.
+  make_waveforms_(false),
+  waveform_drvr_pin_(nullptr),
+  waveform_load_pin_(nullptr),
+  watch_pin_values_(network_),
+  table_dcalc_(makeDmpCeffElmoreDelayCalc(this)) // Each copy makes its own fallback calculator.
+{
+}
+
+PrimaDelayCalc::~PrimaDelayCalc()
+{
+  delete table_dcalc_; // Created by makeDmpCeffElmoreDelayCalc() in the constructors.
+}
+
+ArcDelayCalc *
+PrimaDelayCalc::copy()
+{ // Returns a new heap-allocated calculator built with the copy constructor.
+  return new PrimaDelayCalc(*this);
+}
+
+// Notify algorithm components.
+void
+PrimaDelayCalc::copyState(const StaState *sta)
+{
+  StaState::copyState(sta);
+  table_dcalc_->copyState(sta); // Pass the same state on to the fallback calculator.
+}
+
+Parasitic *
+PrimaDelayCalc::findParasitic(const Pin *drvr_pin,
+                              const RiseFall *rf,
+                              const DcalcAnalysisPt *dcalc_ap)
+{ // Returns the parasitic network for drvr_pin, else one made from the wireload, else nullptr.
+  const Corner *corner = dcalc_ap->corner();
+  const ParasiticAnalysisPt *parasitic_ap = dcalc_ap->parasiticAnalysisPt();
+  // set_load net has precedence over parasitics.
+  if (sdc_->drvrPinHasWireCap(drvr_pin, corner)
+      || network_->direction(drvr_pin)->isInternal())
+    return nullptr;
+  Parasitic *parasitic = parasitics_->findParasiticNetwork(drvr_pin, parasitic_ap);
+  if (parasitic)
+    return parasitic;
+  const MinMax *cnst_min_max = dcalc_ap->constraintMinMax();
+  Wireload *wireload = sdc_->wireload(cnst_min_max);
+  if (wireload) { // No parasitic network found: make one from the wireload.
+    float pin_cap, wire_cap, fanout;
+    bool has_wire_cap;
+    graph_delay_calc_->netCaps(drvr_pin, rf, dcalc_ap, pin_cap, wire_cap,
+                               fanout, has_wire_cap);
+    parasitic = parasitics_->makeWireloadNetwork(drvr_pin, wireload,
+                                                 fanout, cnst_min_max,
+                                                 parasitic_ap);
+  }
+  return parasitic;
+}
+
+Parasitic *
+PrimaDelayCalc::reduceParasitic(const Parasitic *,
+                                const Pin *,
+                                const RiseFall *,
+                                const DcalcAnalysisPt *)
+{ // All arguments are ignored.
+  return nullptr; // No reduced parasitic is produced by this calculator.
+}
+
+ArcDcalcResult
+PrimaDelayCalc::inputPortDelay(const Pin *drvr_pin,
+                               float in_slew,
+                               const RiseFall *rf,
+                               const Parasitic *parasitic,
+                               const LoadPinIndexMap &load_pin_index_map,
+                               const DcalcAnalysisPt *dcalc_ap)
+{ // Fills wire delay and load slew for each load of an input port using pi/elmore values.
+  ArcDcalcResult dcalc_result(load_pin_index_map.size());
+  LibertyLibrary *drvr_library = network_->defaultLibertyLibrary();
+
+  const Parasitic *pi_elmore = nullptr;
+  if (parasitic && parasitics_->isParasiticNetwork(parasitic)) {
+    const ParasiticAnalysisPt *ap = dcalc_ap->parasiticAnalysisPt();
+    pi_elmore = parasitics_->reduceToPiElmore(parasitic, drvr_pin, rf,
+                                              dcalc_ap->corner(),
+                                              dcalc_ap->constraintMinMax(), ap);
+  }
+
+  for (auto load_pin_index : load_pin_index_map) {
+    const Pin *load_pin = load_pin_index.first;
+    size_t load_idx = load_pin_index.second;
+    ArcDelay wire_delay = 0.0; // Defaults used when no elmore value exists for the load.
+    Slew load_slew = in_slew;
+    bool elmore_exists = false;
+    float elmore = 0.0;
+    if (pi_elmore)
+      parasitics_->findElmore(pi_elmore, load_pin, elmore, elmore_exists);
+    if (elmore_exists)
+      // Input port with no external driver.
+      dspfWireDelaySlew(load_pin, rf, in_slew, elmore, wire_delay, load_slew);
+    thresholdAdjust(load_pin, drvr_library, rf, wire_delay, load_slew);
+    dcalc_result.setWireDelay(load_idx, wire_delay);
+    dcalc_result.setLoadSlew(load_idx, load_slew);
+  }
+  return dcalc_result;
+}
+
+ArcDcalcResult
+PrimaDelayCalc::gateDelay(const Pin *drvr_pin,
+                          const TimingArc *arc,
+                          const Slew &in_slew,
+                          float load_cap,
+                          const Parasitic *parasitic,
+                          const LoadPinIndexMap &load_pin_index_map,
+                          const DcalcAnalysisPt *dcalc_ap)
+{ // Single-driver case: wraps the arguments in a one-element sequence for gateDelays().
+  ArcDcalcArgSeq dcalc_args;
+  dcalc_args.emplace_back(nullptr, drvr_pin, nullptr, arc, in_slew, parasitic);
+  ArcDcalcResultSeq dcalc_results = gateDelays(dcalc_args, load_cap,
+                                               load_pin_index_map, dcalc_ap);
+  return dcalc_results[0]; // Result for the only driver.
+}
+
+ArcDcalcResultSeq
+PrimaDelayCalc::gateDelays(ArcDcalcArgSeq &dcalc_args,
+                           float load_cap,
+                           const LoadPinIndexMap &load_pin_index_map,
+                           const DcalcAnalysisPt *dcalc_ap)
+{ // Simulates all drivers of a net together; falls back to table_dcalc_ if any driver cannot be simulated.
+  dcalc_args_ = &dcalc_args; // Arguments are saved in members for the simulation helpers.
+  load_pin_index_map_ = &load_pin_index_map;
+  drvr_count_ = dcalc_args.size();
+  load_cap_ = load_cap;
+  dcalc_ap_ = dcalc_ap;
+  drvr_rf_ = dcalc_args[0].arc()->toEdge()->asRiseFall(); // NOTE(review): assumes dcalc_args is not empty.
+  parasitic_network_ = dcalc_args[0].parasitic();
+
+  bool failed = false;
+  output_waveforms_.resize(drvr_count_);
+  for (size_t drvr_idx = 0; drvr_idx < drvr_count_; drvr_idx++) {
+    ArcDcalcArg &dcalc_arg = dcalc_args[drvr_idx];
+    GateTableModel *table_model = gateTableModel(dcalc_arg.arc(), dcalc_ap);
+    if (table_model && dcalc_arg.parasitic()) {
+      OutputWaveforms *output_waveforms = table_model->outputWaveforms();
+      Slew in_slew = dcalc_arg.inSlew();
+      if (output_waveforms
+          // Bounds check because extrapolating the waveforms gives unusable results.
+          && output_waveforms->slewAxis()->inBounds(in_slew)
+          && output_waveforms->capAxis()->inBounds(load_cap)) {
+        output_waveforms_[drvr_idx] = output_waveforms;
+        debugPrint(debug_, "ccs_dcalc", 1, "%s %s",
+                   dcalc_arg.drvrCell()->name(),
+                   drvr_rf_->asString());
+        LibertyCell *drvr_cell = dcalc_arg.drvrCell();
+        const LibertyLibrary *drvr_library = drvr_cell->libertyLibrary();
+        bool vdd_exists;
+        drvr_library->supplyVoltage("VDD", vdd_, vdd_exists);
+        if (!vdd_exists)
+          report_->error(1720, "VDD not defined in library %s", drvr_library->name());
+        drvr_cell->ensureVoltageWaveforms();
+        if (drvr_idx == 0) { // Measurement thresholds come from the first driver's library.
+          vth_ = drvr_library->outputThreshold(drvr_rf_) * vdd_;
+          vl_ = drvr_library->slewLowerThreshold(drvr_rf_) * vdd_;
+          vh_ = drvr_library->slewUpperThreshold(drvr_rf_) * vdd_;
+        }
+      }
+      else
+        failed = true; // No waveforms, or slew/cap outside the waveform axes.
+    }
+    else
+      failed = true; // No table model or no parasitic for this driver.
+  }
+
+  if (failed)
+    return tableDcalcResults(load_cap);
+  else {
+    simulate();
+    return dcalcResults();
+  }
+}
+
+ArcDcalcResultSeq
+PrimaDelayCalc::tableDcalcResults(float load_cap)
+{ // Fallback path: hands the saved arguments to table_dcalc_.
+  for (size_t drvr_idx = 0; drvr_idx < drvr_count_; drvr_idx++) {
+    ArcDcalcArg &dcalc_arg = (*dcalc_args_)[drvr_idx];
+    const Pin *drvr_pin = dcalc_arg.drvrPin();
+    const RiseFall *rf = dcalc_arg.drvrEdge();
+    const Parasitic *parasitic = table_dcalc_->findParasitic(drvr_pin, rf, dcalc_ap_);
+    dcalc_arg.setParasitic(parasitic); // Replace with the parasitic table_dcalc_ finds for itself.
+  }
+  return table_dcalc_->gateDelays(*dcalc_args_, load_cap, *load_pin_index_map_,
+                                  dcalc_ap_);
+}
+
+void
+PrimaDelayCalc::simulate()
+{ // Builds the circuit equations, then simulates the PRIMA-reduced or the full system.
+  initSim();
+  stampEqns();
+  setXinit();
+
+  if (prima_order_ > 0
+      && node_count_ > prima_order_) { // Reduce only when there are more nodes than the requested order.
+    primaReduce();
+    simulate1(Gq_, Cq_, Bq_, xq_init_, Vq_, prima_order_);
+  }
+  else {
+    MatrixXd x_to_v = MatrixXd::Identity(order_, order_); // Unreduced: the state vector is the voltage vector.
+    simulate1(G_, C_, B_, x_init_, x_to_v, order_);
+  }
+}
+
+void
+PrimaDelayCalc::simulate1(const MatrixSd &G,
+                 const MatrixSd &C,
+                 const MatrixXd &B,
+                 const VectorXd &x_init,
+                 const MatrixXd &x_to_v,
+                 const size_t order)
+{ // Time-steps the system G, C, B from x_init; x_to_v maps the state x to voltages v_.
+  VectorXd x(order);
+  VectorXd x_prev(order);
+  VectorXd x_prev2(order);
+
+  v_.resize(order);
+  v_prev_.resize(order);
+
+  initCeffIdrvr();
+  x = x_prev = x_prev2 = x_init;
+  v_ = v_prev_ = x_to_v * x_init;
+
+  time_step_ = time_step_prev_ = timeStep();
+  debugPrint(debug_, "ccs_dcalc", 1, "time step %s", delayAsString(time_step_, this));
+
+  MatrixSd A(order, order);
+  A = G + (2.0 / time_step_) * C;
+  A.makeCompressed();
+  SparseLU<MatrixSd> A_solver;
+  A_solver.compute(A); // Factored once; time_step_ is not changed inside the loop below.
+
+  // Initial time depends on ceff which impact delay, so use a sim step
+  // to find an initial ceff.
+  setPortCurrents();
+  VectorXd rhs(order);
+  rhs = B * u_ + (1.0 / time_step_) * C * (3.0 * x_prev - x_prev2);
+  x = A_solver.solve(rhs);
+  v_ = x_to_v * x;
+
+  updateCeffIdrvr();
+  x = x_prev = x_prev2 = x_init; // Discard the trial step and restart from the initial state.
+  v_ = v_prev_ = x_to_v * x_init;
+
+  // voltageTime is always for a rising waveform so 0.0v is initial voltage.
+  double time_begin = output_waveforms_[0]->voltageTime((*dcalc_args_)[0].inSlew(),
+                                                        ceff_[0], 0.0);
+  // Limit in case load voltage waveforms don't get to final value.
+  double time_end = time_begin + maxTime();
+
+  if (make_waveforms_)
+    recordWaveformStep(time_begin);
+
+  for (double time = time_begin; time <= time_end; time += time_step_) {
+    setPortCurrents();
+    rhs = B * u_ + (1.0 / time_step_) * C * (3.0 * x_prev - x_prev2);
+    x = A_solver.solve(rhs);
+    v_ = x_to_v * x;
+    
+    const ArcDcalcArg &dcalc_arg = (*dcalc_args_)[0];
+    debugPrint(debug_, "ccs_dcalc", 3, "%s ceff %s VDrvr %.4f Idrvr %s",
+               delayAsString(time, this),
+               units_->capacitanceUnit()->asString(ceff_[0]),
+               voltage(dcalc_arg.drvrPin()),
+               units_->currentUnit()->asString(drvr_current_[0], 4));
+
+    updateCeffIdrvr();
+
+    measureThresholds(time);
+    if (make_waveforms_)
+      recordWaveformStep(time);
+
+    if (loadWaveformsFinished())
+      break;
+
+    time_step_prev_ = time_step_;
+    x_prev2.swap(x_prev); // Shift history: x_prev2 <- x_prev, x_prev <- x.
+    x_prev.swap(x);
+    v_prev_.swap(v_);
+  }
+}
+
+double
+PrimaDelayCalc::timeStep()
+{ // Fixed step: .02 times driver resistance times load capacitance.
+  // Needs to use LTE for time step dynamic control.
+  return driverResistance() * load_cap_ * .02;
+}
+
+double
+PrimaDelayCalc::maxTime()
+{ // Simulation length limit: first driver's input slew plus 4 * (driver + summed wire resistance) * load cap.
+  return (*dcalc_args_)[0].inSlew()
+    + (driverResistance() + resistance_sum_) * load_cap_ * 4;
+}
+
+float
+PrimaDelayCalc::driverResistance()
+{ // Drive resistance of the first driver's liberty port for drvr_rf_ and the delay min/max.
+  const Pin *drvr_pin = (*dcalc_args_)[0].drvrPin();
+  LibertyPort *drvr_port = network_->libertyPort(drvr_pin);
+  const MinMax *min_max = dcalc_ap_->delayMinMax();
+  return drvr_port->driveResistance(drvr_rf_, min_max);
+}
+
+void
+PrimaDelayCalc::initSim()
+{ // Sizes per-driver state, indexes the parasitic nodes and sizes the matrices.
+  ceff_.resize(drvr_count_);
+  drvr_current_.resize(drvr_count_);
+
+  findNodeCount();
+  setOrder(); // Uses node_count_ set by findNodeCount().
+
+  // Reset waveform recording.
+  times_.clear();
+  drvr_voltages_.clear();
+  load_voltages_.clear();
+
+  measure_thresholds_ = {vl_, vth_, vh_};
+}
+
+void
+PrimaDelayCalc::findNodeCount()
+{ // Assigns an index to each non-external parasitic node and totals the capacitance on it.
+  includes_pin_caps_ = parasitics_->includesPinCaps(parasitic_network_);
+  coupling_cap_multiplier_ = 1.0;
+
+  node_capacitances_.clear();
+  pin_node_map_.clear();
+  node_index_map_.clear();
+
+  for (ParasiticNode *node : parasitics_->nodes(parasitic_network_)) {
+    if (!parasitics_->isExternal(node)) {
+      size_t node_idx = node_index_map_.size(); // Next free index.
+      node_index_map_[node] = node_idx;
+      const Pin *pin = parasitics_->pin(node);
+      if (pin) {
+        pin_node_map_[pin] = node_idx;
+        debugPrint(debug_, "ccs_dcalc", 1, "pin %s node %lu",
+                   network_->pathName(pin),
+                   node_idx);
+      }
+      double cap = parasitics_->nodeGndCap(node) + pinCapacitance(node);
+      node_capacitances_.push_back(cap);
+    }
+  }
+
+  for (ParasiticCapacitor *capacitor : parasitics_->capacitors(parasitic_network_)) {
+    float cap = parasitics_->value(capacitor) * coupling_cap_multiplier_;
+    ParasiticNode *node1 = parasitics_->node1(capacitor);
+    if (node1
+        && !parasitics_->isExternal(node1)) { // The capacitor value is added to each non-external end.
+      size_t node_idx = node_index_map_[node1];
+      node_capacitances_[node_idx] += cap;
+    }
+    ParasiticNode *node2 = parasitics_->node2(capacitor);
+    if (node2
+        && !parasitics_->isExternal(node2)) {
+      size_t node_idx = node_index_map_[node2];
+      node_capacitances_[node_idx] += cap;
+    }
+  }
+  node_count_ = node_index_map_.size();
+}
+
+float
+PrimaDelayCalc::pinCapacitance(ParasiticNode *node)
+{ // Capacitance contributed by the pin attached to node; 0.0 when the node has no pin.
+  const Pin *pin = parasitics_->pin(node);
+  float pin_cap = 0.0;
+  if (pin) {
+    Port *port = network_->port(pin);
+    LibertyPort *lib_port = network_->libertyPort(port);
+    const Corner *corner = dcalc_ap_->corner();
+    const MinMax *cnst_min_max = dcalc_ap_->constraintMinMax();
+    if (lib_port) {
+      if (!includes_pin_caps_) // Skip when the parasitic network already includes pin caps.
+        pin_cap = sdc_->pinCapacitance(pin, drvr_rf_, corner, cnst_min_max);
+    }
+    else if (network_->isTopLevelPort(pin))
+      pin_cap = sdc_->portExtCap(port, drvr_rf_, corner, cnst_min_max);
+  }
+  return pin_cap;
+}
+
+void
+PrimaDelayCalc::setOrder()
+{ // One port per driver; the unreduced system has one unknown per node plus one per port.
+  port_count_ = drvr_count_;
+  order_ = node_count_ + port_count_;
+
+  // Matrix resize also zeros.
+  G_.resize(order_, order_);
+  C_.resize(order_, order_);
+  B_.resize(order_, port_count_);
+  u_.resize(port_count_);
+  threshold_times_.resize(node_count_);
+}
+
+void
+PrimaDelayCalc::initCeffIdrvr()
+{ // Starts each driver with ceff equal to the load cap and the driver current at 0.0v.
+  for (size_t drvr_idx = 0; drvr_idx < drvr_count_; drvr_idx++) {
+    const ArcDcalcArg &dcalc_arg = (*dcalc_args_)[drvr_idx];
+    ceff_[drvr_idx] = load_cap_;
+    // As with voltageTime, the lookup is for a rising waveform so 0.0v is initial voltage.
+    drvr_current_[drvr_idx] =
+      output_waveforms_[drvr_idx]->voltageCurrent(dcalc_arg.inSlew(),
+                                                  ceff_[drvr_idx], 0.0);
+  }
+}
+
+void
+PrimaDelayCalc::setXinit()
+{ // Initial state: every node and port starts at 0.0 for a rising driver, vdd_ for a falling one.
+  x_init_.resize(order_);
+  double drvr_init_volt = (drvr_rf_ == RiseFall::rise()) ? 0.0 : vdd_;
+  // Init node voltages.
+  for (size_t n = 0; n < node_count_; n++)
+    x_init_[n] = drvr_init_volt;
+  // Init port voltages.
+  for (size_t p = 0; p < port_count_; p++)
+    x_init_[node_count_ + p] = drvr_init_volt;
+}
+
+void
+PrimaDelayCalc::stampEqns()
+{ // Fills G_, C_ and B_ from the node capacitances, the resistors and the driver ports.
+  G_.setZero();
+  C_.setZero();
+  B_.setZero();
+
+  for (size_t node_idx = 0; node_idx < node_count_; node_idx++)
+    stampCapacitance(node_idx, node_capacitances_[node_idx]);
+
+  resistance_sum_ = 0.0;
+  for (ParasiticResistor *resistor : parasitics_->resistors(parasitic_network_)) {
+    ParasiticNode *node1 = parasitics_->node1(resistor);
+    ParasiticNode *node2 = parasitics_->node2(resistor);
+    // One commercial extractor creates resistors with identical from/to nodes.
+    if (node1 != node2) {
+      size_t node_idx1 = node_index_map_[node1]; // NOTE(review): operator[] lookup; confirm resistor nodes are always indexed.
+      size_t node_idx2 = node_index_map_[node2];
+      float resistance = parasitics_->value(resistor);
+      stampConductance(node_idx1, node_idx2, 1.0 / resistance); // NOTE(review): a zero-ohm resistor would give an infinite conductance.
+      resistance_sum_ += resistance;
+    }
+  }
+
+  for (size_t drvr_idx = 0; drvr_idx < drvr_count_; drvr_idx++) {
+    const ArcDcalcArg &dcalc_arg = (*dcalc_args_)[drvr_idx];
+    size_t drvr_node = pin_node_map_[dcalc_arg.drvrPin()];
+    G_.coeffRef(node_count_ + drvr_idx, drvr_node) = 1.0; // Port row: ties the port unknown to the driver node voltage.
+    G_.coeffRef(node_count_ + drvr_idx, node_count_ + drvr_idx) = -1.0;
+    // special sauce - NOTE(review): presumably a small conductance to ground that keeps G non-singular; confirm.
+    G_.coeffRef(drvr_node, drvr_node) += 1e-6;
+    B_.coeffRef(drvr_node, drvr_idx) = 1.0; // The driver current u_[drvr_idx] is injected at the driver node.
+  }
+
+  if (debug_->check("ccs_dcalc", 3)) {
+    reportMatrix("G", G_);
+    reportMatrix("C", C_);
+    reportMatrix("B", B_);
+  }
+}
+
+// Grounded resistor: adds g to the diagonal entry of G_ for node n1.
+void
+PrimaDelayCalc::stampConductance(size_t n1,
+                                 double g)
+{
+  G_.coeffRef(n1, n1) += g;
+}
+
+// Floating resistor: adds g to both diagonals of G_ and subtracts it from both off-diagonals.
+void
+PrimaDelayCalc::stampConductance(size_t n1,
+                                 size_t n2,
+                                 double g)
+{
+  G_.coeffRef(n1, n1) += g;
+  G_.coeffRef(n2, n2) += g;
+  G_.coeffRef(n1, n2) -= g;
+  G_.coeffRef(n2, n1) -= g;
+}
+
+// Grounded capacitance: adds cap to the diagonal entry of C_ for node n1.
+void
+PrimaDelayCalc::stampCapacitance(size_t n1,
+                                 double cap)
+{
+  C_.coeffRef(n1, n1) += cap;
+}
+
+// Floating capacitance: adds cap to both diagonals of C_ and subtracts it from both off-diagonals.
+void
+PrimaDelayCalc::stampCapacitance(size_t n1,
+                                 size_t n2,
+                                 double cap)
+{
+  C_.coeffRef(n1, n1) += cap;
+  C_.coeffRef(n2, n2) += cap;
+  C_.coeffRef(n1, n2) -= cap;
+  C_.coeffRef(n2, n1) -= cap;
+}
+
+////////////////////////////////////////////////////////////////
+
+void
+PrimaDelayCalc::setPortCurrents()
+{ // Copies the current driver currents into the input vector u_.
+  for (size_t drvr_idx = 0; drvr_idx < drvr_count_; drvr_idx++)
+    u_[drvr_idx] = drvr_current_[drvr_idx];
+}
+
+void
+PrimaDelayCalc::updateCeffIdrvr()
+{ // After a step, re-estimates each driver's effective capacitance and looks up its next current.
+  for (size_t drvr_idx = 0; drvr_idx < drvr_count_; drvr_idx++) {
+    const ArcDcalcArg &dcalc_arg = (*dcalc_args_)[drvr_idx];
+    const Pin *drvr_pin = dcalc_arg.drvrPin();
+    size_t node_idx = pin_node_map_[drvr_pin];
+    double drvr_current = drvr_current_[drvr_idx];
+    double v1 = voltage(node_idx);
+    double v2 = voltagePrev(node_idx);
+    double dv = v1 - v2; // Driver node voltage change over the last step.
+    if (drvr_rf_ == RiseFall::rise()) {
+      if (drvr_current != 0.0
+          && dv > 0.0) {
+        double ceff = drvr_current * time_step_ / dv; // C = I * dt / dv.
+        if (output_waveforms_[drvr_idx]->capAxis()->inBounds(ceff)) // Keep the old ceff if the estimate is off the cap axis.
+          ceff_[drvr_idx] = ceff;
+      }
+      if (v1 > (vdd_ - .01))
+        // Whoa partner. Head'n for the weeds.
+        drvr_current_[drvr_idx] = 0.0;
+      else
+        drvr_current_[drvr_idx] =
+          output_waveforms_[drvr_idx]->voltageCurrent(dcalc_arg.inSlew(),
+                                                      ceff_[drvr_idx], v1);
+    }
+    else {
+      if (drvr_current != 0.0
+          && dv < 0.0) {
+        double ceff = drvr_current * time_step_ / dv;
+        if (output_waveforms_[drvr_idx]->capAxis()->inBounds(ceff))
+          ceff_[drvr_idx] = ceff;
+      }
+      if (v1 < 0.01) {
+        // Whoa partner. Head'n for the weeds.
+        drvr_current_[drvr_idx] = 0.0;
+      }
+      else
+        drvr_current_[drvr_idx] =
+          output_waveforms_[drvr_idx]->voltageCurrent(dcalc_arg.inSlew(),
+                                                      ceff_[drvr_idx],
+                                                      vdd_ - v1); // Falling: the lookup voltage is measured down from vdd_.
+    }
+  }
+}
+
+bool
+PrimaDelayCalc::loadWaveformsFinished()
+{ // True once every pin node is past halfway from vh_ to vdd_ (rise) or at/below half of vl_ (fall).
+  for (auto pin_node : pin_node_map_) {
+    size_t node_idx = pin_node.second;
+    double v = voltage(node_idx);
+    if ((drvr_rf_ == RiseFall::rise()
+         && v < vh_ + (vdd_ - vh_) * .5)
+        || (drvr_rf_ == RiseFall::fall()
+            && (v > vl_ * .5))) {
+      return false; // This pin has not settled yet.
+    }
+  }
+  return true;
+}
+
+////////////////////////////////////////////////////////////////
+
+void
+PrimaDelayCalc::measureThresholds(double time)
+{ // Records when each pin node crossed each measure threshold during the step ending at time.
+  for (auto pin_node1 : pin_node_map_) {
+    size_t node_idx = pin_node1.second;
+    double v = voltage(node_idx);
+    double v_prev = voltagePrev(node_idx);
+    for (size_t m = 0; m < measure_threshold_count_; m++) {
+      double th = measure_thresholds_[m];
+      if ((v_prev < th && th <= v)
+          || (v_prev > th && th >= v)) { // Crossed th in either direction during this step.
+        double t_cross = time - time_step_ + (th - v_prev) * time_step_ / (v - v_prev); // Linear interpolation within the step.
+        debugPrint(debug_, "ccs_measure", 1, "node %lu cross %.2f %s",
+                   node_idx,
+                   th,
+                   delayAsString(t_cross, this));
+        threshold_times_[node_idx][m] = t_cross; // A later crossing overwrites an earlier one.
+      }
+    }
+  }
+}
+
+double
+PrimaDelayCalc::voltage(const Pin *pin)
+{ // Current voltage at the node mapped to pin.
+  size_t node_idx = pin_node_map_[pin]; // NOTE(review): operator[] lookup; confirm behavior for a pin not on this net.
+  return v_[node_idx];
+}
+
+double
+PrimaDelayCalc::voltage(size_t node_idx)
+{ // Current voltage at node_idx.
+  return v_[node_idx];
+}
+
+double
+PrimaDelayCalc::voltagePrev(size_t node_idx)
+{ // Voltage at node_idx from the previous time step.
+  return v_prev_[node_idx];
+}
+
+ArcDcalcResultSeq
+PrimaDelayCalc::dcalcResults()
+{ // Converts the recorded threshold crossing times into gate/wire delays and slews per driver.
+  ArcDcalcResultSeq dcalc_results(drvr_count_);
+  for (size_t drvr_idx = 0; drvr_idx < drvr_count_; drvr_idx++) {
+    ArcDcalcArg &dcalc_arg = (*dcalc_args_)[drvr_idx];
+    ArcDcalcResult &dcalc_result = dcalc_results[drvr_idx];
+    const Pin *drvr_pin = dcalc_arg.drvrPin();
+    const LibertyLibrary *drvr_library = dcalc_arg.drvrLibrary();
+    size_t drvr_node = pin_node_map_[drvr_pin];
+    ThresholdTimes &drvr_times = threshold_times_[drvr_node];
+    float ref_time = output_waveforms_[drvr_idx]->referenceTime(dcalc_arg.inSlew());
+    ArcDelay gate_delay = drvr_times[threshold_vth] - ref_time;
+    Slew drvr_slew = abs(drvr_times[threshold_vh] - drvr_times[threshold_vl]);
+    dcalc_result.setGateDelay(gate_delay);
+    dcalc_result.setDrvrSlew(drvr_slew);
+    debugPrint(debug_, "ccs_dcalc", 2,
+               "%s gate delay %s slew %s",
+               network_->pathName(drvr_pin),
+               delayAsString(gate_delay, this),
+               delayAsString(drvr_slew, this));
+
+    dcalc_result.setLoadCount(load_pin_index_map_->size());
+    for (auto load_pin_index : *load_pin_index_map_) {
+      const Pin *load_pin = load_pin_index.first;
+      size_t load_idx = load_pin_index.second;
+      size_t load_node = pin_node_map_[load_pin];
+      ThresholdTimes &wire_times = threshold_times_[load_node];
+      // Wire delay is measured from the driver's vth crossing (drvr_times above).
+      ArcDelay wire_delay = wire_times[threshold_vth] - drvr_times[threshold_vth];
+      Slew load_slew = abs(wire_times[threshold_vh] - wire_times[threshold_vl]);
+      debugPrint(debug_, "ccs_dcalc", 2,
+                 "load %s %s delay %s slew %s",
+                 network_->pathName(load_pin),
+                 drvr_rf_->asString(),
+                 delayAsString(wire_delay, this),
+                 delayAsString(load_slew, this));
+
+      thresholdAdjust(load_pin, drvr_library, drvr_rf_, wire_delay, load_slew);
+      dcalc_result.setWireDelay(load_idx, wire_delay);
+      dcalc_result.setLoadSlew(load_idx, load_slew);
+    }
+  }
+  return dcalc_results;
+}
+
+////////////////////////////////////////////////////////////////
+
+void
+PrimaDelayCalc::setPrimaReduceOrder(size_t order)
+{ // simulate() skips the reduction when order is 0 or not below the node count.
+  prima_order_ = order;
+}
+
+// This version fills in one column of the orthonormal matrix
+// at a time as in the Gram-Schmidt wikipedia algorithm.
+void
+PrimaDelayCalc::primaReduce()
+{
+  G_.makeCompressed();
+  // Step 3: solve G*R = B for R
+  SparseLU<MatrixSd> G_solver(G_);
+  if (G_solver.info() != Eigen::Success)
+    report_->error(1752, "G matrix is singular.");
+  MatrixXd R(order_, port_count_);
+  R = G_solver.solve(B_);
+
+  // Step 4
+  HouseholderQR<MatrixXd> R_solver(R);
+  MatrixXd Q = R_solver.householderQ();
+
+  // Vq is "X" in the prima paper (too many "x" variables in the paper).
+  Vq_.resize(order_, prima_order_);
+  // Vq = first port_count columns of Q. NOTE(review): needs port_count_ <= prima_order_; the loop below rewrites columns 1 and up.
+  Vq_.block(0, 0, order_, port_count_) = Q.block(0, 0, order_, port_count_);
+
+  // Step 6 - Arnoldi iteration
+  for (size_t k = 1; k < prima_order_; k++) {
+    VectorXd V = C_ * Vq_.col(k - 1);
+    Vq_.col(k) = G_solver.solve(V);
+
+    // Modified Gram-Schmidt orthonormalization
+    for (size_t j = 0; j < k; j++) {
+      double H = Vq_.col(j).transpose() * Vq_.col(k);
+      Vq_.col(k) = Vq_.col(k) - H * Vq_.col(j);
+    }
+    VectorXd Vq_k = Vq_.col(k);
+    HouseholderQR<MatrixXd> Vq_k_solver(Vq_k);
+    MatrixXd VqQ = Vq_k_solver.householderQ();
+    Vq_.col(k) = VqQ.col(0); // Column 0 of Q is used as the normalized column k.
+  }
+
+  // Step 8 - Matrix projection
+  MatrixSd Vqs = Vq_.sparseView();
+  Cq_ = Vqs.transpose() * C_ * Vqs;
+  Gq_ = Vqs.transpose() * G_ * Vqs;
+  Bq_ = Vqs.transpose() * B_;
+
+  // x = Vq * x~
+  // solve x_init = Vq * x~_init for x~_init
+  xq_init_ = Vq_.colPivHouseholderQr().solve(x_init_);
+
+  if (debug_->check("ccs_dcalc", 3)) {
+    reportMatrix("Vq", Vq_);
+    reportMatrix("G~", Gq_);
+    reportMatrix("C~", Cq_);
+    reportMatrix("B~", Bq_);
+  }
+}
+
+// This version fills in port_count columns of the orthonormal matrix
+// at a time as shown in the prima algorithm figure 4.
+void
+PrimaDelayCalc::primaReduce2()
+{
+  G_.makeCompressed();
+  // Step 3: solve G*R = B for R
+  SparseLU<MatrixSd> G_solver(G_); // NOTE(review): unlike primaReduce(), G_solver.info() is not checked here.
+  MatrixXd R(order_, port_count_);
+  R = G_solver.solve(B_);
+
+  // Step 4
+  HouseholderQR<MatrixXd> R_solver(R);
+  MatrixXd Q = R_solver.householderQ();
+
+  // Vq is "X" in the prima paper (too many "x" variables in the paper).
+  size_t n = ceil(prima_order_ / static_cast<double>(port_count_)); // Number of port_count_-wide column blocks.
+  MatrixXd Vq(order_, n * port_count_);
+  // Vq = first port_count columns of Q.
+  Vq.block(0, 0, order_, port_count_) = Q.block(0, 0, order_, port_count_);
+
+  // Step 6 - Arnoldi iteration
+  for (size_t k = 1; k < n; k++) {
+    MatrixXd V = C_ * Vq.block(0, (k - 1) * port_count_, order_, port_count_);
+    MatrixXd GV = G_solver.solve(V);
+    Vq.block(0, k * port_count_, order_, port_count_) = GV;
+
+    // Modified Gram-Schmidt orthonormalization
+    for (size_t j = 0; j < k; j++) {
+      MatrixXd H = Vq.block(0, j * port_count_, order_, port_count_).transpose()
+        * Vq.block(0, k * port_count_, order_, port_count_);
+      Vq.block(0, k * port_count_, order_, port_count_) =
+        Vq.block(0, k * port_count_, order_, port_count_) - Vq.block(0, j * port_count_, order_, port_count_) * H;
+    }
+    MatrixXd Vq_k = Vq.block(0, k * port_count_, order_, port_count_);
+    HouseholderQR<MatrixXd> Vq_k_solver(Vq_k);
+    MatrixXd VqQ = Vq_k_solver.householderQ();
+    Vq.block(0, k * port_count_, order_, port_count_) = 
+      VqQ.block(0, 0, order_, port_count_);
+  }
+  Vq_.resize(order_, prima_order_);
+  Vq_ = Vq.block(0, 0, order_, prima_order_); // Keep only the first prima_order_ columns.
+
+  // Step 8 - Matrix projection
+  MatrixSd Vqs = Vq_.sparseView();
+  Cq_ = Vqs.transpose() * C_ * Vqs;
+  Gq_ = Vqs.transpose() * G_ * Vqs;
+  Bq_ = Vqs.transpose() * B_;
+
+  // x = Vq * x~
+  // solve x_init = Vq * x~_init for x~_init
+  xq_init_ = Vq_.colPivHouseholderQr().solve(x_init_);
+
+  if (debug_->check("ccs_dcalc", 3)) {
+    reportMatrix("Vq", Vq_);
+    reportMatrix("G~", Gq_);
+    reportMatrix("C~", Cq_);
+    reportMatrix("B~", Bq_);
+  }
+}
+
+////////////////////////////////////////////////////////////////
+
+void
+PrimaDelayCalc::recordWaveformStep(double time)
+{ // Appends time and the current voltages of the waveform pins and watched pins.
+  times_.push_back(time);
+  if (waveform_drvr_pin_) {
+    double drvr_v = voltage(waveform_drvr_pin_);
+    drvr_voltages_.push_back(drvr_v);
+  }
+  if (waveform_load_pin_) {
+    double load_v = voltage(waveform_load_pin_);
+    load_voltages_.push_back(load_v);
+  }
+  for (auto &pin_wave : watch_pin_values_) {
+    const Pin *pin = pin_wave.first;
+    FloatSeq &waveform = pin_wave.second;
+    double pin_v = voltage(pin); // NOTE(review): confirm what voltage() yields for a watched pin not on this net.
+    waveform.push_back(pin_v);
+  }
+}
+
+////////////////////////////////////////////////////////////////
+
+string
+PrimaDelayCalc::reportGateDelay(const Pin *drvr_pin,
+                                const TimingArc *arc,
+                                const Slew &in_slew,
+                                float load_cap,
+                                const Parasitic *,
+                                const LoadPinIndexMap &,
+                                const DcalcAnalysisPt *dcalc_ap,
+                                int digits)
+{ // Delegates to the arc's gate timing model; the parasitic and load map arguments are ignored.
+  GateTimingModel *model = gateModel(arc, dcalc_ap);
+  if (model) {
+    float in_slew1 = delayAsFloat(in_slew);
+    return model->reportGateDelay(pinPvt(drvr_pin, dcalc_ap), in_slew1, load_cap,
+                                  false, digits);
+  }
+  return ""; // No gate model for this arc.
+}
+
+////////////////////////////////////////////////////////////////
+
+void
+PrimaDelayCalc::watchPin(const Pin *pin)
+{ // Starts (or restarts) waveform recording for pin.
+  watch_pin_values_[pin] = FloatSeq(); // Any values already recorded for pin are replaced by an empty sequence.
+  make_waveforms_ = true;
+}
+
+void
+PrimaDelayCalc::clearWatchPins()
+{ // Forgets all watched pins and turns waveform recording off.
+  watch_pin_values_.clear();
+  make_waveforms_ = false;
+}
+
+PinSeq
+PrimaDelayCalc::watchPins() const
+{
+  // Returns the pins registered with watchPin(), in map iteration order.
+  PinSeq pins;
+  // Bind by reference: copying the entry would also copy its FloatSeq waveform.
+  for (const auto &pin_values : watch_pin_values_)
+    pins.push_back(pin_values.first);
+  return pins;
+}
+
+Waveform
+PrimaDelayCalc::watchWaveform(const Pin *pin)
+{ // Builds a voltage-versus-time table from copies of times_ and the values recorded for pin.
+  FloatSeq &voltages = watch_pin_values_[pin]; // NOTE(review): operator[] lookup; confirm behavior for a pin that was never watched.
+  TableAxisPtr time_axis = make_shared<TableAxis>(TableAxisVariable::time,
+                                                  new FloatSeq(times_));
+  Table1 waveform(new FloatSeq(voltages), time_axis);
+  return waveform;
+}
+
+////////////////////////////////////////////////////////////////
+
+// Waveform accessors for swig/tcl.
+Table1
+PrimaDelayCalc::drvrWaveform(const Pin *in_pin,
+                             const RiseFall *in_rf,
+                             const Pin *drvr_pin,
+                             const RiseFall *drvr_rf,
+                             const Corner *corner,
+                             const MinMax *min_max)
+{ // Calls makeWaveforms() with no load pin, then returns drvr_voltages_ against times_.
+  makeWaveforms(in_pin, in_rf, drvr_pin, drvr_rf, nullptr, corner, min_max);
+  TableAxisPtr time_axis = make_shared<TableAxis>(TableAxisVariable::time,
+                                                  new FloatSeq(times_));
+  Table1 waveform(new FloatSeq(drvr_voltages_), time_axis); // The table is built from copies of the recorded vectors.
+  return waveform;
+}
+
+Table1
+PrimaDelayCalc::loadWaveform(const Pin *in_pin,
+                             const RiseFall *in_rf,
+                             const Pin *drvr_pin,
+                             const RiseFall *drvr_rf,
+                             const Pin *load_pin,
+                             const Corner *corner,
+                             const MinMax *min_max)
+{
+  // Redo the arc with recording on for load_pin, then copy out its samples.
+  makeWaveforms(in_pin, in_rf, drvr_pin, drvr_rf, load_pin, corner, min_max);
+  FloatSeq *time_values = new FloatSeq(times_);
+  TableAxisPtr axis = make_shared<TableAxis>(TableAxisVariable::time, time_values);
+  return Table1(new FloatSeq(load_voltages_), axis);
+}
+
+Table1
+PrimaDelayCalc::inputWaveform(const Pin *in_pin,
+                              const RiseFall *in_rf,
+                              const Corner *corner,
+                              const MinMax *min_max)
+{
+  LibertyPort *port = network_->libertyPort(in_pin);
+  // No liberty port, or no driver waveform for in_rf: return an empty table.
+  DriverWaveform *driver_waveform = port ? port->driverWaveform(in_rf) : nullptr;
+  if (driver_waveform == nullptr)
+    return Table1();
+  const Vertex *in_vertex = graph_->pinLoadVertex(in_pin);
+  DcalcAnalysisPt *dcalc_ap = corner->findDcalcAnalysisPt(min_max);
+  Slew in_slew = graph_->slew(in_vertex, in_rf, dcalc_ap->index());
+  LibertyLibrary *library = port->libertyLibrary();
+  float vdd = 0.0f;
+  bool vdd_exists = false;
+  library->supplyVoltage("VDD", vdd, vdd_exists);
+  if (!vdd_exists)
+    report_->error(1751, "VDD not defined in library %s", library->name());
+  Table1 in_waveform = driver_waveform->waveform(in_slew);
+  // Scale the waveform from 0:vdd.
+  FloatSeq *scaled_values = new FloatSeq;
+  for (float value : *in_waveform.values())
+    scaled_values->push_back(value * vdd);
+  return Table1(scaled_values, in_waveform.axis1ptr());
+}
+
+void
+PrimaDelayCalc::makeWaveforms(const Pin *in_pin,
+                              const RiseFall *in_rf,
+                              const Pin *drvr_pin,
+                              const RiseFall *drvr_rf,
+                              const Pin *load_pin,
+                              const Corner *corner,
+                              const MinMax *min_max)
+{
+  Edge *gate_edge;
+  const TimingArc *gate_arc;
+  graph_->gateEdgeArc(in_pin, in_rf, drvr_pin, drvr_rf, gate_edge, gate_arc);
+  if (gate_arc == nullptr)
+    return;
+  DcalcAnalysisPt *ap = corner->findDcalcAnalysisPt(min_max);
+  if (findParasitic(drvr_pin, drvr_rf, ap) == nullptr)
+    return;
+  // Arm recording for the requested pins, then redo the driver's delay calc.
+  waveform_load_pin_ = load_pin;
+  waveform_drvr_pin_ = drvr_pin;
+  make_waveforms_ = true;
+  Vertex *drvr_vertex = graph_->pinDrvrVertex(drvr_pin);
+  graph_delay_calc_->findDriverArcDelays(drvr_vertex, gate_edge, gate_arc, ap, this);
+  make_waveforms_ = false;  // Disarm recording again.
+  waveform_drvr_pin_ = nullptr;
+  waveform_load_pin_ = nullptr;
+}
+
+////////////////////////////////////////////////////////////////
+
+void
+PrimaDelayCalc::reportMatrix(const char *name,
+                             MatrixSd &matrix)
+{
+  report_->reportLine("%s", name);  // Heading line: the caller-supplied name.
+  reportMatrix(matrix);             // Rows are printed by the unnamed overload.
+}
+
+void
+PrimaDelayCalc::reportMatrix(const char *name,
+                             MatrixXd &matrix)
+{
+  report_->reportLine("%s", name);  // Heading line: the caller-supplied name.
+  reportMatrix(matrix);             // Rows are printed by the unnamed overload.
+}
+
+void
+PrimaDelayCalc::reportMatrix(const char *name,
+                             VectorXd &matrix)
+{
+  report_->reportLine("%s", name);  // Heading line: the caller-supplied name.
+  reportMatrix(matrix);             // Entries are printed by the unnamed overload.
+}
+
+void
+PrimaDelayCalc::reportVector(const char *name,
+                             vector<double> &matrix)
+{
+  report_->reportLine("%s", name);  // Heading line: the caller-supplied name.
+  reportVector(matrix);             // Entries are printed by the unnamed overload.
+}
+  
+// Print one "| ... |" line per matrix row, each entry formatted "%10.3e".
+void
+PrimaDelayCalc::reportMatrix(MatrixSd &matrix)
+{
+  const Index row_count = matrix.rows();
+  const Index col_count = matrix.cols();
+  for (Index row = 0; row < row_count; row++) {
+    string text("| ");
+    for (Index col = 0; col < col_count; col++)
+      text += stdstrPrint("%10.3e", matrix.coeff(row, col)) + " ";
+    text += "|";
+    report_->reportLineString(text);
+  }
+}
+
+// Print one "| ... |" line per matrix row, each entry formatted "%10.3e".
+void
+PrimaDelayCalc::reportMatrix(MatrixXd &matrix)
+{
+  const Index row_count = matrix.rows();
+  const Index col_count = matrix.cols();
+  for (Index row = 0; row < row_count; row++) {
+    string text("| ");
+    for (Index col = 0; col < col_count; col++)
+      text += stdstrPrint("%10.3e", matrix.coeff(row, col)) + " ";
+    text += "|";
+    report_->reportLineString(text);
+  }
+}
+
+// Print every entry of the vector on a single "| ... |" line,
+// each entry formatted "%10.3e" and followed by one space.
+void
+PrimaDelayCalc::reportMatrix(VectorXd &matrix)
+{
+  string text("| ");
+  const Index entry_count = matrix.rows();
+  for (Index idx = 0; idx < entry_count; idx++)
+    text += stdstrPrint("%10.3e", matrix.coeff(idx)) + " ";
+  text += "|";
+  report_->reportLineString(text);
+}
+
+// Print every value on a single "| ... |" line,
+// each value formatted "%10.3e" and followed by one space.
+void
+PrimaDelayCalc::reportVector(vector<double> &matrix)
+{
+  string text("| ");
+  // Range-for visits the elements in index order.
+  for (double value : matrix)
+    text += stdstrPrint("%10.3e", value) + " ";
+  text += "|";
+  report_->reportLineString(text);
+}
+
+} // namespace
diff --git a/dcalc/PrimaDelayCalc.hh b/dcalc/PrimaDelayCalc.hh
new file mode 100644
index 00000000..3cf2a779
--- /dev/null
+++ b/dcalc/PrimaDelayCalc.hh
@@ -0,0 +1,263 @@
+// OpenSTA, Static Timing Analyzer
+// Copyright (c) 2024, Parallax Software, Inc.
+// 
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+#pragma once
+
+#include <vector>
+#include <map>
+#include <Eigen/SparseCore>
+#include <Eigen/SparseLU>
+
+#include "Map.hh"
+#include "LumpedCapDelayCalc.hh"
+#include "ArcDcalcWaveforms.hh"
+
+namespace sta {
+
+class ArcDelayCalc;
+class StaState;
+class Corner;
+
+using std::vector;
+using std::array;
+using Eigen::MatrixXd;
+using Eigen::MatrixXcd;
+using Eigen::VectorXd;
+using Eigen::SparseMatrix;
+using Eigen::Index;
+using std::map;
+
+typedef Map<const Pin*, size_t, PinIdLess> PinNodeMap;          // Parasitic pin -> array index
+typedef Map<const ParasiticNode*, size_t> NodeIndexMap;         // Parasitic node -> array index
+typedef Map<const Pin*, size_t> PortIndexMap;
+typedef SparseMatrix<double> MatrixSd;                          // Sparse matrix of doubles
+typedef Map<const Pin*, VectorXd, PinIdLess> PinLMap;
+typedef map<const Pin*, FloatSeq, PinIdLess> WatchPinValuesMap; // Watched pin -> voltage samples
+
+typedef Table1 Waveform;
+
+ArcDelayCalc *
+makePrimaDelayCalc(StaState *sta);
+
+class PrimaDelayCalc : public DelayCalcBase,     // Delay calculator made by makePrimaDelayCalc().
+                       public ArcDcalcWaveforms
+{
+public:
+  PrimaDelayCalc(StaState *sta);
+  PrimaDelayCalc(const PrimaDelayCalc &dcalc);
+  ~PrimaDelayCalc();
+  ArcDelayCalc *copy() override;
+  void copyState(const StaState *sta) override;
+  void setPrimaReduceOrder(size_t order);  // Called by the tcl set_prima_reduce_order command.
+  Parasitic *findParasitic(const Pin *drvr_pin,
+                           const RiseFall *rf,
+                           const DcalcAnalysisPt *dcalc_ap) override;
+  Parasitic *reduceParasitic(const Parasitic *parasitic_network,
+                             const Pin *drvr_pin,
+                             const RiseFall *rf,
+                             const DcalcAnalysisPt *dcalc_ap) override;
+  ArcDcalcResult inputPortDelay(const Pin *drvr_pin,
+                                float in_slew,
+                                const RiseFall *rf,
+                                const Parasitic *parasitic,
+                                const LoadPinIndexMap &load_pin_index_map,
+                                const DcalcAnalysisPt *dcalc_ap) override;
+  ArcDcalcResult gateDelay(const Pin *drvr_pin,
+                           const TimingArc *arc,
+                           const Slew &in_slew,
+                           float load_cap,
+                           const Parasitic *parasitic,
+                           const LoadPinIndexMap &load_pin_index_map,
+                           const DcalcAnalysisPt *dcalc_ap) override;
+  ArcDcalcResultSeq gateDelays(ArcDcalcArgSeq &dcalc_args,
+                               float load_cap,
+                               const LoadPinIndexMap &load_pin_index_map,
+                               const DcalcAnalysisPt *dcalc_ap) override;
+  // The parasitic and load_pin_index_map arguments are ignored by the implementation.
+  string reportGateDelay(const Pin *drvr_pin,
+                         const TimingArc *arc,
+                         const Slew &in_slew,
+                         float load_cap,
+                         const Parasitic *parasitic,
+                         const LoadPinIndexMap &load_pin_index_map,
+                         const DcalcAnalysisPt *dcalc_ap,
+                         int digits) override;
+
+  // Record waveform for drvr/load pin.
+  void watchPin(const Pin *pin);           // Starts pin with an empty sample list.
+  void clearWatchPins();                   // Drops all watched pins and their samples.
+  PinSeq watchPins() const;                // Keys of watch_pin_values_.
+  Waveform watchWaveform(const Pin *pin);  // Samples for pin paired with times_.
+  // Waveform accessors for swig/tcl.
+  Waveform inputWaveform(const Pin *in_pin,
+                         const RiseFall *in_rf,
+                         const Corner *corner,
+                         const MinMax *min_max) override;
+  Waveform drvrWaveform(const Pin *in_pin,
+                        const RiseFall *in_rf,
+                        const Pin *drvr_pin,
+                        const RiseFall *drvr_rf,
+                        const Corner *corner,
+                        const MinMax *min_max) override;
+ Waveform loadWaveform(const Pin *in_pin,
+                       const RiseFall *in_rf,
+                       const Pin *drvr_pin,
+                       const RiseFall *drvr_rf,
+                       const Pin *load_pin,
+                       const Corner *corner,
+                       const MinMax *min_max) override;
+
+protected:
+  ArcDcalcResultSeq tableDcalcResults(float load_cap);
+  void simulate();
+  void simulate1(const MatrixSd &G,
+                 const MatrixSd &C,
+                 const MatrixXd &B,
+                 const VectorXd &x_init,
+                 const MatrixXd &x_to_v,
+                 const size_t order);
+  double maxTime();
+  double timeStep();
+  float driverResistance();
+  void updateCeffIdrvr();
+  void initSim();
+  void findLoads();
+  void findNodeCount();
+  void setOrder();
+  void initCeffIdrvr();
+  void setXinit();
+  void stampEqns();
+  void stampConductance(size_t n1,
+                        double g);
+  void stampConductance(size_t n1,
+                        size_t n2,
+                        double g);
+  void stampCapacitance(size_t n1,
+                        double cap);
+  void stampCapacitance(size_t n1,
+                        size_t n2,
+                        double cap);
+  float pinCapacitance(ParasiticNode *node);
+  void setPortCurrents();
+  void measureThresholds(double time);
+  double voltage(const Pin *pin);
+  double voltage(size_t node_idx);
+  double voltagePrev(size_t node_idx);
+  bool loadWaveformsFinished();
+  ArcDcalcResultSeq dcalcResults();
+
+  void recordWaveformStep(double time);
+  // Re-runs the driver arc delay calculation with waveform recording armed.
+  void makeWaveforms(const Pin *in_pin,
+                     const RiseFall *in_rf,
+                     const Pin *drvr_pin,
+                     const RiseFall *drvr_rf,
+                     const Pin *load_pin,
+                     const Corner *corner,
+                     const MinMax *min_max);
+  void primaReduce();
+  void primaReduce2();
+  // Print matrices/vectors through report_; the named forms print name first.
+  void reportMatrix(const char *name,
+                    MatrixSd &matrix);
+  void reportMatrix(const char *name,
+                    MatrixXd &matrix);
+  void reportMatrix(const char *name,
+                    VectorXd &matrix);
+  void reportVector(const char *name,
+                    vector<double> &matrix);
+  void reportMatrix(MatrixSd &matrix);
+  void reportMatrix(MatrixXd &matrix);
+  void reportMatrix(VectorXd &matrix);
+  void reportVector(vector<double> &matrix);
+
+  ArcDcalcArgSeq *dcalc_args_;
+  size_t drvr_count_;
+  float load_cap_;
+  const DcalcAnalysisPt *dcalc_ap_;
+  const Parasitic *parasitic_network_;
+  const RiseFall *drvr_rf_;
+  const LoadPinIndexMap *load_pin_index_map_;
+
+  PinNodeMap pin_node_map_;     // Parasitic pin -> array index
+  NodeIndexMap node_index_map_; // Parasitic node -> array index
+  vector<OutputWaveforms*> output_waveforms_;
+  double resistance_sum_;
+  
+  vector<double> node_capacitances_;
+  bool includes_pin_caps_;
+  float coupling_cap_multiplier_;
+  
+  size_t node_count_;           // Parasitic network node count
+  size_t port_count_;           // aka drvr_count_
+  size_t order_;                // node_count_ + port_count_
+
+  // MNA node eqns
+  // G*x(t) + C*x'(t) = B*u(t)
+  MatrixSd G_;
+  MatrixSd C_;
+  MatrixXd B_;
+  VectorXd x_init_;
+  VectorXd u_;
+
+  // Prima reduced MNA eqns
+  size_t prima_order_;
+  MatrixXd Vq_;
+  MatrixSd Gq_;
+  MatrixSd Cq_;
+  MatrixXd Bq_;
+  VectorXd xq_init_;
+
+  // Node voltages.
+  VectorXd v_;                  // voltage[node_idx]
+  VectorXd v_prev_;
+
+  // Indexed by driver index.
+  vector<double> ceff_;
+  vector<double> drvr_current_;
+
+  double time_step_;
+  double time_step_prev_;
+
+  // Waveform recording.
+  bool make_waveforms_;               // Set by watchPin()/makeWaveforms(); cleared by clearWatchPins()/makeWaveforms().
+  const Pin *waveform_drvr_pin_;      // Non-null for the duration of makeWaveforms().
+  const Pin *waveform_load_pin_;      // Load pin passed to makeWaveforms(); may be nullptr.
+  FloatSeq drvr_voltages_;            // Copied out by drvrWaveform().
+  FloatSeq load_voltages_;            // Copied out by loadWaveform().
+  WatchPinValuesMap watch_pin_values_; // Voltage samples per watched pin.
+  FloatSeq times_;                    // Time axis shared by the waveforms above.
+
+  float vdd_;
+  float vth_;
+  float vl_;
+  float vh_;
+
+  static constexpr size_t threshold_vl = 0;
+  static constexpr size_t threshold_vth = 1;
+  static constexpr size_t threshold_vh = 2;
+  static constexpr size_t measure_threshold_count_ = 3;
+  typedef array<double, measure_threshold_count_> ThresholdTimes;
+  // Vl Vth Vh
+  ThresholdTimes measure_thresholds_;
+  // Indexed by node number.
+  vector<ThresholdTimes> threshold_times_;
+
+  // Delay calculator to use when ccs waveforms are missing from liberty.
+  ArcDelayCalc *table_dcalc_;
+
+  using ArcDelayCalc::reduceParasitic;  // Keep the base class overloads visible.
+};
+
+} // namespace
diff --git a/doc/StaApi.txt b/doc/StaApi.txt
index 812db703..6e161f27 100644
--- a/doc/StaApi.txt
+++ b/doc/StaApi.txt
@@ -15,15 +15,15 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 The STA is built in C++ with heavy use of STL (Standard Template
-Libraries).  It also uses the zlib library to read compressed Verilog,
-SDF, SPF, and SPEF files.
+Libraries).  It also uses the zlib library to read compressed Liberty,
+Verilog, SDF, SPF, and SPEF files.
 
 The sub-directories of the STA code are:
 
 doc
   Documentation files.
 util
-  Basic utilities.  
+  Basic utilities.
 liberty
   Liberty timing library classes and file reader.
 network
@@ -33,7 +33,7 @@ verilog
 graph
   Timing graph built from network and library cell timing arcs.
 sdc
-  SDC timing constraint classes.   
+  SDC timing constraint classes.
 sdf
   SDF reader, writer and annotator.
 dcalc
@@ -42,7 +42,7 @@ search
   Search engine used to annotate the graph with arrival, required times
   and find timing check slacks.
 parasitics
-  Parasitics API,  Spef and Spf readers.
+  Parasitics API, Spef and Spf readers.
 app
   Interface between Tcl and STA (built with SWIG).
   Main program definition.
@@ -60,45 +60,45 @@ STA API
 -------
 
 Major components of the STA such as the network, timing graph, sdc,
-and search are implemented as separate classes.  The Sta class
+and search are implemented as separate classes. The Sta class
 contains an instance of each of these components.
 
 The Sta class defines the bulk of the externally visible API used by
 the Tcl interface, and coordinates operations that involve multiple
-components.  For example, when a false path command is entered into
+components. For example, when a false path command is entered into
 the Tcl command interpreter, the Sta passes the declaration on to the
 Sdc component and tells the Search component to invalidate all arrival
 and required times.
 
 Applications should call functions defined by the Sta class rather
-than functions defined by the components.  Calling functions defined
+than functions defined by the components. Calling functions defined
 by the components will get you in trouble unless you understand them
-in detail.  For example, telling the delay calculator to recompute the
-delays leaves the arrival times that depend on them wrong.  Always
+in detail. For example, telling the delay calculator to recompute the
+delays leaves the arrival times that depend on them wrong. Always
 remember that the Sta coordinates the components.
 
 In general, objects passed as arguments to Sta functions that are
 constructors become "owned" by the STA and should not be deleted by
-the caller.  For example, a set of pins passed into
+the caller. For example, a set of pins passed into
 Sta::makeExceptionFrom are used in the resulting object (rather than
-copied into another set).  On the other hand, strings passed as
+copied into another set). On the other hand, strings passed as
 arguments are copied by the Sta functions before they are retained in
 STA data structures.
 
 In many cases the major components contain pointers to other
-components.  The StaState class is a simple container for these
+components. The StaState class is a simple container for these
 components that makes initialization of pointers to the components
 easier.
 
 An STA with modified behavior can be built by defining classes derived
 from the component classes and overloading some of the member
-functions (which may have to be modified to be virtual).  Components
-are created by Sta::makeComponents().  The Sta::makeComponents()
+functions (which may have to be modified to be virtual). Components
+are created by Sta::makeComponents(). The Sta::makeComponents()
 function in turn calls each of the Sta::make<Component> component
-constructors.  These constructors can be overloaded by redefining them
-in a class derived from Sta.  Because the components refer to each
+constructors. These constructors can be overloaded by redefining them
+in a class derived from Sta. Because the components refer to each
 other, Sta::updateComponentsState() must be called to notify the
-components if any of them are changed after creation.  
+components if any of them are changed after creation. 
 
 The file liberty/LibertyExt.cc contains an example that shows how the
 liberty reader is replaced with a custom one on the Sta object.
@@ -121,19 +121,16 @@ Utilities
 ---------
 
 The most significant utilities are the Vector, Map and Set templated
-classes built on top the respective STL classes.  The main point of
+classes built on top the respective STL classes. The main point of
 these classes is to provide Java-like iterators that can be passed
-around as one object.  STL iterators require the container to be
-useful.  Iterators uniformly use the hasNext() function to test to see
+around as one object. STL iterators require the container to be
+useful. Iterators uniformly use the hasNext() function to test to see
 if there is another member and next() to access the next iteration
 member.
 
-Most printing is done in Tcl rather than the STA C++ code.  Some
-errors and warnings are reported by the STA in C++
-
 All printing done by the STA core is done using the Report class API.
 The report class supports output redirection to a file and logging to
-a file.  The Tcl interpreter prints to "channels" that are
+a file. The Tcl interpreter prints to "channels" that are
 encapsulated by functions in the the ReportTcl class. Printing inside
 the STA is directed to the Tcl channels so that it appears with the
 Tcl interpreter output.
@@ -142,21 +139,22 @@ Network
 -------
 
 The network API is the key to making the STA a timing engine that can
-be bolted onto another application.  This API allows the STA to
+be bolted onto another application. This API allows the STA to
 efficiently communicate with external network data structures without
 the overhead of making and maintaining a copying of it.
 
 The network API encapsulates both library and netlist accessors.
-Libraries are composed of cells that have ports the define connections
-to the cell.  Netlists are built out of cell instances, pins and nets.
+Libraries are composed of cells that have ports that define connections
+to the cell. Netlists are built out of cell instances, pins and nets.
 
 The ConcreteLibrary and ConcreteNetwork classes are used by the STA
-netlist readers.  These class definitions are to support a stand alone
-STA that does not depend on external netlist data structures.
+netlist readers (notably Verilog). These class definitions are to
+support a stand alone STA that does not depend on external netlist
+data structures.
 
 External network data structures are interfaced to the STA by casting
-pointers to network object across the interface.  The external objects
-do not have to be derived from STA network base classes.  The network
+pointers to network objects across the interface. The external objects
+do not have to be derived from STA network base classes. The network
 API functions are typically very thin functions that cast the STA
 network types to the external class types and call the corresponding
 external network database accessor.
@@ -165,40 +163,40 @@ Bus ports are expanded into ports for each bit in the bus, and
 iterators are provided for the expanded and unexpanded set of cell
 ports.
 
-Network instances are calls of cells in the design hierarchy.  Both
-hierarchcial and leaf instances are in the network.  Hierarchical
+Network instances are calls of cells in the design hierarchy. Both
+hierarchical and leaf instances are in the network. Hierarchical
 instances have children instances at the next lower hierarchy level.
-Leaf instances have liberty cells with timing model data.  At the top
+Leaf instances have liberty cells with timing model data. At the top
 of the hierarchy is a top level instance that has instances for the
-top level netlist.  If a cell has multiple instances the entire
-sub-tree of hierarchy is repeated in the network.  This "unfolded"
+top level netlist. If a cell has multiple instances the entire
+sub-tree of hierarchy is repeated in the network. This "unfolded"
 network representation allows optimization to specialize instances of
-a hierarchical block.  A "folded" network representation that has only
+a hierarchical block. A "folded" network representation that has only
 one sub-tree for each hierarchical block means that all copies must
 have identical sub-trees, preventing optimations that specialize the
 contents.
 
 Pins are a connection between an instance and a net corresponding to a
-port.  For bus ports each bit in the bus has a corresponding pin
+port. For bus ports each bit in the bus has a corresponding pin
 (library iterators can be used to find the pins that correspond to all
-of the bits in a bus).  Ports on the top level instance also have pins
+of the bits in a bus). Ports on the top level instance also have pins
 in the network that are the top level inputs and outputs.
 
-Nets connect together a group of pins.  Both hierarchical and leaf
-pins are on a net.  Nets can connect pins on multiple levels of
+Nets connect together a group of pins. Both hierarchical and leaf
+pins are on a net. Nets can connect pins on multiple levels of
 hierarchy.
 
 The network objects inside the STA are always pointers to instances of
-undefined class objects.  The implementation and definition of the
-network objects themselves is never visible inside the STA.  The
+undefined class objects. The implementation and definition of the
+network objects themselves is never visible inside the STA. The
 network API is implemented as an adapter that performs all operations
-on all network objects.  There is one network adapter instance used by
-all STA code.  For example, to find the cell of an instance
+on all network objects. There is one network adapter instance used by
+all STA code. For example, to find the cell of an instance
  
   Cell *cell = network->cell(instance);
   
 The network adapter returns iterators for looping over groups of
-network objects.  For example, the following code iterates over the
+network objects. For example, the following code iterates over the
 children of the top level instance.
 
   Instance *top_instance = network->topInstance();
@@ -211,13 +209,16 @@ children of the top level instance.
 
 An adapter to a network database is built by defining a class derived
 from the base class Network, or NetworkEdit if it supports incremental
-editing operations.  network/ConcreteNetwork.cc and oa/OaNetwork.cc
-are sample network adapters.
+editing operations. network/ConcreteNetwork.cc is an example of
+a network adapter that supports hierarchy. For an example of a network
+adapter for a flat DEF based netlist, see
+https://github.com/The-OpenROAD-Project/OpenROAD/blob/master/src/dbSta/include/db_sta/dbNetwork.hh,
+https://github.com/The-OpenROAD-Project/OpenROAD/blob/master/src/dbSta/src/dbNetwork.cc.
 
 A network adaptor to interface to an external network database must
 define the virtual functions of the Network class (about 45
-functions).  The external network objects do not have to use any STA
-network objects as base classes or even be C++ objects.  These network
+functions). The external network objects do not have to use any STA
+network objects as base classes or even be C++ objects. These network
 adapter functions should cast the network object pointers to the
 underlying network object.
 
@@ -231,8 +232,7 @@ functions to find corresponding liberty objects.
   virtual LibertyPort *libertyPort(Port *port) const;
 
 The NetworkLiberty class provides implementations of the first two
-functions for derived network classes.  The OaNetwork class shows an
-implentation that uses the NetworkLiberty class.
+functions for derived network classes.
 
 If the network adapter implements the NetworkEdit API the following
 TCL commands are supported:
@@ -253,13 +253,13 @@ Liberty
 -------
 
 The liberty timing library reader builds classes that are derived from
-the concrete library classes.  In addition to the library, cell and
+the concrete library classes. In addition to the library, cell and
 port classes, there are classes to represent timing arcs, timing
 models, wireload models, operating conditions, and scale factors for
 derating timing data.
 
 Timing arcs are grouped into sets of arcs between a pair of cell
-ports.  For example, a buffer has two timing arcs between the input
+ports. For example, a buffer has two timing arcs between the input
 and output; one for a rising output and another for a falling output.
 The timing arcs are:
 
@@ -274,7 +274,7 @@ Similarly, an inverter has two negative-unate timing arcs.
 
 On the other hand, a multiplexor, has a non-unate path from the select
 input to the output because a rise or fall change on the input can
-cause the output to either rise or fall.  There are four timing arcs
+cause the output to either rise or fall. There are four timing arcs
 in this arc set:
 
   S f -> Z r
@@ -283,19 +283,19 @@ in this arc set:
   S r -> Z f
 
 The liberty file reader can be customized to read attributes that are
-not used by the STA.  See liberty/LibertyExt.cc for an example.
+not used by the STA. See liberty/LibertyExt.cc for an example.
 
 Graph
 -----
 
 The timing graph is the central data structure used by the delay
-calculation and search algorithms.  It is annotated with timing arc
-delay values and slews (from SDF or a delay calculator).  A forward
+calculation and search algorithms. It is annotated with timing arc
+delay values and slews (from SDF or a delay calculator). A forward
 search annotates the graph with arrival times, and a backward search
 annotates required times.
 
-The graph is composed of vertices and edges.  Each pin in the design
-has a vertex.  Bidirect pins have two vertices, one for its use as an
+The graph is composed of vertices and edges. Each pin in the design
+has a vertex. Bidirect pins have two vertices, one for its use as an
 input and another for its use as an output.
 
 The Network adapter supplies functions to find and set the index
@@ -310,16 +310,16 @@ compared to storing the value in the pin structure.
 A pointer to the vertex used for a bidirectional pin driver is kept in
 a map owned by the Graph class.
 
-Edges in the graph connect vertices.  The pins connected together by a
-net have wire edges between the pin vertices.  Timing arc sets in the
+Edges in the graph connect vertices. The pins connected together by a
+net have wire edges between the pin vertices. Timing arc sets in the
 leaf instance timing models have corresponding edges in the graph
 between pins on the instance.
 
 The Graph class constructor option slew_tr_count is used to prevent
-the grpah from reserving memory to store slews.  Similarly, if the
+the graph from reserving memory to store slews. Similarly, if the
 have_arc_delays option is false no memory is reserved for storing arc
-delay values.  This is useful if an external delay calculator is used
-to annotate delays on the graph.  In this case the Graph functions
+delay values. This is useful if an external delay calculator is used
+to annotate delays on the graph. In this case the Graph functions
 arcDelay and wireDelay should be overloaded to return delay values
 stored outside of the STA.
 
@@ -347,93 +347,89 @@ Delay Calculation
 -----------------
 
 The graph is annotated with arc delay values and slews (also known as
-transition times) by the graph delay calculator and the SDF reader.
-The GraphDelayCalc class seeds slews and arrival times from SDC
-constraints and uses a breadth first search to visit each gate output
-pin.  The GraphDelayCalc then calls a timing arc delay calculator for
-each timing arc and annotates the graph arc delays and vertex slews.
+transition times) by the graph delay calculator or the SDF reader.
+The GraphDelayCalc class seeds slews from SDC constraints and uses a
+breadth first search to visit each gate output pin. The GraphDelayCalc
+then calls a timing arc delay calculator for each timing arc and
+annotates the graph arc delays and vertex slews.
 
 The delay calculator is architeched to support multiple delay
-calculation results.  Each result has an associated delay calculation
+calculation results. Each result has an associated delay calculation
 analysis point (class DcalcAnalysisPt) that specifies the operating
 conditions and parasitics used to find the delays.
 
 The ArcDelayCalc class defines the API used by the GraphDelayCalc to
 calculate the gate delay, driver slew, load delays and load slews
-driven by a timing arc.  The following delay calculation algorithms
+driven by a timing arc. The following delay calculation algorithms
 are defined in the dcalc directory:
 
- UnitDelayCalc - All gate delays are 1.  Wire delays are zero.
+ UnitDelayCalc - All gate delays are 1. Wire delays are zero.
 
  LumpedCapArcDelayCalc - Liberty table models using lumped capacitive
- load (RSPF pi model total capacitance).  Wire delays are zero.
-
- SimpleRCArcDelayCalc -  Liberty table models using lumped capacitive
- load (RSPF pi model total capacitance).  Wire delays are the RSPF 
- elmore delay.
+ load (RSPF pi model total capacitance). Wire delays are zero.
 
  DmpCeffElmoreDelayCalc - RSPF (Driver Pi model with elmore interconnect
- delays) delay calculator.  Liberty table models using effective capacitive
+ delays) delay calculator. Liberty table models using effective capacitive
  model as described in the following paper:
    "Performance Computation for Precharacterized CMOS Gates with RC Loads",
    Florentin Dartu, Noel Menezes and Lawrence Pileggi, IEEE Transactions
    on Computer-Aided Design of Integrated Circuits and Systems, Vol 15, No 5,
    May 1996.
  Wire delays are computed by applying the driver waveform to
- the RSPF dependent source and solving the RC network.  
+ the RSPF dependent source and solving the RC network.
 
  DmpCeffTwoPoleDelayCalc - Driver Pi model with two pole interconnect
  delays and effective capacitance as in DmpCeffElmoreDelayCalc.
 
 Other delay calculators can be interfaced by defining a class based on
 ArcDelayCalc and using the registerDelayCalc function to register it
-for the "set_delay_calculator" Tcl command.  The Sta::setArcDelayCalc
+for the "set_delay_calculator" Tcl command. The Sta::setArcDelayCalc
 function can be used to set the delay calculator at run time.
 
 Search
 ------
 
 A breadth first forward search is used to find arrival times at graph
-vertices.  Vertices are annotated with instances of the Event class to
-record signal arrival and required times.  As each vertex is visited
+vertices. Vertices are annotated with instances of the Event class to
+record signal arrival and required times. As each vertex is visited
 in the forward search its required time is found using If the vertex
 is constrained by setup or hold timing checks, min/max path delay
 exceptions or gated timing checks its required time is found from the
-SDC.  The slack is the difference between the vertex required time and
-arrival time.  If the vertex is constrained it is scheduled for a
+SDC. The slack is the difference between the vertex required time and
+arrival time. If the vertex is constrained it is scheduled for a
 breadth first backward search to propagate required times to the fanin
-vertices.  Separate events (and hence arrival and required times) are
+vertices. Separate events (and hence arrival and required times) are
 used for each clock edge and exception set that cause a vertex to
 change.
 
 Arrival, required and slack calculations are incremental using a level
-based "lazy evaluation" algorithm.  The first time arrival/required
+based "lazy evaluation" algorithm. The first time arrival/required
 times are found for a vertex the arrival/required times are propagated
-to/from the vertex's logic level.  After that no search is required
+to/from the vertex's logic level. After that no search is required
 for any vertex with a lower/higher logic level when the
 arrival/required time is requested.
 
 Clock arrival times are found before data arrival times by
-Search::findClkArrivals().  Clock arrival times now include insertion
-delay (source latency).
+Search::findClkArrivals(). Clock arrival times include insertion delay
+(source latency).
 
 When an incremental netlist change is made (for instance, changing the
 drive strengh of a gate with swap_cell), the STA incrementally updates
-delay calculation, arrival times, required times and slacks.  Because
-gate delay is only weakly dependent on slew (transition time), the
-effect of the change will diminish in gates downstream of the change.
-The STA uses a tolerance on the gate delays to determine when to stop
-propagating the change.  The tolerance is set using the
+delay calculation, arrival times, required times and slacks. Because
+gate delay is only weakly dependent on slew, the effect of the change
+will diminish in gates downstream of the change. The STA uses a
+tolerance on the gate delays to determine when to stop propagating the
+change. The tolerance is set using the
 Sta::setIncrementalDelayTolerance function.
 
   void Sta::setIncrementalDelayTolerance(float tol);
 
 The tolerance is a percentage (0.0:1.0) change in delay that causes
 downstream delays to be recomputed during incremental delay
-calculation.  The default value is 0.0 for maximum accuracy and
-slowest incremental speed.  The delay calculation will not recompute
+calculation. The default value is 0.0 for maximum accuracy and
+slowest incremental speed. The delay calculation will not recompute
 delays for downstream gates when the change in the gate delay is less
-than the tolerance.  Required times must be recomputed backward from
+than the tolerance. Required times must be recomputed backward from
 any gate delay changes, so increasing the tolerance can significantly
 reduce incremental timing run time.
 
@@ -441,21 +437,19 @@ Tcl Interface
 -------------
 
 The interface from Tcl to C++ is written in a SWIG (www.swig.org)
-interface description (tcl/StaTcl.i).  SWIG generates the interface
+interface description (tcl/StaTcl.i). SWIG generates the interface
 code from the description file.
 
-All user interface code is written in Tcl.  SDC argument parsing and
-checking is done with Tcl procedures that call a SWIG interface
-function.  All reporting commands are written in Tcl so they can be
-easily customized.
+All commands are written in Tcl. SDC argument parsing and checking is
+done with Tcl procedures that call a SWIG interface function.
 
 The Tcl 'sta' namespace is used to segregate internal STA functions
-from the global Tcl namespace.  All user visible STA and SDC commands
+from the global Tcl namespace. All user visible STA and SDC commands
 are exported to the global Tcl namespace.
 
 A lot of the internal STA state can be accessed from Tcl to make
-debugging a lot easier.  Some debugging commands require a namespace
-qualifier because they are not intended for casual users.  Some
+debugging easier. Some debugging commands require a namespace
+qualifier because they are not intended for casual users. Some
 examples are shown below.
 
   sta::report_arrival
@@ -473,21 +467,10 @@ examples are shown below.
   sta::network_leaf_pin_count
 
 Additionally, many of the STA network and graph objects themselvs are
-exposed to Tcl using SWIG.  These Tcl objects have methods for
-inspecting them.  Examples of how to use these methods can be found in
+exposed to Tcl using SWIG. These Tcl objects have methods for
+inspecting them. Examples of how to use these methods can be found in
 the tcl/Graph.tcl and tcl/Network.tcl files.
 
-Optional Components
--------------------
-Optional components that are not included in the standard distribution are:
-
- Edif netlist reader
- OpenAccess netlist and parasitics interface
- Verific netlist interface
- Budgeter
- Interface Logic Model (ILM) generation
- Arnoldi reduced order delay calculation
-
 Architecture alternatives for using the STA Engine
 --------------------------------------------------
 
@@ -496,15 +479,15 @@ an application.
 
 * STA with TCL application
 
-The simplest example is an application written in TCL.  The
-application calls STA commands and primitives defined in /tcl and
-tcl/StaTcl.i.  A stand-alone STA executable is built and a TCL file
-that defines the application is included as part of the STA by
-modifying app/Makefile.am to add the TCL file to app/TclInitVar.cc.
+The simplest example is an application written in TCL. The application
+calls STA commands and primitives defined in the SWIG C++/Tcl
+interface. A stand-alone STA executable is built and a TCL file that
+defines the application is included as part of the STA by modifying
+CMakeLists.txt to add the TCL file to app/TclInitVar.cc.
 
 The user calls STA commands to read design files (liberty, verilog,
 SDF, parasitics) to define and link the design. The user defines SDC
-commands or sources an SDC file.  The user calls the application's TCL
+commands or sources an SDC file. The user calls the application's TCL
 commands.
 
 A simple gate sizer is an example of an application that can be built
@@ -515,15 +498,15 @@ or insert buffers.
 * STA with C++ application
 
 The application is built by adding C++ files to the /app directory and
-modifying app/Makefile.am to include them in the executable. Interface
+modifying CMakeLists.txt to include them in the executable. Interface
 commands between C++ and TCL are put in a SWIG .i file in the /app
 directory and modifying app/StaApp.i to include them. TCL commands are
-added to the STA by modifying app/Makefile.am to add the application's
+added to the STA by modifying CMakeLists.txt to add the application's
 TCL files to TclInitVar.cc.
 
 The user calls STA commands to read design files (liberty, verilog,
 SDF, parasitics) to define and link the design. The user defines SDC
-commands or sources an SDC file.  The user calls the application's TCL
+commands or sources an SDC file. The user calls the application's TCL
 commands.
 
 * C++ application without native Network data structures linking STA libraries 
@@ -533,14 +516,14 @@ calls STA initialization functions like staMain() defined in
 app/StaMain.cc.
 
 The application must link and instanciate a TCL interpreter to read
-SDC commands like staMain().  The application can choose to expose the TCL
+SDC commands like staMain(). The application can choose to expose the TCL
 interpreter to the user or not. The STA depends on the following data
 that can be read by calling TCL commands or Sta class member functions.
 
 Liberty files that define the leaf cells used in the design. 
 Read using the read_liberty command or by calling Sta::readLibertyFile().
 
-Verilog files that define the netlist.  Read using the read_verilog
+Verilog files that define the netlist. Read using the read_verilog
 command or by calling readVerilogFile() (see verilog/Verilog.i
 read_verilog).
 
@@ -561,7 +544,7 @@ Sta::deleteInstance() to edit the network.
 
 The application defines a Network adapter (described above) so that
 the STA can use the native network data structures without duplicating
-them for the STA.  The application defines a class built on class Sta
+them in the STA. The application defines a class built on class Sta
 that defines the makeNetwork() member function to build an instance of
 the network adapter.
 
@@ -571,7 +554,7 @@ app/StaMain.cc. The application reads the netlist and builds network
 data structures that the STA accesses through the Network adapter.
 
 The application must link and instanciate a TCL interpreter to read
-SDC commands like staMain().  The application can choose to expose the TCL
+SDC commands like staMain(). The application can choose to expose the TCL
 interpreter to the user or not. The STA depends on the following data
 that can be read by calling TCL commands or Sta class member functions.
 
@@ -580,7 +563,7 @@ Read using the read_liberty command or by calling Sta::readLibertyFile.
 
 SDC commands to define timing constraints.
 Defined using SDC commands in the TCL interpreter, or sourced
-from a file using Tcl_Eval(sta::tclInterp()).
+from a file using sta::sourceTclFile.
 
 Parasitics used by delay calculation.
 Read using the read_parasitics command, Sta::readParasitics(), or
diff --git a/include/sta/Liberty.hh b/include/sta/Liberty.hh
index b9402f2c..fd8868ec 100644
--- a/include/sta/Liberty.hh
+++ b/include/sta/Liberty.hh
@@ -764,6 +764,8 @@ public:
   // Is the clock for timing checks.
   bool isCheckClk() const { return is_check_clk_; }
   void setIsCheckClk(bool is_clk);
+  bool isPad() const { return is_pad_; }
+  void setIsPad(bool is_pad);
   RiseFall *pulseClkTrigger() const { return pulse_clk_trigger_; }
   // Rise for high, fall for low.
   RiseFall *pulseClkSense() const { return pulse_clk_sense_; }
@@ -863,6 +865,7 @@ protected:
   bool level_shifter_data_:1;
   bool is_switch_:1;
   bool is_disabled_constraint_:1;
+  bool is_pad_:1;
 
 private:
   friend class LibertyLibrary;
diff --git a/include/sta/Parasitics.hh b/include/sta/Parasitics.hh
index 23e589a7..52c6caa3 100644
--- a/include/sta/Parasitics.hh
+++ b/include/sta/Parasitics.hh
@@ -269,6 +269,7 @@ public:
 protected:
   void makeWireloadNetworkWorst(Parasitic *parasitic,
 				const Pin *drvr_pin,
+                                const Net *net,
 				float wireload_cap,
 				float wireload_res,
 				float fanout);
diff --git a/liberty/Liberty.cc b/liberty/Liberty.cc
index 4208f3ee..9a6d6264 100644
--- a/liberty/Liberty.cc
+++ b/liberty/Liberty.cc
@@ -1996,7 +1996,8 @@ LibertyPort::LibertyPort(LibertyCell *cell,
   isolation_cell_enable_(false),
   level_shifter_data_(false),
   is_switch_(false),
-  is_disabled_constraint_(false)
+  is_disabled_constraint_(false),
+  is_pad_(false)
 {
   liberty_port_ = this;
   min_pulse_width_[RiseFall::riseIndex()] = 0.0;
@@ -2472,6 +2473,12 @@ LibertyPort::setIsDisabledConstraint(bool is_disabled)
   is_disabled_constraint_ = is_disabled;
 }
 
+void
+LibertyPort::setIsPad(bool is_pad)
+{
+  is_pad_ = is_pad;  // Flag returned by isPad().
+}
+
 LibertyPort *
 LibertyPort::cornerPort(const Corner *corner,
                         const MinMax *min_max)
diff --git a/liberty/LibertyReader.cc b/liberty/LibertyReader.cc
index a9dbced1..fb23fe3e 100644
--- a/liberty/LibertyReader.cc
+++ b/liberty/LibertyReader.cc
@@ -297,6 +297,7 @@ LibertyReader::defineVisitors()
   defineAttrVisitor("dont_use", &LibertyReader::visitDontUse);
   defineAttrVisitor("is_macro_cell", &LibertyReader::visitIsMacro);
   defineAttrVisitor("is_memory", &LibertyReader::visitIsMemory);
+  defineAttrVisitor("pad_cell", &LibertyReader::visitIsPadCell);
   defineAttrVisitor("is_pad", &LibertyReader::visitIsPad);
   defineAttrVisitor("is_clock_cell", &LibertyReader::visitIsClockCell);
   defineAttrVisitor("is_level_shifter", &LibertyReader::visitIsLevelShifter);
@@ -2887,13 +2888,13 @@ LibertyReader::visitIsMemory(LibertyAttr *attr)
 }
 
 void
-LibertyReader::visitIsPad(LibertyAttr *attr)
+LibertyReader::visitIsPadCell(LibertyAttr *attr)
 {
   if (cell_) {
-    bool is_pad, exists;
-    getAttrBool(attr, is_pad, exists);
+    bool pad_cell, exists;
+    getAttrBool(attr, pad_cell, exists);
     if (exists)
-      cell_->setIsPad(is_pad);
+      cell_->setIsPad(pad_cell);
   }
 }
 
@@ -3358,6 +3359,19 @@ LibertyReader::visitClock(LibertyAttr *attr)
   }
 }
 
+void
+LibertyReader::visitIsPad(LibertyAttr *attr)
+{
+  if (ports_ == nullptr)
+    return;
+  // Copy the boolean attribute onto every port in ports_ when it has a value.
+  bool pad_value, has_value;
+  getAttrBool(attr, pad_value, has_value);
+  if (has_value)
+    for (LibertyPort *pad_port : *ports_)
+      pad_port->setIsPad(pad_value);
+}
+
 void
 LibertyReader::visitCapacitance(LibertyAttr *attr)
 {
diff --git a/liberty/LibertyReaderPvt.hh b/liberty/LibertyReaderPvt.hh
index 5eee6cb1..5fc608a6 100644
--- a/liberty/LibertyReaderPvt.hh
+++ b/liberty/LibertyReaderPvt.hh
@@ -187,6 +187,7 @@ public:
   virtual void visitDontUse(LibertyAttr *attr);
   virtual void visitIsMacro(LibertyAttr *attr);
   virtual void visitIsMemory(LibertyAttr *attr);
+  virtual void visitIsPadCell(LibertyAttr *attr);
   virtual void visitIsPad(LibertyAttr *attr);
   virtual void visitIsClockCell(LibertyAttr *attr);
   virtual void visitIsLevelShifter(LibertyAttr *attr);
diff --git a/parasitics/Parasitics.cc b/parasitics/Parasitics.cc
index 5d8a10c5..0ab184bb 100644
--- a/parasitics/Parasitics.cc
+++ b/parasitics/Parasitics.cc
@@ -203,7 +203,7 @@ Parasitics::makeWireloadNetwork(const Pin *drvr_pin,
                                 const MinMax *min_max,
 				const ParasiticAnalysisPt *ap)
 {
-  Net *net = network_->net(drvr_pin);
+  const Net *net = findParasiticNet(drvr_pin);
   Parasitic *parasitic = makeParasiticNetwork(net, false, ap);
   const OperatingConditions *op_cond = sdc_->operatingConditions(min_max);
   float wireload_cap, wireload_res;
@@ -214,7 +214,7 @@ Parasitics::makeWireloadNetwork(const Pin *drvr_pin,
     tree = op_cond->wireloadTree();
   switch (tree) {
   case WireloadTree::worst_case:
-    makeWireloadNetworkWorst(parasitic, drvr_pin, wireload_cap, 
+    makeWireloadNetworkWorst(parasitic, drvr_pin, net, wireload_cap, 
 			     wireload_res, fanout);
     break;
   case WireloadTree::balanced:
@@ -235,12 +235,12 @@ Parasitics::makeWireloadNetwork(const Pin *drvr_pin,
 void
 Parasitics::makeWireloadNetworkWorst(Parasitic *parasitic,
 				     const Pin *drvr_pin,
+                                     const Net *net,
 				     float wireload_cap,
 				     float wireload_res,
 				     float /* fanout */)
 {
   ParasiticNode *drvr_node = ensureParasiticNode(parasitic, drvr_pin, network_);
-  Net *net = network_->net(drvr_pin);
   size_t resistor_index = 1;
   ParasiticNode *load_node = ensureParasiticNode(parasitic, net, 0, network_);
   makeResistor(parasitic, resistor_index++, wireload_res, drvr_node, load_node);
diff --git a/search/ClkSkew.cc b/search/ClkSkew.cc
index d26d380e..099bca62 100644
--- a/search/ClkSkew.cc
+++ b/search/ClkSkew.cc
@@ -19,8 +19,10 @@
 #include <cmath> // abs
 #include <algorithm>
 
+#include "Fuzzy.hh"
 #include "Report.hh"
 #include "Debug.hh"
+#include "DispatchQueue.hh"
 #include "Units.hh"
 #include "TimingArc.hh"
 #include "Liberty.hh"
@@ -53,17 +55,20 @@ public:
   void operator=(const ClkSkew &clk_skew);
   PathVertex *srcPath() { return &src_path_; }
   PathVertex *tgtPath() { return &tgt_path_; }
-  float srcLatency(StaState *sta);
-  float tgtLatency(StaState *sta);
-  float srcInternalClkLatency(StaState *sta);
-  float tgtInternalClkLatency(StaState *sta);
-  Crpr crpr(StaState *sta);
-  float uncertainty(StaState *sta);
+  float srcLatency(const StaState *sta);
+  float tgtLatency(const StaState *sta);
+  float srcInternalClkLatency(const StaState *sta);
+  float tgtInternalClkLatency(const StaState *sta);
+  Crpr crpr(const StaState *sta);
+  float uncertainty(const StaState *sta);
   float skew() const { return skew_; }
+  static bool srcTgtPathNameLess(ClkSkew &clk_skew1,
+                                 ClkSkew &clk_skew2,
+                                 const StaState *sta);
 
 private:
   float clkTreeDelay(PathVertex &clk_path,
-                     StaState *sta);
+                     const StaState *sta);
 
   PathVertex src_path_;
   PathVertex tgt_path_;
@@ -109,7 +114,7 @@ ClkSkew::operator=(const ClkSkew &clk_skew)
 }
 
 float
-ClkSkew::srcLatency(StaState *sta)
+ClkSkew::srcLatency(const StaState *sta)
 {
   Arrival src_arrival = src_path_.arrival(sta);
   return delayAsFloat(src_arrival) - src_path_.clkEdge(sta)->time()
@@ -117,13 +122,13 @@ ClkSkew::srcLatency(StaState *sta)
 }
 
 float
-ClkSkew::srcInternalClkLatency(StaState *sta)
+ClkSkew::srcInternalClkLatency(const StaState *sta)
 {
   return clkTreeDelay(src_path_, sta);
 }
 
 float
-ClkSkew::tgtLatency(StaState *sta)
+ClkSkew::tgtLatency(const StaState *sta)
 {
   Arrival tgt_arrival = tgt_path_.arrival(sta);
   return delayAsFloat(tgt_arrival) - tgt_path_.clkEdge(sta)->time()
@@ -131,14 +136,14 @@ ClkSkew::tgtLatency(StaState *sta)
 }
 
 float
-ClkSkew::tgtInternalClkLatency(StaState *sta)
+ClkSkew::tgtInternalClkLatency(const StaState *sta)
 {
   return clkTreeDelay(tgt_path_, sta);
 }
 
 float
 ClkSkew::clkTreeDelay(PathVertex &clk_path,
-                      StaState *sta)
+                      const StaState *sta)
 {
   if (include_internal_latency_) {
     const Vertex *vertex = clk_path.vertex(sta);
@@ -154,14 +159,14 @@ ClkSkew::clkTreeDelay(PathVertex &clk_path,
 }
 
 Crpr
-ClkSkew::crpr(StaState *sta)
+ClkSkew::crpr(const StaState *sta)
 {
   CheckCrpr *check_crpr = sta->search()->checkCrpr();
   return check_crpr->checkCrpr(&src_path_, &tgt_path_);
 }
 
 float
-ClkSkew::uncertainty(StaState *sta)
+ClkSkew::uncertainty(const StaState *sta)
 {
   TimingRole *check_role = (src_path_.minMax(sta) == SetupHold::max())
     ? TimingRole::setup()
@@ -171,10 +176,27 @@ ClkSkew::uncertainty(StaState *sta)
                                           check_role, sta);
 }
 
+// Strict ordering on (source pin path, target pin path) used to break skew ties.
+bool
+ClkSkew::srcTgtPathNameLess(ClkSkew &clk_skew1,
+                            ClkSkew &clk_skew2,
+                            const StaState *sta)
+{
+  Network *network = sta->sdcNetwork();
+  const char *src_path1 = network->pathName(clk_skew1.srcPath()->pin(sta));
+  const char *src_path2 = network->pathName(clk_skew2.srcPath()->pin(sta));
+  const char *tgt_path1 = network->pathName(clk_skew1.tgtPath()->pin(sta));
+  const char *tgt_path2 = network->pathName(clk_skew2.tgtPath()->pin(sta));
+  return stringLess(src_path1, src_path2)
+    || (stringEqual(src_path1, src_path2)
+        && stringLess(tgt_path1, tgt_path2));
+}
+
 ////////////////////////////////////////////////////////////////
 
 ClkSkews::ClkSkews(StaState *sta) :
-  StaState(sta)
+  StaState(sta),
+  fanout_pred_(sta)
 {
 }
 
@@ -275,54 +297,86 @@ ClkSkews::findClkSkew(ConstClockSeq &clks,
                       bool include_internal_latency)
 {	      
   ClkSkewMap skews;
+  corner_ = corner;
+  setup_hold_ = setup_hold;
+  include_internal_latency_ = include_internal_latency;
 
-  ConstClockSet clk_set;
+  clk_set_.clear();
   for (const Clock *clk : clks)
-    clk_set.insert(clk);
+    clk_set_.insert(clk);
 
-  for (Vertex *src_vertex : *graph_->regClkVertices()) {
-    if (hasClkPaths(src_vertex, clk_set)) {
-      VertexOutEdgeIterator edge_iter(src_vertex, graph_);
-      while (edge_iter.hasNext()) {
-	Edge *edge = edge_iter.next();
-	if (edge->role()->genericRole() == TimingRole::regClkToQ()) {
-	  Vertex *q_vertex = edge->to(graph_);
-	  const RiseFall *rf = edge->timingArcSet()->isRisingFallingEdge();
-	  const RiseFallBoth *src_rf = rf
-	    ? rf->asRiseFallBoth()
-	    : RiseFallBoth::riseFall();
-	  findClkSkewFrom(src_vertex, q_vertex, src_rf, clk_set,
-			  corner, setup_hold, include_internal_latency,
-                          skews);
-	}
+  if (thread_count_ > 1) {
+    std::vector<ClkSkewMap> partial_skews(thread_count_, skews);
+    for (Vertex *src_vertex : *graph_->regClkVertices()) {
+      if (hasClkPaths(src_vertex)) {
+        dispatch_queue_->dispatch([this, src_vertex, &partial_skews](int i) {
+          findClkSkewFrom(src_vertex, partial_skews[i]);
+        });
       }
     }
+    dispatch_queue_->finishTasks();
+
+    // Reduce skews from each register source.
+    for (size_t i = 0; i < partial_skews.size(); i++) {
+      for (auto clk_skew_itr : partial_skews[i]) {
+        const Clock *clk = clk_skew_itr.first;
+        auto partial_skew = clk_skew_itr.second;
+        auto ins = skews.insert(std::make_pair(clk, partial_skew));
+        if (!ins.second) {
+          ClkSkew &final_skew = ins.first->second;
+          if (abs(partial_skew.skew()) > abs(final_skew.skew())
+              || (fuzzyEqual(abs(partial_skew.skew()), abs(final_skew.skew()))
+                  // Break ties based on source/target path names.
+                  && ClkSkew::srcTgtPathNameLess(partial_skew, final_skew, this)))
+            final_skew = partial_skew;
+        }
+      }
+    }
+  }
+  else {
+    for (Vertex *src_vertex : *graph_->regClkVertices()) {
+      if (hasClkPaths(src_vertex))
+        findClkSkewFrom(src_vertex, skews);
+    }
   }
   return skews;
 }
 
 bool
-ClkSkews::hasClkPaths(Vertex *vertex,
-		      ConstClockSet &clks)
+ClkSkews::hasClkPaths(Vertex *vertex)
 {
   VertexPathIterator path_iter(vertex, this);
   while (path_iter.hasNext()) {
     PathVertex *path = path_iter.next();
     const Clock *path_clk = path->clock(this);
-    if (clks.find(path_clk) != clks.end())
+    if (clk_set_.find(path_clk) != clk_set_.end())
       return true;
   }
   return false;
 }
 
+void
+ClkSkews::findClkSkewFrom(Vertex *src_vertex,
+                          ClkSkewMap &skews)
+{
+  VertexOutEdgeIterator out_edges(src_vertex, graph_);
+  while (out_edges.hasNext()) {
+    Edge *out_edge = out_edges.next();
+    if (out_edge->role()->genericRole() != TimingRole::regClkToQ())
+      continue;
+    // A null rise/fall edge falls back to RiseFallBoth::riseFall().
+    const RiseFall *edge_rf = out_edge->timingArcSet()->isRisingFallingEdge();
+    const RiseFallBoth *src_rf = RiseFallBoth::riseFall();
+    if (edge_rf)
+      src_rf = edge_rf->asRiseFallBoth();
+    findClkSkewFrom(src_vertex, out_edge->to(graph_), src_rf, skews);
+  }
+}
+
 void
 ClkSkews::findClkSkewFrom(Vertex *src_vertex,
 			  Vertex *q_vertex,
 			  const RiseFallBoth *src_rf,
-                          ConstClockSet &clk_set,
-			  const Corner *corner,
-			  const SetupHold *setup_hold,
-                          bool include_internal_latency,
 			  ClkSkewMap &skews)
 {
   VertexSet endpoints = findFanout(q_vertex);
@@ -332,18 +386,16 @@ ClkSkews::findClkSkewFrom(Vertex *src_vertex,
       Edge *edge = edge_iter.next();
       TimingRole *role = edge->role();
       if (role->isTimingCheck()
-	  && ((setup_hold == SetupHold::max()
+	  && ((setup_hold_ == SetupHold::max()
 	       && role->genericRole() == TimingRole::setup())
-	      || ((setup_hold == SetupHold::min()
+	      || ((setup_hold_ == SetupHold::min()
 		   && role->genericRole() == TimingRole::hold())))) {
 	Vertex *tgt_vertex = edge->from(graph_);
 	const RiseFall *tgt_rf1 = edge->timingArcSet()->isRisingFallingEdge();
 	const RiseFallBoth *tgt_rf = tgt_rf1
 	  ? tgt_rf1->asRiseFallBoth()
 	  : RiseFallBoth::riseFall();
-	findClkSkew(src_vertex, src_rf, tgt_vertex, tgt_rf,
-		    clk_set, corner, setup_hold,
-                    include_internal_latency, skews);
+	findClkSkew(src_vertex, src_rf, tgt_vertex, tgt_rf, skews);
       }
     }
   }
@@ -354,24 +406,20 @@ ClkSkews::findClkSkew(Vertex *src_vertex,
 		      const RiseFallBoth *src_rf,
 		      Vertex *tgt_vertex,
 		      const RiseFallBoth *tgt_rf,
-                      ConstClockSet &clk_set,
-		      const Corner *corner,
-		      const SetupHold *setup_hold,
-                      bool include_internal_latency,
                       ClkSkewMap &skews)
 {
   Unit *time_unit = units_->timeUnit();
-  const SetupHold *tgt_min_max = setup_hold->opposite();
+  const SetupHold *tgt_min_max = setup_hold_->opposite();
   VertexPathIterator src_iter(src_vertex, this);
   while (src_iter.hasNext()) {
     PathVertex *src_path = src_iter.next();
     const Clock *src_clk = src_path->clock(this);
     if (src_rf->matches(src_path->transition(this))
-	&& src_path->minMax(this) == setup_hold
-	&& clk_set.find(src_clk) != clk_set.end()) {
+	&& src_path->minMax(this) == setup_hold_
+	&& clk_set_.find(src_clk) != clk_set_.end()) {
       Corner *src_corner = src_path->pathAnalysisPt(this)->corner();
-      if (corner == nullptr
-	  || src_corner == corner) {
+      if (corner_ == nullptr
+	  || src_corner == corner_) {
 	VertexPathIterator tgt_iter(tgt_vertex, this);
 	while (tgt_iter.hasNext()) {
 	  PathVertex *tgt_path = tgt_iter.next();
@@ -381,7 +429,7 @@ ClkSkews::findClkSkew(Vertex *src_vertex,
 	      && tgt_rf->matches(tgt_path->transition(this))
 	      && tgt_path->minMax(this) == tgt_min_max
 	      && tgt_path->pathAnalysisPt(this)->corner() == src_corner) {
-	    ClkSkew probe(src_path, tgt_path, include_internal_latency, this);
+	    ClkSkew probe(src_path, tgt_path, include_internal_latency_, this);
 	    ClkSkew &clk_skew = skews[src_clk];
 	    debugPrint(debug_, "clk_skew", 2,
                        "%s %s %s -> %s %s %s crpr = %s skew = %s",
@@ -403,12 +451,38 @@ ClkSkews::findClkSkew(Vertex *src_vertex,
   }
 }
 
-class FanOutSrchPred : public SearchPred1
+VertexSet
+ClkSkews::findFanout(Vertex *from)
 {
-public:
-  FanOutSrchPred(const StaState *sta);
-  virtual bool searchThru(Edge *edge);
-};
+  VertexSet endpoints(graph_);
+  UnorderedSet<Vertex*> visited;
+  findFanout1(from, visited, endpoints);
+  return endpoints;
+}
+
+void
+ClkSkews::findFanout1(Vertex *from,
+                      UnorderedSet<Vertex*> &visited,
+                      VertexSet &endpoints)
+{
+  visited.insert(from);
+  if (from->hasChecks())
+    endpoints.insert(from);
+  if (!fanout_pred_.searchFrom(from))
+    return;
+  VertexOutEdgeIterator fanout_edges(from, graph_);
+  while (fanout_edges.hasNext()) {
+    Edge *fanout_edge = fanout_edges.next();
+    Vertex *load = fanout_edge->to(graph_);
+    // Predicates run first so LOAD is only marked visited when searchable.
+    if (fanout_pred_.searchThru(fanout_edge)
+        && fanout_pred_.searchTo(load)
+        && visited.insert(load).second)
+      findFanout1(load, visited, endpoints);
+  }
+}
+
+////////////////////////////////////////////////////////////////
 
 FanOutSrchPred::FanOutSrchPred(const StaState *sta) :
   SearchPred1(sta)
@@ -426,25 +500,4 @@ FanOutSrchPred::searchThru(Edge *edge)
         || role == TimingRole::tristateDisable());
 }
 
-VertexSet
-ClkSkews::findFanout(Vertex *from)
-{
-  debugPrint(debug_, "fanout", 1, "%s",
-             from->name(sdc_network_));
-  VertexSet endpoints(graph_);
-  FanOutSrchPred pred(this);
-  BfsFwdIterator fanout_iter(BfsIndex::other, &pred, this);
-  fanout_iter.enqueue(from);
-  while (fanout_iter.hasNext()) {
-    Vertex *fanout = fanout_iter.next();
-    if (fanout->hasChecks()) {
-      debugPrint(debug_, "fanout", 1, " endpoint %s",
-                 fanout->name(sdc_network_));
-      endpoints.insert(fanout);
-    }
-    fanout_iter.enqueueAdjacentVertices(fanout);
-  }
-  return endpoints;
-}
-
 } // namespace
diff --git a/search/ClkSkew.hh b/search/ClkSkew.hh
index 288584ff..3c2e28a4 100644
--- a/search/ClkSkew.hh
+++ b/search/ClkSkew.hh
@@ -18,18 +18,28 @@
 
 #include <map>
 
+#include "UnorderedSet.hh"
 #include "SdcClass.hh"
 #include "StaState.hh"
 #include "Transition.hh"
 #include "SearchClass.hh"
+#include "SearchPred.hh"
 #include "PathVertex.hh"
 
 namespace sta {
 
 class ClkSkew;
+class SearchPred;
 
 typedef std::map<const Clock*, ClkSkew> ClkSkewMap;
 
+class FanOutSrchPred : public SearchPred1
+{
+public:
+  FanOutSrchPred(const StaState *sta);
+  virtual bool searchThru(Edge *edge);
+};
+
 // Find and report clock skews between source/target registers.
 class ClkSkews : public StaState
 {
@@ -51,28 +61,30 @@ protected:
                          const Corner *corner,
                          const SetupHold *setup_hold,
                          bool include_internal_latency);
-  bool hasClkPaths(Vertex *vertex,
-		   ConstClockSet &clks);
+  bool hasClkPaths(Vertex *vertex);
+  void findClkSkewFrom(Vertex *src_vertex,
+		       ClkSkewMap &skews);
   void findClkSkewFrom(Vertex *src_vertex,
 		       Vertex *q_vertex,
 		       const RiseFallBoth *src_rf,
-		       ConstClockSet &clk_set,
-		       const Corner *corner,
-		       const SetupHold *setup_hold,
-                       bool include_internal_latency,
 		       ClkSkewMap &skews);
   void findClkSkew(Vertex *src_vertex,
 		   const RiseFallBoth *src_rf,
 		   Vertex *tgt_vertex,
 		   const RiseFallBoth *tgt_rf,
-                   ConstClockSet &clk_set,
-		   const Corner *corner,
-		   const SetupHold *setup_hold,
-                   bool include_internal_latency,
 		   ClkSkewMap &skews);
   VertexSet findFanout(Vertex *from);
+  void findFanout1(Vertex *from,
+                   UnorderedSet<Vertex*> &visited,
+                   VertexSet &endpoints);
   void reportClkSkew(ClkSkew &clk_skew,
                      int digits);
+
+  ConstClockSet clk_set_;
+  const Corner *corner_;
+  const SetupHold *setup_hold_;
+  bool include_internal_latency_;
+  FanOutSrchPred fanout_pred_;
 };
-    
+
 } // namespace
diff --git a/search/Genclks.cc b/search/Genclks.cc
index c2a8a9a1..f93a5b1d 100644
--- a/search/Genclks.cc
+++ b/search/Genclks.cc
@@ -246,7 +246,8 @@ GenClkMasterSearchPred::searchThru(Edge *edge)
   const Sdc *sdc = sta_->sdc();
   TimingRole *role = edge->role();
   // Propagate clocks through constants.
-  return !(edge->isDisabledLoop()
+  return !(edge->role()->isTimingCheck()
+           || edge->isDisabledLoop()
 	   || edge->isDisabledConstraint()
 	   // Constants disable edge cond expression.
 	   || edge->isDisabledCond()
diff --git a/search/Sta.cc b/search/Sta.cc
index 36dfacab..fc1a6777 100644
--- a/search/Sta.cc
+++ b/search/Sta.cc
@@ -359,6 +359,8 @@ Sta::updateComponentsState()
   if (check_timing_)
     check_timing_->copyState(this);
   clk_network_->copyState(this);
+  if (clk_skews_)
+    clk_skews_->copyState(this);
   if (power_)
     power_->copyState(this);
 }
diff --git a/test/asap7_seq.lib.gz b/test/asap7_seq.lib.gz
new file mode 100644
index 00000000..0f03f277
Binary files /dev/null and b/test/asap7_seq.lib.gz differ
diff --git a/test/asap7_simple.lib.gz b/test/asap7_simple.lib.gz
new file mode 100644
index 00000000..b401a565
Binary files /dev/null and b/test/asap7_simple.lib.gz differ
diff --git a/test/prima3.ok b/test/prima3.ok
new file mode 100644
index 00000000..cb6d66b9
--- /dev/null
+++ b/test/prima3.ok
@@ -0,0 +1,51 @@
+Warning: asap7_simple.lib.gz line 71510, when attribute inside table model.
+Warning: asap7_simple.lib.gz line 71986, when attribute inside table model.
+Warning: asap7_simple.lib.gz line 72462, when attribute inside table model.
+Warning: asap7_simple.lib.gz line 72938, when attribute inside table model.
+Warning: asap7_simple.lib.gz line 73414, when attribute inside table model.
+Warning: asap7_simple.lib.gz line 74830, when attribute inside table model.
+Warning: asap7_simple.lib.gz line 71029, timing group from output port.
+Warning: asap7_simple.lib.gz line 71505, timing group from output port.
+Warning: asap7_simple.lib.gz line 71981, timing group from output port.
+Warning: asap7_simple.lib.gz line 72457, timing group from output port.
+Warning: asap7_simple.lib.gz line 72933, timing group from output port.
+Warning: asap7_simple.lib.gz line 73409, timing group from output port.
+Warning: asap7_simple.lib.gz line 73885, timing group from output port.
+Warning: asap7_simple.lib.gz line 82276, when attribute inside table model.
+Warning: asap7_simple.lib.gz line 83692, when attribute inside table model.
+Warning: asap7_simple.lib.gz line 81795, timing group from output port.
+Warning: asap7_simple.lib.gz line 82271, timing group from output port.
+Warning: asap7_simple.lib.gz line 82747, timing group from output port.
+Startpoint: r2 (rising edge-triggered flip-flop clocked by clk)
+Endpoint: r3 (rising edge-triggered flip-flop clocked by clk)
+Path Group: clk
+Path Type: max
+
+   Slew   Delay    Time   Description
+----------------------------------------------------------------
+   0.00    0.00    0.00   clock clk (rise edge)
+           0.00    0.00   clock source latency
+  10.00    0.00    0.00 ^ clk2 (in)
+  48.15   12.04   12.04 ^ r2/CLK (DFFHQx4_ASAP7_75t_R)
+  38.97   90.82  102.86 ^ r2/Q (DFFHQx4_ASAP7_75t_R)
+  59.28   16.50  119.36 ^ u1/A (BUFx2_ASAP7_75t_R)
+  70.25   51.69  171.05 ^ u1/Y (BUFx2_ASAP7_75t_R)
+  83.74   18.32  189.37 ^ u2/B (AND2x2_ASAP7_75t_R)
+  72.19   60.76  250.13 ^ u2/Y (AND2x2_ASAP7_75t_R)
+  85.61   18.34  268.46 ^ r3/D (DFFHQx4_ASAP7_75t_R)
+                 268.46   data arrival time
+
+   0.00  500.00  500.00   clock clk (rise edge)
+           0.00  500.00   clock source latency
+  10.00    0.00  500.00 ^ clk3 (in)
+  47.52   11.84  511.84 ^ r3/CLK (DFFHQx4_ASAP7_75t_R)
+           0.00  511.84   clock reconvergence pessimism
+         -14.89  496.95   library setup time
+                 496.95   data required time
+----------------------------------------------------------------
+                 496.95   data required time
+                -268.46   data arrival time
+----------------------------------------------------------------
+                 228.48   slack (MET)
+
+
diff --git a/test/prima3.tcl b/test/prima3.tcl
new file mode 100644
index 00000000..c04dc722
--- /dev/null
+++ b/test/prima3.tcl
@@ -0,0 +1,13 @@
+# Regression for the "prima" delay calculator: reg1 design, ASAP7 libraries, SPEF parasitics.
+read_liberty asap7_invbuf.lib.gz
+read_liberty asap7_seq.lib.gz
+read_liberty asap7_simple.lib.gz
+read_verilog reg1_asap7.v
+link_design top
+create_clock -name clk -period 500 {clk1 clk2 clk3}
+set_input_delay -clock clk 1 {in1 in2}
+set_input_transition 10 {in1 in2 clk1 clk2 clk3}
+set_propagated_clock {clk1 clk2 clk3}
+read_spef reg1_asap7.spef
+sta::set_delay_calculator prima
+report_checks -fields {input_pins slew} -format full_clock
diff --git a/test/reg1_asap7.spef b/test/reg1_asap7.spef
new file mode 100644
index 00000000..14bc4d6b
--- /dev/null
+++ b/test/reg1_asap7.spef
@@ -0,0 +1,135 @@
+*SPEF "IEEE 1481-1998"
+*DESIGN "reg1"
+*DATE "Fri Nov 20 13:23:00 2002"
+*VENDOR "Parallax Software, Inc"
+*PROGRAM "Handjob"
+*VERSION "1.0.1c"
+*DESIGN_FLOW "MISSING_NETS"
+*DIVIDER /
+*DELIMITER :
+*BUS_DELIMITER [ ]
+*T_UNIT 1.0 PS
+*C_UNIT 1.0 FF
+*R_UNIT 1.0 KOHM
+*L_UNIT 1.0 UH
+
+*POWER_NETS VDD
+*GROUND_NETS VSS
+
+*PORTS
+in1 I
+in2 I
+clk1 I
+clk2 I
+clk3 I
+out O
+
+*D_NET in1 13.4
+*CONN
+*P in1 I
+*I r1:D I *L .0036
+*CAP
+1 in1 6.7
+2 r1:D 6.7
+*RES
+3 in1 r1:D 2.42
+*END
+
+*D_NET in2 13.4
+*CONN
+*P in2 I
+*I r2:D I *L .0036
+*CAP
+1 in2 6.7
+2 r2:D 6.7
+*RES
+3 in2 r2:D 2.42
+*END
+
+*D_NET clk1 13.4
+*CONN
+*P clk1 I
+*I r1:CLK I *L .0036
+*CAP
+1 clk1 6.7
+2 r1:CLK 6.7
+*RES
+3 clk1 r1:CLK 2.42
+*END
+
+*D_NET clk2 13.4
+*CONN
+*P clk2 I
+*I r2:CLK I *L .0036
+*CAP
+1 clk2 6.7
+2 r2:CLK 6.7
+*RES
+3 clk2 r2:CLK 2.42
+*END
+
+*D_NET clk3 13.4
+*CONN
+*P clk3 I
+*I r3:CLK I *L .0036
+*CAP
+1 clk3 6.7
+2 r3:CLK 6.7
+*RES
+3 clk3 r3:CLK 2.42
+*END
+
+*D_NET r1q 13.4
+*CONN
+*I r1:Q O
+*I u2:A I *L .0086
+*CAP
+1 r1:Q 6.7
+2 u2:A 6.7
+*RES
+3 r1:Q u2:A 2.42
+*END
+
+*D_NET r2q 13.4
+*CONN
+*I r2:Q O
+*I u1:A I *L .0086
+*CAP
+1 r2:Q 6.7
+2 u1:A 6.7
+*RES
+3 r2:Q u1:A 2.42
+*END
+
+*D_NET u1z 13.4
+*CONN
+*I u1:Y O
+*I u2:B I *L .0086
+*CAP
+1 u1:Y 6.7
+2 u2:B 6.7
+*RES
+3 u1:Y u2:B 2.42
+*END
+
+*D_NET u2z 13.4
+*CONN
+*I u2:Y O
+*I r3:D I *L .0086
+*CAP
+1 u2:Y 6.7
+2 r3:D 6.7
+*RES
+3 u2:Y r3:D 2.42
+*END
+
+*D_NET out 13.4
+*CONN
+*I r3:Q O
+*P out O
+*CAP
+1 r3:Q 6.7
+2 out 6.7
+*RES
+3 r3:Q out 2.42
+*END
diff --git a/test/reg1_asap7.v b/test/reg1_asap7.v
new file mode 100644
index 00000000..5eb10b46
--- /dev/null
+++ b/test/reg1_asap7.v
@@ -0,0 +1,11 @@
+module top (in1, in2, clk1, clk2, clk3, out); // Three-register test netlist; each register has its own clock port.
+  input in1, in2, clk1, clk2, clk3;
+  output out;
+  wire r1q, r2q, u1z, u2z;
+
+  DFFHQx4_ASAP7_75t_R r1 (.D(in1), .CLK(clk1), .Q(r1q)); // registers in1 on clk1
+  DFFHQx4_ASAP7_75t_R r2 (.D(in2), .CLK(clk2), .Q(r2q)); // registers in2 on clk2
+  BUFx2_ASAP7_75t_R u1 (.A(r2q), .Y(u1z)); // buffers r2's output
+  AND2x2_ASAP7_75t_R u2 (.A(r1q), .B(u1z), .Y(u2z)); // combines r1's output with the buffered r2 output
+  DFFHQx4_ASAP7_75t_R r3 (.D(u2z), .CLK(clk3), .Q(out)); // registers the AND result on clk3 and drives out
+endmodule // top
diff --git a/test/regression_vars.tcl b/test/regression_vars.tcl
index a109a348..55482d62 100644
--- a/test/regression_vars.tcl
+++ b/test/regression_vars.tcl
@@ -123,6 +123,7 @@ record_example_tests {
 
 record_sta_tests {
   ccs_sim1
+  prima3
   verilog_attribute
 }
 
diff --git a/util/DispatchQueue.cc b/util/DispatchQueue.cc
index d59fdf18..e8a31e5f 100644
--- a/util/DispatchQueue.cc
+++ b/util/DispatchQueue.cc
@@ -35,6 +35,7 @@ DispatchQueue::terminateThreads()
       threads_[i].join();
     }
   }
+  quit_ = false;
 }
 
 void