From b952c58f9228954c55332009debd0fc9e7815274 Mon Sep 17 00:00:00 2001 From: James Cherry Date: Wed, 24 Oct 2018 14:22:21 -0700 Subject: [PATCH] arnoldi delay calculator --- dcalc/Arnoldi.hh | 83 +++ dcalc/Arnoldi.txt | 57 ++ dcalc/ArnoldiDelayCalc.cc | 1490 +++++++++++++++++++++++++++++++++++++ dcalc/ArnoldiDelayCalc.hh | 26 + dcalc/ArnoldiReduce.cc | 646 ++++++++++++++++ dcalc/ArnoldiReduce.hh | 116 +++ dcalc/DelayCalc.cc | 2 + dcalc/Makefile.am | 5 + 8 files changed, 2425 insertions(+) create mode 100644 dcalc/Arnoldi.hh create mode 100644 dcalc/Arnoldi.txt create mode 100644 dcalc/ArnoldiDelayCalc.cc create mode 100644 dcalc/ArnoldiDelayCalc.hh create mode 100644 dcalc/ArnoldiReduce.cc create mode 100644 dcalc/ArnoldiReduce.hh diff --git a/dcalc/Arnoldi.hh b/dcalc/Arnoldi.hh new file mode 100644 index 00000000..00b1b696 --- /dev/null +++ b/dcalc/Arnoldi.hh @@ -0,0 +1,83 @@ +// OpenSTA, Static Timing Analyzer +// Copyright (c) 2018, Parallax Software, Inc. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +// (c) 2018 Nefelus, Inc. +// +// Author: W. Scott + +#ifndef ARNOLDI_H +#define ARNOLDI_H + +#include "ConcreteParasiticsPvt.hh" + +namespace sta { + +struct delay_work; +class rcmodel; + +class GateTableModel; +class Pin; + +// +// single-driver arnoldi model +// +class arnoldi1 +{ +public: + arnoldi1() { order=0; n=0; d=NULL; e=NULL; U=NULL; ctot=0.0; sqc=0.0; } + ~arnoldi1(); + double elmore(int term_index); + + // + // calculate poles/residues for given rdrive + // + void calculate_poles_res(delay_work *D,double rdrive); + +public: + int order; + int n; // number of terms, including driver + double *d; // [order] + double *e; // [order-1] + double **U; // [order][n] + double ctot; + double sqc; +}; + +// This is the rcmodel, without Rd. +// n is the number of terms +// The vectors U[j] are of size n +class rcmodel : public ConcreteParasitic, + public arnoldi1 +{ +public: + rcmodel(); + virtual ~rcmodel(); + virtual float capacitance() const; + + const Pin **pinV; // [n] +}; + +struct timing_table +{ + GateTableModel *table; + const LibertyCell *cell; + const Pvt *pvt; + float in_slew; + float relcap; +}; + +} // namespace +#endif diff --git a/dcalc/Arnoldi.txt b/dcalc/Arnoldi.txt new file mode 100644 index 00000000..63ebad51 --- /dev/null +++ b/dcalc/Arnoldi.txt @@ -0,0 +1,57 @@ +The method is used for simulation with a time-and-voltage-dependent +current source. But it is simpler to describe with a linear driver. +Suppose we are given: + voltage nodes 1,..n initialized to V[j]=1.0 + a resistor network described by a conductance matrix G[j,k] + a drive resistance from node 1 to ground, Rdrv + capacitance of the nodes, c[j], also written as a + diagonal matrix C[j,k] +The node voltages will fall to zero with time. +Matrix equation: + GV+CdV/dt = -(V[0]/Rdrv)e0 +where e0 is the unit vector (1,0,0,..0). +Let G' be the matrix formed by taking G and adding 1/Rdrv in the +[0,0] position. Then we are solving G'V + CdV/dt = 0. +Let R be the inverse of G'. (In implementation, R may not actually +be formed as a matrix, instead some method of producing RV given V). +The exact solution would diagonalize sqrt(C) R sqrt(C). + +The Arnoldi method takes a matrix M and a vector of interest V, and +considers the subspace of vectors near V in terms of the action of M, +that is, the space spanned by V, MV, MMV, etc, for a small number of +powers. We do this here, but instead of finding the part of MV orthogonal +to V, we find the part of RCV that is C-orthogonal of V. We use C +as the metric, so the basis vectors U0, U1, U2 that we generate satisfy + Ui.CUj = (i==j?1:0) + +U0 = (1,1,..1)/sqrt(sum C) + representing the initial value of V, V[j] = sqrt(n)U0[j] at t=0. + sum(C) = C[0]+..C[n-1], so U0.C U0 = 1. +Let: + W = R C U0 + d0 = U0.C W + W' = W - d0 U0 + e0 = sqrt(W'.C W') + U1 = W'/e0 +Then: U0.C U0 = U1.C U1 = 1, U0.C U1 = 0, and + R C U0 = d0 U0 + e0 U1 +Next step: + W = R C U1 + d1 = U1.C W + W' = W - d1 U1 - e0 U0 + e1 = sqrt(W'.C W') + U2 = W'/e1 +and we have U2.C U2 = 1, U2.C U1 = U2.C U0 = 0, and + RC U1 = e0 U0 + d1 U1 + e1 U2 +In this way, RC, which in nonsymmetric in the original basis, +becomes a symmetric tridiagonal matrix in the U0,U1,.. basis. +We stop at, say, U3. The resulting 4x4 tridiagonal matrix is +positive definite, because it is the projection of sqrt(C)R sqrt(C) +to a subspace. So the eigenvalues of this small tridiagonal matrix +are guaranteed to be positive. This is the advantage over AWE. + +In the actual implementation, I remember there was some way of +isolating the node 0 where drive resistance is attached, so that the +tridiagonal matrix (d,e) can be recalculated without knowing Rdrv, +and then just the first d0,e0 updated when the Rdrv is known, or +when Rdrv changes in a simulation. diff --git a/dcalc/ArnoldiDelayCalc.cc b/dcalc/ArnoldiDelayCalc.cc new file mode 100644 index 00000000..8c65e2d1 --- /dev/null +++ b/dcalc/ArnoldiDelayCalc.cc @@ -0,0 +1,1490 @@ +// OpenSTA, Static Timing Analyzer +// Copyright (c) 2018, Parallax Software, Inc. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +// (c) 2018 Nefelus, Inc. +// +// Author: W. Scott + +#include +#include // abs +#include "Machine.hh" +#include "Report.hh" +#include "Debug.hh" +#include "Units.hh" +#include "Liberty.hh" +#include "TimingModel.hh" +#include "TimingArc.hh" +#include "TableModel.hh" +#include "Network.hh" +#include "Graph.hh" +#include "Parasitics.hh" +#include "Sdc.hh" +#include "DcalcAnalysisPt.hh" +#include "DelayCalc.hh" +#include "ArcDelayCalc.hh" +#include "RCDelayCalc.hh" +#include "GraphDelayCalc.hh" +#include "Arnoldi.hh" +#include "ArnoldiReduce.hh" +#include "ArnoldiDelayCalc.hh" + +namespace sta { + +// wireload8 is n^2 +// do not delete arnoldi parasitics +// handle rspf parasitics? + +// mv static functions to ArnoldiDelayCalc +// need slew only lookup for +// ra_delay +// ra_get_r +// ra_get_s + +using std::abs; + +struct delay_work; +struct delay_c; + +//////////////////////////////////////////////////////////////// + +static delay_work *delay_work_create(); +static void +delay_work_destroy(delay_work *D); +static double * +delay_work_get_residues(delay_work *D, + int term_index); + +static bool +tridiagEV(int n,double *d,double *e,double *p,double **v); + +////////////////////////////////////////////////////////////// + +struct delay_c +{ + double slew_derate; + double vlo; + double vhi; + double vlg; + double smin; + double x1; + double y1; + double vmid; // falling convention, should be >= 0.5 +}; + +// workspace for pole-residue -> delay calculations +// delay_work +// max order is 32 +struct delay_work +{ + double slew_derate; + double slew_factor; // (0,1.0] table_slew = slew_factor * full_slew + delay_c cV[2]; + delay_c *c; + + double lo_thresh; + double hi_thresh; + + int nmax; + double poles[32]; // 1/tau + double **resi; // resi[jrec][h] h=0,..order + double *v[32]; + double *w[32]; + double aa[32]; +}; + +//////////////////////////////////////////////////////////////// + +class ArnoldiDelayCalc : public RCDelayCalc +{ +public: + ArnoldiDelayCalc(StaState *sta); + virtual ~ArnoldiDelayCalc(); + virtual ArcDelayCalc *copy(); + virtual void findParasitic(const Pin *drvr_pin, + const TransRiseFall *tr, + const DcalcAnalysisPt *dcalc_ap, + // Return values. + Parasitic *¶sitic, + bool &delete_at_finish); + virtual void gateDelay(const LibertyCell *drvr_cell, + TimingArc *arc, + const Slew &in_slew, + // Pass in load_cap or drvr_parasitic. + float load_cap, + Parasitic *drvr_parasitic, + float related_out_cap, + const Pvt *pvt, + const DcalcAnalysisPt *dcalc_ap, + // Return values. + ArcDelay &gate_delay, + Slew &drvr_slew); + virtual void loadDelay(const Pin *load_pin, + // Return values. + ArcDelay &wire_delay, + Slew &load_slew); + virtual void inputPortDelay(const Pin *port_pin, + float in_slew, + const TransRiseFall *tr, + Parasitic *parasitic, + const DcalcAnalysisPt *dcalc_ap); + virtual void reportGateDelay(const LibertyCell *drvr_cell, + TimingArc *arc, + const Slew &in_slew, + float load_cap, + Parasitic *, + float related_out_cap, + const Pvt *pvt, + const DcalcAnalysisPt *dcalc_ap, + int digits, + string *result); + void delay_work_set_thresholds(delay_work *D, + double lo, + double hi, + bool rising, + double derate); + +private: + void gateDelaySlew(const LibertyCell *drvr_cell, + GateTableModel *table_model, + const Slew &in_slew, + float related_out_cap, + const Pvt *pvt, + // Return values. + ArcDelay &gate_delay, + Slew &drvr_slew); + void ar1_ceff_delay(delay_work *D,timing_table *tab, arnoldi1 *mod, + double *delays, double *slews); + double ra_rdelay_1(timing_table *tab, + double ctot); + double ra_get_r(delay_work *D, + timing_table *tab, + double rdelay, + double ctot); + double ra_get_s(delay_work *D, + timing_table *tab, + double r, + double c); + void ra_solve_for_s(delay_work *D, + double p, + double tlohi, + double &s); + // from poles and residues, solve for t20,t50,t80 + void pr_solve1(double s, + int order, + double *p, + double *rr, + double v1, + double *t1); + void pr_solve3(double s, + int order, + double *p, + double *rr, + double vhi, + double *thi, + double vmid, + double *tmid, + double vlo, + double *tlo); + + // + // routines for linear drive model and ceff + // + double pr_ceff(double s, + double rdrive, + int order, + double *p, + double *rr, + double ceff_time); + double ra_solve_for_t(double p, + double s, + double v); + void ra_solve_for_pt(double ps, + double v, + double *pt, + double *d); + void ra_calc_c(double lo, + double hi, + double *c_smin, + double *c_x1, + double *c_y1); + + rcmodel *rcmodel_; + int _pinNmax; + double *_delayV; + double *_slewV; + int pin_n_; + bool input_port_; + ArnoldiReduce *reduce_; + delay_work *delay_work_; +}; + +ArcDelayCalc * +makeArnoldiDelayCalc(StaState *sta) +{ + return new ArnoldiDelayCalc(sta); +} + +ArnoldiDelayCalc::ArnoldiDelayCalc(StaState *sta) : + RCDelayCalc(sta), + reduce_(new ArnoldiReduce(sta)), + delay_work_(delay_work_create()) +{ + _pinNmax = 1024; + _delayV = (double*)malloc(_pinNmax * sizeof(double)); + _slewV = (double*)malloc(_pinNmax * sizeof(double)); +} + +ArcDelayCalc * +ArnoldiDelayCalc::copy() +{ + return new ArnoldiDelayCalc(this); +} + +ArnoldiDelayCalc::~ArnoldiDelayCalc() +{ + delay_work_destroy(delay_work_); + free(_delayV); + free(_slewV); + delete reduce_; +} + +void +ArnoldiDelayCalc::findParasitic(const Pin *drvr_pin, + const TransRiseFall *drvr_tr, + const DcalcAnalysisPt *dcalc_ap, + // Return values. + Parasitic *¶sitic, + bool &delete_at_finish) +{ + parasitic = NULL; + delete_at_finish = false; + // set_load has precidence over parasitics. + if (!sdc_->drvrPinHasWireCap(drvr_pin)) { + const OperatingConditions *op_cond = dcalc_ap->operatingConditions(); + const Corner *corner = dcalc_ap->corner(); + const MinMax *cnst_min_max = dcalc_ap->constraintMinMax(); + const ParasiticAnalysisPt *parasitic_ap = dcalc_ap->parasiticAnalysisPt(); + Parasitic *parasitic_network = + parasitics_->findParasiticNetwork(drvr_pin, parasitic_ap); + bool delete_parasitic_network = false; + if (parasitic_network == NULL) { + Wireload *wireload = sdc_->wireloadDefaulted(cnst_min_max); + if (wireload) { + float pin_cap, wire_cap, fanout; + bool has_wire_cap; + graph_delay_calc_->netCaps(drvr_pin, drvr_tr, dcalc_ap, + pin_cap, wire_cap, fanout, has_wire_cap); + parasitic_network = parasitics_->makeWireloadNetwork(drvr_pin, + wireload, + fanout, + op_cond, + parasitic_ap); + delete_parasitic_network = true; + } + } + if (parasitic_network) { + parasitic = reduce_->reduceToArnoldi(parasitic_network, + drvr_pin, + parasitic_ap->couplingCapFactor(), + drvr_tr, op_cond, corner, + cnst_min_max, parasitic_ap); + if (delete_parasitic_network) { + Net *net = network_->net(drvr_pin); + parasitics_->deleteParasiticNetwork(net, parasitic_ap); + } + delete_at_finish = true; + } + } +} + +void +ArnoldiDelayCalc::inputPortDelay(const Pin *drvr_pin, + float in_slew, + const TransRiseFall *tr, + Parasitic *parasitic, + const DcalcAnalysisPt *dcalc_ap) +{ + RCDelayCalc::inputPortDelay(drvr_pin, in_slew, tr, parasitic, dcalc_ap); + rcmodel_ = NULL; + _delayV[0] = 0.0; + _slewV[0] = in_slew; + + int j; + if (parasitic) { + rcmodel_ = reinterpret_cast(parasitic); + pin_n_ = rcmodel_->n; + if (pin_n_ >= _pinNmax) { + _pinNmax *= 2; + if (pin_n_ >= _pinNmax) _pinNmax += pin_n_; + _pinNmax *= 2; + _delayV = (double*)realloc(_delayV,_pinNmax * sizeof(double)); + _slewV = (double*)realloc(_slewV,_pinNmax * sizeof(double)); + } + pin_n_ = 1; + + pin_n_ = rcmodel_->n; + double slew_derate = drvr_library_->slewDerateFromLibrary(); + double lo_thresh = drvr_library_->slewLowerThreshold(drvr_tr_); + double hi_thresh = drvr_library_->slewUpperThreshold(drvr_tr_); + bool rising = (drvr_tr_ == TransRiseFall::rise()); + delay_work_set_thresholds(delay_work_, lo_thresh, hi_thresh, rising, + slew_derate); + delay_c *c = delay_work_->c; + double c_log = c->vlg; + + for (j=1;jelmore(j); + _delayV[j] = 0.6931472*elmore; + _slewV[j] = in_slew + c_log*elmore/slew_derate; + } + } +} + +void +ArnoldiDelayCalc::gateDelay(const LibertyCell *drvr_cell, + TimingArc *arc, + const Slew &in_slew, + float load_cap, + Parasitic *drvr_parasitic, + float related_out_cap, + const Pvt *pvt, + const DcalcAnalysisPt *dcalc_ap, + // Return values. + ArcDelay &gate_delay, + Slew &drvr_slew) +{ + input_port_ = false; + drvr_tr_ = arc->toTrans()->asRiseFall(); + drvr_library_ = drvr_cell->libertyLibrary(); + drvr_parasitic_ = drvr_parasitic; + ConcreteParasitic *drvr_cparasitic = + reinterpret_cast(drvr_parasitic); + rcmodel_ = dynamic_cast(drvr_cparasitic); + GateTimingModel *model = gateModel(arc, dcalc_ap); + GateTableModel *table_model = dynamic_cast(model); + if (table_model && rcmodel_) + gateDelaySlew(drvr_cell, table_model, in_slew, + related_out_cap, pvt, + gate_delay, drvr_slew); + else + LumpedCapDelayCalc::gateDelay(drvr_cell, arc, in_slew, load_cap, + drvr_parasitic, related_out_cap, pvt, + dcalc_ap, gate_delay, drvr_slew); + drvr_slew_ = drvr_slew; + multi_drvr_slew_factor_ = 1.0F; +} + +void +ArnoldiDelayCalc::gateDelaySlew(const LibertyCell *drvr_cell, + GateTableModel *table_model, + const Slew &in_slew, + float related_out_cap, + const Pvt *pvt, + // Return values. + ArcDelay &gate_delay, + Slew &drvr_slew) +{ + pin_n_ = rcmodel_->n; + if (pin_n_ >= _pinNmax) { + _pinNmax *= 2; + if (pin_n_ >= _pinNmax) _pinNmax += pin_n_; + _delayV = (double*)realloc(_delayV,_pinNmax * sizeof(double)); + _slewV = (double*)realloc(_slewV,_pinNmax * sizeof(double)); + } + + pin_n_ = rcmodel_->n; + if (table_model) { + double slew_derate = drvr_library_->slewDerateFromLibrary(); + double lo_thresh = drvr_library_->slewLowerThreshold(drvr_tr_); + double hi_thresh = drvr_library_->slewUpperThreshold(drvr_tr_); + bool rising = (drvr_tr_ == TransRiseFall::rise()); + delay_work_set_thresholds(delay_work_, lo_thresh, hi_thresh, rising, + slew_derate); + if (rcmodel_->order > 0) { + timing_table tab; + tab.table = table_model; + tab.cell = drvr_cell; + tab.pvt = pvt; + tab.in_slew = in_slew; + tab.relcap = related_out_cap; + ar1_ceff_delay(delay_work_, &tab, rcmodel_, + _delayV, _slewV); + } + gate_delay = _delayV[0]; + drvr_slew = _slewV[0]; + } +} + +void +ArnoldiDelayCalc::loadDelay(const Pin *load_pin, + // Return values. + ArcDelay &wire_delay, + Slew &load_slew) +{ + wire_delay = 0.0; + load_slew = drvr_slew_ * multi_drvr_slew_factor_; + if (rcmodel_) { + // HACK + for (int i = 0; i < rcmodel_->n; i++) { + if (rcmodel_->pinV[i] == load_pin) { + wire_delay = _delayV[i] - _delayV[0]; + load_slew = _slewV[i] * multi_drvr_slew_factor_; + break; + } + } + } + thresholdAdjust(load_pin, wire_delay, load_slew); +} + +void +ArnoldiDelayCalc::reportGateDelay(const LibertyCell *, + TimingArc *, + const Slew &, + float, + Parasitic *, + float, + const Pvt *, + const DcalcAnalysisPt *, + int, + string *) +{ +} + +//////////////////////////////////////////////////////////////// + +// +// arnoldi1.cpp +// + +arnoldi1::~arnoldi1() +{ + free(d); + free(U); +} + +double +arnoldi1::elmore(int k) +{ + if (order==0) return 0.0; + if (order==1) return d[0]; + double sqctot = 1.0/U[0][0]; + double tau = d[0] + e[0]*U[1][k]*sqctot; + return tau; +} + +delay_work * +delay_work_create() +{ + int j; + delay_work *D = (delay_work*)malloc(sizeof(delay_work)); + D->nmax = 256; + D->resi = (double**)malloc(D->nmax*sizeof(double*)); + D->resi[0] = (double*)malloc(D->nmax*32*sizeof(double)); + for (j=1;jnmax;j++) D->resi[j] = D->resi[0] + j*32; + D->v[0] = (double*)malloc(32*32*sizeof(double)); + for (j=1;j<32;j++) D->v[j] = D->v[0] + j*32; + D->w[0] = (double*)malloc(32*D->nmax*sizeof(double)); + for (j=1;j<32;j++) D->w[j] = D->w[0] + j*D->nmax; + D->lo_thresh = 0.0; + D->hi_thresh = 0.0; + D->slew_derate = 0.0; + D->slew_factor = 0.0; + for (j=0;j<2;j++) { + D->cV[j].vlo = 0.0; + D->cV[j].vhi = 0.0; + D->cV[j].vlg = 0.0; + D->cV[j].smin = 0.0; + D->cV[j].x1 = 0.0; + D->cV[j].y1 = 0.0; + D->cV[j].vmid = 0.0; + } + D->c = D->cV; + return D; +} + +static void +delay_work_destroy(delay_work *D) +{ + free(D->resi[0]); + free(D->resi); + free(D->v[0]); + free(D->w[0]); + free(D); +} + +static void +delay_work_alloc(delay_work *D,int n) +{ + if (n<=D->nmax) return; + free(D->w[0]); + free(D->resi[0]); + free(D->resi); + D->nmax *= 2; + if (n > D->nmax) D->nmax = n; + int j; + D->resi = (double**)malloc(D->nmax*sizeof(double*)); + D->resi[0] = (double*)malloc(D->nmax*32*sizeof(double)); + for (j=1;jnmax;j++) D->resi[j] = D->resi[0] + j*32; + D->w[0] = (double*)malloc(32*D->nmax*sizeof(double)); + for (j=1;j<32;j++) D->w[j] = D->w[0] + j*D->nmax; +} + +void +ArnoldiDelayCalc::delay_work_set_thresholds(delay_work *D, + double lo, + double hi, + bool rising, + double derate) +{ + double mid = 0.5; // 0.0:1.0 + int i = rising?1:0; + D->c = D->cV+ i; + // WRONG + bool changed = (lo != D->c->vlo || hi != D->c->vhi); + if (changed) { + if (!(lo>0.01 && hi<0.99)) { + lo = 0.1; + hi = 0.9; + derate = 0.8; + } + D->c->slew_derate = derate; + D->c->vlo = lo; + D->c->vhi = hi; + D->c->vmid = mid; + D->c->vlg = log(hi/lo); + ra_calc_c(lo,hi, + &(D->c->smin), &(D->c->x1),&(D->c->y1)); + } + D->lo_thresh = D->c->vlo; + D->hi_thresh = D->c->vhi; + D->slew_derate = derate; + double measured_swing = D->c->vhi - D->c->vlo; + double reported_swing = measured_swing/D->slew_derate; + D->slew_factor = reported_swing; +} + +static double * +delay_work_get_residues(delay_work *D,int term_index) +{ + return D->resi[term_index]; +} + +////////////////////////////////////////////////////////////// +// +// calculate_poles_res +// + +void arnoldi1::calculate_poles_res(delay_work *D,double rdrive) +{ + if (n > D->nmax) delay_work_alloc(D,n); + double *p = D->poles; + double **v = D->v; + double **w = D->w; + double *aa = D->aa; + double **resi = D->resi; + int h,j,k; + double sum, dsave; + + dsave = d[0]; + d[0] += rdrive*ctot; + if (!tridiagEV(order,d,e,p,v)) + internalError("arnoldi delay calc failed.\n"); + d[0] = dsave; + + for (h=0;h32) + return false; + + for (i=0;i=1;h--) { + iter = 0; + while (abs(e[h])>1e-18) { // 1e-6ps + m=0; + if (m != h) { + if (iter++ == 20) + return false; + g = (d[h-1]-d[h])/(2.0*e[h]); + r = sqrt(1.0+g*g); // watch overflow + g = d[m]-d[h]+e[h]/(g + (g<0?-r:r)); + s = c = 1.0; + p = 0.0; + for (i=m+1;i<=h;i++) { + f = s*e[i]; + b = c*e[i]; + e[i-1] = r = sqrt(f*f+g*g); // watch + if (r == 0.0) { + d[i-1] -= p; + e[m] = 0.0; + break; + } + s = f/r; + c = g/r; + g = d[i-1]-p; + r = (d[i]-g)*s+2.0*c*b; + d[i-1] = g + (p=s*r); + g = c*r-b; + for (k=0;k p) { k=j; p=d[k]; } + if (k != i) { + d[k] = d[i]; + d[i] = p; + for (j=0;j= 0.0) + || (abs(2.0*f) > abs(dxold*df))) { + dxold = dx; + dx = 0.5*(xh-xl); + if (flast*f >0.0) { + // 2 successive bisections in same direction, + // accelerate + if (f<0.0) dx = 0.9348*(xh-xl); + else dx = 0.0625*(xh-xl); + } + flast = f; + rts = xl+dx; + if (xl == rts) { + return rts; + } + } else { + dxold = dx; + dx = f/df; + flast = 0.0; + temp = rts; + rts -= dx; + if (temp == rts) { + return rts; + } + } + if (abs(dx) < xacc) { + return rts; + } + get_dv(rts,s,order,p,rr,&f,&df); f -= val; + if (f<0.0) + xl = rts; + else + xh = rts; + } + if (abs(f)<1e-6) // 1uV + return rts; + return 0.5*(xl+xh); +} + +void +ArnoldiDelayCalc::pr_solve1(double s, + int order, + double *p, + double *rr, + double v1, + double *t1) +{ + double tmin = 0.0,tmax = 0.0,vmin = 0.0,vmax = 0.0; + int h, h0 = 0; + while (order>1 + && rr[order-1]<1e-8 // 1e-8V + && rr[order-1]>-1e-8) + order--; + if (rr[0]<0.5) { + for (h=1;h0.3 && rr[h]>rr[0]) { h0 = h; break; } + } + double p0 = p[h0]; + double ps,vs,ta,va; + vs = 0.0; + for (h=0;h1.0 && p[order-1]>500.0 && va>v1-0.002)) + debugPrint0(debug_, "arnoldi", 1, "err, pr_solve1, vav1) { + tmin = ta; vmin = va; + ta *= 2.0; + pr_get_v(ta,s,order,p,rr,&va); + } + if (va>v1) + debugPrint0(debug_, "arnoldi", 1, "err, pr_solve1, va>v1\n"); + tmax = ta; vmax = va; + } + } else { + // s is irrelevant + ta = s; va = vs; + while (va >= v1) { + tmin = ta; + vmin = va; + ta += 1.0/p0; + pr_get_v(ta,s,order,p,rr,&va); + } + tmax = ta; vmax = va; + } + *t1 = solve_t_bracketed(s,order,p,rr,v1,tmin,tmax,vmin,vmax); +} + +void +ArnoldiDelayCalc::pr_solve3(double s, + int order, + double *p, + double *rr, + double vhi, + double *thi, + double vmid, + double *tmid, + double vlo, + double *tlo) +{ + // falling, thi1 + && rr[order-1]<1e-8 // 1e-8V + && rr[order-1]>-1e-8) + order--; + if (rr[0]<0.5) { + for (h=1;h0.3 && rr[h]>rr[0]) { h0 = h; break; } + } + double p0 = p[h0]; + if (p0>10e+9) // 1/10ns + p0=10e+9; + double ps,vs,ta,va; + vs = 0.0; + for (h=0;hvhi) { + tmin2 = tmin5 = ta; + vmin2 = vmin5 = va; + tmin8 = ta; vmin8 = va; + if (va vhi) { + tmin2 = tmin5; + vmin2 = vmin5; + tmax2 = tmax5; + vmax2 = tmax5; + } else { + tmax2 = tmin5; + vmax2 = vmin5; + ta = vlo*s; + pr_get_v(ta,s,order,p,rr,&va); + tmin2 = ta; vmin2 = va; + } + } + } else if (vsvlo) { + tmin8 = ta; vmin8 = va; + ta += 1.0/p0; + pr_get_v(ta,s,order,p,rr,&va); + } + tmax8 = ta; vmax8 = va; + ta = vmid*s; + pr_get_v(ta,s,order,p,rr,&va); + tmin5 = ta; vmin5 = va; + if (va>vhi) { + tmin2 = ta; vmin2 = va; + } else { + tmax2 = ta; vmax2 = va; + ta = vlo*s; + pr_get_v(ta,s,order,p,rr,&va); + tmin2 = ta; vmin2 = va; + } + } else if (vsvmid) { + tmin5 = tmin8 = ta; vmin5 = tmin8 = va; + ta += 0.7/p0; + pr_get_v(ta,s,order,p,rr,&va); + } + tmax5 = ta; vmax5 = va; + if (va < vlo) { + tmax8 = ta; vmax8 = va; + } else { + tmin8 = ta; vmin8 = va; + ta += 1.0/p0; + pr_get_v(ta,s,order,p,rr,&va); + while (va>vlo) { + tmin8 = ta; vmin8 = va; + ta += 1.0/p0; + pr_get_v(ta,s,order,p,rr,&va); + } + tmax8 = ta; vmax8 = va; + } + } else { + // s is irrelevant + ta = s; va = vs; + tmin2 = tmin5 = tmin8 = ta; + vmin2 = vmin5 = vmin8 = va; + while (va > vhi) { + tmin2 = tmin5 = tmin8 = ta; + vmin2 = vmin5 = vmin8 = va; + ta += 1.0/p0; + pr_get_v(ta,s,order,p,rr,&va); + } + tmax2 = ta; vmax2 = va; + if (va < vmid) { + tmax5 = ta; vmax5 = va; + } else while (va > vmid) { + tmin5 = tmin8 = ta; + vmin5 = vmin8 = va; + ta += 1.0/p0; + pr_get_v(ta,s,order,p,rr,&va); + } + tmax5 = ta; vmax5 = va; + if (va < vlo) { + tmax8 = ta; vmax8 = va; + } else while (va > vlo) { + tmin8 = ta; + vmin8 = va; + ta += 1.0/p0; + pr_get_v(ta,s,order,p,rr,&va); + } + tmax8 = ta; vmax8 = va; + } + + *thi = solve_t_bracketed(s,order,p,rr,vhi,tmin2,tmax2,vmin2,vmax2); + *tmid= solve_t_bracketed(s,order,p,rr,vmid,tmin5,tmax5,vmin5,vmax5); + *tlo= solve_t_bracketed(s,order,p,rr,vlo,tmin8,tmax8,vmin8,vmax8); +} + +static double +calc_integ(double p,double s,double t) +{ + // integral of f(t)-vin(t) + double ps = p*s; + double pt = p*t; + double y,ept,eps; + if (t<=s) { + ept = (pt>40.0)?0.0:exp(-pt); + y = ept-1.0+pt; + } else { + pt = pt-ps; + ept = (pt>40.0)?0.0:exp(-pt); + eps = (ps>40.0)?0.0:exp(-ps); + y = ps - (1.0-eps)*ept; + } + y /= ps*p; + return y; +} + + +double +ArnoldiDelayCalc::pr_ceff(double s, + double rdrive, + int order, + double *p, + double *rr, + double ceff_time) +{ + double integi = 0.0; + double ceff, v0; + int j; + for (j=0;j1e-8) + debugPrint2(debug, "arnoldi", 1, "y f %g %g\n",y,f); + return x; +} + +double +ArnoldiDelayCalc::ra_solve_for_t(double p, + double s, + double v) +{ + double t; + double ps = p*s; + if (ps>30.0) { + t = (1.0+ps*(1.0-v)) / p; + return t; + } + double eps = exp(ps); + if ((1-ps*v)*eps >= 1.0) { + t = log((eps-1.0)/(ps*v)) / p; + } else { + t = ra_hinv((1-v)*ps, debug_)/p; + } + return t; +} + +void +ArnoldiDelayCalc::ra_solve_for_pt(double ps, + double v, + double *pt, + double *d) +{ + if (ps>30.0) { + *pt = 1.0+ps*(1.0-v); + *d = 1.0-v; + return; + } + double eps = exp(ps); + if ((1-ps*v)*eps >= 1.0) { + *pt = log((eps-1.0)/(ps*v)); + *d = eps/(eps-1.0) - 1.0/ps; + } else { + *pt = ra_hinv((1-v)*ps, debug_); + *d = (1.0-v)/(*pt - (1-v)*ps); + } +} + +void +ArnoldiDelayCalc::ra_calc_c(double vlo, + double vhi, + double *c_smin, + double *c_x1, + double *c_y1) +{ + double a = log(1.0/vhi); + *c_smin = a + ra_hinv((1.0-vhi)/vhi - a, debug_); + double b = log(1.0/vlo); + double c_s1 = b + ra_hinv((1.0-vlo)/vlo - b, debug_); + double a1 = (exp(c_s1)-1.0)/c_s1; + double den = log(a1/vlo) - ra_hinv((1.0-vhi)*c_s1, debug_); + *c_x1 = (vhi-vlo)/den; + *c_y1 = c_s1*(*c_x1); +} + +//////////////////////////////////////////////////////////////// + +// +// ceff.cpp +// + +void +ArnoldiDelayCalc::ra_solve_for_s(delay_work *D, + double p, + double tlohi, + double &s) +{ + delay_c *c = D->c; + double vhi = c->vhi; + double vlo = c->vlo; + // s is 0-100 + // solve f(x,y)=0 with f = x*(ptlo(y/x)-pthi(y/x))-(vhi-vlo) + // (x=0,y=1) + // (x=x1,y=y1) c->x1,y1 + // (x=x2,y=y2) x2=(vhi-vlo)/log(vhi/vlo) y2=(c->smin)*x2 + double x1 = c->x1; + double y1 = c->y1; + double x2 = (vhi-vlo)/c->vlg; + double y2 = (c->smin)*x2; + double ptlo,dlo; + double pthi,dhi; + double f,df,x,y; + x = c->vlg/(p*tlohi); + + if (x <= x1) { + y = y1 - 0.5*(x-x1); + if (y>1.0) y=1.0; + } else { + y = y1 - (x-x1)*(0.5 + 8*(x-x1)); + if (y.5e-12) // .5ps + debugPrint3(debug_, "arnoldi", 1, + "ra_solve_for_s p %g tlohi %s err %s\n", + p, + units_->timeUnit()->asString(tlohi), + units_->timeUnit()->asString(f)); +} + +///////////////////////////////////////////////////////////////////// +// method 0: +// r = a match to slew to (ctot, limited by cmin,cmax) +// if r>rdelay, lower r +// Now at any ceff (limited) +// If slew(r,0,ceff) is too big +// s = s_start(r,ceff), not smaller than Smin +// accept the pessimistic output slew +// Else +// solve for s + +// Rough translation of ra_get_r(sy_table) used by ar1_ceff_delay. +double +ArnoldiDelayCalc::ra_get_r(delay_work *D, + timing_table *tab, + double rdelay, + double ctot) +{ + // find the maximum r that allows a solution for s of + // (s,r,ctot)-> output_slew + // If this maximum is greater than rdelay, use rdelay. + delay_c *c = D->c; + double slew_derate = c->slew_derate; + double c_log = c->vlg; + float c1,d1,s1; + double tlohi,r; + c1 = ctot; + tab->table->gateDelay(tab->cell, tab->pvt, tab->in_slew, + c1, tab->relcap, d1, s1); + tlohi = slew_derate*s1; + r = tlohi/(c_log*c1); + if (rdelay>0.0 && r > rdelay) + r = rdelay; + // else printf("from rdelay %g to r %g\n",rdelay,r); + return r; +} + +double +ArnoldiDelayCalc::ra_get_s(delay_work *D, + timing_table *tab, + double r, + double c) +{ + delay_c *con = D->c; + double slew_derate = con->slew_derate; + double c_log = con->vlg; + double c_smin = con->smin; + double tlohi,smin,s; + float d1,s1; + tab->table->gateDelay(tab->cell, tab->pvt, tab->in_slew, + c, tab->relcap, d1, s1); + tlohi = slew_derate*s1; + smin = r*c*c_smin; // c_smin = ra_hinv((1-vhi)/vhi-log(vhi)) + log(vhi); + if (c_log*r*c >= tlohi) { + // printf("hit smin\n"); + s = smin; + } else { + s = smin+0.3*tlohi; + ra_solve_for_s(D,1.0/(r*c),tlohi,s); + } + return s; +} + +///////////////////////////////////////////////////////////////////// +// method 1: +// determine the drive resistance from change in delay versus ctot +// find the maximum r that allows a solution for s of +// (s,r,ctot)-> output_slew +// If this maximum is greater than rdelay, use rdelay. +// calculate s,r,mod -> t50_srmod, +// then t50_srmod+t50_sy-t50_sr + +double +ArnoldiDelayCalc::ra_rdelay_1(timing_table *tab, + double ctot) +{ + // determine the drive resistance from change in delay versus ctot + float c1 = ctot; + float c2 = 0.5*c1; + if (c1==c2) + return 0.0; + float d1, s1; + tab->table->gateDelay(tab->cell, tab->pvt, tab->in_slew, + c1, tab->relcap, d1, s1); + float d2, s2; + tab->table->gateDelay(tab->cell, tab->pvt, tab->in_slew, + c2, tab->relcap, d2, s2); + double dt50 = d1-d2; + if (dt50 <= 0.0) + return 0.0; + double rdelay = dt50/(c1-c2); + return rdelay; +} + +void +ArnoldiDelayCalc::ar1_ceff_delay(delay_work *D, + timing_table *tab, + arnoldi1 *mod, + double *delays, + double *slews) +{ + delay_c *con = D->c; + double slew_derate = con->slew_derate; + double vhi = con->vhi; + double vlo = con->vlo; + double ctot = mod->ctot; + double ceff,tlohi,t50_sy,r,s,t50_sr,rdelay; + float df,sf; + + debugPrint1(debug_, "arnoldi", 1, "\nctot=%s\n", + units_->capacitanceUnit()->asString(ctot)); + + rdelay = ra_rdelay_1(tab,ctot); + if (rdelay == 0.0) { + rdelay = 1e+3; // 1kohm + } + r = rdelay; + r = ra_get_r(D,tab,rdelay,ctot); + if (! (r>0.0 + && r<100e+3)) // 100khom + rdelay = 1e+3; // 1kohm + + bool bad = (r0.0 + && s<100e-9)) // 100ns + s = 0.5e-9; // .5ns + + if (debug_->check("arnoldi", 1)) { + double p = 1.0/(r*ctot); + double thix,tlox; + if (bad) printf("bad\n"); + debugPrint2(debug_, "arnoldi", 1, "at r=%s s=%s\n", + units_->resistanceUnit()->asString(r), + units_->timeUnit()->asString(s)); + thix = ra_solve_for_t(p,s,vhi); + tlox = ra_solve_for_t(p,s,vlo); + tab->table->gateDelay(tab->cell, tab->pvt,tab->in_slew, + ctot, tab->relcap, df, sf); + debugPrint3(debug_, "arnoldi", 1, + "table slew (in_slew %s ctot %s) = %s\n", + units_->timeUnit()->asString(tab->in_slew), + units_->capacitanceUnit()->asString(ctot), + units_->timeUnit()->asString(sf)); + tlohi = slew_derate*sf; + debugPrint2(debug_, "arnoldi", 1, "tlohi %s %s\n", + units_->timeUnit()->asString(tlohi), + units_->timeUnit()->asString(tlox-thix)); + } + ceff = ctot; + tab->table->gateDelay(tab->cell, tab->pvt, tab->in_slew, + ceff, tab->relcap, df, sf); + t50_sy = df; + t50_sr = ra_solve_for_t(1.0/(r*ceff),s,0.5); + + // calculate s,r,mod -> t50_srmod, + // then t50_srmod+t50_sy-t50_sr + + mod->calculate_poles_res(D,r); + double *p = D->poles; + double *rr = delay_work_get_residues(D,0); + double thi,tlo,t50_srmod; + pr_solve1(s,mod->order,p,rr,0.5,&t50_srmod); + + int ceff_it,j; + double ceff_time=0.0; + + if (!bad) { + for (ceff_it=0;ceff_it<3;ceff_it++) { + + // calculate ceff + ceff_time = s; + ceff = pr_ceff(s,r,mod->order,p,rr,ceff_time); + + if((ceff-1e-20) < 0.0) { // 1e-8pf + debugPrint0(debug_, "arnoldi", 1, + "Invalid effective capacitance, using total capacitance\n"); + ceff = ctot; + } + + // new mvs at ceff + s = ra_get_s(D,tab,r,ceff); + debugPrint1(debug_, "arnoldi", 1, "new mvs s = %s\n", + units_->timeUnit()->asString(s)); + } + } + debugPrint4(debug_, "arnoldi", 1, "r %s s %s ceff_time %s ceff %s\n", + units_->resistanceUnit()->asString(r), + units_->timeUnit()->asString(s), + units_->timeUnit()->asString(ceff_time), + units_->capacitanceUnit()->asString(ceff)); + + tab->table->gateDelay(tab->cell, tab->pvt, tab->in_slew, ceff, + tab->relcap, df, sf); + t50_sy = df; + t50_sr = ra_solve_for_t(1.0/(r*ceff),s,0.5); + for (j=0;jn;j++) { + rr = delay_work_get_residues(D,j); + pr_solve3(s,mod->order,p,rr,vhi,&thi,0.5,&t50_srmod,vlo,&tlo); + delays[j] = t50_srmod + t50_sy - t50_sr; + slews[j] = (tlo-thi)/slew_derate; + } + debugPrint3(debug_, "arnoldi", 1, "slews[0] %s thi %s tlo %s\n", + units_->timeUnit()->asString(slews[0]), + units_->timeUnit()->asString(tlo), + units_->timeUnit()->asString(thi)); +} + +} // namespace diff --git a/dcalc/ArnoldiDelayCalc.hh b/dcalc/ArnoldiDelayCalc.hh new file mode 100644 index 00000000..4ff74e6e --- /dev/null +++ b/dcalc/ArnoldiDelayCalc.hh @@ -0,0 +1,26 @@ +// OpenSTA, Static Timing Analyzer +// Copyright (c) 2018, Parallax Software, Inc. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +#ifndef ARNOLDIDELAYCALC_H +#define ARNOLDIDELAYCALC_H + +namespace sta { + +ArcDelayCalc * +makeArnoldiDelayCalc(StaState *sta); + +} // namespace +#endif diff --git a/dcalc/ArnoldiReduce.cc b/dcalc/ArnoldiReduce.cc new file mode 100644 index 00000000..7dc910ef --- /dev/null +++ b/dcalc/ArnoldiReduce.cc @@ -0,0 +1,646 @@ +// OpenSTA, Static Timing Analyzer +// Copyright (c) 2018, Parallax Software, Inc. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +// (c) 2018 Nefelus, Inc. +// +// Author: W. Scott + +#include +#include +#include +#include +#include "Machine.hh" +#include "Debug.hh" +#include "Units.hh" +#include "MinMax.hh" +#include "Sdc.hh" +#include "Network.hh" +#include "ArnoldiReduce.hh" +#include "Arnoldi.hh" +#include "ConcreteParasiticsPvt.hh" + +namespace sta { + +rcmodel::rcmodel() : + pinV(NULL) +{ +} + +rcmodel::~rcmodel() +{ + free(pinV); +} + +float +rcmodel::capacitance() const +{ + return ctot; +} + +struct ts_point +{ + ParasiticNode *node_; + int eN; + bool is_term; + int tindex; // index into termV of corresponding term + ts_edge **eV; + bool visited; + ts_edge *in_edge; + int ts; + double c; + double r; +}; + +struct ts_edge +{ + ConcreteParasiticResistor *resistor_; + ts_point *from; + ts_point *to; +}; + +//////////////////////////////////////////////////////////////// + + +const int ArnoldiReduce::ts_point_count_incr_ = 1024; +const int ArnoldiReduce::ts_edge_count_incr_ = 1024; + +ArnoldiReduce::ArnoldiReduce(StaState *sta) : + StaState(sta), + ts_pointNmax(1024), + ts_edgeNmax(1024), + termNmax(256), + dNmax(8) +{ + ts_pointV = (ts_point*)malloc(ts_pointNmax*sizeof(ts_point)); + ts_ordV = (int*)malloc(ts_pointNmax*sizeof(int)); + ts_pordV = (ts_point**)malloc(ts_pointNmax*sizeof(ts_point*)); + _u0 = (double*)malloc(ts_pointNmax*sizeof(double)); + _u1 = (double*)malloc(ts_pointNmax*sizeof(double)); + y = (double*)malloc(ts_pointNmax*sizeof(double)); + iv = (double*)malloc(ts_pointNmax*sizeof(double)); + r = (double*)malloc(ts_pointNmax*sizeof(double)); + c = (double*)malloc(ts_pointNmax*sizeof(double)); + par = (int*)malloc(ts_pointNmax*sizeof(int)); + + ts_edgeV = (ts_edge*)malloc(ts_edgeNmax*sizeof(ts_edge)); + ts_stackV = (ts_edge**)malloc(ts_edgeNmax*sizeof(ts_edge*)); + ts_eV = (ts_edge**)malloc(2*ts_edgeNmax*sizeof(ts_edge*)); + + pinV = (const Pin**)malloc(termNmax*sizeof(const Pin*)); + termV = (int*)malloc(termNmax*sizeof(int)); + outV = (int*)malloc(termNmax*sizeof(int)); + + d = (double*)malloc(dNmax*sizeof(double)); + e = (double*)malloc(dNmax*sizeof(double)); + U = (double**)malloc(dNmax*sizeof(double*)); + U0 = (double*)malloc(dNmax*termNmax*sizeof(double)); + int h; + for (h=0;h(parasitic); + drvr_pin_ = drvr_pin; + coupling_cap_factor_ = coupling_cap_factor; + tr_ = tr; + op_cond_ = op_cond; + corner_ = corner; + cnst_min_max_ = cnst_min_max; + ap_ = ap; + loadWork(); + return makeRcmodelDrv(); +} + +void +ArnoldiReduce::loadWork() +{ + pt_map_.clear(); + + int resistor_count = 0; + ConcreteParasiticDeviceSet devices; + parasitic_network_->devices(devices); + ConcreteParasiticDeviceSet::Iterator device_iter(devices); + while (device_iter.hasNext()) { + ParasiticDevice *device = device_iter.next(); + if (parasitics_->isResistor(device)) + resistor_count++; + } + + termN = parasitic_network_->pinNodes()->size(); + int subnode_count = parasitic_network_->subNodes()->size(); + ts_pointN = subnode_count + 1 + termN; + ts_edgeN = resistor_count; + allocPoints(); + allocTerms(termN); + ts_point *p0 = ts_pointV; + pterm0 = p0 + subnode_count + 1; + ts_point *pend = p0 + ts_pointN; + ts_point *p; + ts_edge *e0 = ts_edgeV; + ts_edge *eend = e0 + ts_edgeN; + + ts_edge *e; + int tindex; + for (p = p0; p!=pend; p++) { + p->node_ = NULL; + p->eN = 0; + p->is_term = false; + } + pend = pterm0; + e = e0; + int index = 0; + ConcreteParasiticSubNodeMap::Iterator + sub_node_iter(parasitic_network_->subNodes()); + while (sub_node_iter.hasNext()) { + ConcreteParasiticSubNode *node = sub_node_iter.next(); + pt_map_[node] = index; + p = p0 + index; + p->node_ = node; + p->eN = 0; + p->is_term = false; + index++; + } + + ConcreteParasiticPinNodeMap::Iterator + pin_node_iter(parasitic_network_->pinNodes()); + while (pin_node_iter.hasNext()) { + ConcreteParasiticPinNode *node = pin_node_iter.next(); + p = pend++; + pt_map_[node] = p - p0; + p->node_ = node; + p->eN = 0; + p->is_term = true; + tindex = p - pterm0; + p->tindex = tindex; + const Pin *pin = parasitics_->connectionPin(node); + pinV[tindex] = pin; + } + + ts_edge **eV = ts_eV; + ConcreteParasiticDeviceSet::Iterator device_iter2(devices); + while (device_iter2.hasNext()) { + ParasiticDevice *device = device_iter2.next(); + if (parasitics_->isResistor(device)) { + ConcreteParasiticResistor *resistor = + reinterpret_cast(device); + ts_point *pt1 = findPt(resistor->node1()); + ts_point *pt2 = findPt(resistor->node2()); + e->from = pt1; + e->to = pt2; + e->resistor_ = resistor; + pt1->eN++; + if (e->from != e->to) + pt2->eN++; + e++; + } + } + + for (p=p0;p!=pend;p++) { + if (p->node_) { + p->eV = eV; + eV += p->eN; + p->eN = 0; + } + } + for (e=e0;e!=eend;e++) { + e->from->eV[e->from->eN++] = e; + if (e->to != e->from) + e->to->eV[e->to->eN++] = e; + } +} + +void +ArnoldiReduce::allocPoints() +{ + if (ts_pointN > ts_pointNmax) { + free(par); + free(c); + free(r); + free(iv); free(y); free(_u1); free(_u0); + free(ts_pordV); + free(ts_ordV); + free(ts_pointV); + ts_pointNmax = ts_pointN + ts_point_count_incr_; + ts_pointV = (ts_point*)malloc(ts_pointNmax*sizeof(ts_point)); + ts_ordV = (int*)malloc(ts_pointNmax*sizeof(int)); + ts_pordV = (ts_point**)malloc(ts_pointNmax*sizeof(ts_point*)); + _u0 = (double*)malloc(ts_pointNmax*sizeof(double)); + _u1 = (double*)malloc(ts_pointNmax*sizeof(double)); + y = (double*)malloc(ts_pointNmax*sizeof(double)); + iv = (double*)malloc(ts_pointNmax*sizeof(double)); + r = (double*)malloc(ts_pointNmax*sizeof(double)); + c = (double*)malloc(ts_pointNmax*sizeof(double)); + par = (int*)malloc(ts_pointNmax*sizeof(int)); + } + if (ts_edgeN > ts_edgeNmax) { + free(ts_edgeV); + free(ts_eV); + free(ts_stackV); + ts_edgeNmax = ts_edgeN + ts_edge_count_incr_; + ts_edgeV = (ts_edge*)malloc(ts_edgeNmax*sizeof(ts_edge)); + ts_stackV = (ts_edge**)malloc(ts_edgeNmax*sizeof(ts_edge*)); + ts_eV = (ts_edge**)malloc(2*ts_edgeNmax*sizeof(ts_edge*)); + } +} + +void +ArnoldiReduce::allocTerms(int nterms) +{ + if (nterms > termNmax) { + free(U0); + free(outV); + free(termV); + free(pinV); + termNmax = nterms+256; + pinV = (const Pin**)malloc(termNmax*sizeof(const Pin*)); + termV = (int*)malloc(termNmax*sizeof(int)); + outV = (int*)malloc(termNmax*sizeof(int)); + + U0 = (double*)malloc(dNmax*termNmax*sizeof(double)); + int h; + for (h=0;h(node)]]; +} + +rcmodel * +ArnoldiReduce::makeRcmodelDrv() +{ + ParasiticNode *drv_node = parasitics_->findNode(parasitic_network_, + drvr_pin_); + ts_point *pdrv = findPt(drv_node); + makeRcmodelDfs(pdrv); + getRC(); + if (ctot_ < 1e-22) // 1e-10ps + return NULL; + setTerms(pdrv); + makeRcmodelFromTs(); + rcmodel *mod = makeRcmodelFromW(); + return mod; +} + +#define ts_orient( pp, ee) \ + if (ee->from!=pp) { ee->to = ee->from; ee->from = pp; } + +void +ArnoldiReduce::makeRcmodelDfs(ts_point *pdrv) +{ + bool loop = false; + int k; + ts_point *p,*q; + ts_point *p0 = ts_pointV; + ts_point *pend = p0 + ts_pointN; + for (p=p0;p!=pend;p++) + p->visited = 0; + ts_edge *e; + + ts_edge **stackV = ts_stackV; + int stackN = 1; + stackV[0] = e = pdrv->eV[0]; + ts_orient(pdrv,e); + pdrv->visited = 1; + pdrv->in_edge = NULL; + pdrv->ts = 0; + ts_ordV[0] = pdrv-p0; + ts_pordV[0] = pdrv; + ts_ordN = 1; + while (stackN>0) { + e = stackV[stackN-1]; + q = e->to; + + if (q->visited) { + // if it is a one-rseg self-loop, + // ignore, and do not even set *loop + if (e->to != e->from) + loop = true; + } else { + // try to descend + q->visited = 1; + q->ts = ts_ordN++; + ts_pordV[q->ts] = q; + ts_ordV[q->ts] = q-p0; + q->in_edge = e; + if (q->eN>1) { + for (k=0;keN;k++) if (q->eV[k] != e) break; + e = q->eV[k]; + ts_orient(q,e); + stackV[stackN++] = e; + continue; // descent + } + } + // try to ascend + while (--stackN>=0) { + e = stackV[stackN]; + p = e->from; + // find e in p->eV + for (k=0;keN;k++) if (p->eV[k]==e) break; + // if (k==p->eN) notice(0,"ERROR, e not found!\n"); + ++k; + if (k>=p->eN) continue; + e = p->eV[k]; + // check that next sibling is not the incoming edge + if (stackN>0 && e==stackV[stackN-1]) { + ++k; + if (k>=p->eN) continue; + e = p->eV[k]; + } + ts_orient(p,e); + stackV[stackN++] = e; + break; + } + + } // while (stackN) + + if (loop) + debugPrint1(debug_, "arnoldi", 1, + "net %s loop\n", + network_->pathName(drvr_pin_)); +} + +// makeRcmodelGetRC +void +ArnoldiReduce::getRC() +{ + ts_point *p, *p0 = ts_pointV; + ts_point *pend = p0 + ts_pointN; + ctot_ = 0.0; + for (p=p0;p!=pend;p++) { + p->c = 0.0; + p->r = 0.0; + if (p->node_) { + ParasiticNode *node = p->node_; + double cap = parasitics_->nodeGndCap(node, ap_) + + pinCapacitance(node); + if (cap > 0.0) { + p->c = cap; + ctot_ += cap; + } + else + p->c = 0.0; + if (p->in_edge && p->in_edge->resistor_) + p->r = parasitics_->value(p->in_edge->resistor_, ap_); + if (!(p->r>=0.0 && p->r<100e+3)) { // 0 < r < 100kohm + debugPrint2(debug_, "arnoldi", 1, + "R value %g out of range, drvr pin %s\n", + p->r, + network_->pathName(drvr_pin_)); + } + } + } +} + +float +ArnoldiReduce::pinCapacitance(ParasiticNode *node) +{ + const Pin *pin = parasitics_->connectionPin(node); + float pin_cap = 0.0; + if (pin) { + Port *port = network_->port(pin); + LibertyPort *lib_port = network_->libertyPort(port); + if (lib_port) + pin_cap = sdc_->pinCapacitance(pin,tr_, op_cond_, corner_, cnst_min_max_); + else if (network_->isTopLevelPort(pin)) + pin_cap = sdc_->portExtCap(port, tr_, cnst_min_max_); + } + return pin_cap; +} + +void +ArnoldiReduce::setTerms(ts_point *pdrv) +{ + // termV: from drv-ordered to fixed order + // outV: from drv-ordered to ts_pordV + ts_point *p; + int k,k0; + termV[0] = k0 = pdrv->tindex; + for (k=1;kts; + } +} + +// The guts of the arnoldi reducer. +void +ArnoldiReduce::makeRcmodelFromTs() +{ + ts_point *p, *p0 = ts_pointV; + int n = ts_ordN; + int nterms = termN; + int i,j,k,h; + if (debug_->check("arnoldi", 1)) { + for (k=0;kts,p-p0, + units_->capacitanceUnit()->asString(p->c)); + if (p->is_term) + debug_->print(" term%d",p->tindex); + if (p->in_edge) + debug_->print(" from T%d,P%ld r=%s", + p->in_edge->from->ts, + p->in_edge->from-p0, + units_->resistanceUnit()->asString(p->r)); + debug_->print("\n"); + } + for (i=0;ic; + for (j=1;jc; + r[j] = p->r; + par[j] = p->in_edge->from->ts; + } + + sum = 0.0; + for (j=0;jcapacitanceUnit()->asString(sum)); + ctot_ = sum; + sqc_ = sqrt(sum); + double sqrt_ctot_inv = 1.0/sqc_; + for (j=0;j0;j--) { + iv[j] += c[j]*u0[j]; + iv[par[j]] += iv[j]; + } + iv[0] += c[0]*u0[0]; + y[0] = 0.0; + for (j=1;jcheck("arnoldi", 1)) { + debugPrint1(debug_, "arnoldi", 1, + "tridiagonal reduced matrix, drvr pin %s\n", + network_->pathName(drvr_pin_)); + debugPrint2(debug_, "arnoldi", 1, "order %d n %d\n",order,n); + for (h=0;hprint("d[%d] %s", + h, + units_->timeUnit()->asString(d[h])); + if (hprint(" e[%d] %s", + h, + units_->timeUnit()->asString(e[h])); + debug_->print("\n"); + debug_->print("U[%d]",h); + for (i=0;iprint(" %6.2e",U[h][i]); + debug_->print("\n"); + } + } +} + +rcmodel * +ArnoldiReduce::makeRcmodelFromW() +{ + int j,h; + int n = termN; + rcmodel *mod = new rcmodel(); + mod->order = order; + mod->n = n; + if (order>0) { + int totd = order + order - 1 + order*n; + mod->d = (double *)malloc(totd*sizeof(double)); + if (order>1) mod->e = mod->d + order; + else mod->e = NULL; + mod->U = (double **)malloc(order*sizeof(double*)); + mod->U[0] = mod->d + order + order - 1; + for (h=1;hU[h]=mod->U[0] + h*n; + for (h=0;hd[h] = d[h]; + if (he[h] = e[h]; + for (j=0;jU[h][j] = U[h][j]; + } + } + + mod->pinV = (const Pin **)malloc(n*sizeof(const Pin*)); + for (j=0;jpinV[j] = pinV[k]; + } + + mod->ctot = ctot_; + mod->sqc = sqc_; + return mod; +} + +} // namespace diff --git a/dcalc/ArnoldiReduce.hh b/dcalc/ArnoldiReduce.hh new file mode 100644 index 00000000..200bd750 --- /dev/null +++ b/dcalc/ArnoldiReduce.hh @@ -0,0 +1,116 @@ +// OpenSTA, Static Timing Analyzer +// Copyright (c) 2018, Parallax Software, Inc. +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +// (c) 2018 Nefelus, Inc. +// +// Author: W. Scott + +#ifndef STA_ARNOLDI_REDUCE_H +#define STA_ARNOLDI_REDUCE_H + +#include "Map.hh" +#include "NetworkClass.hh" +#include "ParasiticsClass.hh" + +namespace sta { + +class ConcreteParasiticNetwork; +class ConcreteParasiticNode; + +class rcmodel; +struct ts_edge; +struct ts_point; + +typedef Map ArnolidPtMap; + +class ArnoldiReduce : public StaState +{ +public: + ArnoldiReduce(StaState *sta); + ~ArnoldiReduce(); + Parasitic *reduceToArnoldi(Parasitic *parasitic, + const Pin *drvr_pin, + float coupling_cap_factor, + const TransRiseFall *tr, + const OperatingConditions *op_cond, + const Corner *corner, + const MinMax *cnst_min_max, + const ParasiticAnalysisPt *ap); + +protected: + void loadWork(); + rcmodel *makeRcmodelDrv(); + void allocPoints(); + void allocTerms(int nterms); + ts_point *findPt(ParasiticNode *node); + void makeRcmodelDfs(ts_point *pdrv); + void getRC(); + float pinCapacitance(ParasiticNode *node); + void setTerms(ts_point *pdrv); + void makeRcmodelFromTs(); + rcmodel *makeRcmodelFromW(); + + ConcreteParasiticNetwork *parasitic_network_; + const Pin *drvr_pin_; + float coupling_cap_factor_; + const TransRiseFall *tr_; + const OperatingConditions *op_cond_; + const Corner *corner_; + const MinMax *cnst_min_max_; + const ParasiticAnalysisPt *ap_; + // ParasiticNode -> ts_point index. + ArnolidPtMap pt_map_; + + // rcWork + ts_point *ts_pointV; + int ts_pointN; + int ts_pointNmax; + static const int ts_point_count_incr_; + ts_edge *ts_edgeV; + int ts_edgeN; + int ts_edgeNmax; + static const int ts_edge_count_incr_; + ts_edge **ts_eV; + ts_edge **ts_stackV; + int *ts_ordV; + ts_point **ts_pordV; + int ts_ordN; + + int termNmax; + int termN; + ts_point *pterm0; + const Pin **pinV; // fixed order, offset from pterm0 + int *termV; // from drv-ordered to fixed order + int *outV; // from drv-ordered to ts_pordV + + int dNmax; + double *d; + double *e; + double *U0; + double **U; + + double ctot_; + double sqc_; + double *_u0, *_u1; + double *y, *iv; + double *c, *r; + int *par; + int order; +}; + +} // namespace +#endif + diff --git a/dcalc/DelayCalc.cc b/dcalc/DelayCalc.cc index 469e1772..3c6bf26c 100644 --- a/dcalc/DelayCalc.cc +++ b/dcalc/DelayCalc.cc @@ -21,6 +21,7 @@ #include "LumpedCapDelayCalc.hh" #include "SimpleRCDelayCalc.hh" #include "DmpDelayCalc.hh" +#include "ArnoldiDelayCalc.hh" #include "DelayCalc.hh" namespace sta { @@ -37,6 +38,7 @@ registerDelayCalcs() registerDelayCalc("simple_rc", makeSimpleRCDelayCalc); registerDelayCalc("dmp_ceff_elmore", makeDmpCeffElmoreDelayCalc); registerDelayCalc("dmp_ceff_two_pole", makeDmpCeffTwoPoleDelayCalc); + registerDelayCalc("arnoldi", makeArnoldiDelayCalc); } void diff --git a/dcalc/Makefile.am b/dcalc/Makefile.am index 35f710d5..00fae2ba 100644 --- a/dcalc/Makefile.am +++ b/dcalc/Makefile.am @@ -18,6 +18,9 @@ lib_LTLIBRARIES = libdcalc.la include_HEADERS = \ ArcDelayCalc.hh \ + Arnoldi.hh \ + ArnoldiDelayCalc.hh \ + ArnoldiReduce.hh \ DelayCalc.hh \ DcalcAnalysisPt.hh \ DmpCeff.hh \ @@ -32,6 +35,8 @@ include_HEADERS = \ libdcalc_la_SOURCES = \ ArcDelayCalc.cc \ + ArnoldiDelayCalc.cc \ + ArnoldiReduce.cc \ DcalcAnalysisPt.cc \ DelayCalc.cc \ DmpCeff.cc \