From b5763eb0d863dffd7ef93d9203223a20f7c833a9 Mon Sep 17 00:00:00 2001
From: Francesco Lannutti <nicolati@users.sf.net>
Date: Tue, 19 Sep 2017 21:19:23 +0200
Subject: [PATCH] Added the support for CUDA and non-CUDA models within the
 same netlist in CUSPICE

---
 src/include/ngspice/gendefs.h                 |  4 +++
 src/spicelib/analysis/CUSPICE/cucktflush.c    | 10 ++++++
 .../analysis/CUSPICE/cucktnonconupdate.c      |  4 +++
 .../analysis/CUSPICE/cucktrhsoldupdate.c      |  6 ++++
 src/spicelib/analysis/CUSPICE/cucktsetup.c    |  2 ++
 .../analysis/CUSPICE/cucktstatesupdate.c      | 26 ++++++++++++++
 src/spicelib/analysis/CUSPICE/cucktsystem.c   |  2 ++
 src/spicelib/analysis/CUSPICE/cuckttrunc.cu   |  4 +++
 src/spicelib/analysis/cktload.c               | 34 +++++++++++++++++++
 src/spicelib/analysis/cktsetup.c              |  5 +++
 src/spicelib/devices/bsim4v7/b4v7set.c        |  3 ++
 src/spicelib/devices/cap/capsetup.c           |  3 ++
 src/spicelib/devices/ind/indsetup.c           |  3 ++
 src/spicelib/devices/ind/mutsetup.c           |  3 ++
 src/spicelib/devices/res/ressetup.c           |  3 ++
 src/spicelib/devices/vsrc/vsrcset.c           |  3 ++
 16 files changed, 115 insertions(+)

diff --git a/src/include/ngspice/gendefs.h b/src/include/ngspice/gendefs.h
index 27c89d2e3..ac3a25d6d 100644
--- a/src/include/ngspice/gendefs.h
+++ b/src/include/ngspice/gendefs.h
@@ -44,6 +44,10 @@ struct GENmodel {       /* model structure for a resistor */
     GENinstance *GENinstances;  /* pointer to list of instances that have this
                                  * model */
     IFuid GENmodName;           /* pointer to character string naming this model */
+
+#ifdef USE_CUSPICE
+    unsigned int has_cuda:1 ;   /* flag to indicate if the model supports CUDA */
+#endif
 };
 
 
diff --git a/src/spicelib/analysis/CUSPICE/cucktflush.c b/src/spicelib/analysis/CUSPICE/cucktflush.c
index 7a5c21470..b72b06f3b 100644
--- a/src/spicelib/analysis/CUSPICE/cucktflush.c
+++ b/src/spicelib/analysis/CUSPICE/cucktflush.c
@@ -35,6 +35,7 @@ cuCKTflush
 CKTcircuit *ckt
 )
 {
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         long unsigned int m, mRHS ;
 
         m = (long unsigned int)(ckt->total_n_values + 1) ; // + 1 because of CKTdiagGmin
@@ -45,6 +46,15 @@ CKTcircuit *ckt
 
         /* Clean-up the CKTloadOutputRHS */
         cudaMemset (ckt->d_CKTloadOutputRHS, 0, mRHS * sizeof(double)) ;
+    } else {
+        int i, size ;
+
+        size = SMPmatSize (ckt->CKTmatrix) ;
+        for (i = 0 ; i <= size ; i++)
+            *(ckt->CKTrhs + i) = 0 ;
+
+        SMPclear (ckt->CKTmatrix) ;
+    }
 
     return (OK) ;
 }
diff --git a/src/spicelib/analysis/CUSPICE/cucktnonconupdate.c b/src/spicelib/analysis/CUSPICE/cucktnonconupdate.c
index 7f7658fa2..4bdddc9a9 100644
--- a/src/spicelib/analysis/CUSPICE/cucktnonconupdate.c
+++ b/src/spicelib/analysis/CUSPICE/cucktnonconupdate.c
@@ -47,8 +47,10 @@ CKTcircuit *ckt
 {
     cudaError_t status ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         status = cudaMemcpy (ckt->d_CKTnoncon, &(ckt->CKTnoncon), sizeof(int), cudaMemcpyHostToDevice) ;
         CUDAMEMCPYCHECK (ckt->d_CKTnoncon, 1, int, status)
+    }
 
     return (OK) ;
 }
@@ -61,8 +63,10 @@ CKTcircuit *ckt
 {
     cudaError_t status ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         status = cudaMemcpy (&(ckt->CKTnoncon), ckt->d_CKTnoncon, sizeof(int), cudaMemcpyDeviceToHost) ;
         CUDAMEMCPYCHECK (&(ckt->CKTnoncon), 1, int, status)
+    }
 
     return (OK) ;
 }
diff --git a/src/spicelib/analysis/CUSPICE/cucktrhsoldupdate.c b/src/spicelib/analysis/CUSPICE/cucktrhsoldupdate.c
index bdd7801c7..78feb1f98 100644
--- a/src/spicelib/analysis/CUSPICE/cucktrhsoldupdate.c
+++ b/src/spicelib/analysis/CUSPICE/cucktrhsoldupdate.c
@@ -47,8 +47,10 @@ CKTcircuit *ckt
 {
     long unsigned int size ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         size = (long unsigned int)(ckt->d_MatrixSize + 1) ;
         cudaMemset (ckt->d_CKTrhsOld, 0, size * sizeof(double)) ;
+    }
 
     return (OK) ;
 }
@@ -62,9 +64,11 @@ CKTcircuit *ckt
     long unsigned int size ;
     cudaError_t status ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         size = (long unsigned int)(ckt->d_MatrixSize + 1) ;
         status = cudaMemcpy (ckt->d_CKTrhsOld, ckt->CKTrhsOld, size * sizeof(double), cudaMemcpyHostToDevice) ;
         CUDAMEMCPYCHECK (ckt->d_CKTrhsOld, size, double, status)
+    }
 
     return (OK) ;
 }
@@ -78,9 +82,11 @@ CKTcircuit *ckt
     long unsigned int size ;
     cudaError_t status ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         size = (long unsigned int)(ckt->d_MatrixSize + 1) ;
         status = cudaMemcpy (ckt->CKTrhsOld, ckt->d_CKTrhsOld, size * sizeof(double), cudaMemcpyDeviceToHost) ;
         CUDAMEMCPYCHECK (ckt->CKTrhsOld, size, double, status)
+    }
 
     return (OK) ;
 }
diff --git a/src/spicelib/analysis/CUSPICE/cucktsetup.c b/src/spicelib/analysis/CUSPICE/cucktsetup.c
index 75d997465..320d5bb47 100644
--- a/src/spicelib/analysis/CUSPICE/cucktsetup.c
+++ b/src/spicelib/analysis/CUSPICE/cucktsetup.c
@@ -76,6 +76,7 @@ CKTcircuit *ckt
     size2 = (long unsigned int)ckt->CKTnumStates ;
     size3 = (long unsigned int)ckt->total_n_timeSteps ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         /* Topology Matrix Handling */
         status = cudaMalloc ((void **)&(ckt->CKTmatrix->d_CKTrhs), (n + 1) * sizeof(double)) ;
         CUDAMALLOCCHECK (ckt->CKTmatrix->d_CKTrhs, (n + 1), double, status)
@@ -158,6 +159,7 @@ CKTcircuit *ckt
         CUDAMALLOCCHECK (ckt->d_CKTtimeSteps, size3, double, status)
         status = cudaMalloc ((void **)&(ckt->d_CKTtimeStepsOut), size3 * sizeof(double)) ;
         CUDAMALLOCCHECK (ckt->d_CKTtimeStepsOut, size3, double, status)
+    }
 
     return (OK) ;
 }
diff --git a/src/spicelib/analysis/CUSPICE/cucktstatesupdate.c b/src/spicelib/analysis/CUSPICE/cucktstatesupdate.c
index 2fb453807..d2d26e0cd 100644
--- a/src/spicelib/analysis/CUSPICE/cucktstatesupdate.c
+++ b/src/spicelib/analysis/CUSPICE/cucktstatesupdate.c
@@ -28,6 +28,7 @@
 #include "ngspice/sperror.h"
 #include "cuda_runtime_api.h"
 #include "ngspice/CUSPICE/CUSPICE.h"
+#include <string.h>
 
 /* cudaMemcpy MACRO to check it for errors --> CUDAMEMCPYCHECK(name of pointer, dimension, type, status) */
 #define CUDAMEMCPYCHECK(a, b, c, d) \
@@ -47,8 +48,10 @@ CKTcircuit *ckt
 {
     long unsigned int size ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         size = (long unsigned int)ckt->CKTnumStates ;
         cudaMemset (ckt->d_CKTstate0, 0, size * sizeof(double)) ;
+    }
 
     return (OK) ;
 }
@@ -62,9 +65,11 @@ CKTcircuit *ckt
     long unsigned int size ;
     cudaError_t status ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         size = (long unsigned int)ckt->CKTnumStates ;
         status = cudaMemcpy (ckt->d_CKTstate0, ckt->CKTstate0, size * sizeof(double), cudaMemcpyHostToDevice) ;
         CUDAMEMCPYCHECK (ckt->d_CKTstate0, size, double, status)
+    }
 
     return (OK) ;
 }
@@ -78,9 +83,11 @@ CKTcircuit *ckt
     long unsigned int size ;
     cudaError_t status ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         size = (long unsigned int)ckt->CKTnumStates ;
         status = cudaMemcpy (ckt->CKTstate0, ckt->d_CKTstate0, size * sizeof(double), cudaMemcpyDeviceToHost) ;
         CUDAMEMCPYCHECK (ckt->CKTstate0, size, double, status)
+    }
 
     return (OK) ;
 }
@@ -91,12 +98,16 @@ cuCKTstate01copy
 CKTcircuit *ckt
 )
 {
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         long unsigned int size ;
         cudaError_t status ;
 
         size = (long unsigned int)ckt->CKTnumStates ;
         status = cudaMemcpy (ckt->d_CKTstate1, ckt->d_CKTstate0, size * sizeof(double), cudaMemcpyDeviceToDevice) ;
         CUDAMEMCPYCHECK (ckt->d_CKTstate1, size, double, status)
+    } else {
+        memcpy (ckt->CKTstate1, ckt->CKTstate0, (size_t) ckt->CKTnumStates * sizeof(double)) ;
+    }
 
     return (OK) ;
 }
@@ -110,11 +121,19 @@ CKTcircuit *ckt
     int i ;
     double *temp ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         temp = ckt->d_CKTstates [ckt->CKTmaxOrder + 1] ;
         for (i = ckt->CKTmaxOrder ; i >= 0 ; i--)
             ckt->d_CKTstates [i + 1] = ckt->d_CKTstates [i] ;
 
         ckt->d_CKTstates [0] = temp ;
+    } else {
+        temp = ckt->CKTstates [ckt->CKTmaxOrder + 1] ;
+        for (i = ckt->CKTmaxOrder ; i >= 0 ; i--) {
+            ckt->CKTstates [i + 1] = ckt->CKTstates [i] ;
+        }
+        ckt->CKTstates [0] = temp ;
+    }
 
     return (OK) ;
 }
@@ -125,6 +144,7 @@ cuCKTstate123copy
 CKTcircuit *ckt
 )
 {
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         long unsigned int size ;
         cudaError_t status ;
 
@@ -135,6 +155,10 @@ CKTcircuit *ckt
 
         status = cudaMemcpy (ckt->d_CKTstate3, ckt->d_CKTstate1, size * sizeof(double), cudaMemcpyDeviceToDevice) ;
         CUDAMEMCPYCHECK (ckt->d_CKTstate3, size, double, status)
+    } else {
+        memcpy (ckt->CKTstate2, ckt->CKTstate1, (size_t) ckt->CKTnumStates * sizeof(double)) ;
+        memcpy (ckt->CKTstate3, ckt->CKTstate1, (size_t) ckt->CKTnumStates * sizeof(double)) ;
+    }
 
     return (OK) ;
 }
@@ -147,8 +171,10 @@ CKTcircuit *ckt
 {
     cudaError_t status ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         status = cudaMemcpy (ckt->d_CKTdeltaOld, ckt->CKTdeltaOld, 7 * sizeof(double), cudaMemcpyHostToDevice) ;
         CUDAMEMCPYCHECK (ckt->d_CKTdeltaOld, 7, double, status)
+    }
 
     return (OK) ;
 }
diff --git a/src/spicelib/analysis/CUSPICE/cucktsystem.c b/src/spicelib/analysis/CUSPICE/cucktsystem.c
index e512d4a4b..b186bab28 100644
--- a/src/spicelib/analysis/CUSPICE/cucktsystem.c
+++ b/src/spicelib/analysis/CUSPICE/cucktsystem.c
@@ -48,6 +48,7 @@ CKTcircuit *ckt
     long unsigned int nz, n ;
     cudaError_t status ;
 
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         nz = (long unsigned int)ckt->CKTmatrix->CKTklunz ;
         n = (long unsigned int)ckt->CKTmatrix->CKTkluN ;
 
@@ -58,6 +59,7 @@ CKTcircuit *ckt
         /* Copy back the RHS */
         status = cudaMemcpy (ckt->CKTrhs, ckt->CKTmatrix->d_CKTrhs, (n + 1) * sizeof(double), cudaMemcpyDeviceToHost) ;
         CUDAMEMCPYCHECK (ckt->CKTrhs, (n + 1), double, status)
+    }
 
     return (OK) ;
 }
diff --git a/src/spicelib/analysis/CUSPICE/cuckttrunc.cu b/src/spicelib/analysis/CUSPICE/cuckttrunc.cu
index f12acda2c..c3d159968 100644
--- a/src/spicelib/analysis/CUSPICE/cuckttrunc.cu
+++ b/src/spicelib/analysis/CUSPICE/cuckttrunc.cu
@@ -31,6 +31,7 @@ cuCKTtrunc
 CKTcircuit *ckt, double timetemp, double *timeStep
 )
 {
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         long unsigned int size ;
         double timetempGPU ;
         int thread_x, thread_y, block_x ;
@@ -88,6 +89,9 @@ CKTcircuit *ckt, double timetemp, double *timeStep
         } else {
             *timeStep = timetemp ;
         }
+    } else {
+        *timeStep = MIN (2 * *timeStep, timetemp) ;
+    }
 
     return 0 ;
 }
diff --git a/src/spicelib/analysis/cktload.c b/src/spicelib/analysis/cktload.c
index 7ffbe4016..be3659642 100644
--- a/src/spicelib/analysis/cktload.c
+++ b/src/spicelib/analysis/cktload.c
@@ -103,8 +103,15 @@ CKTload(CKTcircuit *ckt)
         return (E_NOMEM) ;
 #endif
 
+    /* Load Sparse Matrix and RHS of all the models that support CUDA */
     for (i = 0; i < DEVmaxnum; i++) {
+
+#ifdef USE_CUSPICE
+        if (DEVices[i] && DEVices[i]->DEVload && ckt->CKThead[i] && ckt->CKThead[i]->has_cuda) {
+#else
         if (DEVices[i] && DEVices[i]->DEVload && ckt->CKThead[i]) {
+#endif
+
             error = DEVices[i]->DEVload (ckt->CKThead[i], ckt);
 
 #ifdef USE_CUSPICE
@@ -127,6 +134,13 @@ CKTload(CKTcircuit *ckt)
     }
 
 #ifdef USE_CUSPICE
+    int TopologyNNZ, TopologyNNZRHS ;
+
+    TopologyNNZ = ckt->total_n_Ptr + ckt->CKTdiagElements ; // + ckt->CKTdiagElements because of CKTdiagGmin
+                                                                // without the zeroes along the diagonal
+    TopologyNNZRHS = ckt->total_n_PtrRHS ;
+
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         /* Copy the CKTdiagGmin value to the GPU */
         // The real Gmin is needed only when the matrix will reside entirely on the GPU
         // Right now, only some models support CUDA, so the matrix is only partially created on the GPU
@@ -171,6 +185,26 @@ CKTload(CKTcircuit *ckt)
         status = cuCKTsystemDtoH (ckt) ;
         if (status != 0)
             return (E_NOMEM) ;
+    }
+
+    /* Load Sparse Matrix and RHS of all the models that do not support CUDA */
+    for (i = 0; i < DEVmaxnum; i++) {
+        if (DEVices[i] && DEVices[i]->DEVload && ckt->CKThead[i] && !ckt->CKThead[i]->has_cuda) {
+            error = DEVices[i]->DEVload (ckt->CKThead[i], ckt);
+
+            if (ckt->CKTnoncon)
+                ckt->CKTtroubleNode = 0;
+#ifdef STEPDEBUG
+            if (noncon != ckt->CKTnoncon) {
+                printf("device type %s nonconvergence\n",
+                       DEVices[i]->DEVpublic.name);
+                noncon = ckt->CKTnoncon;
+            }
+#endif /* STEPDEBUG */
+            if (error) return(error);
+        }
+    }
+
 #endif
 
 #ifdef XSPICE
diff --git a/src/spicelib/analysis/cktsetup.c b/src/spicelib/analysis/cktsetup.c
index 4d3d7366f..379faea1a 100644
--- a/src/spicelib/analysis/cktsetup.c
+++ b/src/spicelib/analysis/cktsetup.c
@@ -246,6 +246,8 @@ CKTsetup(CKTcircuit *ckt)
         ckt->CKTtopologyMatrixCOOxRHS = TMALLOC (double, TopologyNNZRHS) ;
 
 
+        if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
+
         /* Topology Matrix Pre-Allocation in CSR format */
         ckt->CKTtopologyMatrixCSRp = TMALLOC (int, nz + 1) ;
 
@@ -325,6 +327,7 @@ CKTsetup(CKTcircuit *ckt)
         ret = Compress (ckt->CKTtopologyMatrixCOOiRHS, ckt->CKTtopologyMatrixCSRpRHS, n + 1, TopologyNNZRHS) ;
 
         /* Multiply the Topology Matrix by the M Vector to build the Final CSC Matrix - after the CKTload Call */
+        }
 #endif
 
     } else {
@@ -337,6 +340,7 @@ CKTsetup(CKTcircuit *ckt)
     }
 
 #ifdef USE_CUSPICE
+    if (ckt->total_n_Ptr > 0 && ckt->total_n_PtrRHS > 0) {
         ckt->d_MatrixSize = SMPmatSize (ckt->CKTmatrix) ;
         status = cuCKTsetup (ckt) ;
         if (status != 0)
@@ -361,6 +365,7 @@ CKTsetup(CKTcircuit *ckt)
         /* CUSPARSE Matrix Properties Definition */
         cusparseSetMatType ((cusparseMatDescr_t)(ckt->CKTmatrix->CKTcsrmvDescr), CUSPARSE_MATRIX_TYPE_GENERAL) ;
         cusparseSetMatIndexBase ((cusparseMatDescr_t)(ckt->CKTmatrix->CKTcsrmvDescr), CUSPARSE_INDEX_BASE_ZERO) ;
+    }
 #endif
 
 #ifdef WANT_SENSE2
diff --git a/src/spicelib/devices/bsim4v7/b4v7set.c b/src/spicelib/devices/bsim4v7/b4v7set.c
index 91bfe2c9d..a462a1e16 100644
--- a/src/spicelib/devices/bsim4v7/b4v7set.c
+++ b/src/spicelib/devices/bsim4v7/b4v7set.c
@@ -2601,6 +2601,9 @@ do { if((here->ptr = SMPmakeElt(matrix,here->first,here->second))==(double *)NUL
 
         /* How much instances we have */
         model->n_instances = i ;
+
+        /* This model supports CUDA */
+        model->gen.has_cuda = 1 ;
     }
 
     /*  loop through all the BSIM4v7 models */
diff --git a/src/spicelib/devices/cap/capsetup.c b/src/spicelib/devices/cap/capsetup.c
index 671e8e4d0..d6d91ef9b 100644
--- a/src/spicelib/devices/cap/capsetup.c
+++ b/src/spicelib/devices/cap/capsetup.c
@@ -136,6 +136,9 @@ do { if((here->ptr = SMPmakeElt(matrix, here->first, here->second)) == NULL){\
 
         /* How much instances we have */
         model->n_instances = i ;
+
+        /* This model supports CUDA */
+        model->gen.has_cuda = 1 ;
     }
 
     /*  loop through all the capacitor models */
diff --git a/src/spicelib/devices/ind/indsetup.c b/src/spicelib/devices/ind/indsetup.c
index 9f73b4ea7..889cb468d 100644
--- a/src/spicelib/devices/ind/indsetup.c
+++ b/src/spicelib/devices/ind/indsetup.c
@@ -123,6 +123,9 @@ do { if((here->ptr = SMPmakeElt(matrix, here->first, here->second)) == NULL){\
 
         /* How much instances we have */
         model->n_instances = i ;
+
+        /* This model supports CUDA */
+        model->gen.has_cuda = 1 ;
     }
 
     /*  loop through all the inductor models */
diff --git a/src/spicelib/devices/ind/mutsetup.c b/src/spicelib/devices/ind/mutsetup.c
index 0abf21076..d66162ddf 100644
--- a/src/spicelib/devices/ind/mutsetup.c
+++ b/src/spicelib/devices/ind/mutsetup.c
@@ -79,6 +79,9 @@ MUTsetup(SMPmatrix *matrix, GENmodel *inModel, CKTcircuit *ckt, int *states)
 
         /* How much instances we have */
         model->n_instances = i;
+
+        /* This model supports CUDA */
+        model->gen.has_cuda = 1 ;
     }
 
     /*  loop through all the mutual inductor models */
diff --git a/src/spicelib/devices/res/ressetup.c b/src/spicelib/devices/res/ressetup.c
index ad5accfcb..d1fa24273 100644
--- a/src/spicelib/devices/res/ressetup.c
+++ b/src/spicelib/devices/res/ressetup.c
@@ -94,6 +94,9 @@ do { if((here->ptr = SMPmakeElt(matrix, here->first, here->second)) == NULL){\
 
         /* How much instances we have */
         model->n_instances = i ;
+
+        /* This model supports CUDA */
+        model->gen.has_cuda = 1 ;
     }
 
     /*  loop through all the resistor models */
diff --git a/src/spicelib/devices/vsrc/vsrcset.c b/src/spicelib/devices/vsrc/vsrcset.c
index f83225c6e..835d6bd75 100644
--- a/src/spicelib/devices/vsrc/vsrcset.c
+++ b/src/spicelib/devices/vsrc/vsrcset.c
@@ -80,6 +80,9 @@ do { if((here->ptr = SMPmakeElt(matrix, here->first, here->second)) == NULL){\
 
         /* How much instances we have */
         model->n_instances = i ;
+
+        /* This model supports CUDA */
+        model->gen.has_cuda = 1 ;
     }
 
     /*  loop through all the voltage source models */