enable simd

2020-07-12 10:24:26 +02:00 · 2020-07-12 10:24:26 +02:00 · f12d2a3f0d
parent 0d34bb52cc
commit f12d2a3f0d
13 changed files with 25205 additions and 2 deletions
--- a/compile_min_simd.sh
+++ b/compile_min_simd.sh
@ -60,7 +60,7 @@ else
  echo "configuring for 64 bit release"
  echo
 # You may add  --enable-adms to the following command for adding adms generated devices 
-  ../configure --with-wingui --enable-xspice --enable-cider --enable-openmp --disable-debug prefix="C:/Spice64" CFLAGS="-m64 -O3 -march=native " LDFLAGS="-m64 -s"
+  ../configure --with-wingui --enable-xspice --enable-cider --enable-openmp --disable-debug prefix="C:/Spice64" CFLAGS="-m64 -O3 -mtune=native " LDFLAGS="-m64 -s"
 fi
 if [ $? -ne 0 ]; then  echo "../configure failed"; exit 1 ; fi

--- a/src/spicelib/devices/bsim3v32/b3v32acm.c
+++ b/src/spicelib/devices/bsim3v32/b3v32acm.c
@ -0,0 +1,104 @@
+/*
+ * BSIM3v32 adapter for ACM_saturationCurrents(): unpacks the ACM related
+ * fields of the model and of the instance and forwards them, followed by the
+ * two output pointers. The return value of ACM_saturationCurrents() is
+ * passed back unchanged.
+ */
+inline int BSIM3v32_ACM_saturationCurrents
+(
+	BSIM3v32model *model,
+	BSIM3v32instance *here,
+        double *DrainSatCurrent,
+        double *SourceSatCurrent
+)
+{
+	int status;
+
+	status = ACM_saturationCurrents(
+		model->BSIM3v32acmMod, model->BSIM3v32calcacm,
+		here->BSIM3v32geo,
+		model->BSIM3v32hdif, model->BSIM3v32wmlt,
+		here->BSIM3v32w,
+		model->BSIM3v32xw,
+		model->BSIM3v32jctTempSatCurDensity,
+		model->BSIM3v32jctSidewallTempSatCurDensity,
+		here->BSIM3v32drainAreaGiven, here->BSIM3v32drainArea,
+		here->BSIM3v32drainPerimeterGiven, here->BSIM3v32drainPerimeter,
+		here->BSIM3v32sourceAreaGiven, here->BSIM3v32sourceArea,
+		here->BSIM3v32sourcePerimeterGiven, here->BSIM3v32sourcePerimeter,
+		DrainSatCurrent, SourceSatCurrent);
+	return status;
+}
+
+	    
+/*
+ * BSIM3v32 adapter for ACM_junctionCapacitances().
+ *
+ * Model versions 3.2.4 and 3.2.3 pass the "Temp" unit junction capacitances
+ * of the model; every other version passes the plain unit capacitances.
+ * All remaining arguments are identical for both cases. The return value of
+ * ACM_junctionCapacitances() is passed back unchanged.
+ */
+inline int BSIM3v32_ACM_junctionCapacitances
+(
+	BSIM3v32model *model,
+	BSIM3v32instance *here,
+	double *areaDrainBulkCapacitance,
+	double *periDrainBulkCapacitance,
+	double *gateDrainBulkCapacitance,
+	double *areaSourceBulkCapacitance,
+	double *periSourceBulkCapacitance,
+	double *gateSourceBulkCapacitance
+)
+{
+	double unitAreaCap, unitSidewallCap, unitGateSidewallCap;
+
+	if (model->BSIM3v32intVersion == BSIM3v32V324 ||
+	    model->BSIM3v32intVersion == BSIM3v32V323) {
+		unitAreaCap = model->BSIM3v32unitAreaTempJctCap;
+		unitSidewallCap = model->BSIM3v32unitLengthSidewallTempJctCap;
+		unitGateSidewallCap = model->BSIM3v32unitLengthGateSidewallTempJctCap;
+	} else {
+		/* BSIM3v32V322, BSIM3v32V32 and any other value */
+		unitAreaCap = model->BSIM3v32unitAreaJctCap;
+		unitSidewallCap = model->BSIM3v32unitLengthSidewallJctCap;
+		unitGateSidewallCap = model->BSIM3v32unitLengthGateSidewallJctCap;
+	}
+
+	return ACM_junctionCapacitances(
+		model->BSIM3v32acmMod, model->BSIM3v32calcacm,
+		here->BSIM3v32geo,
+		model->BSIM3v32hdif, model->BSIM3v32wmlt,
+		here->BSIM3v32w,
+		model->BSIM3v32xw,
+		here->BSIM3v32drainAreaGiven, here->BSIM3v32drainArea,
+		here->BSIM3v32drainPerimeterGiven, here->BSIM3v32drainPerimeter,
+		here->BSIM3v32sourceAreaGiven, here->BSIM3v32sourceArea,
+		here->BSIM3v32sourcePerimeterGiven, here->BSIM3v32sourcePerimeter,
+		unitAreaCap, unitSidewallCap, unitGateSidewallCap,
+		areaDrainBulkCapacitance, periDrainBulkCapacitance,
+		gateDrainBulkCapacitance,
+		areaSourceBulkCapacitance, periSourceBulkCapacitance,
+		gateSourceBulkCapacitance);
+}
--- a/src/spicelib/devices/bsim3v32/b3v32acm.h
+++ b/src/spicelib/devices/bsim3v32/b3v32acm.h
@ -0,0 +1,25 @@
+#ifndef BSIM3v32ACM_H
+#define BSIM3v32ACM_H
+
+/*
+ * Forwards the ACM fields of `model` and `here` to ACM_saturationCurrents()
+ * and returns its status; the results are written through DrainSatCurrent
+ * and SourceSatCurrent.
+ * NOTE(review): this header does not include bsim3v32def.h itself, so the
+ * BSIM3v32 types must already be declared by the including file.
+ */
+int BSIM3v32_ACM_saturationCurrents
+(
+	BSIM3v32model *model,
+	BSIM3v32instance *here,
+        double *DrainSatCurrent,
+        double *SourceSatCurrent
+);
+
+	    
+/*
+ * Forwards the ACM fields of `model` and `here` to ACM_junctionCapacitances()
+ * and returns its status; the six capacitances are written through the
+ * output pointers. The unit capacitances handed over depend on
+ * model->BSIM3v32intVersion (see b3v32acm.c).
+ */
+int BSIM3v32_ACM_junctionCapacitances
+(
+	BSIM3v32model *model,
+	BSIM3v32instance *here,
+	double *areaDrainBulkCapacitance,
+	double *periDrainBulkCapacitance,
+	double *gateDrainBulkCapacitance,
+	double *areaSourceBulkCapacitance,
+	double *periSourceBulkCapacitance,
+	double *gateSourceBulkCapacitance
+);
+
+#endif
--- a/src/spicelib/devices/bsim3v32/b3v32ldsel.c
+++ b/src/spicelib/devices/bsim3v32/b3v32ldsel.c
@ -0,0 +1,262 @@
+/*******************************************************************************
+ * Copyright 2020 Florian Ballenegger, Anamosic Ballenegger Design
+ *******************************************************************************
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ ******************************************************************************/
+
+#include "ngspice/ngspice.h"
+#include "bsim3v32def.h"
+
+/* NSIMD = number of instances evaluated in parallel (= 4 for AVX2) */
+#define NDATASIMD 7
+
+#define DEBUG 0
+
+#ifndef USE_OMP
+extern int BSIM3v32LoadSeq(BSIM3v32instance *here, CKTcircuit *ckt, double* data, int stride);
+extern int BSIM3v32LoadSIMD(BSIM3v32instance **heres, CKTcircuit *ckt, double data[7][NSIMD]);
+#else
+extern void BSIM3v32LoadRhsMat(GENmodel *inModel, CKTcircuit *ckt);
+extern int BSIM3v32LoadSeq(BSIM3v32instance *here, CKTcircuit *ckt, int);
+extern int BSIM3v32LoadSIMD(BSIM3v32instance **heres, CKTcircuit *ckt);
+#endif
+
+
+#ifndef USE_OMP
+/*
+ * Load entry point for builds without OpenMP.
+ *
+ * Walks every group of every model. Instances are first passed one by one to
+ * BSIM3v32LoadSeq() with a column of `data` as output (stride NSIMD). The
+ * first value of the column is preset to NaN; a column is only kept when
+ * that value is no longer NaN after the call, otherwise the column is reused
+ * for the next instance. Once NSIMD columns are filled they are handed to
+ * BSIM3v32LoadSIMD(); a partially filled batch and the trailing instances
+ * are loaded with BSIM3v32LoadSeq(..., NULL, 0).
+ *
+ * Returns 0 on success or the first non-zero status of a load call.
+ */
+int
+BSIM3v32loadSel (GENmodel *inModel, CKTcircuit *ckt)
+{
+	double data[NDATASIMD][NSIMD];
+	BSIM3v32instance *batch[NSIMD];
+	BSIM3v32model *model;
+	BSIM3v32group *group; /* instances sharing model, pParam, nqsMode, geo and off */
+
+	for (model = (BSIM3v32model*)inModel; model != NULL; model = BSIM3v32nextModel(model)) {
+		for (group = model->groupHead; group != NULL; group = group->next) {
+			int next = 0; /* index of the next instance not yet visited */
+			int status;
+
+			while (next + NSIMD <= group->InstCount) {
+				int filled = 0;
+				int lane;
+
+				while (filled < NSIMD && next < group->InstCount) {
+					BSIM3v32instance *inst = group->InstArray[next];
+
+					next++;
+					data[0][filled] = NAN;
+					batch[filled] = inst;
+					status = BSIM3v32LoadSeq(inst, ckt, &data[0][filled], NSIMD);
+					if (status)
+						return status;
+					/* keep the column only if the call replaced the NaN marker */
+					if (!isnan(data[0][filled]))
+						filled++;
+				}
+
+				if (filled == NSIMD) {
+					/* process NSIMD instances at once */
+					status = BSIM3v32LoadSIMD(batch, ckt, data);
+					if (status)
+						return status;
+					continue;
+				}
+
+				/* ran out of instances before the batch was complete */
+				for (lane = 0; lane < filled; lane++) {
+					status = BSIM3v32LoadSeq(batch[lane], ckt, NULL, 0);
+					if (status)
+						return status;
+				}
+			}
+
+			/* remaining instances are evaluated sequentially */
+			for (; next < group->InstCount; next++) {
+				status = BSIM3v32LoadSeq(group->InstArray[next], ckt, NULL, 0);
+				if (status)
+					return status;
+			}
+		}
+	}
+
+	return 0; /* no error */
+}
+#endif
+
+
+#ifdef USE_OMP
+/*
+ * Load entry point for OpenMP builds; parallelism is applied inside each
+ * group only.
+ *
+ * For every full batch of NSIMD consecutive instances of a group, each
+ * instance has BSIM3v32SIMDCheck set to -1 and is passed to
+ * BSIM3v32LoadSeq(..., 1). If at least one instance no longer has
+ * BSIM3v32SIMDCheck == -1 afterwards, the whole batch is passed to
+ * BSIM3v32LoadSIMD(). The instances that do not fill a batch are loaded with
+ * BSIM3v32LoadSeq(..., 2) outside the parallel loop. BSIM3v32LoadRhsMat() is
+ * called once at the end.
+ *
+ * Returns 0, or one of the non-zero statuses reported by a load call.
+ * `error` is shared by all threads, so every write to it inside the parallel
+ * loop is wrapped in a critical section to avoid a data race.
+ */
+int
+BSIM3v32loadSel (GENmodel *inModel, CKTcircuit *ckt)
+{
+	BSIM3v32group *group;
+	BSIM3v32model *model = (BSIM3v32model*)inModel;
+	int error=0;
+	int idx=0;
+	for (; model != NULL; model = BSIM3v32nextModel(model))
+	for (group=model->groupHead; group!=NULL; group=group->next)
+	{
+
+	#pragma omp parallel for
+	for (idx=0; idx <= group->InstCount-NSIMD; idx+=NSIMD)
+	{
+		int local_error;
+		int i;
+		int needeval=0;
+		for(i=0;i<NSIMD;i++)
+		{
+			group->InstArray[idx+i]->BSIM3v32SIMDCheck=-1;
+			local_error = BSIM3v32LoadSeq(group->InstArray[idx+i], ckt, 1);
+			if (local_error)
+			{
+				#pragma omp critical(BSIM3v32loadSel_error)
+				error = local_error;
+			}
+
+			if(group->InstArray[idx+i]->BSIM3v32SIMDCheck!=-1)
+				needeval=1;
+		}
+		if(!needeval)
+			continue; /* all NSIMD instances are bypassed */
+		local_error = BSIM3v32LoadSIMD(&group->InstArray[idx], ckt);
+		if (local_error)
+		{
+			#pragma omp critical(BSIM3v32loadSel_error)
+			error = local_error;
+		}
+	}
+	/* the value of idx after the parallel loop is not relied upon; recompute it */
+	idx = NSIMD*(group->InstCount/NSIMD);
+	for (; idx < group->InstCount; idx++) {
+		int local_error = BSIM3v32LoadSeq(group->InstArray[idx], ckt, 2);
+		if (local_error) error = local_error;
+	}
+	}
+
+	BSIM3v32LoadRhsMat(inModel, ckt);
+	return error;
+}
+
+#if 0
+int
+BSIM3v32loadSelVrai (GENmodel *inModel, CKTcircuit *ckt)
+{
+	/*
+	This version do omp parallel for most instances of all models combined 
+	*/
+	BSIM3v32group *group;
+	BSIM3v32model *model = (BSIM3v32model*)inModel;
+    
+    	int idx;
+	int error = 0;
+	int nsimd,nsisd;
+	/* pre load all instances */
+	if(DEBUG) printf("loadomp %d\n",model->BSIM3v32InstCount);
+	#pragma omp parallel for
+	for (idx = 0; idx < model->BSIM3v32InstCount; idx++) {
+		BSIM3v32instance *here = model->BSIM3v32InstanceArray[idx];
+		if(DEBUG) printf("loadomp preload seq %d\n",idx);
+		here->BSIM3v32SIMDCheck=-1;
+		int local_error = BSIM3v32LoadSeq(here,ckt,1);
+		if (local_error) error=local_error;
+	}
+	if (error) printf("load error\n");
+        if (error) return error;
+	
+	/* sort instances to run in SIMD */
+	nsimd=0;
+	nsisd=0;
+	for (model = (BSIM3v32model*)inModel; model != NULL; model = BSIM3v32nextModel(model))
+	for (group=model->groupHead; group!=NULL; group=group->next)
+	{
+		int rev=group->InstCount;
+		group->SimdCount = 0;
+		for(idx=0;idx<group->InstCount;idx++)
+		{
+			BSIM3v32instance *here = group->InstArray[idx];
+			if(here->BSIM3v32SIMDCheck==-1)
+			{
+				/* bypassed, swap current inst to the end */
+				rev--;
+				group->InstArray[idx] = group->InstArray[rev];
+				group->InstArray[rev] = here;
+			}
+		}
+		group->EvalCount = rev;
+		group->SimdCount = rev/NSIMD;
+		nsimd += group->SimdCount;
+		nsisd += rev - NSIMD*group->SimdCount;
+	}
+	
+	if(DEBUG) printf("nsimd=%d nsisd=%d\n",nsimd,nsisd);
+	/* run SIMD in parallel */
+	#pragma omp parallel for
+	for(idx=0;idx<nsimd;idx++)
+	{
+		if(DEBUG) printf("Search SIMD index %d\n", idx);
+		int search=idx;
+		BSIM3v32model* mod;
+		BSIM3v32group* grp;
+		for (mod = (BSIM3v32model*)inModel; mod != NULL; mod = BSIM3v32nextModel(mod))
+		for (grp=mod->groupHead; grp!=NULL; grp=grp->next)
+		{
+			if(search>=0 && search < grp->SimdCount)
+			{
+				BSIM3v32instance** heres = &grp->InstArray[search*NSIMD];
+				if(DEBUG) printf("Call Simd index %d of %d\n", search*NSIMD, grp->InstCount);
+				int local_error = BSIM3v32LoadSIMD(heres, ckt);
+				if(DEBUG) printf("Call ended\n");
+				if(local_error) error=local_error;
+			}
+			search -= grp->SimdCount;
+		}
+	}
+	if(error) return error;
+	
+	if(DEBUG) printf("now switch to sisd\n");
+	/* run remaining SISD in parallel */
+	#pragma omp parallel for
+	for(idx=0;idx<nsisd;idx++)
+	{
+		int search=idx;
+		BSIM3v32model* mod;
+		BSIM3v32group* grp;
+		for (mod = (BSIM3v32model*)inModel; mod != NULL; mod = BSIM3v32nextModel(mod))
+		for (grp=mod->groupHead; grp!=NULL; grp=grp->next)
+		{
+			int n = grp->EvalCount - grp->SimdCount*NSIMD;
+			if(search>=0 && search < n)
+			{
+				if(DEBUG) printf("Call seq index %d of %d\n", search + grp->SimdCount*NSIMD,grp->InstCount);
+				int local_error = BSIM3v32LoadSeq(grp->InstArray[search + grp->SimdCount*NSIMD], ckt, 0);
+				if(DEBUG) printf("Call ended\n");
+				if(local_error) error=local_error;
+			}
+			search -= n;
+		}
+	}
+	if(DEBUG) printf("Now write the matrix\n");
+	/* Write in matrix sequentially */
+	BSIM3v32LoadRhsMat(inModel, ckt);
+	
+	return error;
+}
+#endif
+
+#endif
+
+
--- a/src/spicelib/devices/bsim3v32/b3v32ldseq.c
+++ b/src/spicelib/devices/bsim3v32/b3v32ldseq.c
--- a/src/spicelib/devices/bsim3v32/b3v32ldseq_simd4.c
+++ b/src/spicelib/devices/bsim3v32/b3v32ldseq_simd4.c
--- a/src/spicelib/devices/bsim3v32/b3v32ldseq_simd4_omp.c
+++ b/src/spicelib/devices/bsim3v32/b3v32ldseq_simd4_omp.c
--- a/src/spicelib/devices/bsim3v32/b3v32ldseq_simd8.c
+++ b/src/spicelib/devices/bsim3v32/b3v32ldseq_simd8.c
--- a/src/spicelib/devices/bsim3v32/b3v32ldsimd.c
+++ b/src/spicelib/devices/bsim3v32/b3v32ldsimd.c
@ -0,0 +1,347 @@
+/*******************************************************************************
+ * Copyright 2020 Florian Ballenegger, Anamosic Ballenegger Design
+ *******************************************************************************
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ ******************************************************************************/
+
+#include <math.h>
+#include <x86intrin.h>
+#include <signal.h>
+
+#include "ngspice/ngspice.h"
+#include "ngspice/cktdefs.h"
+#include "bsim3v32def.h"
+#include "b3v32acm.h"
+#include "ngspice/trandefs.h"
+#include "ngspice/const.h"
+#include "ngspice/sperror.h"
+#include "ngspice/devdefs.h"
+#include "ngspice/suffix.h"
+
+#define MAX_EXP 5.834617425e14
+#define MIN_EXP 1.713908431e-15
+#define EXP_THRESHOLD 34.0
+#define EPSOX 3.453133e-11
+#define EPSSI 1.03594e-10
+#define Charge_q 1.60219e-19
+#define DELTA_1 0.02
+#define DELTA_2 0.02
+#define DELTA_3 0.02
+#define DELTA_4 0.02
+
+#define USEX86INTRINSICS 1
+
+/*
+ * GCC vector-extension types holding NSIMD lanes: Vec4d for values, Vec4m for
+ * integer lanes (masks and state indexes). Both are sized and aligned as
+ * NSIMD doubles.
+ * NOTE(review): Vec4m only has NSIMD lanes when sizeof(long int) ==
+ * sizeof(double). That holds on LP64 targets; on LLP64 targets such as
+ * 64-bit Windows `long int` is 32 bits wide - confirm for the Windows build.
+ */
+typedef double Vec4d __attribute__ ((vector_size (sizeof(double)*NSIMD), aligned (sizeof(double)*NSIMD)));
+typedef long int Vec4m __attribute__ ((vector_size (sizeof(double)*NSIMD), aligned (sizeof(double)*NSIMD)));
+
+
+#define SIMDANY(err) (err!=0)
+#define SIMDIFYCMD(cmd) /* empty */
+#define SIMDifySaveScope(sc) /* empty */
+
+#define vec4_pow0p7(x,p) vec4_mypow(x,p)
+#define vec4_powMJ(x,p) vec4_mypow(x,p)
+#define vec4_powMJSW(x,p) vec4_mypow(x,p)
+#define vec4_powMJSWG(x,p) vec4_mypow(x,p)
+
+#if USEX86INTRINSICS==1
+/* libmvec prototypes */
+/* Caution: those libmvec functions are not as precise as std libm */
+__m256d _ZGVdN4v_exp(__m256d x);
+__m256d _ZGVdN4v_log(__m256d x);
+
+#define vec4_MAX(a,b) _mm256_max_pd(a,b)
+#define vec4_exp(a) _ZGVdN4v_exp(a) 
+#define vec4_log(a) _ZGVdN4v_log(a)
+#define vec4_sqrt(a) _mm256_sqrt_pd(a)
+
+
+/*
+ * Lane-wise select built on _mm256_blendv_pd(): the mask is reinterpreted as
+ * a double vector and used as the selector between fa and tr.
+ */
+static inline Vec4d vec4_blend(Vec4d fa, Vec4d tr, Vec4m mask)
+{
+	Vec4d selector = (Vec4d) mask;
+
+	return _mm256_blendv_pd(fa, tr, selector);
+}
+
+/*
+ * Lane-wise absolute value.
+ * The sign bit of every lane is cleared with a bitwise and-not against -0.0.
+ * Unlike a select on `x<0`, this also maps -0.0 to +0.0, as fabs() does.
+ */
+static inline Vec4d vec4_fabs(Vec4d x)
+{
+	return _mm256_andnot_pd(_mm256_set1_pd(-0.0), x);
+}
+
+#else
+/* vector-libm prototypes */
+Vec4d vec4_exp_vectorlibm(Vec4d x); /* defined in vec4_exp.c */
+Vec4d vec4_log_vectorlibm(Vec4d x); /* defined in vec4_log.c */
+#define vec4_exp(a) vec4_exp_vectorlibm(a)
+#define vec4_log(a) vec4_log_vectorlibm(a)
+/*
+ * Lane-wise select: returns tr[i] where mask[i] is non-zero, fa[i] otherwise.
+ * Defined before vec4_MAX, which calls it.
+ */
+static inline Vec4d vec4_blend(Vec4d fa, Vec4d tr, Vec4m mask)
+{
+	/* hope for good vectorization by the compiler ! */
+	Vec4d res;
+	#pragma omp simd
+	for(int i=0;i<4;i++)
+	{
+		res[i] = mask[i] ? tr[i] : fa[i];
+	}
+	return res;
+}
+/* Lane-wise maximum: picks b where a<b, a otherwise. */
+static inline Vec4d vec4_MAX(Vec4d a, Vec4d b)
+{
+	return vec4_blend(a,b,a<b);
+}
+/* Lane-wise absolute value for the build without x86 intrinsics. */
+static inline Vec4d vec4_fabs(Vec4d x)
+{
+	/* written as a plain loop so that the compiler can vectorize it */
+	Vec4d out;
+	int lane;
+	#pragma omp simd
+	for(lane=0;lane<4;lane++)
+	{
+		if(x[lane] < 0)
+			out[lane] = -x[lane];
+		else
+			out[lane] = x[lane];
+	}
+	return out;
+}
+/* Lane-wise square root for the build without x86 intrinsics. */
+static inline Vec4d vec4_sqrt(Vec4d x)
+{
+	/* written as a plain loop so that the compiler can vectorize it */
+	Vec4d out;
+	int lane;
+	#pragma omp simd
+	for(lane=0;lane<4;lane++)
+	{
+		out[lane] = sqrt(x[lane]);
+	}
+	return out;
+}
+#endif
+
+/* Lane-wise power with a scalar exponent, computed as exp(p * log(x)). */
+static inline Vec4d vec4_mypow(Vec4d x, double p)
+{
+	Vec4d scaled_log = vec4_log(x)*p;
+
+	return vec4_exp(scaled_log);
+}
+
+
+/* some debug utils functions */
+/* Debug helper: prints msg, name and the first four lanes of vecd on one line. */
+void vec4_printd(const char* msg, const char* name, Vec4d vecd)
+{
+	printf("%s %s %g %g %g %g\n",
+	       msg, name,
+	       vecd[0], vecd[1], vecd[2], vecd[3]);
+}
+
+/* Debug helper: prints msg, name and the first four lanes of vecm on one line. */
+void vec4_printm(const char* msg, const char* name, Vec4m vecm)
+{
+	printf("%s %s %ld %ld %ld %ld\n",
+	       msg, name,
+	       vecm[0], vecm[1], vecm[2], vecm[3]);
+}
+
+/*
+ * Debug helper: for every ordered pair of distinct lanes that hold the same
+ * index, prints the four indexes and raises SIGINT.
+ */
+void vec4_CheckCollisions(Vec4m stateindexes, const char* msg)
+{
+	int a, b;
+
+	for(a=0;a<NSIMD;a++)
+	{
+		for(b=0;b<NSIMD;b++)
+		{
+			if(a==b || stateindexes[a]!=stateindexes[b])
+				continue;
+			printf("%s, collisions %ld %ld %ld %ld!\n",msg,stateindexes[0],stateindexes[1],stateindexes[2],stateindexes[3]);
+			raise(SIGINT);
+		}
+	}
+}
+
+/* useful vectorized functions */
+/* Packs the first four entries of row idx of data into a vector. */
+static inline Vec4d SIMDLOADDATA(int idx, double data[7][NSIMD])
+{
+	const double *row = data[idx];
+	Vec4d packed = {row[0], row[1], row[2], row[3]};
+
+	return packed;
+}
+
+/* Gather: lane i of the result is cktstate[stateindexes[i]]. */
+static inline Vec4d vec4_BSIM3v32_StateAccess(double* cktstate, Vec4m stateindexes)
+{
+	Vec4d gathered;
+
+	gathered[0] = cktstate[stateindexes[0]];
+	gathered[1] = cktstate[stateindexes[1]];
+	gathered[2] = cktstate[stateindexes[2]];
+	gathered[3] = cktstate[stateindexes[3]];
+	return gathered;
+}
+
+
+/* Scatter: writes values[i] to cktstate[stateindexes[i]], lanes in ascending order. */
+static inline void vec4_BSIM3v32_StateStore(double* cktstate, Vec4m stateindexes, Vec4d values)
+{
+	int lane;
+
+	if(0) vec4_CheckCollisions(stateindexes,"SateStore"); /* disabled debug check */
+	for(lane=0; lane<NSIMD; ++lane)
+		cktstate[stateindexes[lane]] = values[lane];
+}
+
+/* Scatter-add: adds values[i] to cktstate[stateindexes[i]], lanes in ascending order. */
+static inline void vec4_BSIM3v32_StateAdd(double* cktstate, Vec4m stateindexes, Vec4d values)
+{
+	int lane;
+
+	if(0) vec4_CheckCollisions(stateindexes,"StateAdd"); /* disabled debug check */
+	for(lane=0; lane<NSIMD; ++lane)
+		cktstate[stateindexes[lane]] += values[lane];
+}
+
+/* Scatter-subtract: subtracts values[i] from cktstate[stateindexes[i]], lanes in ascending order. */
+static inline void vec4_BSIM3v32_StateSub(double* cktstate, Vec4m stateindexes, Vec4d values)
+{
+	int lane;
+
+	if(0) vec4_CheckCollisions(stateindexes,"StateSub"); /* disabled debug check */
+	for(lane=0; lane<NSIMD; ++lane)
+		cktstate[stateindexes[lane]] -= values[lane];
+}
+
+/* Scalar reference version: applies libm exp() to each of the four lanes. */
+static inline Vec4d vec4_exp_seq(Vec4d val)
+{
+	Vec4d out;
+
+	out[0] = exp(val[0]);
+	out[1] = exp(val[1]);
+	out[2] = exp(val[2]);
+	out[3] = exp(val[3]);
+	return out;
+}
+/* Scalar reference version: applies libm log() to each of the four lanes. */
+static inline Vec4d vec4_log_seq(Vec4d val)
+{
+	Vec4d out;
+
+	out[0] = log(val[0]);
+	out[1] = log(val[1]);
+	out[2] = log(val[2]);
+	out[3] = log(val[3]);
+	return out;
+}
+/* Scalar reference version: applies libm sqrt() to each of the four lanes. */
+static inline Vec4d vec4_sqrt_seq(Vec4d val)
+{
+	Vec4d out;
+
+	out[0] = sqrt(val[0]);
+	out[1] = sqrt(val[1]);
+	out[2] = sqrt(val[2]);
+	out[3] = sqrt(val[3]);
+	return out;
+}
+/* Scalar reference version: applies the MAX() macro to each pair of lanes. */
+static inline Vec4d vec4_MAX_seq(Vec4d a, Vec4d b)
+{
+	Vec4d out;
+
+	out[0] = MAX(a[0],b[0]);
+	out[1] = MAX(a[1],b[1]);
+	out[2] = MAX(a[2],b[2]);
+	out[3] = MAX(a[3],b[3]);
+	return out;
+}
+
+/*
+ * Calls BSIM3v32_ACM_saturationCurrents() for each of the NSIMD instances and
+ * stores the two scalar results in the matching lane of the output vectors.
+ * Stops at the first non-zero status (after storing that lane) and returns
+ * it; returns 0 when every call succeeded.
+ */
+static inline int vec4_BSIM3v32_ACM_saturationCurrents
+(
+	BSIM3v32model *model,
+	BSIM3v32instance **heres,
+        Vec4d *DrainSatCurrent,
+        Vec4d *SourceSatCurrent
+)
+{
+	double drainSat, sourceSat;
+	int lane;
+
+	for(lane=0; lane<NSIMD; lane++)
+	{
+		int status = BSIM3v32_ACM_saturationCurrents(model, heres[lane],
+		                                             &drainSat, &sourceSat);
+
+		(*DrainSatCurrent)[lane] = drainSat;
+		(*SourceSatCurrent)[lane] = sourceSat;
+		if(status)
+			return status;
+	}
+	return 0;
+}
+
+/*
+ * Calls BSIM3v32_ACM_junctionCapacitances() for each of the NSIMD instances
+ * and stores the six scalar results in the matching lane of the output
+ * vectors. Stops at the first non-zero status (after storing that lane) and
+ * returns it; returns 0 when every call succeeded.
+ */
+static inline int vec4_BSIM3v32_ACM_junctionCapacitances(
+	BSIM3v32model *model,
+	BSIM3v32instance **heres,
+	Vec4d *areaDrainBulkCapacitance,
+	Vec4d *periDrainBulkCapacitance,
+	Vec4d *gateDrainBulkCapacitance,
+	Vec4d *areaSourceBulkCapacitance,
+	Vec4d *periSourceBulkCapacitance,
+	Vec4d *gateSourceBulkCapacitance
+
+)
+{
+	/* scalar scratch: drain area/perimeter/gate, then source area/perimeter/gate */
+	double cap[6];
+	int lane;
+
+	for(lane=0; lane<NSIMD; lane++)
+	{
+		int status = BSIM3v32_ACM_junctionCapacitances(model, heres[lane],
+			&cap[0], &cap[1], &cap[2],
+			&cap[3], &cap[4], &cap[5]);
+
+		(*areaDrainBulkCapacitance)[lane]=cap[0];
+		(*periDrainBulkCapacitance)[lane]=cap[1];
+		(*gateDrainBulkCapacitance)[lane]=cap[2];
+		(*areaSourceBulkCapacitance)[lane]=cap[3];
+		(*periSourceBulkCapacitance)[lane]=cap[4];
+		(*gateSourceBulkCapacitance)[lane]=cap[5];
+		if(status)
+			return status;
+	}
+	return 0;
+}
+
+/*
+ * Calls NIintegrate() once per lane of chargestate and returns the first
+ * non-zero status, or 0 when every call succeeded.
+ * geq, ceq and zero stay scalar; they are passed unchanged to every call.
+ */
+static inline int vec4_NIintegrate(CKTcircuit* ckt, double* geq, double *ceq, double zero, Vec4m chargestate)
+{
+	int lane;
+
+	if (0) vec4_CheckCollisions(chargestate, "NIIntegrate"); /* disabled debug check */
+	for(lane=0; lane<NSIMD; lane++)
+	{
+		int status = NIintegrate(ckt,geq,ceq,zero,chargestate[lane]);
+
+		if(status)
+			return status;
+	}
+	return 0;
+}
+
+/* Number of non-zero lanes among the first four lanes of mask. */
+static inline int vec4_SIMDCOUNT(Vec4m mask) {
+	int set = 0;
+
+	if (mask[0]) set++;
+	if (mask[1]) set++;
+	if (mask[2]) set++;
+	if (mask[3]) set++;
+	return set;
+}
+
+/* Broadcasts val to four lanes. */
+static inline Vec4d vec4_SIMDTOVECTOR(double val)
+{
+	Vec4d splat = {val, val, val, val};
+
+	return splat;
+}
+/* Broadcasts the integer val to four mask lanes. */
+static inline Vec4m vec4_SIMDTOVECTORMASK(int val)
+{
+	Vec4m splat = {val, val, val, val};
+
+	return splat;
+}
+
+
+/*
+ * Evaluates NSIMD instances at once.
+ * heres: array of NSIMD instances; the model and pParam are taken from
+ *        heres[0].
+ * data:  only present in builds without USE_OMP.
+ * The function body is the generated file selected below by NSIMD and
+ * USE_OMP, included textually. This function returns OK after the included
+ * code; any earlier return is inside that file and not visible here.
+ * NOTE(review): the helper types above are written for four lanes, and
+ * b3v32ldseq_simd8_omp.c is not part of the visible change list - confirm
+ * that the NSIMD==8 branches build.
+ */
+int BSIM3v32LoadSIMD(BSIM3v32instance **heres, CKTcircuit *ckt
+#ifndef USE_OMP
+	, double data[7][NSIMD]
+#endif
+)
+{
+    BSIM3v32model *model = BSIM3v32modPtr(heres[0]);
+    struct bsim3v32SizeDependParam *pParam;
+    pParam = heres[0]->pParam; /* same for all NSIMD instances */
+
+#if NSIMD==4
+#ifdef USE_OMP
+    #pragma message "Use OMP SIMD4 version"
+    #include "b3v32ldseq_simd4_omp.c"
+#else
+    #include "b3v32ldseq_simd4.c"
+#endif
+#elif NSIMD==8
+#ifdef USE_OMP
+    #pragma message "Use OMP SIMD8 version"
+    #include "b3v32ldseq_simd8_omp.c"
+#else
+    #include "b3v32ldseq_simd8.c"
+#endif
+#else
+#error Unsupported value for NSIMD
+#endif
+	
+    return(OK);
+	
+}
+
--- a/src/spicelib/devices/bsim3v32/b3v32ldsimd8.c
+++ b/src/spicelib/devices/bsim3v32/b3v32ldsimd8.c
@ -0,0 +1,274 @@
+/*******************************************************************************
+ * Copyright 2020 Florian Ballenegger, Anamosic Ballenegger Design
+ *******************************************************************************
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ ******************************************************************************/
+/* draft version, not tested, not even compiled */
+
+#include <math.h>
+#include <x86intrin.h>
+#include <signal.h>
+
+#include "ngspice/ngspice.h"
+#include "ngspice/cktdefs.h"
+#include "bsim3v32def.h"
+#include "b3v32acm.h"
+#include "ngspice/trandefs.h"
+#include "ngspice/const.h"
+#include "ngspice/sperror.h"
+#include "ngspice/devdefs.h"
+#include "ngspice/suffix.h"
+
+#define MAX_EXP 5.834617425e14
+#define MIN_EXP 1.713908431e-15
+#define EXP_THRESHOLD 34.0
+#define EPSOX 3.453133e-11
+#define EPSSI 1.03594e-10
+#define Charge_q 1.60219e-19
+#define DELTA_1 0.02
+#define DELTA_2 0.02
+#define DELTA_3 0.02
+#define DELTA_4 0.02
+
+#define NSIMD 8
+#define USEX86INTRINSICS 1
+
+typedef double Vec8d __attribute__ ((vector_size (sizeof(double)*NSIMD), aligned (sizeof(double)*NSIMD)));
+typedef long int Vec8m __attribute__ ((vector_size (sizeof(double)*NSIMD), aligned (sizeof(double)*NSIMD)));
+
+
+#define SIMDANY(err) (err!=0)
+#define SIMDIFYCMD(cmd) /* empty */
+#define SIMDifySaveScope(sc) /* empty */
+/* broadcast val to all 8 lanes (a 4-entry initializer would zero lanes 4..7) */
+#define SIMDVECTORMACRO(val) ((Vec8d) {(val),(val),(val),(val),(val),(val),(val),(val)})
+#define SIMDVECTOR(val) vec8_SIMDTOVECTOR(val)
+#define SIMDVECTORMASK(val) vec8_SIMDTOVECTORMASK(val)
+
+
+#ifdef USEX86INTRINSICS
+#define vec8_MAX(a,b) _mm512_max_pd(a,b)
+#define vec8_exp(a) _mm512_exp_pd(a)
+#define vec8_log(a) _mm512_log_pd(a)
+#define vec8_sqrt(a) _mm512_sqrt_pd(a)
+
+/*
+ * Lane-wise select: returns tr[i] where mask[i] is non-zero, fa[i] otherwise.
+ *
+ * mask follows the gcc vector extension convention (false = 0, true = -1),
+ * but any non-zero lane value, including 1, is accepted as true. The lanes
+ * are first reduced to an AVX-512 predicate register, which then drives the
+ * masked blend.
+ */
+static inline Vec8d vec8_blend(Vec8d fa, Vec8d tr, Vec8m mask)
+{
+	const __m512i bits = (__m512i) mask;
+	/* one predicate bit per lane, set when (bits & bits) != 0 in that lane */
+	const __mmask8 take_tr = _mm512_test_epi64_mask(bits, bits);
+
+	return (Vec8d) _mm512_mask_blend_pd(take_tr, fa, tr);
+}
+/* Lane-wise absolute value via _mm512_abs_pd(). */
+static inline Vec8d vec8_fabs(Vec8d x)
+{
+	Vec8d magnitude = (Vec8d) _mm512_abs_pd(x);
+
+	return magnitude;
+}
+#else
+#error X86 AVX512 instrinsics required for using SIMD8 version
+#endif
+
+
+/* some debug utils functions */
+/* Debug helper: prints msg, name and all NSIMD lanes of vecd on one line. */
+void vec8_printd(const char* msg, const char* name, Vec8d vecd)
+{
+	printf("%s %s",msg,name);
+	for(int i=0;i<NSIMD;i++)
+		printf(" %g",vecd[i]);
+	printf("\n");
+}
+
+/* Debug helper: prints msg, name and all NSIMD lanes of vecm on one line. */
+void vec8_printm(const char* msg, const char* name, Vec8m vecm)
+{
+	printf("%s %s",msg,name);
+	for(int i=0;i<NSIMD;i++)
+		printf(" %ld",vecm[i]);
+	printf("\n");
+}
+
+/*
+ * Debug helper: for every ordered pair of distinct lanes that hold the same
+ * index, prints all NSIMD indexes and raises SIGINT.
+ */
+void vec8_CheckCollisions(Vec8m stateindexes, const char* msg)
+{
+	for(int i=0;i<NSIMD;i++)
+	for(int j=0;j<NSIMD;j++)
+	if(i!=j)
+	if(stateindexes[i]==stateindexes[j])
+	{
+		printf("%s, collisions",msg);
+		for(int k=0;k<NSIMD;k++)
+			printf(" %ld",stateindexes[k]);
+		printf("!\n");
+		raise(SIGINT);
+	}
+}
+
+/* useful vectorized functions */
+/* Packs the eight entries of row idx of data into a vector. */
+static inline Vec8d SIMDLOADDATA(int idx, double data[7][NSIMD])
+{
+	const double *row = data[idx];
+	Vec8d packed = {row[0], row[1], row[2], row[3],
+	                row[4], row[5], row[6], row[7]};
+
+	return packed;
+}
+
+/* Gather: lane i of the result is cktstate[stateindexes[i]]. */
+static inline Vec8d vec8_BSIM3v32_StateAccess(double* cktstate, Vec8m stateindexes)
+{
+	Vec8d gathered;
+
+	gathered[0] = cktstate[stateindexes[0]];
+	gathered[1] = cktstate[stateindexes[1]];
+	gathered[2] = cktstate[stateindexes[2]];
+	gathered[3] = cktstate[stateindexes[3]];
+	gathered[4] = cktstate[stateindexes[4]];
+	gathered[5] = cktstate[stateindexes[5]];
+	gathered[6] = cktstate[stateindexes[6]];
+	gathered[7] = cktstate[stateindexes[7]];
+	return gathered;
+}
+
+
+/* Scatter: writes values[i] to cktstate[stateindexes[i]], lanes in ascending order. */
+static inline void vec8_BSIM3v32_StateStore(double* cktstate, Vec8m stateindexes, Vec8d values)
+{
+	int lane;
+
+	if(0) vec8_CheckCollisions(stateindexes,"SateStore"); /* disabled debug check */
+	for(lane=0; lane<NSIMD; ++lane)
+		cktstate[stateindexes[lane]] = values[lane];
+}
+
+/* Scatter-add: adds values[i] to cktstate[stateindexes[i]], lanes in ascending order. */
+static inline void vec8_BSIM3v32_StateAdd(double* cktstate, Vec8m stateindexes, Vec8d values)
+{
+	int lane;
+
+	if(0) vec8_CheckCollisions(stateindexes,"StateAdd"); /* disabled debug check */
+	for(lane=0; lane<NSIMD; ++lane)
+		cktstate[stateindexes[lane]] += values[lane];
+}
+
+/* Scatter-subtract: subtracts values[i] from cktstate[stateindexes[i]], lanes in ascending order. */
+static inline void vec8_BSIM3v32_StateSub(double* cktstate, Vec8m stateindexes, Vec8d values)
+{
+	int lane;
+
+	if(0) vec8_CheckCollisions(stateindexes,"StateSub"); /* disabled debug check */
+	for(lane=0; lane<NSIMD; ++lane)
+		cktstate[stateindexes[lane]] -= values[lane];
+}
+
+
+/*
+ * Calls BSIM3v32_ACM_saturationCurrents() for each of the NSIMD instances and
+ * stores the two scalar results in the matching lane of the output vectors.
+ * Stops at the first non-zero status (after storing that lane) and returns
+ * it; returns 0 when every call succeeded.
+ *
+ * Scalar temporaries are used for the outputs instead of the address of a
+ * vector element, which not every compiler accepts; this also matches the
+ * four-lane version of this helper.
+ */
+static inline int vec8_BSIM3v32_ACM_saturationCurrents
+(
+	BSIM3v32model *model,
+	BSIM3v32instance **heres,
+        Vec8d *DrainSatCurrent,
+        Vec8d *SourceSatCurrent
+)
+{
+	int	error = 0;
+	double dsat = 0.0, ssat = 0.0;
+	for(int idx=0;idx<NSIMD;idx++)
+	{
+		error = BSIM3v32_ACM_saturationCurrents(
+		      model, heres[idx],
+		      &dsat,
+		      &ssat
+		);
+		(*DrainSatCurrent)[idx] = dsat;
+		(*SourceSatCurrent)[idx] = ssat;
+		if(error) return error;
+	}
+	return error;
+}
+
+/*
+ * Calls BSIM3v32_ACM_junctionCapacitances() for each of the NSIMD instances
+ * and stores the six scalar results in the matching lane of the output
+ * vectors. Stops at the first non-zero status (after storing that lane) and
+ * returns it; returns 0 when every call succeeded.
+ *
+ * Scalar temporaries are used for the outputs instead of the address of a
+ * vector element, which not every compiler accepts; this also matches the
+ * four-lane version of this helper.
+ */
+static inline int vec8_BSIM3v32_ACM_junctionCapacitances(
+	BSIM3v32model *model,
+	BSIM3v32instance **heres,
+	Vec8d *areaDrainBulkCapacitance,
+	Vec8d *periDrainBulkCapacitance,
+	Vec8d *gateDrainBulkCapacitance,
+	Vec8d *areaSourceBulkCapacitance,
+	Vec8d *periSourceBulkCapacitance,
+	Vec8d *gateSourceBulkCapacitance
+
+)
+{
+	int	error = 0;
+	double areaDB = 0.0, periDB = 0.0, gateDB = 0.0;
+	double areaSB = 0.0, periSB = 0.0, gateSB = 0.0;
+
+	for(int idx=0;idx<NSIMD;idx++)
+	{
+		error = BSIM3v32_ACM_junctionCapacitances(
+		      model, heres[idx],
+		      &areaDB,
+		      &periDB,
+		      &gateDB,
+		      &areaSB,
+		      &periSB,
+		      &gateSB
+		);
+		(*areaDrainBulkCapacitance)[idx]=areaDB;
+		(*periDrainBulkCapacitance)[idx]=periDB;
+		(*gateDrainBulkCapacitance)[idx]=gateDB;
+		(*areaSourceBulkCapacitance)[idx]=areaSB;
+		(*periSourceBulkCapacitance)[idx]=periSB;
+		(*gateSourceBulkCapacitance)[idx]=gateSB;
+		if(error) return error;
+	}
+	return error;
+}
+
+/*
+ * Calls NIintegrate() once per lane of chargestate and returns the first
+ * non-zero status, or 0 when every call succeeded.
+ * geq, ceq and zero stay scalar; they are passed unchanged to every call.
+ */
+static inline int vec8_NIintegrate(CKTcircuit* ckt, double* geq, double *ceq, double zero, Vec8m chargestate)
+{
+	int lane;
+
+	if (0) vec8_CheckCollisions(chargestate, "NIIntegrate"); /* disabled debug check */
+	for(lane=0; lane<NSIMD; lane++)
+	{
+		int status = NIintegrate(ckt,geq,ceq,zero,chargestate[lane]);
+
+		if(status)
+			return status;
+	}
+	return 0;
+}
+
+/* Number of non-zero lanes among the eight lanes of mask. */
+static inline int vec8_SIMDCOUNT(Vec8m mask) {
+	int set = 0;
+	int lane;
+
+	for (lane = 0; lane < 8; lane++) {
+		if (mask[lane])
+			set++;
+	}
+	return set;
+}
+
+/* Broadcasts val to eight lanes. */
+static inline Vec8d vec8_SIMDTOVECTOR(double val)
+{
+	Vec8d splat = {val, val, val, val, val, val, val, val};
+
+	return splat;
+}
+/* Broadcasts the integer val to eight mask lanes. */
+static inline Vec8m vec8_SIMDTOVECTORMASK(int val)
+{
+	Vec8m splat = {val, val, val, val, val, val, val, val};
+
+	return splat;
+}
+
+
+/*
+ * Eight-lane variant of the SIMD load routine (draft, see the note at the
+ * top of this file).
+ * heres: array of NSIMD instances; the model and pParam are taken from
+ *        heres[0].
+ * data:  7 x NSIMD array of doubles, used by the included code.
+ * The function body is the generated file b3v32ldseq_simd8.c, included
+ * textually. This function returns OK after the included code; any earlier
+ * return is inside that file and not visible here.
+ */
+int BSIM3v32LoadSIMD8(BSIM3v32instance **heres, CKTcircuit *ckt, double data[7][NSIMD]) {
+    BSIM3v32model *model = BSIM3v32modPtr(heres[0]);
+    /* disabled debug trace */
+    if(0) printf("BSIM3v32LoadSIMD %s model %s\n", heres[0]->gen.GENname, model->gen.GENmodName);
+    struct bsim3v32SizeDependParam *pParam;
+    pParam = heres[0]->pParam; /* same for all NSIMD instances */
+
+#if 1    
+    #include "b3v32ldseq_simd8.c"
+#endif
+    
+    return(OK);
+	
+}
+
--- a/src/spicelib/devices/bsim3v32/vec4_exp.c
+++ b/src/spicelib/devices/bsim3v32/vec4_exp.c
@ -0,0 +1,654 @@
+/* This program implements a show-case vector (vectorizable) double
+   precision exponential with a 4 ulp error bound.
+
+   Author: Christoph Lauter,
+
+           Sorbonne Université - LIP6 - PEQUAN team.
+
+   This program uses code generated using Sollya and Metalibm; see the
+   licences and exception texts below.
+
+   This program is
+
+   Copyright 2014-2018 Christoph Lauter Sorbonne Université
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+   3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+   OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* 
+
+    This code was generated using non-trivial code generation commands
+    of the Metalibm software program.
+    
+    Before using, modifying and/or integrating this code into other
+    software, review the copyright and license status of this
+    generated code. In particular, see the exception below.
+
+    This generated program is partly or entirely based on a program
+    generated using non-trivial code generation commands of the Sollya
+    software program. See the copyright notice and exception text
+    referring to that Sollya-generated part of this program generated
+    with Metalibm below.
+
+    Metalibm is
+ 
+    Copyright 2008-2013 by 
+
+    Laboratoire de l'Informatique du Parallélisme, 
+    UMR CNRS - ENS Lyon - UCB Lyon 1 - INRIA 5668
+
+    and by
+
+    Laboratoire d'Informatique de Paris 6, equipe PEQUAN,
+    UPMC Universite Paris 06 - CNRS - UMR 7606 - LIP6, Paris, France.
+
+    Contributors: Christoph Quirin Lauter 
+                  (UPMC LIP6 PEQUAN formerly LIP/ENS Lyon) 
+                  christoph.lauter@lip6.fr
+
+		  and
+
+		  Olga Kupriianova 
+		  (UPMC LIP6 PEQUAN)
+		  olga.kupriianova@lip6.fr
+
+    Metalibm was formerly developed by the Arenaire project at Ecole
+    Normale Superieure de Lyon and is now developed by Equipe PEQUAN
+    at Universite Pierre et Marie Curie Paris 6.
+
+    The Metalibm software program is free software; you can
+    redistribute it and/or modify it under the terms of the GNU Lesser
+    General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option)
+    any later version.
+
+    Metalibm is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with the Metalibm program; if not, write to the Free
+    Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+    02111-1307, USA.
+
+    This generated program is distributed WITHOUT ANY WARRANTY; without
+    even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+    PARTICULAR PURPOSE.
+    
+    As a special exception, you may create a larger work that contains
+    part or all of this software generated using Metalibm and
+    distribute that work under terms of your choice, so long as that
+    work isn't itself a numerical code generator using the skeleton of
+    this code or a modified version thereof as a code skeleton.
+    Alternatively, if you modify or redistribute this generated code
+    itself, or its skeleton, you may (at your option) remove this
+    special exception, which will cause this generated code and its
+    skeleton and the resulting Metalibm output files to be licensed
+    under the General Public licence (version 2) without this special
+    exception.
+    
+    This special exception was added by the Metalibm copyright holders 
+    on November 20th 2013.
+    
+*/
+
+
+
+/*
+    This code was generated using non-trivial code generation commands of
+    the Sollya software program.
+    
+    Before using, modifying and/or integrating this code into other
+    software, review the copyright and license status of this generated
+    code. In particular, see the exception below.
+    
+    Sollya is
+    
+    Copyright 2006-2013 by
+    
+    Laboratoire de l'Informatique du Parallelisme, UMR CNRS - ENS Lyon -
+    UCB Lyon 1 - INRIA 5668,
+    
+    Laboratoire d'Informatique de Paris 6, equipe PEQUAN, UPMC Universite
+    Paris 06 - CNRS - UMR 7606 - LIP6, Paris, France
+    
+    and by
+    
+    Centre de recherche INRIA Sophia-Antipolis Mediterranee, equipe APICS,
+    Sophia Antipolis, France.
+    
+    Contributors Ch. Lauter, S. Chevillard, M. Joldes
+    
+    christoph.lauter@ens-lyon.org
+    sylvain.chevillard@ens-lyon.org
+    joldes@lass.fr
+    
+    The Sollya software is a computer program whose purpose is to provide
+    an environment for safe floating-point code development. It is
+    particularily targeted to the automatized implementation of
+    mathematical floating-point libraries (libm). Amongst other features,
+    it offers a certified infinity norm, an automatic polynomial
+    implementer and a fast Remez algorithm.
+    
+    The Sollya software is governed by the CeCILL-C license under French
+    law and abiding by the rules of distribution of free software.  You
+    can use, modify and/ or redistribute the software under the terms of
+    the CeCILL-C license as circulated by CEA, CNRS and INRIA at the
+    following URL "http://www.cecill.info".
+    
+    As a counterpart to the access to the source code and rights to copy,
+    modify and redistribute granted by the license, users are provided
+    only with a limited warranty and the software's author, the holder of
+    the economic rights, and the successive licensors have only limited
+    liability.
+    
+    In this respect, the user's attention is drawn to the risks associated
+    with loading, using, modifying and/or developing or reproducing the
+    software by the user in light of its specific status of free software,
+    that may mean that it is complicated to manipulate, and that also
+    therefore means that it is reserved for developers and experienced
+    professionals having in-depth computer knowledge. Users are therefore
+    encouraged to load and test the software's suitability as regards
+    their requirements in conditions enabling the security of their
+    systems and/or data to be ensured and, more generally, to use and
+    operate it in the same conditions as regards security.
+    
+    The fact that you are presently reading this means that you have had
+    knowledge of the CeCILL-C license and that you accept its terms.
+    
+    The Sollya program is distributed WITHOUT ANY WARRANTY; without even
+    the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+    PURPOSE.
+    
+    This generated program is distributed WITHOUT ANY WARRANTY; without
+    even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+    PARTICULAR PURPOSE.
+    
+    As a special exception, you may create a larger work that contains
+    part or all of this software generated using Sollya and distribute
+    that work under terms of your choice, so long as that work isn't
+    itself a numerical code generator using the skeleton of this code or a
+    modified version thereof as a code skeleton.  Alternatively, if you
+    modify or redistribute this generated code itself, or its skeleton,
+    you may (at your option) remove this special exception, which will
+    cause this generated code and its skeleton and the resulting Sollya
+    output files to be licensed under the CeCILL-C licence without this
+    special exception.
+    
+    This special exception was added by the Sollya copyright holders in
+    version 4.1 of Sollya.
+    
+*/
+
+#include <stdint.h>
+
+/* Number of double lanes in a Vec4d */
+#define VECTOR_LENGTH 4
+/* Qualifiers spelled as macros; presumably so they can be redefined
+   for compilers that lack them */
+#define RESTRICT restrict
+#define CONST const
+
+/* Compiler vector-extension type holding VECTOR_LENGTH doubles, with
+   its alignment set to its own size in bytes */
+typedef double Vec4d __attribute__ ((vector_size (sizeof(double)*VECTOR_LENGTH),
+ aligned (sizeof(double)*VECTOR_LENGTH)));
+
+/* Two caster types: unions used to read or build the bit pattern of a
+   double through a 64-bit integer member.  dblcast exposes the bits as
+   an unsigned integer, db_number as a signed one. */
+typedef union _dblcast {
+  double   d;
+  uint64_t i;
+} dblcast;
+
+typedef union {
+  int64_t l;
+  double d;
+} db_number;
+
+/* Macro implementations of some double-double operations.
+
+   A double-double value is an unevaluated sum hi + lo of two doubles.
+
+   NOTE(review): each macro expands to a bare { ... } block rather than
+   do { ... } while (0), so it must be used as a full statement; the
+   output arguments of Mul12 and Mul22 are dereferenced without
+   parentheses; Mul122 and Mul22 evaluate their value arguments more
+   than once; and the temporaries declared inside (for example _t1, mh,
+   c, up, u1) shadow caller variables of the same name. */
+
+/* Add12: s = a + b rounded, r = b - (s - a).
+   s and r are lvalues, a and b are values.
+   NOTE(review): this three-operation form is normally only error-free
+   when |a| >= |b| - confirm that bound at every call site. */
+#define Add12(s, r, a, b)                       \
+  {double _z, _a=a, _b=b;                       \
+    s = _a + _b;                                \
+    _z = s - _a;                                \
+    r = _b - _z;   }
+
+/* Mul12: *rh = u * v rounded, *rl = correction term obtained by
+   splitting u and v with the constant 2^27 + 1.
+   rh and rl are pointers, u and v are values. */
+#define Mul12(rh,rl,u,v)                                \
+  {                                                     \
+    CONST double c  = 134217729.; /* 2^27 +1 */         \
+    double up, u1, u2, vp, v1, v2;                      \
+    double _u =u, _v=v;                                 \
+                                                        \
+    up = _u*c;        vp = _v*c;                        \
+    u1 = (_u-up)+up;  v1 = (_v-vp)+vp;                  \
+    u2 = _u-u1;       v2 = _v-v1;                       \
+                                                        \
+    *rh = _u*_v;                                        \
+    *rl = (((u1*v1-*rh)+(u1*v2))+(u2*v1))+(u2*v2);      \
+  }
+
+/* Mul122: (*resh + *resl) = a * (bh + bl), a double times a
+   double-double.  Built from Mul12 on a * bh, plus a * bl folded into
+   the low part, then renormalised with Add12.
+   resh and resl are pointers. */
+#define Mul122(resh,resl,a,bh,bl)               \
+  {                                             \
+    double _t1, _t2, _t3, _t4;                  \
+                                                \
+    Mul12(&_t1,&_t2,(a),(bh));                  \
+    _t3 = (a) * (bl);                           \
+    _t4 = _t2 + _t3;                            \
+    Add12((*(resh)),(*(resl)),_t1,_t4);         \
+  }
+
+/* Mul22: (*zh + *zl) = (xh + xl) * (yh + yl), double-double times
+   double-double.  The xh * yh product is split as in Mul12, the cross
+   terms xh * yl and xl * yh are added to the low part, and the xl * yl
+   term is not computed.  zh and zl are pointers. */
+#define Mul22(zh,zl,xh,xl,yh,yl)                        \
+  {                                                     \
+    double mh, ml;                                      \
+                                                        \
+    CONST double c = 134217729.;                        \
+    double up, u1, u2, vp, v1, v2;                      \
+                                                        \
+    up = (xh)*c;        vp = (yh)*c;                    \
+    u1 = ((xh)-up)+up;  v1 = ((yh)-vp)+vp;              \
+    u2 = (xh)-u1;       v2 = (yh)-v1;                   \
+                                                        \
+    mh = (xh)*(yh);                                     \
+    ml = (((u1*v1-mh)+(u1*v2))+(u2*v1))+(u2*v2);        \
+                                                        \
+    ml += (xh)*(yl) + (xl)*(yh);                        \
+    *zh = mh+ml;                                        \
+    *zl = mh - (*zh) + ml;                              \
+  }
+
+/* Need fabs: declared by hand here instead of including <math.h> */
+double fabs(double);
+
+
+/* Some constants used by vec4_exp_vectorlibm:
+   LOG2_E    log2(e), turns x into a power-of-two exponent
+   LOG_2_HI  leading part of ln(2)
+   LOG_2_LO  remainder, so that LOG_2_HI + LOG_2_LO approximates ln(2)
+   SHIFTER   1.5 * 2^52; adding it to a double and subtracting it
+             again leaves that double rounded to an integer */
+#define LOG2_E    1.442695040888963407359924681001892137426645954153
+#define LOG_2_HI  0.693147180559890330187045037746429443359375
+#define LOG_2_LO  5.4979230187083711552420206887059365096458163346682e-14
+#define SHIFTER   6755399441055744.0
+
+/* A metalibm generated function for the callout.
+
+   Coefficients of the degree-6 polynomial evaluated by
+   f_approx_exp_arg_red; coeff_kh multiplies x^k.  The values are close
+   to 1/k!, the Taylor coefficients of exp. */
+#define f_approx_exp_arg_red_coeff_0h 1.00000000000000000000000000000000000000000000000000000000000000000000000000000000e+00
+#define f_approx_exp_arg_red_coeff_1h 1.00000000000000000000000000000000000000000000000000000000000000000000000000000000e+00
+#define f_approx_exp_arg_red_coeff_2h 5.00000000000032307490016592055326327681541442871093750000000000000000000000000000e-01
+#define f_approx_exp_arg_red_coeff_3h 1.66666666664336909908783468381443526595830917358398437500000000000000000000000000e-01
+#define f_approx_exp_arg_red_coeff_4h 4.16666661063678778198493546369718387722969055175781250000000000000000000000000000e-02
+#define f_approx_exp_arg_red_coeff_5h 8.33337739276391979703628720699271070770919322967529296875000000000000000000000000e-03
+#define f_approx_exp_arg_red_coeff_6h 1.39156772666044516173489142829566844739019870758056640625000000000000000000000000e-03
+
+
+/* Evaluates the degree-6 polynomial with coefficients
+   f_approx_exp_arg_red_coeff_0h .. _6h at x by Horner's rule, highest
+   coefficient first.  Each multiplication and each addition is its own
+   statement.  The last addition (the constant term) is done with Add12,
+   so the result comes back as a pair: the rounded sum in
+   *f_approx_exp_arg_red_resh and the Add12 remainder in
+   *f_approx_exp_arg_red_resm. */
+static inline void f_approx_exp_arg_red(double * RESTRICT f_approx_exp_arg_red_resh, double * RESTRICT f_approx_exp_arg_red_resm, double x) {
+  double acc = f_approx_exp_arg_red_coeff_6h;
+  double sum_hi, sum_lo;
+
+  acc = acc * x;
+  acc = f_approx_exp_arg_red_coeff_5h + acc;
+  acc = acc * x;
+  acc = f_approx_exp_arg_red_coeff_4h + acc;
+  acc = acc * x;
+  acc = f_approx_exp_arg_red_coeff_3h + acc;
+  acc = acc * x;
+  acc = f_approx_exp_arg_red_coeff_2h + acc;
+  acc = acc * x;
+  acc = f_approx_exp_arg_red_coeff_1h + acc;
+  acc = acc * x;
+
+  /* Constant term added last, keeping the remainder of that addition */
+  Add12(sum_hi,sum_lo,f_approx_exp_arg_red_coeff_0h,acc);
+
+  *f_approx_exp_arg_red_resh = sum_hi;
+  *f_approx_exp_arg_red_resm = sum_lo;
+}
+
+/* Leading parts of the 32-entry table read by scalar_exp_callout_inner.
+   Entry i holds 2^(i/32) rounded to double (entry 0 is 1, entry 16 is
+   sqrt(2)).  Indexed by the low 5 bits of the reduced exponent. */
+static CONST double f_approx_twoPower_Index_Hi[32] = {
+  1,
+  1.021897148654116627,
+  1.0442737824274137548,
+  1.067140400676823697,
+  1.0905077326652576897,
+  1.114386742595892432,
+  1.1387886347566915646,
+  1.1637248587775774755,
+  1.1892071150027210269,
+  1.2152473599804689552,
+  1.241857812073484002,
+  1.2690509571917332199,
+  1.2968395546510096406,
+  1.3252366431597413232,
+  1.3542555469368926513,
+  1.3839098819638320226,
+  1.4142135623730951455,
+  1.4451808069770466503,
+  1.4768261459394993462,
+  1.5091644275934228414,
+  1.542210825407940744,
+  1.5759808451078864966,
+  1.6104903319492542835,
+  1.6457554781539649458,
+  1.681792830507429004,
+  1.718619298122477934,
+  1.7562521603732994535,
+  1.794709075003107168,
+  1.8340080864093424307,
+  1.8741676341102999626,
+  1.9152065613971474,
+  1.9571441241754001794
+};
+
+/* Trailing parts matching f_approx_twoPower_Index_Hi entry for entry.
+   scalar_exp_callout_inner passes Hi[i] and Mi[i] together to Mul22 as
+   one double-double value; presumably Mi[i] is the rounding remainder
+   2^(i/32) - Hi[i]. */
+static CONST double f_approx_twoPower_Index_Mi[32] = {
+  0,
+  5.109225028973443893e-17,
+  8.551889705537964892e-17,
+  -7.899853966841582122e-17,
+  -3.046782079812471147e-17,
+  1.0410278456845570955e-16,
+  8.912812676025407777e-17,
+  3.8292048369240934987e-17,
+  3.982015231465646111e-17,
+  -7.71263069268148813e-17,
+  4.658027591836936791e-17,
+  2.667932131342186095e-18,
+  2.5382502794888314959e-17,
+  -2.858731210038861373e-17,
+  7.700948379802989461e-17,
+  -6.770511658794786287e-17,
+  -9.66729331345291345e-17,
+  -3.023758134993987319e-17,
+  -3.4839945568927957958e-17,
+  -1.016455327754295039e-16,
+  7.949834809697620856e-17,
+  -1.013691647127830398e-17,
+  2.470719256979788785e-17,
+  -1.0125679913674772604e-16,
+  8.19901002058149652e-17,
+  -1.851380418263110988e-17,
+  2.960140695448873307e-17,
+  1.822745842791208677e-17,
+  3.283107224245627203e-17,
+  -6.122763413004142561e-17,
+  -1.0619946056195962638e-16,
+  8.960767791036667767e-17
+};
+
+/* Argument-reduction constants for scalar_exp_callout_inner, with
+   w = 5 (a 2^5 = 32 entry table):
+   log2_of_base_times_two_to_w            32 * log2(e)
+   minus_logbase_of_2_times_two_to_minus_w_hi/_mi
+                                          -ln(2) / 32 as a leading part
+                                          plus a small trailing part
+   shifter                                1.5 * 2^52, same value as SHIFTER
+   idx_mask                               low w bits, the table index
+   lower_32_bits                          mask keeping the low 32 bits */
+#define f_approx_argred_log2_of_base_times_two_to_w 4.616624130844682838e1
+#define f_approx_argred_minus_logbase_of_2_times_two_to_minus_w_hi -2.1660849392498290195e-2
+#define f_approx_argred_minus_logbase_of_2_times_two_to_minus_w_mi -7.24702129326968612e-19
+#define f_approx_argred_shifter 6755399441055744.0
+#define f_approx_argred_w 5
+#define f_approx_argred_idx_mask 31ull
+#define f_approx_argred_lower_32_bits 0xffffffffull
+
+/* Table-driven exponential for the scalar callout.
+
+   Writes the result as a pair: exp(xh) is approximated by
+   *res_resh + *res_resm.
+
+   With m = xh * 32 * log2(e) rounded to an integer, the code splits
+   m into E = m >> 5 and idx = m & 31, forms the reduced argument
+   z = xh - m * ln(2) / 32, and returns
+   2^E * table[idx] * poly(z), where table[idx] is the Hi/Mi pair for
+   2^(idx/32).  2^E is applied as two factors 2^E1 * 2^E2 with
+   E1 + E2 = E; presumably so that each factor stays representable when
+   the final result is close to underflow or overflow.
+
+   The only caller in this file, scalar_exp_callout, calls this with
+   -746.0 < xh < 711.0.
+
+   NOTE(review): converting the masked 64-bit value to int and
+   right-shifting a negative int are both implementation-defined in
+   ISO C; the code expects wrap-around conversion and an arithmetic
+   shift. */
+static inline void scalar_exp_callout_inner(double * RESTRICT res_resh, double * RESTRICT res_resm, double xh) {
+  double zh;
+  double poly_resh, poly_resm;
+
+  double t;
+  double shifted_t;
+  double mAsDouble;
+  db_number argRedCaster;
+  int mAsInt;
+  int E;
+  int E1;
+  int E2;
+  int idx;
+  double rescaled_m_hi;
+  double rescaled_m_mi;
+  double table_hi;
+  double table_mi;
+  double tableTimesPoly_hi;
+  double tableTimesPoly_mi;
+  db_number twoE1;
+  db_number twoE2;
+  double twoE1tablePoly_hi;
+  double twoE1tablePoly_mi;
+
+
+  /* t = xh * 32 * log2(e); adding the shifter rounds it to an integer m */
+  t = xh * f_approx_argred_log2_of_base_times_two_to_w;
+  shifted_t = t + f_approx_argred_shifter;
+  /* m as a double ... */
+  mAsDouble = shifted_t - f_approx_argred_shifter;
+  /* ... and m as an int, read from the low 32 bits of the shifted value */
+  argRedCaster.d = shifted_t;
+  mAsInt = (int) (argRedCaster.l & f_approx_argred_lower_32_bits);
+  /* Split m: the upper part is the power of two, the low 5 bits the table index */
+  E = mAsInt >> f_approx_argred_w;
+  /* Two halves of the exponent, E1 + E2 == E */
+  E1 = E >> 1;
+  E2 = E - E1;
+  idx = mAsInt & f_approx_argred_idx_mask;
+  /* Reduced argument zh = xh + m * (-ln(2) / 32), product taken as a double-double */
+  Mul122(&rescaled_m_hi, &rescaled_m_mi, mAsDouble, f_approx_argred_minus_logbase_of_2_times_two_to_minus_w_hi, f_approx_argred_minus_logbase_of_2_times_two_to_minus_w_mi);
+  zh = (xh + rescaled_m_hi) + rescaled_m_mi;
+
+  /* Polynomial in the reduced argument, returned as a pair */
+  f_approx_exp_arg_red(&poly_resh, &poly_resm, zh);
+
+  /* Table value for idx times the polynomial, both as double-doubles */
+  table_hi = f_approx_twoPower_Index_Hi[idx];
+  table_mi = f_approx_twoPower_Index_Mi[idx];
+  Mul22(&tableTimesPoly_hi,&tableTimesPoly_mi,table_hi,table_mi,poly_resh,poly_resm);
+  /* Build 2^E1 and 2^E2: biased exponent shifted into the exponent field */
+  twoE1.l = E1 + 1023ll;
+  twoE1.l <<= 52;
+  twoE2.l = E2 + 1023ll;
+  twoE2.l <<= 52;
+  /* Scale both parts by 2^E1, then by 2^E2 */
+  twoE1tablePoly_hi = twoE1.d * tableTimesPoly_hi;
+  twoE1tablePoly_mi = twoE1.d * tableTimesPoly_mi;
+  *res_resh = twoE2.d * twoE1tablePoly_hi;
+  *res_resm = twoE2.d * twoE1tablePoly_mi;
+
+}
+
+/* A scalar exponential for the callout.
+
+   Inputs that need no real computation are answered first:
+
+     x = -Inf                 returns 0.0
+     x = +Inf                 returns x
+     x is a NaN               returns 1.0 + x
+     x <= -746.0              returns 2^-600 * 2^-600, fully underflowed
+     x >= 711.0               returns 2^600 * 2^600, fully overflowed
+     abs(x) <= 0.75 * 2^-26   returns 1.0 + x, accurate to a relative
+                              error of 2^-53
+
+   Every other input is handed to scalar_exp_callout_inner and the two
+   parts of its result are added.  Slight under- or overflow is still
+   possible on that path. */
+static inline double scalar_exp_callout(double x) {
+  dblcast bits;
+  uint64_t magnitude;
+  double hi, lo;
+
+  /* Bit pattern of x, and the same pattern with the sign bit cleared */
+  bits.d = x;
+  magnitude = bits.i & 0x7fffffffffffffffull;
+
+  /* Exactly the infinity pattern: the sign bit picks the answer */
+  if (magnitude == 0x7ff0000000000000ull) {
+    return (bits.i >> 63) ? 0.0 : x;
+  }
+
+  /* Anything above the infinity pattern is a NaN */
+  if (magnitude > 0x7ff0000000000000ull) {
+    return 1.0 + x;
+  }
+
+  /* From here on the input is real: evident underflow ... */
+  if (x <= -746.0) {
+    double twoM600 = 2.4099198651028841177407500347125089364310049545099e-181;
+
+    return twoM600 * twoM600;
+  }
+
+  /* ... and evident overflow */
+  if (x >= 711.0) {
+    double two600 = 4.1495155688809929585124078636911611510124462322424e180;
+
+    return two600 * two600;
+  }
+
+  /* Small enough in magnitude that 1.0 + x is already the answer */
+  if (fabs(x) <= 1.11758708953857421875e-8) {
+    return 1.0 + x;
+  }
+
+  /* General case: the metalibm generated function */
+  scalar_exp_callout_inner(&hi, &lo, x);
+
+  return hi + lo;
+}
+
+/* A vector exponential callout: runs scalar_exp_callout on each of
+   the VECTOR_LENGTH lanes of x and returns the per-lane results. */
+static inline Vec4d vector_exp_callout(Vec4d x) {
+  Vec4d result;
+
+  for (int lane = 0; lane < VECTOR_LENGTH; lane++) {
+    result[lane] = scalar_exp_callout(x[lane]);
+  }
+  return result;
+}
+
+/* Generated polynomial for vector exponential.
+
+   Coefficients of the degree-10 polynomial evaluated by
+   vector_exp_poly; coeff_kh multiplies x^k.  The values are close to
+   1/k!, the Taylor coefficients of exp. */
+
+#define vector_exp_poly_coeff_0h 1.00000000000000000000000000000000000000000000000000000000000000000000000000000000e+00
+#define vector_exp_poly_coeff_1h 1.00000000000000643929354282590793445706367492675781250000000000000000000000000000e+00
+#define vector_exp_poly_coeff_2h 4.99999999999983513188084316425374709069728851318359375000000000000000000000000000e-01
+#define vector_exp_poly_coeff_3h 1.66666666665578222517041240280377678573131561279296875000000000000000000000000000e-01
+#define vector_exp_poly_coeff_4h 4.16666666679390979011188278491317760199308395385742187500000000000000000000000000e-02
+#define vector_exp_poly_coeff_5h 8.33333338463836288678709962596258264966309070587158203125000000000000000000000000e-03
+#define vector_exp_poly_coeff_6h 1.38888885906261988316401367882235717843286693096160888671875000000000000000000000e-03
+#define vector_exp_poly_coeff_7h 1.98411714150174687447750199176255136990221217274665832519531250000000000000000000e-04
+#define vector_exp_poly_coeff_8h 2.48018422092243362301385717350044046725088264793157577514648437500000000000000000e-05
+#define vector_exp_poly_coeff_9h 2.76397570196414793205260534980638453816936817020177841186523437500000000000000000e-06
+#define vector_exp_poly_coeff_10h 2.75111392508451855531313940197990497438240709016099572181701660156250000000000000e-07
+
+
+/* Evaluates the degree-10 polynomial with coefficients
+   vector_exp_poly_coeff_0h .. _10h at x by Horner's rule, highest
+   coefficient first, and stores the value in *vector_exp_poly_resh.
+   Each multiplication and each addition is its own statement and the
+   code is kept straight-line, without a loop. */
+static inline void vector_exp_poly(double * RESTRICT vector_exp_poly_resh, double x) {
+  double acc = vector_exp_poly_coeff_10h;
+
+  acc = acc * x;
+  acc = vector_exp_poly_coeff_9h + acc;
+  acc = acc * x;
+  acc = vector_exp_poly_coeff_8h + acc;
+  acc = acc * x;
+  acc = vector_exp_poly_coeff_7h + acc;
+  acc = acc * x;
+  acc = vector_exp_poly_coeff_6h + acc;
+  acc = acc * x;
+  acc = vector_exp_poly_coeff_5h + acc;
+  acc = acc * x;
+  acc = vector_exp_poly_coeff_4h + acc;
+  acc = acc * x;
+  acc = vector_exp_poly_coeff_3h + acc;
+  acc = acc * x;
+  acc = vector_exp_poly_coeff_2h + acc;
+  acc = acc * x;
+  acc = vector_exp_poly_coeff_1h + acc;
+  acc = acc * x;
+  acc = vector_exp_poly_coeff_0h + acc;
+
+  *vector_exp_poly_resh = acc;
+}
+
+/* A vector exponential.
+
+   If any lane fails the test fabs(x) < 700.0 (a NaN lane fails it
+   too), the whole vector is handed to vector_exp_callout.  Otherwise
+   each lane is computed as 2^E * p(r), where E is x * log2(e) rounded
+   to an integer, r is x - E * ln(2) taken in two steps (LOG_2_HI, then
+   LOG_2_LO), and p is vector_exp_poly. */
+Vec4d vec4_exp_vectorlibm(Vec4d x) {
+  int lane;
+  int needCallout = 0;
+  Vec4d result;
+
+  /* Record whether any slot lies outside the range handled below */
+  for (lane = 0; lane < VECTOR_LENGTH; lane++) {
+    needCallout |= !(fabs(x[lane]) < 700.0);
+  }
+
+  /* One bad slot is enough to send the whole vector to the callout */
+  if (needCallout) {
+    return vector_exp_callout(x);
+  }
+
+  /* All inputs are real and provoke no under- or overflow in output */
+  for (lane = 0; lane < VECTOR_LENGTH; lane++) {
+    double shifted, rounded, reducedHi, reduced, poly;
+    int exponent;
+    dblcast scale;
+
+    /* Adding and removing SHIFTER rounds x * log2(e) to an integer */
+    shifted = x[lane] * LOG2_E + SHIFTER;
+    rounded = shifted - SHIFTER;
+    exponent = (int) rounded;
+
+    /* First step is exact: trailing bits of the constant are 0, Sterbenz */
+    reducedHi = x[lane] - rounded * LOG_2_HI;
+    reduced = reducedHi - rounded * LOG_2_LO;
+    vector_exp_poly(&poly, reduced);
+
+    /* Biased exponent moved into the exponent field yields 2^exponent */
+    scale.i = exponent + 1023;
+    scale.i <<= 52;
+    result[lane] = scale.d * poly;
+  }
+  return result;
+}
--- a/src/spicelib/devices/bsim3v32/vec4_log.c
+++ b/src/spicelib/devices/bsim3v32/vec4_log.c
@ -0,0 +1,769 @@
+/* This program implements a show-case vector (vectorizable) double
+   precision logarithm with a 4 ulp error bound.
+
+   Author: Christoph Lauter,
+
+           Sorbonne Université - LIP6 - PEQUAN team.
+
+   This program uses code generated using Sollya and Metalibm; see the
+   licences and exception texts below.
+
+   This program is
+
+   Copyright 2014-2018 Christoph Lauter Sorbonne Université
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+   3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+   OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/*
+
+    This code was generated using non-trivial code generation commands
+    of the Metalibm software program.
+
+    Before using, modifying and/or integrating this code into other
+    software, review the copyright and license status of this
+    generated code. In particular, see the exception below.
+
+    This generated program is partly or entirely based on a program
+    generated using non-trivial code generation commands of the Sollya
+    software program. See the copyright notice and exception text
+    referring to that Sollya-generated part of this program generated
+    with Metalibm below.
+
+    Metalibm is
+
+    Copyright 2008-2013 by
+
+    Laboratoire de l'Informatique du Parallélisme,
+    UMR CNRS - ENS Lyon - UCB Lyon 1 - INRIA 5668
+
+    and by
+
+    Laboratoire d'Informatique de Paris 6, equipe PEQUAN,
+    UPMC Universite Paris 06 - CNRS - UMR 7606 - LIP6, Paris, France.
+
+    Contributors: Christoph Quirin Lauter
+                  (UPMC LIP6 PEQUAN formerly LIP/ENS Lyon)
+                  christoph.lauter@lip6.fr
+
+		  and
+
+		  Olga Kupriianova
+		  (UPMC LIP6 PEQUAN)
+		  olga.kupriianova@lip6.fr
+
+    Metalibm was formerly developed by the Arenaire project at Ecole
+    Normale Superieure de Lyon and is now developed by Equipe PEQUAN
+    at Universite Pierre et Marie Curie Paris 6.
+
+    The Metalibm software program is free software; you can
+    redistribute it and/or modify it under the terms of the GNU Lesser
+    General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option)
+    any later version.
+
+    Metalibm is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with the Metalibm program; if not, write to the Free
+    Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+    02111-1307, USA.
+
+    This generated program is distributed WITHOUT ANY WARRANTY; without
+    even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+    PARTICULAR PURPOSE.
+
+    As a special exception, you may create a larger work that contains
+    part or all of this software generated using Metalibm and
+    distribute that work under terms of your choice, so long as that
+    work isn't itself a numerical code generator using the skeleton of
+    this code or a modified version thereof as a code skeleton.
+    Alternatively, if you modify or redistribute this generated code
+    itself, or its skeleton, you may (at your option) remove this
+    special exception, which will cause this generated code and its
+    skeleton and the resulting Metalibm output files to be licensed
+    under the General Public licence (version 2) without this special
+    exception.
+
+    This special exception was added by the Metalibm copyright holders
+    on November 20th 2013.
+
+*/
+
+
+
+/*
+    This code was generated using non-trivial code generation commands of
+    the Sollya software program.
+
+    Before using, modifying and/or integrating this code into other
+    software, review the copyright and license status of this generated
+    code. In particular, see the exception below.
+
+    Sollya is
+
+    Copyright 2006-2013 by
+
+    Laboratoire de l'Informatique du Parallelisme, UMR CNRS - ENS Lyon -
+    UCB Lyon 1 - INRIA 5668,
+
+    Laboratoire d'Informatique de Paris 6, equipe PEQUAN, UPMC Universite
+    Paris 06 - CNRS - UMR 7606 - LIP6, Paris, France
+
+    and by
+
+    Centre de recherche INRIA Sophia-Antipolis Mediterranee, equipe APICS,
+    Sophia Antipolis, France.
+
+    Contributors Ch. Lauter, S. Chevillard, M. Joldes
+
+    christoph.lauter@ens-lyon.org
+    sylvain.chevillard@ens-lyon.org
+    joldes@lass.fr
+
+    The Sollya software is a computer program whose purpose is to provide
+    an environment for safe floating-point code development. It is
+    particularily targeted to the automatized implementation of
+    mathematical floating-point libraries (libm). Amongst other features,
+    it offers a certified infinity norm, an automatic polynomial
+    implementer and a fast Remez algorithm.
+
+    The Sollya software is governed by the CeCILL-C license under French
+    law and abiding by the rules of distribution of free software.  You
+    can use, modify and/ or redistribute the software under the terms of
+    the CeCILL-C license as circulated by CEA, CNRS and INRIA at the
+    following URL "http://www.cecill.info".
+
+    As a counterpart to the access to the source code and rights to copy,
+    modify and redistribute granted by the license, users are provided
+    only with a limited warranty and the software's author, the holder of
+    the economic rights, and the successive licensors have only limited
+    liability.
+
+    In this respect, the user's attention is drawn to the risks associated
+    with loading, using, modifying and/or developing or reproducing the
+    software by the user in light of its specific status of free software,
+    that may mean that it is complicated to manipulate, and that also
+    therefore means that it is reserved for developers and experienced
+    professionals having in-depth computer knowledge. Users are therefore
+    encouraged to load and test the software's suitability as regards
+    their requirements in conditions enabling the security of their
+    systems and/or data to be ensured and, more generally, to use and
+    operate it in the same conditions as regards security.
+
+    The fact that you are presently reading this means that you have had
+    knowledge of the CeCILL-C license and that you accept its terms.
+
+    The Sollya program is distributed WITHOUT ANY WARRANTY; without even
+    the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+    PURPOSE.
+
+    This generated program is distributed WITHOUT ANY WARRANTY; without
+    even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+    PARTICULAR PURPOSE.
+
+    As a special exception, you may create a larger work that contains
+    part or all of this software generated using Sollya and distribute
+    that work under terms of your choice, so long as that work isn't
+    itself a numerical code generator using the skeleton of this code or a
+    modified version thereof as a code skeleton.  Alternatively, if you
+    modify or redistribute this generated code itself, or its skeleton,
+    you may (at your option) remove this special exception, which will
+    cause this generated code and its skeleton and the resulting Sollya
+    output files to be licensed under the CeCILL-C licence without this
+    special exception.
+
+    This special exception was added by the Sollya copyright holders in
+    version 4.1 of Sollya.
+
+*/
+
+/* Modified by Florian Ballenegger, Anamosic Ballenegger Design, 2020.
+Use of gcc vector extensions instead of pointers to double */
+
+#include <stdint.h>
+
+/* Two caster types: unions that give access to the bit pattern of a
+   double through a 64-bit integer member. */
+
+/* Unsigned view. Used for the range checks in scalar_log_callout()
+   and vec4_log_vectorlibm(), and for the exponent/mantissa
+   manipulation in vec4_log_vectorlibm(). */
+typedef union _dblcast {
+  double   d;
+  uint64_t i;
+} dblcast;
+
+/* Signed view. Used for the argument reduction in
+   scalar_log_callout_inner(). */
+typedef union {
+  int64_t l;
+  double d;
+} db_number;
+
+/* Compiler tricks and hints: thin aliases for the C qualifiers used
+   in the declarations of this file. */
+#define INLINE inline
+#define RESTRICT restrict
+#define STATIC static
+#define CONST const
+
+/* Vector length: number of double lanes in a Vec4d. The loops in this
+   file run over lanes 0 .. VECTOR_LENGTH - 1. */
+#define VECTOR_LENGTH 4
+/* Vector type built with the vector_size attribute: VECTOR_LENGTH
+   doubles, aligned to the size of the whole vector. Lanes are
+   accessed with the subscript operator, e.g. x[i]. */
+typedef double Vec4d __attribute__ ((vector_size (sizeof(double)*VECTOR_LENGTH),
+ aligned (sizeof(double)*VECTOR_LENGTH)));
+
+/* Macro implementations of some double-double operations.
+
+   Every macro expands to exactly one do { ... } while (0) statement,
+   so it is used like a function call followed by a semicolon and is
+   safe as the unbraced body of an if/else. Macro arguments are
+   parenthesized inside the expansions, so pointer expressions such as
+   "buf + 1" may be passed for the result parameters.
+
+   Multiply and add steps are deliberately spelled exactly as in the
+   generated code; do not merge or reorder them.
+*/
+
+/* Add12(s, r, a, b): s = a + b (rounded) and r = b - (s - a).
+   s and r are lvalues of type double; a and b are evaluated once. */
+#define Add12(s, r, a, b)                       \
+  do {                                          \
+    double _z, _a = (a), _b = (b);              \
+    (s) = _a + _b;                              \
+    _z = (s) - _a;                              \
+    (r) = _b - _z;                              \
+  } while (0)
+
+/* Mul22(zh, zl, xh, xl, yh, yl): product of the double-double values
+   (xh, xl) and (yh, yl), stored as a double-double through the
+   pointers zh and zl. xh and yh are evaluated several times, so pass
+   side-effect free expressions. */
+#define Mul22(zh,zl,xh,xl,yh,yl)                        \
+  do {                                                  \
+    double mh, ml;                                      \
+                                                        \
+    const double c = 134217729.; /* 2^27 + 1 */         \
+    double up, u1, u2, vp, v1, v2;                      \
+                                                        \
+    up = (xh)*c;        vp = (yh)*c;                    \
+    u1 = ((xh)-up)+up;  v1 = ((yh)-vp)+vp;              \
+    u2 = (xh)-u1;       v2 = (yh)-v1;                   \
+                                                        \
+    mh = (xh)*(yh);                                     \
+    ml = (((u1*v1-mh)+(u1*v2))+(u2*v1))+(u2*v2);        \
+                                                        \
+    ml += (xh)*(yl) + (xl)*(yh);                        \
+    *(zh) = mh+ml;                                      \
+    *(zl) = mh - (*(zh)) + ml;                          \
+  } while (0)
+
+/* Mul122(resh, resl, a, bh, bl): product of the double a and the
+   double-double (bh, bl), stored through the pointers resh and resl.
+   Built on Mul12 and Add12; a is evaluated twice. */
+#define Mul122(resh,resl,a,bh,bl)               \
+  do {                                          \
+    double _t1, _t2, _t3, _t4;                  \
+                                                \
+    Mul12(&_t1,&_t2,(a),(bh));                  \
+    _t3 = (a) * (bl);                           \
+    _t4 = _t2 + _t3;                            \
+    Add12((*(resh)),(*(resl)),_t1,_t4);         \
+  } while (0)
+
+/* Add22(zh, zl, xh, xl, yh, yl): sum of the double-double values
+   (xh, xl) and (yh, yl), stored through the pointers zh and zl.
+   xh and yh are evaluated twice. */
+#define Add22(zh,zl,xh,xl,yh,yl)                \
+  do {                                          \
+    double _r,_s;                               \
+    _r = (xh)+(yh);                             \
+    _s = ((((xh)-_r) +(yh)) + (yl)) + (xl);     \
+    *(zh) = _r+_s;                              \
+    *(zl) = (_r - (*(zh))) + _s;                \
+  } while (0)
+
+/* Mul12(rh, rl, u, v): *rh = u * v (rounded) and *rl = correction
+   term obtained by splitting u and v with the constant 2^27 + 1.
+   rh and rl are pointers to double; u and v are evaluated once. */
+#define Mul12(rh,rl,u,v)                                \
+  do {                                                  \
+    const double c  = 134217729.; /* 2^27 +1 */         \
+    double up, u1, u2, vp, v1, v2;                      \
+    double _u = (u), _v = (v);                          \
+                                                        \
+    up = _u*c;        vp = _v*c;                        \
+    u1 = (_u-up)+up;  v1 = (_v-vp)+vp;                  \
+    u2 = _u-u1;       v2 = _v-v1;                       \
+                                                        \
+    *(rh) = _u*_v;                                      \
+    *(rl) = (((u1*v1-*(rh))+(u1*v2))+(u2*v1))+(u2*v2);  \
+  } while (0)
+
+
+/* Need fabs: declared by hand, as this file only includes <stdint.h>.
+   NOTE(review): no call to fabs() is visible in this file's logarithm
+   code - confirm whether the declaration is still needed. */
+double fabs(double);
+
+
+/* Some constants */
+
+/* Two-part constant used by vec4_log_vectorlibm(): the exponent E is
+   multiplied by LOG_TWO_HI and by LOG_TWO_LO separately. The HI
+   product is relied upon to be exact (see the "trailing zeros" remark
+   at its use). The sum HI + LO is presumably log(2). */
+#define LOG_TWO_HI 0.693147180559890330187045037746429443359375
+#define LOG_TWO_LO 5.4979230187083711552420206887059365096458163346682e-14
+
+
+/* A metalibm generated function for the callout.
+
+   Coefficients of the polynomial evaluated by f_approx_log_arg_red():
+   coeff_kh multiplies the k-th power of the argument. There is no
+   constant term.
+*/
+#define f_approx_log_arg_red_coeff_1h 1.00000000000000000000000000000000000000000000000000000000000000000000000000000000e+00
+#define f_approx_log_arg_red_coeff_2h -4.99999999999998390176614293523016385734081268310546875000000000000000000000000000e-01
+#define f_approx_log_arg_red_coeff_3h 3.33333333333923731434111914495588280260562896728515625000000000000000000000000000e-01
+#define f_approx_log_arg_red_coeff_4h -2.50000000052116866378071335930144414305686950683593750000000000000000000000000000e-01
+#define f_approx_log_arg_red_coeff_5h 1.99999988486698782041983690760389436036348342895507812500000000000000000000000000e-01
+#define f_approx_log_arg_red_coeff_6h -1.66666258081627438603078417145297862589359283447265625000000000000000000000000000e-01
+#define f_approx_log_arg_red_coeff_7h 1.42921894210221167575980416586389765143394470214843750000000000000000000000000000e-01
+#define f_approx_log_arg_red_coeff_8h -1.25915254741829296669664017827017232775688171386718750000000000000000000000000000e-01
+
+
+/* Polynomial of the scalar callout, evaluated in Horner form.
+
+   Input:  the double-double argument (xh, xm).
+   Output: the double-double result, stored through
+           f_approx_log_arg_red_resh (high part) and
+           f_approx_log_arg_red_resm (low part).
+
+   The terms of degree 8 down to 2 are accumulated in plain double
+   precision using xh only. The degree-1 coefficient is then added
+   with Add12 and the final multiplication by (xh, xm) is done with
+   Mul22.
+*/
+STATIC INLINE void f_approx_log_arg_red(double * RESTRICT f_approx_log_arg_red_resh, double * RESTRICT f_approx_log_arg_red_resm, double xh, double xm) {
+  double acc;
+  double headHi, headLo;
+  double prodHi, prodLo;
+
+  /* Each Horner step keeps the multiplication and the addition in
+     separate statements, exactly like the generated code. */
+  acc = f_approx_log_arg_red_coeff_8h;
+  acc = acc * xh;
+  acc = f_approx_log_arg_red_coeff_7h + acc;
+  acc = acc * xh;
+  acc = f_approx_log_arg_red_coeff_6h + acc;
+  acc = acc * xh;
+  acc = f_approx_log_arg_red_coeff_5h + acc;
+  acc = acc * xh;
+  acc = f_approx_log_arg_red_coeff_4h + acc;
+  acc = acc * xh;
+  acc = f_approx_log_arg_red_coeff_3h + acc;
+  acc = acc * xh;
+  acc = f_approx_log_arg_red_coeff_2h + acc;
+  acc = acc * xh;
+
+  /* Last two steps in double-double arithmetic */
+  Add12(headHi,headLo,f_approx_log_arg_red_coeff_1h,acc);
+  Mul22(&prodHi,&prodLo,headHi,headLo,xh,xm);
+
+  *f_approx_log_arg_red_resh = prodHi;
+  *f_approx_log_arg_red_resm = prodLo;
+}
+
+
+/* Parameters of the table-driven reduction in scalar_log_callout_inner().
+
+   f_approx_tablewidth: the table index is taken from the leading
+     f_approx_tablewidth + 1 mantissa bits and rounded to
+     f_approx_tablewidth bits, giving indices 0 .. 32 (the tables
+     below have 33 entries).
+   f_approx_maxindex: from this index on, the exponent E is
+     incremented by one.
+   f_approx_rcpr_log_two_of_base_hi/_mi: double-double constant that
+     the exponent is multiplied by (Mul122); the value is
+     presumably log(2).
+*/
+#define f_approx_tablewidth 5
+#define f_approx_maxindex 14
+#define f_approx_rcpr_log_two_of_base_hi 0.69314718055994528622676398299518041312694549560546875
+#define f_approx_rcpr_log_two_of_base_mi 2.3190468138462995584177710797133615750739959242786823734316925538223586045205593109130859375e-17
+
+/* High parts of the per-index table term that scalar_log_callout_inner()
+   adds (Add22) to the polynomial value; the matching low parts are in
+   f_approx_log_rcpr_tbl_mi.
+   NOTE(review): the entries look like -log(r) for
+   r = f_approx_rcpr_tbl[index] below f_approx_maxindex and like
+   -log(2 * r) from there on, which would match the exponent increment
+   done for those indices - confirm against the generator output. */
+static const double f_approx_log_rcpr_tbl_hi[33] = {
+  0,
+  3.17486983145802981187699742804397828876972198486328125e-2,
+  6.453852113757117814341057737692608498036861419677734375e-2,
+  8.985632912186104770402295116582536138594150543212890625e-2,
+  0.1158318155251217007606356901305844075977802276611328125,
+  0.142500062607283040083672176479012705385684967041015625,
+  0.169899036795397473387225772967212833464145660400390625,
+  0.1980699137620937910764240541539038531482219696044921875,
+  0.2270574506353460753071971112149185501039028167724609375,
+  0.2468600779315257842672082233548280782997608184814453125,
+  0.2670627852490452536216025691828690469264984130859375,
+  0.2981533723190763485177967595518566668033599853515625,
+  0.319430770766361227241958431477542035281658172607421875,
+  0.34117075740276714412857472780160605907440185546875,
+  -0.329753286372467979692402195723843760788440704345703125,
+  -0.30702503529491187439504074063734151422977447509765625,
+  -0.28376817313064461867355703361681662499904632568359375,
+  -0.27193371548364175804834985683555714786052703857421875,
+  -0.2478361639045812692128123444490483961999416351318359375,
+  -0.223143551314209764857565687634632922708988189697265625,
+  -0.2105647691073496419189581274622469209134578704833984375,
+  -0.1849223384940119896402421773018431849777698516845703125,
+  -0.17185025692665922836255276706651784479618072509765625,
+  -0.1451820098444978890395162807180895470082759857177734375,
+  -0.1315763577887192614657152489598956890404224395751953125,
+  -0.1177830356563834557359626842298894189298152923583984375,
+  -0.10379679368164355934833764649738441221415996551513671875,
+  -7.522342123758753162920953627690323628485202789306640625e-2,
+  -6.062462181643483993820353816772694699466228485107421875e-2,
+  -4.58095360312942012637194011404062621295452117919921875e-2,
+  -3.077165866675368732785500469617545604705810546875e-2,
+  -1.5504186535965254478686148331689764745533466339111328125e-2,
+  0
+};
+
+
+/* Low parts belonging to f_approx_log_rcpr_tbl_hi: entry [index] of
+   both tables is read by scalar_log_callout_inner() and passed to
+   Add22 as one double-double value (hi, mi). */
+static const double f_approx_log_rcpr_tbl_mi[33] = {
+  0,
+  3.03822630846808578765259986229142635550407126467467068542394059704747633077204227447509765625e-18,
+  -6.4704866616929329974161813916713618427728286285169519154170103547585313208401203155517578125e-18,
+  -6.2737601636895940223772151595043522169967894903434509935868934604741298244334757328033447265625e-19,
+  4.33848436980809595557198228135728192959103146527353490891076859270469867624342441558837890625e-18,
+  -9.9263882342257491397106905651454915981827472977916566876377402195430477149784564971923828125e-18,
+  -4.8680087644390707941393631766999763543363602831990049994714819803220962057821452617645263671875e-19,
+  3.74284348246143901356926696786621497402944711010920782190414257684096810407936573028564453125e-18,
+  9.551415762738488431492098722158984238118586922020904206309666051311069168150424957275390625e-18,
+  1.3617433717483680171009009478499574446783469284919833308666881066528731025755405426025390625e-17,
+  -7.3289153273201694886198949831953541788954485227476805253576941368010011501610279083251953125e-18,
+  -1.72069586744586603715170366469832022772114935873187524517646806998527608811855316162109375e-17,
+  1.3542568572648110745997524461078410815028703905694095442624469427528310916386544704437255859375e-18,
+  -1.936679006260286699473802044740827141118261398825169117277056329839979298412799835205078125e-17,
+  -2.122020616196946023332814001844389995179410458238009572207172226399052306078374385833740234375e-18,
+  1.231991620010196428468632499036271595368677926845939196720536301654647104442119598388671875e-17,
+  2.0326655811266561230291019136542876238402571524729010865595313362064189277589321136474609375e-17,
+  -7.8331963769744201243220009945333356568337002449775477268267831476578066940419375896453857421875e-19,
+  1.24322095787025231818185093190325423423584424116919953939852661051190807484090328216552734375e-17,
+  9.091270597324799048711045191818233254271755021066504787174977764152572490274906158447265625e-18,
+  4.24940531472989532850360049655226441340213720053550945643383585093033616431057453155517578125e-18,
+  -3.023661415357406426577090417003710240867302228907377570354952922571101225912570953369140625e-18,
+  6.0224538210113704760318352588172818979944380808860641962620974254605243913829326629638671875e-18,
+  -8.2424187830224753896228153425798328521705177161500548155270706729425000958144664764404296875e-18,
+  -1.112300087972958802991298461231701795529693224825161512736571012283093295991420745849609375e-17,
+  1.197168574759367729935408317875380291366461975031726568119427867031845380552113056182861328125e-18,
+  -5.47772415726659012592706002045618002605660904524354816447218041730593540705740451812744140625e-18,
+  5.93060419629324071708218111258442537327230935598090626192924190718258614651858806610107421875e-18,
+  -2.642402593872693418157455274069099088532417945381102798718675472855466068722307682037353515625e-18,
+  -1.90295986647425706325531188416869176372485943199669260195161513138373265974223613739013671875e-18,
+  -1.0431732029005967805059792190367890366163673586242621564579291515428849379532039165496826171875e-18,
+  3.27832102289242912962985506573138544887782756899054594813824881072150674299336969852447509765625e-19,
+  0
+};
+
+
+/* Per-index multiplier r used by scalar_log_callout_inner(): the
+   normalized mantissa m is multiplied by f_approx_rcpr_tbl[index]
+   (Mul12) and 1.0 is subtracted from the product to form the reduced
+   argument. Entries run from 1.0 (index 0) down to 0.5 (index 32). */
+static const double f_approx_rcpr_tbl[33] = {
+  1.0,
+  0.96875,
+  0.9375,
+  0.9140625,
+  0.890625,
+  0.8671875,
+  0.84375,
+  0.8203125,
+  0.796875,
+  0.78125,
+  0.765625,
+  0.7421875,
+  0.7265625,
+  0.7109375,
+  0.6953125,
+  0.6796875,
+  0.6640625,
+  0.65625,
+  0.640625,
+  0.625,
+  0.6171875,
+  0.6015625,
+  0.59375,
+  0.578125,
+  0.5703125,
+  0.5625,
+  0.5546875,
+  0.5390625,
+  0.53125,
+  0.5234375,
+  0.515625,
+  0.5078125,
+  0.5
+};
+
+/* Core of the scalar callout logarithm.
+
+   Input:  xh; the only caller in this file, scalar_log_callout(),
+           passes positive subnormal or normal values.
+   Output: a double-double result stored through res_resh (high part)
+           and res_resm (low part).
+
+   Steps: split xh into exponent and mantissa, pick a table index from
+   the leading mantissa bits, multiply the mantissa by the table
+   multiplier to get a small reduced argument, evaluate the polynomial
+   on it, and add the table term and the scaled exponent.
+*/
+STATIC INLINE void scalar_log_callout_inner(double * RESTRICT res_resh, double * RESTRICT res_resm, double xh) {
+  db_number bits;
+  int expo = 0;
+  int tblIdx;
+  double expoAsDouble, mant, rcpr;
+  double prodHi, prodLo, prodHiMinusOne;
+  double redHi, redLo;
+  double polyH, polyM;
+  double tblH, tblM;
+  double expoLogH, expoLogM;
+  double logMantH, logMantM;
+
+  bits.d = xh;
+
+  /* Subnormal input: scale by 2^53 and compensate in the exponent */
+  if (bits.l < 0x0010000000000000) {
+    bits.d *= 9007199254740992.0;
+    expo -= 53;
+  }
+  expo += (int) ((bits.l >> 52) - 1023ll);
+
+  /* Table index: leading f_approx_tablewidth + 1 mantissa bits,
+     rounded to f_approx_tablewidth bits */
+  tblIdx = (int) ((bits.l & 0x000fffffffffffffull) >> (52 - f_approx_tablewidth - 1));
+  tblIdx = (tblIdx + 1) >> 1;
+  if (tblIdx >= f_approx_maxindex) {
+    expo++;
+  }
+  expoAsDouble = (double) expo;
+
+  /* Replace the exponent field by the bias: mant has the mantissa of
+     the input and an exponent of zero */
+  bits.l = (bits.l & 0x800fffffffffffffull) | 0x3ff0000000000000ull;
+  mant = bits.d;
+
+  /* Reduced argument (redHi, redLo) = mant * rcpr - 1 */
+  rcpr = f_approx_rcpr_tbl[tblIdx];
+  Mul12(&prodHi,&prodLo,mant,rcpr);
+  prodHiMinusOne = prodHi - 1.0;
+  Add12(redHi,redLo,prodHiMinusOne,prodLo);
+
+  f_approx_log_arg_red(&polyH, &polyM, redHi, redLo);
+
+  /* Reconstruction: exponent * constant + (table term + polynomial) */
+  tblH = f_approx_log_rcpr_tbl_hi[tblIdx];
+  tblM = f_approx_log_rcpr_tbl_mi[tblIdx];
+  Mul122(&expoLogH,&expoLogM,expoAsDouble,f_approx_rcpr_log_two_of_base_hi,f_approx_rcpr_log_two_of_base_mi);
+  Add22(&logMantH,&logMantM,tblH,tblM,polyH,polyM);
+  Add22(res_resh,res_resm,expoLogH,expoLogM,logMantH,logMantM);
+}
+
+
+/* A scalar logarithm for the callout.
+
+   Positive subnormal and normal inputs are forwarded to
+   scalar_log_callout_inner() and the two result parts are summed.
+   Everything else is handled here:
+
+     NaN               -> 1.0 + x (quietized NaN)
+     +Inf              -> +Inf
+     +0 or -0          -> -Inf, raising division-by-zero
+     -Inf, negative x  -> NaN, raising invalid
+*/
+STATIC INLINE double scalar_log_callout(double x) {
+  dblcast bits;
+  uint64_t magnitude;
+  double resHi, resLo;
+  double zero;
+
+  bits.d = x;
+
+  /* Regular case: positive subnormal or normal input. A set sign bit
+     makes bits.i larger than 0x7ff0000000000000, so negative inputs
+     do not get here. */
+  if ((bits.i != 0x0ull) && (bits.i < 0x7ff0000000000000ull)) {
+    scalar_log_callout_inner(&resHi, &resLo, x);
+    return resHi + resLo;
+  }
+
+  /* Special inputs: +/-0, negative, +/-Inf or NaN */
+  magnitude = bits.i & 0x7fffffffffffffffull;
+
+  if (magnitude > 0x7ff0000000000000ull) {
+    /* The input is NaN. Return the quietized NaN */
+    return 1.0 + x;
+  }
+
+  if ((magnitude == 0x7ff0000000000000ull) &&
+      ((bits.i & 0x8000000000000000ull) == 0x0ull)) {
+    /* The input is +Inf. Return log(+Inf) = +Inf. */
+    return x;
+  }
+
+  if (x == 0.0) {
+    /* The input is +/-0. Return -Inf and raise the division-by-zero
+       exception.
+    */
+    zero = 1.0 - 1.0;   /* zero = +0.0 or -0.0 */
+    zero = zero * zero; /* zero = +0.0 */
+    return -1.0 / zero; /* Return -Inf and raise div-by-zero. */
+  }
+
+  /* The input is -Inf or a negative real number.
+     Return NaN and raise the invalid exception.
+  */
+  zero = 0.0;
+  return zero / zero;
+}
+
+/* A vector logarithm callout: applies scalar_log_callout() to each of
+   the VECTOR_LENGTH lanes of x and returns the lane-wise results. */
+STATIC INLINE Vec4d vector_log_callout(Vec4d x) {
+  Vec4d out;
+  int lane = 0;
+
+  while (lane < VECTOR_LENGTH) {
+    out[lane] = scalar_log_callout(x[lane]);
+    lane++;
+  }
+  return out;
+}
+
+/* Generated polynomial for vector logarithm.
+
+   Coefficients of the degree-20 polynomial evaluated by
+   vector_log_poly(): coeff_kh multiplies the k-th power of the
+   argument. There is no constant term.
+*/
+#define vector_log_poly_coeff_1h 1.00000000000000000000000000000000000000000000000000000000000000000000000000000000e+00
+#define vector_log_poly_coeff_2h -5.00000000000000999200722162640886381268501281738281250000000000000000000000000000e-01
+#define vector_log_poly_coeff_3h 3.33333333333384995711412557284347712993621826171875000000000000000000000000000000e-01
+#define vector_log_poly_coeff_4h -2.49999999999541949735615276040334720164537429809570312500000000000000000000000000e-01
+#define vector_log_poly_coeff_5h 1.99999999982921977670358160139585379511117935180664062500000000000000000000000000e-01
+#define vector_log_poly_coeff_6h -1.66666666708135652319455743963771965354681015014648437500000000000000000000000000e-01
+#define vector_log_poly_coeff_7h 1.42857144801517760290821001945005264133214950561523437500000000000000000000000000e-01
+#define vector_log_poly_coeff_8h -1.25000000676456918258239170427259523421525955200195312500000000000000000000000000e-01
+#define vector_log_poly_coeff_9h 1.11111007470194977919675238808849826455116271972656250000000000000000000000000000e-01
+#define vector_log_poly_coeff_10h -9.99997732686361273657382753299316391348838806152343750000000000000000000000000000e-02
+#define vector_log_poly_coeff_11h 9.09118368248343633464259028187370859086513519287109375000000000000000000000000000e-02
+#define vector_log_poly_coeff_12h -8.33440688797140172283661740948446094989776611328125000000000000000000000000000000e-02
+#define vector_log_poly_coeff_13h 7.68928106123701327057062826497713103890419006347656250000000000000000000000000000e-02
+#define vector_log_poly_coeff_14h -7.12109533797148086531336730331531725823879241943359375000000000000000000000000000e-02
+#define vector_log_poly_coeff_15h 6.65850051807088672006784690893255174160003662109375000000000000000000000000000000e-02
+#define vector_log_poly_coeff_16h -6.43233317758114681028658310424361843615770339965820312500000000000000000000000000e-02
+#define vector_log_poly_coeff_17h 6.31209736682013661246415381356200668960809707641601562500000000000000000000000000e-02
+#define vector_log_poly_coeff_18h -5.44324247927492413379191305011772783473134040832519531250000000000000000000000000e-02
+#define vector_log_poly_coeff_19h 3.23620871610351343306000160282565047964453697204589843750000000000000000000000000e-02
+#define vector_log_poly_coeff_20h -9.16877113215055876416226254832508857361972332000732421875000000000000000000000000e-03
+
+
+/* Polynomial of the vector logarithm, evaluated in Horner form in
+   plain double precision.
+
+   Input:  x, the reduced argument (vec4_log_vectorlibm() passes
+           m - 1.0).
+   Output: the polynomial value, stored through vector_log_poly_resh.
+
+   Each Horner step keeps the multiplication and the addition in
+   separate statements, exactly like the generated code.
+*/
+STATIC void vector_log_poly(double * RESTRICT vector_log_poly_resh, double x) {
+  double acc;
+
+  acc = vector_log_poly_coeff_20h;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_19h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_18h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_17h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_16h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_15h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_14h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_13h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_12h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_11h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_10h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_9h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_8h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_7h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_6h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_5h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_4h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_3h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_2h + acc;
+  acc = acc * x;
+  acc = vector_log_poly_coeff_1h + acc;
+  acc = acc * x;
+
+  *vector_log_poly_resh = acc;
+}
+
+/* A vector logarithm.
+
+   Evaluates log() on each of the VECTOR_LENGTH lanes of x. The fast
+   path is taken only when the bit pattern of every lane lies in
+   [0x0020000000000000, 0x7ff0000000000000), i.e. the lane is
+   positive, finite and far enough from the subnormal range.
+   If any lane fails that test, the whole vector is handed to
+   vector_log_callout().
+*/
+Vec4d vec4_log_vectorlibm(Vec4d x) {
+  Vec4d result;
+  dblcast bits;
+  int lane;
+  int allOkay = 1;
+
+  /* Check if we can handle all inputs */
+  for (lane = 0; lane < VECTOR_LENGTH; lane++) {
+    bits.d = x[lane];
+    allOkay &= ((bits.i >= 0x0020000000000000ull) && (bits.i < 0x7ff0000000000000ull));
+  }
+
+  /* Perform a callout if we cannot handle the input in one slot */
+  if (!allOkay) {
+    return vector_log_callout(x);
+  }
+
+  /* Here, the input is real, and far enough from the subnormal
+     range
+  */
+  for (lane = 0; lane < VECTOR_LENGTH; lane++) {
+    uint64_t raw, expoField, roundedExpoField, carry;
+    int E;
+    double eDouble, m, r, p;
+    double elog2h, elog2l, sumHi, sumLo, back, tail;
+
+    bits.d = x[lane];
+    raw = bits.i;
+
+    /* Adding half of the mantissa range carries into the exponent
+       field exactly when the mantissa is 1.5 or more */
+    expoField = raw >> 52;
+    roundedExpoField = (raw + 0x0008000000000000ull) >> 52;
+    carry = (roundedExpoField - expoField) << 52;
+
+    E = ((int) roundedExpoField) - 1023;
+    eDouble = (double) E;
+
+    /* Exponent field set to the bias, lowered by one if we carried */
+    bits.i = ((raw & 0x000fffffffffffffull) | 0x3ff0000000000000ull) - carry;
+    m = bits.d;                    /* 2^E * m = x exactly, 0.75 <= m < 1.5 */
+    r = m - 1.0;                   /* exact: Sterbenz */
+    vector_log_poly(&p, r);
+
+    elog2h = eDouble * LOG_TWO_HI; /* exact: trailing zeros */
+    elog2l = eDouble * LOG_TWO_LO;
+
+    /* Compensated sum of elog2h and p, then add the low-order terms */
+    sumHi = elog2h + p;
+    back = sumHi - elog2h;
+    sumLo = p - back;              /* exact: sumHi + sumLo = elog2h + p */
+    tail = elog2l + sumLo;
+    result[lane] = sumHi + tail;
+  }
+  return result;
+}
--- a/visualc/vngspice.vcxproj
+++ b/visualc/vngspice.vcxproj
@ -667,13 +667,15 @@
      <BasicRuntimeChecks>Default</BasicRuntimeChecks>
      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
      <RuntimeTypeInfo>true</RuntimeTypeInfo>
-      <OpenMPSupport>true</OpenMPSupport>
+      <OpenMPSupport>
+      </OpenMPSupport>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <WarningLevel>Level4</WarningLevel>
      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
      <CompileAs>CompileAsC</CompileAs>
      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <AdditionalOptions>/openmp:experimental %(AdditionalOptions)</AdditionalOptions>
    </ClCompile>
    <Link>
      <AdditionalDependencies>psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
@ -1111,6 +1113,7 @@
    <ClInclude Include="..\src\spicelib\devices\bsim3v1\bsim3v1ext.h" />
    <ClInclude Include="..\src\spicelib\devices\bsim3v1\bsim3v1init.h" />
    <ClInclude Include="..\src\spicelib\devices\bsim3v1\bsim3v1itf.h" />
+    <ClInclude Include="..\src\spicelib\devices\bsim3v32\b3v32acm.h" />
    <ClInclude Include="..\src\spicelib\devices\bsim3v32\bsim3v32def.h" />
    <ClInclude Include="..\src\spicelib\devices\bsim3v32\bsim3v32ext.h" />
    <ClInclude Include="..\src\spicelib\devices\bsim3v32\bsim3v32init.h" />
@ -1846,11 +1849,19 @@
    <ClCompile Include="..\src\spicelib\devices\bsim3v1\bsim3v1init.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32acld.c" />
+    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32acm.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32ask.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32check.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32cvtest.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32getic.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32ld.c" />
+    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32ldsel.c" />
+    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32ldseq.c" />
+    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32ldseq_simd4.c" />
+    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32ldseq_simd4_omp.c" />
+    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32ldseq_simd8.c" />
+    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32ldsimd.c" />
+    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32ldsimd8.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32mask.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32mdel.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32mpar.c" />
@ -1862,6 +1873,8 @@
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32temp.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\b3v32trunc.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3v32\bsim3v32init.c" />
+    <ClCompile Include="..\src\spicelib\devices\bsim3v32\vec4_exp.c" />
+    <ClCompile Include="..\src\spicelib\devices\bsim3v32\vec4_log.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3\b3.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3\b3acld.c" />
    <ClCompile Include="..\src\spicelib\devices\bsim3\b3ask.c" />