incorporate KLU source files from SuiteSparse-3.7.0.tar.gz

Francesco Lannutti 2014-04-21 14:11:28 +02:00 committed by rlar
parent f65715242d
commit 691f4b14f5
44 changed files with 17674 additions and 0 deletions

151
src/maths/KLU/UFconfig.h Normal file

@@ -0,0 +1,151 @@
/* ========================================================================== */
/* === UFconfig.h =========================================================== */
/* ========================================================================== */
/* Configuration file for SuiteSparse: a Suite of Sparse matrix packages
* (AMD, COLAMD, CCOLAMD, CAMD, CHOLMOD, UMFPACK, CXSparse, and others).
*
* UFconfig.h provides the definition of the long integer. On most systems,
* a C program can be compiled in LP64 mode, in which long's and pointers are
* both 64-bits, and int's are 32-bits. Windows 64, however, uses the LLP64
* model, in which int's and long's are 32-bits, and long long's and pointers
* are 64-bits.
*
* SuiteSparse packages that include long integer versions are
* intended for the LP64 mode. However, as a workaround for Windows 64
* (and perhaps other systems), the long integer can be redefined.
*
* If _WIN64 is defined, then the __int64 type is used instead of long.
*
* The long integer can also be defined at compile time. For example, this
* could be added to UFconfig.mk:
*
* CFLAGS = -O -D'UF_long=long long' -D'UF_long_max=9223372036854775801' \
* -D'UF_long_idd="lld"'
*
* This file defines UF_long as either long (on all but _WIN64) or
* __int64 on Windows 64. The intent is that a UF_long is always a 64-bit
* integer in a 64-bit code. ptrdiff_t might be a better choice than long;
* it is always the same size as a pointer.
*
* This file also defines the SUITESPARSE_VERSION and related definitions.
*
* Copyright (c) 2007, University of Florida. No licensing restrictions
* apply to this file or to the UFconfig directory. Author: Timothy A. Davis.
*/
#ifndef _UFCONFIG_H
#define _UFCONFIG_H
#ifdef __cplusplus
extern "C" {
#endif
#include <limits.h>
#include <stdlib.h>
/* ========================================================================== */
/* === UF_long ============================================================== */
/* ========================================================================== */
#ifndef UF_long
#ifdef _WIN64
#define UF_long __int64
#define UF_long_max _I64_MAX
#define UF_long_idd "I64d"
#else
#define UF_long long
#define UF_long_max LONG_MAX
#define UF_long_idd "ld"
#endif
#define UF_long_id "%" UF_long_idd
#endif
/* ========================================================================== */
/* === UFconfig parameters and functions ==================================== */
/* ========================================================================== */
/* SuiteSparse-wide parameters will be placed in this struct. So far, they
are only used by RBio. */
typedef struct UFconfig_struct
{
void *(*malloc_memory) (size_t) ; /* pointer to malloc */
void *(*realloc_memory) (void *, size_t) ; /* pointer to realloc */
void (*free_memory) (void *) ; /* pointer to free */
void *(*calloc_memory) (size_t, size_t) ; /* pointer to calloc */
} UFconfig ;
void *UFmalloc /* pointer to allocated block of memory */
(
size_t nitems, /* number of items to malloc (>=1 is enforced) */
size_t size_of_item, /* sizeof each item */
int *ok, /* TRUE if successful, FALSE otherwise */
UFconfig *config /* SuiteSparse-wide configuration */
) ;
void *UFfree /* always returns NULL */
(
void *p, /* block to free */
UFconfig *config /* SuiteSparse-wide configuration */
) ;
/* ========================================================================== */
/* === SuiteSparse version ================================================== */
/* ========================================================================== */
/* SuiteSparse is not a package itself, but a collection of packages, some of
* which must be used together (UMFPACK requires AMD, CHOLMOD requires AMD,
* COLAMD, CAMD, and CCOLAMD, etc). A version number is provided here for the
* collection itself. The versions of packages within each version of
 * SuiteSparse are meant to work together. Combining one package from one
* version of SuiteSparse, with another package from another version of
* SuiteSparse, may or may not work.
*
* SuiteSparse Version 3.7.0 contains the following packages:
*
* UFconfig version 3.7.0 (version always the same as SuiteSparse)
* AMD version 2.2.3
* CAMD version 2.2.3
* CCOLAMD version 2.7.4
* COLAMD version 2.7.4
* BTF version 1.1.3
* CHOLMOD version 1.7.4
* CSparse3 version 3.0.2
* CSparse version 2.2.6
* CXSparse version 2.2.6
* KLU version 1.1.3
* LDL version 2.0.4
* RBio version 2.0.2
* SPQR version 1.2.3 (also called SuiteSparseQR)
* UFcollection version 1.6.0
* UMFPACK version 5.5.2
* SSMULT version 2.0.3
* spqr_rank version 1.0.0
* MATLAB_Tools various packages & M-files. No specific version number.
*
* Other package dependencies:
* BLAS required by CHOLMOD and UMFPACK
* LAPACK required by CHOLMOD
* METIS 4.0.1 required by CHOLMOD (optional) and KLU (optional)
*/
#define SUITESPARSE_DATE "Dec 15, 2011"
#define SUITESPARSE_VER_CODE(main,sub) ((main) * 1000 + (sub))
#define SUITESPARSE_MAIN_VERSION 3
#define SUITESPARSE_SUB_VERSION 7
#define SUITESPARSE_SUBSUB_VERSION 0
#define SUITESPARSE_VERSION \
SUITESPARSE_VER_CODE(SUITESPARSE_MAIN_VERSION,SUITESPARSE_SUB_VERSION)
#ifdef __cplusplus
}
#endif
#endif
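
The comment at the top of UFconfig.h describes how UF_long is selected and how the version macros are built. The short sketch below (not part of the imported sources) shows how a program uses them portably; it assumes a build where UF_long is 64 bits (LP64 Unix or _WIN64), since on a 32-bit build the shifted value would overflow.

#include <stdio.h>
#include "UFconfig.h"

int main (void)
{
    /* 2^40 fits in UF_long on LP64 (long) and _WIN64 (__int64) builds */
    UF_long n = (UF_long) 1 << 40 ;

    /* UF_long_id expands to "%ld" or "%I64d" to match the chosen type */
    printf ("n = " UF_long_id ", UF_long_max = " UF_long_id "\n",
        n, (UF_long) UF_long_max) ;

    /* SUITESPARSE_VER_CODE(3,7) = 3007 for this release */
    printf ("SUITESPARSE_VERSION = %d\n", SUITESPARSE_VERSION) ;
    return (0) ;
}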

412
src/maths/KLU/amd.h Normal file

@@ -0,0 +1,412 @@
/* ========================================================================= */
/* === AMD: approximate minimum degree ordering =========================== */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD Version 2.2, Copyright (c) 2007 by Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* AMD finds a symmetric ordering P of a matrix A so that the Cholesky
* factorization of P*A*P' has fewer nonzeros and takes less work than the
* Cholesky factorization of A. If A is not symmetric, then it performs its
* ordering on the matrix A+A'. Two sets of user-callable routines are
* provided, one for int integers and the other for UF_long integers.
*
* The method is based on the approximate minimum degree algorithm, discussed
 * in Amestoy, Davis, and Duff, "An approximate minimum degree ordering algorithm",
* SIAM Journal of Matrix Analysis and Applications, vol. 17, no. 4, pp.
* 886-905, 1996. This package can perform both the AMD ordering (with
* aggressive absorption), and the AMDBAR ordering (without aggressive
* absorption) discussed in the above paper. This package differs from the
* Fortran codes discussed in the paper:
*
* (1) it can ignore "dense" rows and columns, leading to faster run times
* (2) it computes the ordering of A+A' if A is not symmetric
* (3) it is followed by a depth-first post-ordering of the assembly tree
* (or supernodal elimination tree)
*
* For historical reasons, the Fortran versions, amd.f and amdbar.f, have
* been left (nearly) unchanged. They compute the identical ordering as
* described in the above paper.
*/
#ifndef AMD_H
#define AMD_H
/* make it easy for C++ programs to include AMD */
#ifdef __cplusplus
extern "C" {
#endif
/* get the definition of size_t: */
#include <stddef.h>
/* define UF_long */
#include "UFconfig.h"
int amd_order /* returns AMD_OK, AMD_OK_BUT_JUMBLED,
* AMD_INVALID, or AMD_OUT_OF_MEMORY */
(
int n, /* A is n-by-n. n must be >= 0. */
const int Ap [ ], /* column pointers for A, of size n+1 */
const int Ai [ ], /* row indices of A, of size nz = Ap [n] */
int P [ ], /* output permutation, of size n */
double Control [ ], /* input Control settings, of size AMD_CONTROL */
double Info [ ] /* output Info statistics, of size AMD_INFO */
) ;
UF_long amd_l_order /* see above for description of arguments */
(
UF_long n,
const UF_long Ap [ ],
const UF_long Ai [ ],
UF_long P [ ],
double Control [ ],
double Info [ ]
) ;
/* Input arguments (not modified):
*
* n: the matrix A is n-by-n.
* Ap: an int/UF_long array of size n+1, containing column pointers of A.
* Ai: an int/UF_long array of size nz, containing the row indices of A,
* where nz = Ap [n].
* Control: a double array of size AMD_CONTROL, containing control
* parameters. Defaults are used if Control is NULL.
*
* Output arguments (not defined on input):
*
* P: an int/UF_long array of size n, containing the output permutation. If
* row i is the kth pivot row, then P [k] = i. In MATLAB notation,
* the reordered matrix is A (P,P).
* Info: a double array of size AMD_INFO, containing statistical
* information. Ignored if Info is NULL.
*
* On input, the matrix A is stored in column-oriented form. The row indices
* of nonzero entries in column j are stored in Ai [Ap [j] ... Ap [j+1]-1].
*
* If the row indices appear in ascending order in each column, and there
* are no duplicate entries, then amd_order is slightly more efficient in
* terms of time and memory usage. If this condition does not hold, a copy
* of the matrix is created (where these conditions do hold), and the copy is
* ordered. This feature is new to v2.0 (v1.2 and earlier required this
* condition to hold for the input matrix).
*
* Row indices must be in the range 0 to
* n-1. Ap [0] must be zero, and thus nz = Ap [n] is the number of nonzeros
* in A. The array Ap is of size n+1, and the array Ai is of size nz = Ap [n].
* The matrix does not need to be symmetric, and the diagonal does not need to
* be present (if diagonal entries are present, they are ignored except for
* the output statistic Info [AMD_NZDIAG]). The arrays Ai and Ap are not
* modified. This form of the Ap and Ai arrays to represent the nonzero
* pattern of the matrix A is the same as that used internally by MATLAB.
* If you wish to use a more flexible input structure, please see the
* umfpack_*_triplet_to_col routines in the UMFPACK package, at
* http://www.cise.ufl.edu/research/sparse/umfpack.
*
* Restrictions: n >= 0. Ap [0] = 0. Ap [j] <= Ap [j+1] for all j in the
* range 0 to n-1. nz = Ap [n] >= 0. Ai [0..nz-1] must be in the range 0
* to n-1. Finally, Ai, Ap, and P must not be NULL. If any of these
* restrictions are not met, AMD returns AMD_INVALID.
*
* AMD returns:
*
* AMD_OK if the matrix is valid and sufficient memory can be allocated to
* perform the ordering.
*
* AMD_OUT_OF_MEMORY if not enough memory can be allocated.
*
* AMD_INVALID if the input arguments n, Ap, Ai are invalid, or if P is
* NULL.
*
* AMD_OK_BUT_JUMBLED if the matrix had unsorted columns, and/or duplicate
* entries, but was otherwise valid.
*
* The AMD routine first forms the pattern of the matrix A+A', and then
* computes a fill-reducing ordering, P. If P [k] = i, then row/column i of
* the original is the kth pivotal row. In MATLAB notation, the permuted
* matrix is A (P,P), except that 0-based indexing is used instead of the
* 1-based indexing in MATLAB.
*
* The Control array is used to set various parameters for AMD. If a NULL
* pointer is passed, default values are used. The Control array is not
* modified.
*
* Control [AMD_DENSE]: controls the threshold for "dense" rows/columns.
* A dense row/column in A+A' can cause AMD to spend a lot of time in
* ordering the matrix. If Control [AMD_DENSE] >= 0, rows/columns
* with more than Control [AMD_DENSE] * sqrt (n) entries are ignored
* during the ordering, and placed last in the output order. The
* default value of Control [AMD_DENSE] is 10. If negative, no
* rows/columns are treated as "dense". Rows/columns with 16 or
* fewer off-diagonal entries are never considered "dense".
*
* Control [AMD_AGGRESSIVE]: controls whether or not to use aggressive
* absorption, in which a prior element is absorbed into the current
 * element if it is a subset of the current element, even if it is not
* adjacent to the current pivot element (refer to Amestoy, Davis,
* & Duff, 1996, for more details). The default value is nonzero,
* which means to perform aggressive absorption. This nearly always
* leads to a better ordering (because the approximate degrees are
* more accurate) and a lower execution time. There are cases where
* it can lead to a slightly worse ordering, however. To turn it off,
* set Control [AMD_AGGRESSIVE] to 0.
*
* Control [2..4] are not used in the current version, but may be used in
* future versions.
*
* The Info array provides statistics about the ordering on output. If it is
* not present, the statistics are not returned. This is not an error
* condition.
*
* Info [AMD_STATUS]: the return value of AMD, either AMD_OK,
* AMD_OK_BUT_JUMBLED, AMD_OUT_OF_MEMORY, or AMD_INVALID.
*
* Info [AMD_N]: n, the size of the input matrix
*
* Info [AMD_NZ]: the number of nonzeros in A, nz = Ap [n]
*
* Info [AMD_SYMMETRY]: the symmetry of the matrix A. It is the number
* of "matched" off-diagonal entries divided by the total number of
* off-diagonal entries. An entry A(i,j) is matched if A(j,i) is also
* an entry, for any pair (i,j) for which i != j. In MATLAB notation,
* S = spones (A) ;
* B = tril (S, -1) + triu (S, 1) ;
* symmetry = nnz (B & B') / nnz (B) ;
*
* Info [AMD_NZDIAG]: the number of entries on the diagonal of A.
*
* Info [AMD_NZ_A_PLUS_AT]: the number of nonzeros in A+A', excluding the
* diagonal. If A is perfectly symmetric (Info [AMD_SYMMETRY] = 1)
* with a fully nonzero diagonal, then Info [AMD_NZ_A_PLUS_AT] = nz-n
* (the smallest possible value). If A is perfectly unsymmetric
* (Info [AMD_SYMMETRY] = 0, for an upper triangular matrix, for
* example) with no diagonal, then Info [AMD_NZ_A_PLUS_AT] = 2*nz
* (the largest possible value).
*
* Info [AMD_NDENSE]: the number of "dense" rows/columns of A+A' that were
* removed from A prior to ordering. These are placed last in the
* output order P.
*
* Info [AMD_MEMORY]: the amount of memory used by AMD, in bytes. In the
* current version, this is 1.2 * Info [AMD_NZ_A_PLUS_AT] + 9*n
* times the size of an integer. This is at most 2.4nz + 9n. This
* excludes the size of the input arguments Ai, Ap, and P, which have
* a total size of nz + 2*n + 1 integers.
*
* Info [AMD_NCMPA]: the number of garbage collections performed.
*
* Info [AMD_LNZ]: the number of nonzeros in L (excluding the diagonal).
* This is a slight upper bound because mass elimination is combined
* with the approximate degree update. It is a rough upper bound if
* there are many "dense" rows/columns. The rest of the statistics,
* below, are also slight or rough upper bounds, for the same reasons.
* The post-ordering of the assembly tree might also not exactly
* correspond to a true elimination tree postordering.
*
* Info [AMD_NDIV]: the number of divide operations for a subsequent LDL'
* or LU factorization of the permuted matrix A (P,P).
*
* Info [AMD_NMULTSUBS_LDL]: the number of multiply-subtract pairs for a
* subsequent LDL' factorization of A (P,P).
*
* Info [AMD_NMULTSUBS_LU]: the number of multiply-subtract pairs for a
* subsequent LU factorization of A (P,P), assuming that no numerical
* pivoting is required.
*
* Info [AMD_DMAX]: the maximum number of nonzeros in any column of L,
* including the diagonal.
*
* Info [14..19] are not used in the current version, but may be used in
* future versions.
*/
/* ------------------------------------------------------------------------- */
/* direct interface to AMD */
/* ------------------------------------------------------------------------- */
/* amd_2 is the primary AMD ordering routine. It is not meant to be
* user-callable because of its restrictive inputs and because it destroys
* the user's input matrix. It does not check its inputs for errors, either.
* However, if you can work with these restrictions it can be faster than
* amd_order and use less memory (assuming that you can create your own copy
* of the matrix for AMD to destroy). Refer to AMD/Source/amd_2.c for a
* description of each parameter. */
void amd_2
(
int n,
int Pe [ ],
int Iw [ ],
int Len [ ],
int iwlen,
int pfree,
int Nv [ ],
int Next [ ],
int Last [ ],
int Head [ ],
int Elen [ ],
int Degree [ ],
int W [ ],
double Control [ ],
double Info [ ]
) ;
void amd_l2
(
UF_long n,
UF_long Pe [ ],
UF_long Iw [ ],
UF_long Len [ ],
UF_long iwlen,
UF_long pfree,
UF_long Nv [ ],
UF_long Next [ ],
UF_long Last [ ],
UF_long Head [ ],
UF_long Elen [ ],
UF_long Degree [ ],
UF_long W [ ],
double Control [ ],
double Info [ ]
) ;
/* ------------------------------------------------------------------------- */
/* amd_valid */
/* ------------------------------------------------------------------------- */
/* Returns AMD_OK or AMD_OK_BUT_JUMBLED if the matrix is valid as input to
* amd_order; the latter is returned if the matrix has unsorted and/or
* duplicate row indices in one or more columns. Returns AMD_INVALID if the
* matrix cannot be passed to amd_order. For amd_order, the matrix must also
* be square. The first two arguments are the number of rows and the number
* of columns of the matrix. For its use in AMD, these must both equal n.
*
* NOTE: this routine returned TRUE/FALSE in v1.2 and earlier.
*/
int amd_valid
(
int n_row, /* # of rows */
int n_col, /* # of columns */
const int Ap [ ], /* column pointers, of size n_col+1 */
const int Ai [ ] /* row indices, of size Ap [n_col] */
) ;
UF_long amd_l_valid
(
UF_long n_row,
UF_long n_col,
const UF_long Ap [ ],
const UF_long Ai [ ]
) ;
/* ------------------------------------------------------------------------- */
/* AMD memory manager and printf routines */
/* ------------------------------------------------------------------------- */
/* The user can redefine these to change the malloc, free, and printf routines
* that AMD uses. */
#ifndef EXTERN
#define EXTERN extern
#endif
EXTERN void *(*amd_malloc) (size_t) ; /* pointer to malloc */
EXTERN void (*amd_free) (void *) ; /* pointer to free */
EXTERN void *(*amd_realloc) (void *, size_t) ; /* pointer to realloc */
EXTERN void *(*amd_calloc) (size_t, size_t) ; /* pointer to calloc */
EXTERN int (*amd_printf) (const char *, ...) ; /* pointer to printf */
/* ------------------------------------------------------------------------- */
/* AMD Control and Info arrays */
/* ------------------------------------------------------------------------- */
/* amd_defaults: sets the default control settings */
void amd_defaults (double Control [ ]) ;
void amd_l_defaults (double Control [ ]) ;
/* amd_control: prints the control settings */
void amd_control (double Control [ ]) ;
void amd_l_control (double Control [ ]) ;
/* amd_info: prints the statistics */
void amd_info (double Info [ ]) ;
void amd_l_info (double Info [ ]) ;
#define AMD_CONTROL 5 /* size of Control array */
#define AMD_INFO 20 /* size of Info array */
/* contents of Control */
#define AMD_DENSE 0 /* "dense" if degree > Control [0] * sqrt (n) */
#define AMD_AGGRESSIVE 1 /* do aggressive absorption if Control [1] != 0 */
/* default Control settings */
#define AMD_DEFAULT_DENSE 10.0 /* default "dense" degree 10*sqrt(n) */
#define AMD_DEFAULT_AGGRESSIVE 1 /* do aggressive absorption by default */
/* contents of Info */
#define AMD_STATUS 0 /* return value of amd_order and amd_l_order */
#define AMD_N 1 /* A is n-by-n */
#define AMD_NZ 2 /* number of nonzeros in A */
#define AMD_SYMMETRY 3 /* symmetry of pattern (1 is sym., 0 is unsym.) */
#define AMD_NZDIAG 4 /* # of entries on diagonal */
#define AMD_NZ_A_PLUS_AT 5 /* nz in A+A' */
#define AMD_NDENSE 6 /* number of "dense" rows/columns in A */
#define AMD_MEMORY 7 /* amount of memory used by AMD */
#define AMD_NCMPA 8 /* number of garbage collections in AMD */
#define AMD_LNZ 9 /* approx. nz in L, excluding the diagonal */
#define AMD_NDIV 10 /* number of fl. point divides for LU and LDL' */
#define AMD_NMULTSUBS_LDL 11 /* number of fl. point (*,-) pairs for LDL' */
#define AMD_NMULTSUBS_LU 12 /* number of fl. point (*,-) pairs for LU */
#define AMD_DMAX 13 /* max nz. in any column of L, incl. diagonal */
/* ------------------------------------------------------------------------- */
/* return values of AMD */
/* ------------------------------------------------------------------------- */
#define AMD_OK 0 /* success */
#define AMD_OUT_OF_MEMORY -1 /* malloc failed, or problem too large */
#define AMD_INVALID -2 /* input arguments are not valid */
#define AMD_OK_BUT_JUMBLED 1 /* input matrix is OK for amd_order, but
* columns were not sorted, and/or duplicate entries were present. AMD had
* to do extra work before ordering the matrix. This is a warning, not an
* error. */
/* ========================================================================== */
/* === AMD version ========================================================== */
/* ========================================================================== */
/* AMD Version 1.2 and later include the following definitions.
* As an example, to test if the version you are using is 1.2 or later:
*
* #ifdef AMD_VERSION
* if (AMD_VERSION >= AMD_VERSION_CODE (1,2)) ...
* #endif
*
* This also works during compile-time:
*
* #if defined(AMD_VERSION) && (AMD_VERSION >= AMD_VERSION_CODE (1,2))
* printf ("This is version 1.2 or later\n") ;
* #else
* printf ("This is an early version\n") ;
* #endif
*
* Versions 1.1 and earlier of AMD do not include a #define'd version number.
*/
#define AMD_DATE "Dec 7, 2011"
#define AMD_VERSION_CODE(main,sub) ((main) * 1000 + (sub))
#define AMD_MAIN_VERSION 2
#define AMD_SUB_VERSION 2
#define AMD_SUBSUB_VERSION 3
#define AMD_VERSION AMD_VERSION_CODE(AMD_MAIN_VERSION,AMD_SUB_VERSION)
#ifdef __cplusplus
}
#endif
#endif
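
Pulling the declarations above together, here is a minimal end-to-end sketch (not part of the imported sources) of the int version of the interface: take the defaults, order a small 5-by-5 "arrow" pattern, then print the Info statistics. The matrix is illustrative only.

#include <stdio.h>
#include "amd.h"

int main (void)
{
    /* 5-by-5 arrow pattern in compressed-column form: column pointers Ap,
     * sorted row indices Ai, no duplicates, Ap [0] = 0 */
    int Ap [ ] = { 0, 5, 7, 9, 11, 13 } ;
    int Ai [ ] = { 0, 1, 2, 3, 4,   0, 1,   0, 2,   0, 3,   0, 4 } ;
    int P [5] ;
    double Control [AMD_CONTROL], Info [AMD_INFO] ;
    int k, status ;

    amd_defaults (Control) ;    /* AMD_DENSE = 10, aggressive absorption on */
    amd_control (Control) ;     /* optionally print the settings */

    status = amd_order (5, Ap, Ai, P, Control, Info) ;
    if (status < AMD_OK)        /* AMD_INVALID or AMD_OUT_OF_MEMORY */
    {
        printf ("amd_order failed, status %d\n", status) ;
        return (1) ;
    }
    for (k = 0 ; k < 5 ; k++)
    {
        printf ("P [%d] = %d\n", k, P [k]) ; /* row/column P[k] is the kth pivot */
    }
    amd_info (Info) ;           /* fill-in and flop estimates for A(P,P) */
    return (0) ;
}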

181
src/maths/KLU/amd_1.c Normal file
View File

@@ -0,0 +1,181 @@
/* ========================================================================= */
/* === AMD_1 =============================================================== */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* AMD_1: Construct A+A' for a sparse matrix A and perform the AMD ordering.
*
* The n-by-n sparse matrix A can be unsymmetric. It is stored in MATLAB-style
* compressed-column form, with sorted row indices in each column, and no
* duplicate entries. Diagonal entries may be present, but they are ignored.
* Row indices of column j of A are stored in Ai [Ap [j] ... Ap [j+1]-1].
* Ap [0] must be zero, and nz = Ap [n] is the number of entries in A. The
* size of the matrix, n, must be greater than or equal to zero.
*
* This routine must be preceded by a call to AMD_aat, which computes the
* number of entries in each row/column in A+A', excluding the diagonal.
* Len [j], on input, is the number of entries in row/column j of A+A'. This
* routine constructs the matrix A+A' and then calls AMD_2. No error checking
* is performed (this was done in AMD_valid).
*/
#include "amd_internal.h"
GLOBAL void AMD_1
(
Int n, /* n > 0 */
const Int Ap [ ], /* input of size n+1, not modified */
const Int Ai [ ], /* input of size nz = Ap [n], not modified */
Int P [ ], /* size n output permutation */
Int Pinv [ ], /* size n output inverse permutation */
Int Len [ ], /* size n input, undefined on output */
Int slen, /* slen >= sum (Len [0..n-1]) + 7n,
* ideally slen = 1.2 * sum (Len) + 8n */
Int S [ ], /* size slen workspace */
double Control [ ], /* input array of size AMD_CONTROL */
double Info [ ] /* output array of size AMD_INFO */
)
{
Int i, j, k, p, pfree, iwlen, pj, p1, p2, pj2, *Iw, *Pe, *Nv, *Head,
*Elen, *Degree, *s, *W, *Sp, *Tp ;
/* --------------------------------------------------------------------- */
/* construct the matrix for AMD_2 */
/* --------------------------------------------------------------------- */
ASSERT (n > 0) ;
iwlen = slen - 6*n ;
s = S ;
Pe = s ; s += n ;
Nv = s ; s += n ;
Head = s ; s += n ;
Elen = s ; s += n ;
Degree = s ; s += n ;
W = s ; s += n ;
Iw = s ; s += iwlen ;
ASSERT (AMD_valid (n, n, Ap, Ai) == AMD_OK) ;
/* construct the pointers for A+A' */
Sp = Nv ; /* use Nv and W as workspace for Sp and Tp [ */
Tp = W ;
pfree = 0 ;
for (j = 0 ; j < n ; j++)
{
Pe [j] = pfree ;
Sp [j] = pfree ;
pfree += Len [j] ;
}
/* Note that this restriction on iwlen is slightly more restrictive than
* what is strictly required in AMD_2. AMD_2 can operate with no elbow
* room at all, but it will be very slow. For better performance, at
* least size-n elbow room is enforced. */
ASSERT (iwlen >= pfree + n) ;
#ifndef NDEBUG
for (p = 0 ; p < iwlen ; p++) Iw [p] = EMPTY ;
#endif
for (k = 0 ; k < n ; k++)
{
AMD_DEBUG1 (("Construct row/column k= "ID" of A+A'\n", k)) ;
p1 = Ap [k] ;
p2 = Ap [k+1] ;
/* construct A+A' */
for (p = p1 ; p < p2 ; )
{
/* scan the upper triangular part of A */
j = Ai [p] ;
ASSERT (j >= 0 && j < n) ;
if (j < k)
{
/* entry A (j,k) in the strictly upper triangular part */
ASSERT (Sp [j] < (j == n-1 ? pfree : Pe [j+1])) ;
ASSERT (Sp [k] < (k == n-1 ? pfree : Pe [k+1])) ;
Iw [Sp [j]++] = k ;
Iw [Sp [k]++] = j ;
p++ ;
}
else if (j == k)
{
/* skip the diagonal */
p++ ;
break ;
}
else /* j > k */
{
/* first entry below the diagonal */
break ;
}
/* scan lower triangular part of A, in column j until reaching
* row k. Start where last scan left off. */
ASSERT (Ap [j] <= Tp [j] && Tp [j] <= Ap [j+1]) ;
pj2 = Ap [j+1] ;
for (pj = Tp [j] ; pj < pj2 ; )
{
i = Ai [pj] ;
ASSERT (i >= 0 && i < n) ;
if (i < k)
{
/* A (i,j) is only in the lower part, not in upper */
ASSERT (Sp [i] < (i == n-1 ? pfree : Pe [i+1])) ;
ASSERT (Sp [j] < (j == n-1 ? pfree : Pe [j+1])) ;
Iw [Sp [i]++] = j ;
Iw [Sp [j]++] = i ;
pj++ ;
}
else if (i == k)
{
/* entry A (k,j) in lower part and A (j,k) in upper */
pj++ ;
break ;
}
else /* i > k */
{
/* consider this entry later, when k advances to i */
break ;
}
}
Tp [j] = pj ;
}
Tp [k] = p ;
}
/* clean up, for remaining mismatched entries */
for (j = 0 ; j < n ; j++)
{
for (pj = Tp [j] ; pj < Ap [j+1] ; pj++)
{
i = Ai [pj] ;
ASSERT (i >= 0 && i < n) ;
/* A (i,j) is only in the lower part, not in upper */
ASSERT (Sp [i] < (i == n-1 ? pfree : Pe [i+1])) ;
ASSERT (Sp [j] < (j == n-1 ? pfree : Pe [j+1])) ;
Iw [Sp [i]++] = j ;
Iw [Sp [j]++] = i ;
}
}
#ifndef NDEBUG
for (j = 0 ; j < n-1 ; j++) ASSERT (Sp [j] == Pe [j+1]) ;
ASSERT (Sp [n-1] == pfree) ;
#endif
/* Tp and Sp no longer needed ] */
/* --------------------------------------------------------------------- */
/* order the matrix */
/* --------------------------------------------------------------------- */
AMD_2 (n, Pe, Iw, Len, iwlen, pfree,
Nv, Pinv, P, Head, Elen, Degree, W, Control, Info) ;
}

1842
src/maths/KLU/amd_2.c Normal file

File diff suppressed because it is too large

185
src/maths/KLU/amd_aat.c Normal file

@@ -0,0 +1,185 @@
/* ========================================================================= */
/* === AMD_aat ============================================================= */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* AMD_aat: compute the symmetry of the pattern of A, and count the number of
 * nonzeros in each column of A+A' (excluding the diagonal). Assumes the input
* matrix has no errors, with sorted columns and no duplicates
* (AMD_valid (n, n, Ap, Ai) must be AMD_OK, but this condition is not
* checked).
*/
#include "amd_internal.h"
GLOBAL size_t AMD_aat /* returns nz in A+A' */
(
Int n,
const Int Ap [ ],
const Int Ai [ ],
Int Len [ ], /* Len [j]: length of column j of A+A', excl diagonal*/
Int Tp [ ], /* workspace of size n */
double Info [ ]
)
{
Int p1, p2, p, i, j, pj, pj2, k, nzdiag, nzboth, nz ;
double sym ;
size_t nzaat ;
#ifndef NDEBUG
AMD_debug_init ("AMD AAT") ;
for (k = 0 ; k < n ; k++) Tp [k] = EMPTY ;
ASSERT (AMD_valid (n, n, Ap, Ai) == AMD_OK) ;
#endif
if (Info != (double *) NULL)
{
/* clear the Info array, if it exists */
for (i = 0 ; i < AMD_INFO ; i++)
{
Info [i] = EMPTY ;
}
Info [AMD_STATUS] = AMD_OK ;
}
for (k = 0 ; k < n ; k++)
{
Len [k] = 0 ;
}
nzdiag = 0 ;
nzboth = 0 ;
nz = Ap [n] ;
for (k = 0 ; k < n ; k++)
{
p1 = Ap [k] ;
p2 = Ap [k+1] ;
AMD_DEBUG2 (("\nAAT Column: "ID" p1: "ID" p2: "ID"\n", k, p1, p2)) ;
/* construct A+A' */
for (p = p1 ; p < p2 ; )
{
/* scan the upper triangular part of A */
j = Ai [p] ;
if (j < k)
{
/* entry A (j,k) is in the strictly upper triangular part,
* add both A (j,k) and A (k,j) to the matrix A+A' */
Len [j]++ ;
Len [k]++ ;
AMD_DEBUG3 ((" upper ("ID","ID") ("ID","ID")\n", j,k, k,j));
p++ ;
}
else if (j == k)
{
/* skip the diagonal */
p++ ;
nzdiag++ ;
break ;
}
else /* j > k */
{
/* first entry below the diagonal */
break ;
}
/* scan lower triangular part of A, in column j until reaching
* row k. Start where last scan left off. */
ASSERT (Tp [j] != EMPTY) ;
ASSERT (Ap [j] <= Tp [j] && Tp [j] <= Ap [j+1]) ;
pj2 = Ap [j+1] ;
for (pj = Tp [j] ; pj < pj2 ; )
{
i = Ai [pj] ;
if (i < k)
{
/* A (i,j) is only in the lower part, not in upper.
* add both A (i,j) and A (j,i) to the matrix A+A' */
Len [i]++ ;
Len [j]++ ;
AMD_DEBUG3 ((" lower ("ID","ID") ("ID","ID")\n",
i,j, j,i)) ;
pj++ ;
}
else if (i == k)
{
/* entry A (k,j) in lower part and A (j,k) in upper */
pj++ ;
nzboth++ ;
break ;
}
else /* i > k */
{
/* consider this entry later, when k advances to i */
break ;
}
}
Tp [j] = pj ;
}
/* Tp [k] points to the entry just below the diagonal in column k */
Tp [k] = p ;
}
/* clean up, for remaining mismatched entries */
for (j = 0 ; j < n ; j++)
{
for (pj = Tp [j] ; pj < Ap [j+1] ; pj++)
{
i = Ai [pj] ;
/* A (i,j) is only in the lower part, not in upper.
* add both A (i,j) and A (j,i) to the matrix A+A' */
Len [i]++ ;
Len [j]++ ;
AMD_DEBUG3 ((" lower cleanup ("ID","ID") ("ID","ID")\n",
i,j, j,i)) ;
}
}
/* --------------------------------------------------------------------- */
/* compute the symmetry of the nonzero pattern of A */
/* --------------------------------------------------------------------- */
/* Given a matrix A, the symmetry of A is:
* B = tril (spones (A), -1) + triu (spones (A), 1) ;
* sym = nnz (B & B') / nnz (B) ;
* or 1 if nnz (B) is zero.
*/
if (nz == nzdiag)
{
sym = 1 ;
}
else
{
sym = (2 * (double) nzboth) / ((double) (nz - nzdiag)) ;
}
nzaat = 0 ;
for (k = 0 ; k < n ; k++)
{
nzaat += Len [k] ;
}
AMD_DEBUG1 (("AMD nz in A+A', excluding diagonal (nzaat) = %g\n",
(double) nzaat)) ;
AMD_DEBUG1 ((" nzboth: "ID" nz: "ID" nzdiag: "ID" symmetry: %g\n",
nzboth, nz, nzdiag, sym)) ;
if (Info != (double *) NULL)
{
Info [AMD_STATUS] = AMD_OK ;
Info [AMD_N] = n ;
Info [AMD_NZ] = nz ;
Info [AMD_SYMMETRY] = sym ; /* symmetry of pattern of A */
Info [AMD_NZDIAG] = nzdiag ; /* nonzeros on diagonal of A */
Info [AMD_NZ_A_PLUS_AT] = nzaat ; /* nonzeros in A+A' */
}
return (nzaat) ;
}
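
To make the Len / symmetry outputs concrete, here is a small worked sketch (not part of the imported sources) that calls the internal AMD_aat on a 3-by-3 unsymmetric pattern. It includes amd_internal.h, which ordinary applications should not do; user code reaches this routine indirectly through amd_order.

#include <stdio.h>
#include "amd_internal.h"

int main (void)
{
    /* pattern   [ x . x ]
     *           [ x x . ]   in compressed-column form
     *           [ . . x ]                                               */
    Int Ap [ ] = { 0, 2, 3, 5 } ;
    Int Ai [ ] = { 0, 1,   1,   0, 2 } ;
    Int Len [3], Tp [3] ;
    double Info [AMD_INFO] ;
    size_t nzaat = AMD_aat (3, Ap, Ai, Len, Tp, Info) ;

    /* nz = 5, nzdiag = 3, and no off-diagonal entry has a transposed match,
     * so nzboth = 0 and sym = 2*0 / (5-3) = 0.  A+A' (excluding the
     * diagonal) holds (1,0),(0,1),(2,0),(0,2), so Len = {2,1,1}, nzaat = 4 */
    printf ("nzaat = %g  symmetry = %g\n", (double) nzaat,
        Info [AMD_SYMMETRY]) ;
    return (0) ;
}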

64
src/maths/KLU/amd_control.c Normal file

@@ -0,0 +1,64 @@
/* ========================================================================= */
/* === AMD_control ========================================================= */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* User-callable. Prints the control parameters for AMD. See amd.h
* for details. If the Control array is not present, the defaults are
* printed instead.
*/
#include "amd_internal.h"
GLOBAL void AMD_control
(
double Control [ ]
)
{
double alpha ;
Int aggressive ;
if (Control != (double *) NULL)
{
alpha = Control [AMD_DENSE] ;
aggressive = Control [AMD_AGGRESSIVE] != 0 ;
}
else
{
alpha = AMD_DEFAULT_DENSE ;
aggressive = AMD_DEFAULT_AGGRESSIVE ;
}
PRINTF (("\nAMD version %d.%d.%d, %s: approximate minimum degree ordering\n"
" dense row parameter: %g\n", AMD_MAIN_VERSION, AMD_SUB_VERSION,
AMD_SUBSUB_VERSION, AMD_DATE, alpha)) ;
if (alpha < 0)
{
PRINTF ((" no rows treated as dense\n")) ;
}
else
{
PRINTF ((
" (rows with more than max (%g * sqrt (n), 16) entries are\n"
" considered \"dense\", and placed last in output permutation)\n",
alpha)) ;
}
if (aggressive)
{
PRINTF ((" aggressive absorption: yes\n")) ;
}
else
{
PRINTF ((" aggressive absorption: no\n")) ;
}
PRINTF ((" size of AMD integer: %d\n\n", sizeof (Int))) ;
}

38
src/maths/KLU/amd_defaults.c Normal file

@@ -0,0 +1,38 @@
/* ========================================================================= */
/* === AMD_defaults ======================================================== */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* User-callable. Sets default control parameters for AMD. See amd.h
* for details.
*/
#include "amd_internal.h"
/* ========================================================================= */
/* === AMD defaults ======================================================== */
/* ========================================================================= */
GLOBAL void AMD_defaults
(
double Control [ ]
)
{
Int i ;
if (Control != (double *) NULL)
{
for (i = 0 ; i < AMD_CONTROL ; i++)
{
Control [i] = 0 ;
}
Control [AMD_DENSE] = AMD_DEFAULT_DENSE ;
Control [AMD_AGGRESSIVE] = AMD_DEFAULT_AGGRESSIVE ;
}
}
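
The defaults set above can be adjusted per call, as documented in amd.h. A short sketch (not part of the imported sources) with illustrative values:

#include <stdio.h>
#include "amd.h"

int main (void)
{
    double Control [AMD_CONTROL] ;

    amd_defaults (Control) ;        /* AMD_DENSE = 10.0, AMD_AGGRESSIVE = 1 */
    Control [AMD_DENSE] = 20.0 ;    /* "dense" only above 20*sqrt(n) entries */
    Control [AMD_AGGRESSIVE] = 0 ;  /* disable aggressive absorption */
    amd_control (Control) ;         /* print the modified settings */
    /* pass Control to amd_order as usual */
    return (0) ;
}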

180
src/maths/KLU/amd_dump.c Normal file

@@ -0,0 +1,180 @@
/* ========================================================================= */
/* === AMD_dump ============================================================ */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* Debugging routines for AMD. Not used if NDEBUG is defined at compile-
 * time (the default). See comments in amd_internal.h on how to enable
* debugging. Not user-callable.
*/
#include "amd_internal.h"
#ifndef NDEBUG
/* This global variable is present only when debugging */
GLOBAL Int AMD_debug = -999 ; /* default is no debug printing */
/* ========================================================================= */
/* === AMD_debug_init ====================================================== */
/* ========================================================================= */
/* Sets the debug print level, by reading the file debug.amd (if it exists) */
GLOBAL void AMD_debug_init ( char *s )
{
FILE *f ;
f = fopen ("debug.amd", "r") ;
if (f == (FILE *) NULL)
{
AMD_debug = -999 ;
}
else
{
fscanf (f, ID, &AMD_debug) ;
fclose (f) ;
}
if (AMD_debug >= 0)
{
printf ("%s: AMD_debug_init, D= "ID"\n", s, AMD_debug) ;
}
}
/* ========================================================================= */
/* === AMD_dump ============================================================ */
/* ========================================================================= */
/* Dump AMD's data structure, except for the hash buckets. This routine
* cannot be called when the hash buckets are non-empty.
*/
GLOBAL void AMD_dump (
Int n, /* A is n-by-n */
Int Pe [ ], /* pe [0..n-1]: index in iw of start of row i */
Int Iw [ ], /* workspace of size iwlen, iwlen [0..pfree-1]
* holds the matrix on input */
Int Len [ ], /* len [0..n-1]: length for row i */
Int iwlen, /* length of iw */
Int pfree, /* iw [pfree ... iwlen-1] is empty on input */
Int Nv [ ], /* nv [0..n-1] */
Int Next [ ], /* next [0..n-1] */
Int Last [ ], /* last [0..n-1] */
Int Head [ ], /* head [0..n-1] */
Int Elen [ ], /* size n */
Int Degree [ ], /* size n */
Int W [ ], /* size n */
Int nel
)
{
Int i, pe, elen, nv, len, e, p, k, j, deg, w, cnt, ilast ;
if (AMD_debug < 0) return ;
ASSERT (pfree <= iwlen) ;
AMD_DEBUG3 (("\nAMD dump, pfree: "ID"\n", pfree)) ;
for (i = 0 ; i < n ; i++)
{
pe = Pe [i] ;
elen = Elen [i] ;
nv = Nv [i] ;
len = Len [i] ;
w = W [i] ;
if (elen >= EMPTY)
{
if (nv == 0)
{
AMD_DEBUG3 (("\nI "ID": nonprincipal: ", i)) ;
ASSERT (elen == EMPTY) ;
if (pe == EMPTY)
{
AMD_DEBUG3 ((" dense node\n")) ;
ASSERT (w == 1) ;
}
else
{
ASSERT (pe < EMPTY) ;
AMD_DEBUG3 ((" i "ID" -> parent "ID"\n", i, FLIP (Pe[i])));
}
}
else
{
AMD_DEBUG3 (("\nI "ID": active principal supervariable:\n",i));
AMD_DEBUG3 ((" nv(i): "ID" Flag: %d\n", nv, (nv < 0))) ;
ASSERT (elen >= 0) ;
ASSERT (nv > 0 && pe >= 0) ;
p = pe ;
AMD_DEBUG3 ((" e/s: ")) ;
if (elen == 0) AMD_DEBUG3 ((" : ")) ;
ASSERT (pe + len <= pfree) ;
for (k = 0 ; k < len ; k++)
{
j = Iw [p] ;
AMD_DEBUG3 ((" "ID"", j)) ;
ASSERT (j >= 0 && j < n) ;
if (k == elen-1) AMD_DEBUG3 ((" : ")) ;
p++ ;
}
AMD_DEBUG3 (("\n")) ;
}
}
else
{
e = i ;
if (w == 0)
{
AMD_DEBUG3 (("\nE "ID": absorbed element: w "ID"\n", e, w)) ;
ASSERT (nv > 0 && pe < 0) ;
AMD_DEBUG3 ((" e "ID" -> parent "ID"\n", e, FLIP (Pe [e]))) ;
}
else
{
AMD_DEBUG3 (("\nE "ID": unabsorbed element: w "ID"\n", e, w)) ;
ASSERT (nv > 0 && pe >= 0) ;
p = pe ;
AMD_DEBUG3 ((" : ")) ;
ASSERT (pe + len <= pfree) ;
for (k = 0 ; k < len ; k++)
{
j = Iw [p] ;
AMD_DEBUG3 ((" "ID"", j)) ;
ASSERT (j >= 0 && j < n) ;
p++ ;
}
AMD_DEBUG3 (("\n")) ;
}
}
}
/* this routine cannot be called when the hash buckets are non-empty */
AMD_DEBUG3 (("\nDegree lists:\n")) ;
if (nel >= 0)
{
cnt = 0 ;
for (deg = 0 ; deg < n ; deg++)
{
if (Head [deg] == EMPTY) continue ;
ilast = EMPTY ;
AMD_DEBUG3 ((ID": \n", deg)) ;
for (i = Head [deg] ; i != EMPTY ; i = Next [i])
{
AMD_DEBUG3 ((" "ID" : next "ID" last "ID" deg "ID"\n",
i, Next [i], Last [i], Degree [i])) ;
ASSERT (i >= 0 && i < n && ilast == Last [i] &&
deg == Degree [i]) ;
cnt += Nv [i] ;
ilast = i ;
}
AMD_DEBUG3 (("\n")) ;
}
ASSERT (cnt == n - nel) ;
}
}
#endif

84
src/maths/KLU/amd_global.c Normal file

@@ -0,0 +1,84 @@
/* ========================================================================= */
/* === amd_global ========================================================== */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
#include <stdlib.h>
#ifdef MATLAB_MEX_FILE
#include "mex.h"
#include "matrix.h"
#endif
#ifndef NULL
#define NULL 0
#endif
/* ========================================================================= */
/* === Default AMD memory manager ========================================== */
/* ========================================================================= */
/* The user can redefine these global pointers at run-time to change the memory
* manager used by AMD. AMD only uses malloc and free; realloc and calloc are
 * included for completeness, in case another package wants to use the same
* memory manager as AMD.
*
* If compiling as a MATLAB mexFunction, the default memory manager is mxMalloc.
* You can also compile AMD as a standard ANSI-C library and link a mexFunction
* against it, and then redefine these pointers at run-time, in your
* mexFunction.
*
* If -DNMALLOC is defined at compile-time, no memory manager is specified at
* compile-time. You must then define these functions at run-time, before
* calling AMD, for AMD to work properly.
*/
#ifndef NMALLOC
#ifdef MATLAB_MEX_FILE
/* MATLAB mexFunction: */
void *(*amd_malloc) (size_t) = mxMalloc ;
void (*amd_free) (void *) = mxFree ;
void *(*amd_realloc) (void *, size_t) = mxRealloc ;
void *(*amd_calloc) (size_t, size_t) = mxCalloc ;
#else
/* standard ANSI-C: */
void *(*amd_malloc) (size_t) = malloc ;
void (*amd_free) (void *) = free ;
void *(*amd_realloc) (void *, size_t) = realloc ;
void *(*amd_calloc) (size_t, size_t) = calloc ;
#endif
#else
/* no memory manager defined at compile-time; you MUST define one at run-time */
void *(*amd_malloc) (size_t) = NULL ;
void (*amd_free) (void *) = NULL ;
void *(*amd_realloc) (void *, size_t) = NULL ;
void *(*amd_calloc) (size_t, size_t) = NULL ;
#endif
/* ========================================================================= */
/* === Default AMD printf routine ========================================== */
/* ========================================================================= */
/* The user can redefine this global pointer at run-time to change the printf
* routine used by AMD. If NULL, no printing occurs.
*
* If -DNPRINT is defined at compile-time, stdio.h is not included. Printing
* can then be enabled at run-time by setting amd_printf to a non-NULL function.
*/
#ifndef NPRINT
#ifdef MATLAB_MEX_FILE
int (*amd_printf) (const char *, ...) = mexPrintf ;
#else
#include <stdio.h>
int (*amd_printf) (const char *, ...) = printf ;
#endif
#else
int (*amd_printf) (const char *, ...) = NULL ;
#endif
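
Because amd_malloc, amd_free, amd_realloc, amd_calloc, and amd_printf are ordinary global function pointers, a host application (ngspice, for instance) can redirect them before calling any AMD routine. A small sketch (not part of the imported sources); the counting wrappers are hypothetical:

#include <stdio.h>
#include <stdlib.h>
#include "amd.h"

static size_t n_alloc = 0 ;     /* hypothetical allocation counter */

static void *counting_malloc (size_t size)
{
    n_alloc++ ;
    return (malloc (size)) ;
}

static void counting_free (void *p)
{
    free (p) ;
}

int main (void)
{
    amd_malloc = counting_malloc ;  /* AMD itself only calls malloc and free */
    amd_free   = counting_free ;
    amd_printf = printf ;           /* or NULL to silence amd_control/amd_info */

    /* ... call amd_order here ... */

    printf ("AMD made %lu allocations\n", (unsigned long) n_alloc) ;
    return (0) ;
}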

120
src/maths/KLU/amd_info.c Normal file

@@ -0,0 +1,120 @@
/* ========================================================================= */
/* === AMD_info ============================================================ */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* User-callable. Prints the output statistics for AMD. See amd.h
* for details. If the Info array is not present, nothing is printed.
*/
#include "amd_internal.h"
#define PRI(format,x) { if (x >= 0) { PRINTF ((format, x)) ; }}
GLOBAL void AMD_info
(
double Info [ ]
)
{
double n, ndiv, nmultsubs_ldl, nmultsubs_lu, lnz, lnzd ;
PRINTF (("\nAMD version %d.%d.%d, %s, results:\n",
AMD_MAIN_VERSION, AMD_SUB_VERSION, AMD_SUBSUB_VERSION, AMD_DATE)) ;
if (!Info)
{
return ;
}
n = Info [AMD_N] ;
ndiv = Info [AMD_NDIV] ;
nmultsubs_ldl = Info [AMD_NMULTSUBS_LDL] ;
nmultsubs_lu = Info [AMD_NMULTSUBS_LU] ;
lnz = Info [AMD_LNZ] ;
lnzd = (n >= 0 && lnz >= 0) ? (n + lnz) : (-1) ;
/* AMD return status */
PRINTF ((" status: ")) ;
if (Info [AMD_STATUS] == AMD_OK)
{
PRINTF (("OK\n")) ;
}
else if (Info [AMD_STATUS] == AMD_OUT_OF_MEMORY)
{
PRINTF (("out of memory\n")) ;
}
else if (Info [AMD_STATUS] == AMD_INVALID)
{
PRINTF (("invalid matrix\n")) ;
}
else if (Info [AMD_STATUS] == AMD_OK_BUT_JUMBLED)
{
PRINTF (("OK, but jumbled\n")) ;
}
else
{
PRINTF (("unknown\n")) ;
}
/* statistics about the input matrix */
PRI (" n, dimension of A: %.20g\n", n);
PRI (" nz, number of nonzeros in A: %.20g\n",
Info [AMD_NZ]) ;
PRI (" symmetry of A: %.4f\n",
Info [AMD_SYMMETRY]) ;
PRI (" number of nonzeros on diagonal: %.20g\n",
Info [AMD_NZDIAG]) ;
PRI (" nonzeros in pattern of A+A' (excl. diagonal): %.20g\n",
Info [AMD_NZ_A_PLUS_AT]) ;
PRI (" # dense rows/columns of A+A': %.20g\n",
Info [AMD_NDENSE]) ;
/* statistics about AMD's behavior */
PRI (" memory used, in bytes: %.20g\n",
Info [AMD_MEMORY]) ;
PRI (" # of memory compactions: %.20g\n",
Info [AMD_NCMPA]) ;
/* statistics about the ordering quality */
PRINTF (("\n"
" The following approximate statistics are for a subsequent\n"
" factorization of A(P,P) + A(P,P)'. They are slight upper\n"
" bounds if there are no dense rows/columns in A+A', and become\n"
" looser if dense rows/columns exist.\n\n")) ;
PRI (" nonzeros in L (excluding diagonal): %.20g\n",
lnz) ;
PRI (" nonzeros in L (including diagonal): %.20g\n",
lnzd) ;
PRI (" # divide operations for LDL' or LU: %.20g\n",
ndiv) ;
PRI (" # multiply-subtract operations for LDL': %.20g\n",
nmultsubs_ldl) ;
PRI (" # multiply-subtract operations for LU: %.20g\n",
nmultsubs_lu) ;
PRI (" max nz. in any column of L (incl. diagonal): %.20g\n",
Info [AMD_DMAX]) ;
/* total flop counts for various factorizations */
if (n >= 0 && ndiv >= 0 && nmultsubs_ldl >= 0 && nmultsubs_lu >= 0)
{
PRINTF (("\n"
" chol flop count for real A, sqrt counted as 1 flop: %.20g\n"
" LDL' flop count for real A: %.20g\n"
" LDL' flop count for complex A: %.20g\n"
" LU flop count for real A (with no pivoting): %.20g\n"
" LU flop count for complex A (with no pivoting): %.20g\n\n",
n + ndiv + 2*nmultsubs_ldl,
ndiv + 2*nmultsubs_ldl,
9*ndiv + 8*nmultsubs_ldl,
ndiv + 2*nmultsubs_lu,
9*ndiv + 8*nmultsubs_lu)) ;
}
}

350
src/maths/KLU/amd_internal.h Normal file

@@ -0,0 +1,350 @@
/* ========================================================================= */
/* === amd_internal.h ====================================================== */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* This file is for internal use in AMD itself, and does not normally need to
* be included in user code (it is included in UMFPACK, however). All others
* should use amd.h instead.
*
* The following compile-time definitions affect how AMD is compiled.
*
* -DNPRINT
*
* Disable all printing. stdio.h will not be included. Printing can
* be re-enabled at run-time by setting the global pointer amd_printf
* to printf (or mexPrintf for a MATLAB mexFunction).
*
* -DNMALLOC
*
* No memory manager is defined at compile-time. You MUST define the
* function pointers amd_malloc, amd_free, amd_realloc, and
* amd_calloc at run-time for AMD to work properly.
*/
/* ========================================================================= */
/* === NDEBUG ============================================================== */
/* ========================================================================= */
/*
* Turning on debugging takes some work (see below). If you do not edit this
* file, then debugging is always turned off, regardless of whether or not
* -DNDEBUG is specified in your compiler options.
*
* If AMD is being compiled as a mexFunction, then MATLAB_MEX_FILE is defined,
* and mxAssert is used instead of assert. If debugging is not enabled, no
* MATLAB include files or functions are used. Thus, the AMD library libamd.a
* can be safely used in either a stand-alone C program or in another
* mexFunction, without any change.
*/
/*
AMD will be exceedingly slow when running in debug mode. The next three
lines ensure that debugging is turned off.
*/
#ifndef NDEBUG
#define NDEBUG
#endif
/*
To enable debugging, uncomment the following line:
#undef NDEBUG
*/
/* ------------------------------------------------------------------------- */
/* ANSI include files */
/* ------------------------------------------------------------------------- */
/* from stdlib.h: size_t, malloc, free, realloc, and calloc */
#include <stdlib.h>
#if !defined(NPRINT) || !defined(NDEBUG)
/* from stdio.h: printf. Not included if NPRINT is defined at compile time.
* fopen and fscanf are used when debugging. */
#include <stdio.h>
#endif
/* from limits.h: INT_MAX and LONG_MAX */
#include <limits.h>
/* from math.h: sqrt */
#include <math.h>
/* ------------------------------------------------------------------------- */
/* MATLAB include files (only if being used in or via MATLAB) */
/* ------------------------------------------------------------------------- */
#ifdef MATLAB_MEX_FILE
#include "matrix.h"
#include "mex.h"
#endif
/* ------------------------------------------------------------------------- */
/* basic definitions */
/* ------------------------------------------------------------------------- */
#ifdef FLIP
#undef FLIP
#endif
#ifdef MAX
#undef MAX
#endif
#ifdef MIN
#undef MIN
#endif
#ifdef EMPTY
#undef EMPTY
#endif
#ifdef GLOBAL
#undef GLOBAL
#endif
#ifdef PRIVATE
#undef PRIVATE
#endif
/* FLIP is a "negation about -1", and is used to mark an integer i that is
* normally non-negative. FLIP (EMPTY) is EMPTY. FLIP of a number > EMPTY
 * is negative, and FLIP of a number < EMPTY is positive. FLIP (FLIP (i)) = i
* for all integers i. UNFLIP (i) is >= EMPTY. */
#define EMPTY (-1)
#define FLIP(i) (-(i)-2)
#define UNFLIP(i) ((i < EMPTY) ? FLIP (i) : (i))
/* for integer MAX/MIN, or for doubles when we don't care how NaN's behave: */
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
/* logical expression of p implies q: */
#define IMPLIES(p,q) (!(p) || (q))
/* Note that the IBM RS 6000 xlc predefines TRUE and FALSE in <types.h>. */
/* The Compaq Alpha also predefines TRUE and FALSE. */
#ifdef TRUE
#undef TRUE
#endif
#ifdef FALSE
#undef FALSE
#endif
#define TRUE (1)
#define FALSE (0)
#define PRIVATE static
#define GLOBAL
#define EMPTY (-1)
/* Note that Linux's gcc 2.96 defines NULL as ((void *) 0), but other */
/* compilers (even gcc 2.95.2 on Solaris) define NULL as 0 or (0). We */
/* need to use the ANSI standard value of 0. */
#ifdef NULL
#undef NULL
#endif
#define NULL 0
/* largest value of size_t */
#ifndef SIZE_T_MAX
#define SIZE_T_MAX ((size_t) (-1))
#endif
/* ------------------------------------------------------------------------- */
/* integer type for AMD: int or UF_long */
/* ------------------------------------------------------------------------- */
/* define UF_long */
#include "UFconfig.h"
#if defined (DLONG) || defined (ZLONG)
#define Int UF_long
#define ID UF_long_id
#define Int_MAX UF_long_max
#define AMD_order amd_l_order
#define AMD_defaults amd_l_defaults
#define AMD_control amd_l_control
#define AMD_info amd_l_info
#define AMD_1 amd_l1
#define AMD_2 amd_l2
#define AMD_valid amd_l_valid
#define AMD_aat amd_l_aat
#define AMD_postorder amd_l_postorder
#define AMD_post_tree amd_l_post_tree
#define AMD_dump amd_l_dump
#define AMD_debug amd_l_debug
#define AMD_debug_init amd_l_debug_init
#define AMD_preprocess amd_l_preprocess
#else
#define Int int
#define ID "%d"
#define Int_MAX INT_MAX
#define AMD_order amd_order
#define AMD_defaults amd_defaults
#define AMD_control amd_control
#define AMD_info amd_info
#define AMD_1 amd_1
#define AMD_2 amd_2
#define AMD_valid amd_valid
#define AMD_aat amd_aat
#define AMD_postorder amd_postorder
#define AMD_post_tree amd_post_tree
#define AMD_dump amd_dump
#define AMD_debug amd_debug
#define AMD_debug_init amd_debug_init
#define AMD_preprocess amd_preprocess
#endif
/* ========================================================================= */
/* === PRINTF macro ======================================================== */
/* ========================================================================= */
/* All output goes through the PRINTF macro. */
#define PRINTF(params) { if (amd_printf != NULL) (void) amd_printf params ; }
/* ------------------------------------------------------------------------- */
/* AMD routine definitions (user-callable) */
/* ------------------------------------------------------------------------- */
#include "amd.h"
/* ------------------------------------------------------------------------- */
/* AMD routine definitions (not user-callable) */
/* ------------------------------------------------------------------------- */
GLOBAL size_t AMD_aat
(
Int n,
const Int Ap [ ],
const Int Ai [ ],
Int Len [ ],
Int Tp [ ],
double Info [ ]
) ;
GLOBAL void AMD_1
(
Int n,
const Int Ap [ ],
const Int Ai [ ],
Int P [ ],
Int Pinv [ ],
Int Len [ ],
Int slen,
Int S [ ],
double Control [ ],
double Info [ ]
) ;
GLOBAL void AMD_postorder
(
Int nn,
Int Parent [ ],
Int Npiv [ ],
Int Fsize [ ],
Int Order [ ],
Int Child [ ],
Int Sibling [ ],
Int Stack [ ]
) ;
GLOBAL Int AMD_post_tree
(
Int root,
Int k,
Int Child [ ],
const Int Sibling [ ],
Int Order [ ],
Int Stack [ ]
#ifndef NDEBUG
, Int nn
#endif
) ;
GLOBAL void AMD_preprocess
(
Int n,
const Int Ap [ ],
const Int Ai [ ],
Int Rp [ ],
Int Ri [ ],
Int W [ ],
Int Flag [ ]
) ;
/* ------------------------------------------------------------------------- */
/* debugging definitions */
/* ------------------------------------------------------------------------- */
#ifndef NDEBUG
/* from assert.h: assert macro */
#include <assert.h>
#ifndef EXTERN
#define EXTERN extern
#endif
EXTERN Int AMD_debug ;
GLOBAL void AMD_debug_init ( char *s ) ;
GLOBAL void AMD_dump
(
Int n,
Int Pe [ ],
Int Iw [ ],
Int Len [ ],
Int iwlen,
Int pfree,
Int Nv [ ],
Int Next [ ],
Int Last [ ],
Int Head [ ],
Int Elen [ ],
Int Degree [ ],
Int W [ ],
Int nel
) ;
#ifdef ASSERT
#undef ASSERT
#endif
/* Use mxAssert if AMD is compiled into a mexFunction */
#ifdef MATLAB_MEX_FILE
#define ASSERT(expression) (mxAssert ((expression), ""))
#else
#define ASSERT(expression) (assert (expression))
#endif
#define AMD_DEBUG0(params) { PRINTF (params) ; }
#define AMD_DEBUG1(params) { if (AMD_debug >= 1) PRINTF (params) ; }
#define AMD_DEBUG2(params) { if (AMD_debug >= 2) PRINTF (params) ; }
#define AMD_DEBUG3(params) { if (AMD_debug >= 3) PRINTF (params) ; }
#define AMD_DEBUG4(params) { if (AMD_debug >= 4) PRINTF (params) ; }
#else
/* no debugging */
#define ASSERT(expression)
#define AMD_DEBUG0(params)
#define AMD_DEBUG1(params)
#define AMD_DEBUG2(params)
#define AMD_DEBUG3(params)
#define AMD_DEBUG4(params)
#endif
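
The FLIP / UNFLIP macros defined above give a reversible "negation about -1", so one integer can hold either an index or a flagged (flipped) index. A tiny standalone sketch (not part of the imported sources) that copies the three macros and checks the stated identities:

#include <assert.h>
#include <stdio.h>

#define EMPTY (-1)
#define FLIP(i) (-(i)-2)
#define UNFLIP(i) ((i < EMPTY) ? FLIP (i) : (i))

int main (void)
{
    int i = 5 ;
    assert (FLIP (EMPTY) == EMPTY) ;    /* -(-1) - 2 = -1 */
    assert (FLIP (FLIP (i)) == i) ;     /* involution: -(-i-2) - 2 = i */
    assert (FLIP (i) < EMPTY) ;         /* flipped values are below -1 ... */
    assert (UNFLIP (FLIP (i)) == i) ;   /* ... so UNFLIP recovers the index */
    printf ("FLIP (%d) = %d\n", i, FLIP (i)) ;
    return (0) ;
}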

200
src/maths/KLU/amd_order.c Normal file

@@ -0,0 +1,200 @@
/* ========================================================================= */
/* === AMD_order =========================================================== */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* User-callable AMD minimum degree ordering routine. See amd.h for
* documentation.
*/
#include "amd_internal.h"
/* ========================================================================= */
/* === AMD_order =========================================================== */
/* ========================================================================= */
GLOBAL Int AMD_order
(
Int n,
const Int Ap [ ],
const Int Ai [ ],
Int P [ ],
double Control [ ],
double Info [ ]
)
{
Int *Len, *S, nz, i, *Pinv, info, status, *Rp, *Ri, *Cp, *Ci, ok ;
size_t nzaat, slen ;
double mem = 0 ;
#ifndef NDEBUG
AMD_debug_init ("amd") ;
#endif
/* clear the Info array, if it exists */
info = Info != (double *) NULL ;
if (info)
{
for (i = 0 ; i < AMD_INFO ; i++)
{
Info [i] = EMPTY ;
}
Info [AMD_N] = n ;
Info [AMD_STATUS] = AMD_OK ;
}
/* make sure inputs exist and n is >= 0 */
if (Ai == (Int *) NULL || Ap == (Int *) NULL || P == (Int *) NULL || n < 0)
{
if (info) Info [AMD_STATUS] = AMD_INVALID ;
return (AMD_INVALID) ; /* arguments are invalid */
}
if (n == 0)
{
return (AMD_OK) ; /* n is 0 so there's nothing to do */
}
nz = Ap [n] ;
if (info)
{
Info [AMD_NZ] = nz ;
}
if (nz < 0)
{
if (info) Info [AMD_STATUS] = AMD_INVALID ;
return (AMD_INVALID) ;
}
/* check if n or nz will cause size_t overflow */
if (((size_t) n) >= SIZE_T_MAX / sizeof (Int)
|| ((size_t) nz) >= SIZE_T_MAX / sizeof (Int))
{
if (info) Info [AMD_STATUS] = AMD_OUT_OF_MEMORY ;
return (AMD_OUT_OF_MEMORY) ; /* problem too large */
}
/* check the input matrix: AMD_OK, AMD_INVALID, or AMD_OK_BUT_JUMBLED */
status = AMD_valid (n, n, Ap, Ai) ;
if (status == AMD_INVALID)
{
if (info) Info [AMD_STATUS] = AMD_INVALID ;
return (AMD_INVALID) ; /* matrix is invalid */
}
/* allocate two size-n integer workspaces */
Len = amd_malloc (n * sizeof (Int)) ;
Pinv = amd_malloc (n * sizeof (Int)) ;
mem += n ;
mem += n ;
if (!Len || !Pinv)
{
/* :: out of memory :: */
amd_free (Len) ;
amd_free (Pinv) ;
if (info) Info [AMD_STATUS] = AMD_OUT_OF_MEMORY ;
return (AMD_OUT_OF_MEMORY) ;
}
if (status == AMD_OK_BUT_JUMBLED)
{
/* sort the input matrix and remove duplicate entries */
AMD_DEBUG1 (("Matrix is jumbled\n")) ;
Rp = amd_malloc ((n+1) * sizeof (Int)) ;
Ri = amd_malloc (MAX (nz,1) * sizeof (Int)) ;
mem += (n+1) ;
mem += MAX (nz,1) ;
if (!Rp || !Ri)
{
/* :: out of memory :: */
amd_free (Rp) ;
amd_free (Ri) ;
amd_free (Len) ;
amd_free (Pinv) ;
if (info) Info [AMD_STATUS] = AMD_OUT_OF_MEMORY ;
return (AMD_OUT_OF_MEMORY) ;
}
/* use Len and Pinv as workspace to create R = A' */
AMD_preprocess (n, Ap, Ai, Rp, Ri, Len, Pinv) ;
Cp = Rp ;
Ci = Ri ;
}
else
{
/* order the input matrix as-is. No need to compute R = A' first */
Rp = NULL ;
Ri = NULL ;
Cp = (Int *) Ap ;
Ci = (Int *) Ai ;
}
/* --------------------------------------------------------------------- */
/* determine the symmetry and count off-diagonal nonzeros in A+A' */
/* --------------------------------------------------------------------- */
nzaat = AMD_aat (n, Cp, Ci, Len, P, Info) ;
AMD_DEBUG1 (("nzaat: %g\n", (double) nzaat)) ;
ASSERT ((MAX (nz-n, 0) <= nzaat) && (nzaat <= 2 * (size_t) nz)) ;
/* --------------------------------------------------------------------- */
/* allocate workspace for matrix, elbow room, and 6 size-n vectors */
/* --------------------------------------------------------------------- */
S = NULL ;
slen = nzaat ; /* space for matrix */
ok = ((slen + nzaat/5) >= slen) ; /* check for size_t overflow */
slen += nzaat/5 ; /* add elbow room */
for (i = 0 ; ok && i < 7 ; i++)
{
ok = ((slen + n) > slen) ; /* check for size_t overflow */
slen += n ; /* size-n elbow room, 6 size-n work */
}
mem += slen ;
ok = ok && (slen < SIZE_T_MAX / sizeof (Int)) ; /* check for overflow */
ok = ok && (slen < Int_MAX) ; /* S[i] for Int i must be OK */
if (ok)
{
S = amd_malloc (slen * sizeof (Int)) ;
}
AMD_DEBUG1 (("slen %g\n", (double) slen)) ;
if (!S)
{
/* :: out of memory :: (or problem too large) */
amd_free (Rp) ;
amd_free (Ri) ;
amd_free (Len) ;
amd_free (Pinv) ;
if (info) Info [AMD_STATUS] = AMD_OUT_OF_MEMORY ;
return (AMD_OUT_OF_MEMORY) ;
}
if (info)
{
/* memory usage, in bytes. */
Info [AMD_MEMORY] = mem * sizeof (Int) ;
}
/* --------------------------------------------------------------------- */
/* order the matrix */
/* --------------------------------------------------------------------- */
AMD_1 (n, Cp, Ci, P, Pinv, Len, slen, S, Control, Info) ;
/* --------------------------------------------------------------------- */
/* free the workspace */
/* --------------------------------------------------------------------- */
amd_free (Rp) ;
amd_free (Ri) ;
amd_free (Len) ;
amd_free (Pinv) ;
amd_free (S) ;
if (info) Info [AMD_STATUS] = status ;
return (status) ; /* successful ordering */
}

121
src/maths/KLU/amd_post_tree.c Normal file
View File

@ -0,0 +1,121 @@
/* ========================================================================= */
/* === AMD_post_tree ======================================================= */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* Post-ordering of a supernodal elimination tree. */
#include "amd_internal.h"
GLOBAL Int AMD_post_tree
(
Int root, /* root of the tree */
Int k, /* start numbering at k */
Int Child [ ], /* input argument of size nn, undefined on
* output. Child [i] is the head of a link
* list of all nodes that are children of node
* i in the tree. */
const Int Sibling [ ], /* input argument of size nn, not modified.
* If f is a node in the link list of the
* children of node i, then Sibling [f] is the
* next child of node i.
*/
Int Order [ ], /* output order, of size nn. Order [i] = k
* if node i is the kth node of the reordered
* tree. */
Int Stack [ ] /* workspace of size nn */
#ifndef NDEBUG
, Int nn /* nodes are in the range 0..nn-1. */
#endif
)
{
Int f, head, h, i ;
#if 0
/* --------------------------------------------------------------------- */
/* recursive version (Stack [ ] is not used): */
/* --------------------------------------------------------------------- */
/* this is simple, but can cause stack overflow if nn is large */
i = root ;
for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
{
k = AMD_post_tree (f, k, Child, Sibling, Order, Stack, nn) ;
}
Order [i] = k++ ;
return (k) ;
#endif
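/* Example of the Child/Sibling encoding walked below (hypothetical 4-node
* tree, added for illustration): if node 3 is the root with children 0 and 2,
* and node 2 has the single child 1, then
*
*     Child [3] = 0, Sibling [0] = 2, Sibling [2] = EMPTY
*     Child [2] = 1, Sibling [1] = EMPTY
*     Child [0] = Child [1] = EMPTY
*
* and AMD_post_tree (3, 0, ...) orders the nodes as 0, 1, 2, 3 (children
* before parents). */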
/* --------------------------------------------------------------------- */
/* non-recursive version, using an explicit stack */
/* --------------------------------------------------------------------- */
/* push root on the stack */
head = 0 ;
Stack [0] = root ;
while (head >= 0)
{
/* get head of stack */
ASSERT (head < nn) ;
i = Stack [head] ;
AMD_DEBUG1 (("head of stack "ID" \n", i)) ;
ASSERT (i >= 0 && i < nn) ;
if (Child [i] != EMPTY)
{
/* the children of i are not yet ordered */
/* push each child onto the stack in reverse order */
/* so that small ones at the head of the list get popped first */
/* and the biggest one at the end of the list gets popped last */
for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
{
head++ ;
ASSERT (head < nn) ;
ASSERT (f >= 0 && f < nn) ;
}
h = head ;
ASSERT (head < nn) ;
for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
{
ASSERT (h > 0) ;
Stack [h--] = f ;
AMD_DEBUG1 (("push "ID" on stack\n", f)) ;
ASSERT (f >= 0 && f < nn) ;
}
ASSERT (Stack [h] == i) ;
/* delete child list so that i gets ordered next time we see it */
Child [i] = EMPTY ;
}
else
{
/* the children of i (if there were any) are already ordered */
/* remove i from the stack and order it. Front i is kth front */
head-- ;
AMD_DEBUG1 (("pop "ID" order "ID"\n", i, k)) ;
Order [i] = k++ ;
ASSERT (k <= nn) ;
}
#ifndef NDEBUG
AMD_DEBUG1 (("\nStack:")) ;
for (h = head ; h >= 0 ; h--)
{
Int j = Stack [h] ;
AMD_DEBUG1 ((" "ID, j)) ;
ASSERT (j >= 0 && j < nn) ;
}
AMD_DEBUG1 (("\n\n")) ;
ASSERT (head < nn) ;
#endif
}
return (k) ;
}

207
src/maths/KLU/amd_postorder.c Normal file
View File

@ -0,0 +1,207 @@
/* ========================================================================= */
/* === AMD_postorder ======================================================= */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* Perform a postordering (via depth-first search) of an assembly tree. */
#include "amd_internal.h"
GLOBAL void AMD_postorder
(
/* inputs, not modified on output: */
Int nn, /* nodes are in the range 0..nn-1 */
Int Parent [ ], /* Parent [j] is the parent of j, or EMPTY if root */
Int Nv [ ], /* Nv [j] > 0 number of pivots represented by node j,
* or zero if j is not a node. */
Int Fsize [ ], /* Fsize [j]: size of node j */
/* output, not defined on input: */
Int Order [ ], /* output post-order */
/* workspaces of size nn: */
Int Child [ ],
Int Sibling [ ],
Int Stack [ ]
)
{
Int i, j, k, parent, frsize, f, fprev, maxfrsize, bigfprev, bigf, fnext ;
for (j = 0 ; j < nn ; j++)
{
Child [j] = EMPTY ;
Sibling [j] = EMPTY ;
}
/* --------------------------------------------------------------------- */
/* place the children in link lists - bigger elements tend to be last */
/* --------------------------------------------------------------------- */
for (j = nn-1 ; j >= 0 ; j--)
{
if (Nv [j] > 0)
{
/* this is an element */
parent = Parent [j] ;
if (parent != EMPTY)
{
/* place the element in the link list of the children of its parent */
/* bigger elements will tend to be at the end of the list */
Sibling [j] = Child [parent] ;
Child [parent] = j ;
}
}
}
#ifndef NDEBUG
{
Int nels, ff, nchild ;
AMD_DEBUG1 (("\n\n================================ AMD_postorder:\n"));
nels = 0 ;
for (j = 0 ; j < nn ; j++)
{
if (Nv [j] > 0)
{
AMD_DEBUG1 (( ""ID" : nels "ID" npiv "ID" size "ID
" parent "ID" maxfr "ID"\n", j, nels,
Nv [j], Fsize [j], Parent [j], Fsize [j])) ;
/* this is an element */
/* dump the link list of children */
nchild = 0 ;
AMD_DEBUG1 ((" Children: ")) ;
for (ff = Child [j] ; ff != EMPTY ; ff = Sibling [ff])
{
AMD_DEBUG1 ((ID" ", ff)) ;
ASSERT (Parent [ff] == j) ;
nchild++ ;
ASSERT (nchild < nn) ;
}
AMD_DEBUG1 (("\n")) ;
parent = Parent [j] ;
if (parent != EMPTY)
{
ASSERT (Nv [parent] > 0) ;
}
nels++ ;
}
}
}
AMD_DEBUG1 (("\n\nGo through the children of each node, and put\n"
"the biggest child last in each list:\n")) ;
#endif
/* --------------------------------------------------------------------- */
/* place the largest child last in the list of children for each node */
/* --------------------------------------------------------------------- */
for (i = 0 ; i < nn ; i++)
{
if (Nv [i] > 0 && Child [i] != EMPTY)
{
#ifndef NDEBUG
Int nchild ;
AMD_DEBUG1 (("Before partial sort, element "ID"\n", i)) ;
nchild = 0 ;
for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
{
ASSERT (f >= 0 && f < nn) ;
AMD_DEBUG1 ((" f: "ID" size: "ID"\n", f, Fsize [f])) ;
nchild++ ;
ASSERT (nchild <= nn) ;
}
#endif
/* find the biggest element in the child list */
fprev = EMPTY ;
maxfrsize = EMPTY ;
bigfprev = EMPTY ;
bigf = EMPTY ;
for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
{
ASSERT (f >= 0 && f < nn) ;
frsize = Fsize [f] ;
if (frsize >= maxfrsize)
{
/* this is the biggest seen so far */
maxfrsize = frsize ;
bigfprev = fprev ;
bigf = f ;
}
fprev = f ;
}
ASSERT (bigf != EMPTY) ;
fnext = Sibling [bigf] ;
AMD_DEBUG1 (("bigf "ID" maxfrsize "ID" bigfprev "ID" fnext "ID
" fprev " ID"\n", bigf, maxfrsize, bigfprev, fnext, fprev)) ;
if (fnext != EMPTY)
{
/* if fnext is EMPTY then bigf is already at the end of list */
if (bigfprev == EMPTY)
{
/* delete bigf from the head of the list */
Child [i] = fnext ;
}
else
{
/* delete bigf from the middle of the list */
Sibling [bigfprev] = fnext ;
}
/* put bigf at the end of the list */
Sibling [bigf] = EMPTY ;
ASSERT (Child [i] != EMPTY) ;
ASSERT (fprev != bigf) ;
ASSERT (fprev != EMPTY) ;
Sibling [fprev] = bigf ;
}
#ifndef NDEBUG
AMD_DEBUG1 (("After partial sort, element "ID"\n", i)) ;
for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
{
ASSERT (f >= 0 && f < nn) ;
AMD_DEBUG1 ((" "ID" "ID"\n", f, Fsize [f])) ;
ASSERT (Nv [f] > 0) ;
nchild-- ;
}
ASSERT (nchild == 0) ;
#endif
}
}
/* --------------------------------------------------------------------- */
/* postorder the assembly tree */
/* --------------------------------------------------------------------- */
for (i = 0 ; i < nn ; i++)
{
Order [i] = EMPTY ;
}
k = 0 ;
for (i = 0 ; i < nn ; i++)
{
if (Parent [i] == EMPTY && Nv [i] > 0)
{
AMD_DEBUG1 (("Root of assembly tree "ID"\n", i)) ;
k = AMD_post_tree (i, k, Child, Sibling, Order, Stack
#ifndef NDEBUG
, nn
#endif
) ;
}
}
}

119
src/maths/KLU/amd_preprocess.c Normal file
View File

@ -0,0 +1,119 @@
/* ========================================================================= */
/* === AMD_preprocess ====================================================== */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* Sorts, removes duplicate entries, and transposes from the nonzero pattern of
* a column-form matrix A, to obtain the matrix R. The input matrix can have
* duplicate entries and/or unsorted columns (AMD_valid (n,n,Ap,Ai) must not be
* AMD_INVALID).
*
* This input condition is NOT checked. This routine is not user-callable.
*/
#include "amd_internal.h"
/* ========================================================================= */
/* === AMD_preprocess ====================================================== */
/* ========================================================================= */
/* AMD_preprocess does not check its input for errors or allocate workspace.
* On input, the condition (AMD_valid (n,n,Ap,Ai) != AMD_INVALID) must hold.
*/
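/* Worked illustration (hypothetical 3-by-3 pattern, added for clarity): with
*
*     Ap = { 0, 2, 4, 6 } and Ai = { 1, 0,   2, 2,   0, 1 }
*
* column 0 is unsorted and column 1 lists row 2 twice, so R has only 5
* entries:
*
*     Rp = { 0, 2, 4, 5 } and Ri = { 0, 2,   0, 2,   1 }
*
* Row i of R lists, in ascending order and without duplicates, the columns in
* which row i of A has an entry; R is the pattern of A transposed. */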
GLOBAL void AMD_preprocess
(
Int n, /* input matrix: A is n-by-n */
const Int Ap [ ], /* size n+1 */
const Int Ai [ ], /* size nz = Ap [n] */
/* output matrix R: */
Int Rp [ ], /* size n+1 */
Int Ri [ ], /* size nz (or less, if duplicates present) */
Int W [ ], /* workspace of size n */
Int Flag [ ] /* workspace of size n */
)
{
/* --------------------------------------------------------------------- */
/* local variables */
/* --------------------------------------------------------------------- */
Int i, j, p, p2 ;
ASSERT (AMD_valid (n, n, Ap, Ai) != AMD_INVALID) ;
/* --------------------------------------------------------------------- */
/* count the entries in each row of A (excluding duplicates) */
/* --------------------------------------------------------------------- */
for (i = 0 ; i < n ; i++)
{
W [i] = 0 ; /* # of nonzeros in row i (excl duplicates) */
Flag [i] = EMPTY ; /* Flag [i] = j if i appears in column j */
}
for (j = 0 ; j < n ; j++)
{
p2 = Ap [j+1] ;
for (p = Ap [j] ; p < p2 ; p++)
{
i = Ai [p] ;
if (Flag [i] != j)
{
/* row index i has not yet appeared in column j */
W [i]++ ; /* one more entry in row i */
Flag [i] = j ; /* flag row index i as appearing in col j*/
}
}
}
/* --------------------------------------------------------------------- */
/* compute the row pointers for R */
/* --------------------------------------------------------------------- */
Rp [0] = 0 ;
for (i = 0 ; i < n ; i++)
{
Rp [i+1] = Rp [i] + W [i] ;
}
for (i = 0 ; i < n ; i++)
{
W [i] = Rp [i] ;
Flag [i] = EMPTY ;
}
/* --------------------------------------------------------------------- */
/* construct the row form matrix R */
/* --------------------------------------------------------------------- */
/* R = row form of pattern of A */
for (j = 0 ; j < n ; j++)
{
p2 = Ap [j+1] ;
for (p = Ap [j] ; p < p2 ; p++)
{
i = Ai [p] ;
if (Flag [i] != j)
{
/* row index i has not yet appeared in column j */
Ri [W [i]++] = j ; /* put col j in row i */
Flag [i] = j ; /* flag row index i as appearing in col j*/
}
}
}
#ifndef NDEBUG
ASSERT (AMD_valid (n, n, Rp, Ri) == AMD_OK) ;
for (j = 0 ; j < n ; j++)
{
ASSERT (W [j] == Rp [j+1]) ;
}
#endif
}

93
src/maths/KLU/amd_valid.c Normal file
View File

@ -0,0 +1,93 @@
/* ========================================================================= */
/* === AMD_valid =========================================================== */
/* ========================================================================= */
/* ------------------------------------------------------------------------- */
/* AMD, Copyright (c) Timothy A. Davis, */
/* Patrick R. Amestoy, and Iain S. Duff. See ../README.txt for License. */
/* email: davis at cise.ufl.edu CISE Department, Univ. of Florida. */
/* web: http://www.cise.ufl.edu/research/sparse/amd */
/* ------------------------------------------------------------------------- */
/* Check if a column-form matrix is valid or not. The matrix A is
* n_row-by-n_col. The row indices of entries in column j are in
* Ai [Ap [j] ... Ap [j+1]-1]. Required conditions are:
*
* n_row >= 0
* n_col >= 0
* nz = Ap [n_col] >= 0 number of entries in the matrix
* Ap [0] == 0
* Ap [j] <= Ap [j+1] for all j in the range 0 to n_col-1.
* Ai [0 ... nz-1] must be in the range 0 to n_row-1.
*
* If any of the above conditions are violated, AMD_INVALID is returned. If the
* following condition holds, AMD_OK_BUT_JUMBLED is returned (a warning,
* not an error):
*
* row indices in Ai [Ap [j] ... Ap [j+1]-1] are not sorted in ascending
* order, and/or duplicate entries exist.
*
* Otherwise, AMD_OK is returned.
*
* In v1.2 and earlier, this function returned TRUE if the matrix was valid
* (now returns AMD_OK), or FALSE otherwise (now returns AMD_INVALID or
* AMD_OK_BUT_JUMBLED).
*/
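/* Small illustration (hypothetical 2-by-2 patterns, added for clarity):
*
*     Ap = { 0, 2, 3 }, Ai = { 0, 1,   1 }   returns AMD_OK
*     Ap = { 0, 2, 3 }, Ai = { 1, 0,   1 }   returns AMD_OK_BUT_JUMBLED
*                                            (column 0 is unsorted)
*     Ap = { 0, 2, 3 }, Ai = { 0, 2,   1 }   returns AMD_INVALID
*                                            (row index 2 is out of range)
*/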
#include "amd_internal.h"
GLOBAL Int AMD_valid
(
/* inputs, not modified on output: */
Int n_row, /* A is n_row-by-n_col */
Int n_col,
const Int Ap [ ], /* column pointers of A, of size n_col+1 */
const Int Ai [ ] /* row indices of A, of size nz = Ap [n_col] */
)
{
Int nz, j, p1, p2, ilast, i, p, result = AMD_OK ;
if (n_row < 0 || n_col < 0 || Ap == NULL || Ai == NULL)
{
return (AMD_INVALID) ;
}
nz = Ap [n_col] ;
if (Ap [0] != 0 || nz < 0)
{
/* column pointers must start at Ap [0] = 0, and Ap [n_col] must be >= 0 */
AMD_DEBUG0 (("column 0 pointer bad or nz < 0\n")) ;
return (AMD_INVALID) ;
}
for (j = 0 ; j < n_col ; j++)
{
p1 = Ap [j] ;
p2 = Ap [j+1] ;
AMD_DEBUG2 (("\nColumn: "ID" p1: "ID" p2: "ID"\n", j, p1, p2)) ;
if (p1 > p2)
{
/* column pointers must be ascending */
AMD_DEBUG0 (("column "ID" pointer bad\n", j)) ;
return (AMD_INVALID) ;
}
ilast = EMPTY ;
for (p = p1 ; p < p2 ; p++)
{
i = Ai [p] ;
AMD_DEBUG3 (("row: "ID"\n", i)) ;
if (i < 0 || i >= n_row)
{
/* row index out of range */
AMD_DEBUG0 (("index out of range, col "ID" row "ID"\n", j, i));
return (AMD_INVALID) ;
}
if (i <= ilast)
{
/* row index unsorted, or duplicate entry present */
AMD_DEBUG1 (("index unsorted/dupl col "ID" row "ID"\n", j, i));
result = AMD_OK_BUT_JUMBLED ;
}
ilast = i ;
}
}
return (result) ;
}

263
src/maths/KLU/btf.h Normal file
View File

@ -0,0 +1,263 @@
/* ========================================================================== */
/* === BTF package ========================================================== */
/* ========================================================================== */
/* BTF_MAXTRANS: find a column permutation Q to give A*Q a zero-free diagonal
* BTF_STRONGCOMP: find a symmetric permutation P to put P*A*P' into block
* upper triangular form.
* BTF_ORDER: do both of the above (btf_maxtrans then btf_strongcomp).
*
* Copyright (c) 2004-2007. Tim Davis, University of Florida,
* with support from Sandia National Laboratories. All Rights Reserved.
*/
/* ========================================================================== */
/* === BTF_MAXTRANS ========================================================= */
/* ========================================================================== */
/* BTF_MAXTRANS: finds a permutation of the columns of a matrix so that it has a
* zero-free diagonal. The input is an m-by-n sparse matrix in compressed
* column form. The array Ap of size n+1 gives the starting and ending
* positions of the columns in the array Ai. Ap[0] must be zero. The array Ai
* contains the row indices of the nonzeros of the matrix A, and is of size
* Ap[n]. The row indices of column j are located in Ai[Ap[j] ... Ap[j+1]-1].
* Row indices must be in the range 0 to m-1. Duplicate entries may be present
* in any given column. The input matrix is not checked for validity (row
* indices out of the range 0 to m-1 will lead to an indeterminate result -
* possibly a core dump, for example). Row indices in any given column need
* not be in sorted order. However, if they are sorted and the matrix already
* has a zero-free diagonal, then the identity permutation is returned.
*
* The output of btf_maxtrans is an array Match of size n. If row i is matched
* with column j, then A(i,j) is nonzero, and then Match[i] = j. If the matrix
* is structurally nonsingular, all entries in the Match array are unique, and
* Match can be viewed as a column permutation if A is square. That is, column
* k of the original matrix becomes column Match[k] of the permuted matrix. In
* MATLAB, this can be expressed as (for non-structurally singular matrices):
*
* Match = maxtrans (A) ;
* B = A (:, Match) ;
*
* except of course here the A matrix and Match vector are all 0-based (rows
* and columns in the range 0 to n-1), not 1-based (rows/cols in range 1 to n).
* The MATLAB dmperm routine returns a row permutation. See the maxtrans
* mexFunction for more details.
*
* If row i is not matched to any column, then Match[i] is == -1. The
* btf_maxtrans routine returns the number of nonzeros on the diagonal of the
* permuted matrix.
*
* In the MATLAB mexFunction interface to btf_maxtrans, 1 is added to the Match
* array to obtain a 1-based permutation. Thus, in MATLAB where A is m-by-n:
*
* q = maxtrans (A) ; % has entries in the range 0:n
* q % a column permutation (only if sprank(A)==n)
* B = A (:, q) ; % permuted matrix (only if sprank(A)==n)
* sum (q > 0) ; % same as "sprank (A)"
*
* This behaviour differs from p = dmperm (A) in MATLAB, which returns the
* matching as p(j)=i if row i and column j are matched, and p(j)=0 if column j
* is unmatched.
*
* p = dmperm (A) ; % has entries in the range 0:m
* p % a row permutation (only if sprank(A)==m)
* B = A (p, :) ; % permuted matrix (only if sprank(A)==m)
* sum (p > 0) ; % definition of sprank (A)
*
* This algorithm is based on the paper "On Algorithms for obtaining a maximum
* transversal" by Iain Duff, ACM Trans. Mathematical Software, vol 7, no. 1,
* pp. 315-330, and "Algorithm 575: Permutations for a zero-free diagonal",
* same issue, pp. 387-390. Algorithm 575 is MC21A in the Harwell Subroutine
* Library. This code is not merely a translation of the Fortran code into C.
* It is a completely new implementation of the basic underlying method (depth
* first search over a subgraph with nodes corresponding to columns matched so
* far, and cheap matching). This code was written with minimal observation of
* the MC21A/B code itself. See comments below for a comparison between the
* maxtrans and MC21A/B codes.
*
* This routine operates on a column-form matrix and produces a column
* permutation. MC21A uses a row-form matrix and produces a row permutation.
* The difference is merely one of convention in the comments and interpretation
* of the inputs and outputs. If you want a row permutation, simply pass a
* compressed-row sparse matrix to this routine and you will get a row
* permutation (just like MC21A). Similarly, you can pass a column-oriented
* matrix to MC21A and it will happily return a column permutation.
*/
#ifndef _BTF_H
#define _BTF_H
/* make it easy for C++ programs to include BTF */
#ifdef __cplusplus
extern "C" {
#endif
#include "UFconfig.h"
int btf_maxtrans /* returns # of columns matched */
(
/* --- input, not modified: --- */
int nrow, /* A is nrow-by-ncol in compressed column form */
int ncol,
int Ap [ ], /* size ncol+1 */
int Ai [ ], /* size nz = Ap [ncol] */
double maxwork, /* maximum amount of work to do is maxwork*nnz(A); no limit
* if <= 0 */
/* --- output, not defined on input --- */
double *work, /* work = -1 if maxwork > 0 and the total work performed
* reached the maximum of maxwork*nnz(A).
* Otherwise, work = the total work performed. */
int Match [ ], /* size nrow. Match [i] = j if column j matched to row i
* (see above for the singular-matrix case) */
/* --- workspace, not defined on input or output --- */
int Work [ ] /* size 5*ncol */
) ;
/* long integer version (all "int" parameters become "UF_long") */
UF_long btf_l_maxtrans (UF_long, UF_long, UF_long *, UF_long *, double,
double *, UF_long *, UF_long *) ;
/* ========================================================================== */
/* === BTF_STRONGCOMP ======================================================= */
/* ========================================================================== */
/* BTF_STRONGCOMP finds the strongly connected components of a graph, returning
* a symmetric permutation. The matrix A must be square, and is provided on
* input in compressed-column form (see BTF_MAXTRANS, above). The diagonal of
* the input matrix A (or A*Q if Q is provided on input) is ignored.
*
* If Q is not NULL on input, then the strongly connected components of A*Q are
* found. Q may be flagged on input, where Q[k] < 0 denotes a flagged column k.
* The permutation is j = BTF_UNFLIP (Q [k]). On output, Q is modified (the
* flags are preserved) so that P*A*Q is in block upper triangular form.
*
* If Q is NULL, then the permutation P is returned so that P*A*P' is in upper
* block triangular form.
*
* The vector R gives the block boundaries, where block b is in rows/columns
* R[b] to R[b+1]-1 of the permuted matrix, and where b ranges from 0 to the
* number of strongly connected components found, minus 1.
*/
int btf_strongcomp /* return # of strongly connected components */
(
/* input, not modified: */
int n, /* A is n-by-n in compressed column form */
int Ap [ ], /* size n+1 */
int Ai [ ], /* size nz = Ap [n] */
/* optional input, modified (if present) on output: */
int Q [ ], /* size n, input column permutation */
/* output, not defined on input */
int P [ ], /* size n. P [k] = j if row and column j are kth row/col
* in permuted matrix. */
int R [ ], /* size n+1. block b is in rows/cols R[b] ... R[b+1]-1 */
/* workspace, not defined on input or output */
int Work [ ] /* size 4n */
) ;
UF_long btf_l_strongcomp (UF_long, UF_long *, UF_long *, UF_long *, UF_long *,
UF_long *, UF_long *) ;
/* ========================================================================== */
/* === BTF_ORDER ============================================================ */
/* ========================================================================== */
/* BTF_ORDER permutes a square matrix into upper block triangular form. It
* does this by first finding a maximum matching (or perhaps a limited matching
* if the work is limited), via the btf_maxtrans function. If a complete
* matching is not found, BTF_ORDER completes the permutation, but flags the
* columns of P*A*Q to denote which columns are not matched. If the matrix is
* structurally rank deficient, some of the entries on the diagonal of the
* permuted matrix will be zero. BTF_ORDER then calls btf_strongcomp to find
* the strongly-connected components.
*
* On output, P and Q are the row and column permutations, where i = P[k] if
* row i of A is the kth row of P*A*Q, and j = BTF_UNFLIP(Q[k]) if column j of
* A is the kth column of P*A*Q. If Q[k] < 0, then the (k,k)th entry in P*A*Q
* is structurally zero.
*
* The vector R gives the block boundaries, where block b is in rows/columns
* R[b] to R[b+1]-1 of the permuted matrix, and where b ranges from 0 to the
* number of strongly connected components found, minus 1.
*/
int btf_order /* returns number of blocks found */
(
/* --- input, not modified: --- */
int n, /* A is n-by-n in compressed column form */
int Ap [ ], /* size n+1 */
int Ai [ ], /* size nz = Ap [n] */
double maxwork, /* do at most maxwork*nnz(A) work in the maximum
* transversal; no limit if <= 0 */
/* --- output, not defined on input --- */
double *work, /* return value from btf_maxtrans */
int P [ ], /* size n, row permutation */
int Q [ ], /* size n, column permutation */
int R [ ], /* size n+1. block b is in rows/cols R[b] ... R[b+1]-1 */
int *nmatch, /* # nonzeros on diagonal of P*A*Q */
/* --- workspace, not defined on input or output --- */
int Work [ ] /* size 5n */
) ;
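/* Minimal usage sketch for btf_order (added for illustration; the 3-by-3
* pattern below is hypothetical):
*
*     int Ap [ ] = { 0, 1, 3, 4 } ;
*     int Ai [ ] = { 1,   0, 2,   2 } ;
*     int P [3], Q [3], R [4], Work [15], nmatch ;
*     double work ;
*     int nblocks = btf_order (3, Ap, Ai, 0.0, &work, P, Q, R, &nmatch, Work) ;
*
* Block b of the permuted matrix then occupies rows/columns R [b] to
* R [b+1]-1, for b = 0 to nblocks-1. */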
UF_long btf_l_order (UF_long, UF_long *, UF_long *, double , double *,
UF_long *, UF_long *, UF_long *, UF_long *, UF_long *) ;
/* ========================================================================== */
/* === BTF marking of singular columns ====================================== */
/* ========================================================================== */
/* BTF_FLIP is a "negation about -1", and is used to mark an integer j
* that is normally non-negative. BTF_FLIP (-1) is -1. BTF_FLIP of
* a number > -1 is negative, and BTF_FLIP of a number < -1 is positive.
* BTF_FLIP (BTF_FLIP (j)) = j for all integers j. BTF_UNFLIP (j) acts
* like an "absolute value" operation, and is always >= -1. You can test
* whether or not an integer j is "flipped" with the BTF_ISFLIPPED (j)
* macro.
*/
#define BTF_FLIP(j) (-(j)-2)
#define BTF_ISFLIPPED(j) ((j) < -1)
#define BTF_UNFLIP(j) ((BTF_ISFLIPPED (j)) ? BTF_FLIP (j) : (j))
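/* For example (added illustration): BTF_FLIP (0) is -2, BTF_FLIP (3) is -5,
* BTF_FLIP (-1) is -1, BTF_UNFLIP (-5) is 3, and BTF_ISFLIPPED (-5) is true. */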
/* ========================================================================== */
/* === BTF version ========================================================== */
/* ========================================================================== */
/* All versions of BTF include these definitions.
* As an example, to test if the version you are using is 1.2 or later:
*
* if (BTF_VERSION >= BTF_VERSION_CODE (1,2)) ...
*
* This also works during compile-time:
*
* #if (BTF >= BTF_VERSION_CODE (1,2))
* printf ("This is version 1.2 or later\n") ;
* #else
* printf ("This is an early version\n") ;
* #endif
*/
#define BTF_DATE "Dec 7, 2011"
#define BTF_VERSION_CODE(main,sub) ((main) * 1000 + (sub))
#define BTF_MAIN_VERSION 1
#define BTF_SUB_VERSION 1
#define BTF_SUBSUB_VERSION 3
#define BTF_VERSION BTF_VERSION_CODE(BTF_MAIN_VERSION,BTF_SUB_VERSION)
#ifdef __cplusplus
}
#endif
#endif

64
src/maths/KLU/btf_internal.h Normal file
View File

@ -0,0 +1,64 @@
/* ========================================================================== */
/* === btf_internal include file ============================================ */
/* ========================================================================== */
#ifndef _BTF_INTERNAL_H
#define _BTF_INTERNAL_H
/*
* Copyright (c) 2004-2007. Tim Davis, University of Florida,
* with support from Sandia National Laboratories. All Rights Reserved.
*/
/* Not to be included in any user program. */
#ifdef DLONG
#define Int UF_long
#define Int_id UF_long_id
#define BTF(name) btf_l_ ## name
#else
#define Int int
#define Int_id "%d"
#define BTF(name) btf_ ## name
#endif
/* ========================================================================== */
/* make sure debugging and printing is turned off */
#ifndef NDEBUG
#define NDEBUG
#endif
#ifndef NPRINT
#define NPRINT
#endif
/* To enable debugging and assertions, uncomment this line:
#undef NDEBUG
*/
/* To enable diagnostic printing, uncomment this line:
#undef NPRINT
*/
/* ========================================================================== */
#include <stdio.h>
#include <assert.h>
#define ASSERT(a) assert(a)
#undef TRUE
#undef FALSE
#undef PRINTF
#undef MIN
#ifndef NPRINT
#define PRINTF(s) { printf s ; } ;
#else
#define PRINTF(s)
#endif
#define TRUE 1
#define FALSE 0
#define EMPTY (-1)
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#endif

387
src/maths/KLU/btf_maxtrans.c Normal file
View File

@ -0,0 +1,387 @@
/* ========================================================================== */
/* === BTF_MAXTRANS ========================================================= */
/* ========================================================================== */
/* Finds a column permutation that maximizes the number of entries on the
* diagonal of a sparse matrix. See btf.h for more information.
*
* This function is identical to cs_maxtrans in CSparse, with the following
* exceptions:
*
* (1) cs_maxtrans finds both jmatch and imatch, where jmatch [i] = j and
* imatch [j] = i if row i is matched to column j. This function returns
* just jmatch (the Match array). The MATLAB interface to cs_maxtrans
* (the single-output cs_dmperm) returns imatch, not jmatch to the MATLAB
* caller.
*
* (2) cs_maxtrans includes a pre-pass that counts the number of non-empty
* rows and columns (m2 and n2, respectively), and computes the matching
* using the transpose of A if m2 < n2. cs_maxtrans also returns quickly
* if the diagonal of the matrix is already zero-free. This pre-pass
* allows cs_maxtrans to be much faster than maxtrans, if the use of the
* transpose is warranted.
*
* However, for square structurally non-singular matrices with one or more
* zeros on the diagonal, the pre-pass is a waste of time, and for these
* matrices, maxtrans can be twice as fast as cs_maxtrans. Since the
* maxtrans function is intended primarily for square matrices that are
* typically structurally nonsingular, the pre-pass is not included here.
* If this maxtrans function is used on a matrix with many more columns
* than rows, consider passing the transpose to this function, or use
* cs_maxtrans instead.
*
* (3) cs_maxtrans can operate as a randomized algorithm, to help avoid
* rare cases of excessive run-time.
*
* (4) this maxtrans function includes an option that limits the total work
* performed. If this limit is reached, the maximum transveral might not
* be found.
*
* Thus, for general usage, cs_maxtrans is preferred. For square matrices that
* are typically structurally non-singular, maxtrans is preferred. A partial
* maxtrans can still be very useful when solving a sparse linear system.
*
* Copyright (c) 2004-2007. Tim Davis, University of Florida,
* with support from Sandia National Laboratories. All Rights Reserved.
*/
#include "btf.h"
#include "btf_internal.h"
/* ========================================================================== */
/* === augment ============================================================== */
/* ========================================================================== */
/* Perform a depth-first-search starting at column k, to find an augmenting
* path. An augmenting path is a sequence of row/column pairs (i1,k), (i2,j1),
* (i3,j2), ..., (i(s+1), js), such that all of the following properties hold:
*
* * column k is not matched to any row
* * entries in the path are nonzero
* * the pairs (i1,j1), (i2,j2), (i3,j3) ..., (is,js) have been
* previously matched to each other
* * (i(s+1), js) is nonzero, and row i(s+1) is not matched to any column
*
* Once this path is found, the matching can be changed to the set of pairs
* in the path:
*
* (i1,k), (i2,j1), (i3,j2), ..., (i(s+1), js)
*
* Once a row is matched with a column it remains matched with some column, but
* not necessarily the column it was first matched with.
*
* In the worst case, this function can examine every nonzero in A. Since it
* is called n times by maxtrans, the total time of maxtrans can be as high as
* O(n*nnz(A)). To limit this work, pass a value of maxwork > 0. Then at
* most O((maxwork+1)*nnz(A)) work will be performed; the maximum matching might
* not be found, however.
*
* This routine is very similar to the dfs routine in klu_kernel.c, in the
* KLU sparse LU factorization package. It is essentially identical to the
* cs_augment routine in CSparse, and its recursive version (augment function
* in cs_maxtransr_mex.c), except that this routine allows for the search to be
* terminated early if too much work is being performed.
*
* The algorithm is based on the paper "On Algorithms for obtaining a maximum
* transversal" by Iain Duff, ACM Trans. Mathematical Software, vol 7, no. 1,
* pp. 315-330, and "Algorithm 575: Permutations for a zero-free diagonal",
* same issue, pp. 387-390. The code here is a new implementation of that
* algorithm, with different data structures and control flow. After writing
* this code, I carefully compared my algorithm with MC21A/B (ACM Algorithm 575)
* Some of the comparisons are partial because I didn't dig deeply into all of
* the details of MC21A/B, such as how the stack is maintained. The following
* arguments are essentially identical between this code and MC21A:
*
* maxtrans MC21A,B
* -------- -------
* n N identical
* k JORD identical
* Ap IP column / row pointers
* Ai ICN row / column indices
* Ap[n] LICN length of index array (# of nonzeros in A)
* Match IPERM output column / row permutation
* nmatch NUMNZ # of nonzeros on diagonal of permuted matrix
* Flag CV mark a node as visited by the depth-first-search
*
* The following are different, but analogous:
*
* Cheap ARP indicates what part of a column / row has
* already been matched.
*
* The following arguments are very different:
*
* - LENR # of entries in each row/column (unused in maxtrans)
* Pstack OUT Pstack keeps track of where we are in the depth-
* first-search scan of column j. I think that OUT
* plays a similar role in MC21B, but I'm unsure.
* Istack PR keeps track of the rows in the path. PR is a link
* list, though, whereas Istack is a stack. Maxtrans
* does not use any link lists.
* Jstack OUT? PR? the stack for nodes in the path (unsure)
*
* The following control structures are roughly comparable:
*
* maxtrans MC21B
* -------- -----
* for (k = 0 ; k < n ; k++) DO 100 JORD=1,N
* while (head >= 0) DO 70 K=1,JORD
* for (p = Cheap [j] ; ...) DO 20 II=IN1,IN2
* for (p = head ; ...) DO 90 K=1,JORD
*/
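/* Concrete illustration of an augmenting path (hypothetical 2-by-2 pattern,
* added for clarity): let column 0 have rows {0,1} and column 1 have only
* row {0}. The cheap match of stage k=0 pairs row 0 with column 0. At stage
* k=1 row 0 is already taken, so the DFS steps to column j1 = Match [0] = 0,
* whose cheap scan finds the unmatched row 1. Unwinding the path (0,1),(1,0)
* gives Match [0] = 1 and Match [1] = 0, a complete matching. */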
static Int augment
(
Int k, /* which stage of the main loop we're in */
Int Ap [ ], /* column pointers, size n+1 */
Int Ai [ ], /* row indices, size nz = Ap [n] */
Int Match [ ], /* size n, Match [i] = j if col j matched to i */
Int Cheap [ ], /* rows Ai [Ap [j] .. Cheap [j]-1] already matched */
Int Flag [ ], /* Flag [j] = k if j already visited this stage */
Int Istack [ ], /* size n. Row index stack. */
Int Jstack [ ], /* size n. Column index stack. */
Int Pstack [ ], /* size n. Keeps track of position in adjacency list */
double *work, /* work performed by the depth-first-search */
double maxwork /* maximum work allowed */
)
{
/* local variables, but "global" to all DFS levels: */
Int found ; /* true if match found. */
Int head ; /* top of stack */
/* variables that are purely local to any one DFS level: */
Int j2 ; /* the next DFS goes to node j2 */
Int pend ; /* one past the end of the adjacency list for node j */
Int pstart ;
Int quick ;
/* variables that need to be pushed then popped from the stack: */
Int i ; /* the row tentatively matched to column j if the DFS succeeds */
Int j ; /* the DFS is at the current node j */
Int p ; /* current index into the adj. list for node j */
/* the variables i, j, and p are stacked in Istack, Jstack, and Pstack */
quick = (maxwork > 0) ;
/* start a DFS to find a match for column k */
found = FALSE ;
i = EMPTY ;
head = 0 ;
Jstack [0] = k ;
ASSERT (Flag [k] != k) ;
while (head >= 0)
{
j = Jstack [head] ;
pend = Ap [j+1] ;
if (Flag [j] != k) /* node j has not yet been visited in this stage */
{
/* -------------------------------------------------------------- */
/* prework for node j */
/* -------------------------------------------------------------- */
/* first time that j has been visited */
Flag [j] = k ;
/* cheap assignment: find the next unmatched row in col j. This
* loop takes at most O(nnz(A)) time for the sum total of all
* calls to augment. */
for (p = Cheap [j] ; p < pend && !found ; p++)
{
i = Ai [p] ;
found = (Match [i] == EMPTY) ;
}
Cheap [j] = p ;
/* -------------------------------------------------------------- */
/* prepare for DFS */
if (found)
{
/* end of augmenting path, column j matched with row i */
Istack [head] = i ;
break ;
}
/* set Pstack [head] to the first entry in column j to scan */
Pstack [head] = Ap [j] ;
}
/* ------------------------------------------------------------------ */
/* quick return if too much work done */
/* ------------------------------------------------------------------ */
if (quick && *work > maxwork)
{
/* too much work has been performed; abort the search */
return (EMPTY) ;
}
/* ------------------------------------------------------------------ */
/* DFS for nodes adjacent to j */
/* ------------------------------------------------------------------ */
/* If cheap assignment not made, continue the depth-first search. All
* rows in column j are already matched. Add the adjacent nodes to the
* stack by iterating through until finding another non-visited node.
*
* It is the following loop that can force maxtrans to take
* O(n*nnz(A)) time. */
pstart = Pstack [head] ;
for (p = pstart ; p < pend ; p++)
{
i = Ai [p] ;
j2 = Match [i] ;
ASSERT (j2 != EMPTY) ;
if (Flag [j2] != k)
{
/* Node j2 is not yet visited, start a depth-first search on
* node j2. Keep track of where we left off in the scan of adj
* list of node j so we can restart j where we left off. */
Pstack [head] = p + 1 ;
/* Push j2 onto the stack and immediately break so we can
* recurse on node j2. Also keep track of row i which (if this
* search for an augmenting path works) will be matched with the
* current node j. */
Istack [head] = i ;
Jstack [++head] = j2 ;
break ;
}
}
/* ------------------------------------------------------------------ */
/* determine how much work was just performed */
/* ------------------------------------------------------------------ */
*work += (p - pstart + 1) ;
/* ------------------------------------------------------------------ */
/* node j is done, but the postwork is postponed - see below */
/* ------------------------------------------------------------------ */
if (p == pend)
{
/* If all adjacent nodes of j are already visited, pop j from
* stack and continue. We failed to find a match. */
head-- ;
}
}
/* postwork for all nodes j in the stack */
/* unwind the path and make the corresponding matches */
if (found)
{
for (p = head ; p >= 0 ; p--)
{
j = Jstack [p] ;
i = Istack [p] ;
/* -------------------------------------------------------------- */
/* postwork for node j */
/* -------------------------------------------------------------- */
/* if found, match row i with column j */
Match [i] = j ;
}
}
return (found) ;
}
/* ========================================================================== */
/* === maxtrans ============================================================= */
/* ========================================================================== */
Int BTF(maxtrans) /* returns # of columns in the matching */
(
/* --- input --- */
Int nrow, /* A is nrow-by-ncol in compressed column form */
Int ncol,
Int Ap [ ], /* size ncol+1 */
Int Ai [ ], /* size nz = Ap [ncol] */
double maxwork, /* do at most maxwork*nnz(A) work; no limit if <= 0. This
* work limit excludes the O(nnz(A)) cheap-match phase. */
/* --- output --- */
double *work, /* work = -1 if maxwork > 0 and the total work performed
* reached the maximum of maxwork*nnz(A).
* Otherwise, work = the total work performed. */
Int Match [ ], /* size nrow. Match [i] = j if column j matched to row i */
/* --- workspace --- */
Int Work [ ] /* size 5*ncol */
)
{
Int *Cheap, *Flag, *Istack, *Jstack, *Pstack ;
Int i, j, k, nmatch, work_limit_reached, result ;
/* ---------------------------------------------------------------------- */
/* get workspace and initialize */
/* ---------------------------------------------------------------------- */
Cheap = Work ; Work += ncol ;
Flag = Work ; Work += ncol ;
/* stack for non-recursive depth-first search in augment function */
Istack = Work ; Work += ncol ;
Jstack = Work ; Work += ncol ;
Pstack = Work ;
/* in column j, rows Ai [Ap [j] .. Cheap [j]-1] are known to be matched */
for (j = 0 ; j < ncol ; j++)
{
Cheap [j] = Ap [j] ;
Flag [j] = EMPTY ;
}
/* all rows and columns are currently unmatched */
for (i = 0 ; i < nrow ; i++)
{
Match [i] = EMPTY ;
}
if (maxwork > 0)
{
maxwork *= Ap [ncol] ;
}
*work = 0 ;
/* ---------------------------------------------------------------------- */
/* find a matching row for each column k */
/* ---------------------------------------------------------------------- */
nmatch = 0 ;
work_limit_reached = FALSE ;
for (k = 0 ; k < ncol ; k++)
{
/* find an augmenting path to match some row i to column k */
result = augment (k, Ap, Ai, Match, Cheap, Flag, Istack, Jstack, Pstack,
work, maxwork) ;
if (result == TRUE)
{
/* a match was found: Match [i] = k has been set for some row i */
nmatch++ ;
}
else if (result == EMPTY)
{
/* augment gave up because of too much work, and no match found */
work_limit_reached = TRUE ;
}
}
/* ---------------------------------------------------------------------- */
/* return the Match, and the # of matches made */
/* ---------------------------------------------------------------------- */
/* At this point, row i is matched to j = Match [i] if j >= 0. i is an
* unmatched row if Match [i] == EMPTY. */
if (work_limit_reached)
{
/* return -1 if the work limit of maxwork*nnz(A) was reached */
*work = EMPTY ;
}
return (nmatch) ;
}

132
src/maths/KLU/btf_order.c Normal file
View File

@ -0,0 +1,132 @@
/* ========================================================================== */
/* === BTF_ORDER ============================================================ */
/* ========================================================================== */
/* Find a permutation P and Q to permute a square sparse matrix into upper block
* triangular form. A(P,Q) will contain a zero-free diagonal if A has
* structural full-rank. Otherwise, the number of nonzeros on the diagonal of
* A(P,Q) will be maximized, and will equal the structural rank of A.
*
* Q[k] will be "flipped" if a zero-free diagonal was not found. Q[k] will be
* negative, and j = BTF_UNFLIP (Q [k]) gives the corresponding permutation.
*
* R defines the block boundaries of A(P,Q). The kth block consists of rows
* and columns R[k] to R[k+1]-1.
*
* If maxwork > 0 on input, then the work performed in btf_maxtrans is limited
* to maxwork*nnz(A) (excluding the "cheap match" phase, which can take another
* nnz(A) work). On output, the work parameter gives the actual work performed,
* or -1 if the limit was reached. In the latter case, the diagonal of A(P,Q)
* might not be zero-free, and the number of nonzeros on the diagonal of A(P,Q)
* might not be equal to the structural rank.
*
* See btf.h for more details.
*
* Copyright (c) 2004-2007. Tim Davis, University of Florida,
* with support from Sandia National Laboratories. All Rights Reserved.
*/
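/* Reading the outputs (added illustration): if nblocks is 2 with
* R = { 0, 2, 3 } for a 3-by-3 matrix, rows/columns 0..1 of A(P,Q) form the
* first block and row/column 2 forms the second; if every Q [k] >= 0, the
* diagonal of A(P,Q) is zero-free. */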
#include "btf.h"
#include "btf_internal.h"
/* This function only operates on square matrices (either structurally full-
* rank, or structurally rank deficient). */
Int BTF(order) /* returns number of blocks found */
(
/* input, not modified: */
Int n, /* A is n-by-n in compressed column form */
Int Ap [ ], /* size n+1 */
Int Ai [ ], /* size nz = Ap [n] */
double maxwork, /* do at most maxwork*nnz(A) work in the maximum
* transversal; no limit if <= 0 */
/* output, not defined on input */
double *work, /* work performed in maxtrans, or -1 if limit reached */
Int P [ ], /* size n, row permutation */
Int Q [ ], /* size n, column permutation */
Int R [ ], /* size n+1. block b is in rows/cols R[b] ... R[b+1]-1 */
Int *nmatch, /* # nonzeros on diagonal of P*A*Q */
/* workspace, not defined on input or output */
Int Work [ ] /* size 5n */
)
{
Int *Flag ;
Int nblocks, i, j, nbadcol ;
/* ---------------------------------------------------------------------- */
/* compute the maximum matching */
/* ---------------------------------------------------------------------- */
/* if maxwork > 0, then a maximum matching might not be found */
*nmatch = BTF(maxtrans) (n, n, Ap, Ai, maxwork, work, Q, Work) ;
/* ---------------------------------------------------------------------- */
/* complete permutation if the matrix is structurally singular */
/* ---------------------------------------------------------------------- */
/* Since the matrix is square, ensure BTF_UNFLIP(Q[0..n-1]) is a
* permutation of the columns of A so that A has as many nonzeros on the
* diagonal as possible.
*/
if (*nmatch < n)
{
/* get a size-n work array */
Flag = Work + n ;
for (j = 0 ; j < n ; j++)
{
Flag [j] = 0 ;
}
/* flag all matched columns */
for (i = 0 ; i < n ; i++)
{
j = Q [i] ;
if (j != EMPTY)
{
/* row i and column j are matched to each other */
Flag [j] = 1 ;
}
}
/* make a list of all unmatched columns, in Work [0..nbadcol-1] */
nbadcol = 0 ;
for (j = n-1 ; j >= 0 ; j--)
{
if (!Flag [j])
{
/* j is matched to nobody */
Work [nbadcol++] = j ;
}
}
ASSERT (*nmatch + nbadcol == n) ;
/* make an assignment for each unmatched row */
for (i = 0 ; i < n ; i++)
{
if (Q [i] == EMPTY && nbadcol > 0)
{
/* get an unmatched column j */
j = Work [--nbadcol] ;
/* assign j to row i and flag the entry by "flipping" it */
Q [i] = BTF_FLIP (j) ;
}
}
}
/* The permutation of a square matrix can be recovered as follows: Row i is
* matched with column j, where j = BTF_UNFLIP (Q [i]) and where j
* will always be in the valid range 0 to n-1. The entry A(i,j) is zero
* if BTF_ISFLIPPED (Q [i]) is true, and nonzero otherwise. nmatch
* is the number of entries in the Q array that are non-negative. */
/* ---------------------------------------------------------------------- */
/* find the strongly connected components */
/* ---------------------------------------------------------------------- */
nblocks = BTF(strongcomp) (n, Ap, Ai, Q, P, R, Work) ;
return (nblocks) ;
}

593
src/maths/KLU/btf_strongcomp.c Normal file
View File

@ -0,0 +1,593 @@
/* ========================================================================== */
/* === BTF_STRONGCOMP ======================================================= */
/* ========================================================================== */
/* Finds the strongly connected components of a graph, or equivalently, permutes
* the matrix into upper block triangular form. See btf.h for more details.
* Input matrix and Q are not checked on input.
*
* Copyright (c) 2004-2007. Tim Davis, University of Florida,
* with support from Sandia National Laboratories. All Rights Reserved.
*/
#include "btf.h"
#include "btf_internal.h"
#define UNVISITED (-2) /* Flag [j] = UNVISITED if node j not visited yet */
#define UNASSIGNED (-1) /* Flag [j] = UNASSIGNED if node j has been visited,
* but not yet assigned to a strongly-connected
* component (aka block). Flag [j] = k (k in the
* range 0 to nblocks-1) if node j has been visited
* (and completed, with its postwork done) and
* assigned to component k. */
/* This file contains two versions of the depth-first-search, a recursive one
* and a non-recursive one. By default, the non-recursive one is used. */
#ifndef RECURSIVE
/* ========================================================================== */
/* === dfs: non-recursive version (default) ================================= */
/* ========================================================================== */
/* Perform a depth-first-search of a graph, stored in an adjacency-list form.
* The row indices of column j (equivalently, the out-adjacency list of node j)
* are stored in Ai [Ap[j] ... Ap[j+1]-1]. Self-edges (diagonal entries) are
* ignored. Ap[0] must be zero, and thus nz = Ap[n] is the number of entries
* in the matrix (or edges in the graph). The row indices in each column need
* not be in any particular order. If an input column permutation is given,
* node j (in the permuted matrix A*Q) is located in
* Ai [Ap[Q[j]] ... Ap[Q[j]+1]-1]. This Q can be the same as the Match array
* output from the maxtrans routine, for a square matrix that is structurally
* full rank.
*
* The algorithm is from the paper by Robert E. Tarjan, "Depth-first search and
* linear graph algorithms," SIAM Journal on Computing, vol. 1, no. 2,
* pp. 146-160, 1972. The time taken by strongcomp is O(nnz(A)).
*
* See also MC13A/B in the Harwell subroutine library (Iain S. Duff and John
* K. Reid, "Algorithm 529: permutations to block triangular form," ACM Trans.
* on Mathematical Software, vol. 4, no. 2, pp. 189-192, 1978, and "An
* implementation of Tarjan's algorithm for the block triangular form of a
* matrix," same journal, pp. 137-147. This code is implements the same
* algorithm as MC13A/B, except that the data structures are very different.
* Also, unlike MC13A/B, the output permutation preserves the natural ordering
* within each block.
*/
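/* Small illustration (hypothetical 3-by-3 pattern, added for clarity): with
* Ap = { 0, 1, 2, 3 } and Ai = { 1, 0, 2 }, node 0 points to node 1, node 1
* points back to node 0, and node 2 carries only a self-edge (ignored). The
* strongly connected components are {0,1} and {2}, so the permuted matrix has
* two diagonal blocks. */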
static void dfs
(
/* inputs, not modified on output: */
Int j, /* start the DFS at node j */
Int Ap [ ], /* size n+1, column pointers for the matrix A */
Int Ai [ ], /* row indices, size nz = Ap [n] */
Int Q [ ], /* input column permutation */
/* inputs, modified on output (each array is of size n): */
Int Time [ ], /* Time [j] = "time" that node j was first visited */
Int Flag [ ], /* Flag [j]: see above */
Int Low [ ], /* Low [j]: see definition below */
Int *p_nblocks, /* number of blocks (aka strongly-connected-comp.)*/
Int *p_timestamp, /* current "time" */
/* workspace, not defined on input or output: */
Int Cstack [ ], /* size n, output stack to hold nodes of components */
Int Jstack [ ], /* size n, stack for the variable j */
Int Pstack [ ] /* size n, stack for the variable p */
)
{
/* ---------------------------------------------------------------------- */
/* local variables, and initializations */
/* ---------------------------------------------------------------------- */
/* local variables, but "global" to all DFS levels: */
Int chead ; /* top of Cstack */
Int jhead ; /* top of Jstack and Pstack */
/* variables that are purely local to any one DFS level: */
Int i ; /* edge (j,i) considered; i can be next node to traverse */
Int parent ; /* parent of node j in the DFS tree */
Int pend ; /* one past the end of the adjacency list for node j */
Int jj ; /* column j of A*Q is column jj of the input matrix A */
/* variables that need to be pushed then popped from the stack: */
Int p ; /* current index into the adj. list for node j */
/* the variables j and p are stacked in Jstack and Pstack */
/* local copies of variables in the calling routine */
Int nblocks = *p_nblocks ;
Int timestamp = *p_timestamp ;
/* ---------------------------------------------------------------------- */
/* start a DFS at node j (same as the recursive call dfs (EMPTY, j)) */
/* ---------------------------------------------------------------------- */
chead = 0 ; /* component stack is empty */
jhead = 0 ; /* Jstack and Pstack are empty */
Jstack [0] = j ; /* put the first node j on the Jstack */
ASSERT (Flag [j] == UNVISITED) ;
while (jhead >= 0)
{
j = Jstack [jhead] ; /* grab the node j from the top of Jstack */
/* determine which column jj of A is column j of A*Q */
jj = (Q == (Int *) NULL) ? (j) : (BTF_UNFLIP (Q [j])) ;
pend = Ap [jj+1] ; /* j's row index list ends at Ai [pend-1] */
if (Flag [j] == UNVISITED)
{
/* -------------------------------------------------------------- */
/* prework at node j */
/* -------------------------------------------------------------- */
/* node j is being visited for the first time */
Cstack [++chead] = j ; /* push j onto the stack */
timestamp++ ; /* get a timestamp */
Time [j] = timestamp ; /* give the timestamp to node j */
Low [j] = timestamp ;
Flag [j] = UNASSIGNED ; /* flag node j as visited */
/* -------------------------------------------------------------- */
/* set Pstack [jhead] to the first entry in column j to scan */
/* -------------------------------------------------------------- */
Pstack [jhead] = Ap [jj] ;
}
/* ------------------------------------------------------------------ */
/* DFS rooted at node j (start it, or continue where left off) */
/* ------------------------------------------------------------------ */
for (p = Pstack [jhead] ; p < pend ; p++)
{
i = Ai [p] ; /* examine the edge from node j to node i */
if (Flag [i] == UNVISITED)
{
/* Node i has not been visited - start a DFS at node i.
* Keep track of where we left off in the scan of adjacency list
* of node j so we can restart j where we left off. */
Pstack [jhead] = p + 1 ;
/* Push i onto the stack and immediately break
* so we can recurse on node i. */
Jstack [++jhead] = i ;
ASSERT (Time [i] == EMPTY) ;
ASSERT (Low [i] == EMPTY) ;
/* break here to do what the recursive call dfs (j,i) does */
break ;
}
else if (Flag [i] == UNASSIGNED)
{
/* Node i has been visited, but still unassigned to a block
* this is a back or cross edge if Time [i] < Time [j].
* Note that i might equal j, in which case this code does
* nothing. */
ASSERT (Time [i] > 0) ;
ASSERT (Low [i] > 0) ;
Low [j] = MIN (Low [j], Time [i]) ;
}
}
if (p == pend)
{
/* If all adjacent nodes of j are already visited, pop j from
* Jstack and do the post work for node j. This also pops p
* from the Pstack. */
jhead-- ;
/* -------------------------------------------------------------- */
/* postwork at node j */
/* -------------------------------------------------------------- */
/* determine if node j is the head of a component */
if (Low [j] == Time [j])
{
/* pop all nodes in this SCC from Cstack */
while (TRUE)
{
ASSERT (chead >= 0) ; /* stack not empty (j in it) */
i = Cstack [chead--] ; /* pop a node from the Cstack */
ASSERT (i >= 0) ;
ASSERT (Flag [i] == UNASSIGNED) ;
Flag [i] = nblocks ; /* assign i to current block */
if (i == j) break ; /* current block ends at j */
}
nblocks++ ; /* one more block has been found */
}
/* update Low [parent], if the parent exists */
if (jhead >= 0)
{
parent = Jstack [jhead] ;
Low [parent] = MIN (Low [parent], Low [j]) ;
}
}
}
/* ---------------------------------------------------------------------- */
/* cleanup: update timestamp and nblocks */
/* ---------------------------------------------------------------------- */
*p_timestamp = timestamp ;
*p_nblocks = nblocks ;
}
#else
/* ========================================================================== */
/* === dfs: recursive version (only for illustration) ======================= */
/* ========================================================================== */
/* The following is a recursive version of dfs, which computes results identical
* to those of the non-recursive dfs. It is included here because it is easier to read.
* Compare the comments in the code below with the identical comments in the
* non-recursive code above, and that will help you see the correlation between
* the two routines.
*
* This routine can cause stack overflow, and is thus not recommended for heavy
* usage, particularly for large matrices. To help in delaying stack overflow,
* global variables are used, reducing the amount of information each call to
* dfs places on the call/return stack (the integers i, j, p, parent, and the
* return address). Note that this means the recursive code is not thread-safe.
* To try this version, compile the code with -DRECURSIVE or include the
* following line at the top of this file:
#define RECURSIVE
*/
static Int /* for recursive illustration only, not for production use */
chead, timestamp, nblocks, n, *Ap, *Ai, *Flag, *Cstack, *Time, *Low,
*P, *R, *Q ;
static void dfs
(
Int parent, /* came from parent node */
Int j /* at node j in the DFS */
)
{
Int p ; /* current index into the adj. list for node j */
Int i ; /* edge (j,i) considered; i can be next node to traverse */
Int jj ; /* column j of A*Q is column jj of the input matrix A */
/* ---------------------------------------------------------------------- */
/* prework at node j */
/* ---------------------------------------------------------------------- */
/* node j is being visited for the first time */
Cstack [++chead] = j ; /* push j onto the stack */
timestamp++ ; /* get a timestamp */
Time [j] = timestamp ; /* give the timestamp to node j */
Low [j] = timestamp ;
Flag [j] = UNASSIGNED ; /* flag node j as visited */
/* ---------------------------------------------------------------------- */
/* DFS rooted at node j */
/* ---------------------------------------------------------------------- */
/* determine which column jj of the input matrix A is column j of A*Q */
jj = (Q == (Int *) NULL) ? (j) : (BTF_UNFLIP (Q [j])) ;
for (p = Ap [jj] ; p < Ap [jj+1] ; p++)
{
i = Ai [p] ; /* examine the edge from node j to node i */
if (Flag [i] == UNVISITED)
{
/* Node i has not been visited - start a DFS at node i. */
dfs (j, i) ;
}
else if (Flag [i] == UNASSIGNED)
{
/* Node i has been visited, but is still unassigned to a block;
* this is a back or cross edge if Time [i] < Time [j].
* Note that i might equal j, in which case this code does
* nothing. */
Low [j] = MIN (Low [j], Time [i]) ;
}
}
/* ---------------------------------------------------------------------- */
/* postwork at node j */
/* ---------------------------------------------------------------------- */
/* determine if node j is the head of a component */
if (Low [j] == Time [j])
{
/* pop all nodes in this strongly connected component from Cstack */
while (TRUE)
{
i = Cstack [chead--] ; /* pop a node from the Cstack */
Flag [i] = nblocks ; /* assign node i to current block */
if (i == j) break ; /* current block ends at node j */
}
nblocks++ ; /* one more block has been found */
}
/* update Low [parent] */
if (parent != EMPTY)
{
/* Note that this could be done with Low[j] = MIN(Low[j],Low[i]) just
* after the dfs (j,i) statement above, and then parent would not have
* to be an input argument. Putting it here places all the postwork
* for node j in one place, thus making the non-recursive DFS easier. */
Low [parent] = MIN (Low [parent], Low [j]) ;
}
}
#endif
/* ========================================================================== */
/* === btf_strongcomp ======================================================= */
/* ========================================================================== */
#ifndef RECURSIVE
Int BTF(strongcomp) /* return # of strongly connected components */
(
/* input, not modified: */
Int n, /* A is n-by-n in compressed column form */
Int Ap [ ], /* size n+1 */
Int Ai [ ], /* size nz = Ap [n] */
/* optional input, modified (if present) on output: */
Int Q [ ], /* size n, input column permutation. The permutation Q can
* include a flag which indicates an unmatched row.
* jold = BTF_UNFLIP (Q [jnew]) is the permutation;
* this function ignores these flags. On output, it is
* modified according to the permutation P. */
/* output, not defined on input: */
Int P [ ], /* size n. P [k] = j if row and column j are kth row/col
* in permuted matrix. */
Int R [ ], /* size n+1. kth block is in rows/cols R[k] ... R[k+1]-1
* of the permuted matrix. */
/* workspace, not defined on input or output: */
Int Work [ ] /* size 4n */
)
#else
Int BTF(strongcomp) /* recursive version - same as above except for Work size */
(
Int n_in,
Int Ap_in [ ],
Int Ai_in [ ],
Int Q_in [ ],
Int P_in [ ],
Int R_in [ ],
Int Work [ ] /* size 2n */
)
#endif
{
Int j, k, b ;
#ifndef RECURSIVE
Int timestamp, nblocks, *Flag, *Cstack, *Time, *Low, *Jstack, *Pstack ;
#else
n = n_in ;
Ap = Ap_in ;
Ai = Ai_in ;
Q = Q_in ;
P = P_in ;
R = R_in ;
chead = EMPTY ;
#endif
/* ---------------------------------------------------------------------- */
/* get and initialize workspace */
/* ---------------------------------------------------------------------- */
/* timestamp is incremented each time a new node is visited.
*
* Time [j] is the timestamp given to node j.
*
* Low [j] is the lowest timestamp of any node reachable from j via either
* a path to any descendant of j in the DFS tree, or via a single edge to
* either an ancestor (a back edge) or another node that is neither an
* ancestor nor a descendant (a cross edge). If Low [j] is equal to
* the timestamp of node j (Time [j]), then node j is the "head" of a
* strongly connected component (SCC). That is, it is the first node
* visited in its strongly connected component, and the DFS subtree rooted
* at node j spans all the nodes of the strongly connected component.
*
* The terms "block" and "component" are used interchangeably in this code;
* "block" being a matrix term and "component" being a graph term for the
* same thing.
*
* When a node is visited, it is placed on the Cstack (for "component"
* stack). When node j is found to be an SCC head, all the nodes from the
* top of the stack to node j itself form the nodes in the SCC. This Cstack
* is used for both the recursive and non-recursive versions.
*/
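/* As an illustrative sketch (hypothetical graph, not part of the original
* comments): suppose a 3-node block forms the single cycle 0 -> 1 -> 2 -> 0
* and the DFS starts at node 0. Then Time [0..2] = 1, 2, 3. The edge 2 -> 0
* is a back edge, so Low [2] = MIN (Low [2], Time [0]) = 1, and the postwork
* propagates Low [1] = 1 and Low [0] = 1. Only node 0 satisfies
* Low [j] == Time [j], so it is the head of the SCC, and popping the Cstack
* down to node 0 assigns all three nodes to the same block. */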
Time = Work ; Work += n ;
Flag = Work ; Work += n ;
Low = P ; /* use output array P as workspace for Low */
Cstack = R ; /* use output array R as workspace for Cstack */
#ifndef RECURSIVE
/* stack for non-recursive dfs */
Jstack = Work ; Work += n ; /* stack for j */
Pstack = Work ; /* stack for p */
#endif
for (j = 0 ; j < n ; j++)
{
Flag [j] = UNVISITED ;
Low [j] = EMPTY ;
Time [j] = EMPTY ;
#ifndef NDEBUG
Cstack [j] = EMPTY ;
#ifndef RECURSIVE
Jstack [j] = EMPTY ;
Pstack [j] = EMPTY ;
#endif
#endif
}
timestamp = 0 ; /* each node given a timestamp when it is visited */
nblocks = 0 ; /* number of blocks found so far */
/* ---------------------------------------------------------------------- */
/* find the strongly connected components via a depth-first search */
/* ---------------------------------------------------------------------- */
for (j = 0 ; j < n ; j++)
{
/* node j is unvisited or assigned to a block. Cstack is empty. */
ASSERT (Flag [j] == UNVISITED || (Flag [j] >= 0 && Flag [j] < nblocks));
if (Flag [j] == UNVISITED)
{
#ifndef RECURSIVE
/* non-recursive dfs (default) */
dfs (j, Ap, Ai, Q, Time, Flag, Low, &nblocks, &timestamp,
Cstack, Jstack, Pstack) ;
#else
/* recursive dfs (for illustration only) */
ASSERT (chead == EMPTY) ;
dfs (EMPTY, j) ;
ASSERT (chead == EMPTY) ;
#endif
}
}
ASSERT (timestamp == n) ;
/* ---------------------------------------------------------------------- */
/* construct the block boundary array, R */
/* ---------------------------------------------------------------------- */
for (b = 0 ; b < nblocks ; b++)
{
R [b] = 0 ;
}
for (j = 0 ; j < n ; j++)
{
/* node j has been assigned to block b = Flag [j] */
ASSERT (Time [j] > 0 && Time [j] <= n) ;
ASSERT (Low [j] > 0 && Low [j] <= n) ;
ASSERT (Flag [j] >= 0 && Flag [j] < nblocks) ;
R [Flag [j]]++ ;
}
/* R [b] is now the number of nodes in block b. Compute cumulative sum
* of R, using Time [0 ... nblocks-1] as workspace. */
Time [0] = 0 ;
for (b = 1 ; b < nblocks ; b++)
{
Time [b] = Time [b-1] + R [b-1] ;
}
for (b = 0 ; b < nblocks ; b++)
{
R [b] = Time [b] ;
}
R [nblocks] = n ;
/* ---------------------------------------------------------------------- */
/* construct the permutation, preserving the natural order */
/* ---------------------------------------------------------------------- */
#ifndef NDEBUG
for (k = 0 ; k < n ; k++)
{
P [k] = EMPTY ;
}
#endif
for (j = 0 ; j < n ; j++)
{
/* place column j in the permutation */
P [Time [Flag [j]]++] = j ;
}
#ifndef NDEBUG
for (k = 0 ; k < n ; k++)
{
ASSERT (P [k] != EMPTY) ;
}
#endif
/* Now block b consists of the nodes k1 to k2-1 in the permuted matrix,
* where k1 = R [b] and k2 = R [b+1]. Row and column P [k] of the original
* matrix becomes row and column k of the permuted matrix. The set of
* rows/columns (nodes) in block b is given by P [k1 ... k2-1], and this
* set is sorted in ascending order. Thus, if the matrix consists of just
* one block, P is the identity permutation. */
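/* A small hypothetical example (for illustration only): if n = 5 and two
* blocks were found, with R = { 0, 3, 5 }, then block 0 is the 3-by-3
* submatrix in rows/columns P [0..2] of the original matrix, and block 1 is
* the 2-by-2 submatrix in rows/columns P [3..4]. */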
/* ---------------------------------------------------------------------- */
/* if Q is present on input, set Q = Q*P' */
/* ---------------------------------------------------------------------- */
if (Q != (Int *) NULL)
{
/* We found a symmetric permutation P for the matrix A*Q. The overall
* permutation is thus P*(A*Q)*P'. Set Q=Q*P' so that the final
* permutation is P*A*Q. Use Time as workspace. Note that this
* preserves the negative values of Q if the matrix is structurally
* singular. */
for (k = 0 ; k < n ; k++)
{
Time [k] = Q [P [k]] ;
}
for (k = 0 ; k < n ; k++)
{
Q [k] = Time [k] ;
}
}
/* ---------------------------------------------------------------------- */
/* how to traverse the permuted matrix */
/* ---------------------------------------------------------------------- */
/* If Q is not present, the following code can be used to traverse the
* permuted matrix P*A*P'
*
* // compute the inverse of P
* for (knew = 0 ; knew < n ; knew++)
* {
* // row and column kold in the old matrix is row/column knew
* // in the permuted matrix P*A*P'
* kold = P [knew] ;
* Pinv [kold] = knew ;
* }
* for (b = 0 ; b < nblocks ; b++)
* {
* // traverse block b of the permuted matrix P*A*P'
* k1 = R [b] ;
* k2 = R [b+1] ;
* nk = k2 - k1 ;
* for (jnew = k1 ; jnew < k2 ; jnew++)
* {
* jold = P [jnew] ;
* for (p = Ap [jold] ; p < Ap [jold+1] ; p++)
* {
* iold = Ai [p] ;
* inew = Pinv [iold] ;
* // Entry in the old matrix is A (iold, jold), and its
* // position in the new matrix P*A*P' is (inew, jnew).
* // Let B be the bth diagonal block of the permuted
* // matrix. If inew >= k1, then this entry is in row/
* // column (inew-k1, jnew-k1) of the nk-by-nk matrix B.
* // Otherwise, the entry is in the upper block triangular
* // part, not in any diagonal block.
* }
* }
* }
*
* If Q is present, replace the above statement
* jold = P [jnew] ;
* with
* jold = Q [jnew] ;
* or
* jold = BTF_UNFLIP (Q [jnew]) ;
*
* then entry A (iold,jold) in the old (unpermuted) matrix is at (inew,jnew)
* in the permuted matrix P*A*Q. Everything else remains the same as the
* above (simply replace P*A*P' with P*A*Q in the above comments).
*/
/* ---------------------------------------------------------------------- */
/* return # of blocks / # of strongly connected components */
/* ---------------------------------------------------------------------- */
return (nblocks) ;
}

3611
src/maths/KLU/colamd.c Normal file

File diff suppressed because it is too large

255
src/maths/KLU/colamd.h Normal file
View File

@ -0,0 +1,255 @@
/* ========================================================================== */
/* === colamd/symamd prototypes and definitions ============================= */
/* ========================================================================== */
/* COLAMD / SYMAMD include file
You must include this file (colamd.h) in any routine that uses colamd,
symamd, or the related macros and definitions.
Authors:
The authors of the code itself are Stefan I. Larimore and Timothy A.
Davis (davis at cise.ufl.edu), University of Florida. The algorithm was
developed in collaboration with John Gilbert, Xerox PARC, and Esmond
Ng, Oak Ridge National Laboratory.
Acknowledgements:
This work was supported by the National Science Foundation, under
grants DMS-9504974 and DMS-9803599.
Notice:
Copyright (c) 1998-2007, Timothy A. Davis, All Rights Reserved.
THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
Permission is hereby granted to use, copy, modify, and/or distribute
this program, provided that the Copyright, this License, and the
Availability of the original version is retained on all copies and made
accessible to the end-user of any code or package that includes COLAMD
or any modified version of COLAMD.
Availability:
The colamd/symamd library is available at
http://www.cise.ufl.edu/research/sparse/colamd/
This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
file. It is required by the colamd.c, colamdmex.c, and symamdmex.c
files, and by any C code that calls the routines whose prototypes are
listed below, or that uses the colamd/symamd definitions listed below.
*/
#ifndef COLAMD_H
#define COLAMD_H
/* make it easy for C++ programs to include COLAMD */
#ifdef __cplusplus
extern "C" {
#endif
/* ========================================================================== */
/* === Include files ======================================================== */
/* ========================================================================== */
#include <stdlib.h>
/* ========================================================================== */
/* === COLAMD version ======================================================= */
/* ========================================================================== */
/* COLAMD Version 2.4 and later will include the following definitions.
* As an example, to test if the version you are using is 2.4 or later:
*
* #ifdef COLAMD_VERSION
* if (COLAMD_VERSION >= COLAMD_VERSION_CODE (2,4)) ...
* #endif
*
* This also works during compile-time:
*
* #if defined(COLAMD_VERSION) && (COLAMD_VERSION >= COLAMD_VERSION_CODE (2,4))
* printf ("This is version 2.4 or later\n") ;
* #else
* printf ("This is an early version\n") ;
* #endif
*
* Versions 2.3 and earlier of COLAMD do not include a #define'd version number.
*/
#define COLAMD_DATE "Dec 7, 2011"
#define COLAMD_VERSION_CODE(main,sub) ((main) * 1000 + (sub))
#define COLAMD_MAIN_VERSION 2
#define COLAMD_SUB_VERSION 7
#define COLAMD_SUBSUB_VERSION 4
#define COLAMD_VERSION \
COLAMD_VERSION_CODE(COLAMD_MAIN_VERSION,COLAMD_SUB_VERSION)
/* ========================================================================== */
/* === Knob and statistics definitions ====================================== */
/* ========================================================================== */
/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */
#define COLAMD_KNOBS 20
/* number of output statistics. Only stats [0..6] are currently used. */
#define COLAMD_STATS 20
/* knobs [0] and stats [0]: dense row knob and output statistic. */
#define COLAMD_DENSE_ROW 0
/* knobs [1] and stats [1]: dense column knob and output statistic. */
#define COLAMD_DENSE_COL 1
/* knobs [2]: aggressive absorption */
#define COLAMD_AGGRESSIVE 2
/* stats [2]: memory defragmentation count output statistic */
#define COLAMD_DEFRAG_COUNT 2
/* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */
#define COLAMD_STATUS 3
/* stats [4..6]: error info, or info on jumbled columns */
#define COLAMD_INFO1 4
#define COLAMD_INFO2 5
#define COLAMD_INFO3 6
/* error codes returned in stats [3]: */
#define COLAMD_OK (0)
#define COLAMD_OK_BUT_JUMBLED (1)
#define COLAMD_ERROR_A_not_present (-1)
#define COLAMD_ERROR_p_not_present (-2)
#define COLAMD_ERROR_nrow_negative (-3)
#define COLAMD_ERROR_ncol_negative (-4)
#define COLAMD_ERROR_nnz_negative (-5)
#define COLAMD_ERROR_p0_nonzero (-6)
#define COLAMD_ERROR_A_too_small (-7)
#define COLAMD_ERROR_col_length_negative (-8)
#define COLAMD_ERROR_row_index_out_of_bounds (-9)
#define COLAMD_ERROR_out_of_memory (-10)
#define COLAMD_ERROR_internal_error (-999)
/* ========================================================================== */
/* === Prototypes of user-callable routines ================================= */
/* ========================================================================== */
/* define UF_long */
#include "UFconfig.h"
size_t colamd_recommended /* returns recommended value of Alen, */
/* or 0 if input arguments are erroneous */
(
int nnz, /* nonzeros in A */
int n_row, /* number of rows in A */
int n_col /* number of columns in A */
) ;
size_t colamd_l_recommended /* returns recommended value of Alen, */
/* or 0 if input arguments are erroneous */
(
UF_long nnz, /* nonzeros in A */
UF_long n_row, /* number of rows in A */
UF_long n_col /* number of columns in A */
) ;
void colamd_set_defaults /* sets default parameters */
( /* knobs argument is modified on output */
double knobs [COLAMD_KNOBS] /* parameter settings for colamd */
) ;
void colamd_l_set_defaults /* sets default parameters */
( /* knobs argument is modified on output */
double knobs [COLAMD_KNOBS] /* parameter settings for colamd */
) ;
int colamd /* returns (1) if successful, (0) otherwise*/
( /* A and p arguments are modified on output */
int n_row, /* number of rows in A */
int n_col, /* number of columns in A */
int Alen, /* size of the array A */
int A [], /* row indices of A, of size Alen */
int p [], /* column pointers of A, of size n_col+1 */
double knobs [COLAMD_KNOBS],/* parameter settings for colamd */
int stats [COLAMD_STATS] /* colamd output statistics and error codes */
) ;
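/* A minimal usage sketch (illustrative only; nnz, n_row, n_col, A, p, Alen,
* and ok are placeholders for the caller's data). A must be allocated with
* at least colamd_recommended (nnz, n_row, n_col) entries and holds the row
* indices in A [0..nnz-1]; p holds the n_col+1 column pointers. Both arrays
* are overwritten, and on success p [0..n_col-1] holds the column ordering:
*
* double knobs [COLAMD_KNOBS] ;
* int stats [COLAMD_STATS] ;
* colamd_set_defaults (knobs) ;
* Alen = colamd_recommended (nnz, n_row, n_col) ;
* ok = colamd (n_row, n_col, Alen, A, p, knobs, stats) ;
* if (!ok) colamd_report (stats) ;
*/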
UF_long colamd_l /* returns (1) if successful, (0) otherwise*/
( /* A and p arguments are modified on output */
UF_long n_row, /* number of rows in A */
UF_long n_col, /* number of columns in A */
UF_long Alen, /* size of the array A */
UF_long A [], /* row indices of A, of size Alen */
UF_long p [], /* column pointers of A, of size n_col+1 */
double knobs [COLAMD_KNOBS],/* parameter settings for colamd */
UF_long stats [COLAMD_STATS]/* colamd output statistics and error codes */
) ;
int symamd /* return (1) if OK, (0) otherwise */
(
int n, /* number of rows and columns of A */
int A [], /* row indices of A */
int p [], /* column pointers of A */
int perm [], /* output permutation, size n_col+1 */
double knobs [COLAMD_KNOBS], /* parameters (uses defaults if NULL) */
int stats [COLAMD_STATS], /* output statistics and error codes */
void * (*allocate) (size_t, size_t),
/* pointer to calloc (ANSI C) or */
/* mxCalloc (for MATLAB mexFunction) */
void (*release) (void *)
/* pointer to free (ANSI C) or */
/* mxFree (for MATLAB mexFunction) */
) ;
UF_long symamd_l /* return (1) if OK, (0) otherwise */
(
UF_long n, /* number of rows and columns of A */
UF_long A [], /* row indices of A */
UF_long p [], /* column pointers of A */
UF_long perm [], /* output permutation, size n_col+1 */
double knobs [COLAMD_KNOBS], /* parameters (uses defaults if NULL) */
UF_long stats [COLAMD_STATS], /* output statistics and error codes */
void * (*allocate) (size_t, size_t),
/* pointer to calloc (ANSI C) or */
/* mxCalloc (for MATLAB mexFunction) */
void (*release) (void *)
/* pointer to free (ANSI C) or */
/* mxFree (for MATLAB mexFunction) */
) ;
void colamd_report
(
int stats [COLAMD_STATS]
) ;
void colamd_l_report
(
UF_long stats [COLAMD_STATS]
) ;
void symamd_report
(
int stats [COLAMD_STATS]
) ;
void symamd_l_report
(
UF_long stats [COLAMD_STATS]
) ;
#ifndef EXTERN
#define EXTERN extern
#endif
EXTERN int (*colamd_printf) (const char *, ...) ;
#ifdef __cplusplus
}
#endif
#endif /* COLAMD_H */

24
src/maths/KLU/colamd_global.c Normal file
View File

@ -0,0 +1,24 @@
/* ========================================================================== */
/* === colamd_global.c ====================================================== */
/* ========================================================================== */
/* ----------------------------------------------------------------------------
* COLAMD, Copyright (C) 2007, Timothy A. Davis.
* See License.txt for the Version 2.1 of the GNU Lesser General Public License
* http://www.cise.ufl.edu/research/sparse
* -------------------------------------------------------------------------- */
/* Global variables for COLAMD */
#ifndef NPRINT
#ifdef MATLAB_MEX_FILE
#include "mex.h"
int (*colamd_printf) (const char *, ...) = mexPrintf ;
#else
#include <stdio.h>
int (*colamd_printf) (const char *, ...) = printf ;
#endif
#else
int (*colamd_printf) (const char *, ...) = ((void *) 0) ;
#endif

773
src/maths/KLU/klu.c Normal file
View File

@ -0,0 +1,773 @@
/* ========================================================================== */
/* === klu ================================================================== */
/* ========================================================================== */
/* KLU: factorizes P*A into L*U, using the Gilbert-Peierls algorithm [1], with
* optional symmetric pruning by Eisenstat and Liu [2]. The code is by Tim
* Davis. This algorithm is what appears as the default sparse LU routine in
* MATLAB version 6.0, and still appears in MATLAB 6.5 as [L,U,P] = lu (A).
* Note that no column ordering is provided (see COLAMD or AMD for suitable
* orderings). SuperLU is based on this algorithm, except that it adds the
* use of dense matrix operations on "supernodes" (adjacent columns with
* identical nonzero pattern). This code doesn't use supernodes, thus its name ("Kent" LU,
* as in "Clark Kent", in contrast with Super-LU...). This algorithm is slower
* than SuperLU and UMFPACK for large matrices with lots of nonzeros in their
* factors (such as for most finite-element problems). However, for matrices
* with very sparse LU factors, this algorithm is typically faster than both
* SuperLU and UMFPACK, since in this case there is little chance to exploit
* dense matrix kernels (the BLAS).
*
* Only one block of A is factorized, in the BTF form. The input n is the
* size of the block; k1 is the first row and column in the block.
*
* NOTE: no error checking is done on the inputs. This version is not meant to
* be called directly by the user. Use klu_factor instead.
*
* No fill-reducing ordering is provided. The ordering quality of
* klu_kernel_factor is the responsibility of the caller. The input A must be
* pre-permuted to reduce fill-in, or a fill-reducing input permutation Q must
* be provided.
*
* The input matrix A must be in compressed-column form, with either sorted
* or unsorted row indices. Row indices for column j of A is in
* Ai [Ap [j] ... Ap [j+1]-1] and the same range of indices in Ax holds the
* numerical values. No duplicate entries are allowed.
*
* Copyright 2004-2009, Tim Davis. All rights reserved. See the README
* file for details on permitted use. Note that no code from The MathWorks,
* Inc, or from SuperLU, or from any other source appears here. The code is
* written from scratch, from the algorithmic description in Gilbert & Peierls'
* and Eisenstat & Liu's journal papers [1,2].
*
* If an input permutation Q is provided, the factorization L*U = A (P,Q)
* is computed, where P is determined by partial pivoting, and Q is the input
* ordering. If the pivot tolerance is less than 1, the "diagonal" entry that
* KLU attempts to choose is the diagonal of A (Q,Q). In other words, the
* input permutation is applied symmetrically to the input matrix. The output
* permutation P includes both the partial pivoting ordering and the input
* permutation. If Q is NULL, then it is assumed to be the identity
* permutation. Q is not modified.
*
* [1] Gilbert, J. R. and Peierls, T., "Sparse Partial Pivoting in Time
* Proportional to Arithmetic Operations," SIAM J. Sci. Stat. Comp.,
* vol 9, pp. 862-874, 1988.
* [2] Eisenstat, S. C. and Liu, J. W. H., "Exploiting Structural Symmetry in
* Unsymmetric Sparse Symbolic Factorization," SIAM J. Matrix Analysis &
* Applic., vol 13, pp. 202-211, 1992.
*/
/* ========================================================================== */
#include "klu_internal.h"
size_t KLU_kernel_factor /* 0 if failure, size of LU if OK */
(
/* inputs, not modified */
Int n, /* A is n-by-n. n must be > 0. */
Int Ap [ ], /* size n+1, column pointers for A */
Int Ai [ ], /* size nz = Ap [n], row indices for A */
Entry Ax [ ], /* size nz, values of A */
Int Q [ ], /* size n, optional column permutation */
double Lsize, /* estimate of number of nonzeros in L */
/* outputs, not defined on input */
Unit **p_LU, /* row indices and values of L and U */
Entry Udiag [ ], /* size n, diagonal of U */
Int Llen [ ], /* size n, column length of L */
Int Ulen [ ], /* size n, column length of U */
Int Lip [ ], /* size n, column pointers for L */
Int Uip [ ], /* size n, column pointers for U */
Int P [ ], /* row permutation, size n */
Int *lnz, /* size of L */
Int *unz, /* size of U */
/* workspace, undefined on input */
Entry *X, /* size n double's, zero on output */
Int *Work, /* size 5n Int's */
/* inputs, not modified on output */
Int k1, /* the block of A is from k1 to k2-1 */
Int PSinv [ ], /* inverse of P from symbolic factorization */
double Rs [ ], /* scale factors for A */
/* inputs, modified on output */
Int Offp [ ], /* off-diagonal matrix (modified by this routine) */
Int Offi [ ],
Entry Offx [ ],
/* --------------- */
KLU_common *Common
)
{
double maxlnz, dunits ;
Unit *LU ;
Int *Pinv, *Lpend, *Stack, *Flag, *Ap_pos, *W ;
Int lsize, usize, anz, ok ;
size_t lusize ;
ASSERT (Common != NULL) ;
/* ---------------------------------------------------------------------- */
/* get control parameters, or use defaults */
/* ---------------------------------------------------------------------- */
n = MAX (1, n) ;
anz = Ap [n+k1] - Ap [k1] ;
if (Lsize <= 0)
{
Lsize = -Lsize ;
Lsize = MAX (Lsize, 1.0) ;
lsize = Lsize * anz + n ;
}
else
{
lsize = Lsize ;
}
usize = lsize ;
lsize = MAX (n+1, lsize) ;
usize = MAX (n+1, usize) ;
maxlnz = (((double) n) * ((double) n) + ((double) n)) / 2. ;
maxlnz = MIN (maxlnz, ((double) INT_MAX)) ;
lsize = MIN (maxlnz, lsize) ;
usize = MIN (maxlnz, usize) ;
PRINTF (("Welcome to klu: n %d anz %d k1 %d lsize %d usize %d maxlnz %g\n",
n, anz, k1, lsize, usize, maxlnz)) ;
/* ---------------------------------------------------------------------- */
/* allocate workspace and outputs */
/* ---------------------------------------------------------------------- */
/* return arguments are not yet assigned */
*p_LU = (Unit *) NULL ;
/* these computations are safe from size_t overflow */
W = Work ;
Pinv = (Int *) W ; W += n ;
Stack = (Int *) W ; W += n ;
Flag = (Int *) W ; W += n ;
Lpend = (Int *) W ; W += n ;
Ap_pos = (Int *) W ; W += n ;
dunits = DUNITS (Int, lsize) + DUNITS (Entry, lsize) +
DUNITS (Int, usize) + DUNITS (Entry, usize) ;
lusize = (size_t) dunits ;
ok = !INT_OVERFLOW (dunits) ;
LU = ok ? KLU_malloc (lusize, sizeof (Unit), Common) : NULL ;
if (LU == NULL)
{
/* out of memory, or problem too large */
Common->status = KLU_OUT_OF_MEMORY ;
lusize = 0 ;
return (lusize) ;
}
/* ---------------------------------------------------------------------- */
/* factorize */
/* ---------------------------------------------------------------------- */
/* with pruning, and non-recursive depth-first-search */
lusize = KLU_kernel (n, Ap, Ai, Ax, Q, lusize,
Pinv, P, &LU, Udiag, Llen, Ulen, Lip, Uip, lnz, unz,
X, Stack, Flag, Ap_pos, Lpend,
k1, PSinv, Rs, Offp, Offi, Offx, Common) ;
/* ---------------------------------------------------------------------- */
/* return LU factors, or return nothing if an error occurred */
/* ---------------------------------------------------------------------- */
if (Common->status < KLU_OK)
{
LU = KLU_free (LU, lusize, sizeof (Unit), Common) ;
lusize = 0 ;
}
*p_LU = LU ;
PRINTF ((" in klu noffdiag %d\n", Common->noffdiag)) ;
return (lusize) ;
}
/* ========================================================================== */
/* === KLU_lsolve =========================================================== */
/* ========================================================================== */
/* Solve Lx=b. Assumes L is unit lower triangular, where the unit diagonal
* entry is NOT stored. Overwrites B with the solution X. B is n-by-nrhs
* and is stored in ROW form with row dimension nrhs. nrhs must be in the
* range 1 to 4. */
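/* For example (illustrative only), with nrhs = 2 the two right-hand sides for
* row k are stored contiguously as X [2*k] and X [2*k+1]. The same interleaved
* layout is used by KLU_usolve, KLU_ltsolve, and KLU_utsolve below. */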
void KLU_lsolve
(
/* inputs, not modified: */
Int n,
Int Lip [ ],
Int Llen [ ],
Unit LU [ ],
Int nrhs,
/* right-hand-side on input, solution to Lx=b on output */
Entry X [ ]
)
{
Entry x [4], lik ;
Int *Li ;
Entry *Lx ;
Int k, p, len, i ;
switch (nrhs)
{
case 1:
for (k = 0 ; k < n ; k++)
{
x [0] = X [k] ;
GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
/* unit diagonal of L is not stored*/
for (p = 0 ; p < len ; p++)
{
/* X [Li [p]] -= Lx [p] * x [0] ; */
MULT_SUB (X [Li [p]], Lx [p], x [0]) ;
}
}
break ;
case 2:
for (k = 0 ; k < n ; k++)
{
x [0] = X [2*k ] ;
x [1] = X [2*k + 1] ;
GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
for (p = 0 ; p < len ; p++)
{
i = Li [p] ;
lik = Lx [p] ;
MULT_SUB (X [2*i], lik, x [0]) ;
MULT_SUB (X [2*i + 1], lik, x [1]) ;
}
}
break ;
case 3:
for (k = 0 ; k < n ; k++)
{
x [0] = X [3*k ] ;
x [1] = X [3*k + 1] ;
x [2] = X [3*k + 2] ;
GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
for (p = 0 ; p < len ; p++)
{
i = Li [p] ;
lik = Lx [p] ;
MULT_SUB (X [3*i], lik, x [0]) ;
MULT_SUB (X [3*i + 1], lik, x [1]) ;
MULT_SUB (X [3*i + 2], lik, x [2]) ;
}
}
break ;
case 4:
for (k = 0 ; k < n ; k++)
{
x [0] = X [4*k ] ;
x [1] = X [4*k + 1] ;
x [2] = X [4*k + 2] ;
x [3] = X [4*k + 3] ;
GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
for (p = 0 ; p < len ; p++)
{
i = Li [p] ;
lik = Lx [p] ;
MULT_SUB (X [4*i], lik, x [0]) ;
MULT_SUB (X [4*i + 1], lik, x [1]) ;
MULT_SUB (X [4*i + 2], lik, x [2]) ;
MULT_SUB (X [4*i + 3], lik, x [3]) ;
}
}
break ;
}
}
/* ========================================================================== */
/* === KLU_usolve =========================================================== */
/* ========================================================================== */
/* Solve Ux=b. Assumes U is non-unit upper triangular, where the diagonal
* entry is NOT stored. Overwrites B with the solution X. B is n-by-nrhs
* and is stored in ROW form with row dimension nrhs. nrhs must be in the
* range 1 to 4. */
void KLU_usolve
(
/* inputs, not modified: */
Int n,
Int Uip [ ],
Int Ulen [ ],
Unit LU [ ],
Entry Udiag [ ],
Int nrhs,
/* right-hand-side on input, solution to Ux=b on output */
Entry X [ ]
)
{
Entry x [4], uik, ukk ;
Int *Ui ;
Entry *Ux ;
Int k, p, len, i ;
switch (nrhs)
{
case 1:
for (k = n-1 ; k >= 0 ; k--)
{
GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
/* x [0] = X [k] / Udiag [k] ; */
DIV (x [0], X [k], Udiag [k]) ;
X [k] = x [0] ;
for (p = 0 ; p < len ; p++)
{
/* X [Ui [p]] -= Ux [p] * x [0] ; */
MULT_SUB (X [Ui [p]], Ux [p], x [0]) ;
}
}
break ;
case 2:
for (k = n-1 ; k >= 0 ; k--)
{
GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
ukk = Udiag [k] ;
/* x [0] = X [2*k ] / ukk ;
x [1] = X [2*k + 1] / ukk ; */
DIV (x [0], X [2*k], ukk) ;
DIV (x [1], X [2*k + 1], ukk) ;
X [2*k ] = x [0] ;
X [2*k + 1] = x [1] ;
for (p = 0 ; p < len ; p++)
{
i = Ui [p] ;
uik = Ux [p] ;
/* X [2*i ] -= uik * x [0] ;
X [2*i + 1] -= uik * x [1] ; */
MULT_SUB (X [2*i], uik, x [0]) ;
MULT_SUB (X [2*i + 1], uik, x [1]) ;
}
}
break ;
case 3:
for (k = n-1 ; k >= 0 ; k--)
{
GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
ukk = Udiag [k] ;
DIV (x [0], X [3*k], ukk) ;
DIV (x [1], X [3*k + 1], ukk) ;
DIV (x [2], X [3*k + 2], ukk) ;
X [3*k ] = x [0] ;
X [3*k + 1] = x [1] ;
X [3*k + 2] = x [2] ;
for (p = 0 ; p < len ; p++)
{
i = Ui [p] ;
uik = Ux [p] ;
MULT_SUB (X [3*i], uik, x [0]) ;
MULT_SUB (X [3*i + 1], uik, x [1]) ;
MULT_SUB (X [3*i + 2], uik, x [2]) ;
}
}
break ;
case 4:
for (k = n-1 ; k >= 0 ; k--)
{
GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
ukk = Udiag [k] ;
DIV (x [0], X [4*k], ukk) ;
DIV (x [1], X [4*k + 1], ukk) ;
DIV (x [2], X [4*k + 2], ukk) ;
DIV (x [3], X [4*k + 3], ukk) ;
X [4*k ] = x [0] ;
X [4*k + 1] = x [1] ;
X [4*k + 2] = x [2] ;
X [4*k + 3] = x [3] ;
for (p = 0 ; p < len ; p++)
{
i = Ui [p] ;
uik = Ux [p] ;
MULT_SUB (X [4*i], uik, x [0]) ;
MULT_SUB (X [4*i + 1], uik, x [1]) ;
MULT_SUB (X [4*i + 2], uik, x [2]) ;
MULT_SUB (X [4*i + 3], uik, x [3]) ;
}
}
break ;
}
}
/* ========================================================================== */
/* === KLU_ltsolve ========================================================== */
/* ========================================================================== */
/* Solve L'x=b. Assumes L is unit lower triangular, where the unit diagonal
* entry is NOT stored. Overwrites B with the solution X. B is n-by-nrhs
* and is stored in ROW form with row dimension nrhs. nrhs must be in the
* range 1 to 4. */
void KLU_ltsolve
(
/* inputs, not modified: */
Int n,
Int Lip [ ],
Int Llen [ ],
Unit LU [ ],
Int nrhs,
#ifdef COMPLEX
Int conj_solve,
#endif
/* right-hand-side on input, solution to L'x=b on output */
Entry X [ ]
)
{
Entry x [4], lik ;
Int *Li ;
Entry *Lx ;
Int k, p, len, i ;
switch (nrhs)
{
case 1:
for (k = n-1 ; k >= 0 ; k--)
{
GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
x [0] = X [k] ;
for (p = 0 ; p < len ; p++)
{
#ifdef COMPLEX
if (conj_solve)
{
/* x [0] -= CONJ (Lx [p]) * X [Li [p]] ; */
MULT_SUB_CONJ (x [0], X [Li [p]], Lx [p]) ;
}
else
#endif
{
/*x [0] -= Lx [p] * X [Li [p]] ;*/
MULT_SUB (x [0], Lx [p], X [Li [p]]) ;
}
}
X [k] = x [0] ;
}
break ;
case 2:
for (k = n-1 ; k >= 0 ; k--)
{
x [0] = X [2*k ] ;
x [1] = X [2*k + 1] ;
GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
for (p = 0 ; p < len ; p++)
{
i = Li [p] ;
#ifdef COMPLEX
if (conj_solve)
{
CONJ (lik, Lx [p]) ;
}
else
#endif
{
lik = Lx [p] ;
}
MULT_SUB (x [0], lik, X [2*i]) ;
MULT_SUB (x [1], lik, X [2*i + 1]) ;
}
X [2*k ] = x [0] ;
X [2*k + 1] = x [1] ;
}
break ;
case 3:
for (k = n-1 ; k >= 0 ; k--)
{
x [0] = X [3*k ] ;
x [1] = X [3*k + 1] ;
x [2] = X [3*k + 2] ;
GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
for (p = 0 ; p < len ; p++)
{
i = Li [p] ;
#ifdef COMPLEX
if (conj_solve)
{
CONJ (lik, Lx [p]) ;
}
else
#endif
{
lik = Lx [p] ;
}
MULT_SUB (x [0], lik, X [3*i]) ;
MULT_SUB (x [1], lik, X [3*i + 1]) ;
MULT_SUB (x [2], lik, X [3*i + 2]) ;
}
X [3*k ] = x [0] ;
X [3*k + 1] = x [1] ;
X [3*k + 2] = x [2] ;
}
break ;
case 4:
for (k = n-1 ; k >= 0 ; k--)
{
x [0] = X [4*k ] ;
x [1] = X [4*k + 1] ;
x [2] = X [4*k + 2] ;
x [3] = X [4*k + 3] ;
GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
for (p = 0 ; p < len ; p++)
{
i = Li [p] ;
#ifdef COMPLEX
if (conj_solve)
{
CONJ (lik, Lx [p]) ;
}
else
#endif
{
lik = Lx [p] ;
}
MULT_SUB (x [0], lik, X [4*i]) ;
MULT_SUB (x [1], lik, X [4*i + 1]) ;
MULT_SUB (x [2], lik, X [4*i + 2]) ;
MULT_SUB (x [3], lik, X [4*i + 3]) ;
}
X [4*k ] = x [0] ;
X [4*k + 1] = x [1] ;
X [4*k + 2] = x [2] ;
X [4*k + 3] = x [3] ;
}
break ;
}
}
/* ========================================================================== */
/* === KLU_utsolve ========================================================== */
/* ========================================================================== */
/* Solve U'x=b. Assumes U is non-unit upper triangular, where the diagonal
* entry is NOT stored in the column (it is held separately in Udiag). Overwrites B
* with the solution X. B is n-by-nrhs and is stored in ROW form with row
* dimension nrhs. nrhs must be in the range 1 to 4. */
void KLU_utsolve
(
/* inputs, not modified: */
Int n,
Int Uip [ ],
Int Ulen [ ],
Unit LU [ ],
Entry Udiag [ ],
Int nrhs,
#ifdef COMPLEX
Int conj_solve,
#endif
/* right-hand-side on input, solution to Ux=b on output */
Entry X [ ]
)
{
Entry x [4], uik, ukk ;
Int k, p, len, i ;
Int *Ui ;
Entry *Ux ;
switch (nrhs)
{
case 1:
for (k = 0 ; k < n ; k++)
{
GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
x [0] = X [k] ;
for (p = 0 ; p < len ; p++)
{
#ifdef COMPLEX
if (conj_solve)
{
/* x [0] -= CONJ (Ux [p]) * X [Ui [p]] ; */
MULT_SUB_CONJ (x [0], X [Ui [p]], Ux [p]) ;
}
else
#endif
{
/* x [0] -= Ux [p] * X [Ui [p]] ; */
MULT_SUB (x [0], Ux [p], X [Ui [p]]) ;
}
}
#ifdef COMPLEX
if (conj_solve)
{
CONJ (ukk, Udiag [k]) ;
}
else
#endif
{
ukk = Udiag [k] ;
}
DIV (X [k], x [0], ukk) ;
}
break ;
case 2:
for (k = 0 ; k < n ; k++)
{
GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
x [0] = X [2*k ] ;
x [1] = X [2*k + 1] ;
for (p = 0 ; p < len ; p++)
{
i = Ui [p] ;
#ifdef COMPLEX
if (conj_solve)
{
CONJ (uik, Ux [p]) ;
}
else
#endif
{
uik = Ux [p] ;
}
MULT_SUB (x [0], uik, X [2*i]) ;
MULT_SUB (x [1], uik, X [2*i + 1]) ;
}
#ifdef COMPLEX
if (conj_solve)
{
CONJ (ukk, Udiag [k]) ;
}
else
#endif
{
ukk = Udiag [k] ;
}
DIV (X [2*k], x [0], ukk) ;
DIV (X [2*k + 1], x [1], ukk) ;
}
break ;
case 3:
for (k = 0 ; k < n ; k++)
{
GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
x [0] = X [3*k ] ;
x [1] = X [3*k + 1] ;
x [2] = X [3*k + 2] ;
for (p = 0 ; p < len ; p++)
{
i = Ui [p] ;
#ifdef COMPLEX
if (conj_solve)
{
CONJ (uik, Ux [p]) ;
}
else
#endif
{
uik = Ux [p] ;
}
MULT_SUB (x [0], uik, X [3*i]) ;
MULT_SUB (x [1], uik, X [3*i + 1]) ;
MULT_SUB (x [2], uik, X [3*i + 2]) ;
}
#ifdef COMPLEX
if (conj_solve)
{
CONJ (ukk, Udiag [k]) ;
}
else
#endif
{
ukk = Udiag [k] ;
}
DIV (X [3*k], x [0], ukk) ;
DIV (X [3*k + 1], x [1], ukk) ;
DIV (X [3*k + 2], x [2], ukk) ;
}
break ;
case 4:
for (k = 0 ; k < n ; k++)
{
GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
x [0] = X [4*k ] ;
x [1] = X [4*k + 1] ;
x [2] = X [4*k + 2] ;
x [3] = X [4*k + 3] ;
for (p = 0 ; p < len ; p++)
{
i = Ui [p] ;
#ifdef COMPLEX
if (conj_solve)
{
CONJ (uik, Ux [p]) ;
}
else
#endif
{
uik = Ux [p] ;
}
MULT_SUB (x [0], uik, X [4*i]) ;
MULT_SUB (x [1], uik, X [4*i + 1]) ;
MULT_SUB (x [2], uik, X [4*i + 2]) ;
MULT_SUB (x [3], uik, X [4*i + 3]) ;
}
#ifdef COMPLEX
if (conj_solve)
{
CONJ (ukk, Udiag [k]) ;
}
else
#endif
{
ukk = Udiag [k] ;
}
DIV (X [4*k], x [0], ukk) ;
DIV (X [4*k + 1], x [1], ukk) ;
DIV (X [4*k + 2], x [2], ukk) ;
DIV (X [4*k + 3], x [3], ukk) ;
}
break ;
}
}

831
src/maths/KLU/klu.h Normal file
View File

@ -0,0 +1,831 @@
/* ========================================================================== */
/* === klu include file ===================================================== */
/* ========================================================================== */
/* Include file for user programs that call klu_* routines */
#ifndef _KLU_H
#define _KLU_H
/* make it easy for C++ programs to include KLU */
#ifdef __cplusplus
extern "C" {
#endif
#include "amd.h"
#include "colamd.h"
#include "btf.h"
/* -------------------------------------------------------------------------- */
/* Symbolic object - contains the pre-ordering computed by klu_analyze */
/* -------------------------------------------------------------------------- */
typedef struct
{
/* A (P,Q) is in upper block triangular form. The kth block goes from
* row/col index R [k] to R [k+1]-1. The estimated number of nonzeros
* in the L factor of the kth block is Lnz [k].
*/
/* only computed if the AMD ordering is chosen: */
double symmetry ; /* symmetry of largest block */
double est_flops ; /* est. factorization flop count */
double lnz, unz ; /* estimated nz in L and U, including diagonals */
double *Lnz ; /* size n, but only Lnz [0..nblocks-1] is used */
/* computed for all orderings: */
int
n, /* input matrix A is n-by-n */
nz, /* # entries in input matrix */
*P, /* size n */
*Q, /* size n */
*R, /* size n+1, but only R [0..nblocks] is used */
nzoff, /* nz in off-diagonal blocks */
nblocks, /* number of blocks */
maxblock, /* size of largest block */
ordering, /* ordering used (AMD, COLAMD, or GIVEN) */
do_btf ; /* whether or not BTF preordering was requested */
/* only computed if BTF preordering requested */
int structural_rank ; /* 0 to n-1 if the matrix is structurally rank
* deficient. -1 if not computed. n if the matrix has
* full structural rank */
} klu_symbolic ;
typedef struct /* 64-bit version (otherwise same as above) */
{
double symmetry, est_flops, lnz, unz ;
double *Lnz ;
UF_long n, nz, *P, *Q, *R, nzoff, nblocks, maxblock, ordering, do_btf,
structural_rank ;
} klu_l_symbolic ;
/* -------------------------------------------------------------------------- */
/* Numeric object - contains the factors computed by klu_factor */
/* -------------------------------------------------------------------------- */
typedef struct
{
/* LU factors of each block, the pivot row permutation, and the
* entries in the off-diagonal blocks */
int n ; /* A is n-by-n */
int nblocks ; /* number of diagonal blocks */
int lnz ; /* actual nz in L, including diagonal */
int unz ; /* actual nz in U, including diagonal */
int max_lnz_block ; /* max actual nz in L in any one block, incl. diag */
int max_unz_block ; /* max actual nz in U in any one block, incl. diag */
int *Pnum ; /* size n. final pivot permutation */
int *Pinv ; /* size n. inverse of final pivot permutation */
/* LU factors of each block */
int *Lip ; /* size n. pointers into LUbx[block] for L */
int *Uip ; /* size n. pointers into LUbx[block] for U */
int *Llen ; /* size n. Llen [k] = # of entries in kth column of L */
int *Ulen ; /* size n. Ulen [k] = # of entries in kth column of U */
void **LUbx ; /* L and U indices and entries (excl. diagonal of U) */
size_t *LUsize ; /* size of each LUbx [block], in sizeof (Unit) */
void *Udiag ; /* diagonal of U */
/* scale factors; can be NULL if no scaling */
double *Rs ; /* size n. Rs [i] is scale factor for row i */
/* permanent workspace for factorization and solve */
size_t worksize ; /* size (in bytes) of Work */
void *Work ; /* workspace */
void *Xwork ; /* alias into Numeric->Work */
int *Iwork ; /* alias into Numeric->Work */
/* off-diagonal entries in a conventional compressed-column sparse matrix */
int *Offp ; /* size n+1, column pointers */
int *Offi ; /* size nzoff, row indices */
void *Offx ; /* size nzoff, numerical values */
int nzoff ;
} klu_numeric ;
typedef struct /* 64-bit version (otherwise same as above) */
{
UF_long n, nblocks, lnz, unz, max_lnz_block, max_unz_block, *Pnum, *Pinv,
*Lip, *Uip, *Llen, *Ulen ;
void **LUbx ;
size_t *LUsize ;
void *Udiag ;
double *Rs ;
size_t worksize ;
void *Work, *Xwork ;
UF_long *Iwork ;
UF_long *Offp, *Offi ;
void *Offx ;
UF_long nzoff ;
} klu_l_numeric ;
/* -------------------------------------------------------------------------- */
/* KLU control parameters and statistics */
/* -------------------------------------------------------------------------- */
/* Common->status values */
#define KLU_OK 0
#define KLU_SINGULAR (1) /* status > 0 is a warning, not an error */
#define KLU_OUT_OF_MEMORY (-2)
#define KLU_INVALID (-3)
#define KLU_TOO_LARGE (-4) /* integer overflow has occurred */
typedef struct klu_common_struct
{
/* ---------------------------------------------------------------------- */
/* parameters */
/* ---------------------------------------------------------------------- */
double tol ; /* pivot tolerance for diagonal preference */
double memgrow ; /* realloc memory growth size for LU factors */
double initmem_amd ; /* init. memory size with AMD: c*nnz(L) + n */
double initmem ; /* init. memory size: c*nnz(A) + n */
double maxwork ; /* maxwork for BTF, <= 0 if no limit */
int btf ; /* use BTF pre-ordering, or not */
int ordering ; /* 0: AMD, 1: COLAMD, 2: user P and Q,
* 3: user function */
int scale ; /* row scaling: -1: none (and no error check),
* 0: none, 1: sum, 2: max */
/* memory management routines */
void *(*malloc_memory) (size_t) ; /* pointer to malloc */
void *(*realloc_memory) (void *, size_t) ; /* pointer to realloc */
void (*free_memory) (void *) ; /* pointer to free */
void *(*calloc_memory) (size_t, size_t) ; /* pointer to calloc */
/* pointer to user ordering function */
int (*user_order) (int, int *, int *, int *, struct klu_common_struct *) ;
/* pointer to user data, passed unchanged as the last parameter to the
* user ordering function (optional, the user function need not use this
* information). */
void *user_data ;
int halt_if_singular ; /* how to handle a singular matrix:
* FALSE: keep going. Return a Numeric object with a zero U(k,k). A
* divide-by-zero may occur when computing L(:,k). The Numeric object
* can be passed to klu_solve (a divide-by-zero will occur). It can
* also be safely passed to klu_refactor.
* TRUE: stop quickly. klu_factor will free the partially-constructed
* Numeric object. klu_refactor will not free it, but will leave the
* numerical values only partially defined. This is the default. */
/* ---------------------------------------------------------------------- */
/* statistics */
/* ---------------------------------------------------------------------- */
int status ; /* KLU_OK if OK, < 0 if error */
int nrealloc ; /* # of reallocations of L and U */
int structural_rank ; /* 0 to n-1 if the matrix is structurally rank
* deficient (as determined by maxtrans). -1 if not computed. n if the
* matrix has full structural rank. This is computed by klu_analyze
* if a BTF preordering is requested. */
int numerical_rank ; /* First k for which a zero U(k,k) was found,
* if the matrix was singular (in the range 0 to n-1). n if the matrix
* has full rank. This is not a true rank-estimation. It just reports
* where the first zero pivot was found. -1 if not computed.
* Computed by klu_factor and klu_refactor. */
int singular_col ; /* n if the matrix is not singular. If in the
* range 0 to n-1, this is the column index of the original matrix A that
* corresponds to the column of U that contains a zero diagonal entry.
* -1 if not computed. Computed by klu_factor and klu_refactor. */
int noffdiag ; /* # of off-diagonal pivots, -1 if not computed */
double flops ; /* actual factorization flop count, from klu_flops */
double rcond ; /* crude reciprocal condition est., from klu_rcond */
double condest ; /* accurate condition est., from klu_condest */
double rgrowth ; /* reciprocal pivot rgrowth, from klu_rgrowth */
double work ; /* actual work done in BTF, in klu_analyze */
size_t memusage ; /* current memory usage, in bytes */
size_t mempeak ; /* peak memory usage, in bytes */
} klu_common ;
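/* Illustrative sketch (field values are placeholders, not recommendations):
* after klu_defaults (&Common), a caller may override individual fields before
* calling klu_analyze / klu_factor, for example Common.ordering = 1 (COLAMD
* instead of AMD), Common.scale = 2 (max row scaling), or Common.btf = 0
* (disable the BTF pre-ordering). */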
typedef struct klu_l_common_struct /* 64-bit version (otherwise same as above)*/
{
double tol, memgrow, initmem_amd, initmem, maxwork ;
UF_long btf, ordering, scale ;
void *(*malloc_memory) (size_t) ;
void *(*realloc_memory) (void *, size_t) ;
void (*free_memory) (void *) ;
void *(*calloc_memory) (size_t, size_t) ;
UF_long (*user_order) (UF_long, UF_long *, UF_long *, UF_long *,
struct klu_l_common_struct *) ;
void *user_data ;
UF_long halt_if_singular ;
UF_long status, nrealloc, structural_rank, numerical_rank, singular_col,
noffdiag ;
double flops, rcond, condest, rgrowth, work ;
size_t memusage, mempeak ;
} klu_l_common ;
/* -------------------------------------------------------------------------- */
/* klu_defaults: sets default control parameters */
/* -------------------------------------------------------------------------- */
int klu_defaults
(
klu_common *Common
) ;
UF_long klu_l_defaults (klu_l_common *Common) ;
/* -------------------------------------------------------------------------- */
/* klu_analyze: orders and analyzes a matrix */
/* -------------------------------------------------------------------------- */
/* Order the matrix with BTF (or not), then order each block with AMD, COLAMD,
* a natural ordering, or with a user-provided ordering function */
klu_symbolic *klu_analyze
(
/* inputs, not modified */
int n, /* A is n-by-n */
int Ap [ ], /* size n+1, column pointers */
int Ai [ ], /* size nz, row indices */
klu_common *Common
) ;
klu_l_symbolic *klu_l_analyze (UF_long, UF_long *, UF_long *,
klu_l_common *Common) ;
/* -------------------------------------------------------------------------- */
/* klu_analyze_given: analyzes a matrix using given P and Q */
/* -------------------------------------------------------------------------- */
/* Order the matrix with BTF (or not), then use natural or given ordering
* P and Q on the blocks. P and Q are interpreted as identity
* if NULL. */
klu_symbolic *klu_analyze_given
(
/* inputs, not modified */
int n, /* A is n-by-n */
int Ap [ ], /* size n+1, column pointers */
int Ai [ ], /* size nz, row indices */
int P [ ], /* size n, user's row permutation (may be NULL) */
int Q [ ], /* size n, user's column permutation (may be NULL) */
klu_common *Common
) ;
klu_l_symbolic *klu_l_analyze_given (UF_long, UF_long *, UF_long *, UF_long *,
UF_long *, klu_l_common *) ;
/* -------------------------------------------------------------------------- */
/* klu_factor: factors a matrix using the klu_analyze results */
/* -------------------------------------------------------------------------- */
klu_numeric *klu_factor /* returns KLU_OK if OK, < 0 if error */
(
/* inputs, not modified */
int Ap [ ], /* size n+1, column pointers */
int Ai [ ], /* size nz, row indices */
double Ax [ ], /* size nz, numerical values */
klu_symbolic *Symbolic,
klu_common *Common
) ;
klu_numeric *klu_z_factor /* returns KLU_OK if OK, < 0 if error */
(
/* inputs, not modified */
int Ap [ ], /* size n+1, column pointers */
int Ai [ ], /* size nz, row indices */
double Ax [ ], /* size 2*nz, numerical values (real,imag pairs) */
klu_symbolic *Symbolic,
klu_common *Common
) ;
/* long / real version */
klu_l_numeric *klu_l_factor (UF_long *, UF_long *, double *, klu_l_symbolic *,
klu_l_common *) ;
/* long / complex version */
klu_l_numeric *klu_zl_factor (UF_long *, UF_long *, double *, klu_l_symbolic *,
klu_l_common *) ;
/* -------------------------------------------------------------------------- */
/* klu_solve: solves Ax=b using the Symbolic and Numeric objects */
/* -------------------------------------------------------------------------- */
int klu_solve
(
/* inputs, not modified */
klu_symbolic *Symbolic,
klu_numeric *Numeric,
int ldim, /* leading dimension of B */
int nrhs, /* number of right-hand-sides */
/* right-hand-side on input, overwritten with solution to Ax=b on output */
double B [ ], /* size ldim*nrhs */
klu_common *Common
) ;
int klu_z_solve
(
/* inputs, not modified */
klu_symbolic *Symbolic,
klu_numeric *Numeric,
int ldim, /* leading dimension of B */
int nrhs, /* number of right-hand-sides */
/* right-hand-side on input, overwritten with solution to Ax=b on output */
double B [ ], /* size 2*ldim*nrhs */
klu_common *Common
) ;
UF_long klu_l_solve (klu_l_symbolic *, klu_l_numeric *, UF_long, UF_long,
double *, klu_l_common *) ;
UF_long klu_zl_solve (klu_l_symbolic *, klu_l_numeric *, UF_long, UF_long,
double *, klu_l_common *) ;
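/* A typical calling sequence for the int/real version (illustrative sketch;
* n, Ap, Ai, Ax, and b are placeholders for the caller's n-by-n matrix in
* compressed-column form and its right-hand side, overwritten with the
* solution). Error handling of the returned pointers and of Common.status is
* omitted for brevity:
*
* klu_common Common ;
* klu_symbolic *Symbolic ;
* klu_numeric *Numeric ;
* klu_defaults (&Common) ;
* Symbolic = klu_analyze (n, Ap, Ai, &Common) ;
* Numeric = klu_factor (Ap, Ai, Ax, Symbolic, &Common) ;
* klu_solve (Symbolic, Numeric, n, 1, b, &Common) ;
* klu_free_numeric (&Numeric, &Common) ;
* klu_free_symbolic (&Symbolic, &Common) ;
*/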
/* -------------------------------------------------------------------------- */
/* klu_tsolve: solves A'x=b using the Symbolic and Numeric objects */
/* -------------------------------------------------------------------------- */
int klu_tsolve
(
/* inputs, not modified */
klu_symbolic *Symbolic,
klu_numeric *Numeric,
int ldim, /* leading dimension of B */
int nrhs, /* number of right-hand-sides */
/* right-hand-side on input, overwritten with solution to Ax=b on output */
double B [ ], /* size ldim*nrhs */
klu_common *Common
) ;
int klu_z_tsolve
(
/* inputs, not modified */
klu_symbolic *Symbolic,
klu_numeric *Numeric,
int ldim, /* leading dimension of B */
int nrhs, /* number of right-hand-sides */
/* right-hand-side on input, overwritten with solution to Ax=b on output */
double B [ ], /* size 2*ldim*nrhs */
int conj_solve, /* TRUE: conjugate solve, FALSE: solve A.'x=b */
klu_common *Common
) ;
UF_long klu_l_tsolve (klu_l_symbolic *, klu_l_numeric *, UF_long, UF_long,
double *, klu_l_common *) ;
UF_long klu_zl_tsolve (klu_l_symbolic *, klu_l_numeric *, UF_long, UF_long,
double *, UF_long, klu_l_common * ) ;
/* -------------------------------------------------------------------------- */
/* klu_refactor: refactorizes matrix with same ordering as klu_factor */
/* -------------------------------------------------------------------------- */
int klu_refactor /* return TRUE if successful, FALSE otherwise */
(
/* inputs, not modified */
int Ap [ ], /* size n+1, column pointers */
int Ai [ ], /* size nz, row indices */
double Ax [ ], /* size nz, numerical values */
klu_symbolic *Symbolic,
/* input, and numerical values modified on output */
klu_numeric *Numeric,
klu_common *Common
) ;
int klu_z_refactor /* return TRUE if successful, FALSE otherwise */
(
/* inputs, not modified */
int Ap [ ], /* size n+1, column pointers */
int Ai [ ], /* size nz, row indices */
double Ax [ ], /* size 2*nz, numerical values */
klu_symbolic *Symbolic,
/* input, and numerical values modified on output */
klu_numeric *Numeric,
klu_common *Common
) ;
UF_long klu_l_refactor (UF_long *, UF_long *, double *, klu_l_symbolic *,
klu_l_numeric *, klu_l_common *) ;
UF_long klu_zl_refactor (UF_long *, UF_long *, double *, klu_l_symbolic *,
klu_l_numeric *, klu_l_common *) ;
/* -------------------------------------------------------------------------- */
/* klu_free_symbolic: destroys the Symbolic object */
/* -------------------------------------------------------------------------- */
int klu_free_symbolic
(
klu_symbolic **Symbolic,
klu_common *Common
) ;
UF_long klu_l_free_symbolic (klu_l_symbolic **, klu_l_common *) ;
/* -------------------------------------------------------------------------- */
/* klu_free_numeric: destroys the Numeric object */
/* -------------------------------------------------------------------------- */
/* Note that klu_free_numeric and klu_z_free_numeric are identical; each can
* free both kinds of Numeric objects (real and complex) */
int klu_free_numeric
(
klu_numeric **Numeric,
klu_common *Common
) ;
int klu_z_free_numeric
(
klu_numeric **Numeric,
klu_common *Common
) ;
UF_long klu_l_free_numeric (klu_l_numeric **, klu_l_common *) ;
UF_long klu_zl_free_numeric (klu_l_numeric **, klu_l_common *) ;
/* -------------------------------------------------------------------------- */
/* klu_sort: sorts the columns of the LU factorization */
/* -------------------------------------------------------------------------- */
/* this is not needed except for the MATLAB interface */
int klu_sort
(
/* inputs, not modified */
klu_symbolic *Symbolic,
/* input/output */
klu_numeric *Numeric,
klu_common *Common
) ;
int klu_z_sort
(
/* inputs, not modified */
klu_symbolic *Symbolic,
/* input/output */
klu_numeric *Numeric,
klu_common *Common
) ;
UF_long klu_l_sort (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
UF_long klu_zl_sort (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
/* -------------------------------------------------------------------------- */
/* klu_flops: determines # of flops performed in numeric factorization */
/* -------------------------------------------------------------------------- */
int klu_flops
(
/* inputs, not modified */
klu_symbolic *Symbolic,
klu_numeric *Numeric,
/* input/output */
klu_common *Common
) ;
int klu_z_flops
(
/* inputs, not modified */
klu_symbolic *Symbolic,
klu_numeric *Numeric,
/* input/output */
klu_common *Common
) ;
UF_long klu_l_flops (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
UF_long klu_zl_flops (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
/* -------------------------------------------------------------------------- */
/* klu_rgrowth : compute the reciprocal pivot growth */
/* -------------------------------------------------------------------------- */
/* Pivot growth is computed after the input matrix is permuted, scaled, and
* off-diagonal entries pruned. This is because the LU factorization of each
* block takes as input the scaled diagonal blocks of the BTF form. The
* reciprocal pivot growth in column j of an LU factorization of a matrix C
* is the largest entry in C divided by the largest entry in U; then the overall
* reciprocal pivot growth is the smallest such value for all columns j. Note
* that the off-diagonal entries are not scaled, since they do not take part in
* the LU factorization of the diagonal blocks.
*
* In MATLAB notation:
*
* rgrowth = min (max (abs ((R \ A(p,q)) - F)) ./ max (abs (U))) */
int klu_rgrowth
(
int Ap [ ],
int Ai [ ],
double Ax [ ],
klu_symbolic *Symbolic,
klu_numeric *Numeric,
klu_common *Common /* Common->rgrowth = reciprocal pivot growth */
) ;
int klu_z_rgrowth
(
int Ap [ ],
int Ai [ ],
double Ax [ ],
klu_symbolic *Symbolic,
klu_numeric *Numeric,
klu_common *Common /* Common->rgrowth = reciprocal pivot growth */
) ;
UF_long klu_l_rgrowth (UF_long *, UF_long *, double *, klu_l_symbolic *,
klu_l_numeric *, klu_l_common *) ;
UF_long klu_zl_rgrowth (UF_long *, UF_long *, double *, klu_l_symbolic *,
klu_l_numeric *, klu_l_common *) ;
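/* An illustrative call sequence (a sketch only, not part of this interface;
 * it assumes Ap, Ai, Ax, and n already describe a compressed-column matrix):
 *
 *      klu_common Common ;
 *      klu_symbolic *Symbolic ;
 *      klu_numeric *Numeric ;
 *      klu_defaults (&Common) ;
 *      Symbolic = klu_analyze (n, Ap, Ai, &Common) ;
 *      Numeric = klu_factor (Ap, Ai, Ax, Symbolic, &Common) ;
 *      if (klu_rgrowth (Ap, Ai, Ax, Symbolic, Numeric, &Common))
 *      {
 *          printf ("reciprocal pivot growth: %g\n", Common.rgrowth) ;
 *      }
 */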
/* -------------------------------------------------------------------------- */
/* klu_condest */
/* -------------------------------------------------------------------------- */
/* Computes a reasonably accurate estimate of the 1-norm condition number, using
* Hager's method, as modified by Higham and Tisseur (same method as used in
* MATLAB's condest). */
int klu_condest
(
int Ap [ ], /* size n+1, column pointers, not modified */
double Ax [ ], /* size nz = Ap[n], numerical values, not modified*/
klu_symbolic *Symbolic, /* symbolic analysis, not modified */
klu_numeric *Numeric, /* numeric factorization, not modified */
klu_common *Common /* result returned in Common->condest */
) ;
int klu_z_condest
(
int Ap [ ],
double Ax [ ], /* size 2*nz */
klu_symbolic *Symbolic,
klu_numeric *Numeric,
klu_common *Common /* result returned in Common->condest */
) ;
UF_long klu_l_condest (UF_long *, double *, klu_l_symbolic *, klu_l_numeric *,
klu_l_common *) ;
UF_long klu_zl_condest (UF_long *, double *, klu_l_symbolic *, klu_l_numeric *,
klu_l_common *) ;
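/* An illustrative sketch (assumes Symbolic and Numeric were produced by
 * klu_analyze and klu_factor for the same Ap and Ax; note Ai is not needed):
 *
 *      if (klu_condest (Ap, Ax, Symbolic, Numeric, &Common))
 *      {
 *          printf ("estimated 1-norm condition number: %g\n", Common.condest) ;
 *      }
 */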
/* -------------------------------------------------------------------------- */
/* klu_rcond: compute min(abs(diag(U))) / max(abs(diag(U))) */
/* -------------------------------------------------------------------------- */
int klu_rcond
(
klu_symbolic *Symbolic, /* input, not modified */
klu_numeric *Numeric, /* input, not modified */
klu_common *Common /* result in Common->rcond */
) ;
int klu_z_rcond
(
klu_symbolic *Symbolic, /* input, not modified */
klu_numeric *Numeric, /* input, not modified */
klu_common *Common /* result in Common->rcond */
) ;
UF_long klu_l_rcond (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
UF_long klu_zl_rcond (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
/* -------------------------------------------------------------------------- */
/* klu_scale */
/* -------------------------------------------------------------------------- */
int klu_scale /* return TRUE if successful, FALSE otherwise */
(
/* inputs, not modified */
int scale, /* <0: none, no error check; 0: none, 1: sum, 2: max */
int n,
int Ap [ ], /* size n+1, column pointers */
int Ai [ ], /* size nz, row indices */
double Ax [ ],
/* outputs, not defined on input */
double Rs [ ],
/* workspace, not defined on input or output */
int W [ ], /* size n, can be NULL */
klu_common *Common
) ;
int klu_z_scale /* return TRUE if successful, FALSE otherwise */
(
/* inputs, not modified */
int scale, /* <0: none, no error check; 0: none, 1: sum, 2: max */
int n,
int Ap [ ], /* size n+1, column pointers */
int Ai [ ], /* size nz, row indices */
double Ax [ ],
/* outputs, not defined on input */
double Rs [ ],
/* workspace, not defined on input or output */
int W [ ], /* size n, can be NULL */
klu_common *Common
) ;
UF_long klu_l_scale (UF_long, UF_long, UF_long *, UF_long *, double *,
double *, UF_long *, klu_l_common *) ;
UF_long klu_zl_scale (UF_long, UF_long, UF_long *, UF_long *, double *,
double *, UF_long *, klu_l_common *) ;
/* -------------------------------------------------------------------------- */
/* klu_extract */
/* -------------------------------------------------------------------------- */
int klu_extract /* returns TRUE if successful, FALSE otherwise */
(
/* inputs: */
klu_numeric *Numeric,
klu_symbolic *Symbolic,
/* outputs, either allocated on input, or ignored otherwise */
/* L */
int *Lp, /* size n+1 */
int *Li, /* size Numeric->lnz */
double *Lx, /* size Numeric->lnz */
/* U */
int *Up, /* size n+1 */
int *Ui, /* size Numeric->unz */
double *Ux, /* size Numeric->unz */
/* F */
int *Fp, /* size n+1 */
int *Fi, /* size Numeric->nzoff */
double *Fx, /* size Numeric->nzoff */
/* P, row permutation */
int *P, /* size n */
/* Q, column permutation */
int *Q, /* size n */
/* Rs, scale factors */
double *Rs, /* size n */
/* R, block boundaries */
int *R, /* size Symbolic->nblocks+1 (nblocks is at most n) */
klu_common *Common
) ;
int klu_z_extract /* returns TRUE if successful, FALSE otherwise */
(
/* inputs: */
klu_numeric *Numeric,
klu_symbolic *Symbolic,
/* outputs, either allocated on input, or NULL to skip that part */
/* L */
int *Lp, /* size n+1 */
int *Li, /* size nnz(L) */
double *Lx, /* size nnz(L) */
double *Lz, /* size nnz(L) for the complex case, ignored if real */
/* U */
int *Up, /* size n+1 */
int *Ui, /* size nnz(U) */
double *Ux, /* size nnz(U) */
double *Uz, /* size nnz(U) for the complex case, ignored if real */
/* F */
int *Fp, /* size n+1 */
int *Fi, /* size nnz(F) */
double *Fx, /* size nnz(F) */
double *Fz, /* size nnz(F) for the complex case, ignored if real */
/* P, row permutation */
int *P, /* size n */
/* Q, column permutation */
int *Q, /* size n */
/* Rs, scale factors */
double *Rs, /* size n */
/* R, block boundaries */
int *R, /* size Symbolic->nblocks+1 (nblocks is at most n) */
klu_common *Common
) ;
UF_long klu_l_extract (klu_l_numeric *, klu_l_symbolic *,
UF_long *, UF_long *, double *,
UF_long *, UF_long *, double *,
UF_long *, UF_long *, double *,
UF_long *, UF_long *, double *, UF_long *, klu_l_common *) ;
UF_long klu_zl_extract (klu_l_numeric *, klu_l_symbolic *,
UF_long *, UF_long *, double *, double *,
UF_long *, UF_long *, double *, double *,
UF_long *, UF_long *, double *, double *,
UF_long *, UF_long *, double *, UF_long *, klu_l_common *) ;
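/* An illustrative sketch for the real/int case (error checks omitted; n is
 * Symbolic->n, and the allocation sizes come from the Numeric object itself;
 * any output pointer may be NULL, in which case that part is not extracted):
 *
 *      int *Lp = malloc ((n+1) * sizeof (int)) ;
 *      int *Li = malloc (Numeric->lnz * sizeof (int)) ;
 *      double *Lx = malloc (Numeric->lnz * sizeof (double)) ;
 *      int *Up = malloc ((n+1) * sizeof (int)) ;
 *      int *Ui = malloc (Numeric->unz * sizeof (int)) ;
 *      double *Ux = malloc (Numeric->unz * sizeof (double)) ;
 *      klu_extract (Numeric, Symbolic, Lp, Li, Lx, Up, Ui, Ux,
 *          NULL, NULL, NULL, NULL, NULL, NULL, NULL, &Common) ;
 */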
/* -------------------------------------------------------------------------- */
/* KLU memory management routines */
/* -------------------------------------------------------------------------- */
void *klu_malloc /* returns pointer to the newly malloc'd block */
(
/* ---- input ---- */
size_t n, /* number of items */
size_t size, /* size of each item */
/* --------------- */
klu_common *Common
) ;
void *klu_free /* always returns NULL */
(
/* ---- in/out --- */
void *p, /* block of memory to free */
size_t n, /* number of items */
size_t size, /* size of each item */
/* --------------- */
klu_common *Common
) ;
void *klu_realloc /* returns pointer to reallocated block */
(
/* ---- input ---- */
size_t nnew, /* requested # of items in reallocated block */
size_t nold, /* current size of block, in # of items */
size_t size, /* size of each item */
/* ---- in/out --- */
void *p, /* block of memory to realloc */
/* --------------- */
klu_common *Common
) ;
void *klu_l_malloc (size_t, size_t, klu_l_common *) ;
void *klu_l_free (void *, size_t, size_t, klu_l_common *) ;
void *klu_l_realloc (size_t, size_t, size_t, void *, klu_l_common *) ;
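/* An illustrative sketch: these wrappers route every request through the
 * malloc/realloc/free function pointers stored in the Common object and keep
 * Common->memusage and Common->mempeak up to date, so user code can share
 * KLU's allocator accounting:
 *
 *      int *Map = klu_malloc (n, sizeof (int), &Common) ;
 *      if (Common.status == KLU_OK)
 *      {
 *          ...use Map...
 *          Map = klu_free (Map, n, sizeof (int), &Common) ;
 *      }
 */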
/* ========================================================================== */
/* === KLU version ========================================================== */
/* ========================================================================== */
/* All versions of KLU include these definitions.
* As an example, to test if the version you are using is 1.2 or later:
*
* if (KLU_VERSION >= KLU_VERSION_CODE (1,2)) ...
*
* This also works at compile time:
*
* #if (KLU_VERSION >= KLU_VERSION_CODE (1,2))
* printf ("This is version 1.2 or later\n") ;
* #else
* printf ("This is an early version\n") ;
* #endif
*/
#define KLU_DATE "Dec 7, 2011"
#define KLU_VERSION_CODE(main,sub) ((main) * 1000 + (sub))
#define KLU_MAIN_VERSION 1
#define KLU_SUB_VERSION 1
#define KLU_SUBSUB_VERSION 3
#define KLU_VERSION KLU_VERSION_CODE(KLU_MAIN_VERSION,KLU_SUB_VERSION)
#ifdef __cplusplus
}
#endif
#endif

488
src/maths/KLU/klu_analyze.c Normal file
View File

@ -0,0 +1,488 @@
/* ========================================================================== */
/* === klu_analyze ========================================================== */
/* ========================================================================== */
/* Order the matrix using BTF (or not), and then AMD, COLAMD, the natural
* ordering, or the user-provided function on the blocks. Does not support
* using a given ordering (use klu_analyze_given for that case). */
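/* A minimal end-to-end sketch of how this analysis phase is typically used
 * (b is a hypothetical right-hand side of size n; error checks omitted; the
 * solution overwrites b):
 *
 *      klu_common Common ;
 *      klu_defaults (&Common) ;
 *      Symbolic = klu_analyze (n, Ap, Ai, &Common) ;
 *      Numeric = klu_factor (Ap, Ai, Ax, Symbolic, &Common) ;
 *      klu_solve (Symbolic, Numeric, n, 1, b, &Common) ;
 *      klu_free_numeric (&Numeric, &Common) ;
 *      klu_free_symbolic (&Symbolic, &Common) ;
 */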
#include "klu_internal.h"
/* ========================================================================== */
/* === analyze_worker ======================================================= */
/* ========================================================================== */
static Int analyze_worker /* returns KLU_OK or < 0 if error */
(
/* inputs, not modified */
Int n, /* A is n-by-n */
Int Ap [ ], /* size n+1, column pointers */
Int Ai [ ], /* size nz, row indices */
Int nblocks, /* # of blocks */
Int Pbtf [ ], /* BTF row permutation */
Int Qbtf [ ], /* BTF col permutation */
Int R [ ], /* size n+1, but only Rbtf [0..nblocks] is used */
Int ordering, /* what ordering to use (0, 1, or 3 for this routine) */
/* output only, not defined on input */
Int P [ ], /* size n */
Int Q [ ], /* size n */
double Lnz [ ], /* size n, but only Lnz [0..nblocks-1] is used */
/* workspace, not defined on input or output */
Int Pblk [ ], /* size maxblock */
Int Cp [ ], /* size maxblock+1 */
Int Ci [ ], /* size MAX (nz+1, Cilen) */
Int Cilen, /* nz+1, or COLAMD_recommended(nz,n,n) for COLAMD */
Int Pinv [ ], /* size maxblock */
/* input/output */
KLU_symbolic *Symbolic,
KLU_common *Common
)
{
double amd_Info [AMD_INFO], lnz, lnz1, flops, flops1 ;
Int k1, k2, nk, k, block, oldcol, pend, newcol, result, pc, p, newrow,
maxnz, nzoff, cstats [COLAMD_STATS], ok, err = KLU_INVALID ;
/* ---------------------------------------------------------------------- */
/* initializations */
/* ---------------------------------------------------------------------- */
/* compute the inverse of Pbtf */
#ifndef NDEBUG
for (k = 0 ; k < n ; k++)
{
P [k] = EMPTY ;
Q [k] = EMPTY ;
Pinv [k] = EMPTY ;
}
#endif
for (k = 0 ; k < n ; k++)
{
ASSERT (Pbtf [k] >= 0 && Pbtf [k] < n) ;
Pinv [Pbtf [k]] = k ;
}
#ifndef NDEBUG
for (k = 0 ; k < n ; k++) ASSERT (Pinv [k] != EMPTY) ;
#endif
nzoff = 0 ;
lnz = 0 ;
maxnz = 0 ;
flops = 0 ;
Symbolic->symmetry = EMPTY ; /* only computed by AMD */
/* ---------------------------------------------------------------------- */
/* order each block */
/* ---------------------------------------------------------------------- */
for (block = 0 ; block < nblocks ; block++)
{
/* ------------------------------------------------------------------ */
/* the block is from rows/columns k1 to k2-1 */
/* ------------------------------------------------------------------ */
k1 = R [block] ;
k2 = R [block+1] ;
nk = k2 - k1 ;
PRINTF (("BLOCK %d, k1 %d k2-1 %d nk %d\n", block, k1, k2-1, nk)) ;
/* ------------------------------------------------------------------ */
/* construct the kth block, C */
/* ------------------------------------------------------------------ */
Lnz [block] = EMPTY ;
pc = 0 ;
for (k = k1 ; k < k2 ; k++)
{
newcol = k-k1 ;
Cp [newcol] = pc ;
oldcol = Qbtf [k] ;
pend = Ap [oldcol+1] ;
for (p = Ap [oldcol] ; p < pend ; p++)
{
newrow = Pinv [Ai [p]] ;
if (newrow < k1)
{
nzoff++ ;
}
else
{
/* (newrow,newcol) is an entry in the block */
ASSERT (newrow < k2) ;
newrow -= k1 ;
Ci [pc++] = newrow ;
}
}
}
Cp [nk] = pc ;
maxnz = MAX (maxnz, pc) ;
ASSERT (KLU_valid (nk, Cp, Ci, NULL)) ;
/* ------------------------------------------------------------------ */
/* order the block C */
/* ------------------------------------------------------------------ */
if (nk <= 3)
{
/* -------------------------------------------------------------- */
/* use natural ordering for tiny blocks (3-by-3 or less) */
/* -------------------------------------------------------------- */
for (k = 0 ; k < nk ; k++)
{
Pblk [k] = k ;
}
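/* worst-case (dense) estimates for this tiny block: nk*(nk+1)/2 entries
 * in L including the diagonal, and the corresponding dense factorization
 * work */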
lnz1 = nk * (nk + 1) / 2 ;
flops1 = nk * (nk - 1) / 2 + (nk-1)*nk*(2*nk-1) / 6 ;
ok = TRUE ;
}
else if (ordering == 0)
{
/* -------------------------------------------------------------- */
/* order the block with AMD (C+C') */
/* -------------------------------------------------------------- */
result = AMD_order (nk, Cp, Ci, Pblk, NULL, amd_Info) ;
ok = (result >= AMD_OK) ;
if (result == AMD_OUT_OF_MEMORY)
{
err = KLU_OUT_OF_MEMORY ;
}
/* account for memory usage in AMD */
Common->mempeak = MAX (Common->mempeak,
Common->memusage + amd_Info [AMD_MEMORY]) ;
/* get the ordering statistics from AMD */
lnz1 = (Int) (amd_Info [AMD_LNZ]) + nk ;
flops1 = 2 * amd_Info [AMD_NMULTSUBS_LU] + amd_Info [AMD_NDIV] ;
if (pc == maxnz)
{
/* get the symmetry of the biggest block */
Symbolic->symmetry = amd_Info [AMD_SYMMETRY] ;
}
}
else if (ordering == 1)
{
/* -------------------------------------------------------------- */
/* order the block with COLAMD (C) */
/* -------------------------------------------------------------- */
/* order (and destroy) Ci, returning column permutation in Cp.
* COLAMD "cannot" fail since the matrix has already been checked,
* and Ci allocated. */
ok = COLAMD (nk, nk, Cilen, Ci, Cp, NULL, cstats) ;
lnz1 = EMPTY ;
flops1 = EMPTY ;
/* copy the permutation from Cp to Pblk */
for (k = 0 ; k < nk ; k++)
{
Pblk [k] = Cp [k] ;
}
}
else
{
/* -------------------------------------------------------------- */
/* pass the block to the user-provided ordering function */
/* -------------------------------------------------------------- */
lnz1 = (Common->user_order) (nk, Cp, Ci, Pblk, Common) ;
flops1 = EMPTY ;
ok = (lnz1 != 0) ;
}
if (!ok)
{
return (err) ; /* ordering method failed */
}
/* ------------------------------------------------------------------ */
/* keep track of nnz(L) and flops statistics */
/* ------------------------------------------------------------------ */
Lnz [block] = lnz1 ;
lnz = (lnz == EMPTY || lnz1 == EMPTY) ? EMPTY : (lnz + lnz1) ;
flops = (flops == EMPTY || flops1 == EMPTY) ? EMPTY : (flops + flops1) ;
/* ------------------------------------------------------------------ */
/* combine the preordering with the BTF ordering */
/* ------------------------------------------------------------------ */
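/* The final P and Q are the composition of the BTF permutation with the
 * per-block fill-reducing permutation: row/column (k + k1) of the final
 * ordering is row/column Pblk [k] of this block in the BTF ordering. */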
PRINTF (("Pblk, 1-based:\n")) ;
for (k = 0 ; k < nk ; k++)
{
ASSERT (k + k1 < n) ;
ASSERT (Pblk [k] + k1 < n) ;
Q [k + k1] = Qbtf [Pblk [k] + k1] ;
}
for (k = 0 ; k < nk ; k++)
{
ASSERT (k + k1 < n) ;
ASSERT (Pblk [k] + k1 < n) ;
P [k + k1] = Pbtf [Pblk [k] + k1] ;
}
}
PRINTF (("nzoff %d Ap[n] %d\n", nzoff, Ap [n])) ;
ASSERT (nzoff >= 0 && nzoff <= Ap [n]) ;
/* return estimates of # of nonzeros in L including diagonal */
Symbolic->lnz = lnz ; /* EMPTY if COLAMD used */
Symbolic->unz = lnz ;
Symbolic->nzoff = nzoff ;
Symbolic->est_flops = flops ; /* EMPTY if COLAMD or user-ordering used */
return (KLU_OK) ;
}
/* ========================================================================== */
/* === order_and_analyze ==================================================== */
/* ========================================================================== */
/* Orders the matrix with or without BTF, then orders each block with AMD, COLAMD,
* or the user ordering function. Does not handle the natural or given
* ordering cases. */
static KLU_symbolic *order_and_analyze /* returns NULL if error, or a valid
KLU_symbolic object if successful */
(
/* inputs, not modified */
Int n, /* A is n-by-n */
Int Ap [ ], /* size n+1, column pointers */
Int Ai [ ], /* size nz, row indices */
/* --------------------- */
KLU_common *Common
)
{
double work ;
KLU_symbolic *Symbolic ;
double *Lnz ;
Int *Qbtf, *Cp, *Ci, *Pinv, *Pblk, *Pbtf, *P, *Q, *R ;
Int nblocks, nz, block, maxblock, k1, k2, nk, do_btf, ordering, k, Cilen,
*Work ;
/* ---------------------------------------------------------------------- */
/* allocate the Symbolic object, and check input matrix */
/* ---------------------------------------------------------------------- */
Symbolic = KLU_alloc_symbolic (n, Ap, Ai, Common) ;
if (Symbolic == NULL)
{
return (NULL) ;
}
P = Symbolic->P ;
Q = Symbolic->Q ;
R = Symbolic->R ;
Lnz = Symbolic->Lnz ;
nz = Symbolic->nz ;
ordering = Common->ordering ;
if (ordering == 1)
{
/* COLAMD */
Cilen = COLAMD_recommended (nz, n, n) ;
}
else if (ordering == 0 || (ordering == 3 && Common->user_order != NULL))
{
/* AMD or user ordering function */
Cilen = nz+1 ;
}
else
{
/* invalid ordering */
Common->status = KLU_INVALID ;
KLU_free_symbolic (&Symbolic, Common) ;
return (NULL) ;
}
/* AMD memory management routines */
amd_malloc = Common->malloc_memory ;
amd_free = Common->free_memory ;
amd_calloc = Common->calloc_memory ;
amd_realloc = Common->realloc_memory ;
/* ---------------------------------------------------------------------- */
/* allocate workspace for BTF permutation */
/* ---------------------------------------------------------------------- */
Pbtf = KLU_malloc (n, sizeof (Int), Common) ;
Qbtf = KLU_malloc (n, sizeof (Int), Common) ;
if (Common->status < KLU_OK)
{
KLU_free (Pbtf, n, sizeof (Int), Common) ;
KLU_free (Qbtf, n, sizeof (Int), Common) ;
KLU_free_symbolic (&Symbolic, Common) ;
return (NULL) ;
}
/* ---------------------------------------------------------------------- */
/* get the common parameters for BTF and ordering method */
/* ---------------------------------------------------------------------- */
do_btf = Common->btf ;
do_btf = (do_btf) ? TRUE : FALSE ;
Symbolic->ordering = ordering ;
Symbolic->do_btf = do_btf ;
Symbolic->structural_rank = EMPTY ;
/* ---------------------------------------------------------------------- */
/* find the block triangular form (if requested) */
/* ---------------------------------------------------------------------- */
Common->work = 0 ;
if (do_btf)
{
Work = KLU_malloc (5*n, sizeof (Int), Common) ;
if (Common->status < KLU_OK)
{
/* out of memory */
KLU_free (Pbtf, n, sizeof (Int), Common) ;
KLU_free (Qbtf, n, sizeof (Int), Common) ;
KLU_free_symbolic (&Symbolic, Common) ;
return (NULL) ;
}
nblocks = BTF_order (n, Ap, Ai, Common->maxwork, &work, Pbtf, Qbtf, R,
&(Symbolic->structural_rank), Work) ;
Common->structural_rank = Symbolic->structural_rank ;
Common->work += work ;
KLU_free (Work, 5*n, sizeof (Int), Common) ;
/* unflip Qbtf if the matrix does not have full structural rank */
if (Symbolic->structural_rank < n)
{
for (k = 0 ; k < n ; k++)
{
Qbtf [k] = BTF_UNFLIP (Qbtf [k]) ;
}
}
/* find the size of the largest block */
maxblock = 1 ;
for (block = 0 ; block < nblocks ; block++)
{
k1 = R [block] ;
k2 = R [block+1] ;
nk = k2 - k1 ;
PRINTF (("block %d size %d\n", block, nk)) ;
maxblock = MAX (maxblock, nk) ;
}
}
else
{
/* BTF not requested */
nblocks = 1 ;
maxblock = n ;
R [0] = 0 ;
R [1] = n ;
for (k = 0 ; k < n ; k++)
{
Pbtf [k] = k ;
Qbtf [k] = k ;
}
}
Symbolic->nblocks = nblocks ;
PRINTF (("maxblock size %d\n", maxblock)) ;
Symbolic->maxblock = maxblock ;
/* ---------------------------------------------------------------------- */
/* allocate more workspace, for analyze_worker */
/* ---------------------------------------------------------------------- */
Pblk = KLU_malloc (maxblock, sizeof (Int), Common) ;
Cp = KLU_malloc (maxblock + 1, sizeof (Int), Common) ;
Ci = KLU_malloc (MAX (Cilen, nz+1), sizeof (Int), Common) ;
Pinv = KLU_malloc (n, sizeof (Int), Common) ;
/* ---------------------------------------------------------------------- */
/* order each block of the BTF ordering, and a fill-reducing ordering */
/* ---------------------------------------------------------------------- */
if (Common->status == KLU_OK)
{
PRINTF (("calling analyze_worker\n")) ;
Common->status = analyze_worker (n, Ap, Ai, nblocks, Pbtf, Qbtf, R,
ordering, P, Q, Lnz, Pblk, Cp, Ci, Cilen, Pinv, Symbolic, Common) ;
PRINTF (("analyze_worker done\n")) ;
}
/* ---------------------------------------------------------------------- */
/* free all workspace */
/* ---------------------------------------------------------------------- */
KLU_free (Pblk, maxblock, sizeof (Int), Common) ;
KLU_free (Cp, maxblock+1, sizeof (Int), Common) ;
KLU_free (Ci, MAX (Cilen, nz+1), sizeof (Int), Common) ;
KLU_free (Pinv, n, sizeof (Int), Common) ;
KLU_free (Pbtf, n, sizeof (Int), Common) ;
KLU_free (Qbtf, n, sizeof (Int), Common) ;
/* ---------------------------------------------------------------------- */
/* return the symbolic object */
/* ---------------------------------------------------------------------- */
if (Common->status < KLU_OK)
{
KLU_free_symbolic (&Symbolic, Common) ;
}
return (Symbolic) ;
}
/* ========================================================================== */
/* === KLU_analyze ========================================================== */
/* ========================================================================== */
KLU_symbolic *KLU_analyze /* returns NULL if error, or a valid
KLU_symbolic object if successful */
(
/* inputs, not modified */
Int n, /* A is n-by-n */
Int Ap [ ], /* size n+1, column pointers */
Int Ai [ ], /* size nz, row indices */
/* -------------------- */
KLU_common *Common
)
{
/* ---------------------------------------------------------------------- */
/* get the control parameters for BTF and ordering method */
/* ---------------------------------------------------------------------- */
if (Common == NULL)
{
return (NULL) ;
}
Common->status = KLU_OK ;
Common->structural_rank = EMPTY ;
/* ---------------------------------------------------------------------- */
/* order and analyze */
/* ---------------------------------------------------------------------- */
if (Common->ordering == 2)
{
/* natural ordering */
return (KLU_analyze_given (n, Ap, Ai, NULL, NULL, Common)) ;
}
else
{
/* order with P and Q */
return (order_and_analyze (n, Ap, Ai, Common)) ;
}
}

369
src/maths/KLU/klu_analyze_given.c Normal file
View File

@ -0,0 +1,369 @@
/* ========================================================================== */
/* === klu_analyze_given ==================================================== */
/* ========================================================================== */
/* Given an input permutation P and Q, create the Symbolic object. BTF can
* be done to modify the user's P and Q (does not perform the max transversal;
* just finds the strongly-connected components). */
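/* An illustrative sketch (Puser and Quser are hypothetical user-supplied
 * permutations of size n; pass NULL for either one to use the identity):
 *
 *      Common.btf = 1 ;    keep the BTF post-processing of P and Q
 *      Symbolic = klu_analyze_given (n, Ap, Ai, Puser, Quser, &Common) ;
 */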
#include "klu_internal.h"
/* ========================================================================== */
/* === klu_alloc_symbolic =================================================== */
/* ========================================================================== */
/* Allocate Symbolic object, and check input matrix. Not user callable. */
KLU_symbolic *KLU_alloc_symbolic
(
Int n,
Int *Ap,
Int *Ai,
KLU_common *Common
)
{
KLU_symbolic *Symbolic ;
Int *P, *Q, *R ;
double *Lnz ;
Int nz, i, j, p, pend ;
if (Common == NULL)
{
return (NULL) ;
}
Common->status = KLU_OK ;
/* A is n-by-n, with n > 0. Ap [0] = 0 and nz = Ap [n] >= 0 required.
* Ap [j] <= Ap [j+1] must hold for all j = 0 to n-1. Row indices in Ai
* must be in the range 0 to n-1, and no duplicate entries can be present.
* The list of row indices in each column of A need not be sorted.
*/
if (n <= 0 || Ap == NULL || Ai == NULL)
{
/* Ap and Ai must be present, and n must be > 0 */
Common->status = KLU_INVALID ;
return (NULL) ;
}
nz = Ap [n] ;
if (Ap [0] != 0 || nz < 0)
{
/* nz must be >= 0 and Ap [0] must equal zero */
Common->status = KLU_INVALID ;
return (NULL) ;
}
for (j = 0 ; j < n ; j++)
{
if (Ap [j] > Ap [j+1])
{
/* column pointers must be non-decreasing */
Common->status = KLU_INVALID ;
return (NULL) ;
}
}
P = KLU_malloc (n, sizeof (Int), Common) ;
if (Common->status < KLU_OK)
{
/* out of memory */
Common->status = KLU_OUT_OF_MEMORY ;
return (NULL) ;
}
for (i = 0 ; i < n ; i++)
{
P [i] = EMPTY ;
}
for (j = 0 ; j < n ; j++)
{
pend = Ap [j+1] ;
for (p = Ap [j] ; p < pend ; p++)
{
i = Ai [p] ;
if (i < 0 || i >= n || P [i] == j)
{
/* row index out of range, or duplicate entry */
KLU_free (P, n, sizeof (Int), Common) ;
Common->status = KLU_INVALID ;
return (NULL) ;
}
/* flag row i as appearing in column j */
P [i] = j ;
}
}
/* ---------------------------------------------------------------------- */
/* allocate the Symbolic object */
/* ---------------------------------------------------------------------- */
Symbolic = KLU_malloc (sizeof (KLU_symbolic), 1, Common) ;
if (Common->status < KLU_OK)
{
/* out of memory */
KLU_free (P, n, sizeof (Int), Common) ;
Common->status = KLU_OUT_OF_MEMORY ;
return (NULL) ;
}
Q = KLU_malloc (n, sizeof (Int), Common) ;
R = KLU_malloc (n+1, sizeof (Int), Common) ;
Lnz = KLU_malloc (n, sizeof (double), Common) ;
Symbolic->n = n ;
Symbolic->nz = nz ;
Symbolic->P = P ;
Symbolic->Q = Q ;
Symbolic->R = R ;
Symbolic->Lnz = Lnz ;
if (Common->status < KLU_OK)
{
/* out of memory */
KLU_free_symbolic (&Symbolic, Common) ;
Common->status = KLU_OUT_OF_MEMORY ;
return (NULL) ;
}
return (Symbolic) ;
}
/* ========================================================================== */
/* === KLU_analyze_given ==================================================== */
/* ========================================================================== */
KLU_symbolic *KLU_analyze_given /* returns NULL if error, or a valid
KLU_symbolic object if successful */
(
/* inputs, not modified */
Int n, /* A is n-by-n */
Int Ap [ ], /* size n+1, column pointers */
Int Ai [ ], /* size nz, row indices */
Int Puser [ ], /* size n, user's row permutation (may be NULL) */
Int Quser [ ], /* size n, user's column permutation (may be NULL) */
/* -------------------- */
KLU_common *Common
)
{
KLU_symbolic *Symbolic ;
double *Lnz ;
Int nblocks, nz, block, maxblock, *P, *Q, *R, nzoff, p, pend, do_btf, k ;
/* ---------------------------------------------------------------------- */
/* determine if input matrix is valid, and get # of nonzeros */
/* ---------------------------------------------------------------------- */
Symbolic = KLU_alloc_symbolic (n, Ap, Ai, Common) ;
if (Symbolic == NULL)
{
return (NULL) ;
}
P = Symbolic->P ;
Q = Symbolic->Q ;
R = Symbolic->R ;
Lnz = Symbolic->Lnz ;
nz = Symbolic->nz ;
/* ---------------------------------------------------------------------- */
/* Q = Quser, or identity if Quser is NULL */
/* ---------------------------------------------------------------------- */
if (Quser == (Int *) NULL)
{
for (k = 0 ; k < n ; k++)
{
Q [k] = k ;
}
}
else
{
for (k = 0 ; k < n ; k++)
{
Q [k] = Quser [k] ;
}
}
/* ---------------------------------------------------------------------- */
/* get the control parameters for BTF and ordering method */
/* ---------------------------------------------------------------------- */
do_btf = Common->btf ;
do_btf = (do_btf) ? TRUE : FALSE ;
Symbolic->ordering = 2 ;
Symbolic->do_btf = do_btf ;
/* ---------------------------------------------------------------------- */
/* find the block triangular form, if requested */
/* ---------------------------------------------------------------------- */
if (do_btf)
{
/* ------------------------------------------------------------------ */
/* get workspace for BTF_strongcomp */
/* ------------------------------------------------------------------ */
Int *Pinv, *Work, *Bi, k1, k2, nk, oldcol ;
Work = KLU_malloc (4*n, sizeof (Int), Common) ;
Pinv = KLU_malloc (n, sizeof (Int), Common) ;
if (Puser != (Int *) NULL)
{
Bi = KLU_malloc (nz+1, sizeof (Int), Common) ;
}
else
{
Bi = Ai ;
}
if (Common->status < KLU_OK)
{
/* out of memory */
KLU_free (Work, 4*n, sizeof (Int), Common) ;
KLU_free (Pinv, n, sizeof (Int), Common) ;
if (Puser != (Int *) NULL)
{
KLU_free (Bi, nz+1, sizeof (Int), Common) ;
}
KLU_free_symbolic (&Symbolic, Common) ;
Common->status = KLU_OUT_OF_MEMORY ;
return (NULL) ;
}
/* ------------------------------------------------------------------ */
/* B = Puser * A */
/* ------------------------------------------------------------------ */
if (Puser != (Int *) NULL)
{
for (k = 0 ; k < n ; k++)
{
Pinv [Puser [k]] = k ;
}
for (p = 0 ; p < nz ; p++)
{
Bi [p] = Pinv [Ai [p]] ;
}
}
/* ------------------------------------------------------------------ */
/* find the strongly-connected components */
/* ------------------------------------------------------------------ */
/* modifies Q, and determines P and R */
nblocks = BTF_strongcomp (n, Ap, Bi, Q, P, R, Work) ;
/* ------------------------------------------------------------------ */
/* P = P * Puser */
/* ------------------------------------------------------------------ */
if (Puser != (Int *) NULL)
{
for (k = 0 ; k < n ; k++)
{
Work [k] = Puser [P [k]] ;
}
for (k = 0 ; k < n ; k++)
{
P [k] = Work [k] ;
}
}
/* ------------------------------------------------------------------ */
/* Pinv = inverse of P */
/* ------------------------------------------------------------------ */
for (k = 0 ; k < n ; k++)
{
Pinv [P [k]] = k ;
}
/* ------------------------------------------------------------------ */
/* analyze each block */
/* ------------------------------------------------------------------ */
nzoff = 0 ; /* nz in off-diagonal part */
maxblock = 1 ; /* size of the largest block */
for (block = 0 ; block < nblocks ; block++)
{
/* -------------------------------------------------------------- */
/* the block is from rows/columns k1 to k2-1 */
/* -------------------------------------------------------------- */
k1 = R [block] ;
k2 = R [block+1] ;
nk = k2 - k1 ;
PRINTF (("BLOCK %d, k1 %d k2-1 %d nk %d\n", block, k1, k2-1, nk)) ;
maxblock = MAX (maxblock, nk) ;
/* -------------------------------------------------------------- */
/* scan the kth block, C */
/* -------------------------------------------------------------- */
for (k = k1 ; k < k2 ; k++)
{
oldcol = Q [k] ;
pend = Ap [oldcol+1] ;
for (p = Ap [oldcol] ; p < pend ; p++)
{
if (Pinv [Ai [p]] < k1)
{
nzoff++ ;
}
}
}
/* fill-in not estimated */
Lnz [block] = EMPTY ;
}
/* ------------------------------------------------------------------ */
/* free all workspace */
/* ------------------------------------------------------------------ */
KLU_free (Work, 4*n, sizeof (Int), Common) ;
KLU_free (Pinv, n, sizeof (Int), Common) ;
if (Puser != (Int *) NULL)
{
KLU_free (Bi, nz+1, sizeof (Int), Common) ;
}
}
else
{
/* ------------------------------------------------------------------ */
/* BTF not requested */
/* ------------------------------------------------------------------ */
nzoff = 0 ;
nblocks = 1 ;
maxblock = n ;
R [0] = 0 ;
R [1] = n ;
Lnz [0] = EMPTY ;
/* ------------------------------------------------------------------ */
/* P = Puser, or identity if Puser is NULL */
/* ------------------------------------------------------------------ */
for (k = 0 ; k < n ; k++)
{
P [k] = (Puser == NULL) ? k : Puser [k] ;
}
}
/* ---------------------------------------------------------------------- */
/* return the symbolic object */
/* ---------------------------------------------------------------------- */
Symbolic->nblocks = nblocks ;
Symbolic->maxblock = maxblock ;
Symbolic->lnz = EMPTY ;
Symbolic->unz = EMPTY ;
Symbolic->nzoff = nzoff ;
return (Symbolic) ;
}

60
src/maths/KLU/klu_defaults.c Normal file
View File

@ -0,0 +1,60 @@
/* ========================================================================== */
/* === KLU_defaults ========================================================= */
/* ========================================================================== */
/* Sets default parameters for KLU */
#include "klu_internal.h"
Int KLU_defaults
(
KLU_common *Common
)
{
if (Common == NULL)
{
return (FALSE) ;
}
/* parameters */
Common->tol = 0.001 ; /* pivot tolerance for diagonal */
Common->memgrow = 1.2; /* realloc size ratio increase for LU factors */
Common->initmem_amd = 1.2 ; /* init. mem with AMD: c*nnz(L) + n */
Common->initmem = 10 ; /* init. mem otherwise: c*nnz(A) + n */
Common->btf = TRUE ; /* use BTF pre-ordering, or not */
Common->maxwork = 0 ; /* no limit to work done by btf_order */
Common->ordering = 0 ; /* 0: AMD, 1: COLAMD, 2: user-provided P and Q,
* 3: user-provided function */
Common->scale = 2 ; /* scale: -1: none, and do not check for errors
* in the input matrix in KLU_refactor.
* 0: none, but check for errors,
* 1: sum, 2: max */
Common->halt_if_singular = TRUE ; /* quick halt if matrix is singular */
/* memory management routines */
Common->malloc_memory = malloc ;
Common->calloc_memory = calloc ;
Common->free_memory = free ;
Common->realloc_memory = realloc ;
/* user ordering function and optional argument */
Common->user_order = NULL ;
Common->user_data = NULL ;
/* statistics */
Common->status = KLU_OK ;
Common->nrealloc = 0 ;
Common->structural_rank = EMPTY ;
Common->numerical_rank = EMPTY ;
Common->noffdiag = EMPTY ;
Common->flops = EMPTY ;
Common->rcond = EMPTY ;
Common->condest = EMPTY ;
Common->rgrowth = EMPTY ;
Common->work = 0 ; /* work done by btf_order */
Common->memusage = 0 ;
Common->mempeak = 0 ;
return (TRUE) ;
}
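/* An illustrative sketch: set the defaults first, then override individual
 * fields before calling klu_analyze / klu_factor.  For example:
 *
 *      klu_common Common ;
 *      klu_defaults (&Common) ;
 *      Common.ordering = 1 ;   use COLAMD instead of AMD
 *      Common.btf = 0 ;        disable the BTF pre-ordering
 *      Common.tol = 0.1 ;      require a larger diagonal entry before
 *                              accepting it as the pivot
 */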

570
src/maths/KLU/klu_diagnostics.c Normal file
View File

@ -0,0 +1,570 @@
/* ========================================================================== */
/* === KLU_diagnostics ====================================================== */
/* ========================================================================== */
/* Linear algebraic diagnostics:
* KLU_rgrowth: reciprocal pivot growth, takes O(|A|+|U|) time
* KLU_condest: condition number estimator, takes about O(|A|+5*(|L|+|U|)) time
* KLU_flops: compute # flops required to factorize A into L*U
* KLU_rcond: compute a really cheap estimate of the reciprocal of the
* condition number, min(abs(diag(U))) / max(abs(diag(U))).
* Takes O(n) time.
*/
#include "klu_internal.h"
/* ========================================================================== */
/* === KLU_rgrowth ========================================================== */
/* ========================================================================== */
/* Compute the reciprocal pivot growth factor. In MATLAB notation:
*
* rgrowth = min (max (abs ((R \ A (p,q)) - F)) ./ max (abs (U)))
*/
Int KLU_rgrowth /* return TRUE if successful, FALSE otherwise */
(
Int *Ap,
Int *Ai,
double *Ax,
KLU_symbolic *Symbolic,
KLU_numeric *Numeric,
KLU_common *Common
)
{
double temp, max_ai, max_ui, min_block_rgrowth ;
Entry aik ;
Int *Q, *Ui, *Uip, *Ulen, *Pinv ;
Unit *LU ;
Entry *Aentry, *Ux, *Ukk ;
double *Rs ;
Int i, newrow, oldrow, k1, k2, nk, j, oldcol, k, pend, len ;
/* ---------------------------------------------------------------------- */
/* check inputs */
/* ---------------------------------------------------------------------- */
if (Common == NULL)
{
return (FALSE) ;
}
if (Symbolic == NULL || Ap == NULL || Ai == NULL || Ax == NULL)
{
Common->status = KLU_INVALID ;
return (FALSE) ;
}
if (Numeric == NULL)
{
/* treat this as a singular matrix */
Common->rgrowth = 0 ;
Common->status = KLU_SINGULAR ;
return (TRUE) ;
}
Common->status = KLU_OK ;
/* ---------------------------------------------------------------------- */
/* compute the reciprocal pivot growth */
/* ---------------------------------------------------------------------- */
Aentry = (Entry *) Ax ;
Pinv = Numeric->Pinv ;
Rs = Numeric->Rs ;
Q = Symbolic->Q ;
Common->rgrowth = 1 ;
for (i = 0 ; i < Symbolic->nblocks ; i++)
{
k1 = Symbolic->R[i] ;
k2 = Symbolic->R[i+1] ;
nk = k2 - k1 ;
if (nk == 1)
{
continue ; /* skip singleton blocks */
}
LU = (Unit *) Numeric->LUbx[i] ;
Uip = Numeric->Uip + k1 ;
Ulen = Numeric->Ulen + k1 ;
Ukk = ((Entry *) Numeric->Udiag) + k1 ;
min_block_rgrowth = 1 ;
for (j = 0 ; j < nk ; j++)
{
max_ai = 0 ;
max_ui = 0 ;
oldcol = Q[j + k1] ;
pend = Ap [oldcol + 1] ;
for (k = Ap [oldcol] ; k < pend ; k++)
{
oldrow = Ai [k] ;
newrow = Pinv [oldrow] ;
if (newrow < k1)
{
continue ; /* skip entry outside the block */
}
ASSERT (newrow < k2) ;
if (Rs != NULL)
{
/* aik = Aentry [k] / Rs [oldrow] */
SCALE_DIV_ASSIGN (aik, Aentry [k], Rs [newrow]) ;
}
else
{
aik = Aentry [k] ;
}
/* temp = ABS (aik) */
ABS (temp, aik) ;
if (temp > max_ai)
{
max_ai = temp ;
}
}
GET_POINTER (LU, Uip, Ulen, Ui, Ux, j, len) ;
for (k = 0 ; k < len ; k++)
{
/* temp = ABS (Ux [k]) */
ABS (temp, Ux [k]) ;
if (temp > max_ui)
{
max_ui = temp ;
}
}
/* consider the diagonal element */
ABS (temp, Ukk [j]) ;
if (temp > max_ui)
{
max_ui = temp ;
}
/* if max_ui is 0, skip the column */
if (SCALAR_IS_ZERO (max_ui))
{
continue ;
}
temp = max_ai / max_ui ;
if (temp < min_block_rgrowth)
{
min_block_rgrowth = temp ;
}
}
if (min_block_rgrowth < Common->rgrowth)
{
Common->rgrowth = min_block_rgrowth ;
}
}
return (TRUE) ;
}
/* ========================================================================== */
/* === KLU_condest ========================================================== */
/* ========================================================================== */
/* Estimate the condition number. Uses Higham and Tisseur's algorithm
* (A block algorithm for matrix 1-norm estimation, with applications to
* 1-norm pseudospectra, SIAM J. Matrix Anal. Appl., 21(4):1185-1201, 2000).
*/
Int KLU_condest /* return TRUE if successful, FALSE otherwise */
(
Int Ap [ ],
double Ax [ ],
KLU_symbolic *Symbolic,
KLU_numeric *Numeric,
KLU_common *Common
)
{
double xj, Xmax, csum, anorm, ainv_norm, est_old, est_new, abs_value ;
Entry *Udiag, *Aentry, *X, *S ;
Int *R ;
Int nblocks, i, j, jmax, jnew, pend, n ;
#ifndef COMPLEX
Int unchanged ;
#endif
/* ---------------------------------------------------------------------- */
/* check inputs */
/* ---------------------------------------------------------------------- */
if (Common == NULL)
{
return (FALSE) ;
}
if (Symbolic == NULL || Ap == NULL || Ax == NULL)
{
Common->status = KLU_INVALID ;
return (FALSE) ;
}
abs_value = 0 ;
if (Numeric == NULL)
{
/* treat this as a singular matrix */
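/* abs_value is zero here, so the division below intentionally yields +Inf */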
Common->condest = 1 / abs_value ;
Common->status = KLU_SINGULAR ;
return (TRUE) ;
}
Common->status = KLU_OK ;
/* ---------------------------------------------------------------------- */
/* get inputs */
/* ---------------------------------------------------------------------- */
n = Symbolic->n ;
nblocks = Symbolic->nblocks ;
R = Symbolic->R ;
Udiag = Numeric->Udiag ;
/* ---------------------------------------------------------------------- */
/* check if diagonal of U has a zero on it */
/* ---------------------------------------------------------------------- */
for (i = 0 ; i < n ; i++)
{
ABS (abs_value, Udiag [i]) ;
if (SCALAR_IS_ZERO (abs_value))
{
Common->condest = 1 / abs_value ;
Common->status = KLU_SINGULAR ;
return (TRUE) ;
}
}
/* ---------------------------------------------------------------------- */
/* compute 1-norm (maximum column sum) of the matrix */
/* ---------------------------------------------------------------------- */
anorm = 0.0 ;
Aentry = (Entry *) Ax ;
for (i = 0 ; i < n ; i++)
{
pend = Ap [i + 1] ;
csum = 0.0 ;
for (j = Ap [i] ; j < pend ; j++)
{
ABS (abs_value, Aentry [j]) ;
csum += abs_value ;
}
if (csum > anorm)
{
anorm = csum ;
}
}
/* ---------------------------------------------------------------------- */
/* compute estimate of 1-norm of inv (A) */
/* ---------------------------------------------------------------------- */
/* get workspace (size 2*n Entry's) */
X = Numeric->Xwork ; /* size n space used in KLU_solve, tsolve */
X += n ; /* X is size n */
S = X + n ; /* S is size n */
for (i = 0 ; i < n ; i++)
{
CLEAR (S [i]) ;
CLEAR (X [i]) ;
REAL (X [i]) = 1.0 / ((double) n) ;
}
jmax = 0 ;
ainv_norm = 0.0 ;
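/* Hager/Higham 1-norm estimation: repeatedly solve with A and its (conjugate)
 * transpose, at most 5 times, steering each solve with the sign (or
 * unit-modulus) vector S; stop early when the estimate no longer increases or
 * when the index of the largest entry repeats */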
for (i = 0 ; i < 5 ; i++)
{
if (i > 0)
{
/* X [jmax] is the largest entry in X */
for (j = 0 ; j < n ; j++)
{
/* X [j] = 0 ;*/
CLEAR (X [j]) ;
}
REAL (X [jmax]) = 1 ;
}
KLU_solve (Symbolic, Numeric, n, 1, (double *) X, Common) ;
est_old = ainv_norm ;
ainv_norm = 0.0 ;
for (j = 0 ; j < n ; j++)
{
/* ainv_norm += ABS (X [j]) ;*/
ABS (abs_value, X [j]) ;
ainv_norm += abs_value ;
}
#ifndef COMPLEX
unchanged = TRUE ;
for (j = 0 ; j < n ; j++)
{
double s = (X [j] >= 0) ? 1 : -1 ;
if (s != (Int) REAL (S [j]))
{
S [j] = s ;
unchanged = FALSE ;
}
}
if (i > 0 && (ainv_norm <= est_old || unchanged))
{
break ;
}
#else
for (j = 0 ; j < n ; j++)
{
if (IS_NONZERO (X [j]))
{
ABS (abs_value, X [j]) ;
SCALE_DIV_ASSIGN (S [j], X [j], abs_value) ;
}
else
{
CLEAR (S [j]) ;
REAL (S [j]) = 1 ;
}
}
if (i > 0 && ainv_norm <= est_old)
{
break ;
}
#endif
for (j = 0 ; j < n ; j++)
{
X [j] = S [j] ;
}
#ifndef COMPLEX
/* do a transpose solve */
KLU_tsolve (Symbolic, Numeric, n, 1, X, Common) ;
#else
/* do a conjugate transpose solve */
KLU_tsolve (Symbolic, Numeric, n, 1, (double *) X, 1, Common) ;
#endif
/* jnew = the position of the largest entry in X */
jnew = 0 ;
Xmax = 0 ;
for (j = 0 ; j < n ; j++)
{
/* xj = ABS (X [j]) ;*/
ABS (xj, X [j]) ;
if (xj > Xmax)
{
Xmax = xj ;
jnew = j ;
}
}
if (i > 0 && jnew == jmax)
{
/* the position of the largest entry did not change
* from the previous iteration */
break ;
}
jmax = jnew ;
}
/* ---------------------------------------------------------------------- */
/* compute another estimate of norm(inv(A),1), and take the largest one */
/* ---------------------------------------------------------------------- */
for (j = 0 ; j < n ; j++)
{
CLEAR (X [j]) ;
if (j % 2)
{
REAL (X [j]) = 1 + ((double) j) / ((double) (n-1)) ;
}
else
{
REAL (X [j]) = -1 - ((double) j) / ((double) (n-1)) ;
}
}
KLU_solve (Symbolic, Numeric, n, 1, (double *) X, Common) ;
est_new = 0.0 ;
for (j = 0 ; j < n ; j++)
{
/* est_new += ABS (X [j]) ;*/
ABS (abs_value, X [j]) ;
est_new += abs_value ;
}
est_new = 2 * est_new / (3 * n) ;
ainv_norm = MAX (est_new, ainv_norm) ;
/* ---------------------------------------------------------------------- */
/* compute estimate of condition number */
/* ---------------------------------------------------------------------- */
Common->condest = ainv_norm * anorm ;
return (TRUE) ;
}
/* ========================================================================== */
/* === KLU_flops ============================================================ */
/* ========================================================================== */
/* Compute the flop count for the LU factorization (in Common->flops) */
Int KLU_flops /* return TRUE if successful, FALSE otherwise */
(
KLU_symbolic *Symbolic,
KLU_numeric *Numeric,
KLU_common *Common
)
{
double flops = 0 ;
Int *R, *Ui, *Uip, *Llen, *Ulen ;
Unit **LUbx ;
Unit *LU ;
Int k, ulen, p, n, nk, block, nblocks, k1 ;
/* ---------------------------------------------------------------------- */
/* check inputs */
/* ---------------------------------------------------------------------- */
if (Common == NULL)
{
return (FALSE) ;
}
Common->flops = EMPTY ;
if (Numeric == NULL || Symbolic == NULL)
{
Common->status = KLU_INVALID ;
return (FALSE) ;
}
Common->status = KLU_OK ;
/* ---------------------------------------------------------------------- */
/* get the contents of the Symbolic object */
/* ---------------------------------------------------------------------- */
n = Symbolic->n ;
R = Symbolic->R ;
nblocks = Symbolic->nblocks ;
/* ---------------------------------------------------------------------- */
/* get the contents of the Numeric object */
/* ---------------------------------------------------------------------- */
LUbx = (Unit **) Numeric->LUbx ;
/* ---------------------------------------------------------------------- */
/* compute the flop count */
/* ---------------------------------------------------------------------- */
for (block = 0 ; block < nblocks ; block++)
{
k1 = R [block] ;
nk = R [block+1] - k1 ;
if (nk > 1)
{
Llen = Numeric->Llen + k1 ;
Uip = Numeric->Uip + k1 ;
Ulen = Numeric->Ulen + k1 ;
LU = LUbx [block] ;
for (k = 0 ; k < nk ; k++)
{
/* compute kth column of U, and update kth column of A */
GET_I_POINTER (LU, Uip, Ui, k) ;
ulen = Ulen [k] ;
for (p = 0 ; p < ulen ; p++)
{
flops += 2 * Llen [Ui [p]] ;
}
/* gather and divide by pivot to get kth column of L */
flops += Llen [k] ;
}
}
}
Common->flops = flops ;
return (TRUE) ;
}
/* ========================================================================== */
/* === KLU_rcond ============================================================ */
/* ========================================================================== */
/* Compute a really cheap estimate of the reciprocal of the condition number,
* min(abs(diag(U))) / max(abs(diag(U))). If U has a zero
* pivot, or a NaN pivot, rcond will be zero. Takes O(n) time.
*/
Int KLU_rcond /* return TRUE if successful, FALSE otherwise */
(
KLU_symbolic *Symbolic, /* input, not modified */
KLU_numeric *Numeric, /* input, not modified */
KLU_common *Common /* result in Common->rcond */
)
{
double ukk, umin = 0, umax = 0 ;
Entry *Udiag ;
Int j, n ;
/* ---------------------------------------------------------------------- */
/* check inputs */
/* ---------------------------------------------------------------------- */
if (Common == NULL)
{
return (FALSE) ;
}
if (Symbolic == NULL)
{
Common->status = KLU_INVALID ;
return (FALSE) ;
}
if (Numeric == NULL)
{
Common->rcond = 0 ;
Common->status = KLU_SINGULAR ;
return (TRUE) ;
}
Common->status = KLU_OK ;
/* ---------------------------------------------------------------------- */
/* compute rcond */
/* ---------------------------------------------------------------------- */
n = Symbolic->n ;
Udiag = Numeric->Udiag ;
for (j = 0 ; j < n ; j++)
{
/* get the magnitude of the pivot */
ABS (ukk, Udiag [j]) ;
if (SCALAR_IS_NAN (ukk) || SCALAR_IS_ZERO (ukk))
{
/* if NaN, or zero, the rcond is zero */
Common->rcond = 0 ;
Common->status = KLU_SINGULAR ;
return (TRUE) ;
}
if (j == 0)
{
/* first pivot entry */
umin = ukk ;
umax = ukk ;
}
else
{
/* subsequent pivots */
umin = MIN (umin, ukk) ;
umax = MAX (umax, ukk) ;
}
}
Common->rcond = umin / umax ;
if (SCALAR_IS_NAN (Common->rcond) || SCALAR_IS_ZERO (Common->rcond))
{
/* this can occur if umin or umax are Inf or NaN */
Common->rcond = 0 ;
Common->status = KLU_SINGULAR ;
}
return (TRUE) ;
}

142
src/maths/KLU/klu_dump.c Normal file
View File

@ -0,0 +1,142 @@
/* ========================================================================== */
/* === KLU_dump ============================================================= */
/* ========================================================================== */
/* Debug routines for klu. Only used when NDEBUG is not defined at
* compile-time.
*/
#include "klu_internal.h"
#ifndef NDEBUG
/* ========================================================================== */
/* === KLU_valid ============================================================ */
/* ========================================================================== */
/* Check if a column-form matrix is valid or not. The matrix A is
* n-by-n. The row indices of entries in column j are in
* Ai [Ap [j] ... Ap [j+1]-1]. Required conditions are:
*
* n >= 0
* nz = Ap [n_col] >= 0 number of entries in the matrix
* Ap [0] == 0
* Ap [j] <= Ap [j+1] for all j in the range 0 to n_col.
* row indices in Ai [Ap [j] ... Ap [j+1]-1]
* must be in the range 0 to n_row-1,
* and no duplicate entries can exist (duplicates not checked here).
*
* Not user-callable. Only used when debugging.
*/
Int KLU_valid (Int n, Int Ap [ ], Int Ai [ ], Entry Ax [ ])
{
Int nz, j, p1, p2, i, p ;
PRINTF (("\ncolumn oriented matrix, n = %d\n", n)) ;
if (n <= 0)
{
PRINTF (("n must be >= 0: %d\n", n)) ;
return (FALSE) ;
}
nz = Ap [n] ;
if (Ap [0] != 0 || nz < 0)
{
/* column pointers must start at Ap [0] = 0, and Ap [n] must be >= 0 */
PRINTF (("column 0 pointer bad or nz < 0\n")) ;
return (FALSE) ;
}
for (j = 0 ; j < n ; j++)
{
p1 = Ap [j] ;
p2 = Ap [j+1] ;
PRINTF (("\nColumn: %d p1: %d p2: %d\n", j, p1, p2)) ;
if (p1 > p2)
{
/* column pointers must be ascending */
PRINTF (("column %d pointer bad\n", j)) ;
return (FALSE) ;
}
for (p = p1 ; p < p2 ; p++)
{
i = Ai [p] ;
PRINTF (("row: %d", i)) ;
if (i < 0 || i >= n)
{
/* row index out of range */
PRINTF (("index out of range, col %d row %d\n", j, i)) ;
return (FALSE) ;
}
if (Ax != (Entry *) NULL)
{
PRINT_ENTRY (Ax [p]) ;
}
PRINTF (("\n")) ;
}
}
return (TRUE) ;
}
/* ========================================================================== */
/* === KLU_valid_LU ========================================================= */
/* ========================================================================== */
/* This function does the same validity tests as KLU_valid but for the
* LU factor storage format. The flag flag_test_start_ptr is used to
* test if Xip [0] = 0. This is not applicable for U. So when calling this
* function for U, the flag should be set to false. Only used when debugging.
*/
Int KLU_valid_LU (Int n, Int flag_test_start_ptr, Int Xip [ ],
Int Xlen [ ], Unit LU [ ])
{
Int *Xi ;
Entry *Xx ;
Int j, p1, p2, i, p, len ;
PRINTF (("\ncolumn oriented matrix, n = %d\n", n)) ;
if (n <= 0)
{
PRINTF (("n must be >= 0: %d\n", n)) ;
return (FALSE) ;
}
if (flag_test_start_ptr && Xip [0] != 0)
{
/* column pointers must start at Xip [0] = 0*/
PRINTF (("column 0 pointer bad\n")) ;
return (FALSE) ;
}
for (j = 0 ; j < n ; j++)
{
p1 = Xip [j] ;
p2 = Xip [j+1] ;
PRINTF (("\nColumn: %d p1: %d p2: %d\n", j, p1, p2)) ;
if (p1 > p2)
{
/* column pointers must be ascending */
PRINTF (("column %d pointer bad\n", j)) ;
return (FALSE) ;
}
GET_POINTER (LU, Xip, Xlen, Xi, Xx, j, len) ;
for (p = 0 ; p < len ; p++)
{
i = Xi [p] ;
PRINTF (("row: %d", i)) ;
if (i < 0 || i >= n)
{
/* row index out of range */
PRINTF (("index out of range, col %d row %d\n", j, i)) ;
return (FALSE) ;
}
if (Xx != (Entry *) NULL)
{
PRINT_ENTRY (Xx [p]) ;
}
PRINTF (("\n")) ;
}
}
return (TRUE) ;
}
#endif

290
src/maths/KLU/klu_extract.c Normal file
View File

@ -0,0 +1,290 @@
/* ========================================================================== */
/* === KLU_extract ========================================================== */
/* ========================================================================== */
/* Extract KLU factorization into conventional compressed-column matrices.
* If any output array is NULL, that part of the LU factorization is not
* extracted (this is not an error condition).
*
* nnz(L) = Numeric->lnz, nnz(U) = Numeric->unz, and nnz(F) = Numeric->Offp [n]
*/
#include "klu_internal.h"
Int KLU_extract /* returns TRUE if successful, FALSE otherwise */
(
/* inputs: */
KLU_numeric *Numeric,
KLU_symbolic *Symbolic,
/* outputs, all of which must be allocated on input */
/* L */
Int *Lp, /* size n+1 */
Int *Li, /* size nnz(L) */
double *Lx, /* size nnz(L) */
#ifdef COMPLEX
double *Lz, /* size nnz(L) for the complex case, ignored if real */
#endif
/* U */
Int *Up, /* size n+1 */
Int *Ui, /* size nnz(U) */
double *Ux, /* size nnz(U) */
#ifdef COMPLEX
double *Uz, /* size nnz(U) for the complex case, ignored if real */
#endif
/* F */
Int *Fp, /* size n+1 */
Int *Fi, /* size nnz(F) */
double *Fx, /* size nnz(F) */
#ifdef COMPLEX
double *Fz, /* size nnz(F) for the complex case, ignored if real */
#endif
/* P, row permutation */
Int *P, /* size n */
/* Q, column permutation */
Int *Q, /* size n */
/* Rs, scale factors */
double *Rs, /* size n */
/* R, block boundaries */
Int *R, /* size nblocks+1 */
KLU_common *Common
)
{
Int *Lip, *Llen, *Uip, *Ulen, *Li2, *Ui2 ;
Unit *LU ;
Entry *Lx2, *Ux2, *Ukk ;
Int i, k, block, nblocks, n, nz, k1, k2, nk, len, kk, p ;
if (Common == NULL)
{
return (FALSE) ;
}
if (Symbolic == NULL || Numeric == NULL)
{
Common->status = KLU_INVALID ;
return (FALSE) ;
}
Common->status = KLU_OK ;
n = Symbolic->n ;
nblocks = Symbolic->nblocks ;
/* ---------------------------------------------------------------------- */
/* extract scale factors */
/* ---------------------------------------------------------------------- */
if (Rs != NULL)
{
if (Numeric->Rs != NULL)
{
for (i = 0 ; i < n ; i++)
{
Rs [i] = Numeric->Rs [i] ;
}
}
else
{
/* no scaling */
for (i = 0 ; i < n ; i++)
{
Rs [i] = 1 ;
}
}
}
/* ---------------------------------------------------------------------- */
/* extract block boundaries */
/* ---------------------------------------------------------------------- */
if (R != NULL)
{
for (block = 0 ; block <= nblocks ; block++)
{
R [block] = Symbolic->R [block] ;
}
}
/* ---------------------------------------------------------------------- */
/* extract final row permutation */
/* ---------------------------------------------------------------------- */
if (P != NULL)
{
for (k = 0 ; k < n ; k++)
{
P [k] = Numeric->Pnum [k] ;
}
}
/* ---------------------------------------------------------------------- */
/* extract column permutation */
/* ---------------------------------------------------------------------- */
if (Q != NULL)
{
for (k = 0 ; k < n ; k++)
{
Q [k] = Symbolic->Q [k] ;
}
}
/* ---------------------------------------------------------------------- */
/* extract each block of L */
/* ---------------------------------------------------------------------- */
if (Lp != NULL && Li != NULL && Lx != NULL
#ifdef COMPLEX
&& Lz != NULL
#endif
)
{
nz = 0 ;
for (block = 0 ; block < nblocks ; block++)
{
k1 = Symbolic->R [block] ;
k2 = Symbolic->R [block+1] ;
nk = k2 - k1 ;
if (nk == 1)
{
/* singleton block */
Lp [k1] = nz ;
Li [nz] = k1 ;
Lx [nz] = 1 ;
#ifdef COMPLEX
Lz [nz] = 0 ;
#endif
nz++ ;
}
else
{
/* non-singleton block */
LU = Numeric->LUbx [block] ;
Lip = Numeric->Lip + k1 ;
Llen = Numeric->Llen + k1 ;
for (kk = 0 ; kk < nk ; kk++)
{
Lp [k1+kk] = nz ;
/* add the unit diagonal entry */
Li [nz] = k1 + kk ;
Lx [nz] = 1 ;
#ifdef COMPLEX
Lz [nz] = 0 ;
#endif
nz++ ;
GET_POINTER (LU, Lip, Llen, Li2, Lx2, kk, len) ;
for (p = 0 ; p < len ; p++)
{
Li [nz] = k1 + Li2 [p] ;
Lx [nz] = REAL (Lx2 [p]) ;
#ifdef COMPLEX
Lz [nz] = IMAG (Lx2 [p]) ;
#endif
nz++ ;
}
}
}
}
Lp [n] = nz ;
ASSERT (nz == Numeric->lnz) ;
}
/* ---------------------------------------------------------------------- */
/* extract each block of U */
/* ---------------------------------------------------------------------- */
if (Up != NULL && Ui != NULL && Ux != NULL
#ifdef COMPLEX
&& Uz != NULL
#endif
)
{
nz = 0 ;
for (block = 0 ; block < nblocks ; block++)
{
k1 = Symbolic->R [block] ;
k2 = Symbolic->R [block+1] ;
nk = k2 - k1 ;
Ukk = ((Entry *) Numeric->Udiag) + k1 ;
if (nk == 1)
{
/* singleton block */
Up [k1] = nz ;
Ui [nz] = k1 ;
Ux [nz] = REAL (Ukk [0]) ;
#ifdef COMPLEX
Uz [nz] = IMAG (Ukk [0]) ;
#endif
nz++ ;
}
else
{
/* non-singleton block */
LU = Numeric->LUbx [block] ;
Uip = Numeric->Uip + k1 ;
Ulen = Numeric->Ulen + k1 ;
for (kk = 0 ; kk < nk ; kk++)
{
Up [k1+kk] = nz ;
GET_POINTER (LU, Uip, Ulen, Ui2, Ux2, kk, len) ;
for (p = 0 ; p < len ; p++)
{
Ui [nz] = k1 + Ui2 [p] ;
Ux [nz] = REAL (Ux2 [p]) ;
#ifdef COMPLEX
Uz [nz] = IMAG (Ux2 [p]) ;
#endif
nz++ ;
}
/* add the diagonal entry */
Ui [nz] = k1 + kk ;
Ux [nz] = REAL (Ukk [kk]) ;
#ifdef COMPLEX
Uz [nz] = IMAG (Ukk [kk]) ;
#endif
nz++ ;
}
}
}
Up [n] = nz ;
ASSERT (nz == Numeric->unz) ;
}
/* ---------------------------------------------------------------------- */
/* extract the off-diagonal blocks, F */
/* ---------------------------------------------------------------------- */
if (Fp != NULL && Fi != NULL && Fx != NULL
#ifdef COMPLEX
&& Fz != NULL
#endif
)
{
for (k = 0 ; k <= n ; k++)
{
Fp [k] = Numeric->Offp [k] ;
}
nz = Fp [n] ;
for (k = 0 ; k < nz ; k++)
{
Fi [k] = Numeric->Offi [k] ;
}
for (k = 0 ; k < nz ; k++)
{
Fx [k] = REAL (((Entry *) Numeric->Offx) [k]) ;
#ifdef COMPLEX
Fz [k] = IMAG (((Entry *) Numeric->Offx) [k]) ;
#endif
}
}
return (TRUE) ;
}

545
src/maths/KLU/klu_factor.c Normal file
View File

@ -0,0 +1,545 @@
/* ========================================================================== */
/* === KLU_factor =========================================================== */
/* ========================================================================== */
/* Factor the matrix, after ordering and analyzing it with KLU_analyze
* or KLU_analyze_given.
*/
#include "klu_internal.h"
/* ========================================================================== */
/* === KLU_factor2 ========================================================== */
/* ========================================================================== */
static void factor2
(
/* inputs, not modified */
Int Ap [ ], /* size n+1, column pointers */
Int Ai [ ], /* size nz, row indices */
Entry Ax [ ],
KLU_symbolic *Symbolic,
/* inputs, modified on output: */
KLU_numeric *Numeric,
KLU_common *Common
)
{
double lsize ;
double *Lnz, *Rs ;
Int *P, *Q, *R, *Pnum, *Offp, *Offi, *Pblock, *Pinv, *Iwork,
*Lip, *Uip, *Llen, *Ulen ;
Entry *Offx, *X, s, *Udiag ;
Unit **LUbx ;
Int k1, k2, nk, k, block, oldcol, pend, oldrow, n, lnz, unz, p, newrow,
nblocks, poff, nzoff, lnz_block, unz_block, scale, max_lnz_block,
max_unz_block ;
/* ---------------------------------------------------------------------- */
/* initializations */
/* ---------------------------------------------------------------------- */
/* get the contents of the Symbolic object */
n = Symbolic->n ;
P = Symbolic->P ;
Q = Symbolic->Q ;
R = Symbolic->R ;
Lnz = Symbolic->Lnz ;
nblocks = Symbolic->nblocks ;
nzoff = Symbolic->nzoff ;
Pnum = Numeric->Pnum ;
Offp = Numeric->Offp ;
Offi = Numeric->Offi ;
Offx = (Entry *) Numeric->Offx ;
Lip = Numeric->Lip ;
Uip = Numeric->Uip ;
Llen = Numeric->Llen ;
Ulen = Numeric->Ulen ;
LUbx = (Unit **) Numeric->LUbx ;
Udiag = Numeric->Udiag ;
Rs = Numeric->Rs ;
Pinv = Numeric->Pinv ;
X = (Entry *) Numeric->Xwork ; /* X is of size n */
Iwork = Numeric->Iwork ; /* 5*maxblock for KLU_factor */
/* 1*maxblock for Pblock */
Pblock = Iwork + 5*((size_t) Symbolic->maxblock) ;
Common->nrealloc = 0 ;
scale = Common->scale ;
max_lnz_block = 1 ;
max_unz_block = 1 ;
/* compute the inverse of P from symbolic analysis. Will be updated to
* become the inverse of the numerical factorization when the factorization
* is done, for use in KLU_refactor */
#ifndef NDEBUG
for (k = 0 ; k < n ; k++)
{
Pinv [k] = EMPTY ;
}
#endif
for (k = 0 ; k < n ; k++)
{
ASSERT (P [k] >= 0 && P [k] < n) ;
Pinv [P [k]] = k ;
}
#ifndef NDEBUG
for (k = 0 ; k < n ; k++) ASSERT (Pinv [k] != EMPTY) ;
#endif
lnz = 0 ;
unz = 0 ;
Common->noffdiag = 0 ;
Offp [0] = 0 ;
/* ---------------------------------------------------------------------- */
/* optionally check input matrix and compute scale factors */
/* ---------------------------------------------------------------------- */
if (scale >= 0)
{
/* use Pnum as workspace. NOTE: scale factors are not yet permuted
* according to the final pivot row ordering, so Rs [oldrow] is the
* scale factor for A (oldrow,:), for the user's matrix A. Pnum is
* used as workspace in KLU_scale. When the factorization is done,
* the scale factors are permuted according to the final pivot row
* permutation, so that Rs [k] is the scale factor for the kth row of
* A(p,q) where p and q are the final row and column permutations. */
KLU_scale (scale, n, Ap, Ai, (double *) Ax, Rs, Pnum, Common) ;
if (Common->status < KLU_OK)
{
/* matrix is invalid */
return ;
}
}
#ifndef NDEBUG
if (scale > 0)
{
for (k = 0 ; k < n ; k++) PRINTF (("Rs [%d] %g\n", k, Rs [k])) ;
}
#endif
/* ---------------------------------------------------------------------- */
/* factor each block using klu */
/* ---------------------------------------------------------------------- */
for (block = 0 ; block < nblocks ; block++)
{
/* ------------------------------------------------------------------ */
/* the block is from rows/columns k1 to k2-1 */
/* ------------------------------------------------------------------ */
k1 = R [block] ;
k2 = R [block+1] ;
nk = k2 - k1 ;
PRINTF (("FACTOR BLOCK %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
if (nk == 1)
{
/* -------------------------------------------------------------- */
/* singleton case */
/* -------------------------------------------------------------- */
poff = Offp [k1] ;
oldcol = Q [k1] ;
pend = Ap [oldcol+1] ;
CLEAR (s) ;
if (scale <= 0)
{
/* no scaling */
for (p = Ap [oldcol] ; p < pend ; p++)
{
oldrow = Ai [p] ;
newrow = Pinv [oldrow] ;
if (newrow < k1)
{
Offi [poff] = oldrow ;
Offx [poff] = Ax [p] ;
poff++ ;
}
else
{
ASSERT (newrow == k1) ;
PRINTF (("singleton block %d", block)) ;
PRINT_ENTRY (Ax [p]) ;
s = Ax [p] ;
}
}
}
else
{
/* row scaling. NOTE: scale factors are not yet permuted
* according to the pivot row permutation, so Rs [oldrow] is
* used below. When the factorization is done, the scale
* factors are permuted, so that Rs [newrow] will be used in
* klu_solve, klu_tsolve, and klu_rgrowth */
for (p = Ap [oldcol] ; p < pend ; p++)
{
oldrow = Ai [p] ;
newrow = Pinv [oldrow] ;
if (newrow < k1)
{
Offi [poff] = oldrow ;
/* Offx [poff] = Ax [p] / Rs [oldrow] ; */
SCALE_DIV_ASSIGN (Offx [poff], Ax [p], Rs [oldrow]) ;
poff++ ;
}
else
{
ASSERT (newrow == k1) ;
PRINTF (("singleton block %d ", block)) ;
PRINT_ENTRY (Ax[p]) ;
SCALE_DIV_ASSIGN (s, Ax [p], Rs [oldrow]) ;
}
}
}
Udiag [k1] = s ;
if (IS_ZERO (s))
{
/* singular singleton */
Common->status = KLU_SINGULAR ;
Common->numerical_rank = k1 ;
Common->singular_col = oldcol ;
if (Common->halt_if_singular)
{
return ;
}
}
Offp [k1+1] = poff ;
Pnum [k1] = P [k1] ;
lnz++ ;
unz++ ;
}
else
{
/* -------------------------------------------------------------- */
/* construct and factorize the kth block */
/* -------------------------------------------------------------- */
if (Lnz [block] < 0)
{
/* COLAMD was used - no estimate of fill-in */
/* use 10 times the nnz in A, plus n */
lsize = -(Common->initmem) ;
}
else
{
lsize = Common->initmem_amd * Lnz [block] + nk ;
}
/* allocates one array: LUbx [block] */
Numeric->LUsize [block] = KLU_kernel_factor (nk, Ap, Ai, Ax, Q,
lsize, &LUbx [block], Udiag + k1, Llen + k1, Ulen + k1,
Lip + k1, Uip + k1, Pblock, &lnz_block, &unz_block,
X, Iwork, k1, Pinv, Rs, Offp, Offi, Offx, Common) ;
if (Common->status < KLU_OK ||
(Common->status == KLU_SINGULAR && Common->halt_if_singular))
{
/* out of memory, invalid inputs, or singular */
return ;
}
PRINTF (("\n----------------------- L %d:\n", block)) ;
ASSERT (KLU_valid_LU (nk, TRUE, Lip+k1, Llen+k1, LUbx [block])) ;
PRINTF (("\n----------------------- U %d:\n", block)) ;
ASSERT (KLU_valid_LU (nk, FALSE, Uip+k1, Ulen+k1, LUbx [block])) ;
/* -------------------------------------------------------------- */
/* get statistics */
/* -------------------------------------------------------------- */
lnz += lnz_block ;
unz += unz_block ;
max_lnz_block = MAX (max_lnz_block, lnz_block) ;
max_unz_block = MAX (max_unz_block, unz_block) ;
if (Lnz [block] == EMPTY)
{
/* revise estimate for subsequent factorization */
Lnz [block] = MAX (lnz_block, unz_block) ;
}
/* -------------------------------------------------------------- */
/* combine the klu row ordering with the symbolic pre-ordering */
/* -------------------------------------------------------------- */
PRINTF (("Pnum, 1-based:\n")) ;
for (k = 0 ; k < nk ; k++)
{
ASSERT (k + k1 < n) ;
ASSERT (Pblock [k] + k1 < n) ;
Pnum [k + k1] = P [Pblock [k] + k1] ;
PRINTF (("Pnum (%d + %d + 1 = %d) = %d + 1 = %d\n",
k, k1, k+k1+1, Pnum [k+k1], Pnum [k+k1]+1)) ;
}
/* the local pivot row permutation Pblock is no longer needed */
}
}
ASSERT (nzoff == Offp [n]) ;
PRINTF (("\n------------------- Off diagonal entries:\n")) ;
ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
Numeric->lnz = lnz ;
Numeric->unz = unz ;
Numeric->max_lnz_block = max_lnz_block ;
Numeric->max_unz_block = max_unz_block ;
/* compute the inverse of Pnum */
#ifndef NDEBUG
for (k = 0 ; k < n ; k++)
{
Pinv [k] = EMPTY ;
}
#endif
for (k = 0 ; k < n ; k++)
{
ASSERT (Pnum [k] >= 0 && Pnum [k] < n) ;
Pinv [Pnum [k]] = k ;
}
#ifndef NDEBUG
for (k = 0 ; k < n ; k++) ASSERT (Pinv [k] != EMPTY) ;
#endif
/* permute scale factors Rs according to pivotal row order */
if (scale > 0)
{
for (k = 0 ; k < n ; k++)
{
REAL (X [k]) = Rs [Pnum [k]] ;
}
for (k = 0 ; k < n ; k++)
{
Rs [k] = REAL (X [k]) ;
}
}
PRINTF (("\n------------------- Off diagonal entries, old:\n")) ;
ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
/* apply the pivot row permutations to the off-diagonal entries */
for (p = 0 ; p < nzoff ; p++)
{
ASSERT (Offi [p] >= 0 && Offi [p] < n) ;
Offi [p] = Pinv [Offi [p]] ;
}
PRINTF (("\n------------------- Off diagonal entries, new:\n")) ;
ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
#ifndef NDEBUG
{
Entry ss, *Udiag = Numeric->Udiag ;
PRINTF (("\n ############# KLU_BTF_FACTOR done, nblocks %d\n",nblocks)) ;
for (block = 0 ; block < nblocks && Common->status == KLU_OK ; block++)
{
k1 = R [block] ;
k2 = R [block+1] ;
nk = k2 - k1 ;
PRINTF (("\n======================KLU_factor output: k1 %d k2 %d nk %d\n",k1,k2,nk)) ;
if (nk == 1)
{
PRINTF (("singleton ")) ;
/* ENTRY_PRINT (singleton [block]) ; */
ss = Udiag [k1] ;
PRINT_ENTRY (ss) ;
}
else
{
Int *Lip, *Uip, *Llen, *Ulen ;
Unit *LU ;
Lip = Numeric->Lip + k1 ;
Llen = Numeric->Llen + k1 ;
LU = (Unit *) Numeric->LUbx [block] ;
PRINTF (("\n---- L block %d\n", block));
ASSERT (KLU_valid_LU (nk, TRUE, Lip, Llen, LU)) ;
Uip = Numeric->Uip + k1 ;
Ulen = Numeric->Ulen + k1 ;
PRINTF (("\n---- U block %d\n", block)) ;
ASSERT (KLU_valid_LU (nk, FALSE, Uip, Ulen, LU)) ;
}
}
}
#endif
}
/* ========================================================================== */
/* === KLU_factor =========================================================== */
/* ========================================================================== */
KLU_numeric *KLU_factor /* returns NULL if error, or a valid
KLU_numeric object if successful */
(
/* --- inputs --- */
Int Ap [ ], /* size n+1, column pointers */
Int Ai [ ], /* size nz, row indices */
double Ax [ ],
KLU_symbolic *Symbolic,
/* -------------- */
KLU_common *Common
)
{
Int n, nzoff, nblocks, maxblock, k, ok = TRUE ;
Int *R ;
KLU_numeric *Numeric ;
size_t n1, nzoff1, s, b6, n3 ;
if (Common == NULL)
{
return (NULL) ;
}
Common->status = KLU_OK ;
Common->numerical_rank = EMPTY ;
Common->singular_col = EMPTY ;
/* ---------------------------------------------------------------------- */
/* get the contents of the Symbolic object */
/* ---------------------------------------------------------------------- */
/* check for a valid Symbolic object */
if (Symbolic == NULL)
{
Common->status = KLU_INVALID ;
return (NULL) ;
}
n = Symbolic->n ;
nzoff = Symbolic->nzoff ;
nblocks = Symbolic->nblocks ;
maxblock = Symbolic->maxblock ;
R = Symbolic->R ;
PRINTF (("KLU_factor: n %d nzoff %d nblocks %d maxblock %d\n",
n, nzoff, nblocks, maxblock)) ;
/* ---------------------------------------------------------------------- */
/* get control parameters and make sure they are in the proper range */
/* ---------------------------------------------------------------------- */
Common->initmem_amd = MAX (1.0, Common->initmem_amd) ;
Common->initmem = MAX (1.0, Common->initmem) ;
Common->tol = MIN (Common->tol, 1.0) ;
Common->tol = MAX (0.0, Common->tol) ;
Common->memgrow = MAX (1.0, Common->memgrow) ;
/* ---------------------------------------------------------------------- */
/* allocate the Numeric object */
/* ---------------------------------------------------------------------- */
/* this will not cause size_t overflow (already checked by KLU_symbolic) */
n1 = ((size_t) n) + 1 ;
nzoff1 = ((size_t) nzoff) + 1 ;
Numeric = KLU_malloc (sizeof (KLU_numeric), 1, Common) ;
if (Common->status < KLU_OK)
{
/* out of memory */
Common->status = KLU_OUT_OF_MEMORY ;
return (NULL) ;
}
Numeric->n = n ;
Numeric->nblocks = nblocks ;
Numeric->nzoff = nzoff ;
Numeric->Pnum = KLU_malloc (n, sizeof (Int), Common) ;
Numeric->Offp = KLU_malloc (n1, sizeof (Int), Common) ;
Numeric->Offi = KLU_malloc (nzoff1, sizeof (Int), Common) ;
Numeric->Offx = KLU_malloc (nzoff1, sizeof (Entry), Common) ;
Numeric->Lip = KLU_malloc (n, sizeof (Int), Common) ;
Numeric->Uip = KLU_malloc (n, sizeof (Int), Common) ;
Numeric->Llen = KLU_malloc (n, sizeof (Int), Common) ;
Numeric->Ulen = KLU_malloc (n, sizeof (Int), Common) ;
Numeric->LUsize = KLU_malloc (nblocks, sizeof (size_t), Common) ;
Numeric->LUbx = KLU_malloc (nblocks, sizeof (Unit *), Common) ;
if (Numeric->LUbx != NULL)
{
for (k = 0 ; k < nblocks ; k++)
{
Numeric->LUbx [k] = NULL ;
}
}
Numeric->Udiag = KLU_malloc (n, sizeof (Entry), Common) ;
if (Common->scale > 0)
{
Numeric->Rs = KLU_malloc (n, sizeof (double), Common) ;
}
else
{
/* no scaling */
Numeric->Rs = NULL ;
}
Numeric->Pinv = KLU_malloc (n, sizeof (Int), Common) ;
/* allocate permanent workspace for factorization and solve. Note that the
* solver will use an Xwork of size 4n, whereas the factorization codes use
* an Xwork of size n and integer space (Iwork) of size 6*maxblock. KLU_condest
* uses an Xwork of size 2n. Total size is:
*
* n*sizeof(Entry) + max (6*maxblock*sizeof(Int), 3*n*sizeof(Entry))
*/
s = KLU_mult_size_t (n, sizeof (Entry), &ok) ;
n3 = KLU_mult_size_t (n, 3 * sizeof (Entry), &ok) ;
b6 = KLU_mult_size_t (maxblock, 6 * sizeof (Int), &ok) ;
Numeric->worksize = KLU_add_size_t (s, MAX (n3, b6), &ok) ;
Numeric->Work = KLU_malloc (Numeric->worksize, 1, Common) ;
Numeric->Xwork = Numeric->Work ;
Numeric->Iwork = (Int *) ((Entry *) Numeric->Xwork + n) ;
if (!ok || Common->status < KLU_OK)
{
/* out of memory or problem too large */
Common->status = ok ? KLU_OUT_OF_MEMORY : KLU_TOO_LARGE ;
KLU_free_numeric (&Numeric, Common) ;
return (NULL) ;
}
/* ---------------------------------------------------------------------- */
/* factorize the blocks */
/* ---------------------------------------------------------------------- */
factor2 (Ap, Ai, (Entry *) Ax, Symbolic, Numeric, Common) ;
/* ---------------------------------------------------------------------- */
/* return or free the Numeric object */
/* ---------------------------------------------------------------------- */
if (Common->status < KLU_OK)
{
/* out of memory or inputs invalid */
KLU_free_numeric (&Numeric, Common) ;
}
else if (Common->status == KLU_SINGULAR)
{
if (Common->halt_if_singular)
{
/* Matrix is singular, and the Numeric object is only partially
* defined because we halted early. This is the default case for
* a singular matrix. */
KLU_free_numeric (&Numeric, Common) ;
}
}
else if (Common->status == KLU_OK)
{
/* successful non-singular factorization */
Common->numerical_rank = n ;
Common->singular_col = n ;
}
return (Numeric) ;
}

71
src/maths/KLU/klu_free_numeric.c Normal file
View File

@ -0,0 +1,71 @@
/* ========================================================================== */
/* === KLU_free_numeric ===================================================== */
/* ========================================================================== */
/* Free the KLU Numeric object. */
#include "klu_internal.h"
Int KLU_free_numeric
(
KLU_numeric **NumericHandle,
KLU_common *Common
)
{
KLU_numeric *Numeric ;
Unit **LUbx ;
size_t *LUsize ;
Int block, n, nzoff, nblocks ;
if (Common == NULL)
{
return (FALSE) ;
}
if (NumericHandle == NULL || *NumericHandle == NULL)
{
return (TRUE) ;
}
Numeric = *NumericHandle ;
n = Numeric->n ;
nzoff = Numeric->nzoff ;
nblocks = Numeric->nblocks ;
LUsize = Numeric->LUsize ;
LUbx = (Unit **) Numeric->LUbx ;
if (LUbx != NULL)
{
for (block = 0 ; block < nblocks ; block++)
{
KLU_free (LUbx [block], LUsize ? LUsize [block] : 0,
sizeof (Unit), Common) ;
}
}
KLU_free (Numeric->Pnum, n, sizeof (Int), Common) ;
KLU_free (Numeric->Offp, n+1, sizeof (Int), Common) ;
KLU_free (Numeric->Offi, nzoff+1, sizeof (Int), Common) ;
KLU_free (Numeric->Offx, nzoff+1, sizeof (Entry), Common) ;
KLU_free (Numeric->Lip, n, sizeof (Int), Common) ;
KLU_free (Numeric->Llen, n, sizeof (Int), Common) ;
KLU_free (Numeric->Uip, n, sizeof (Int), Common) ;
KLU_free (Numeric->Ulen, n, sizeof (Int), Common) ;
KLU_free (Numeric->LUsize, nblocks, sizeof (size_t), Common) ;
KLU_free (Numeric->LUbx, nblocks, sizeof (Unit *), Common) ;
KLU_free (Numeric->Udiag, n, sizeof (Entry), Common) ;
KLU_free (Numeric->Rs, n, sizeof (double), Common) ;
KLU_free (Numeric->Pinv, n, sizeof (Int), Common) ;
KLU_free (Numeric->Work, Numeric->worksize, 1, Common) ;
KLU_free (Numeric, 1, sizeof (KLU_numeric), Common) ;
*NumericHandle = NULL ;
return (TRUE) ;
}

34
src/maths/KLU/klu_free_symbolic.c Normal file
View File

@ -0,0 +1,34 @@
/* ========================================================================== */
/* === KLU_free_symbolic ==================================================== */
/* ========================================================================== */
/* Free the KLU Symbolic object. */
#include "klu_internal.h"
Int KLU_free_symbolic
(
KLU_symbolic **SymbolicHandle,
KLU_common *Common
)
{
KLU_symbolic *Symbolic ;
Int n ;
if (Common == NULL)
{
return (FALSE) ;
}
if (SymbolicHandle == NULL || *SymbolicHandle == NULL)
{
return (TRUE) ;
}
Symbolic = *SymbolicHandle ;
n = Symbolic->n ;
KLU_free (Symbolic->P, n, sizeof (Int), Common) ;
KLU_free (Symbolic->Q, n, sizeof (Int), Common) ;
KLU_free (Symbolic->R, n+1, sizeof (Int), Common) ;
KLU_free (Symbolic->Lnz, n, sizeof (double), Common) ;
KLU_free (Symbolic, 1, sizeof (KLU_symbolic), Common) ;
*SymbolicHandle = NULL ;
return (TRUE) ;
}

243
src/maths/KLU/klu_internal.h Normal file
View File

@ -0,0 +1,243 @@
/* ========================================================================== */
/* === KLU/Include/klu_internal.h =========================================== */
/* ========================================================================== */
/* For internal use in KLU routines only, not for user programs */
#ifndef _KLU_INTERNAL_H
#define _KLU_INTERNAL_H
#include "klu.h"
#include "btf.h"
#include "klu_version.h"
/* ========================================================================== */
/* make sure debugging and printing is turned off */
#ifndef NDEBUG
#define NDEBUG
#endif
#ifndef NPRINT
#define NPRINT
#endif
/* To enable debugging and assertions, uncomment this line:
#undef NDEBUG
*/
/* To enable diagnostic printing, uncomment this line:
#undef NPRINT
*/
/* ========================================================================== */
#include <stdio.h>
#include <assert.h>
#include <limits.h>
#include <stdlib.h>
#include <math.h>
#undef ASSERT
#ifndef NDEBUG
#define ASSERT(a) assert(a)
#else
#define ASSERT(a)
#endif
#define SCALAR_IS_NAN(x) ((x) != (x))
/* true if an integer (stored in double x) would overflow (or if x is NaN) */
#define INT_OVERFLOW(x) ((!((x) * (1.0+1e-8) <= (double) INT_MAX)) \
|| SCALAR_IS_NAN (x))
#undef TRUE
#undef FALSE
#undef MAX
#undef MIN
#undef PRINTF
#undef FLIP
#ifndef NPRINT
#define PRINTF(s) { printf s ; } ;
#else
#define PRINTF(s)
#endif
#define TRUE 1
#define FALSE 0
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
/* FLIP is a "negation about -1", and is used to mark an integer i that is
* normally non-negative. FLIP (EMPTY) is EMPTY. FLIP of a number > EMPTY
* is negative, and FLIP of a number < EMPTY is positive. FLIP (FLIP (i)) = i
* for all integers i. UNFLIP (i) is >= EMPTY. */
#define EMPTY (-1)
#define FLIP(i) (-(i)-2)
#define UNFLIP(i) (((i) < EMPTY) ? FLIP (i) : (i))
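/* Editor's note -- a worked example of the identities above (not part of the
 * original source).  With EMPTY = -1 and FLIP(i) = -(i)-2:
 *
 *      FLIP (EMPTY) = -(-1)-2 = -1 = EMPTY
 *      FLIP (3)     = -5,   FLIP (FLIP (3)) = -(-5)-2 = 3
 *      UNFLIP (-5)  = 3,    UNFLIP (3) = 3,   UNFLIP (EMPTY) = EMPTY
 *
 * so a non-negative index can be marked by flipping it and recovered later
 * with UNFLIP, while EMPTY is left unchanged. */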
size_t KLU_kernel /* final size of LU on output */
(
/* input, not modified */
Int n, /* A is n-by-n */
Int Ap [ ], /* size n+1, column pointers for A */
Int Ai [ ], /* size nz = Ap [n], row indices for A */
Entry Ax [ ], /* size nz, values of A */
Int Q [ ], /* size n, optional input permutation */
size_t lusize, /* initial size of LU */
/* output, not defined on input */
Int Pinv [ ], /* size n */
Int P [ ], /* size n */
Unit **p_LU, /* size lusize on input, size Uxp[n] on output*/
Entry Udiag [ ], /* size n, diagonal of U */
Int Llen [ ], /* size n, column length of L */
Int Ulen [ ], /* size n, column length of U */
Int Lip [ ], /* size n+1 */
Int Uip [ ], /* size n+1 */
Int *lnz, /* size of L */
Int *unz, /* size of U */
/* workspace, not defined on input */
Entry X [ ], /* size n, zero on output */
/* workspace, not defined on input or output */
Int Stack [ ], /* size n */
Int Flag [ ], /* size n */
Int adj_pos [ ], /* size n */
/* workspace for pruning only */
Int Lpend [ ], /* size n workspace */
/* inputs, not modified on output */
Int k1, /* the block of A is from k1 to k2-1 */
Int PSinv [ ], /* inverse of P from symbolic factorization */
double Rs [ ], /* scale factors for A */
/* inputs, modified on output */
Int Offp [ ], /* off-diagonal matrix (modified by this routine) */
Int Offi [ ],
Entry Offx [ ],
KLU_common *Common /* the control input/output structure */
) ;
size_t KLU_kernel_factor /* 0 if failure, size of LU if OK */
(
/* inputs, not modified */
Int n, /* A is n-by-n. n must be > 0. */
Int Ap [ ], /* size n+1, column pointers for A */
Int Ai [ ], /* size nz = Ap [n], row indices for A */
Entry Ax [ ], /* size nz, values of A */
Int Q [ ], /* size n, optional column permutation */
double Lsize, /* initial size of L and U */
/* outputs, not defined on input */
Unit **p_LU, /* row indices and values of L and U */
Entry Udiag [ ], /* size n, diagonal of U */
Int Llen [ ], /* size n, column length of L */
Int Ulen [ ], /* size n, column length of U */
Int Lip [ ], /* size n+1, column pointers of L */
Int Uip [ ], /* size n+1, column pointers of U */
Int P [ ], /* row permutation, size n */
Int *lnz, /* size of L */
Int *unz, /* size of U */
/* workspace, undefined on input */
Entry *X, /* size n entries. Zero on output */
Int *Work, /* size 5n Int's */
/* inputs, not modified on output */
Int k1, /* the block of A is from k1 to k2-1 */
Int PSinv [ ], /* inverse of P from symbolic factorization */
double Rs [ ], /* scale factors for A */
/* inputs, modified on output */
Int Offp [ ], /* off-diagonal matrix (modified by this routine) */
Int Offi [ ],
Entry Offx [ ],
KLU_common *Common /* the control input/output structure */
) ;
void KLU_lsolve
(
/* inputs, not modified: */
Int n,
Int Lp [ ],
Int Li [ ],
Unit LU [ ],
Int nrhs,
/* right-hand-side on input, solution to Lx=b on output */
Entry X [ ]
) ;
void KLU_ltsolve
(
/* inputs, not modified: */
Int n,
Int Lp [ ],
Int Li [ ],
Unit LU [ ],
Int nrhs,
#ifdef COMPLEX
Int conj_solve,
#endif
/* right-hand-side on input, solution to L'x=b on output */
Entry X [ ]
) ;
void KLU_usolve
(
/* inputs, not modified: */
Int n,
Int Up [ ],
Int Ui [ ],
Unit LU [ ],
Entry Udiag [ ],
Int nrhs,
/* right-hand-side on input, solution to Ux=b on output */
Entry X [ ]
) ;
void KLU_utsolve
(
/* inputs, not modified: */
Int n,
Int Up [ ],
Int Ui [ ],
Unit LU [ ],
Entry Udiag [ ],
Int nrhs,
#ifdef COMPLEX
Int conj_solve,
#endif
/* right-hand-side on input, solution to U'x=b on output */
Entry X [ ]
) ;
Int KLU_valid
(
Int n,
Int Ap [ ],
Int Ai [ ],
Entry Ax [ ]
) ;
Int KLU_valid_LU
(
Int n,
Int flag_test_start_ptr,
Int Xip [ ],
Int Xlen [ ],
Unit LU [ ]
);
size_t KLU_add_size_t (size_t a, size_t b, Int *ok) ;
size_t KLU_mult_size_t (size_t a, size_t k, Int *ok) ;
KLU_symbolic *KLU_alloc_symbolic (Int n, Int *Ap, Int *Ai, KLU_common *Common) ;
#endif

1009
src/maths/KLU/klu_kernel.c Normal file

File diff suppressed because it is too large

225
src/maths/KLU/klu_memory.c Normal file
View File

@ -0,0 +1,225 @@
/* ========================================================================== */
/* === KLU_memory =========================================================== */
/* ========================================================================== */
/* KLU memory management routines:
*
* KLU_malloc malloc wrapper
* KLU_free free wrapper
* KLU_realloc realloc wrapper
*/
#include "klu_internal.h"
/* ========================================================================== */
/* === KLU_add_size_t ======================================================= */
/* ========================================================================== */
/* Safely compute a+b, and check for size_t overflow */
size_t KLU_add_size_t (size_t a, size_t b, Int *ok)
{
(*ok) = (*ok) && ((a + b) >= MAX (a,b)) ;
return ((*ok) ? (a + b) : ((size_t) -1)) ;
}
/* ========================================================================== */
/* === KLU_mult_size_t ====================================================== */
/* ========================================================================== */
/* Safely compute a*k, where k should be small, and check for size_t overflow */
size_t KLU_mult_size_t (size_t a, size_t k, Int *ok)
{
size_t i, s = 0 ;
for (i = 0 ; i < k ; i++)
{
s = KLU_add_size_t (s, a, ok) ;
}
return ((*ok) ? s : ((size_t) -1)) ;
}
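/* Editor's note -- an illustrative sketch, not part of the original source,
 * showing how the two overflow-checked helpers above are meant to be chained
 * (mirroring the worksize computation in KLU_factor): ok starts as TRUE and is
 * cleared by the first operation that would overflow, so only the final value
 * of ok needs to be tested. */
static size_t example_worksize (size_t n, size_t maxblock, Int *ok)
{
    size_t s, n3, b6 ;
    *ok = TRUE ;
    s  = KLU_mult_size_t (n, sizeof (Entry), ok) ;           /* n * sizeof (Entry) */
    n3 = KLU_mult_size_t (n, 3 * sizeof (Entry), ok) ;       /* 3*n * sizeof (Entry) */
    b6 = KLU_mult_size_t (maxblock, 6 * sizeof (Int), ok) ;  /* 6*maxblock * sizeof (Int) */
    return (KLU_add_size_t (s, MAX (n3, b6), ok)) ;          /* total, or (size_t) -1 */
}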
/* ========================================================================== */
/* === KLU_malloc =========================================================== */
/* ========================================================================== */
/* Wrapper around malloc routine (mxMalloc for a mexFunction). Allocates
* space of size MAX(1,n)*size, where size is normally a sizeof (...).
*
* This routine and KLU_realloc do not set Common->status to KLU_OK on success,
* so that a sequence of KLU_malloc's or KLU_realloc's can be used. If any of
* them fails, the Common->status will hold the most recent error status.
*
* Usage, for a pointer to Int:
*
* p = KLU_malloc (n, sizeof (Int), Common)
*
* Uses a pointer to the malloc routine (or its equivalent) defined in Common.
*/
void *KLU_malloc /* returns pointer to the newly malloc'd block */
(
/* ---- input ---- */
size_t n, /* number of items */
size_t size, /* size of each item */
/* --------------- */
KLU_common *Common
)
{
void *p ;
size_t s ;
Int ok = TRUE ;
if (Common == NULL)
{
p = NULL ;
}
else if (size == 0)
{
/* size must be > 0 */
Common->status = KLU_INVALID ;
p = NULL ;
}
else if (n >= INT_MAX)
{
/* the object is too big to allocate: an index i of type Int in p [i]
* could not address the whole block */
Common->status = KLU_TOO_LARGE ;
p = NULL ;
}
else
{
/* call malloc, or its equivalent */
s = KLU_mult_size_t (MAX (1,n), size, &ok) ;
p = ok ? ((Common->malloc_memory) (s)) : NULL ;
if (p == NULL)
{
/* failure: out of memory */
Common->status = KLU_OUT_OF_MEMORY ;
}
else
{
Common->memusage += s ;
Common->mempeak = MAX (Common->mempeak, Common->memusage) ;
}
}
return (p) ;
}
/* ========================================================================== */
/* === KLU_free ============================================================= */
/* ========================================================================== */
/* Wrapper around free routine (mxFree for a mexFunction). Returns NULL,
* which can be assigned to the pointer being freed, as in:
*
* p = KLU_free (p, n, sizeof (int), Common) ;
*/
void *KLU_free /* always returns NULL */
(
/* ---- in/out --- */
void *p, /* block of memory to free */
/* ---- input --- */
size_t n, /* size of block to free, in # of items */
size_t size, /* size of each item */
/* --------------- */
KLU_common *Common
)
{
size_t s ;
Int ok = TRUE ;
if (p != NULL && Common != NULL)
{
/* only free the object if the pointer is not NULL */
/* call free, or its equivalent */
(Common->free_memory) (p) ;
s = KLU_mult_size_t (MAX (1,n), size, &ok) ;
Common->memusage -= s ;
}
/* return NULL, and the caller should assign this to p. This avoids
* freeing the same pointer twice. */
return (NULL) ;
}
/* ========================================================================== */
/* === KLU_realloc ========================================================== */
/* ========================================================================== */
/* Wrapper around realloc routine (mxRealloc for a mexFunction). Given a
* pointer p to a block allocated by KLU_malloc, it changes the size of the
* block pointed to by p to be MAX(1,nnew)*size in size. It may return a
* pointer different than p. This should be used as (for a pointer to Int):
*
* p = KLU_realloc (nnew, nold, sizeof (Int), p, Common) ;
*
* If p is NULL, this is the same as p = KLU_malloc (...).
* A size of nnew=0 is treated as nnew=1.
*
* If the realloc fails, p is returned unchanged and Common->status is set
* to KLU_OUT_OF_MEMORY. If successful, Common->status is not modified,
* and the returned pointer (possibly different from the input p) points to the
* reallocated block of memory.
*
* Uses a pointer to the realloc routine (or its equivalent) defined in Common.
*/
void *KLU_realloc /* returns pointer to reallocated block */
(
/* ---- input ---- */
size_t nnew, /* requested # of items in reallocated block */
size_t nold, /* old # of items */
size_t size, /* size of each item */
/* ---- in/out --- */
void *p, /* block of memory to realloc */
/* --------------- */
KLU_common *Common
)
{
void *pnew ;
size_t snew, sold ;
Int ok = TRUE ;
if (Common == NULL)
{
p = NULL ;
}
else if (size == 0)
{
/* size must be > 0 */
Common->status = KLU_INVALID ;
p = NULL ;
}
else if (p == NULL)
{
/* A fresh object is being allocated. */
p = KLU_malloc (nnew, size, Common) ;
}
else if (nnew >= INT_MAX)
{
/* failure: nnew is too big. Do not change p */
Common->status = KLU_TOO_LARGE ;
}
else
{
/* The object exists, and is changing to some other nonzero size. */
/* call realloc, or its equivalent */
snew = KLU_mult_size_t (MAX (1,nnew), size, &ok) ;
sold = KLU_mult_size_t (MAX (1,nold), size, &ok) ;
pnew = ok ? ((Common->realloc_memory) (p, snew)) : NULL ;
if (pnew == NULL)
{
/* Do not change p, since it still points to allocated memory */
Common->status = KLU_OUT_OF_MEMORY ;
}
else
{
/* success: return the new p and change the size of the block */
Common->memusage += (snew - sold) ;
Common->mempeak = MAX (Common->mempeak, Common->memusage) ;
p = pnew ;
}
}
return (p) ;
}
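/* Editor's note -- an illustrative sketch, not part of the original source,
 * of the growth pattern documented above KLU_realloc: on failure the old
 * block is left intact, so the caller can keep using it after checking
 * Common->status. */
static Int *example_grow_int_array
(
    Int *p,             /* block previously allocated with KLU_malloc */
    size_t nold,        /* current number of items */
    size_t nnew,        /* requested number of items */
    KLU_common *Common
)
{
    Int *pnew = KLU_realloc (nnew, nold, sizeof (Int), p, Common) ;
    if (Common->status < KLU_OK)
    {
        /* reallocation failed; pnew == p still points to the old nold items */
        return (p) ;
    }
    return (pnew) ;     /* a block of MAX (1,nnew) items */
}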

478
src/maths/KLU/klu_refactor.c Normal file
View File

@ -0,0 +1,478 @@
/* ========================================================================== */
/* === KLU_refactor ========================================================= */
/* ========================================================================== */
/* Factor the matrix, after ordering and analyzing it with KLU_analyze, and
* factoring it once with KLU_factor. This routine cannot do any numerical
* pivoting. The pattern of the input matrix (Ap, Ai) must be identical to
* the pattern given to KLU_factor.
*/
#include "klu_internal.h"
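/* Editor's note -- an illustrative sketch, not part of the original source, of
 * the intended use, with the public klu_* names from klu.h: factor once, then
 * refactorize whenever the numerical values change but the nonzero pattern
 * does not (the common case in circuit simulation).  Common is assumed to have
 * been initialized with klu_defaults; error checks are abbreviated. */
static int example_refactor             /* returns TRUE if successful */
(
    int n, int Ap [ ], int Ai [ ],
    double Ax1 [ ],     /* first set of values */
    double Ax2 [ ],     /* second set of values, same pattern as Ax1 */
    klu_common *Common
)
{
    klu_symbolic *Symbolic = klu_analyze (n, Ap, Ai, Common) ;
    klu_numeric *Numeric = klu_factor (Ap, Ai, Ax1, Symbolic, Common) ; /* full factorization */
    int ok = klu_refactor (Ap, Ai, Ax2, Symbolic, Numeric, Common) ;    /* reuse the pivot order */
    klu_free_numeric (&Numeric, Common) ;
    klu_free_symbolic (&Symbolic, Common) ;
    return (ok) ;
}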
/* ========================================================================== */
/* === KLU_refactor ========================================================= */
/* ========================================================================== */
Int KLU_refactor /* returns TRUE if successful, FALSE otherwise */
(
/* inputs, not modified */
Int Ap [ ], /* size n+1, column pointers */
Int Ai [ ], /* size nz, row indices */
double Ax [ ],
KLU_symbolic *Symbolic,
/* input/output */
KLU_numeric *Numeric,
KLU_common *Common
)
{
Entry ukk, ujk, s ;
Entry *Offx, *Lx, *Ux, *X, *Az, *Udiag ;
double *Rs ;
Int *P, *Q, *R, *Pnum, *Offp, *Offi, *Ui, *Li, *Pinv, *Lip, *Uip, *Llen,
*Ulen ;
Unit **LUbx ;
Unit *LU ;
Int k1, k2, nk, k, block, oldcol, pend, oldrow, n, p, newrow, scale,
nblocks, poff, i, j, up, ulen, llen, maxblock, nzoff ;
/* ---------------------------------------------------------------------- */
/* check inputs */
/* ---------------------------------------------------------------------- */
if (Common == NULL)
{
return (FALSE) ;
}
Common->status = KLU_OK ;
if (Numeric == NULL)
{
/* invalid Numeric object */
Common->status = KLU_INVALID ;
return (FALSE) ;
}
Common->numerical_rank = EMPTY ;
Common->singular_col = EMPTY ;
Az = (Entry *) Ax ;
/* ---------------------------------------------------------------------- */
/* get the contents of the Symbolic object */
/* ---------------------------------------------------------------------- */
n = Symbolic->n ;
P = Symbolic->P ;
Q = Symbolic->Q ;
R = Symbolic->R ;
nblocks = Symbolic->nblocks ;
maxblock = Symbolic->maxblock ;
/* ---------------------------------------------------------------------- */
/* get the contents of the Numeric object */
/* ---------------------------------------------------------------------- */
Pnum = Numeric->Pnum ;
Offp = Numeric->Offp ;
Offi = Numeric->Offi ;
Offx = (Entry *) Numeric->Offx ;
LUbx = (Unit **) Numeric->LUbx ;
scale = Common->scale ;
if (scale > 0)
{
/* factorization was not scaled, but refactorization is scaled */
if (Numeric->Rs == NULL)
{
Numeric->Rs = KLU_malloc (n, sizeof (double), Common) ;
if (Common->status < KLU_OK)
{
Common->status = KLU_OUT_OF_MEMORY ;
return (FALSE) ;
}
}
}
else
{
/* no scaling for refactorization; ensure Numeric->Rs is freed. This
* does nothing if Numeric->Rs is already NULL. */
Numeric->Rs = KLU_free (Numeric->Rs, n, sizeof (double), Common) ;
}
Rs = Numeric->Rs ;
Pinv = Numeric->Pinv ;
X = (Entry *) Numeric->Xwork ;
Common->nrealloc = 0 ;
Udiag = Numeric->Udiag ;
nzoff = Symbolic->nzoff ;
/* ---------------------------------------------------------------------- */
/* check the input matrix and compute the row scale factors, Rs */
/* ---------------------------------------------------------------------- */
/* do not scale or check the input matrix if scale < 0 */
if (scale >= 0)
{
/* check for out-of-range indices, but do not check for duplicates */
if (!KLU_scale (scale, n, Ap, Ai, Ax, Rs, NULL, Common))
{
return (FALSE) ;
}
}
/* ---------------------------------------------------------------------- */
/* clear workspace X */
/* ---------------------------------------------------------------------- */
for (k = 0 ; k < maxblock ; k++)
{
/* X [k] = 0 */
CLEAR (X [k]) ;
}
poff = 0 ;
/* ---------------------------------------------------------------------- */
/* factor each block */
/* ---------------------------------------------------------------------- */
if (scale <= 0)
{
/* ------------------------------------------------------------------ */
/* no scaling */
/* ------------------------------------------------------------------ */
for (block = 0 ; block < nblocks ; block++)
{
/* -------------------------------------------------------------- */
/* the block is from rows/columns k1 to k2-1 */
/* -------------------------------------------------------------- */
k1 = R [block] ;
k2 = R [block+1] ;
nk = k2 - k1 ;
if (nk == 1)
{
/* ---------------------------------------------------------- */
/* singleton case */
/* ---------------------------------------------------------- */
oldcol = Q [k1] ;
pend = Ap [oldcol+1] ;
CLEAR (s) ;
for (p = Ap [oldcol] ; p < pend ; p++)
{
newrow = Pinv [Ai [p]] - k1 ;
if (newrow < 0 && poff < nzoff)
{
/* entry in off-diagonal block */
Offx [poff] = Az [p] ;
poff++ ;
}
else
{
/* singleton */
s = Az [p] ;
}
}
Udiag [k1] = s ;
}
else
{
/* ---------------------------------------------------------- */
/* construct and factor the kth block */
/* ---------------------------------------------------------- */
Lip = Numeric->Lip + k1 ;
Llen = Numeric->Llen + k1 ;
Uip = Numeric->Uip + k1 ;
Ulen = Numeric->Ulen + k1 ;
LU = LUbx [block] ;
for (k = 0 ; k < nk ; k++)
{
/* ------------------------------------------------------ */
/* scatter kth column of the block into workspace X */
/* ------------------------------------------------------ */
oldcol = Q [k+k1] ;
pend = Ap [oldcol+1] ;
for (p = Ap [oldcol] ; p < pend ; p++)
{
newrow = Pinv [Ai [p]] - k1 ;
if (newrow < 0 && poff < nzoff)
{
/* entry in off-diagonal block */
Offx [poff] = Az [p] ;
poff++ ;
}
else
{
/* (newrow,k) is an entry in the block */
X [newrow] = Az [p] ;
}
}
/* ------------------------------------------------------ */
/* compute kth column of U, and update kth column of A */
/* ------------------------------------------------------ */
GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, ulen) ;
for (up = 0 ; up < ulen ; up++)
{
j = Ui [up] ;
ujk = X [j] ;
/* X [j] = 0 */
CLEAR (X [j]) ;
Ux [up] = ujk ;
GET_POINTER (LU, Lip, Llen, Li, Lx, j, llen) ;
for (p = 0 ; p < llen ; p++)
{
/* X [Li [p]] -= Lx [p] * ujk */
MULT_SUB (X [Li [p]], Lx [p], ujk) ;
}
}
/* get the diagonal entry of U */
ukk = X [k] ;
/* X [k] = 0 */
CLEAR (X [k]) ;
/* singular case */
if (IS_ZERO (ukk))
{
/* matrix is numerically singular */
Common->status = KLU_SINGULAR ;
if (Common->numerical_rank == EMPTY)
{
Common->numerical_rank = k+k1 ;
Common->singular_col = Q [k+k1] ;
}
if (Common->halt_if_singular)
{
/* do not continue the factorization */
return (FALSE) ;
}
}
Udiag [k+k1] = ukk ;
/* gather and divide by pivot to get kth column of L */
GET_POINTER (LU, Lip, Llen, Li, Lx, k, llen) ;
for (p = 0 ; p < llen ; p++)
{
i = Li [p] ;
DIV (Lx [p], X [i], ukk) ;
CLEAR (X [i]) ;
}
}
}
}
}
else
{
/* ------------------------------------------------------------------ */
/* scaling */
/* ------------------------------------------------------------------ */
for (block = 0 ; block < nblocks ; block++)
{
/* -------------------------------------------------------------- */
/* the block is from rows/columns k1 to k2-1 */
/* -------------------------------------------------------------- */
k1 = R [block] ;
k2 = R [block+1] ;
nk = k2 - k1 ;
if (nk == 1)
{
/* ---------------------------------------------------------- */
/* singleton case */
/* ---------------------------------------------------------- */
oldcol = Q [k1] ;
pend = Ap [oldcol+1] ;
CLEAR (s) ;
for (p = Ap [oldcol] ; p < pend ; p++)
{
oldrow = Ai [p] ;
newrow = Pinv [oldrow] - k1 ;
if (newrow < 0 && poff < nzoff)
{
/* entry in off-diagonal block */
/* Offx [poff] = Az [p] / Rs [oldrow] */
SCALE_DIV_ASSIGN (Offx [poff], Az [p], Rs [oldrow]) ;
poff++ ;
}
else
{
/* singleton */
/* s = Az [p] / Rs [oldrow] */
SCALE_DIV_ASSIGN (s, Az [p], Rs [oldrow]) ;
}
}
Udiag [k1] = s ;
}
else
{
/* ---------------------------------------------------------- */
/* construct and factor the kth block */
/* ---------------------------------------------------------- */
Lip = Numeric->Lip + k1 ;
Llen = Numeric->Llen + k1 ;
Uip = Numeric->Uip + k1 ;
Ulen = Numeric->Ulen + k1 ;
LU = LUbx [block] ;
for (k = 0 ; k < nk ; k++)
{
/* ------------------------------------------------------ */
/* scatter kth column of the block into workspace X */
/* ------------------------------------------------------ */
oldcol = Q [k+k1] ;
pend = Ap [oldcol+1] ;
for (p = Ap [oldcol] ; p < pend ; p++)
{
oldrow = Ai [p] ;
newrow = Pinv [oldrow] - k1 ;
if (newrow < 0 && poff < nzoff)
{
/* entry in off-diagonal part */
/* Offx [poff] = Az [p] / Rs [oldrow] */
SCALE_DIV_ASSIGN (Offx [poff], Az [p], Rs [oldrow]);
poff++ ;
}
else
{
/* (newrow,k) is an entry in the block */
/* X [newrow] = Az [p] / Rs [oldrow] */
SCALE_DIV_ASSIGN (X [newrow], Az [p], Rs [oldrow]) ;
}
}
/* ------------------------------------------------------ */
/* compute kth column of U, and update kth column of A */
/* ------------------------------------------------------ */
GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, ulen) ;
for (up = 0 ; up < ulen ; up++)
{
j = Ui [up] ;
ujk = X [j] ;
/* X [j] = 0 */
CLEAR (X [j]) ;
Ux [up] = ujk ;
GET_POINTER (LU, Lip, Llen, Li, Lx, j, llen) ;
for (p = 0 ; p < llen ; p++)
{
/* X [Li [p]] -= Lx [p] * ujk */
MULT_SUB (X [Li [p]], Lx [p], ujk) ;
}
}
/* get the diagonal entry of U */
ukk = X [k] ;
/* X [k] = 0 */
CLEAR (X [k]) ;
/* singular case */
if (IS_ZERO (ukk))
{
/* matrix is numerically singular */
Common->status = KLU_SINGULAR ;
if (Common->numerical_rank == EMPTY)
{
Common->numerical_rank = k+k1 ;
Common->singular_col = Q [k+k1] ;
}
if (Common->halt_if_singular)
{
/* do not continue the factorization */
return (FALSE) ;
}
}
Udiag [k+k1] = ukk ;
/* gather and divide by pivot to get kth column of L */
GET_POINTER (LU, Lip, Llen, Li, Lx, k, llen) ;
for (p = 0 ; p < llen ; p++)
{
i = Li [p] ;
DIV (Lx [p], X [i], ukk) ;
CLEAR (X [i]) ;
}
}
}
}
}
/* ---------------------------------------------------------------------- */
/* permute scale factors Rs according to pivotal row order */
/* ---------------------------------------------------------------------- */
if (scale > 0)
{
for (k = 0 ; k < n ; k++)
{
REAL (X [k]) = Rs [Pnum [k]] ;
}
for (k = 0 ; k < n ; k++)
{
Rs [k] = REAL (X [k]) ;
}
}
#ifndef NDEBUG
ASSERT (Offp [n] == poff) ;
ASSERT (Symbolic->nzoff == poff) ;
PRINTF (("\n------------------- Off diagonal entries, new:\n")) ;
ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
if (Common->status == KLU_OK)
{
PRINTF (("\n ########### KLU_BTF_REFACTOR done, nblocks %d\n",nblocks));
for (block = 0 ; block < nblocks ; block++)
{
k1 = R [block] ;
k2 = R [block+1] ;
nk = k2 - k1 ;
PRINTF ((
"\n================KLU_refactor output: k1 %d k2 %d nk %d\n",
k1, k2, nk)) ;
if (nk == 1)
{
PRINTF (("singleton ")) ;
PRINT_ENTRY (Udiag [k1]) ;
}
else
{
Lip = Numeric->Lip + k1 ;
Llen = Numeric->Llen + k1 ;
LU = (Unit *) Numeric->LUbx [block] ;
PRINTF (("\n---- L block %d\n", block)) ;
ASSERT (KLU_valid_LU (nk, TRUE, Lip, Llen, LU)) ;
Uip = Numeric->Uip + k1 ;
Ulen = Numeric->Ulen + k1 ;
PRINTF (("\n---- U block %d\n", block)) ;
ASSERT (KLU_valid_LU (nk, FALSE, Uip, Ulen, LU)) ;
}
}
}
#endif
return (TRUE) ;
}

159
src/maths/KLU/klu_scale.c Normal file
View File

@ -0,0 +1,159 @@
/* ========================================================================== */
/* === KLU_scale ============================================================ */
/* ========================================================================== */
/* Scale a matrix and check to see if it is valid. Can be called by the user.
* This is called by KLU_factor and KLU_refactor. Returns TRUE if the input
* matrix is valid, FALSE otherwise. If the W input argument is non-NULL,
* then the input matrix is checked for duplicate entries.
*
* scaling methods:
* <0: no scaling, do not compute Rs, and do not check input matrix.
* 0: no scaling
* 1: the scale factor for row i is sum (abs (A (i,:)))
* 2 or more: the scale factor for row i is max (abs (A (i,:)))
*/
#include "klu_internal.h"
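/* Editor's note -- a small worked example of the scaling modes above (not part
 * of the original source).  For the 2-by-2 matrix
 *
 *          A = [ 1  -4 ]
 *              [ 2   3 ]
 *
 * scale = 1 gives Rs = { |1|+|-4|, |2|+|3| } = { 5, 5 }, and
 * scale = 2 gives Rs = { max(|1|,|-4|), max(|2|,|3|) } = { 4, 3 }.
 * During factorization each entry of row i is divided by Rs [i]. */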
Int KLU_scale /* return TRUE if successful, FALSE otherwise */
(
/* inputs, not modified */
Int scale, /* 0: none, 1: sum, 2: max */
Int n,
Int Ap [ ], /* size n+1, column pointers */
Int Ai [ ], /* size nz, row indices */
double Ax [ ],
/* outputs, not defined on input */
double Rs [ ], /* size n, can be NULL if scale <= 0 */
/* workspace, not defined on input or output */
Int W [ ], /* size n, can be NULL */
/* --------------- */
KLU_common *Common
)
{
double a ;
Entry *Az ;
Int row, col, p, pend, check_duplicates ;
/* ---------------------------------------------------------------------- */
/* check inputs */
/* ---------------------------------------------------------------------- */
if (Common == NULL)
{
return (FALSE) ;
}
Common->status = KLU_OK ;
if (scale < 0)
{
/* return without checking anything and without computing the
* scale factors */
return (TRUE) ;
}
Az = (Entry *) Ax ;
if (n <= 0 || Ap == NULL || Ai == NULL || Az == NULL ||
(scale > 0 && Rs == NULL))
{
/* Ap, Ai, Ax and Rs must be present, and n must be > 0 */
Common->status = KLU_INVALID ;
return (FALSE) ;
}
if (Ap [0] != 0 || Ap [n] < 0)
{
/* nz = Ap [n] must be >= 0 and Ap [0] must equal zero */
Common->status = KLU_INVALID ;
return (FALSE) ;
}
for (col = 0 ; col < n ; col++)
{
if (Ap [col] > Ap [col+1])
{
/* column pointers must be non-decreasing */
Common->status = KLU_INVALID ;
return (FALSE) ;
}
}
/* ---------------------------------------------------------------------- */
/* scale */
/* ---------------------------------------------------------------------- */
if (scale > 0)
{
/* initialize row sum or row max */
for (row = 0 ; row < n ; row++)
{
Rs [row] = 0 ;
}
}
/* check for duplicates only if W is present */
check_duplicates = (W != (Int *) NULL) ;
if (check_duplicates)
{
for (row = 0 ; row < n ; row++)
{
W [row] = EMPTY ;
}
}
for (col = 0 ; col < n ; col++)
{
pend = Ap [col+1] ;
for (p = Ap [col] ; p < pend ; p++)
{
row = Ai [p] ;
if (row < 0 || row >= n)
{
/* row index out of range */
Common->status = KLU_INVALID ;
return (FALSE) ;
}
if (check_duplicates)
{
if (W [row] == col)
{
/* duplicate entry */
Common->status = KLU_INVALID ;
return (FALSE) ;
}
/* flag row i as appearing in column col */
W [row] = col ;
}
/* a = ABS (Az [p]) ;*/
ABS (a, Az [p]) ;
if (scale == 1)
{
/* accumulate the abs. row sum */
Rs [row] += a ;
}
else if (scale > 1)
{
/* find the max abs. value in the row */
Rs [row] = MAX (Rs [row], a) ;
}
}
}
if (scale > 0)
{
/* do not scale empty rows: an all-zero row (which would make the matrix
* singular) gets a scale factor of 1 so that it is left unscaled */
for (row = 0 ; row < n ; row++)
{
PRINTF (("Rs [%d] = %g\n", row, Rs [row])) ;
if (Rs [row] == 0.0)
{
PRINTF (("Row %d of A is all zero\n", row)) ;
Rs [row] = 1.0 ;
}
}
}
return (TRUE) ;
}

396
src/maths/KLU/klu_solve.c Normal file
View File

@ -0,0 +1,396 @@
/* ========================================================================== */
/* === KLU_solve ============================================================ */
/* ========================================================================== */
/* Solve Ax=b using the symbolic and numeric objects from KLU_analyze
* (or KLU_analyze_given) and KLU_factor. Note that no iterative refinement is
* performed. Uses Numeric->Xwork as workspace (undefined on input and output),
* of size 4n Entry's (note that columns 2 to 4 of Xwork overlap with
* Numeric->Iwork).
*/
#include "klu_internal.h"
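/* Editor's note -- an illustrative layout example, not part of the original
 * source: B is column-oriented with leading dimension d >= n, so entry B(i,j)
 * of the right-hand side is stored at B [i + j*d].  For example, with n = 3,
 * d = 5 and nrhs = 2, column 0 occupies B [0..2], column 1 occupies B [5..7],
 * and the padding entries B [3] and B [4] are never touched. */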
Int KLU_solve
(
/* inputs, not modified */
KLU_symbolic *Symbolic,
KLU_numeric *Numeric,
Int d, /* leading dimension of B */
Int nrhs, /* number of right-hand-sides */
/* right-hand-side on input, overwritten with solution to Ax=b on output */
double B [ ], /* size n*nrhs, in column-oriented form, with
* leading dimension d. */
/* --------------- */
KLU_common *Common
)
{
Entry x [4], offik, s ;
double rs, *Rs ;
Entry *Offx, *X, *Bz, *Udiag ;
Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
Unit **LUbx ;
Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
/* ---------------------------------------------------------------------- */
/* check inputs */
/* ---------------------------------------------------------------------- */
if (Common == NULL)
{
return (FALSE) ;
}
if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
B == NULL)
{
Common->status = KLU_INVALID ;
return (FALSE) ;
}
Common->status = KLU_OK ;
/* ---------------------------------------------------------------------- */
/* get the contents of the Symbolic object */
/* ---------------------------------------------------------------------- */
Bz = (Entry *) B ;
n = Symbolic->n ;
nblocks = Symbolic->nblocks ;
Q = Symbolic->Q ;
R = Symbolic->R ;
/* ---------------------------------------------------------------------- */
/* get the contents of the Numeric object */
/* ---------------------------------------------------------------------- */
ASSERT (nblocks == Numeric->nblocks) ;
Pnum = Numeric->Pnum ;
Offp = Numeric->Offp ;
Offi = Numeric->Offi ;
Offx = (Entry *) Numeric->Offx ;
Lip = Numeric->Lip ;
Llen = Numeric->Llen ;
Uip = Numeric->Uip ;
Ulen = Numeric->Ulen ;
LUbx = (Unit **) Numeric->LUbx ;
Udiag = Numeric->Udiag ;
Rs = Numeric->Rs ;
X = (Entry *) Numeric->Xwork ;
ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
/* ---------------------------------------------------------------------- */
/* solve in chunks of 4 columns at a time */
/* ---------------------------------------------------------------------- */
for (chunk = 0 ; chunk < nrhs ; chunk += 4)
{
/* ------------------------------------------------------------------ */
/* get the size of the current chunk */
/* ------------------------------------------------------------------ */
nr = MIN (nrhs - chunk, 4) ;
/* ------------------------------------------------------------------ */
/* scale and permute the right hand side, X = P*(R\B) */
/* ------------------------------------------------------------------ */
if (Rs == NULL)
{
/* no scaling */
switch (nr)
{
case 1:
for (k = 0 ; k < n ; k++)
{
X [k] = Bz [Pnum [k]] ;
}
break ;
case 2:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
X [2*k ] = Bz [i ] ;
X [2*k + 1] = Bz [i + d ] ;
}
break ;
case 3:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
X [3*k ] = Bz [i ] ;
X [3*k + 1] = Bz [i + d ] ;
X [3*k + 2] = Bz [i + d*2] ;
}
break ;
case 4:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
X [4*k ] = Bz [i ] ;
X [4*k + 1] = Bz [i + d ] ;
X [4*k + 2] = Bz [i + d*2] ;
X [4*k + 3] = Bz [i + d*3] ;
}
break ;
}
}
else
{
switch (nr)
{
case 1:
for (k = 0 ; k < n ; k++)
{
SCALE_DIV_ASSIGN (X [k], Bz [Pnum [k]], Rs [k]) ;
}
break ;
case 2:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
rs = Rs [k] ;
SCALE_DIV_ASSIGN (X [2*k], Bz [i], rs) ;
SCALE_DIV_ASSIGN (X [2*k + 1], Bz [i + d], rs) ;
}
break ;
case 3:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
rs = Rs [k] ;
SCALE_DIV_ASSIGN (X [3*k], Bz [i], rs) ;
SCALE_DIV_ASSIGN (X [3*k + 1], Bz [i + d], rs) ;
SCALE_DIV_ASSIGN (X [3*k + 2], Bz [i + d*2], rs) ;
}
break ;
case 4:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
rs = Rs [k] ;
SCALE_DIV_ASSIGN (X [4*k], Bz [i], rs) ;
SCALE_DIV_ASSIGN (X [4*k + 1], Bz [i + d], rs) ;
SCALE_DIV_ASSIGN (X [4*k + 2], Bz [i + d*2], rs) ;
SCALE_DIV_ASSIGN (X [4*k + 3], Bz [i + d*3], rs) ;
}
break ;
}
}
/* ------------------------------------------------------------------ */
/* solve X = (L*U + Off)\X */
/* ------------------------------------------------------------------ */
for (block = nblocks-1 ; block >= 0 ; block--)
{
/* -------------------------------------------------------------- */
/* the block of size nk is from rows/columns k1 to k2-1 */
/* -------------------------------------------------------------- */
k1 = R [block] ;
k2 = R [block+1] ;
nk = k2 - k1 ;
PRINTF (("solve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
/* solve the block system */
if (nk == 1)
{
s = Udiag [k1] ;
switch (nr)
{
case 1:
DIV (X [k1], X [k1], s) ;
break ;
case 2:
DIV (X [2*k1], X [2*k1], s) ;
DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
break ;
case 3:
DIV (X [3*k1], X [3*k1], s) ;
DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
break ;
case 4:
DIV (X [4*k1], X [4*k1], s) ;
DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
break ;
}
}
else
{
KLU_lsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
X + nr*k1) ;
KLU_usolve (nk, Uip + k1, Ulen + k1, LUbx [block],
Udiag + k1, nr, X + nr*k1) ;
}
/* -------------------------------------------------------------- */
/* block back-substitution for the off-diagonal-block entries */
/* -------------------------------------------------------------- */
if (block > 0)
{
switch (nr)
{
case 1:
for (k = k1 ; k < k2 ; k++)
{
pend = Offp [k+1] ;
x [0] = X [k] ;
for (p = Offp [k] ; p < pend ; p++)
{
MULT_SUB (X [Offi [p]], Offx [p], x [0]) ;
}
}
break ;
case 2:
for (k = k1 ; k < k2 ; k++)
{
pend = Offp [k+1] ;
x [0] = X [2*k ] ;
x [1] = X [2*k + 1] ;
for (p = Offp [k] ; p < pend ; p++)
{
i = Offi [p] ;
offik = Offx [p] ;
MULT_SUB (X [2*i], offik, x [0]) ;
MULT_SUB (X [2*i + 1], offik, x [1]) ;
}
}
break ;
case 3:
for (k = k1 ; k < k2 ; k++)
{
pend = Offp [k+1] ;
x [0] = X [3*k ] ;
x [1] = X [3*k + 1] ;
x [2] = X [3*k + 2] ;
for (p = Offp [k] ; p < pend ; p++)
{
i = Offi [p] ;
offik = Offx [p] ;
MULT_SUB (X [3*i], offik, x [0]) ;
MULT_SUB (X [3*i + 1], offik, x [1]) ;
MULT_SUB (X [3*i + 2], offik, x [2]) ;
}
}
break ;
case 4:
for (k = k1 ; k < k2 ; k++)
{
pend = Offp [k+1] ;
x [0] = X [4*k ] ;
x [1] = X [4*k + 1] ;
x [2] = X [4*k + 2] ;
x [3] = X [4*k + 3] ;
for (p = Offp [k] ; p < pend ; p++)
{
i = Offi [p] ;
offik = Offx [p] ;
MULT_SUB (X [4*i], offik, x [0]) ;
MULT_SUB (X [4*i + 1], offik, x [1]) ;
MULT_SUB (X [4*i + 2], offik, x [2]) ;
MULT_SUB (X [4*i + 3], offik, x [3]) ;
}
}
break ;
}
}
}
/* ------------------------------------------------------------------ */
/* permute the result, Bz = Q*X */
/* ------------------------------------------------------------------ */
switch (nr)
{
case 1:
for (k = 0 ; k < n ; k++)
{
Bz [Q [k]] = X [k] ;
}
break ;
case 2:
for (k = 0 ; k < n ; k++)
{
i = Q [k] ;
Bz [i ] = X [2*k ] ;
Bz [i + d ] = X [2*k + 1] ;
}
break ;
case 3:
for (k = 0 ; k < n ; k++)
{
i = Q [k] ;
Bz [i ] = X [3*k ] ;
Bz [i + d ] = X [3*k + 1] ;
Bz [i + d*2] = X [3*k + 2] ;
}
break ;
case 4:
for (k = 0 ; k < n ; k++)
{
i = Q [k] ;
Bz [i ] = X [4*k ] ;
Bz [i + d ] = X [4*k + 1] ;
Bz [i + d*2] = X [4*k + 2] ;
Bz [i + d*3] = X [4*k + 3] ;
}
break ;
}
/* ------------------------------------------------------------------ */
/* go to the next chunk of B */
/* ------------------------------------------------------------------ */
Bz += d*4 ;
}
return (TRUE) ;
}

156
src/maths/KLU/klu_sort.c Normal file
View File

@ -0,0 +1,156 @@
/* ========================================================================== */
/* === KLU_sort ============================================================= */
/* ========================================================================== */
/* sorts the columns of L and U so that the row indices appear in strictly
* increasing order.
*/
#include "klu_internal.h"
/* ========================================================================== */
/* === sort ================================================================= */
/* ========================================================================== */
/* Sort L or U using a double-transpose */
static void sort (Int n, Int *Xip, Int *Xlen, Unit *LU, Int *Tp, Int *Tj,
Entry *Tx, Int *W)
{
Int *Xi ;
Entry *Xx ;
Int p, i, j, len, nz, tp, xlen, pend ;
ASSERT (KLU_valid_LU (n, FALSE, Xip, Xlen, LU)) ;
/* count the number of entries in each row of L or U */
for (i = 0 ; i < n ; i++)
{
W [i] = 0 ;
}
for (j = 0 ; j < n ; j++)
{
GET_POINTER (LU, Xip, Xlen, Xi, Xx, j, len) ;
for (p = 0 ; p < len ; p++)
{
W [Xi [p]]++ ;
}
}
/* construct the row pointers for T */
nz = 0 ;
for (i = 0 ; i < n ; i++)
{
Tp [i] = nz ;
nz += W [i] ;
}
Tp [n] = nz ;
for (i = 0 ; i < n ; i++)
{
W [i] = Tp [i] ;
}
/* transpose the matrix into Tp, Ti, Tx */
for (j = 0 ; j < n ; j++)
{
GET_POINTER (LU, Xip, Xlen, Xi, Xx, j, len) ;
for (p = 0 ; p < len ; p++)
{
tp = W [Xi [p]]++ ;
Tj [tp] = j ;
Tx [tp] = Xx [p] ;
}
}
/* transpose the matrix back into Xip, Xlen, Xi, Xx */
for (j = 0 ; j < n ; j++)
{
W [j] = 0 ;
}
for (i = 0 ; i < n ; i++)
{
pend = Tp [i+1] ;
for (p = Tp [i] ; p < pend ; p++)
{
j = Tj [p] ;
GET_POINTER (LU, Xip, Xlen, Xi, Xx, j, len) ;
xlen = W [j]++ ;
Xi [xlen] = i ;
Xx [xlen] = Tx [p] ;
}
}
ASSERT (KLU_valid_LU (n, FALSE, Xip, Xlen, LU)) ;
}
/* ========================================================================== */
/* === KLU_sort ============================================================= */
/* ========================================================================== */
Int KLU_sort
(
KLU_symbolic *Symbolic,
KLU_numeric *Numeric,
KLU_common *Common
)
{
Int *R, *W, *Tp, *Ti, *Lip, *Uip, *Llen, *Ulen ;
Entry *Tx ;
Unit **LUbx ;
Int n, nk, nz, block, nblocks, maxblock, k1 ;
size_t m1 ;
if (Common == NULL)
{
return (FALSE) ;
}
Common->status = KLU_OK ;
n = Symbolic->n ;
R = Symbolic->R ;
nblocks = Symbolic->nblocks ;
maxblock = Symbolic->maxblock ;
Lip = Numeric->Lip ;
Llen = Numeric->Llen ;
Uip = Numeric->Uip ;
Ulen = Numeric->Ulen ;
LUbx = (Unit **) Numeric->LUbx ;
m1 = ((size_t) maxblock) + 1 ;
/* allocate workspace */
nz = MAX (Numeric->max_lnz_block, Numeric->max_unz_block) ;
W = KLU_malloc (maxblock, sizeof (Int), Common) ;
Tp = KLU_malloc (m1, sizeof (Int), Common) ;
Ti = KLU_malloc (nz, sizeof (Int), Common) ;
Tx = KLU_malloc (nz, sizeof (Entry), Common) ;
PRINTF (("\n======================= Start sort:\n")) ;
if (Common->status == KLU_OK)
{
/* sort each block of L and U */
for (block = 0 ; block < nblocks ; block++)
{
k1 = R [block] ;
nk = R [block+1] - k1 ;
if (nk > 1)
{
PRINTF (("\n-------------------block: %d nk %d\n", block, nk)) ;
sort (nk, Lip + k1, Llen + k1, LUbx [block], Tp, Ti, Tx, W) ;
sort (nk, Uip + k1, Ulen + k1, LUbx [block], Tp, Ti, Tx, W) ;
}
}
}
PRINTF (("\n======================= sort done.\n")) ;
/* free workspace */
KLU_free (W, maxblock, sizeof (Int), Common) ;
KLU_free (Tp, m1, sizeof (Int), Common) ;
KLU_free (Ti, nz, sizeof (Int), Common) ;
KLU_free (Tx, nz, sizeof (Entry), Common) ;
return (Common->status == KLU_OK) ;
}

465
src/maths/KLU/klu_tsolve.c Normal file
View File

@ -0,0 +1,465 @@
/* ========================================================================== */
/* === KLU_tsolve =========================================================== */
/* ========================================================================== */
/* Solve A'x=b using the symbolic and numeric objects from KLU_analyze
* (or KLU_analyze_given) and KLU_factor. Note that no iterative refinement is
* performed. Uses Numeric->Xwork as workspace (undefined on input and output),
* of size 4n Entry's (note that columns 2 to 4 of Xwork overlap with
* Numeric->Iwork).
*/
#include "klu_internal.h"
Int KLU_tsolve
(
/* inputs, not modified */
KLU_symbolic *Symbolic,
KLU_numeric *Numeric,
Int d, /* leading dimension of B */
Int nrhs, /* number of right-hand-sides */
/* right-hand-side on input, overwritten with solution to A'x=b on output */
double B [ ], /* size n*nrhs, in column-oriented form, with
* leading dimension d. */
#ifdef COMPLEX
Int conj_solve, /* TRUE for conjugate transpose solve, FALSE for
* array transpose solve. Used for the complex
* case only. */
#endif
/* --------------- */
KLU_common *Common
)
{
Entry x [4], offik, s ;
double rs, *Rs ;
Entry *Offx, *X, *Bz, *Udiag ;
Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
Unit **LUbx ;
Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
/* ---------------------------------------------------------------------- */
/* check inputs */
/* ---------------------------------------------------------------------- */
if (Common == NULL)
{
return (FALSE) ;
}
if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
B == NULL)
{
Common->status = KLU_INVALID ;
return (FALSE) ;
}
Common->status = KLU_OK ;
/* ---------------------------------------------------------------------- */
/* get the contents of the Symbolic object */
/* ---------------------------------------------------------------------- */
Bz = (Entry *) B ;
n = Symbolic->n ;
nblocks = Symbolic->nblocks ;
Q = Symbolic->Q ;
R = Symbolic->R ;
/* ---------------------------------------------------------------------- */
/* get the contents of the Numeric object */
/* ---------------------------------------------------------------------- */
ASSERT (nblocks == Numeric->nblocks) ;
Pnum = Numeric->Pnum ;
Offp = Numeric->Offp ;
Offi = Numeric->Offi ;
Offx = (Entry *) Numeric->Offx ;
Lip = Numeric->Lip ;
Llen = Numeric->Llen ;
Uip = Numeric->Uip ;
Ulen = Numeric->Ulen ;
LUbx = (Unit **) Numeric->LUbx ;
Udiag = Numeric->Udiag ;
Rs = Numeric->Rs ;
X = (Entry *) Numeric->Xwork ;
ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
/* ---------------------------------------------------------------------- */
/* solve in chunks of 4 columns at a time */
/* ---------------------------------------------------------------------- */
for (chunk = 0 ; chunk < nrhs ; chunk += 4)
{
/* ------------------------------------------------------------------ */
/* get the size of the current chunk */
/* ------------------------------------------------------------------ */
nr = MIN (nrhs - chunk, 4) ;
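        /* e.g., with nrhs = 10 the three chunks have nr = 4, 4, and 2 */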
/* ------------------------------------------------------------------ */
/* permute the right hand side, X = Q'*B */
/* ------------------------------------------------------------------ */
switch (nr)
{
case 1:
for (k = 0 ; k < n ; k++)
{
X [k] = Bz [Q [k]] ;
}
break ;
case 2:
for (k = 0 ; k < n ; k++)
{
i = Q [k] ;
X [2*k ] = Bz [i ] ;
X [2*k + 1] = Bz [i + d ] ;
}
break ;
case 3:
for (k = 0 ; k < n ; k++)
{
i = Q [k] ;
X [3*k ] = Bz [i ] ;
X [3*k + 1] = Bz [i + d ] ;
X [3*k + 2] = Bz [i + d*2] ;
}
break ;
case 4:
for (k = 0 ; k < n ; k++)
{
i = Q [k] ;
X [4*k ] = Bz [i ] ;
X [4*k + 1] = Bz [i + d ] ;
X [4*k + 2] = Bz [i + d*2] ;
X [4*k + 3] = Bz [i + d*3] ;
}
break ;
}
/* ------------------------------------------------------------------ */
/* solve X = (L*U + Off)'\X */
/* ------------------------------------------------------------------ */
for (block = 0 ; block < nblocks ; block++)
{
/* -------------------------------------------------------------- */
/* the block of size nk is from rows/columns k1 to k2-1 */
/* -------------------------------------------------------------- */
k1 = R [block] ;
k2 = R [block+1] ;
nk = k2 - k1 ;
PRINTF (("tsolve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
/* -------------------------------------------------------------- */
/* block back-substitution for the off-diagonal-block entries */
/* -------------------------------------------------------------- */
if (block > 0)
{
switch (nr)
{
case 1:
for (k = k1 ; k < k2 ; k++)
{
pend = Offp [k+1] ;
for (p = Offp [k] ; p < pend ; p++)
{
#ifdef COMPLEX
if (conj_solve)
{
MULT_SUB_CONJ (X [k], X [Offi [p]],
Offx [p]) ;
}
else
#endif
{
MULT_SUB (X [k], Offx [p], X [Offi [p]]) ;
}
}
}
break ;
case 2:
for (k = k1 ; k < k2 ; k++)
{
pend = Offp [k+1] ;
x [0] = X [2*k ] ;
x [1] = X [2*k + 1] ;
for (p = Offp [k] ; p < pend ; p++)
{
i = Offi [p] ;
#ifdef COMPLEX
if (conj_solve)
{
CONJ (offik, Offx [p]) ;
}
else
#endif
{
offik = Offx [p] ;
}
MULT_SUB (x [0], offik, X [2*i]) ;
MULT_SUB (x [1], offik, X [2*i + 1]) ;
}
X [2*k ] = x [0] ;
X [2*k + 1] = x [1] ;
}
break ;
case 3:
for (k = k1 ; k < k2 ; k++)
{
pend = Offp [k+1] ;
x [0] = X [3*k ] ;
x [1] = X [3*k + 1] ;
x [2] = X [3*k + 2] ;
for (p = Offp [k] ; p < pend ; p++)
{
i = Offi [p] ;
#ifdef COMPLEX
if (conj_solve)
{
CONJ (offik, Offx [p]) ;
}
else
#endif
{
offik = Offx [p] ;
}
MULT_SUB (x [0], offik, X [3*i]) ;
MULT_SUB (x [1], offik, X [3*i + 1]) ;
MULT_SUB (x [2], offik, X [3*i + 2]) ;
}
X [3*k ] = x [0] ;
X [3*k + 1] = x [1] ;
X [3*k + 2] = x [2] ;
}
break ;
case 4:
for (k = k1 ; k < k2 ; k++)
{
pend = Offp [k+1] ;
x [0] = X [4*k ] ;
x [1] = X [4*k + 1] ;
x [2] = X [4*k + 2] ;
x [3] = X [4*k + 3] ;
for (p = Offp [k] ; p < pend ; p++)
{
i = Offi [p] ;
#ifdef COMPLEX
if (conj_solve)
{
CONJ(offik, Offx [p]) ;
}
else
#endif
{
offik = Offx [p] ;
}
MULT_SUB (x [0], offik, X [4*i]) ;
MULT_SUB (x [1], offik, X [4*i + 1]) ;
MULT_SUB (x [2], offik, X [4*i + 2]) ;
MULT_SUB (x [3], offik, X [4*i + 3]) ;
}
X [4*k ] = x [0] ;
X [4*k + 1] = x [1] ;
X [4*k + 2] = x [2] ;
X [4*k + 3] = x [3] ;
}
break ;
}
}
/* -------------------------------------------------------------- */
/* solve the block system */
/* -------------------------------------------------------------- */
if (nk == 1)
{
#ifdef COMPLEX
if (conj_solve)
{
CONJ (s, Udiag [k1]) ;
}
else
#endif
{
s = Udiag [k1] ;
}
switch (nr)
{
case 1:
DIV (X [k1], X [k1], s) ;
break ;
case 2:
DIV (X [2*k1], X [2*k1], s) ;
DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
break ;
case 3:
DIV (X [3*k1], X [3*k1], s) ;
DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
break ;
case 4:
DIV (X [4*k1], X [4*k1], s) ;
DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
break ;
}
}
else
{
KLU_utsolve (nk, Uip + k1, Ulen + k1, LUbx [block],
Udiag + k1, nr,
#ifdef COMPLEX
conj_solve,
#endif
X + nr*k1) ;
KLU_ltsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
#ifdef COMPLEX
conj_solve,
#endif
X + nr*k1) ;
}
}
/* ------------------------------------------------------------------ */
/* scale and permute the result, Bz = P'(R\X) */
/* ------------------------------------------------------------------ */
if (Rs == NULL)
{
/* no scaling */
switch (nr)
{
case 1:
for (k = 0 ; k < n ; k++)
{
Bz [Pnum [k]] = X [k] ;
}
break ;
case 2:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
Bz [i ] = X [2*k ] ;
Bz [i + d ] = X [2*k + 1] ;
}
break ;
case 3:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
Bz [i ] = X [3*k ] ;
Bz [i + d ] = X [3*k + 1] ;
Bz [i + d*2] = X [3*k + 2] ;
}
break ;
case 4:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
Bz [i ] = X [4*k ] ;
Bz [i + d ] = X [4*k + 1] ;
Bz [i + d*2] = X [4*k + 2] ;
Bz [i + d*3] = X [4*k + 3] ;
}
break ;
}
}
else
{
switch (nr)
{
case 1:
for (k = 0 ; k < n ; k++)
{
SCALE_DIV_ASSIGN (Bz [Pnum [k]], X [k], Rs [k]) ;
}
break ;
case 2:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
rs = Rs [k] ;
SCALE_DIV_ASSIGN (Bz [i], X [2*k], rs) ;
SCALE_DIV_ASSIGN (Bz [i + d], X [2*k + 1], rs) ;
}
break ;
case 3:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
rs = Rs [k] ;
SCALE_DIV_ASSIGN (Bz [i], X [3*k], rs) ;
SCALE_DIV_ASSIGN (Bz [i + d], X [3*k + 1], rs) ;
SCALE_DIV_ASSIGN (Bz [i + d*2], X [3*k + 2], rs) ;
}
break ;
case 4:
for (k = 0 ; k < n ; k++)
{
i = Pnum [k] ;
rs = Rs [k] ;
SCALE_DIV_ASSIGN (Bz [i], X [4*k], rs) ;
SCALE_DIV_ASSIGN (Bz [i + d], X [4*k + 1], rs) ;
SCALE_DIV_ASSIGN (Bz [i + d*2], X [4*k + 2], rs) ;
SCALE_DIV_ASSIGN (Bz [i + d*3], X [4*k + 3], rs) ;
}
break ;
}
}
/* ------------------------------------------------------------------ */
/* go to the next chunk of B */
/* ------------------------------------------------------------------ */
Bz += d*4 ;
}
return (TRUE) ;
}
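An end-to-end usage sketch for the routine above (illustrative only, assuming the int/double entry points declared in klu.h): factor a small 5-by-5 compressed-column matrix, then solve A'x = b; the solution overwrites b.

#include <stdio.h>
#include "klu.h"

int main (void)
{
    int    n  = 5 ;
    int    Ap [ ] = {0, 2, 5, 9, 10, 12} ;
    int    Ai [ ] = {0, 1, 0, 2, 4, 1, 2, 3, 4, 2, 1, 4} ;
    double Ax [ ] = {2., 3., 3., -1., 4., 4., -3., 1., 2., 2., 6., 1.} ;
    double b  [ ] = {8., 45., -3., 3., 19.} ;
    klu_symbolic *Symbolic ;
    klu_numeric  *Numeric ;
    klu_common   Common ;
    int i ;
    klu_defaults (&Common) ;
    Symbolic = klu_analyze (n, Ap, Ai, &Common) ;
    Numeric  = klu_factor  (Ap, Ai, Ax, Symbolic, &Common) ;
    klu_tsolve (Symbolic, Numeric, n, 1, b, &Common) ;   /* d = n, nrhs = 1 */
    for (i = 0 ; i < n ; i++) printf ("x [%d] = %g\n", i, b [i]) ;
    klu_free_numeric  (&Numeric,  &Common) ;
    klu_free_symbolic (&Symbolic, &Common) ;
    return (0) ;
}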

694
src/maths/KLU/klu_version.h Normal file
View File

@ -0,0 +1,694 @@
#ifndef _KLU_VERSION_H
#define _KLU_VERSION_H
#ifdef DLONG
#define Int UF_long
#define Int_id UF_long_id
#define Int_MAX UF_long_max
#else
#define Int int
#define Int_id "%d"
#define Int_MAX INT_MAX
#endif
#define NPRINT
#define BYTES(type,n) (sizeof (type) * (n))
#define CEILING(b,u) (((b)+(u)-1) / (u))
#define UNITS(type,n) (CEILING (BYTES (type,n), sizeof (Unit)))
#define DUNITS(type,n) (ceil (BYTES (type, (double) n) / sizeof (Unit)))
#define GET_I_POINTER(LU, Xip, Xi, k) \
{ \
Xi = (Int *) (LU + Xip [k]) ; \
}
#define GET_X_POINTER(LU, Xip, Xlen, Xx, k) \
{ \
Xx = (Entry *) (LU + Xip [k] + UNITS (Int, Xlen [k])) ; \
}
#define GET_POINTER(LU, Xip, Xlen, Xi, Xx, k, xlen) \
{ \
Unit *xp = LU + Xip [k] ; \
xlen = Xlen [k] ; \
Xi = (Int *) xp ; \
Xx = (Entry *) (xp + UNITS (Int, xlen)) ; \
}
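/* Illustrative note (not part of the original header): for column k, the
 * packed storage addressed by GET_POINTER holds Xlen [k] row indices
 * starting at LU + Xip [k], rounded up to whole Units, followed by
 * Xlen [k] numerical entries.  For example, assuming Int is a 4-byte int
 * and Unit is an 8-byte double, a column with Xlen [k] = 3 uses
 * UNITS (Int, 3) = CEILING (12, 8) = 2 Units for its indices, so its
 * numerical entries begin at LU + Xip [k] + 2.
 */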
/* function names */
#ifdef COMPLEX
#ifdef DLONG
#define KLU_scale klu_zl_scale
#define KLU_solve klu_zl_solve
#define KLU_tsolve klu_zl_tsolve
#define KLU_free_numeric klu_zl_free_numeric
#define KLU_factor klu_zl_factor
#define KLU_refactor klu_zl_refactor
#define KLU_kernel_factor klu_zl_kernel_factor
#define KLU_lsolve klu_zl_lsolve
#define KLU_ltsolve klu_zl_ltsolve
#define KLU_usolve klu_zl_usolve
#define KLU_utsolve klu_zl_utsolve
#define KLU_kernel klu_zl_kernel
#define KLU_valid klu_zl_valid
#define KLU_valid_LU klu_zl_valid_LU
#define KLU_sort klu_zl_sort
#define KLU_rgrowth klu_zl_rgrowth
#define KLU_rcond klu_zl_rcond
#define KLU_extract klu_zl_extract
#define KLU_condest klu_zl_condest
#define KLU_flops klu_zl_flops
#else
#define KLU_scale klu_z_scale
#define KLU_solve klu_z_solve
#define KLU_tsolve klu_z_tsolve
#define KLU_free_numeric klu_z_free_numeric
#define KLU_factor klu_z_factor
#define KLU_refactor klu_z_refactor
#define KLU_kernel_factor klu_z_kernel_factor
#define KLU_lsolve klu_z_lsolve
#define KLU_ltsolve klu_z_ltsolve
#define KLU_usolve klu_z_usolve
#define KLU_utsolve klu_z_utsolve
#define KLU_kernel klu_z_kernel
#define KLU_valid klu_z_valid
#define KLU_valid_LU klu_z_valid_LU
#define KLU_sort klu_z_sort
#define KLU_rgrowth klu_z_rgrowth
#define KLU_rcond klu_z_rcond
#define KLU_extract klu_z_extract
#define KLU_condest klu_z_condest
#define KLU_flops klu_z_flops
#endif
#else
#ifdef DLONG
#define KLU_scale klu_l_scale
#define KLU_solve klu_l_solve
#define KLU_tsolve klu_l_tsolve
#define KLU_free_numeric klu_l_free_numeric
#define KLU_factor klu_l_factor
#define KLU_refactor klu_l_refactor
#define KLU_kernel_factor klu_l_kernel_factor
#define KLU_lsolve klu_l_lsolve
#define KLU_ltsolve klu_l_ltsolve
#define KLU_usolve klu_l_usolve
#define KLU_utsolve klu_l_utsolve
#define KLU_kernel klu_l_kernel
#define KLU_valid klu_l_valid
#define KLU_valid_LU klu_l_valid_LU
#define KLU_sort klu_l_sort
#define KLU_rgrowth klu_l_rgrowth
#define KLU_rcond klu_l_rcond
#define KLU_extract klu_l_extract
#define KLU_condest klu_l_condest
#define KLU_flops klu_l_flops
#else
#define KLU_scale klu_scale
#define KLU_solve klu_solve
#define KLU_tsolve klu_tsolve
#define KLU_free_numeric klu_free_numeric
#define KLU_factor klu_factor
#define KLU_refactor klu_refactor
#define KLU_kernel_factor klu_kernel_factor
#define KLU_lsolve klu_lsolve
#define KLU_ltsolve klu_ltsolve
#define KLU_usolve klu_usolve
#define KLU_utsolve klu_utsolve
#define KLU_kernel klu_kernel
#define KLU_valid klu_valid
#define KLU_valid_LU klu_valid_LU
#define KLU_sort klu_sort
#define KLU_rgrowth klu_rgrowth
#define KLU_rcond klu_rcond
#define KLU_extract klu_extract
#define KLU_condest klu_condest
#define KLU_flops klu_flops
#endif
#endif
#ifdef DLONG
#define KLU_analyze klu_l_analyze
#define KLU_analyze_given klu_l_analyze_given
#define KLU_alloc_symbolic klu_l_alloc_symbolic
#define KLU_free_symbolic klu_l_free_symbolic
#define KLU_defaults klu_l_defaults
#define KLU_free klu_l_free
#define KLU_malloc klu_l_malloc
#define KLU_realloc klu_l_realloc
#define KLU_add_size_t klu_l_add_size_t
#define KLU_mult_size_t klu_l_mult_size_t
#define KLU_symbolic klu_l_symbolic
#define KLU_numeric klu_l_numeric
#define KLU_common klu_l_common
#define BTF_order btf_l_order
#define BTF_strongcomp btf_l_strongcomp
#define AMD_order amd_l_order
#define COLAMD colamd_l
#define COLAMD_recommended colamd_l_recommended
#else
#define KLU_analyze klu_analyze
#define KLU_analyze_given klu_analyze_given
#define KLU_alloc_symbolic klu_alloc_symbolic
#define KLU_free_symbolic klu_free_symbolic
#define KLU_defaults klu_defaults
#define KLU_free klu_free
#define KLU_malloc klu_malloc
#define KLU_realloc klu_realloc
#define KLU_add_size_t klu_add_size_t
#define KLU_mult_size_t klu_mult_size_t
#define KLU_symbolic klu_symbolic
#define KLU_numeric klu_numeric
#define KLU_common klu_common
#define BTF_order btf_order
#define BTF_strongcomp btf_strongcomp
#define AMD_order amd_order
#define COLAMD colamd
#define COLAMD_recommended colamd_recommended
#endif
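/* Illustrative note (not part of the original header): compiling the same
 * sources with -DCOMPLEX and -DDLONG maps the generic names onto the
 * complex/long entry points, e.g. KLU_tsolve becomes klu_zl_tsolve and Int
 * becomes UF_long; with neither flag defined, KLU_tsolve is klu_tsolve and
 * Int is int.  This is how the four real/complex by int/long variants of
 * each routine are generated from one set of source files.
 */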
/* -------------------------------------------------------------------------- */
/* Numerical relop macros for correctly handling the NaN case */
/* -------------------------------------------------------------------------- */
/*
SCALAR_IS_NAN(x):
True if x is NaN. False otherwise. The commonly-existing isnan(x)
function could be used, but it's not in Kernighan & Ritchie 2nd edition
(ANSI C). It may appear in <math.h>, but I'm not certain about
portability. The expression x != x is true if and only if x is NaN,
according to the IEEE 754 floating-point standard.
SCALAR_IS_ZERO(x):
True if x is zero. False if x is nonzero, NaN, or +/- Inf.
This is (x == 0) if the compiler is IEEE 754 compliant.
SCALAR_IS_NONZERO(x):
    True if x is nonzero, NaN, or +/- Inf. False if x is zero.
This is (x != 0) if the compiler is IEEE 754 compliant.
SCALAR_IS_LTZERO(x):
True if x is < zero or -Inf. False if x is >= 0, NaN, or +Inf.
This is (x < 0) if the compiler is IEEE 754 compliant.
*/
/* These all work properly, according to the IEEE 754 standard ... except on */
/* a PC with Windows.  They work fine in Linux on the same PC... */
#define SCALAR_IS_NAN(x) ((x) != (x))
#define SCALAR_IS_ZERO(x) ((x) == 0.)
#define SCALAR_IS_NONZERO(x) ((x) != 0.)
#define SCALAR_IS_LTZERO(x) ((x) < 0.)
/* scalar absolute value macro. If x is NaN, the result is NaN: */
#define SCALAR_ABS(x) ((SCALAR_IS_LTZERO (x)) ? -(x) : (x))
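/* Illustrative note (not part of the original header): for a NaN value x
 * (for example x = 0./0.), every comparison involving x is false, so
 * SCALAR_IS_NAN (x) is 1, SCALAR_IS_ZERO (x) is 0, SCALAR_IS_NONZERO (x)
 * is 1, SCALAR_IS_LTZERO (x) is 0, and SCALAR_ABS (x) is still NaN.
 */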
/* print a scalar (avoid printing "-0" for negative zero). */
#ifdef NPRINT
#define PRINT_SCALAR(a)
#else
#define PRINT_SCALAR(a) \
{ \
if (SCALAR_IS_NONZERO (a)) \
{ \
PRINTF ((" (%g)", (a))) ; \
} \
else \
{ \
PRINTF ((" (0)")) ; \
} \
}
#endif
/* -------------------------------------------------------------------------- */
/* Real floating-point arithmetic */
/* -------------------------------------------------------------------------- */
#ifndef COMPLEX
typedef double Unit ;
#define Entry double
#define SPLIT(s) (1)
#define REAL(c) (c)
#define IMAG(c) (0.)
#define ASSIGN(c,s1,s2,p,split) { (c) = (s1)[p] ; }
#define CLEAR(c) { (c) = 0. ; }
#define CLEAR_AND_INCREMENT(p) { *p++ = 0. ; }
#define IS_NAN(a) SCALAR_IS_NAN (a)
#define IS_ZERO(a) SCALAR_IS_ZERO (a)
#define IS_NONZERO(a) SCALAR_IS_NONZERO (a)
#define SCALE_DIV(c,s) { (c) /= (s) ; }
#define SCALE_DIV_ASSIGN(a,c,s) { a = c / s ; }
#define SCALE(c,s) { (c) *= (s) ; }
#define ASSEMBLE(c,a) { (c) += (a) ; }
#define ASSEMBLE_AND_INCREMENT(c,p) { (c) += *p++ ; }
#define DECREMENT(c,a) { (c) -= (a) ; }
#define MULT(c,a,b) { (c) = (a) * (b) ; }
#define MULT_CONJ(c,a,b) { (c) = (a) * (b) ; }
#define MULT_SUB(c,a,b) { (c) -= (a) * (b) ; }
#define MULT_SUB_CONJ(c,a,b) { (c) -= (a) * (b) ; }
#define DIV(c,a,b) { (c) = (a) / (b) ; }
#define RECIPROCAL(c) { (c) = 1.0 / (c) ; }
#define DIV_CONJ(c,a,b) { (c) = (a) / (b) ; }
#define APPROX_ABS(s,a) { (s) = SCALAR_ABS (a) ; }
#define ABS(s,a) { (s) = SCALAR_ABS (a) ; }
#define PRINT_ENTRY(a) PRINT_SCALAR (a)
#define CONJ(a,x) a = x
/* for flop counts */
#define MULTSUB_FLOPS 2. /* c -= a*b */
#define DIV_FLOPS 1. /* c = a/b */
#define ABS_FLOPS 0. /* c = abs (a) */
#define ASSEMBLE_FLOPS 1. /* c += a */
#define DECREMENT_FLOPS 1. /* c -= a */
#define MULT_FLOPS 1. /* c = a*b */
#define SCALE_FLOPS 1. /* c = a/s */
#else
/* -------------------------------------------------------------------------- */
/* Complex floating-point arithmetic */
/* -------------------------------------------------------------------------- */
/*
Note: An alternative to this Double_Complex type would be to use a
struct { double r ; double i ; }. The problem with that method
(used by the Sun Performance Library, for example) is that ANSI C provides
no guarantee about the layout of a struct. It is possible that the sizeof
the struct above would be greater than 2 * sizeof (double). This would
mean that the complex BLAS could not be used. The method used here avoids
that possibility. ANSI C *does* guarantee that an array of structs has
the same size as n times the size of one struct.
The ANSI C99 version of the C language includes a "double _Complex" type.
It should be possible in that case to do the following:
#define Entry double _Complex
    and remove the Double_Complex struct. The macros below could then be
    replaced with intrinsic operators. Note that the #define Real and
#define Imag should also be removed (they only appear in this file).
For the MULT, MULT_SUB, MULT_SUB_CONJ, and MULT_CONJ macros,
the output argument c cannot be the same as any input argument.
*/
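/* Illustrative note (not part of the original header): because an array of
 * n Double_Complex structs occupies exactly n * sizeof (Double_Complex)
 * bytes, an array Double_Complex z [m] can be viewed as double *d =
 * (double *) z, with d [2*k] the real part and d [2*k+1] the imaginary
 * part of z [k], which is the interleaved layout expected by the complex
 * BLAS.
 */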
typedef struct
{
double component [2] ; /* real and imaginary parts */
} Double_Complex ;
typedef Double_Complex Unit ;
#define Entry Double_Complex
#define Real component [0]
#define Imag component [1]
/* for flop counts */
#define MULTSUB_FLOPS 8. /* c -= a*b */
#define DIV_FLOPS 9. /* c = a/b */
#define ABS_FLOPS 6. /* c = abs (a), count sqrt as one flop */
#define ASSEMBLE_FLOPS 2. /* c += a */
#define DECREMENT_FLOPS 2. /* c -= a */
#define MULT_FLOPS 6. /* c = a*b */
#define SCALE_FLOPS 2. /* c = a/s or c = a*s */
/* -------------------------------------------------------------------------- */
/* real part of c */
#define REAL(c) ((c).Real)
/* -------------------------------------------------------------------------- */
/* imag part of c */
#define IMAG(c) ((c).Imag)
/* -------------------------------------------------------------------------- */
/* Return TRUE if a complex number is in split form, FALSE if in packed form */
#define SPLIT(sz) ((sz) != (double *) NULL)
/* c = (s1) + (s2)*i, if s2 is null, then X is in "packed" format (compatible
* with Entry and ANSI C99 double _Complex type). */
/*#define ASSIGN(c,s1,s2,p,split) \
{ \
if (split) \
{ \
(c).Real = (s1)[p] ; \
(c).Imag = (s2)[p] ; \
} \
else \
{ \
(c) = ((Entry *)(s1))[p] ; \
} \
}*/
/* -------------------------------------------------------------------------- */
#define CONJ(a, x) \
{ \
a.Real = x.Real ; \
a.Imag = -x.Imag ; \
}
/* c = 0 */
#define CLEAR(c) \
{ \
(c).Real = 0. ; \
(c).Imag = 0. ; \
}
/* -------------------------------------------------------------------------- */
/* *p++ = 0 */
#define CLEAR_AND_INCREMENT(p) \
{ \
p->Real = 0. ; \
p->Imag = 0. ; \
p++ ; \
}
/* -------------------------------------------------------------------------- */
/* True if a == 0 */
#define IS_ZERO(a) \
(SCALAR_IS_ZERO ((a).Real) && SCALAR_IS_ZERO ((a).Imag))
/* -------------------------------------------------------------------------- */
/* True if a is NaN */
#define IS_NAN(a) \
(SCALAR_IS_NAN ((a).Real) || SCALAR_IS_NAN ((a).Imag))
/* -------------------------------------------------------------------------- */
/* True if a != 0 */
#define IS_NONZERO(a) \
(SCALAR_IS_NONZERO ((a).Real) || SCALAR_IS_NONZERO ((a).Imag))
/* -------------------------------------------------------------------------- */
/* a = c/s */
#define SCALE_DIV_ASSIGN(a,c,s) \
{ \
a.Real = c.Real / s ; \
a.Imag = c.Imag / s ; \
}
/* c /= s */
#define SCALE_DIV(c,s) \
{ \
(c).Real /= (s) ; \
(c).Imag /= (s) ; \
}
/* -------------------------------------------------------------------------- */
/* c *= s */
#define SCALE(c,s) \
{ \
(c).Real *= (s) ; \
(c).Imag *= (s) ; \
}
/* -------------------------------------------------------------------------- */
/* c += a */
#define ASSEMBLE(c,a) \
{ \
(c).Real += (a).Real ; \
(c).Imag += (a).Imag ; \
}
/* -------------------------------------------------------------------------- */
/* c += *p++ */
#define ASSEMBLE_AND_INCREMENT(c,p) \
{ \
(c).Real += p->Real ; \
(c).Imag += p->Imag ; \
p++ ; \
}
/* -------------------------------------------------------------------------- */
/* c -= a */
#define DECREMENT(c,a) \
{ \
(c).Real -= (a).Real ; \
(c).Imag -= (a).Imag ; \
}
/* -------------------------------------------------------------------------- */
/* c = a*b, assert because c cannot be the same as a or b */
#define MULT(c,a,b) \
{ \
ASSERT (&(c) != &(a) && &(c) != &(b)) ; \
(c).Real = (a).Real * (b).Real - (a).Imag * (b).Imag ; \
(c).Imag = (a).Imag * (b).Real + (a).Real * (b).Imag ; \
}
/* -------------------------------------------------------------------------- */
/* c = a*conjugate(b), assert because c cannot be the same as a or b */
#define MULT_CONJ(c,a,b) \
{ \
ASSERT (&(c) != &(a) && &(c) != &(b)) ; \
(c).Real = (a).Real * (b).Real + (a).Imag * (b).Imag ; \
(c).Imag = (a).Imag * (b).Real - (a).Real * (b).Imag ; \
}
/* -------------------------------------------------------------------------- */
/* c -= a*b, assert because c cannot be the same as a or b */
#define MULT_SUB(c,a,b) \
{ \
ASSERT (&(c) != &(a) && &(c) != &(b)) ; \
(c).Real -= (a).Real * (b).Real - (a).Imag * (b).Imag ; \
(c).Imag -= (a).Imag * (b).Real + (a).Real * (b).Imag ; \
}
/* -------------------------------------------------------------------------- */
/* c -= a*conjugate(b), assert because c cannot be the same as a or b */
#define MULT_SUB_CONJ(c,a,b) \
{ \
ASSERT (&(c) != &(a) && &(c) != &(b)) ; \
(c).Real -= (a).Real * (b).Real + (a).Imag * (b).Imag ; \
(c).Imag -= (a).Imag * (b).Real - (a).Real * (b).Imag ; \
}
/* -------------------------------------------------------------------------- */
/* c = a/b, be careful to avoid underflow and overflow */
#ifdef MATHWORKS
#define DIV(c,a,b) \
{ \
(void) utDivideComplex ((a).Real, (a).Imag, (b).Real, (b).Imag, \
&((c).Real), &((c).Imag)) ; \
}
#else
/* This uses ACM Algo 116, by R. L. Smith, 1962. */
/* c can be the same variable as a or b. */
/* Ignore NaN case for double relop br>=bi. */
#define DIV(c,a,b) \
{ \
double r, den, ar, ai, br, bi ; \
br = (b).Real ; \
bi = (b).Imag ; \
ar = (a).Real ; \
ai = (a).Imag ; \
if (SCALAR_ABS (br) >= SCALAR_ABS (bi)) \
{ \
r = bi / br ; \
den = br + r * bi ; \
(c).Real = (ar + ai * r) / den ; \
(c).Imag = (ai - ar * r) / den ; \
} \
else \
{ \
r = br / bi ; \
den = r * br + bi ; \
(c).Real = (ar * r + ai) / den ; \
(c).Imag = (ai * r - ar) / den ; \
} \
}
#endif
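/* Worked example for the scaled division above (illustrative only):
 * (3 + 4i) / (1 + 2i).  Here |br| < |bi|, so the second branch is taken:
 * r = br/bi = 0.5, den = r*br + bi = 2.5,
 * real = (3*0.5 + 4) / 2.5 = 2.2, imag = (4*0.5 - 3) / 2.5 = -0.4,
 * which matches (3 + 4i)(1 - 2i) / 5 = (11 - 2i) / 5.
 */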
/* -------------------------------------------------------------------------- */
/* c = 1/c, be careful to avoid underflow and overflow */
/* Not used if MATHWORKS is defined. */
/* This uses ACM Algo 116, by R. L. Smith, 1962. */
/* Ignore NaN case for double relop cr>=ci. */
#define RECIPROCAL(c) \
{ \
double r, den, cr, ci ; \
cr = (c).Real ; \
ci = (c).Imag ; \
if (SCALAR_ABS (cr) >= SCALAR_ABS (ci)) \
{ \
r = ci / cr ; \
den = cr + r * ci ; \
(c).Real = 1.0 / den ; \
(c).Imag = - r / den ; \
} \
else \
{ \
r = cr / ci ; \
den = r * cr + ci ; \
(c).Real = r / den ; \
(c).Imag = - 1.0 / den ; \
} \
}
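/* Worked example (illustrative only): c = 3 + 4i.  Since |cr| < |ci|, the
 * second branch gives r = cr/ci = 0.75, den = r*cr + ci = 6.25, so
 * 1/c = 0.12 - 0.16i, which matches (3 - 4i) / 25.
 */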
/* -------------------------------------------------------------------------- */
/* c = a/conjugate(b), be careful to avoid underflow and overflow */
#ifdef MATHWORKS
#define DIV_CONJ(c,a,b) \
{ \
(void) utDivideComplex ((a).Real, (a).Imag, (b).Real, (-(b).Imag), \
&((c).Real), &((c).Imag)) ; \
}
#else
/* This uses ACM Algo 116, by R. L. Smith, 1962. */
/* c can be the same variable as a or b. */
/* Ignore NaN case for double relop br>=bi. */
#define DIV_CONJ(c,a,b) \
{ \
double r, den, ar, ai, br, bi ; \
br = (b).Real ; \
bi = (b).Imag ; \
ar = (a).Real ; \
ai = (a).Imag ; \
if (SCALAR_ABS (br) >= SCALAR_ABS (bi)) \
{ \
r = (-bi) / br ; \
den = br - r * bi ; \
(c).Real = (ar + ai * r) / den ; \
(c).Imag = (ai - ar * r) / den ; \
} \
else \
{ \
r = br / (-bi) ; \
den = r * br - bi; \
(c).Real = (ar * r + ai) / den ; \
(c).Imag = (ai * r - ar) / den ; \
} \
}
#endif
/* -------------------------------------------------------------------------- */
/* approximate absolute value, s = |r|+|i| */
#define APPROX_ABS(s,a) \
{ \
(s) = SCALAR_ABS ((a).Real) + SCALAR_ABS ((a).Imag) ; \
}
/* -------------------------------------------------------------------------- */
/* exact absolute value, s = sqrt (a.real^2 + a.imag^2) */
#ifdef MATHWORKS
#define ABS(s,a) \
{ \
(s) = utFdlibm_hypot ((a).Real, (a).Imag) ; \
}
#else
/* Ignore NaN case for the double relops ar>=ai and ar+ai==ar. */
#define ABS(s,a) \
{ \
double r, ar, ai ; \
ar = SCALAR_ABS ((a).Real) ; \
ai = SCALAR_ABS ((a).Imag) ; \
if (ar >= ai) \
{ \
if (ar + ai == ar) \
{ \
(s) = ar ; \
} \
else \
{ \
r = ai / ar ; \
(s) = ar * sqrt (1.0 + r*r) ; \
} \
} \
else \
{ \
if (ai + ar == ai) \
{ \
(s) = ai ; \
} \
else \
{ \
r = ar / ai ; \
(s) = ai * sqrt (1.0 + r*r) ; \
} \
} \
}
#endif
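/* Illustrative note (not part of the original header): the scaled form
 * avoids overflow.  For a = 1e200 + 1e200i, the naive
 * sqrt (ar*ar + ai*ai) overflows to +Inf because ar*ar exceeds the double
 * range, while the form above gives ar * sqrt (1 + 1), about 1.414e200.
 */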
/* -------------------------------------------------------------------------- */
/* print an entry (avoid printing "-0" for negative zero). */
#ifdef NPRINT
#define PRINT_ENTRY(a)
#else
#define PRINT_ENTRY(a) \
{ \
if (SCALAR_IS_NONZERO ((a).Real)) \
{ \
PRINTF ((" (%g", (a).Real)) ; \
} \
else \
{ \
PRINTF ((" (0")) ; \
} \
if (SCALAR_IS_LTZERO ((a).Imag)) \
{ \
PRINTF ((" - %gi)", -(a).Imag)) ; \
} \
else if (SCALAR_IS_ZERO ((a).Imag)) \
{ \
PRINTF ((" + 0i)")) ; \
} \
else \
{ \
PRINTF ((" + %gi)", (a).Imag)) ; \
} \
}
#endif
/* -------------------------------------------------------------------------- */
#endif /* #ifndef COMPLEX */
#endif