diff --git a/src/maths/KLU/UFconfig.h b/src/maths/KLU/UFconfig.h
new file mode 100644
index 000000000..4b36a9e31
--- /dev/null
+++ b/src/maths/KLU/UFconfig.h
@@ -0,0 +1,151 @@
+/* ========================================================================== */
+/* === UFconfig.h =========================================================== */
+/* ========================================================================== */
+
+/* Configuration file for SuiteSparse: a Suite of Sparse matrix packages
+ * (AMD, COLAMD, CCOLAMD, CAMD, CHOLMOD, UMFPACK, CXSparse, and others).
+ *
+ * UFconfig.h provides the definition of the long integer.  On most systems,
+ * a C program can be compiled in LP64 mode, in which long's and pointers are
+ * both 64-bits, and int's are 32-bits.  Windows 64, however, uses the LLP64
+ * model, in which int's and long's are 32-bits, and long long's and pointers
+ * are 64-bits.
+ *
+ * SuiteSparse packages that include long integer versions are
+ * intended for the LP64 mode.  However, as a workaround for Windows 64
+ * (and perhaps other systems), the long integer can be redefined.
+ *
+ * If _WIN64 is defined, then the __int64 type is used instead of long.
+ *
+ * The long integer can also be defined at compile time.  For example, this
+ * could be added to UFconfig.mk:
+ *
+ * CFLAGS = -O -D'UF_long=long long' -D'UF_long_max=9223372036854775801' \
+ *   -D'UF_long_idd="lld"'
+ *
+ * This file defines UF_long as either long (on all but _WIN64) or
+ * __int64 on Windows 64.  The intent is that a UF_long is always a 64-bit
+ * integer in a 64-bit code.  ptrdiff_t might be a better choice than long;
+ * it is always the same size as a pointer.
+ *
+ * This file also defines the SUITESPARSE_VERSION and related definitions.
+ *
+ * Copyright (c) 2007, University of Florida.  No licensing restrictions
+ * apply to this file or to the UFconfig directory.  Author: Timothy A. Davis.
+ */
+
+#ifndef _UFCONFIG_H
+#define _UFCONFIG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <limits.h>
+#include <stdlib.h>
+
+/* ========================================================================== */
+/* === UF_long ============================================================== */
+/* ========================================================================== */
+
+#ifndef UF_long
+
+#ifdef _WIN64
+
+#define UF_long __int64
+#define UF_long_max _I64_MAX
+#define UF_long_idd "I64d"
+
+#else
+
+#define UF_long long
+#define UF_long_max LONG_MAX
+#define UF_long_idd "ld"
+
+#endif
+#define UF_long_id "%" UF_long_idd
+#endif
+
+/* ========================================================================== */
+/* === UFconfig parameters and functions ==================================== */
+/* ========================================================================== */
+
+/* SuiteSparse-wide parameters will be placed in this struct.  So far, they
+   are only used by RBio. */
+
+typedef struct UFconfig_struct
+{
+    void *(*malloc_memory) (size_t) ;           /* pointer to malloc */
+    void *(*realloc_memory) (void *, size_t) ;  /* pointer to realloc */
+    void (*free_memory) (void *) ;              /* pointer to free */
+    void *(*calloc_memory) (size_t, size_t) ;   /* pointer to calloc */
+
+} UFconfig ;
+
+void *UFmalloc              /* pointer to allocated block of memory */
+(
+    size_t nitems,          /* number of items to malloc (>=1 is enforced) */
+    size_t size_of_item,    /* sizeof each item */
+    int *ok,                /* TRUE if successful, FALSE otherwise */
+    UFconfig *config        /* SuiteSparse-wide configuration */
+) ;
+
+void *UFfree                /* always returns NULL */
+(
+    void *p,                /* block to free */
+    UFconfig *config        /* SuiteSparse-wide configuration */
+) ;
+
+
+/* ========================================================================== */
+/* === SuiteSparse version ================================================== */
+/* ========================================================================== */
+
+/* SuiteSparse is not a package itself, but a collection of packages, some of
+ * which must be used together (UMFPACK requires AMD, CHOLMOD requires AMD,
+ * COLAMD, CAMD, and CCOLAMD, etc).  A version number is provided here for the
+ * collection itself.  The versions of packages within each version of
+ * SuiteSparse are meant to work together.  Combining one packge from one
+ * version of SuiteSparse, with another package from another version of
+ * SuiteSparse, may or may not work.
+ *
+ * SuiteSparse Version 3.7.0 contains the following packages:
+ *
+ *  UFconfig        version 3.7.0 (version always the same as SuiteSparse)
+ *  AMD             version 2.2.3
+ *  CAMD            version 2.2.3
+ *  CCOLAMD         version 2.7.4
+ *  COLAMD          version 2.7.4
+ *  BTF             version 1.1.3
+ *  CHOLMOD         version 1.7.4
+ *  CSparse3        version 3.0.2
+ *  CSparse         version 2.2.6
+ *  CXSparse        version 2.2.6
+ *  KLU             version 1.1.3
+ *  LDL             version 2.0.4
+ *  RBio            version 2.0.2
+ *  SPQR            version 1.2.3 (also called SuiteSparseQR)
+ *  UFcollection    version 1.6.0
+ *  UMFPACK         version 5.5.2
+ *  SSMULT          version 2.0.3
+ *  spqr_rank       version 1.0.0
+ *  MATLAB_Tools    various packages & M-files.  No specific version number.
+ *
+ * Other package dependencies:
+ *  BLAS            required by CHOLMOD and UMFPACK
+ *  LAPACK          required by CHOLMOD
+ *  METIS 4.0.1     required by CHOLMOD (optional) and KLU (optional)
+ */
+
+#define SUITESPARSE_DATE "Dec 15, 2011"
+#define SUITESPARSE_VER_CODE(main,sub) ((main) * 1000 + (sub))
+#define SUITESPARSE_MAIN_VERSION 3
+#define SUITESPARSE_SUB_VERSION 7
+#define SUITESPARSE_SUBSUB_VERSION 0
+#define SUITESPARSE_VERSION \
+    SUITESPARSE_VER_CODE(SUITESPARSE_MAIN_VERSION,SUITESPARSE_SUB_VERSION)
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/maths/KLU/amd.h b/src/maths/KLU/amd.h
new file mode 100644
index 000000000..2fcaef117
--- /dev/null
+++ b/src/maths/KLU/amd.h
@@ -0,0 +1,412 @@
+/* ========================================================================= */
+/* === AMD:  approximate minimum degree ordering =========================== */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD Version 2.2, Copyright (c) 2007 by Timothy A. Davis,                  */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* AMD finds a symmetric ordering P of a matrix A so that the Cholesky
+ * factorization of P*A*P' has fewer nonzeros and takes less work than the
+ * Cholesky factorization of A.  If A is not symmetric, then it performs its
+ * ordering on the matrix A+A'.  Two sets of user-callable routines are
+ * provided, one for int integers and the other for UF_long integers.
+ *
+ * The method is based on the approximate minimum degree algorithm, discussed
+ * in Amestoy, Davis, and Duff, "An approximate degree ordering algorithm",
+ * SIAM Journal of Matrix Analysis and Applications, vol. 17, no. 4, pp.
+ * 886-905, 1996.  This package can perform both the AMD ordering (with
+ * aggressive absorption), and the AMDBAR ordering (without aggressive
+ * absorption) discussed in the above paper.  This package differs from the
+ * Fortran codes discussed in the paper:
+ *
+ *       (1) it can ignore "dense" rows and columns, leading to faster run times
+ *       (2) it computes the ordering of A+A' if A is not symmetric
+ *       (3) it is followed by a depth-first post-ordering of the assembly tree
+ *           (or supernodal elimination tree)
+ *
+ * For historical reasons, the Fortran versions, amd.f and amdbar.f, have
+ * been left (nearly) unchanged.  They compute the identical ordering as
+ * described in the above paper.
+ */
+
+#ifndef AMD_H
+#define AMD_H
+
+/* make it easy for C++ programs to include AMD */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* get the definition of size_t: */
+#include <stddef.h>
+
+/* define UF_long */
+#include "UFconfig.h"
+
+int amd_order                  /* returns AMD_OK, AMD_OK_BUT_JUMBLED,
+                                * AMD_INVALID, or AMD_OUT_OF_MEMORY */
+(
+    int n,                     /* A is n-by-n.  n must be >= 0. */
+    const int Ap [ ],          /* column pointers for A, of size n+1 */
+    const int Ai [ ],          /* row indices of A, of size nz = Ap [n] */
+    int P [ ],                 /* output permutation, of size n */
+    double Control [ ],        /* input Control settings, of size AMD_CONTROL */
+    double Info [ ]            /* output Info statistics, of size AMD_INFO */
+) ;
+
+UF_long amd_l_order            /* see above for description of arguments */
+(
+    UF_long n,
+    const UF_long Ap [ ],
+    const UF_long Ai [ ],
+    UF_long P [ ],
+    double Control [ ],
+    double Info [ ]
+) ;
+
+/* Input arguments (not modified):
+ *
+ *       n: the matrix A is n-by-n.
+ *       Ap: an int/UF_long array of size n+1, containing column pointers of A.
+ *       Ai: an int/UF_long array of size nz, containing the row indices of A,
+ *           where nz = Ap [n].
+ *       Control:  a double array of size AMD_CONTROL, containing control
+ *           parameters.  Defaults are used if Control is NULL.
+ *
+ * Output arguments (not defined on input):
+ *
+ *       P: an int/UF_long array of size n, containing the output permutation. If
+ *           row i is the kth pivot row, then P [k] = i.  In MATLAB notation,
+ *           the reordered matrix is A (P,P).
+ *       Info: a double array of size AMD_INFO, containing statistical
+ *           information.  Ignored if Info is NULL.
+ *
+ * On input, the matrix A is stored in column-oriented form.  The row indices
+ * of nonzero entries in column j are stored in Ai [Ap [j] ... Ap [j+1]-1].
+ *
+ * If the row indices appear in ascending order in each column, and there
+ * are no duplicate entries, then amd_order is slightly more efficient in
+ * terms of time and memory usage.  If this condition does not hold, a copy
+ * of the matrix is created (where these conditions do hold), and the copy is
+ * ordered.  This feature is new to v2.0 (v1.2 and earlier required this
+ * condition to hold for the input matrix).
+ * 
+ * Row indices must be in the range 0 to
+ * n-1.  Ap [0] must be zero, and thus nz = Ap [n] is the number of nonzeros
+ * in A.  The array Ap is of size n+1, and the array Ai is of size nz = Ap [n].
+ * The matrix does not need to be symmetric, and the diagonal does not need to
+ * be present (if diagonal entries are present, they are ignored except for
+ * the output statistic Info [AMD_NZDIAG]).  The arrays Ai and Ap are not
+ * modified.  This form of the Ap and Ai arrays to represent the nonzero
+ * pattern of the matrix A is the same as that used internally by MATLAB.
+ * If you wish to use a more flexible input structure, please see the
+ * umfpack_*_triplet_to_col routines in the UMFPACK package, at
+ * http://www.cise.ufl.edu/research/sparse/umfpack.
+ *
+ * Restrictions:  n >= 0.  Ap [0] = 0.  Ap [j] <= Ap [j+1] for all j in the
+ *       range 0 to n-1.  nz = Ap [n] >= 0.  Ai [0..nz-1] must be in the range 0
+ *       to n-1.  Finally, Ai, Ap, and P must not be NULL.  If any of these
+ *       restrictions are not met, AMD returns AMD_INVALID.
+ *
+ * AMD returns:
+ *
+ *       AMD_OK if the matrix is valid and sufficient memory can be allocated to
+ *           perform the ordering.
+ *
+ *       AMD_OUT_OF_MEMORY if not enough memory can be allocated.
+ *
+ *       AMD_INVALID if the input arguments n, Ap, Ai are invalid, or if P is
+ *           NULL.
+ *
+ *       AMD_OK_BUT_JUMBLED if the matrix had unsorted columns, and/or duplicate
+ *           entries, but was otherwise valid.
+ *
+ * The AMD routine first forms the pattern of the matrix A+A', and then
+ * computes a fill-reducing ordering, P.  If P [k] = i, then row/column i of
+ * the original is the kth pivotal row.  In MATLAB notation, the permuted
+ * matrix is A (P,P), except that 0-based indexing is used instead of the
+ * 1-based indexing in MATLAB.
+ *
+ * The Control array is used to set various parameters for AMD.  If a NULL
+ * pointer is passed, default values are used.  The Control array is not
+ * modified.
+ *
+ *       Control [AMD_DENSE]:  controls the threshold for "dense" rows/columns.
+ *           A dense row/column in A+A' can cause AMD to spend a lot of time in
+ *           ordering the matrix.  If Control [AMD_DENSE] >= 0, rows/columns
+ *           with more than Control [AMD_DENSE] * sqrt (n) entries are ignored
+ *           during the ordering, and placed last in the output order.  The
+ *           default value of Control [AMD_DENSE] is 10.  If negative, no
+ *           rows/columns are treated as "dense".  Rows/columns with 16 or
+ *           fewer off-diagonal entries are never considered "dense".
+ *
+ *       Control [AMD_AGGRESSIVE]: controls whether or not to use aggressive
+ *           absorption, in which a prior element is absorbed into the current
+ *           element if is a subset of the current element, even if it is not
+ *           adjacent to the current pivot element (refer to Amestoy, Davis,
+ *           & Duff, 1996, for more details).  The default value is nonzero,
+ *           which means to perform aggressive absorption.  This nearly always
+ *           leads to a better ordering (because the approximate degrees are
+ *           more accurate) and a lower execution time.  There are cases where
+ *           it can lead to a slightly worse ordering, however.  To turn it off,
+ *           set Control [AMD_AGGRESSIVE] to 0.
+ *
+ *       Control [2..4] are not used in the current version, but may be used in
+ *           future versions.
+ *
+ * The Info array provides statistics about the ordering on output.  If it is
+ * not present, the statistics are not returned.  This is not an error
+ * condition.
+ * 
+ *       Info [AMD_STATUS]:  the return value of AMD, either AMD_OK,
+ *           AMD_OK_BUT_JUMBLED, AMD_OUT_OF_MEMORY, or AMD_INVALID.
+ *
+ *       Info [AMD_N]: n, the size of the input matrix
+ *
+ *       Info [AMD_NZ]: the number of nonzeros in A, nz = Ap [n]
+ *
+ *       Info [AMD_SYMMETRY]:  the symmetry of the matrix A.  It is the number
+ *           of "matched" off-diagonal entries divided by the total number of
+ *           off-diagonal entries.  An entry A(i,j) is matched if A(j,i) is also
+ *           an entry, for any pair (i,j) for which i != j.  In MATLAB notation,
+ *                S = spones (A) ;
+ *                B = tril (S, -1) + triu (S, 1) ;
+ *                symmetry = nnz (B & B') / nnz (B) ;
+ *
+ *       Info [AMD_NZDIAG]: the number of entries on the diagonal of A.
+ *
+ *       Info [AMD_NZ_A_PLUS_AT]:  the number of nonzeros in A+A', excluding the
+ *           diagonal.  If A is perfectly symmetric (Info [AMD_SYMMETRY] = 1)
+ *           with a fully nonzero diagonal, then Info [AMD_NZ_A_PLUS_AT] = nz-n
+ *           (the smallest possible value).  If A is perfectly unsymmetric
+ *           (Info [AMD_SYMMETRY] = 0, for an upper triangular matrix, for
+ *           example) with no diagonal, then Info [AMD_NZ_A_PLUS_AT] = 2*nz
+ *           (the largest possible value).
+ *
+ *       Info [AMD_NDENSE]: the number of "dense" rows/columns of A+A' that were
+ *           removed from A prior to ordering.  These are placed last in the
+ *           output order P.
+ *
+ *       Info [AMD_MEMORY]: the amount of memory used by AMD, in bytes.  In the
+ *           current version, this is 1.2 * Info  [AMD_NZ_A_PLUS_AT] + 9*n
+ *           times the size of an integer.  This is at most 2.4nz + 9n.  This
+ *           excludes the size of the input arguments Ai, Ap, and P, which have
+ *           a total size of nz + 2*n + 1 integers.
+ *
+ *       Info [AMD_NCMPA]: the number of garbage collections performed.
+ *
+ *       Info [AMD_LNZ]: the number of nonzeros in L (excluding the diagonal).
+ *           This is a slight upper bound because mass elimination is combined
+ *           with the approximate degree update.  It is a rough upper bound if
+ *           there are many "dense" rows/columns.  The rest of the statistics,
+ *           below, are also slight or rough upper bounds, for the same reasons.
+ *           The post-ordering of the assembly tree might also not exactly
+ *           correspond to a true elimination tree postordering.
+ *
+ *       Info [AMD_NDIV]: the number of divide operations for a subsequent LDL'
+ *           or LU factorization of the permuted matrix A (P,P).
+ *
+ *       Info [AMD_NMULTSUBS_LDL]:  the number of multiply-subtract pairs for a
+ *           subsequent LDL' factorization of A (P,P).
+ *
+ *       Info [AMD_NMULTSUBS_LU]:  the number of multiply-subtract pairs for a
+ *           subsequent LU factorization of A (P,P), assuming that no numerical
+ *           pivoting is required.
+ *
+ *       Info [AMD_DMAX]:  the maximum number of nonzeros in any column of L,
+ *           including the diagonal.
+ *
+ *       Info [14..19] are not used in the current version, but may be used in
+ *           future versions.
+ */    
+
+/* ------------------------------------------------------------------------- */
+/* direct interface to AMD */
+/* ------------------------------------------------------------------------- */
+
+/* amd_2 is the primary AMD ordering routine.  It is not meant to be
+ * user-callable because of its restrictive inputs and because it destroys
+ * the user's input matrix.  It does not check its inputs for errors, either.
+ * However, if you can work with these restrictions it can be faster than
+ * amd_order and use less memory (assuming that you can create your own copy
+ * of the matrix for AMD to destroy).  Refer to AMD/Source/amd_2.c for a
+ * description of each parameter. */
+
+void amd_2
+(
+    int n,
+    int Pe [ ],
+    int Iw [ ],
+    int Len [ ],
+    int iwlen,
+    int pfree,
+    int Nv [ ],
+    int Next [ ], 
+    int Last [ ],
+    int Head [ ],
+    int Elen [ ],
+    int Degree [ ],
+    int W [ ],
+    double Control [ ],
+    double Info [ ]
+) ;
+
+void amd_l2
+(
+    UF_long n,
+    UF_long Pe [ ],
+    UF_long Iw [ ],
+    UF_long Len [ ],
+    UF_long iwlen,
+    UF_long pfree,
+    UF_long Nv [ ],
+    UF_long Next [ ], 
+    UF_long Last [ ],
+    UF_long Head [ ],
+    UF_long Elen [ ],
+    UF_long Degree [ ],
+    UF_long W [ ],
+    double Control [ ],
+    double Info [ ]
+) ;
+
+/* ------------------------------------------------------------------------- */
+/* amd_valid */
+/* ------------------------------------------------------------------------- */
+
+/* Returns AMD_OK or AMD_OK_BUT_JUMBLED if the matrix is valid as input to
+ * amd_order; the latter is returned if the matrix has unsorted and/or
+ * duplicate row indices in one or more columns.  Returns AMD_INVALID if the
+ * matrix cannot be passed to amd_order.  For amd_order, the matrix must also
+ * be square.  The first two arguments are the number of rows and the number
+ * of columns of the matrix.  For its use in AMD, these must both equal n.
+ *
+ * NOTE: this routine returned TRUE/FALSE in v1.2 and earlier.
+ */
+
+int amd_valid
+(
+    int n_row,                 /* # of rows */
+    int n_col,                 /* # of columns */
+    const int Ap [ ],          /* column pointers, of size n_col+1 */
+    const int Ai [ ]           /* row indices, of size Ap [n_col] */
+) ;
+
+UF_long amd_l_valid
+(
+    UF_long n_row,
+    UF_long n_col,
+    const UF_long Ap [ ],
+    const UF_long Ai [ ]
+) ;
+
+/* ------------------------------------------------------------------------- */
+/* AMD memory manager and printf routines */
+/* ------------------------------------------------------------------------- */
+
+/* The user can redefine these to change the malloc, free, and printf routines
+ * that AMD uses. */
+
+#ifndef EXTERN
+#define EXTERN extern
+#endif
+
+EXTERN void *(*amd_malloc) (size_t) ;                     /* pointer to malloc */
+EXTERN void (*amd_free) (void *) ;               /* pointer to free */
+EXTERN void *(*amd_realloc) (void *, size_t) ;            /* pointer to realloc */
+EXTERN void *(*amd_calloc) (size_t, size_t) ;             /* pointer to calloc */
+EXTERN int (*amd_printf) (const char *, ...) ;            /* pointer to printf */
+
+/* ------------------------------------------------------------------------- */
+/* AMD Control and Info arrays */
+/* ------------------------------------------------------------------------- */
+
+/* amd_defaults:  sets the default control settings */
+void amd_defaults   (double Control [ ]) ;
+void amd_l_defaults (double Control [ ]) ;
+
+/* amd_control: prints the control settings */
+void amd_control    (double Control [ ]) ;
+void amd_l_control  (double Control [ ]) ;
+
+/* amd_info: prints the statistics */
+void amd_info       (double Info [ ]) ;
+void amd_l_info     (double Info [ ]) ;
+
+#define AMD_CONTROL 5          /* size of Control array */
+#define AMD_INFO 20            /* size of Info array */
+
+/* contents of Control */
+#define AMD_DENSE 0            /* "dense" if degree > Control [0] * sqrt (n) */
+#define AMD_AGGRESSIVE 1    /* do aggressive absorption if Control [1] != 0 */
+
+/* default Control settings */
+#define AMD_DEFAULT_DENSE 10.0          /* default "dense" degree 10*sqrt(n) */
+#define AMD_DEFAULT_AGGRESSIVE 1    /* do aggressive absorption by default */
+
+/* contents of Info */
+#define AMD_STATUS 0           /* return value of amd_order and amd_l_order */
+#define AMD_N 1                /* A is n-by-n */
+#define AMD_NZ 2      /* number of nonzeros in A */ 
+#define AMD_SYMMETRY 3         /* symmetry of pattern (1 is sym., 0 is unsym.) */
+#define AMD_NZDIAG 4           /* # of entries on diagonal */
+#define AMD_NZ_A_PLUS_AT 5  /* nz in A+A' */
+#define AMD_NDENSE 6           /* number of "dense" rows/columns in A */
+#define AMD_MEMORY 7           /* amount of memory used by AMD */
+#define AMD_NCMPA 8            /* number of garbage collections in AMD */
+#define AMD_LNZ 9     /* approx. nz in L, excluding the diagonal */
+#define AMD_NDIV 10            /* number of fl. point divides for LU and LDL' */
+#define AMD_NMULTSUBS_LDL 11 /* number of fl. point (*,-) pairs for LDL' */
+#define AMD_NMULTSUBS_LU 12  /* number of fl. point (*,-) pairs for LU */
+#define AMD_DMAX 13             /* max nz. in any column of L, incl. diagonal */
+
+/* ------------------------------------------------------------------------- */
+/* return values of AMD */
+/* ------------------------------------------------------------------------- */
+
+#define AMD_OK 0           /* success */
+#define AMD_OUT_OF_MEMORY -1        /* malloc failed, or problem too large */
+#define AMD_INVALID -2              /* input arguments are not valid */
+#define AMD_OK_BUT_JUMBLED 1        /* input matrix is OK for amd_order, but
+    * columns were not sorted, and/or duplicate entries were present.  AMD had
+    * to do extra work before ordering the matrix.  This is a warning, not an
+    * error.  */
+
+/* ========================================================================== */
+/* === AMD version ========================================================== */
+/* ========================================================================== */
+
+/* AMD Version 1.2 and later include the following definitions.
+ * As an example, to test if the version you are using is 1.2 or later:
+ *
+ * #ifdef AMD_VERSION
+ *       if (AMD_VERSION >= AMD_VERSION_CODE (1,2)) ...
+ * #endif
+ *
+ * This also works during compile-time:
+ *
+ *       #if defined(AMD_VERSION) && (AMD_VERSION >= AMD_VERSION_CODE (1,2))
+ *           printf ("This is version 1.2 or later\n") ;
+ *       #else
+ *           printf ("This is an early version\n") ;
+ *       #endif
+ *
+ * Versions 1.1 and earlier of AMD do not include a #define'd version number.
+ */
+
+#define AMD_DATE "Dec 7, 2011"
+#define AMD_VERSION_CODE(main,sub) ((main) * 1000 + (sub))
+#define AMD_MAIN_VERSION 2
+#define AMD_SUB_VERSION 2
+#define AMD_SUBSUB_VERSION 3
+#define AMD_VERSION AMD_VERSION_CODE(AMD_MAIN_VERSION,AMD_SUB_VERSION)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/maths/KLU/amd_1.c b/src/maths/KLU/amd_1.c
new file mode 100644
index 000000000..30cb27724
--- /dev/null
+++ b/src/maths/KLU/amd_1.c
@@ -0,0 +1,181 @@
+/* ========================================================================= */
+/* === AMD_1 =============================================================== */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* AMD_1: Construct A+A' for a sparse matrix A and perform the AMD ordering.
+ *
+ * The n-by-n sparse matrix A can be unsymmetric.  It is stored in MATLAB-style
+ * compressed-column form, with sorted row indices in each column, and no
+ * duplicate entries.  Diagonal entries may be present, but they are ignored.
+ * Row indices of column j of A are stored in Ai [Ap [j] ... Ap [j+1]-1].
+ * Ap [0] must be zero, and nz = Ap [n] is the number of entries in A.  The
+ * size of the matrix, n, must be greater than or equal to zero.
+ *
+ * This routine must be preceded by a call to AMD_aat, which computes the
+ * number of entries in each row/column in A+A', excluding the diagonal.
+ * Len [j], on input, is the number of entries in row/column j of A+A'.  This
+ * routine constructs the matrix A+A' and then calls AMD_2.  No error checking
+ * is performed (this was done in AMD_valid).
+ */
+
+#include "amd_internal.h"
+
+GLOBAL void AMD_1
+(
+    Int n,		/* n > 0 */
+    const Int Ap [ ],	/* input of size n+1, not modified */
+    const Int Ai [ ],	/* input of size nz = Ap [n], not modified */
+    Int P [ ],		/* size n output permutation */
+    Int Pinv [ ],	/* size n output inverse permutation */
+    Int Len [ ],	/* size n input, undefined on output */
+    Int slen,		/* slen >= sum (Len [0..n-1]) + 7n,
+			 * ideally slen = 1.2 * sum (Len) + 8n */
+    Int S [ ],		/* size slen workspace */
+    double Control [ ],	/* input array of size AMD_CONTROL */
+    double Info [ ]	/* output array of size AMD_INFO */
+)
+{
+    Int i, j, k, p, pfree, iwlen, pj, p1, p2, pj2, *Iw, *Pe, *Nv, *Head,
+	*Elen, *Degree, *s, *W, *Sp, *Tp ;
+
+    /* --------------------------------------------------------------------- */
+    /* construct the matrix for AMD_2 */
+    /* --------------------------------------------------------------------- */
+
+    ASSERT (n > 0) ;
+
+    iwlen = slen - 6*n ;
+    s = S ;
+    Pe = s ;	    s += n ;
+    Nv = s ;	    s += n ;
+    Head = s ;	    s += n ;
+    Elen = s ;	    s += n ;
+    Degree = s ;    s += n ;
+    W = s ;	    s += n ;
+    Iw = s ;	    s += iwlen ;
+
+    ASSERT (AMD_valid (n, n, Ap, Ai) == AMD_OK) ;
+
+    /* construct the pointers for A+A' */
+    Sp = Nv ;			/* use Nv and W as workspace for Sp and Tp [ */
+    Tp = W ;
+    pfree = 0 ;
+    for (j = 0 ; j < n ; j++)
+    {
+	Pe [j] = pfree ;
+	Sp [j] = pfree ;
+	pfree += Len [j] ;
+    }
+
+    /* Note that this restriction on iwlen is slightly more restrictive than
+     * what is strictly required in AMD_2.  AMD_2 can operate with no elbow
+     * room at all, but it will be very slow.  For better performance, at
+     * least size-n elbow room is enforced. */
+    ASSERT (iwlen >= pfree + n) ;
+
+#ifndef NDEBUG
+    for (p = 0 ; p < iwlen ; p++) Iw [p] = EMPTY ;
+#endif
+
+    for (k = 0 ; k < n ; k++)
+    {
+	AMD_DEBUG1 (("Construct row/column k= "ID" of A+A'\n", k))  ;
+	p1 = Ap [k] ;
+	p2 = Ap [k+1] ;
+
+	/* construct A+A' */
+	for (p = p1 ; p < p2 ; )
+	{
+	    /* scan the upper triangular part of A */
+	    j = Ai [p] ;
+	    ASSERT (j >= 0 && j < n) ;
+	    if (j < k)
+	    {
+		/* entry A (j,k) in the strictly upper triangular part */
+		ASSERT (Sp [j] < (j == n-1 ? pfree : Pe [j+1])) ;
+		ASSERT (Sp [k] < (k == n-1 ? pfree : Pe [k+1])) ;
+		Iw [Sp [j]++] = k ;
+		Iw [Sp [k]++] = j ;
+		p++ ;
+	    }
+	    else if (j == k)
+	    {
+		/* skip the diagonal */
+		p++ ;
+		break ;
+	    }
+	    else /* j > k */
+	    {
+		/* first entry below the diagonal */
+		break ;
+	    }
+	    /* scan lower triangular part of A, in column j until reaching
+	     * row k.  Start where last scan left off. */
+	    ASSERT (Ap [j] <= Tp [j] && Tp [j] <= Ap [j+1]) ;
+	    pj2 = Ap [j+1] ;
+	    for (pj = Tp [j] ; pj < pj2 ; )
+	    {
+		i = Ai [pj] ;
+		ASSERT (i >= 0 && i < n) ;
+		if (i < k)
+		{
+		    /* A (i,j) is only in the lower part, not in upper */
+		    ASSERT (Sp [i] < (i == n-1 ? pfree : Pe [i+1])) ;
+		    ASSERT (Sp [j] < (j == n-1 ? pfree : Pe [j+1])) ;
+		    Iw [Sp [i]++] = j ;
+		    Iw [Sp [j]++] = i ;
+		    pj++ ;
+		}
+		else if (i == k)
+		{
+		    /* entry A (k,j) in lower part and A (j,k) in upper */
+		    pj++ ;
+		    break ;
+		}
+		else /* i > k */
+		{
+		    /* consider this entry later, when k advances to i */
+		    break ;
+		}
+	    }
+	    Tp [j] = pj ;
+	}
+	Tp [k] = p ;
+    }
+
+    /* clean up, for remaining mismatched entries */
+    for (j = 0 ; j < n ; j++)
+    {
+	for (pj = Tp [j] ; pj < Ap [j+1] ; pj++)
+	{
+	    i = Ai [pj] ;
+	    ASSERT (i >= 0 && i < n) ;
+	    /* A (i,j) is only in the lower part, not in upper */
+	    ASSERT (Sp [i] < (i == n-1 ? pfree : Pe [i+1])) ;
+	    ASSERT (Sp [j] < (j == n-1 ? pfree : Pe [j+1])) ;
+	    Iw [Sp [i]++] = j ;
+	    Iw [Sp [j]++] = i ;
+	}
+    }
+
+#ifndef NDEBUG
+    for (j = 0 ; j < n-1 ; j++) ASSERT (Sp [j] == Pe [j+1]) ;
+    ASSERT (Sp [n-1] == pfree) ;
+#endif
+
+    /* Tp and Sp no longer needed ] */
+
+    /* --------------------------------------------------------------------- */
+    /* order the matrix */
+    /* --------------------------------------------------------------------- */
+
+    AMD_2 (n, Pe, Iw, Len, iwlen, pfree,
+	Nv, Pinv, P, Head, Elen, Degree, W, Control, Info) ;
+}
diff --git a/src/maths/KLU/amd_2.c b/src/maths/KLU/amd_2.c
new file mode 100644
index 000000000..97b4f7a36
--- /dev/null
+++ b/src/maths/KLU/amd_2.c
@@ -0,0 +1,1842 @@
+/* ========================================================================= */
+/* === AMD_2 =============================================================== */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* AMD_2:  performs the AMD ordering on a symmetric sparse matrix A, followed
+ * by a postordering (via depth-first search) of the assembly tree using the
+ * AMD_postorder routine.
+ */
+
+#include "amd_internal.h"
+
+/* ========================================================================= */
+/* === clear_flag ========================================================== */
+/* ========================================================================= */
+
+static Int clear_flag (Int wflg, Int wbig, Int W [ ], Int n)
+{
+    Int x ;
+    if (wflg < 2 || wflg >= wbig)
+    {
+	for (x = 0 ; x < n ; x++)
+	{
+	    if (W [x] != 0) W [x] = 1 ;
+	}
+	wflg = 2 ;
+    }
+    /*  at this point, W [0..n-1] < wflg holds */
+    return (wflg) ;
+}
+
+
+/* ========================================================================= */
+/* === AMD_2 =============================================================== */
+/* ========================================================================= */
+
+GLOBAL void AMD_2
+(
+    Int n,		/* A is n-by-n, where n > 0 */
+    Int Pe [ ],		/* Pe [0..n-1]: index in Iw of row i on input */
+    Int Iw [ ],		/* workspace of size iwlen. Iw [0..pfree-1]
+			 * holds the matrix on input */
+    Int Len [ ],	/* Len [0..n-1]: length for row/column i on input */
+    Int iwlen,		/* length of Iw. iwlen >= pfree + n */
+    Int pfree,		/* Iw [pfree ... iwlen-1] is empty on input */
+
+    /* 7 size-n workspaces, not defined on input: */
+    Int Nv [ ],		/* the size of each supernode on output */
+    Int Next [ ],	/* the output inverse permutation */
+    Int Last [ ],	/* the output permutation */
+    Int Head [ ],
+    Int Elen [ ],	/* the size columns of L for each supernode */
+    Int Degree [ ],
+    Int W [ ],
+
+    /* control parameters and output statistics */
+    double Control [ ],	/* array of size AMD_CONTROL */
+    double Info [ ]	/* array of size AMD_INFO */
+)
+{
+
+/*
+ * Given a representation of the nonzero pattern of a symmetric matrix, A,
+ * (excluding the diagonal) perform an approximate minimum (UMFPACK/MA38-style)
+ * degree ordering to compute a pivot order such that the introduction of
+ * nonzeros (fill-in) in the Cholesky factors A = LL' is kept low.  At each
+ * step, the pivot selected is the one with the minimum UMFAPACK/MA38-style
+ * upper-bound on the external degree.  This routine can optionally perform
+ * aggresive absorption (as done by MC47B in the Harwell Subroutine
+ * Library).
+ *
+ * The approximate degree algorithm implemented here is the symmetric analog of
+ * the degree update algorithm in MA38 and UMFPACK (the Unsymmetric-pattern
+ * MultiFrontal PACKage, both by Davis and Duff).  The routine is based on the
+ * MA27 minimum degree ordering algorithm by Iain Duff and John Reid.
+ *
+ * This routine is a translation of the original AMDBAR and MC47B routines,
+ * in Fortran, with the following modifications:
+ *
+ * (1) dense rows/columns are removed prior to ordering the matrix, and placed
+ *	last in the output order.  The presence of a dense row/column can
+ *	increase the ordering time by up to O(n^2), unless they are removed
+ *	prior to ordering.
+ *
+ * (2) the minimum degree ordering is followed by a postordering (depth-first
+ *	search) of the assembly tree.  Note that mass elimination (discussed
+ *	below) combined with the approximate degree update can lead to the mass
+ *	elimination of nodes with lower exact degree than the current pivot
+ *	element.  No additional fill-in is caused in the representation of the
+ *	Schur complement.  The mass-eliminated nodes merge with the current
+ *	pivot element.  They are ordered prior to the current pivot element.
+ *	Because they can have lower exact degree than the current element, the
+ *	merger of two or more of these nodes in the current pivot element can
+ *	lead to a single element that is not a "fundamental supernode".  The
+ *	diagonal block can have zeros in it.  Thus, the assembly tree used here
+ *	is not guaranteed to be the precise supernodal elemination tree (with
+ *	"funadmental" supernodes), and the postordering performed by this
+ *	routine is not guaranteed to be a precise postordering of the
+ *	elimination tree.
+ *
+ * (3) input parameters are added, to control aggressive absorption and the
+ *	detection of "dense" rows/columns of A.
+ *
+ * (4) additional statistical information is returned, such as the number of
+ *	nonzeros in L, and the flop counts for subsequent LDL' and LU
+ *	factorizations.  These are slight upper bounds, because of the mass
+ *	elimination issue discussed above.
+ *
+ * (5) additional routines are added to interface this routine to MATLAB
+ *	to provide a simple C-callable user-interface, to check inputs for
+ *	errors, compute the symmetry of the pattern of A and the number of
+ *	nonzeros in each row/column of A+A', to compute the pattern of A+A',
+ *	to perform the assembly tree postordering, and to provide debugging
+ *	ouput.  Many of these functions are also provided by the Fortran
+ *	Harwell Subroutine Library routine MC47A.
+ *
+ * (6) both int and UF_long versions are provided.  In the descriptions below
+ *	and integer is and int or UF_long depending on which version is
+ *	being used.
+
+ **********************************************************************
+ ***** CAUTION:  ARGUMENTS ARE NOT CHECKED FOR ERRORS ON INPUT.  ******
+ **********************************************************************
+ ** If you want error checking, a more versatile input format, and a **
+ ** simpler user interface, use amd_order or amd_l_order instead.    **
+ ** This routine is not meant to be user-callable.                   **
+ **********************************************************************
+
+ * ----------------------------------------------------------------------------
+ * References:
+ * ----------------------------------------------------------------------------
+ *
+ *  [1] Timothy A. Davis and Iain Duff, "An unsymmetric-pattern multifrontal
+ *	method for sparse LU factorization", SIAM J. Matrix Analysis and
+ *	Applications, vol. 18, no. 1, pp. 140-158.  Discusses UMFPACK / MA38,
+ *	which first introduced the approximate minimum degree used by this
+ *	routine.
+ *
+ *  [2] Patrick Amestoy, Timothy A. Davis, and Iain S. Duff, "An approximate
+ *	minimum degree ordering algorithm," SIAM J. Matrix Analysis and
+ *	Applications, vol. 17, no. 4, pp. 886-905, 1996.  Discusses AMDBAR and
+ *	MC47B, which are the Fortran versions of this routine.
+ *
+ *  [3] Alan George and Joseph Liu, "The evolution of the minimum degree
+ *	ordering algorithm," SIAM Review, vol. 31, no. 1, pp. 1-19, 1989.
+ *	We list below the features mentioned in that paper that this code
+ *	includes:
+ *
+ *	mass elimination:
+ *	    Yes.  MA27 relied on supervariable detection for mass elimination.
+ *
+ *	indistinguishable nodes:
+ *	    Yes (we call these "supervariables").  This was also in the MA27
+ *	    code - although we modified the method of detecting them (the
+ *	    previous hash was the true degree, which we no longer keep track
+ *	    of).  A supervariable is a set of rows with identical nonzero
+ *	    pattern.  All variables in a supervariable are eliminated together.
+ *	    Each supervariable has as its numerical name that of one of its
+ *	    variables (its principal variable).
+ *
+ *	quotient graph representation:
+ *	    Yes.  We use the term "element" for the cliques formed during
+ *	    elimination.  This was also in the MA27 code.  The algorithm can
+ *	    operate in place, but it will work more efficiently if given some
+ *	    "elbow room."
+ *
+ *	element absorption:
+ *	    Yes.  This was also in the MA27 code.
+ *
+ *	external degree:
+ *	    Yes.  The MA27 code was based on the true degree.
+ *
+ *	incomplete degree update and multiple elimination:
+ *	    No.  This was not in MA27, either.  Our method of degree update
+ *	    within MC47B is element-based, not variable-based.  It is thus
+ *	    not well-suited for use with incomplete degree update or multiple
+ *	    elimination.
+ *
+ * Authors, and Copyright (C) 2004 by:
+ * Timothy A. Davis, Patrick Amestoy, Iain S. Duff, John K. Reid.
+ *
+ * Acknowledgements: This work (and the UMFPACK package) was supported by the
+ * National Science Foundation (ASC-9111263, DMS-9223088, and CCR-0203270).
+ * The UMFPACK/MA38 approximate degree update algorithm, the unsymmetric analog
+ * which forms the basis of AMD, was developed while Tim Davis was supported by
+ * CERFACS (Toulouse, France) in a post-doctoral position.  This C version, and
+ * the etree postorder, were written while Tim Davis was on sabbatical at
+ * Stanford University and Lawrence Berkeley National Laboratory.
+
+ * ----------------------------------------------------------------------------
+ * INPUT ARGUMENTS (unaltered):
+ * ----------------------------------------------------------------------------
+
+ * n:  The matrix order.  Restriction:  n >= 1.
+ *
+ * iwlen:  The size of the Iw array.  On input, the matrix is stored in
+ *	Iw [0..pfree-1].  However, Iw [0..iwlen-1] should be slightly larger
+ *	than what is required to hold the matrix, at least iwlen >= pfree + n.
+ *	Otherwise, excessive compressions will take place.  The recommended
+ *	value of iwlen is 1.2 * pfree + n, which is the value used in the
+ *	user-callable interface to this routine (amd_order.c).  The algorithm
+ *	will not run at all if iwlen < pfree.  Restriction: iwlen >= pfree + n.
+ *	Note that this is slightly more restrictive than the actual minimum
+ *	(iwlen >= pfree), but AMD_2 will be very slow with no elbow room.
+ *	Thus, this routine enforces a bare minimum elbow room of size n.
+ *
+ * pfree: On input the tail end of the array, Iw [pfree..iwlen-1], is empty,
+ *	and the matrix is stored in Iw [0..pfree-1].  During execution,
+ *	additional data is placed in Iw, and pfree is modified so that
+ *	Iw [pfree..iwlen-1] is always the unused part of Iw.
+ *
+ * Control:  A double array of size AMD_CONTROL containing input parameters
+ *	that affect how the ordering is computed.  If NULL, then default
+ *	settings are used.
+ *
+ *	Control [AMD_DENSE] is used to determine whether or not a given input
+ *	row is "dense".  A row is "dense" if the number of entries in the row
+ *	exceeds Control [AMD_DENSE] times sqrt (n), except that rows with 16 or
+ *	fewer entries are never considered "dense".  To turn off the detection
+ *	of dense rows, set Control [AMD_DENSE] to a negative number, or to a
+ *	number larger than sqrt (n).  The default value of Control [AMD_DENSE]
+ *	is AMD_DEFAULT_DENSE, which is defined in amd.h as 10.
+ *
+ *	Control [AMD_AGGRESSIVE] is used to determine whether or not aggressive
+ *	absorption is to be performed.  If nonzero, then aggressive absorption
+ *	is performed (this is the default).
+
+ * ----------------------------------------------------------------------------
+ * INPUT/OUPUT ARGUMENTS:
+ * ----------------------------------------------------------------------------
+ *
+ * Pe:  An integer array of size n.  On input, Pe [i] is the index in Iw of
+ *	the start of row i.  Pe [i] is ignored if row i has no off-diagonal
+ *	entries.  Thus Pe [i] must be in the range 0 to pfree-1 for non-empty
+ *	rows.
+ *
+ *	During execution, it is used for both supervariables and elements:
+ *
+ *	Principal supervariable i:  index into Iw of the description of
+ *	    supervariable i.  A supervariable represents one or more rows of
+ *	    the matrix with identical nonzero pattern.  In this case,
+ *	    Pe [i] >= 0.
+ *
+ *	Non-principal supervariable i:  if i has been absorbed into another
+ *	    supervariable j, then Pe [i] = FLIP (j), where FLIP (j) is defined
+ *	    as (-(j)-2).  Row j has the same pattern as row i.  Note that j
+ *	    might later be absorbed into another supervariable j2, in which
+ *	    case Pe [i] is still FLIP (j), and Pe [j] = FLIP (j2) which is
+ *	    < EMPTY, where EMPTY is defined as (-1) in amd_internal.h.
+ *
+ *	Unabsorbed element e:  the index into Iw of the description of element
+ *	    e, if e has not yet been absorbed by a subsequent element.  Element
+ *	    e is created when the supervariable of the same name is selected as
+ *	    the pivot.  In this case, Pe [i] >= 0.
+ *
+ *	Absorbed element e:  if element e is absorbed into element e2, then
+ *	    Pe [e] = FLIP (e2).  This occurs when the pattern of e (which we
+ *	    refer to as Le) is found to be a subset of the pattern of e2 (that
+ *	    is, Le2).  In this case, Pe [i] < EMPTY.  If element e is "null"
+ *	    (it has no nonzeros outside its pivot block), then Pe [e] = EMPTY,
+ *	    and e is the root of an assembly subtree (or the whole tree if
+ *	    there is just one such root).
+ *
+ *	Dense variable i:  if i is "dense", then Pe [i] = EMPTY.
+ *
+ *	On output, Pe holds the assembly tree/forest, which implicitly
+ *	represents a pivot order with identical fill-in as the actual order
+ *	(via a depth-first search of the tree), as follows.  If Nv [i] > 0,
+ *	then i represents a node in the assembly tree, and the parent of i is
+ *	Pe [i], or EMPTY if i is a root.  If Nv [i] = 0, then (i, Pe [i])
+ *	represents an edge in a subtree, the root of which is a node in the
+ *	assembly tree.  Note that i refers to a row/column in the original
+ *	matrix, not the permuted matrix.
+ *
+ * Info:  A double array of size AMD_INFO.  If present, (that is, not NULL),
+ *	then statistics about the ordering are returned in the Info array.
+ *	See amd.h for a description.
+
+ * ----------------------------------------------------------------------------
+ * INPUT/MODIFIED (undefined on output):
+ * ----------------------------------------------------------------------------
+ *
+ * Len:  An integer array of size n.  On input, Len [i] holds the number of
+ *	entries in row i of the matrix, excluding the diagonal.  The contents
+ *	of Len are undefined on output.
+ *
+ * Iw:  An integer array of size iwlen.  On input, Iw [0..pfree-1] holds the
+ *	description of each row i in the matrix.  The matrix must be symmetric,
+ *	and both upper and lower triangular parts must be present.  The
+ *	diagonal must not be present.  Row i is held as follows:
+ *
+ *	    Len [i]:  the length of the row i data structure in the Iw array.
+ *	    Iw [Pe [i] ... Pe [i] + Len [i] - 1]:
+ *		the list of column indices for nonzeros in row i (simple
+ *		supervariables), excluding the diagonal.  All supervariables
+ *		start with one row/column each (supervariable i is just row i).
+ *		If Len [i] is zero on input, then Pe [i] is ignored on input.
+ *
+ *	    Note that the rows need not be in any particular order, and there
+ *	    may be empty space between the rows.
+ *
+ *	During execution, the supervariable i experiences fill-in.  This is
+ *	represented by placing in i a list of the elements that cause fill-in
+ *	in supervariable i:
+ *
+ *	    Len [i]:  the length of supervariable i in the Iw array.
+ *	    Iw [Pe [i] ... Pe [i] + Elen [i] - 1]:
+ *		the list of elements that contain i.  This list is kept short
+ *		by removing absorbed elements.
+ *	    Iw [Pe [i] + Elen [i] ... Pe [i] + Len [i] - 1]:
+ *		the list of supervariables in i.  This list is kept short by
+ *		removing nonprincipal variables, and any entry j that is also
+ *		contained in at least one of the elements (j in Le) in the list
+ *		for i (e in row i).
+ *
+ *	When supervariable i is selected as pivot, we create an element e of
+ *	the same name (e=i):
+ *
+ *	    Len [e]:  the length of element e in the Iw array.
+ *	    Iw [Pe [e] ... Pe [e] + Len [e] - 1]:
+ *		the list of supervariables in element e.
+ *
+ *	An element represents the fill-in that occurs when supervariable i is
+ *	selected as pivot (which represents the selection of row i and all
+ *	non-principal variables whose principal variable is i).  We use the
+ *	term Le to denote the set of all supervariables in element e.  Absorbed
+ *	supervariables and elements are pruned from these lists when
+ *	computationally convenient.
+ *
+ *  CAUTION:  THE INPUT MATRIX IS OVERWRITTEN DURING COMPUTATION.
+ *  The contents of Iw are undefined on output.
+
+ * ----------------------------------------------------------------------------
+ * OUTPUT (need not be set on input):
+ * ----------------------------------------------------------------------------
+ *
+ * Nv:  An integer array of size n.  During execution, ABS (Nv [i]) is equal to
+ *	the number of rows that are represented by the principal supervariable
+ *	i.  If i is a nonprincipal or dense variable, then Nv [i] = 0.
+ *	Initially, Nv [i] = 1 for all i.  Nv [i] < 0 signifies that i is a
+ *	principal variable in the pattern Lme of the current pivot element me.
+ *	After element me is constructed, Nv [i] is set back to a positive
+ *	value.
+ *
+ *	On output, Nv [i] holds the number of pivots represented by super
+ *	row/column i of the original matrix, or Nv [i] = 0 for non-principal
+ *	rows/columns.  Note that i refers to a row/column in the original
+ *	matrix, not the permuted matrix.
+ *
+ * Elen:  An integer array of size n.  See the description of Iw above.  At the
+ *	start of execution, Elen [i] is set to zero for all rows i.  During
+ *	execution, Elen [i] is the number of elements in the list for
+ *	supervariable i.  When e becomes an element, Elen [e] = FLIP (esize) is
+ *	set, where esize is the size of the element (the number of pivots, plus
+ *	the number of nonpivotal entries).  Thus Elen [e] < EMPTY.
+ *	Elen (i) = EMPTY set when variable i becomes nonprincipal.
+ *
+ *	For variables, Elen (i) >= EMPTY holds until just before the
+ *	postordering and permutation vectors are computed.  For elements,
+ *	Elen [e] < EMPTY holds.
+ *
+ *	On output, Elen [i] is the degree of the row/column in the Cholesky
+ *	factorization of the permuted matrix, corresponding to the original row
+ *	i, if i is a super row/column.  It is equal to EMPTY if i is
+ *	non-principal.  Note that i refers to a row/column in the original
+ *	matrix, not the permuted matrix.
+ *
+ *	Note that the contents of Elen on output differ from the Fortran
+ *	version (Elen holds the inverse permutation in the Fortran version,
+ *	which is instead returned in the Next array in this C version,
+ *	described below).
+ *
+ * Last: In a degree list, Last [i] is the supervariable preceding i, or EMPTY
+ *	if i is the head of the list.  In a hash bucket, Last [i] is the hash
+ *	key for i.
+ *
+ *	Last [Head [hash]] is also used as the head of a hash bucket if
+ *	Head [hash] contains a degree list (see the description of Head,
+ *	below).
+ *
+ *	On output, Last [0..n-1] holds the permutation.  That is, if
+ *	i = Last [k], then row i is the kth pivot row (where k ranges from 0 to
+ *	n-1).  Row Last [k] of A is the kth row in the permuted matrix, PAP'.
+ *
+ * Next: Next [i] is the supervariable following i in a link list, or EMPTY if
+ *	i is the last in the list.  Used for two kinds of lists:  degree lists
+ *	and hash buckets (a supervariable can be in only one kind of list at a
+ *	time).
+ *
+ *	On output Next [0..n-1] holds the inverse permutation. 	That is, if
+ *	k = Next [i], then row i is the kth pivot row. Row i of A appears as
+ *	the (Next[i])-th row in the permuted matrix, PAP'.
+ *
+ *	Note that the contents of Next on output differ from the Fortran
+ *	version (Next is undefined on output in the Fortran version).
+
+ * ----------------------------------------------------------------------------
+ * LOCAL WORKSPACE (not input or output - used only during execution):
+ * ----------------------------------------------------------------------------
+ *
+ * Degree:  An integer array of size n.  If i is a supervariable, then
+ *	Degree [i] holds the current approximation of the external degree of
+ *	row i (an upper bound).  The external degree is the number of nonzeros
+ *	in row i, minus ABS (Nv [i]), the diagonal part.  The bound is equal to
+ *	the exact external degree if Elen [i] is less than or equal to two.
+ *
+ *	We also use the term "external degree" for elements e to refer to
+ *	|Le \ Lme|.  If e is an element, then Degree [e] is |Le|, which is the
+ *	degree of the off-diagonal part of the element e (not including the
+ *	diagonal part).
+ *
+ * Head:   An integer array of size n.  Head is used for degree lists.
+ *	Head [deg] is the first supervariable in a degree list.  All
+ *	supervariables i in a degree list Head [deg] have the same approximate
+ *	degree, namely, deg = Degree [i].  If the list Head [deg] is empty then
+ *	Head [deg] = EMPTY.
+ *
+ *	During supervariable detection Head [hash] also serves as a pointer to
+ *	a hash bucket.  If Head [hash] >= 0, there is a degree list of degree
+ *	hash.  The hash bucket head pointer is Last [Head [hash]].  If
+ *	Head [hash] = EMPTY, then the degree list and hash bucket are both
+ *	empty.  If Head [hash] < EMPTY, then the degree list is empty, and
+ *	FLIP (Head [hash]) is the head of the hash bucket.  After supervariable
+ *	detection is complete, all hash buckets are empty, and the
+ *	(Last [Head [hash]] = EMPTY) condition is restored for the non-empty
+ *	degree lists.
+ *
+ * W:  An integer array of size n.  The flag array W determines the status of
+ *	elements and variables, and the external degree of elements.
+ *
+ *	for elements:
+ *	    if W [e] = 0, then the element e is absorbed.
+ *	    if W [e] >= wflg, then W [e] - wflg is the size of the set
+ *		|Le \ Lme|, in terms of nonzeros (the sum of ABS (Nv [i]) for
+ *		each principal variable i that is both in the pattern of
+ *		element e and NOT in the pattern of the current pivot element,
+ *		me).
+ *	    if wflg > W [e] > 0, then e is not absorbed and has not yet been
+ *		seen in the scan of the element lists in the computation of
+ *		|Le\Lme| in Scan 1 below.
+ *
+ *	for variables:
+ *	    during supervariable detection, if W [j] != wflg then j is
+ *	    not in the pattern of variable i.
+ *
+ *	The W array is initialized by setting W [i] = 1 for all i, and by
+ *	setting wflg = 2.  It is reinitialized if wflg becomes too large (to
+ *	ensure that wflg+n does not cause integer overflow).
+
+ * ----------------------------------------------------------------------------
+ * LOCAL INTEGERS:
+ * ----------------------------------------------------------------------------
+ */
+
+    Int deg, degme, dext, lemax, e, elenme, eln, i, ilast, inext, j,
+	jlast, jnext, k, knt1, knt2, knt3, lenj, ln, me, mindeg, nel, nleft,
+	nvi, nvj, nvpiv, slenme, wbig, we, wflg, wnvi, ok, ndense, ncmpa,
+	dense, aggressive ;
+
+    unsigned Int hash ;	    /* unsigned, so that hash % n is well defined.*/
+
+/*
+ * deg:		the degree of a variable or element
+ * degme:	size, |Lme|, of the current element, me (= Degree [me])
+ * dext:	external degree, |Le \ Lme|, of some element e
+ * lemax:	largest |Le| seen so far (called dmax in Fortran version)
+ * e:		an element
+ * elenme:	the length, Elen [me], of element list of pivotal variable
+ * eln:		the length, Elen [...], of an element list
+ * hash:	the computed value of the hash function
+ * i:		a supervariable
+ * ilast:	the entry in a link list preceding i
+ * inext:	the entry in a link list following i
+ * j:		a supervariable
+ * jlast:	the entry in a link list preceding j
+ * jnext:	the entry in a link list, or path, following j
+ * k:		the pivot order of an element or variable
+ * knt1:	loop counter used during element construction
+ * knt2:	loop counter used during element construction
+ * knt3:	loop counter used during compression
+ * lenj:	Len [j]
+ * ln:		length of a supervariable list
+ * me:		current supervariable being eliminated, and the current
+ *		    element created by eliminating that supervariable
+ * mindeg:	current minimum degree
+ * nel:		number of pivots selected so far
+ * nleft:	n - nel, the number of nonpivotal rows/columns remaining
+ * nvi:		the number of variables in a supervariable i (= Nv [i])
+ * nvj:		the number of variables in a supervariable j (= Nv [j])
+ * nvpiv:	number of pivots in current element
+ * slenme:	number of variables in variable list of pivotal variable
+ * wbig:	= INT_MAX - n for the int version, UF_long_max - n for the
+ *		    UF_long version.  wflg is not allowed to be >= wbig.
+ * we:		W [e]
+ * wflg:	used for flagging the W array.  See description of Iw.
+ * wnvi:	wflg - Nv [i]
+ * x:		either a supervariable or an element
+ *
+ * ok:		true if supervariable j can be absorbed into i
+ * ndense:	number of "dense" rows/columns
+ * dense:	rows/columns with initial degree > dense are considered "dense"
+ * aggressive:	true if aggressive absorption is being performed
+ * ncmpa:	number of garbage collections
+
+ * ----------------------------------------------------------------------------
+ * LOCAL DOUBLES, used for statistical output only (except for alpha):
+ * ----------------------------------------------------------------------------
+ */
+
+    double f, r, ndiv, s, nms_lu, nms_ldl, dmax, alpha, lnz, lnzme ;
+
+/*
+ * f:		nvpiv
+ * r:		degme + nvpiv
+ * ndiv:	number of divisions for LU or LDL' factorizations
+ * s:		number of multiply-subtract pairs for LU factorization, for the
+ *		    current element me
+ * nms_lu	number of multiply-subtract pairs for LU factorization
+ * nms_ldl	number of multiply-subtract pairs for LDL' factorization
+ * dmax:	the largest number of entries in any column of L, including the
+ *		    diagonal
+ * alpha:	"dense" degree ratio
+ * lnz:		the number of nonzeros in L (excluding the diagonal)
+ * lnzme:	the number of nonzeros in L (excl. the diagonal) for the
+ *		    current element me
+
+ * ----------------------------------------------------------------------------
+ * LOCAL "POINTERS" (indices into the Iw array)
+ * ----------------------------------------------------------------------------
+*/
+
+    Int p, p1, p2, p3, p4, pdst, pend, pj, pme, pme1, pme2, pn, psrc ;
+
+/*
+ * Any parameter (Pe [...] or pfree) or local variable starting with "p" (for
+ * Pointer) is an index into Iw, and all indices into Iw use variables starting
+ * with "p."  The only exception to this rule is the iwlen input argument.
+ *
+ * p:           pointer into lots of things
+ * p1:          Pe [i] for some variable i (start of element list)
+ * p2:          Pe [i] + Elen [i] -  1 for some variable i
+ * p3:          index of first supervariable in clean list
+ * p4:		
+ * pdst:        destination pointer, for compression
+ * pend:        end of memory to compress
+ * pj:          pointer into an element or variable
+ * pme:         pointer into the current element (pme1...pme2)
+ * pme1:        the current element, me, is stored in Iw [pme1...pme2]
+ * pme2:        the end of the current element
+ * pn:          pointer into a "clean" variable, also used to compress
+ * psrc:        source pointer, for compression
+*/
+
+/* ========================================================================= */
+/*  INITIALIZATIONS */
+/* ========================================================================= */
+
+    /* Note that this restriction on iwlen is slightly more restrictive than
+     * what is actually required in AMD_2.  AMD_2 can operate with no elbow
+     * room at all, but it will be slow.  For better performance, at least
+     * size-n elbow room is enforced. */
+    ASSERT (iwlen >= pfree + n) ;
+    ASSERT (n > 0) ;
+
+    /* initialize output statistics */
+    lnz = 0 ;
+    ndiv = 0 ;
+    nms_lu = 0 ;
+    nms_ldl = 0 ;
+    dmax = 1 ;
+    me = EMPTY ;
+
+    mindeg = 0 ;
+    ncmpa = 0 ;
+    nel = 0 ;
+    lemax = 0 ;
+
+    /* get control parameters */
+    if (Control != (double *) NULL)
+    {
+	alpha = Control [AMD_DENSE] ;
+	aggressive = (Control [AMD_AGGRESSIVE] != 0) ;
+    }
+    else
+    {
+	alpha = AMD_DEFAULT_DENSE ;
+	aggressive = AMD_DEFAULT_AGGRESSIVE ;
+    }
+    /* Note: if alpha is NaN, this is undefined: */
+    if (alpha < 0)
+    {
+	/* only remove completely dense rows/columns */
+	dense = n-2 ;
+    }
+    else
+    {
+	dense = alpha * sqrt ((double) n) ;
+    }
+    dense = MAX (16, dense) ;
+    dense = MIN (n,  dense) ;
+    AMD_DEBUG1 (("\n\nAMD (debug), alpha %g, aggr. "ID"\n",
+	alpha, aggressive)) ;
+
+    for (i = 0 ; i < n ; i++)
+    {
+	Last [i] = EMPTY ;
+	Head [i] = EMPTY ;
+	Next [i] = EMPTY ;
+	/* if separate Hhead array is used for hash buckets: *
+	Hhead [i] = EMPTY ;
+	*/
+	Nv [i] = 1 ;
+	W [i] = 1 ;
+	Elen [i] = 0 ;
+	Degree [i] = Len [i] ;
+    }
+
+#ifndef NDEBUG
+    AMD_DEBUG1 (("\n======Nel "ID" initial\n", nel)) ;
+    AMD_dump (n, Pe, Iw, Len, iwlen, pfree, Nv, Next, Last,
+		Head, Elen, Degree, W, -1) ;
+#endif
+
+    /* initialize wflg */
+    wbig = Int_MAX - n ;
+    wflg = clear_flag (0, wbig, W, n) ;
+
+    /* --------------------------------------------------------------------- */
+    /* initialize degree lists and eliminate dense and empty rows */
+    /* --------------------------------------------------------------------- */
+
+    ndense = 0 ;
+
+    for (i = 0 ; i < n ; i++)
+    {
+	deg = Degree [i] ;
+	ASSERT (deg >= 0 && deg < n) ;
+	if (deg == 0)
+	{
+
+	    /* -------------------------------------------------------------
+	     * we have a variable that can be eliminated at once because
+	     * there is no off-diagonal non-zero in its row.  Note that
+	     * Nv [i] = 1 for an empty variable i.  It is treated just
+	     * the same as an eliminated element i.
+	     * ------------------------------------------------------------- */
+
+	    Elen [i] = FLIP (1) ;
+	    nel++ ;
+	    Pe [i] = EMPTY ;
+	    W [i] = 0 ;
+
+	}
+	else if (deg > dense)
+	{
+
+	    /* -------------------------------------------------------------
+	     * Dense variables are not treated as elements, but as unordered,
+	     * non-principal variables that have no parent.  They do not take
+	     * part in the postorder, since Nv [i] = 0.  Note that the Fortran
+	     * version does not have this option.
+	     * ------------------------------------------------------------- */
+
+	    AMD_DEBUG1 (("Dense node "ID" degree "ID"\n", i, deg)) ;
+	    ndense++ ;
+	    Nv [i] = 0 ;		/* do not postorder this node */
+	    Elen [i] = EMPTY ;
+	    nel++ ;
+	    Pe [i] = EMPTY ;
+
+	}
+	else
+	{
+
+	    /* -------------------------------------------------------------
+	     * place i in the degree list corresponding to its degree
+	     * ------------------------------------------------------------- */
+
+	    inext = Head [deg] ;
+	    ASSERT (inext >= EMPTY && inext < n) ;
+	    if (inext != EMPTY) Last [inext] = i ;
+	    Next [i] = inext ;
+	    Head [deg] = i ;
+
+	}
+    }
+
+/* ========================================================================= */
+/* WHILE (selecting pivots) DO */
+/* ========================================================================= */
+
+    while (nel < n)
+    {
+
+#ifndef NDEBUG
+	AMD_DEBUG1 (("\n======Nel "ID"\n", nel)) ;
+	if (AMD_debug >= 2)
+	{
+	    AMD_dump (n, Pe, Iw, Len, iwlen, pfree, Nv, Next,
+		    Last, Head, Elen, Degree, W, nel) ;
+	}
+#endif
+
+/* ========================================================================= */
+/* GET PIVOT OF MINIMUM DEGREE */
+/* ========================================================================= */
+
+	/* ----------------------------------------------------------------- */
+	/* find next supervariable for elimination */
+	/* ----------------------------------------------------------------- */
+
+	ASSERT (mindeg >= 0 && mindeg < n) ;
+	for (deg = mindeg ; deg < n ; deg++)
+	{
+	    me = Head [deg] ;
+	    if (me != EMPTY) break ;
+	}
+	mindeg = deg ;
+	ASSERT (me >= 0 && me < n) ;
+	AMD_DEBUG1 (("=================me: "ID"\n", me)) ;
+
+	/* ----------------------------------------------------------------- */
+	/* remove chosen variable from link list */
+	/* ----------------------------------------------------------------- */
+
+	inext = Next [me] ;
+	ASSERT (inext >= EMPTY && inext < n) ;
+	if (inext != EMPTY) Last [inext] = EMPTY ;
+	Head [deg] = inext ;
+
+	/* ----------------------------------------------------------------- */
+	/* me represents the elimination of pivots nel to nel+Nv[me]-1. */
+	/* place me itself as the first in this set. */
+	/* ----------------------------------------------------------------- */
+
+	elenme = Elen [me] ;
+	nvpiv = Nv [me] ;
+	ASSERT (nvpiv > 0) ;
+	nel += nvpiv ;
+
+/* ========================================================================= */
+/* CONSTRUCT NEW ELEMENT */
+/* ========================================================================= */
+
+	/* -----------------------------------------------------------------
+	 * At this point, me is the pivotal supervariable.  It will be
+	 * converted into the current element.  Scan list of the pivotal
+	 * supervariable, me, setting tree pointers and constructing new list
+	 * of supervariables for the new element, me.  p is a pointer to the
+	 * current position in the old list.
+	 * ----------------------------------------------------------------- */
+
+	/* flag the variable "me" as being in Lme by negating Nv [me] */
+	Nv [me] = -nvpiv ;
+	degme = 0 ;
+	ASSERT (Pe [me] >= 0 && Pe [me] < iwlen) ;
+
+	if (elenme == 0)
+	{
+
+	    /* ------------------------------------------------------------- */
+	    /* construct the new element in place */
+	    /* ------------------------------------------------------------- */
+
+	    pme1 = Pe [me] ;
+	    pme2 = pme1 - 1 ;
+
+	    for (p = pme1 ; p <= pme1 + Len [me] - 1 ; p++)
+	    {
+		i = Iw [p] ;
+		ASSERT (i >= 0 && i < n && Nv [i] >= 0) ;
+		nvi = Nv [i] ;
+		if (nvi > 0)
+		{
+
+		    /* ----------------------------------------------------- */
+		    /* i is a principal variable not yet placed in Lme. */
+		    /* store i in new list */
+		    /* ----------------------------------------------------- */
+
+		    /* flag i as being in Lme by negating Nv [i] */
+		    degme += nvi ;
+		    Nv [i] = -nvi ;
+		    Iw [++pme2] = i ;
+
+		    /* ----------------------------------------------------- */
+		    /* remove variable i from degree list. */
+		    /* ----------------------------------------------------- */
+
+		    ilast = Last [i] ;
+		    inext = Next [i] ;
+		    ASSERT (ilast >= EMPTY && ilast < n) ;
+		    ASSERT (inext >= EMPTY && inext < n) ;
+		    if (inext != EMPTY) Last [inext] = ilast ;
+		    if (ilast != EMPTY)
+		    {
+			Next [ilast] = inext ;
+		    }
+		    else
+		    {
+			/* i is at the head of the degree list */
+			ASSERT (Degree [i] >= 0 && Degree [i] < n) ;
+			Head [Degree [i]] = inext ;
+		    }
+		}
+	    }
+	}
+	else
+	{
+
+	    /* ------------------------------------------------------------- */
+	    /* construct the new element in empty space, Iw [pfree ...] */
+	    /* ------------------------------------------------------------- */
+
+	    p = Pe [me] ;
+	    pme1 = pfree ;
+	    slenme = Len [me] - elenme ;
+
+	    for (knt1 = 1 ; knt1 <= elenme + 1 ; knt1++)
+	    {
+
+		if (knt1 > elenme)
+		{
+		    /* search the supervariables in me. */
+		    e = me ;
+		    pj = p ;
+		    ln = slenme ;
+		    AMD_DEBUG2 (("Search sv: "ID" "ID" "ID"\n", me,pj,ln)) ;
+		}
+		else
+		{
+		    /* search the elements in me. */
+		    e = Iw [p++] ;
+		    ASSERT (e >= 0 && e < n) ;
+		    pj = Pe [e] ;
+		    ln = Len [e] ;
+		    AMD_DEBUG2 (("Search element e "ID" in me "ID"\n", e,me)) ;
+		    ASSERT (Elen [e] < EMPTY && W [e] > 0 && pj >= 0) ;
+		}
+		ASSERT (ln >= 0 && (ln == 0 || (pj >= 0 && pj < iwlen))) ;
+
+		/* ---------------------------------------------------------
+		 * search for different supervariables and add them to the
+		 * new list, compressing when necessary. this loop is
+		 * executed once for each element in the list and once for
+		 * all the supervariables in the list.
+		 * --------------------------------------------------------- */
+
+		for (knt2 = 1 ; knt2 <= ln ; knt2++)
+		{
+		    i = Iw [pj++] ;
+		    ASSERT (i >= 0 && i < n && (i == me || Elen [i] >= EMPTY));
+		    nvi = Nv [i] ;
+		    AMD_DEBUG2 ((": "ID" "ID" "ID" "ID"\n",
+				i, Elen [i], Nv [i], wflg)) ;
+
+		    if (nvi > 0)
+		    {
+
+			/* ------------------------------------------------- */
+			/* compress Iw, if necessary */
+			/* ------------------------------------------------- */
+
+			if (pfree >= iwlen)
+			{
+
+			    AMD_DEBUG1 (("GARBAGE COLLECTION\n")) ;
+
+			    /* prepare for compressing Iw by adjusting pointers
+			     * and lengths so that the lists being searched in
+			     * the inner and outer loops contain only the
+			     * remaining entries. */
+
+			    Pe [me] = p ;
+			    Len [me] -= knt1 ;
+			    /* check if nothing left of supervariable me */
+			    if (Len [me] == 0) Pe [me] = EMPTY ;
+			    Pe [e] = pj ;
+			    Len [e] = ln - knt2 ;
+			    /* nothing left of element e */
+			    if (Len [e] == 0) Pe [e] = EMPTY ;
+
+			    ncmpa++ ;	/* one more garbage collection */
+
+			    /* store first entry of each object in Pe */
+			    /* FLIP the first entry in each object */
+			    for (j = 0 ; j < n ; j++)
+			    {
+				pn = Pe [j] ;
+				if (pn >= 0)
+				{
+				    ASSERT (pn >= 0 && pn < iwlen) ;
+				    Pe [j] = Iw [pn] ;
+				    Iw [pn] = FLIP (j) ;
+				}
+			    }
+
+			    /* psrc/pdst point to source/destination */
+			    psrc = 0 ;
+			    pdst = 0 ;
+			    pend = pme1 - 1 ;
+
+			    while (psrc <= pend)
+			    {
+				/* search for next FLIP'd entry */
+				j = FLIP (Iw [psrc++]) ;
+				if (j >= 0)
+				{
+				    AMD_DEBUG2 (("Got object j: "ID"\n", j)) ;
+				    Iw [pdst] = Pe [j] ;
+				    Pe [j] = pdst++ ;
+				    lenj = Len [j] ;
+				    /* copy from source to destination */
+				    for (knt3 = 0 ; knt3 <= lenj - 2 ; knt3++)
+				    {
+					Iw [pdst++] = Iw [psrc++] ;
+				    }
+				}
+			    }
+
+			    /* move the new partially-constructed element */
+			    p1 = pdst ;
+			    for (psrc = pme1 ; psrc <= pfree-1 ; psrc++)
+			    {
+				Iw [pdst++] = Iw [psrc] ;
+			    }
+			    pme1 = p1 ;
+			    pfree = pdst ;
+			    pj = Pe [e] ;
+			    p = Pe [me] ;
+
+			}
+
+			/* ------------------------------------------------- */
+			/* i is a principal variable not yet placed in Lme */
+			/* store i in new list */
+			/* ------------------------------------------------- */
+
+			/* flag i as being in Lme by negating Nv [i] */
+			degme += nvi ;
+			Nv [i] = -nvi ;
+			Iw [pfree++] = i ;
+			AMD_DEBUG2 (("     s: "ID"     nv "ID"\n", i, Nv [i]));
+
+			/* ------------------------------------------------- */
+			/* remove variable i from degree link list */
+			/* ------------------------------------------------- */
+
+			ilast = Last [i] ;
+			inext = Next [i] ;
+			ASSERT (ilast >= EMPTY && ilast < n) ;
+			ASSERT (inext >= EMPTY && inext < n) ;
+			if (inext != EMPTY) Last [inext] = ilast ;
+			if (ilast != EMPTY)
+			{
+			    Next [ilast] = inext ;
+			}
+			else
+			{
+			    /* i is at the head of the degree list */
+			    ASSERT (Degree [i] >= 0 && Degree [i] < n) ;
+			    Head [Degree [i]] = inext ;
+			}
+		    }
+		}
+
+		if (e != me)
+		{
+		    /* set tree pointer and flag to indicate element e is
+		     * absorbed into new element me (the parent of e is me) */
+		    AMD_DEBUG1 ((" Element "ID" => "ID"\n", e, me)) ;
+		    Pe [e] = FLIP (me) ;
+		    W [e] = 0 ;
+		}
+	    }
+
+	    pme2 = pfree - 1 ;
+	}
+
+	/* ----------------------------------------------------------------- */
+	/* me has now been converted into an element in Iw [pme1..pme2] */
+	/* ----------------------------------------------------------------- */
+
+	/* degme holds the external degree of new element */
+	Degree [me] = degme ;
+	Pe [me] = pme1 ;
+	Len [me] = pme2 - pme1 + 1 ;
+	ASSERT (Pe [me] >= 0 && Pe [me] < iwlen) ;
+
+	Elen [me] = FLIP (nvpiv + degme) ;
+	/* FLIP (Elen (me)) is now the degree of pivot (including
+	 * diagonal part). */
+
+#ifndef NDEBUG
+	AMD_DEBUG2 (("New element structure: length= "ID"\n", pme2-pme1+1)) ;
+	for (pme = pme1 ; pme <= pme2 ; pme++) AMD_DEBUG3 ((" "ID"", Iw[pme]));
+	AMD_DEBUG3 (("\n")) ;
+#endif
+
+	/* ----------------------------------------------------------------- */
+	/* make sure that wflg is not too large. */
+	/* ----------------------------------------------------------------- */
+
+	/* With the current value of wflg, wflg+n must not cause integer
+	 * overflow */
+
+	wflg = clear_flag (wflg, wbig, W, n) ;
+
+/* ========================================================================= */
+/* COMPUTE (W [e] - wflg) = |Le\Lme| FOR ALL ELEMENTS */
+/* ========================================================================= */
+
+	/* -----------------------------------------------------------------
+	 * Scan 1:  compute the external degrees of previous elements with
+	 * respect to the current element.  That is:
+	 *       (W [e] - wflg) = |Le \ Lme|
+	 * for each element e that appears in any supervariable in Lme.  The
+	 * notation Le refers to the pattern (list of supervariables) of a
+	 * previous element e, where e is not yet absorbed, stored in
+	 * Iw [Pe [e] + 1 ... Pe [e] + Len [e]].  The notation Lme
+	 * refers to the pattern of the current element (stored in
+	 * Iw [pme1..pme2]).   If aggressive absorption is enabled, and
+	 * (W [e] - wflg) becomes zero, then the element e will be absorbed
+	 * in Scan 2.
+	 * ----------------------------------------------------------------- */
+
+	AMD_DEBUG2 (("me: ")) ;
+	for (pme = pme1 ; pme <= pme2 ; pme++)
+	{
+	    i = Iw [pme] ;
+	    ASSERT (i >= 0 && i < n) ;
+	    eln = Elen [i] ;
+	    AMD_DEBUG3 ((""ID" Elen "ID": \n", i, eln)) ;
+	    if (eln > 0)
+	    {
+		/* note that Nv [i] has been negated to denote i in Lme: */
+		nvi = -Nv [i] ;
+		ASSERT (nvi > 0 && Pe [i] >= 0 && Pe [i] < iwlen) ;
+		wnvi = wflg - nvi ;
+		for (p = Pe [i] ; p <= Pe [i] + eln - 1 ; p++)
+		{
+		    e = Iw [p] ;
+		    ASSERT (e >= 0 && e < n) ;
+		    we = W [e] ;
+		    AMD_DEBUG4 (("    e "ID" we "ID" ", e, we)) ;
+		    if (we >= wflg)
+		    {
+			/* unabsorbed element e has been seen in this loop */
+			AMD_DEBUG4 (("    unabsorbed, first time seen")) ;
+			we -= nvi ;
+		    }
+		    else if (we != 0)
+		    {
+			/* e is an unabsorbed element */
+			/* this is the first we have seen e in all of Scan 1 */
+			AMD_DEBUG4 (("    unabsorbed")) ;
+			we = Degree [e] + wnvi ;
+		    }
+		    AMD_DEBUG4 (("\n")) ;
+		    W [e] = we ;
+		}
+	    }
+	}
+	AMD_DEBUG2 (("\n")) ;
+
+/* ========================================================================= */
+/* DEGREE UPDATE AND ELEMENT ABSORPTION */
+/* ========================================================================= */
+
+	/* -----------------------------------------------------------------
+	 * Scan 2:  for each i in Lme, sum up the degree of Lme (which is
+	 * degme), plus the sum of the external degrees of each Le for the
+	 * elements e appearing within i, plus the supervariables in i.
+	 * Place i in hash list.
+	 * ----------------------------------------------------------------- */
+
+	for (pme = pme1 ; pme <= pme2 ; pme++)
+	{
+	    i = Iw [pme] ;
+	    ASSERT (i >= 0 && i < n && Nv [i] < 0 && Elen [i] >= 0) ;
+	    AMD_DEBUG2 (("Updating: i "ID" "ID" "ID"\n", i, Elen[i], Len [i]));
+	    p1 = Pe [i] ;
+	    p2 = p1 + Elen [i] - 1 ;
+	    pn = p1 ;
+	    hash = 0 ;
+	    deg = 0 ;
+	    ASSERT (p1 >= 0 && p1 < iwlen && p2 >= -1 && p2 < iwlen) ;
+
+	    /* ------------------------------------------------------------- */
+	    /* scan the element list associated with supervariable i */
+	    /* ------------------------------------------------------------- */
+
+	    /* UMFPACK/MA38-style approximate degree: */
+	    if (aggressive)
+	    {
+		for (p = p1 ; p <= p2 ; p++)
+		{
+		    e = Iw [p] ;
+		    ASSERT (e >= 0 && e < n) ;
+		    we = W [e] ;
+		    if (we != 0)
+		    {
+			/* e is an unabsorbed element */
+			/* dext = | Le \ Lme | */
+			dext = we - wflg ;
+			if (dext > 0)
+			{
+			    deg += dext ;
+			    Iw [pn++] = e ;
+			    hash += e ;
+			    AMD_DEBUG4 ((" e: "ID" hash = "ID"\n",e,hash)) ;
+			}
+			else
+			{
+			    /* external degree of e is zero, absorb e into me*/
+			    AMD_DEBUG1 ((" Element "ID" =>"ID" (aggressive)\n",
+				e, me)) ;
+			    ASSERT (dext == 0) ;
+			    Pe [e] = FLIP (me) ;
+			    W [e] = 0 ;
+			}
+		    }
+		}
+	    }
+	    else
+	    {
+		for (p = p1 ; p <= p2 ; p++)
+		{
+		    e = Iw [p] ;
+		    ASSERT (e >= 0 && e < n) ;
+		    we = W [e] ;
+		    if (we != 0)
+		    {
+			/* e is an unabsorbed element */
+			dext = we - wflg ;
+			ASSERT (dext >= 0) ;
+			deg += dext ;
+			Iw [pn++] = e ;
+			hash += e ;
+			AMD_DEBUG4 (("	e: "ID" hash = "ID"\n",e,hash)) ;
+		    }
+		}
+	    }
+
+	    /* count the number of elements in i (including me): */
+	    Elen [i] = pn - p1 + 1 ;
+
+	    /* ------------------------------------------------------------- */
+	    /* scan the supervariables in the list associated with i */
+	    /* ------------------------------------------------------------- */
+
+	    /* The bulk of the AMD run time is typically spent in this loop,
+	     * particularly if the matrix has many dense rows that are not
+	     * removed prior to ordering. */
+	    p3 = pn ;
+	    p4 = p1 + Len [i] ;
+	    for (p = p2 + 1 ; p < p4 ; p++)
+	    {
+		j = Iw [p] ;
+		ASSERT (j >= 0 && j < n) ;
+		nvj = Nv [j] ;
+		if (nvj > 0)
+		{
+		    /* j is unabsorbed, and not in Lme. */
+		    /* add to degree and add to new list */
+		    deg += nvj ;
+		    Iw [pn++] = j ;
+		    hash += j ;
+		    AMD_DEBUG4 (("  s: "ID" hash "ID" Nv[j]= "ID"\n",
+				j, hash, nvj)) ;
+		}
+	    }
+
+	    /* ------------------------------------------------------------- */
+	    /* update the degree and check for mass elimination */
+	    /* ------------------------------------------------------------- */
+
+	    /* with aggressive absorption, deg==0 is identical to the
+	     * Elen [i] == 1 && p3 == pn test, below. */
+	    ASSERT (IMPLIES (aggressive, (deg==0) == (Elen[i]==1 && p3==pn))) ;
+
+	    if (Elen [i] == 1 && p3 == pn)
+	    {
+
+		/* --------------------------------------------------------- */
+		/* mass elimination */
+		/* --------------------------------------------------------- */
+
+		/* There is nothing left of this node except for an edge to
+		 * the current pivot element.  Elen [i] is 1, and there are
+		 * no variables adjacent to node i.  Absorb i into the
+		 * current pivot element, me.  Note that if there are two or
+		 * more mass eliminations, fillin due to mass elimination is
+		 * possible within the nvpiv-by-nvpiv pivot block.  It is this
+		 * step that causes AMD's analysis to be an upper bound.
+		 *
+		 * The reason is that the selected pivot has a lower
+		 * approximate degree than the true degree of the two mass
+		 * eliminated nodes.  There is no edge between the two mass
+		 * eliminated nodes.  They are merged with the current pivot
+		 * anyway.
+		 *
+		 * No fillin occurs in the Schur complement, in any case,
+		 * and this effect does not decrease the quality of the
+		 * ordering itself, just the quality of the nonzero and
+		 * flop count analysis.  It also means that the post-ordering
+		 * is not an exact elimination tree post-ordering. */
+
+		AMD_DEBUG1 (("  MASS i "ID" => parent e "ID"\n", i, me)) ;
+		Pe [i] = FLIP (me) ;
+		nvi = -Nv [i] ;
+		degme -= nvi ;
+		nvpiv += nvi ;
+		nel += nvi ;
+		Nv [i] = 0 ;
+		Elen [i] = EMPTY ;
+
+	    }
+	    else
+	    {
+
+		/* --------------------------------------------------------- */
+		/* update the upper-bound degree of i */
+		/* --------------------------------------------------------- */
+
+		/* the following degree does not yet include the size
+		 * of the current element, which is added later: */
+
+		Degree [i] = MIN (Degree [i], deg) ;
+
+		/* --------------------------------------------------------- */
+		/* add me to the list for i */
+		/* --------------------------------------------------------- */
+
+		/* move first supervariable to end of list */
+		Iw [pn] = Iw [p3] ;
+		/* move first element to end of element part of list */
+		Iw [p3] = Iw [p1] ;
+		/* add new element, me, to front of list. */
+		Iw [p1] = me ;
+		/* store the new length of the list in Len [i] */
+		Len [i] = pn - p1 + 1 ;
+
+		/* --------------------------------------------------------- */
+		/* place in hash bucket.  Save hash key of i in Last [i]. */
+		/* --------------------------------------------------------- */
+
+		/* NOTE: this can fail if hash is negative, because the ANSI C
+		 * standard does not define a % b when a and/or b are negative.
+		 * That's why hash is defined as an unsigned Int, to avoid this
+		 * problem. */
+		hash = hash % n ;
+		ASSERT (((Int) hash) >= 0 && ((Int) hash) < n) ;
+
+		/* if the Hhead array is not used: */
+		j = Head [hash] ;
+		if (j <= EMPTY)
+		{
+		    /* degree list is empty, hash head is FLIP (j) */
+		    Next [i] = FLIP (j) ;
+		    Head [hash] = FLIP (i) ;
+		}
+		else
+		{
+		    /* degree list is not empty, use Last [Head [hash]] as
+		     * hash head. */
+		    Next [i] = Last [j] ;
+		    Last [j] = i ;
+		}
+
+		/* if a separate Hhead array is used: *
+		Next [i] = Hhead [hash] ;
+		Hhead [hash] = i ;
+		*/
+
+		Last [i] = hash ;
+	    }
+	}
+
+	Degree [me] = degme ;
+
+	/* ----------------------------------------------------------------- */
+	/* Clear the counter array, W [...], by incrementing wflg. */
+	/* ----------------------------------------------------------------- */
+
+	/* make sure that wflg+n does not cause integer overflow */
+	lemax =  MAX (lemax, degme) ;
+	wflg += lemax ;
+	wflg = clear_flag (wflg, wbig, W, n) ;
+	/*  at this point, W [0..n-1] < wflg holds */
+
+/* ========================================================================= */
+/* SUPERVARIABLE DETECTION */
+/* ========================================================================= */
+
+	AMD_DEBUG1 (("Detecting supervariables:\n")) ;
+	for (pme = pme1 ; pme <= pme2 ; pme++)
+	{
+	    i = Iw [pme] ;
+	    ASSERT (i >= 0 && i < n) ;
+	    AMD_DEBUG2 (("Consider i "ID" nv "ID"\n", i, Nv [i])) ;
+	    if (Nv [i] < 0)
+	    {
+		/* i is a principal variable in Lme */
+
+		/* ---------------------------------------------------------
+		 * examine all hash buckets with 2 or more variables.  We do
+		 * this by examing all unique hash keys for supervariables in
+		 * the pattern Lme of the current element, me
+		 * --------------------------------------------------------- */
+
+		/* let i = head of hash bucket, and empty the hash bucket */
+		ASSERT (Last [i] >= 0 && Last [i] < n) ;
+		hash = Last [i] ;
+
+		/* if Hhead array is not used: */
+		j = Head [hash] ;
+		if (j == EMPTY)
+		{
+		    /* hash bucket and degree list are both empty */
+		    i = EMPTY ;
+		}
+		else if (j < EMPTY)
+		{
+		    /* degree list is empty */
+		    i = FLIP (j) ;
+		    Head [hash] = EMPTY ;
+		}
+		else
+		{
+		    /* degree list is not empty, restore Last [j] of head j */
+		    i = Last [j] ;
+		    Last [j] = EMPTY ;
+		}
+
+		/* if separate Hhead array is used: *
+		i = Hhead [hash] ;
+		Hhead [hash] = EMPTY ;
+		*/
+
+		ASSERT (i >= EMPTY && i < n) ;
+		AMD_DEBUG2 (("----i "ID" hash "ID"\n", i, hash)) ;
+
+		while (i != EMPTY && Next [i] != EMPTY)
+		{
+
+		    /* -----------------------------------------------------
+		     * this bucket has one or more variables following i.
+		     * scan all of them to see if i can absorb any entries
+		     * that follow i in hash bucket.  Scatter i into w.
+		     * ----------------------------------------------------- */
+
+		    ln = Len [i] ;
+		    eln = Elen [i] ;
+		    ASSERT (ln >= 0 && eln >= 0) ;
+		    ASSERT (Pe [i] >= 0 && Pe [i] < iwlen) ;
+		    /* do not flag the first element in the list (me) */
+		    for (p = Pe [i] + 1 ; p <= Pe [i] + ln - 1 ; p++)
+		    {
+			ASSERT (Iw [p] >= 0 && Iw [p] < n) ;
+			W [Iw [p]] = wflg ;
+		    }
+
+		    /* ----------------------------------------------------- */
+		    /* scan every other entry j following i in bucket */
+		    /* ----------------------------------------------------- */
+
+		    jlast = i ;
+		    j = Next [i] ;
+		    ASSERT (j >= EMPTY && j < n) ;
+
+		    while (j != EMPTY)
+		    {
+			/* ------------------------------------------------- */
+			/* check if j and i have identical nonzero pattern */
+			/* ------------------------------------------------- */
+
+			AMD_DEBUG3 (("compare i "ID" and j "ID"\n", i,j)) ;
+
+			/* check if i and j have the same Len and Elen */
+			ASSERT (Len [j] >= 0 && Elen [j] >= 0) ;
+			ASSERT (Pe [j] >= 0 && Pe [j] < iwlen) ;
+			ok = (Len [j] == ln) && (Elen [j] == eln) ;
+			/* skip the first element in the list (me) */
+			for (p = Pe [j] + 1 ; ok && p <= Pe [j] + ln - 1 ; p++)
+			{
+			    ASSERT (Iw [p] >= 0 && Iw [p] < n) ;
+			    if (W [Iw [p]] != wflg) ok = 0 ;
+			}
+			if (ok)
+			{
+			    /* --------------------------------------------- */
+			    /* found it!  j can be absorbed into i */
+			    /* --------------------------------------------- */
+
+			    AMD_DEBUG1 (("found it! j "ID" => i "ID"\n", j,i));
+			    Pe [j] = FLIP (i) ;
+			    /* both Nv [i] and Nv [j] are negated since they */
+			    /* are in Lme, and the absolute values of each */
+			    /* are the number of variables in i and j: */
+			    Nv [i] += Nv [j] ;
+			    Nv [j] = 0 ;
+			    Elen [j] = EMPTY ;
+			    /* delete j from hash bucket */
+			    ASSERT (j != Next [j]) ;
+			    j = Next [j] ;
+			    Next [jlast] = j ;
+
+			}
+			else
+			{
+			    /* j cannot be absorbed into i */
+			    jlast = j ;
+			    ASSERT (j != Next [j]) ;
+			    j = Next [j] ;
+			}
+			ASSERT (j >= EMPTY && j < n) ;
+		    }
+
+		    /* -----------------------------------------------------
+		     * no more variables can be absorbed into i
+		     * go to next i in bucket and clear flag array
+		     * ----------------------------------------------------- */
+
+		    wflg++ ;
+		    i = Next [i] ;
+		    ASSERT (i >= EMPTY && i < n) ;
+
+		}
+	    }
+	}
+	AMD_DEBUG2 (("detect done\n")) ;
+
+/* ========================================================================= */
+/* RESTORE DEGREE LISTS AND REMOVE NONPRINCIPAL SUPERVARIABLES FROM ELEMENT */
+/* ========================================================================= */
+
+	p = pme1 ;
+	nleft = n - nel ;
+	for (pme = pme1 ; pme <= pme2 ; pme++)
+	{
+	    i = Iw [pme] ;
+	    ASSERT (i >= 0 && i < n) ;
+	    nvi = -Nv [i] ;
+	    AMD_DEBUG3 (("Restore i "ID" "ID"\n", i, nvi)) ;
+	    if (nvi > 0)
+	    {
+		/* i is a principal variable in Lme */
+		/* restore Nv [i] to signify that i is principal */
+		Nv [i] = nvi ;
+
+		/* --------------------------------------------------------- */
+		/* compute the external degree (add size of current element) */
+		/* --------------------------------------------------------- */
+
+		deg = Degree [i] + degme - nvi ;
+		deg = MIN (deg, nleft - nvi) ;
+		ASSERT (IMPLIES (aggressive, deg > 0) && deg >= 0 && deg < n) ;
+
+		/* --------------------------------------------------------- */
+		/* place the supervariable at the head of the degree list */
+		/* --------------------------------------------------------- */
+
+		inext = Head [deg] ;
+		ASSERT (inext >= EMPTY && inext < n) ;
+		if (inext != EMPTY) Last [inext] = i ;
+		Next [i] = inext ;
+		Last [i] = EMPTY ;
+		Head [deg] = i ;
+
+		/* --------------------------------------------------------- */
+		/* save the new degree, and find the minimum degree */
+		/* --------------------------------------------------------- */
+
+		mindeg = MIN (mindeg, deg) ;
+		Degree [i] = deg ;
+
+		/* --------------------------------------------------------- */
+		/* place the supervariable in the element pattern */
+		/* --------------------------------------------------------- */
+
+		Iw [p++] = i ;
+
+	    }
+	}
+	AMD_DEBUG2 (("restore done\n")) ;
+
+/* ========================================================================= */
+/* FINALIZE THE NEW ELEMENT */
+/* ========================================================================= */
+
+	AMD_DEBUG2 (("ME = "ID" DONE\n", me)) ;
+	Nv [me] = nvpiv ;
+	/* save the length of the list for the new element me */
+	Len [me] = p - pme1 ;
+	if (Len [me] == 0)
+	{
+	    /* there is nothing left of the current pivot element */
+	    /* it is a root of the assembly tree */
+	    Pe [me] = EMPTY ;
+	    W [me] = 0 ;
+	}
+	if (elenme != 0)
+	{
+	    /* element was not constructed in place: deallocate part of */
+	    /* it since newly nonprincipal variables may have been removed */
+	    pfree = p ;
+	}
+
+	/* The new element has nvpiv pivots and the size of the contribution
+	 * block for a multifrontal method is degme-by-degme, not including
+	 * the "dense" rows/columns.  If the "dense" rows/columns are included,
+	 * the frontal matrix is no larger than
+	 * (degme+ndense)-by-(degme+ndense).
+	 */
+
+	if (Info != (double *) NULL)
+	{
+	    f = nvpiv ;
+	    r = degme + ndense ;
+	    dmax = MAX (dmax, f + r) ;
+
+	    /* number of nonzeros in L (excluding the diagonal) */
+	    lnzme = f*r + (f-1)*f/2 ;
+	    lnz += lnzme ;
+
+	    /* number of divide operations for LDL' and for LU */
+	    ndiv += lnzme ;
+
+	    /* number of multiply-subtract pairs for LU */
+	    s = f*r*r + r*(f-1)*f + (f-1)*f*(2*f-1)/6 ;
+	    nms_lu += s ;
+
+	    /* number of multiply-subtract pairs for LDL' */
+	    nms_ldl += (s + lnzme)/2 ;
+	}
+
+#ifndef NDEBUG
+	AMD_DEBUG2 (("finalize done nel "ID" n "ID"\n   ::::\n", nel, n)) ;
+	for (pme = Pe [me] ; pme <= Pe [me] + Len [me] - 1 ; pme++)
+	{
+	      AMD_DEBUG3 ((" "ID"", Iw [pme])) ;
+	}
+	AMD_DEBUG3 (("\n")) ;
+#endif
+
+    }
+
+/* ========================================================================= */
+/* DONE SELECTING PIVOTS */
+/* ========================================================================= */
+
+    if (Info != (double *) NULL)
+    {
+
+	/* count the work to factorize the ndense-by-ndense submatrix */
+	f = ndense ;
+	dmax = MAX (dmax, (double) ndense) ;
+
+	/* number of nonzeros in L (excluding the diagonal) */
+	lnzme = (f-1)*f/2 ;
+	lnz += lnzme ;
+
+	/* number of divide operations for LDL' and for LU */
+	ndiv += lnzme ;
+
+	/* number of multiply-subtract pairs for LU */
+	s = (f-1)*f*(2*f-1)/6 ;
+	nms_lu += s ;
+
+	/* number of multiply-subtract pairs for LDL' */
+	nms_ldl += (s + lnzme)/2 ;
+
+	/* number of nz's in L (excl. diagonal) */
+	Info [AMD_LNZ] = lnz ;
+
+	/* number of divide ops for LU and LDL' */
+	Info [AMD_NDIV] = ndiv ;
+
+	/* number of multiply-subtract pairs for LDL' */
+	Info [AMD_NMULTSUBS_LDL] = nms_ldl ;
+
+	/* number of multiply-subtract pairs for LU */
+	Info [AMD_NMULTSUBS_LU] = nms_lu ;
+
+	/* number of "dense" rows/columns */
+	Info [AMD_NDENSE] = ndense ;
+
+	/* largest front is dmax-by-dmax */
+	Info [AMD_DMAX] = dmax ;
+
+	/* number of garbage collections in AMD */
+	Info [AMD_NCMPA] = ncmpa ;
+
+	/* successful ordering */
+	Info [AMD_STATUS] = AMD_OK ;
+    }
+
+/* ========================================================================= */
+/* POST-ORDERING */
+/* ========================================================================= */
+
+/* -------------------------------------------------------------------------
+ * Variables at this point:
+ *
+ * Pe: holds the elimination tree.  The parent of j is FLIP (Pe [j]),
+ *	or EMPTY if j is a root.  The tree holds both elements and
+ *	non-principal (unordered) variables absorbed into them.
+ *	Dense variables are non-principal and unordered.
+ *
+ * Elen: holds the size of each element, including the diagonal part.
+ *	FLIP (Elen [e]) > 0 if e is an element.  For unordered
+ *	variables i, Elen [i] is EMPTY.
+ *
+ * Nv: Nv [e] > 0 is the number of pivots represented by the element e.
+ *	For unordered variables i, Nv [i] is zero.
+ *
+ * Contents no longer needed:
+ *	W, Iw, Len, Degree, Head, Next, Last.
+ *
+ * The matrix itself has been destroyed.
+ *
+ * n: the size of the matrix.
+ * No other scalars needed (pfree, iwlen, etc.)
+ * ------------------------------------------------------------------------- */
+
+    /* restore Pe */
+    for (i = 0 ; i < n ; i++)
+    {
+	Pe [i] = FLIP (Pe [i]) ;
+    }
+
+    /* restore Elen, for output information, and for postordering */
+    for (i = 0 ; i < n ; i++)
+    {
+	Elen [i] = FLIP (Elen [i]) ;
+    }
+
+/* Now the parent of j is Pe [j], or EMPTY if j is a root.  Elen [e] > 0
+ * is the size of element e.  Elen [i] is EMPTY for unordered variable i. */
+
+#ifndef NDEBUG
+    AMD_DEBUG2 (("\nTree:\n")) ;
+    for (i = 0 ; i < n ; i++)
+    {
+	AMD_DEBUG2 ((" "ID" parent: "ID"   ", i, Pe [i])) ;
+	ASSERT (Pe [i] >= EMPTY && Pe [i] < n) ;
+	if (Nv [i] > 0)
+	{
+	    /* this is an element */
+	    e = i ;
+	    AMD_DEBUG2 ((" element, size is "ID"\n", Elen [i])) ;
+	    ASSERT (Elen [e] > 0) ;
+	}
+	AMD_DEBUG2 (("\n")) ;
+    }
+    AMD_DEBUG2 (("\nelements:\n")) ;
+    for (e = 0 ; e < n ; e++)
+    {
+	if (Nv [e] > 0)
+	{
+	    AMD_DEBUG3 (("Element e= "ID" size "ID" nv "ID" \n", e,
+		Elen [e], Nv [e])) ;
+	}
+    }
+    AMD_DEBUG2 (("\nvariables:\n")) ;
+    for (i = 0 ; i < n ; i++)
+    {
+	Int cnt ;
+	if (Nv [i] == 0)
+	{
+	    AMD_DEBUG3 (("i unordered: "ID"\n", i)) ;
+	    j = Pe [i] ;
+	    cnt = 0 ;
+	    AMD_DEBUG3 (("  j: "ID"\n", j)) ;
+	    if (j == EMPTY)
+	    {
+		AMD_DEBUG3 (("	i is a dense variable\n")) ;
+	    }
+	    else
+	    {
+		ASSERT (j >= 0 && j < n) ;
+		while (Nv [j] == 0)
+		{
+		    AMD_DEBUG3 (("	j : "ID"\n", j)) ;
+		    j = Pe [j] ;
+		    AMD_DEBUG3 (("	j:: "ID"\n", j)) ;
+		    cnt++ ;
+		    if (cnt > n) break ;
+		}
+		e = j ;
+		AMD_DEBUG3 (("	got to e: "ID"\n", e)) ;
+	    }
+	}
+    }
+#endif
+
+/* ========================================================================= */
+/* compress the paths of the variables */
+/* ========================================================================= */
+
+    for (i = 0 ; i < n ; i++)
+    {
+	if (Nv [i] == 0)
+	{
+
+	    /* -------------------------------------------------------------
+	     * i is an un-ordered row.  Traverse the tree from i until
+	     * reaching an element, e.  The element, e, was the principal
+	     * supervariable of i and all nodes in the path from i to when e
+	     * was selected as pivot.
+	     * ------------------------------------------------------------- */
+
+	    AMD_DEBUG1 (("Path compression, i unordered: "ID"\n", i)) ;
+	    j = Pe [i] ;
+	    ASSERT (j >= EMPTY && j < n) ;
+	    AMD_DEBUG3 (("	j: "ID"\n", j)) ;
+	    if (j == EMPTY)
+	    {
+		/* Skip a dense variable.  It has no parent. */
+		AMD_DEBUG3 (("      i is a dense variable\n")) ;
+		continue ;
+	    }
+
+	    /* while (j is a variable) */
+	    while (Nv [j] == 0)
+	    {
+		AMD_DEBUG3 (("		j : "ID"\n", j)) ;
+		j = Pe [j] ;
+		AMD_DEBUG3 (("		j:: "ID"\n", j)) ;
+		ASSERT (j >= 0 && j < n) ;
+	    }
+	    /* got to an element e */
+	    e = j ;
+	    AMD_DEBUG3 (("got to e: "ID"\n", e)) ;
+
+	    /* -------------------------------------------------------------
+	     * traverse the path again from i to e, and compress the path
+	     * (all nodes point to e).  Path compression allows this code to
+	     * compute in O(n) time.
+	     * ------------------------------------------------------------- */
+
+	    j = i ;
+	    /* while (j is a variable) */
+	    while (Nv [j] == 0)
+	    {
+		jnext = Pe [j] ;
+		AMD_DEBUG3 (("j "ID" jnext "ID"\n", j, jnext)) ;
+		Pe [j] = e ;
+		j = jnext ;
+		ASSERT (j >= 0 && j < n) ;
+	    }
+	}
+    }
+
+/* ========================================================================= */
+/* postorder the assembly tree */
+/* ========================================================================= */
+
+    AMD_postorder (n, Pe, Nv, Elen,
+	W,			/* output order */
+	Head, Next, Last) ;	/* workspace */
+
+/* ========================================================================= */
+/* compute output permutation and inverse permutation */
+/* ========================================================================= */
+
+    /* W [e] = k means that element e is the kth element in the new
+     * order.  e is in the range 0 to n-1, and k is in the range 0 to
+     * the number of elements.  Use Head for inverse order. */
+
+    for (k = 0 ; k < n ; k++)
+    {
+	Head [k] = EMPTY ;
+	Next [k] = EMPTY ;
+    }
+    for (e = 0 ; e < n ; e++)
+    {
+	k = W [e] ;
+	ASSERT ((k == EMPTY) == (Nv [e] == 0)) ;
+	if (k != EMPTY)
+	{
+	    ASSERT (k >= 0 && k < n) ;
+	    Head [k] = e ;
+	}
+    }
+
+    /* construct output inverse permutation in Next,
+     * and permutation in Last */
+    nel = 0 ;
+    for (k = 0 ; k < n ; k++)
+    {
+	e = Head [k] ;
+	if (e == EMPTY) break ;
+	ASSERT (e >= 0 && e < n && Nv [e] > 0) ;
+	Next [e] = nel ;
+	nel += Nv [e] ;
+    }
+    ASSERT (nel == n - ndense) ;
+
+    /* order non-principal variables (dense, & those merged into supervar's) */
+    for (i = 0 ; i < n ; i++)
+    {
+	if (Nv [i] == 0)
+	{
+	    e = Pe [i] ;
+	    ASSERT (e >= EMPTY && e < n) ;
+	    if (e != EMPTY)
+	    {
+		/* This is an unordered variable that was merged
+		 * into element e via supernode detection or mass
+		 * elimination of i when e became the pivot element.
+		 * Place i in order just before e. */
+		ASSERT (Next [i] == EMPTY && Nv [e] > 0) ;
+		Next [i] = Next [e] ;
+		Next [e]++ ;
+	    }
+	    else
+	    {
+		/* This is a dense unordered variable, with no parent.
+		 * Place it last in the output order. */
+		Next [i] = nel++ ;
+	    }
+	}
+    }
+    ASSERT (nel == n) ;
+
+    AMD_DEBUG2 (("\n\nPerm:\n")) ;
+    for (i = 0 ; i < n ; i++)
+    {
+	k = Next [i] ;
+	ASSERT (k >= 0 && k < n) ;
+	Last [k] = i ;
+	AMD_DEBUG2 (("   perm ["ID"] = "ID"\n", k, i)) ;
+    }
+}
diff --git a/src/maths/KLU/amd_aat.c b/src/maths/KLU/amd_aat.c
new file mode 100644
index 000000000..4f02b755f
--- /dev/null
+++ b/src/maths/KLU/amd_aat.c
@@ -0,0 +1,185 @@
+/* ========================================================================= */
+/* === AMD_aat ============================================================= */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* AMD_aat:  compute the symmetry of the pattern of A, and count the number of
+ * nonzeros each column of A+A' (excluding the diagonal).  Assumes the input
+ * matrix has no errors, with sorted columns and no duplicates
+ * (AMD_valid (n, n, Ap, Ai) must be AMD_OK, but this condition is not
+ * checked).
+ */
+
+#include "amd_internal.h"
+
+GLOBAL size_t AMD_aat	/* returns nz in A+A' */
+(
+    Int n,
+    const Int Ap [ ],
+    const Int Ai [ ],
+    Int Len [ ],	/* Len [j]: length of column j of A+A', excl diagonal*/
+    Int Tp [ ],		/* workspace of size n */
+    double Info [ ]
+)
+{
+    Int p1, p2, p, i, j, pj, pj2, k, nzdiag, nzboth, nz ;
+    double sym ;
+    size_t nzaat ;
+
+#ifndef NDEBUG
+    AMD_debug_init ("AMD AAT") ;
+    for (k = 0 ; k < n ; k++) Tp [k] = EMPTY ;
+    ASSERT (AMD_valid (n, n, Ap, Ai) == AMD_OK) ;
+#endif
+
+    if (Info != (double *) NULL)
+    {
+	/* clear the Info array, if it exists */
+	for (i = 0 ; i < AMD_INFO ; i++)
+	{
+	    Info [i] = EMPTY ;
+	}
+	Info [AMD_STATUS] = AMD_OK ;
+    }
+
+    for (k = 0 ; k < n ; k++)
+    {
+	Len [k] = 0 ;
+    }
+
+    nzdiag = 0 ;
+    nzboth = 0 ;
+    nz = Ap [n] ;
+
+    for (k = 0 ; k < n ; k++)
+    {
+	p1 = Ap [k] ;
+	p2 = Ap [k+1] ;
+	AMD_DEBUG2 (("\nAAT Column: "ID" p1: "ID" p2: "ID"\n", k, p1, p2)) ;
+
+	/* construct A+A' */
+	for (p = p1 ; p < p2 ; )
+	{
+	    /* scan the upper triangular part of A */
+	    j = Ai [p] ;
+	    if (j < k)
+	    {
+		/* entry A (j,k) is in the strictly upper triangular part,
+		 * add both A (j,k) and A (k,j) to the matrix A+A' */
+		Len [j]++ ;
+		Len [k]++ ;
+		AMD_DEBUG3 (("    upper ("ID","ID") ("ID","ID")\n", j,k, k,j));
+		p++ ;
+	    }
+	    else if (j == k)
+	    {
+		/* skip the diagonal */
+		p++ ;
+		nzdiag++ ;
+		break ;
+	    }
+	    else /* j > k */
+	    {
+		/* first entry below the diagonal */
+		break ;
+	    }
+	    /* scan lower triangular part of A, in column j until reaching
+	     * row k.  Start where last scan left off. */
+	    ASSERT (Tp [j] != EMPTY) ;
+	    ASSERT (Ap [j] <= Tp [j] && Tp [j] <= Ap [j+1]) ;
+	    pj2 = Ap [j+1] ;
+	    for (pj = Tp [j] ; pj < pj2 ; )
+	    {
+		i = Ai [pj] ;
+		if (i < k)
+		{
+		    /* A (i,j) is only in the lower part, not in upper.
+		     * add both A (i,j) and A (j,i) to the matrix A+A' */
+		    Len [i]++ ;
+		    Len [j]++ ;
+		    AMD_DEBUG3 (("    lower ("ID","ID") ("ID","ID")\n",
+			i,j, j,i)) ;
+		    pj++ ;
+		}
+		else if (i == k)
+		{
+		    /* entry A (k,j) in lower part and A (j,k) in upper */
+		    pj++ ;
+		    nzboth++ ;
+		    break ;
+		}
+		else /* i > k */
+		{
+		    /* consider this entry later, when k advances to i */
+		    break ;
+		}
+	    }
+	    Tp [j] = pj ;
+	}
+	/* Tp [k] points to the entry just below the diagonal in column k */
+	Tp [k] = p ;
+    }
+
+    /* clean up, for remaining mismatched entries */
+    for (j = 0 ; j < n ; j++)
+    {
+	for (pj = Tp [j] ; pj < Ap [j+1] ; pj++)
+	{
+	    i = Ai [pj] ;
+	    /* A (i,j) is only in the lower part, not in upper.
+	     * add both A (i,j) and A (j,i) to the matrix A+A' */
+	    Len [i]++ ;
+	    Len [j]++ ;
+	    AMD_DEBUG3 (("    lower cleanup ("ID","ID") ("ID","ID")\n",
+		i,j, j,i)) ;
+	}
+    }
+
+    /* --------------------------------------------------------------------- */
+    /* compute the symmetry of the nonzero pattern of A */
+    /* --------------------------------------------------------------------- */
+
+    /* Given a matrix A, the symmetry of A is:
+     *	B = tril (spones (A), -1) + triu (spones (A), 1) ;
+     *  sym = nnz (B & B') / nnz (B) ;
+     *  or 1 if nnz (B) is zero.
+     */
+
+    if (nz == nzdiag)
+    {
+	sym = 1 ;
+    }
+    else
+    {
+	sym = (2 * (double) nzboth) / ((double) (nz - nzdiag)) ;
+    }
+
+    nzaat = 0 ;
+    for (k = 0 ; k < n ; k++)
+    {
+	nzaat += Len [k] ;
+    }
+
+    AMD_DEBUG1 (("AMD nz in A+A', excluding diagonal (nzaat) = %g\n",
+	(double) nzaat)) ;
+    AMD_DEBUG1 (("   nzboth: "ID" nz: "ID" nzdiag: "ID" symmetry: %g\n",
+		nzboth, nz, nzdiag, sym)) ;
+
+    if (Info != (double *) NULL)
+    {
+	Info [AMD_STATUS] = AMD_OK ;
+	Info [AMD_N] = n ;
+	Info [AMD_NZ] = nz ;
+	Info [AMD_SYMMETRY] = sym ;	    /* symmetry of pattern of A */
+	Info [AMD_NZDIAG] = nzdiag ;	    /* nonzeros on diagonal of A */
+	Info [AMD_NZ_A_PLUS_AT] = nzaat ;   /* nonzeros in A+A' */
+    }
+
+    return (nzaat) ;
+}
diff --git a/src/maths/KLU/amd_control.c b/src/maths/KLU/amd_control.c
new file mode 100644
index 000000000..c2aec9f0b
--- /dev/null
+++ b/src/maths/KLU/amd_control.c
@@ -0,0 +1,64 @@
+/* ========================================================================= */
+/* === AMD_control ========================================================= */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* User-callable.  Prints the control parameters for AMD.  See amd.h
+ * for details.  If the Control array is not present, the defaults are
+ * printed instead.
+ */
+
+#include "amd_internal.h"
+
+GLOBAL void AMD_control
+(
+    double Control [ ]
+)
+{
+    double alpha ;
+    Int aggressive ;
+
+    if (Control != (double *) NULL)
+    {
+	alpha = Control [AMD_DENSE] ;
+	aggressive = Control [AMD_AGGRESSIVE] != 0 ;
+    }
+    else
+    {
+	alpha = AMD_DEFAULT_DENSE ;
+	aggressive = AMD_DEFAULT_AGGRESSIVE ;
+    }
+
+    PRINTF (("\nAMD version %d.%d.%d, %s: approximate minimum degree ordering\n"
+	"    dense row parameter: %g\n", AMD_MAIN_VERSION, AMD_SUB_VERSION,
+	AMD_SUBSUB_VERSION, AMD_DATE, alpha)) ;
+
+    if (alpha < 0)
+    {
+	PRINTF (("    no rows treated as dense\n")) ;
+    }
+    else
+    {
+	PRINTF ((
+	"    (rows with more than max (%g * sqrt (n), 16) entries are\n"
+	"    considered \"dense\", and placed last in output permutation)\n",
+	alpha)) ;
+    }
+
+    if (aggressive)
+    {
+	PRINTF (("    aggressive absorption:  yes\n")) ;
+    }
+    else
+    {
+	PRINTF (("    aggressive absorption:  no\n")) ;
+    }
+
+    PRINTF (("    size of AMD integer: %d\n\n", sizeof (Int))) ;
+}
diff --git a/src/maths/KLU/amd_defaults.c b/src/maths/KLU/amd_defaults.c
new file mode 100644
index 000000000..ffe3f4bd5
--- /dev/null
+++ b/src/maths/KLU/amd_defaults.c
@@ -0,0 +1,38 @@
+/* ========================================================================= */
+/* === AMD_defaults ======================================================== */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* User-callable.  Sets default control parameters for AMD.  See amd.h
+ * for details.
+ */
+
+#include "amd_internal.h"
+
+/* ========================================================================= */
+/* === AMD defaults ======================================================== */
+/* ========================================================================= */
+
+GLOBAL void AMD_defaults
+(
+    double Control [ ]
+)
+{
+    Int i ;
+
+    if (Control != (double *) NULL)
+    {
+	for (i = 0 ; i < AMD_CONTROL ; i++)
+	{
+	    Control [i] = 0 ;
+	}
+	Control [AMD_DENSE] = AMD_DEFAULT_DENSE ;
+	Control [AMD_AGGRESSIVE] = AMD_DEFAULT_AGGRESSIVE ;
+    }
+}
diff --git a/src/maths/KLU/amd_dump.c b/src/maths/KLU/amd_dump.c
new file mode 100644
index 000000000..89d67b875
--- /dev/null
+++ b/src/maths/KLU/amd_dump.c
@@ -0,0 +1,180 @@
+/* ========================================================================= */
+/* === AMD_dump ============================================================ */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* Debugging routines for AMD.  Not used if NDEBUG is not defined at compile-
+ * time (the default).  See comments in amd_internal.h on how to enable
+ * debugging.  Not user-callable.
+ */
+
+#include "amd_internal.h"
+
+#ifndef NDEBUG
+
+/* This global variable is present only when debugging */
+GLOBAL Int AMD_debug = -999 ;		/* default is no debug printing */
+
+/* ========================================================================= */
+/* === AMD_debug_init ====================================================== */
+/* ========================================================================= */
+
+/* Sets the debug print level, by reading the file debug.amd (if it exists) */
+
+GLOBAL void AMD_debug_init ( char *s )
+{
+    FILE *f ;
+    f = fopen ("debug.amd", "r") ;
+    if (f == (FILE *) NULL)
+    {
+	AMD_debug = -999 ;
+    }
+    else
+    {
+	fscanf (f, ID, &AMD_debug) ;
+	fclose (f) ;
+    }
+    if (AMD_debug >= 0)
+    {
+	printf ("%s: AMD_debug_init, D= "ID"\n", s, AMD_debug) ;
+    }
+}
+
+/* ========================================================================= */
+/* === AMD_dump ============================================================ */
+/* ========================================================================= */
+
+/* Dump AMD's data structure, except for the hash buckets.  This routine
+ * cannot be called when the hash buckets are non-empty.
+ */
+
+GLOBAL void AMD_dump (
+    Int n,	    /* A is n-by-n */
+    Int Pe [ ],	    /* pe [0..n-1]: index in iw of start of row i */
+    Int Iw [ ],	    /* workspace of size iwlen, iwlen [0..pfree-1]
+		     * holds the matrix on input */
+    Int Len [ ],    /* len [0..n-1]: length for row i */
+    Int iwlen,	    /* length of iw */
+    Int pfree,	    /* iw [pfree ... iwlen-1] is empty on input */
+    Int Nv [ ],	    /* nv [0..n-1] */
+    Int Next [ ],   /* next [0..n-1] */
+    Int Last [ ],   /* last [0..n-1] */
+    Int Head [ ],   /* head [0..n-1] */
+    Int Elen [ ],   /* size n */
+    Int Degree [ ], /* size n */
+    Int W [ ],	    /* size n */
+    Int nel
+)
+{
+    Int i, pe, elen, nv, len, e, p, k, j, deg, w, cnt, ilast ;
+
+    if (AMD_debug < 0) return ;
+    ASSERT (pfree <= iwlen) ;
+    AMD_DEBUG3 (("\nAMD dump, pfree: "ID"\n", pfree)) ;
+    for (i = 0 ; i < n ; i++)
+    {
+	pe = Pe [i] ;
+	elen = Elen [i] ;
+	nv = Nv [i] ;
+	len = Len [i] ;
+	w = W [i] ;
+
+	if (elen >= EMPTY)
+	{
+	    if (nv == 0)
+	    {
+		AMD_DEBUG3 (("\nI "ID": nonprincipal:    ", i)) ;
+		ASSERT (elen == EMPTY) ;
+		if (pe == EMPTY)
+		{
+		    AMD_DEBUG3 ((" dense node\n")) ;
+		    ASSERT (w == 1) ;
+		}
+		else
+		{
+		    ASSERT (pe < EMPTY) ;
+		    AMD_DEBUG3 ((" i "ID" -> parent "ID"\n", i, FLIP (Pe[i])));
+		}
+	    }
+	    else
+	    {
+		AMD_DEBUG3 (("\nI "ID": active principal supervariable:\n",i));
+		AMD_DEBUG3 (("   nv(i): "ID"  Flag: %d\n", nv, (nv < 0))) ;
+		ASSERT (elen >= 0) ;
+		ASSERT (nv > 0 && pe >= 0) ;
+		p = pe ;
+		AMD_DEBUG3 (("   e/s: ")) ;
+		if (elen == 0) AMD_DEBUG3 ((" : ")) ;
+		ASSERT (pe + len <= pfree) ;
+		for (k = 0 ; k < len ; k++)
+		{
+		    j = Iw [p] ;
+		    AMD_DEBUG3 (("  "ID"", j)) ;
+		    ASSERT (j >= 0 && j < n) ;
+		    if (k == elen-1) AMD_DEBUG3 ((" : ")) ;
+		    p++ ;
+		}
+		AMD_DEBUG3 (("\n")) ;
+	    }
+	}
+	else
+	{
+	    e = i ;
+	    if (w == 0)
+	    {
+		AMD_DEBUG3 (("\nE "ID": absorbed element: w "ID"\n", e, w)) ;
+		ASSERT (nv > 0 && pe < 0) ;
+		AMD_DEBUG3 ((" e "ID" -> parent "ID"\n", e, FLIP (Pe [e]))) ;
+	    }
+	    else
+	    {
+		AMD_DEBUG3 (("\nE "ID": unabsorbed element: w "ID"\n", e, w)) ;
+		ASSERT (nv > 0 && pe >= 0) ;
+		p = pe ;
+		AMD_DEBUG3 ((" : ")) ;
+		ASSERT (pe + len <= pfree) ;
+		for (k = 0 ; k < len ; k++)
+		{
+		    j = Iw [p] ;
+		    AMD_DEBUG3 (("  "ID"", j)) ;
+		    ASSERT (j >= 0 && j < n) ;
+		    p++ ;
+		}
+		AMD_DEBUG3 (("\n")) ;
+	    }
+	}
+    }
+
+    /* this routine cannot be called when the hash buckets are non-empty */
+    AMD_DEBUG3 (("\nDegree lists:\n")) ;
+    if (nel >= 0)
+    {
+	cnt = 0 ;
+	for (deg = 0 ; deg < n ; deg++)
+	{
+	    if (Head [deg] == EMPTY) continue ;
+	    ilast = EMPTY ;
+	    AMD_DEBUG3 ((ID": \n", deg)) ;
+	    for (i = Head [deg] ; i != EMPTY ; i = Next [i])
+	    {
+		AMD_DEBUG3 (("   "ID" : next "ID" last "ID" deg "ID"\n",
+		    i, Next [i], Last [i], Degree [i])) ;
+		ASSERT (i >= 0 && i < n && ilast == Last [i] &&
+		    deg == Degree [i]) ;
+		cnt += Nv [i] ;
+		ilast = i ;
+	    }
+	    AMD_DEBUG3 (("\n")) ;
+	}
+	ASSERT (cnt == n - nel) ;
+    }
+
+}
+
+#endif
diff --git a/src/maths/KLU/amd_global.c b/src/maths/KLU/amd_global.c
new file mode 100644
index 000000000..93f2b4518
--- /dev/null
+++ b/src/maths/KLU/amd_global.c
@@ -0,0 +1,84 @@
+/* ========================================================================= */
+/* === amd_global ========================================================== */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+
+#ifdef MATLAB_MEX_FILE
+#include "mex.h"
+#include "matrix.h"
+#endif
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+/* ========================================================================= */
+/* === Default AMD memory manager ========================================== */
+/* ========================================================================= */
+
+/* The user can redefine these global pointers at run-time to change the memory
+ * manager used by AMD.  AMD only uses malloc and free; realloc and calloc are
+ * include for completeness, in case another package wants to use the same
+ * memory manager as AMD.
+ *
+ * If compiling as a MATLAB mexFunction, the default memory manager is mxMalloc.
+ * You can also compile AMD as a standard ANSI-C library and link a mexFunction
+ * against it, and then redefine these pointers at run-time, in your
+ * mexFunction.
+ *
+ * If -DNMALLOC is defined at compile-time, no memory manager is specified at
+ * compile-time.  You must then define these functions at run-time, before
+ * calling AMD, for AMD to work properly.
+ */
+
+#ifndef NMALLOC
+#ifdef MATLAB_MEX_FILE
+/* MATLAB mexFunction: */
+void *(*amd_malloc) (size_t) = mxMalloc ;
+void (*amd_free) (void *) = mxFree ;
+void *(*amd_realloc) (void *, size_t) = mxRealloc ;
+void *(*amd_calloc) (size_t, size_t) = mxCalloc ;
+#else
+/* standard ANSI-C: */
+void *(*amd_malloc) (size_t) = malloc ;
+void (*amd_free) (void *) = free ;
+void *(*amd_realloc) (void *, size_t) = realloc ;
+void *(*amd_calloc) (size_t, size_t) = calloc ;
+#endif
+#else
+/* no memory manager defined at compile-time; you MUST define one at run-time */
+void *(*amd_malloc) (size_t) = NULL ;
+void (*amd_free) (void *) = NULL ;
+void *(*amd_realloc) (void *, size_t) = NULL ;
+void *(*amd_calloc) (size_t, size_t) = NULL ;
+#endif
+
+/* ========================================================================= */
+/* === Default AMD printf routine ========================================== */
+/* ========================================================================= */
+
+/* The user can redefine this global pointer at run-time to change the printf
+ * routine used by AMD.  If NULL, no printing occurs.  
+ *
+ * If -DNPRINT is defined at compile-time, stdio.h is not included.  Printing
+ * can then be enabled at run-time by setting amd_printf to a non-NULL function.
+ */
+
+#ifndef NPRINT
+#ifdef MATLAB_MEX_FILE
+int (*amd_printf) (const char *, ...) = mexPrintf ;
+#else
+#include <stdio.h>
+int (*amd_printf) (const char *, ...) = printf ;
+#endif
+#else
+int (*amd_printf) (const char *, ...) = NULL ;
+#endif
diff --git a/src/maths/KLU/amd_info.c b/src/maths/KLU/amd_info.c
new file mode 100644
index 000000000..0a842adf0
--- /dev/null
+++ b/src/maths/KLU/amd_info.c
@@ -0,0 +1,120 @@
+/* ========================================================================= */
+/* === AMD_info ============================================================ */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* User-callable.  Prints the output statistics for AMD.  See amd.h
+ * for details.  If the Info array is not present, nothing is printed.
+ */
+
+#include "amd_internal.h"
+
+#define PRI(format,x) { if (x >= 0) { PRINTF ((format, x)) ; }}
+
+GLOBAL void AMD_info
+(
+    double Info [ ]
+)
+{
+    double n, ndiv, nmultsubs_ldl, nmultsubs_lu, lnz, lnzd ;
+
+    PRINTF (("\nAMD version %d.%d.%d, %s, results:\n",
+	AMD_MAIN_VERSION, AMD_SUB_VERSION, AMD_SUBSUB_VERSION, AMD_DATE)) ;
+
+    if (!Info)
+    {
+	return ;
+    }
+
+    n = Info [AMD_N] ;
+    ndiv = Info [AMD_NDIV] ;
+    nmultsubs_ldl = Info [AMD_NMULTSUBS_LDL] ;
+    nmultsubs_lu = Info [AMD_NMULTSUBS_LU] ;
+    lnz = Info [AMD_LNZ] ;
+    lnzd = (n >= 0 && lnz >= 0) ? (n + lnz) : (-1) ;
+
+    /* AMD return status */
+    PRINTF (("    status: ")) ;
+    if (Info [AMD_STATUS] == AMD_OK)
+    {
+	PRINTF (("OK\n")) ;
+    }
+    else if (Info [AMD_STATUS] == AMD_OUT_OF_MEMORY)
+    {
+	PRINTF (("out of memory\n")) ;
+    }
+    else if (Info [AMD_STATUS] == AMD_INVALID)
+    {
+	PRINTF (("invalid matrix\n")) ;
+    }
+    else if (Info [AMD_STATUS] == AMD_OK_BUT_JUMBLED)
+    {
+	PRINTF (("OK, but jumbled\n")) ;
+    }
+    else
+    {
+	PRINTF (("unknown\n")) ;
+    }
+
+    /* statistics about the input matrix */
+    PRI ("    n, dimension of A:                                  %.20g\n", n);
+    PRI ("    nz, number of nonzeros in A:                        %.20g\n",
+	Info [AMD_NZ]) ;
+    PRI ("    symmetry of A:                                      %.4f\n",
+	Info [AMD_SYMMETRY]) ;
+    PRI ("    number of nonzeros on diagonal:                     %.20g\n",
+	Info [AMD_NZDIAG]) ;
+    PRI ("    nonzeros in pattern of A+A' (excl. diagonal):       %.20g\n",
+	Info [AMD_NZ_A_PLUS_AT]) ;
+    PRI ("    # dense rows/columns of A+A':                       %.20g\n",
+	Info [AMD_NDENSE]) ;
+
+    /* statistics about AMD's behavior  */
+    PRI ("    memory used, in bytes:                              %.20g\n",
+	Info [AMD_MEMORY]) ;
+    PRI ("    # of memory compactions:                            %.20g\n",
+	Info [AMD_NCMPA]) ;
+
+    /* statistics about the ordering quality */
+    PRINTF (("\n"
+	"    The following approximate statistics are for a subsequent\n"
+	"    factorization of A(P,P) + A(P,P)'.  They are slight upper\n"
+	"    bounds if there are no dense rows/columns in A+A', and become\n"
+	"    looser if dense rows/columns exist.\n\n")) ;
+
+    PRI ("    nonzeros in L (excluding diagonal):                 %.20g\n",
+	lnz) ;
+    PRI ("    nonzeros in L (including diagonal):                 %.20g\n",
+	lnzd) ;
+    PRI ("    # divide operations for LDL' or LU:                 %.20g\n",
+	ndiv) ;
+    PRI ("    # multiply-subtract operations for LDL':            %.20g\n",
+	nmultsubs_ldl) ;
+    PRI ("    # multiply-subtract operations for LU:              %.20g\n",
+	nmultsubs_lu) ;
+    PRI ("    max nz. in any column of L (incl. diagonal):        %.20g\n",
+	Info [AMD_DMAX]) ;
+
+    /* total flop counts for various factorizations */
+
+    if (n >= 0 && ndiv >= 0 && nmultsubs_ldl >= 0 && nmultsubs_lu >= 0)
+    {
+	PRINTF (("\n"
+	"    chol flop count for real A, sqrt counted as 1 flop: %.20g\n"
+	"    LDL' flop count for real A:                         %.20g\n"
+	"    LDL' flop count for complex A:                      %.20g\n"
+	"    LU flop count for real A (with no pivoting):        %.20g\n"
+	"    LU flop count for complex A (with no pivoting):     %.20g\n\n",
+	n + ndiv + 2*nmultsubs_ldl,
+	    ndiv + 2*nmultsubs_ldl,
+	  9*ndiv + 8*nmultsubs_ldl,
+	    ndiv + 2*nmultsubs_lu,
+	  9*ndiv + 8*nmultsubs_lu)) ;
+    }
+}
diff --git a/src/maths/KLU/amd_internal.h b/src/maths/KLU/amd_internal.h
new file mode 100644
index 000000000..750aee82b
--- /dev/null
+++ b/src/maths/KLU/amd_internal.h
@@ -0,0 +1,350 @@
+/* ========================================================================= */
+/* === amd_internal.h ====================================================== */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,                                      */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* This file is for internal use in AMD itself, and does not normally need to
+ * be included in user code (it is included in UMFPACK, however).   All others
+ * should use amd.h instead.
+ *
+ * The following compile-time definitions affect how AMD is compiled.
+ *
+ *      -DNPRINT
+ *
+ *          Disable all printing.  stdio.h will not be included.  Printing can
+ *          be re-enabled at run-time by setting the global pointer amd_printf
+ *          to printf (or mexPrintf for a MATLAB mexFunction).
+ *
+ *      -DNMALLOC
+ *
+ *          No memory manager is defined at compile-time.  You MUST define the
+ *          function pointers amd_malloc, amd_free, amd_realloc, and
+ *          amd_calloc at run-time for AMD to work properly.
+ */
+
+/* ========================================================================= */
+/* === NDEBUG ============================================================== */
+/* ========================================================================= */
+
+/*
+ * Turning on debugging takes some work (see below).   If you do not edit this
+ * file, then debugging is always turned off, regardless of whether or not
+ * -DNDEBUG is specified in your compiler options.
+ *
+ * If AMD is being compiled as a mexFunction, then MATLAB_MEX_FILE is defined,
+ * and mxAssert is used instead of assert.  If debugging is not enabled, no
+ * MATLAB include files or functions are used.  Thus, the AMD library libamd.a
+ * can be safely used in either a stand-alone C program or in another
+ * mexFunction, without any change.
+ */
+
+/*
+    AMD will be exceedingly slow when running in debug mode.  The next three
+    lines ensure that debugging is turned off.
+*/
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+
+/*
+    To enable debugging, uncomment the following line:
+#undef NDEBUG
+*/
+
+/* ------------------------------------------------------------------------- */
+/* ANSI include files */
+/* ------------------------------------------------------------------------- */
+
+/* from stdlib.h:  size_t, malloc, free, realloc, and calloc */
+#include <stdlib.h>
+
+#if !defined(NPRINT) || !defined(NDEBUG)
+/* from stdio.h:  printf.  Not included if NPRINT is defined at compile time.
+ * fopen and fscanf are used when debugging. */
+#include <stdio.h>
+#endif
+
+/* from limits.h:  INT_MAX and LONG_MAX */
+#include <limits.h>
+
+/* from math.h: sqrt */
+#include <math.h>
+
+/* ------------------------------------------------------------------------- */
+/* MATLAB include files (only if being used in or via MATLAB) */
+/* ------------------------------------------------------------------------- */
+
+#ifdef MATLAB_MEX_FILE
+#include "matrix.h"
+#include "mex.h"
+#endif
+
+/* ------------------------------------------------------------------------- */
+/* basic definitions */
+/* ------------------------------------------------------------------------- */
+
+#ifdef FLIP
+#undef FLIP
+#endif
+
+#ifdef MAX
+#undef MAX
+#endif
+
+#ifdef MIN
+#undef MIN
+#endif
+
+#ifdef EMPTY
+#undef EMPTY
+#endif
+
+#ifdef GLOBAL
+#undef GLOBAL
+#endif
+
+#ifdef PRIVATE
+#undef PRIVATE
+#endif
+
+/* FLIP is a "negation about -1", and is used to mark an integer i that is
+ * normally non-negative.  FLIP (EMPTY) is EMPTY.  FLIP of a number > EMPTY
+ * is negative, and FLIP of a number < EMTPY is positive.  FLIP (FLIP (i)) = i
+ * for all integers i.  UNFLIP (i) is >= EMPTY. */
+#define EMPTY (-1)
+#define FLIP(i) (-(i)-2)
+#define UNFLIP(i) ((i < EMPTY) ? FLIP (i) : (i))
+
+/* for integer MAX/MIN, or for doubles when we don't care how NaN's behave: */
+#define MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+/* logical expression of p implies q: */
+#define IMPLIES(p,q) (!(p) || (q))
+
+/* Note that the IBM RS 6000 xlc predefines TRUE and FALSE in <types.h>. */
+/* The Compaq Alpha also predefines TRUE and FALSE. */
+#ifdef TRUE
+#undef TRUE
+#endif
+#ifdef FALSE
+#undef FALSE
+#endif
+
+#define TRUE (1)
+#define FALSE (0)
+#define PRIVATE static
+#define GLOBAL
+#define EMPTY (-1)
+
+/* Note that Linux's gcc 2.96 defines NULL as ((void *) 0), but other */
+/* compilers (even gcc 2.95.2 on Solaris) define NULL as 0 or (0).  We */
+/* need to use the ANSI standard value of 0. */
+#ifdef NULL
+#undef NULL
+#endif
+
+#define NULL 0
+
+/* largest value of size_t */
+#ifndef SIZE_T_MAX
+#define SIZE_T_MAX ((size_t) (-1))
+#endif
+
+/* ------------------------------------------------------------------------- */
+/* integer type for AMD: int or UF_long */
+/* ------------------------------------------------------------------------- */
+
+/* define UF_long */
+#include "UFconfig.h"
+
+#if defined (DLONG) || defined (ZLONG)
+
+#define Int UF_long
+#define ID  UF_long_id
+#define Int_MAX UF_long_max
+
+#define AMD_order amd_l_order
+#define AMD_defaults amd_l_defaults
+#define AMD_control amd_l_control
+#define AMD_info amd_l_info
+#define AMD_1 amd_l1
+#define AMD_2 amd_l2
+#define AMD_valid amd_l_valid
+#define AMD_aat amd_l_aat
+#define AMD_postorder amd_l_postorder
+#define AMD_post_tree amd_l_post_tree
+#define AMD_dump amd_l_dump
+#define AMD_debug amd_l_debug
+#define AMD_debug_init amd_l_debug_init
+#define AMD_preprocess amd_l_preprocess
+
+#else
+
+#define Int int
+#define ID "%d"
+#define Int_MAX INT_MAX
+
+#define AMD_order amd_order
+#define AMD_defaults amd_defaults
+#define AMD_control amd_control
+#define AMD_info amd_info
+#define AMD_1 amd_1
+#define AMD_2 amd_2
+#define AMD_valid amd_valid
+#define AMD_aat amd_aat
+#define AMD_postorder amd_postorder
+#define AMD_post_tree amd_post_tree
+#define AMD_dump amd_dump
+#define AMD_debug amd_debug
+#define AMD_debug_init amd_debug_init
+#define AMD_preprocess amd_preprocess
+
+#endif
+
+/* ========================================================================= */
+/* === PRINTF macro ======================================================== */
+/* ========================================================================= */
+
+/* All output goes through the PRINTF macro.  */
+#define PRINTF(params) { if (amd_printf != NULL) (void) amd_printf params ; }
+
+/* ------------------------------------------------------------------------- */
+/* AMD routine definitions (user-callable) */
+/* ------------------------------------------------------------------------- */
+
+#include "amd.h"
+
+/* ------------------------------------------------------------------------- */
+/* AMD routine definitions (not user-callable) */
+/* ------------------------------------------------------------------------- */
+
+GLOBAL size_t AMD_aat
+(
+    Int n,
+    const Int Ap [ ],
+    const Int Ai [ ],
+    Int Len [ ],
+    Int Tp [ ],
+    double Info [ ]
+) ;
+
+GLOBAL void AMD_1
+(
+    Int n,
+    const Int Ap [ ],
+    const Int Ai [ ],
+    Int P [ ],
+    Int Pinv [ ],
+    Int Len [ ],
+    Int slen,
+    Int S [ ],
+    double Control [ ],
+    double Info [ ]
+) ;
+
+GLOBAL void AMD_postorder
+(
+    Int nn,
+    Int Parent [ ],
+    Int Npiv [ ],
+    Int Fsize [ ],
+    Int Order [ ],
+    Int Child [ ],
+    Int Sibling [ ],
+    Int Stack [ ]
+) ;
+
+GLOBAL Int AMD_post_tree
+(
+    Int root,
+    Int k,
+    Int Child [ ],
+    const Int Sibling [ ],
+    Int Order [ ],
+    Int Stack [ ]
+#ifndef NDEBUG
+    , Int nn
+#endif
+) ;
+
+GLOBAL void AMD_preprocess
+(
+    Int n,
+    const Int Ap [ ],
+    const Int Ai [ ],
+    Int Rp [ ],
+    Int Ri [ ],
+    Int W [ ],
+    Int Flag [ ]
+) ;
+
+/* ------------------------------------------------------------------------- */
+/* debugging definitions */
+/* ------------------------------------------------------------------------- */
+
+#ifndef NDEBUG
+
+/* from assert.h:  assert macro */
+#include <assert.h>
+
+#ifndef EXTERN
+#define EXTERN extern
+#endif
+
+EXTERN Int AMD_debug ;
+
+GLOBAL void AMD_debug_init ( char *s ) ;
+
+GLOBAL void AMD_dump
+(
+    Int n,
+    Int Pe [ ],
+    Int Iw [ ],
+    Int Len [ ],
+    Int iwlen,
+    Int pfree,
+    Int Nv [ ],
+    Int Next [ ],
+    Int Last [ ],
+    Int Head [ ],
+    Int Elen [ ],
+    Int Degree [ ],
+    Int W [ ],
+    Int nel
+) ;
+
+#ifdef ASSERT
+#undef ASSERT
+#endif
+
+/* Use mxAssert if AMD is compiled into a mexFunction */
+#ifdef MATLAB_MEX_FILE
+#define ASSERT(expression) (mxAssert ((expression), ""))
+#else
+#define ASSERT(expression) (assert (expression))
+#endif
+
+#define AMD_DEBUG0(params) { PRINTF (params) ; }
+#define AMD_DEBUG1(params) { if (AMD_debug >= 1) PRINTF (params) ; }
+#define AMD_DEBUG2(params) { if (AMD_debug >= 2) PRINTF (params) ; }
+#define AMD_DEBUG3(params) { if (AMD_debug >= 3) PRINTF (params) ; }
+#define AMD_DEBUG4(params) { if (AMD_debug >= 4) PRINTF (params) ; }
+
+#else
+
+/* no debugging */
+#define ASSERT(expression)
+#define AMD_DEBUG0(params)
+#define AMD_DEBUG1(params)
+#define AMD_DEBUG2(params)
+#define AMD_DEBUG3(params)
+#define AMD_DEBUG4(params)
+
+#endif
diff --git a/src/maths/KLU/amd_order.c b/src/maths/KLU/amd_order.c
new file mode 100644
index 000000000..d3f6853d1
--- /dev/null
+++ b/src/maths/KLU/amd_order.c
@@ -0,0 +1,200 @@
+/* ========================================================================= */
+/* === AMD_order =========================================================== */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* User-callable AMD minimum degree ordering routine.  See amd.h for
+ * documentation.
+ */
+
+#include "amd_internal.h"
+
+/* ========================================================================= */
+/* === AMD_order =========================================================== */
+/* ========================================================================= */
+
+GLOBAL Int AMD_order
+(
+    Int n,
+    const Int Ap [ ],
+    const Int Ai [ ],
+    Int P [ ],
+    double Control [ ],
+    double Info [ ]
+)
+{
+    Int *Len, *S, nz, i, *Pinv, info, status, *Rp, *Ri, *Cp, *Ci, ok ;
+    size_t nzaat, slen ;
+    double mem = 0 ;
+
+#ifndef NDEBUG
+    AMD_debug_init ("amd") ;
+#endif
+
+    /* clear the Info array, if it exists */
+    info = Info != (double *) NULL ;
+    if (info)
+    {
+	for (i = 0 ; i < AMD_INFO ; i++)
+	{
+	    Info [i] = EMPTY ;
+	}
+	Info [AMD_N] = n ;
+	Info [AMD_STATUS] = AMD_OK ;
+    }
+
+    /* make sure inputs exist and n is >= 0 */
+    if (Ai == (Int *) NULL || Ap == (Int *) NULL || P == (Int *) NULL || n < 0)
+    {
+	if (info) Info [AMD_STATUS] = AMD_INVALID ;
+	return (AMD_INVALID) ;	    /* arguments are invalid */
+    }
+
+    if (n == 0)
+    {
+	return (AMD_OK) ;	    /* n is 0 so there's nothing to do */
+    }
+
+    nz = Ap [n] ;
+    if (info)
+    {
+	Info [AMD_NZ] = nz ;
+    }
+    if (nz < 0)
+    {
+	if (info) Info [AMD_STATUS] = AMD_INVALID ;
+	return (AMD_INVALID) ;
+    }
+
+    /* check if n or nz will cause size_t overflow */
+    if (((size_t) n) >= SIZE_T_MAX / sizeof (Int)
+     || ((size_t) nz) >= SIZE_T_MAX / sizeof (Int))
+    {
+	if (info) Info [AMD_STATUS] = AMD_OUT_OF_MEMORY ;
+	return (AMD_OUT_OF_MEMORY) ;	    /* problem too large */
+    }
+
+    /* check the input matrix:	AMD_OK, AMD_INVALID, or AMD_OK_BUT_JUMBLED */
+    status = AMD_valid (n, n, Ap, Ai) ;
+
+    if (status == AMD_INVALID)
+    {
+	if (info) Info [AMD_STATUS] = AMD_INVALID ;
+	return (AMD_INVALID) ;	    /* matrix is invalid */
+    }
+
+    /* allocate two size-n integer workspaces */
+    Len = amd_malloc (n * sizeof (Int)) ;
+    Pinv = amd_malloc (n * sizeof (Int)) ;
+    mem += n ;
+    mem += n ;
+    if (!Len || !Pinv)
+    {
+	/* :: out of memory :: */
+	amd_free (Len) ;
+	amd_free (Pinv) ;
+	if (info) Info [AMD_STATUS] = AMD_OUT_OF_MEMORY ;
+	return (AMD_OUT_OF_MEMORY) ;
+    }
+
+    if (status == AMD_OK_BUT_JUMBLED)
+    {
+	/* sort the input matrix and remove duplicate entries */
+	AMD_DEBUG1 (("Matrix is jumbled\n")) ;
+	Rp = amd_malloc ((n+1) * sizeof (Int)) ;
+	Ri = amd_malloc (MAX (nz,1) * sizeof (Int)) ;
+	mem += (n+1) ;
+	mem += MAX (nz,1) ;
+	if (!Rp || !Ri)
+	{
+	    /* :: out of memory :: */
+	    amd_free (Rp) ;
+	    amd_free (Ri) ;
+	    amd_free (Len) ;
+	    amd_free (Pinv) ;
+	    if (info) Info [AMD_STATUS] = AMD_OUT_OF_MEMORY ;
+	    return (AMD_OUT_OF_MEMORY) ;
+	}
+	/* use Len and Pinv as workspace to create R = A' */
+	AMD_preprocess (n, Ap, Ai, Rp, Ri, Len, Pinv) ;
+	Cp = Rp ;
+	Ci = Ri ;
+    }
+    else
+    {
+	/* order the input matrix as-is.  No need to compute R = A' first */
+	Rp = NULL ;
+	Ri = NULL ;
+	Cp = (Int *) Ap ;
+	Ci = (Int *) Ai ;
+    }
+
+    /* --------------------------------------------------------------------- */
+    /* determine the symmetry and count off-diagonal nonzeros in A+A' */
+    /* --------------------------------------------------------------------- */
+
+    nzaat = AMD_aat (n, Cp, Ci, Len, P, Info) ;
+    AMD_DEBUG1 (("nzaat: %g\n", (double) nzaat)) ;
+    ASSERT ((MAX (nz-n, 0) <= nzaat) && (nzaat <= 2 * (size_t) nz)) ;
+
+    /* --------------------------------------------------------------------- */
+    /* allocate workspace for matrix, elbow room, and 6 size-n vectors */
+    /* --------------------------------------------------------------------- */
+
+    S = NULL ;
+    slen = nzaat ;			/* space for matrix */
+    ok = ((slen + nzaat/5) >= slen) ;	/* check for size_t overflow */
+    slen += nzaat/5 ;			/* add elbow room */
+    for (i = 0 ; ok && i < 7 ; i++)
+    {
+	ok = ((slen + n) > slen) ;	/* check for size_t overflow */
+	slen += n ;			/* size-n elbow room, 6 size-n work */
+    }
+    mem += slen ;
+    ok = ok && (slen < SIZE_T_MAX / sizeof (Int)) ; /* check for overflow */
+    ok = ok && (slen < Int_MAX) ;	/* S[i] for Int i must be OK */
+    if (ok)
+    {
+	S = amd_malloc (slen * sizeof (Int)) ;
+    }
+    AMD_DEBUG1 (("slen %g\n", (double) slen)) ;
+    if (!S)
+    {
+	/* :: out of memory :: (or problem too large) */
+	amd_free (Rp) ;
+	amd_free (Ri) ;
+	amd_free (Len) ;
+	amd_free (Pinv) ;
+	if (info) Info [AMD_STATUS] = AMD_OUT_OF_MEMORY ;
+	return (AMD_OUT_OF_MEMORY) ;
+    }
+    if (info)
+    {
+	/* memory usage, in bytes. */
+	Info [AMD_MEMORY] = mem * sizeof (Int) ;
+    }
+
+    /* --------------------------------------------------------------------- */
+    /* order the matrix */
+    /* --------------------------------------------------------------------- */
+
+    AMD_1 (n, Cp, Ci, P, Pinv, Len, slen, S, Control, Info) ;
+
+    /* --------------------------------------------------------------------- */
+    /* free the workspace */
+    /* --------------------------------------------------------------------- */
+
+    amd_free (Rp) ;
+    amd_free (Ri) ;
+    amd_free (Len) ;
+    amd_free (Pinv) ;
+    amd_free (S) ;
+    if (info) Info [AMD_STATUS] = status ;
+    return (status) ;	    /* successful ordering */
+}
diff --git a/src/maths/KLU/amd_post_tree.c b/src/maths/KLU/amd_post_tree.c
new file mode 100644
index 000000000..b4e063d52
--- /dev/null
+++ b/src/maths/KLU/amd_post_tree.c
@@ -0,0 +1,121 @@
+/* ========================================================================= */
+/* === AMD_post_tree ======================================================= */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* Post-ordering of a supernodal elimination tree.  */
+
+#include "amd_internal.h"
+
+GLOBAL Int AMD_post_tree
+(
+    Int root,			/* root of the tree */
+    Int k,			/* start numbering at k */
+    Int Child [ ],		/* input argument of size nn, undefined on
+				 * output.  Child [i] is the head of a link
+				 * list of all nodes that are children of node
+				 * i in the tree. */
+    const Int Sibling [ ],	/* input argument of size nn, not modified.
+				 * If f is a node in the link list of the
+				 * children of node i, then Sibling [f] is the
+				 * next child of node i.
+				 */
+    Int Order [ ],		/* output order, of size nn.  Order [i] = k
+				 * if node i is the kth node of the reordered
+				 * tree. */
+    Int Stack [ ]		/* workspace of size nn */
+#ifndef NDEBUG
+    , Int nn			/* nodes are in the range 0..nn-1. */
+#endif
+)
+{
+    Int f, head, h, i ;
+
+#if 0
+    /* --------------------------------------------------------------------- */
+    /* recursive version (Stack [ ] is not used): */
+    /* --------------------------------------------------------------------- */
+
+    /* this is simple, but can caouse stack overflow if nn is large */
+    i = root ;
+    for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
+    {
+	k = AMD_post_tree (f, k, Child, Sibling, Order, Stack, nn) ;
+    }
+    Order [i] = k++ ;
+    return (k) ;
+#endif
+
+    /* --------------------------------------------------------------------- */
+    /* non-recursive version, using an explicit stack */
+    /* --------------------------------------------------------------------- */
+
+    /* push root on the stack */
+    head = 0 ;
+    Stack [0] = root ;
+
+    while (head >= 0)
+    {
+	/* get head of stack */
+	ASSERT (head < nn) ;
+	i = Stack [head] ;
+	AMD_DEBUG1 (("head of stack "ID" \n", i)) ;
+	ASSERT (i >= 0 && i < nn) ;
+
+	if (Child [i] != EMPTY)
+	{
+	    /* the children of i are not yet ordered */
+	    /* push each child onto the stack in reverse order */
+	    /* so that small ones at the head of the list get popped first */
+	    /* and the biggest one at the end of the list gets popped last */
+	    for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
+	    {
+		head++ ;
+		ASSERT (head < nn) ;
+		ASSERT (f >= 0 && f < nn) ;
+	    }
+	    h = head ;
+	    ASSERT (head < nn) ;
+	    for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
+	    {
+		ASSERT (h > 0) ;
+		Stack [h--] = f ;
+		AMD_DEBUG1 (("push "ID" on stack\n", f)) ;
+		ASSERT (f >= 0 && f < nn) ;
+	    }
+	    ASSERT (Stack [h] == i) ;
+
+	    /* delete child list so that i gets ordered next time we see it */
+	    Child [i] = EMPTY ;
+	}
+	else
+	{
+	    /* the children of i (if there were any) are already ordered */
+	    /* remove i from the stack and order it.  Front i is kth front */
+	    head-- ;
+	    AMD_DEBUG1 (("pop "ID" order "ID"\n", i, k)) ;
+	    Order [i] = k++ ;
+	    ASSERT (k <= nn) ;
+	}
+
+#ifndef NDEBUG
+	AMD_DEBUG1 (("\nStack:")) ;
+	for (h = head ; h >= 0 ; h--)
+	{
+	    Int j = Stack [h] ;
+	    AMD_DEBUG1 ((" "ID, j)) ;
+	    ASSERT (j >= 0 && j < nn) ;
+	}
+	AMD_DEBUG1 (("\n\n")) ;
+	ASSERT (head < nn) ;
+#endif
+
+    }
+    return (k) ;
+}
diff --git a/src/maths/KLU/amd_postorder.c b/src/maths/KLU/amd_postorder.c
new file mode 100644
index 000000000..4adcea3c0
--- /dev/null
+++ b/src/maths/KLU/amd_postorder.c
@@ -0,0 +1,207 @@
+/* ========================================================================= */
+/* === AMD_postorder ======================================================= */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* Perform a postordering (via depth-first search) of an assembly tree. */
+
+#include "amd_internal.h"
+
+GLOBAL void AMD_postorder
+(
+    /* inputs, not modified on output: */
+    Int nn,		/* nodes are in the range 0..nn-1 */
+    Int Parent [ ],	/* Parent [j] is the parent of j, or EMPTY if root */
+    Int Nv [ ],		/* Nv [j] > 0 number of pivots represented by node j,
+			 * or zero if j is not a node. */
+    Int Fsize [ ],	/* Fsize [j]: size of node j */
+
+    /* output, not defined on input: */
+    Int Order [ ],	/* output post-order */
+
+    /* workspaces of size nn: */
+    Int Child [ ],
+    Int Sibling [ ],
+    Int Stack [ ]
+)
+{
+    Int i, j, k, parent, frsize, f, fprev, maxfrsize, bigfprev, bigf, fnext ;
+
+    for (j = 0 ; j < nn ; j++)
+    {
+	Child [j] = EMPTY ;
+	Sibling [j] = EMPTY ;
+    }
+
+    /* --------------------------------------------------------------------- */
+    /* place the children in link lists - bigger elements tend to be last */
+    /* --------------------------------------------------------------------- */
+
+    for (j = nn-1 ; j >= 0 ; j--)
+    {
+	if (Nv [j] > 0)
+	{
+	    /* this is an element */
+	    parent = Parent [j] ;
+	    if (parent != EMPTY)
+	    {
+		/* place the element in link list of the children its parent */
+		/* bigger elements will tend to be at the end of the list */
+		Sibling [j] = Child [parent] ;
+		Child [parent] = j ;
+	    }
+	}
+    }
+
+#ifndef NDEBUG
+    {
+	Int nels, ff, nchild ;
+	AMD_DEBUG1 (("\n\n================================ AMD_postorder:\n"));
+	nels = 0 ;
+	for (j = 0 ; j < nn ; j++)
+	{
+	    if (Nv [j] > 0)
+	    {
+		AMD_DEBUG1 (( ""ID" :  nels "ID" npiv "ID" size "ID
+		    " parent "ID" maxfr "ID"\n", j, nels,
+		    Nv [j], Fsize [j], Parent [j], Fsize [j])) ;
+		/* this is an element */
+		/* dump the link list of children */
+		nchild = 0 ;
+		AMD_DEBUG1 (("    Children: ")) ;
+		for (ff = Child [j] ; ff != EMPTY ; ff = Sibling [ff])
+		{
+		    AMD_DEBUG1 ((ID" ", ff)) ;
+		    ASSERT (Parent [ff] == j) ;
+		    nchild++ ;
+		    ASSERT (nchild < nn) ;
+		}
+		AMD_DEBUG1 (("\n")) ;
+		parent = Parent [j] ;
+		if (parent != EMPTY)
+		{
+		    ASSERT (Nv [parent] > 0) ;
+		}
+		nels++ ;
+	    }
+	}
+    }
+    AMD_DEBUG1 (("\n\nGo through the children of each node, and put\n"
+		 "the biggest child last in each list:\n")) ;
+#endif
+
+    /* --------------------------------------------------------------------- */
+    /* place the largest child last in the list of children for each node */
+    /* --------------------------------------------------------------------- */
+
+    for (i = 0 ; i < nn ; i++)
+    {
+	if (Nv [i] > 0 && Child [i] != EMPTY)
+	{
+
+#ifndef NDEBUG
+	    Int nchild ;
+	    AMD_DEBUG1 (("Before partial sort, element "ID"\n", i)) ;
+	    nchild = 0 ;
+	    for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
+	    {
+		ASSERT (f >= 0 && f < nn) ;
+		AMD_DEBUG1 (("      f: "ID"  size: "ID"\n", f, Fsize [f])) ;
+		nchild++ ;
+		ASSERT (nchild <= nn) ;
+	    }
+#endif
+
+	    /* find the biggest element in the child list */
+	    fprev = EMPTY ;
+	    maxfrsize = EMPTY ;
+	    bigfprev = EMPTY ;
+	    bigf = EMPTY ;
+	    for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
+	    {
+		ASSERT (f >= 0 && f < nn) ;
+		frsize = Fsize [f] ;
+		if (frsize >= maxfrsize)
+		{
+		    /* this is the biggest seen so far */
+		    maxfrsize = frsize ;
+		    bigfprev = fprev ;
+		    bigf = f ;
+		}
+		fprev = f ;
+	    }
+	    ASSERT (bigf != EMPTY) ;
+
+	    fnext = Sibling [bigf] ;
+
+	    AMD_DEBUG1 (("bigf "ID" maxfrsize "ID" bigfprev "ID" fnext "ID
+		" fprev " ID"\n", bigf, maxfrsize, bigfprev, fnext, fprev)) ;
+
+	    if (fnext != EMPTY)
+	    {
+		/* if fnext is EMPTY then bigf is already at the end of list */
+
+		if (bigfprev == EMPTY)
+		{
+		    /* delete bigf from the element of the list */
+		    Child [i] = fnext ;
+		}
+		else
+		{
+		    /* delete bigf from the middle of the list */
+		    Sibling [bigfprev] = fnext ;
+		}
+
+		/* put bigf at the end of the list */
+		Sibling [bigf] = EMPTY ;
+		ASSERT (Child [i] != EMPTY) ;
+		ASSERT (fprev != bigf) ;
+		ASSERT (fprev != EMPTY) ;
+		Sibling [fprev] = bigf ;
+	    }
+
+#ifndef NDEBUG
+	    AMD_DEBUG1 (("After partial sort, element "ID"\n", i)) ;
+	    for (f = Child [i] ; f != EMPTY ; f = Sibling [f])
+	    {
+		ASSERT (f >= 0 && f < nn) ;
+		AMD_DEBUG1 (("        "ID"  "ID"\n", f, Fsize [f])) ;
+		ASSERT (Nv [f] > 0) ;
+		nchild-- ;
+	    }
+	    ASSERT (nchild == 0) ;
+#endif
+
+	}
+    }
+
+    /* --------------------------------------------------------------------- */
+    /* postorder the assembly tree */
+    /* --------------------------------------------------------------------- */
+
+    for (i = 0 ; i < nn ; i++)
+    {
+	Order [i] = EMPTY ;
+    }
+
+    k = 0 ;
+
+    for (i = 0 ; i < nn ; i++)
+    {
+	if (Parent [i] == EMPTY && Nv [i] > 0)
+	{
+	    AMD_DEBUG1 (("Root of assembly tree "ID"\n", i)) ;
+	    k = AMD_post_tree (i, k, Child, Sibling, Order, Stack
+#ifndef NDEBUG
+		, nn
+#endif
+		) ;
+	}
+    }
+}
diff --git a/src/maths/KLU/amd_preprocess.c b/src/maths/KLU/amd_preprocess.c
new file mode 100644
index 000000000..86ea07f86
--- /dev/null
+++ b/src/maths/KLU/amd_preprocess.c
@@ -0,0 +1,119 @@
+/* ========================================================================= */
+/* === AMD_preprocess ====================================================== */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* Sorts, removes duplicate entries, and transposes from the nonzero pattern of
+ * a column-form matrix A, to obtain the matrix R.  The input matrix can have
+ * duplicate entries and/or unsorted columns (AMD_valid (n,Ap,Ai) must not be
+ * AMD_INVALID).
+ *
+ * This input condition is NOT checked.  This routine is not user-callable.
+ */
+
+#include "amd_internal.h"
+
+/* ========================================================================= */
+/* === AMD_preprocess ====================================================== */
+/* ========================================================================= */
+
+/* AMD_preprocess does not check its input for errors or allocate workspace.
+ * On input, the condition (AMD_valid (n,n,Ap,Ai) != AMD_INVALID) must hold.
+ */
+
+GLOBAL void AMD_preprocess
+(
+    Int n,		/* input matrix: A is n-by-n */
+    const Int Ap [ ],	/* size n+1 */
+    const Int Ai [ ],	/* size nz = Ap [n] */
+
+    /* output matrix R: */
+    Int Rp [ ],		/* size n+1 */
+    Int Ri [ ],		/* size nz (or less, if duplicates present) */
+
+    Int W [ ],		/* workspace of size n */
+    Int Flag [ ]	/* workspace of size n */
+)
+{
+
+    /* --------------------------------------------------------------------- */
+    /* local variables */
+    /* --------------------------------------------------------------------- */
+
+    Int i, j, p, p2 ;
+
+    ASSERT (AMD_valid (n, n, Ap, Ai) != AMD_INVALID) ;
+
+    /* --------------------------------------------------------------------- */
+    /* count the entries in each row of A (excluding duplicates) */
+    /* --------------------------------------------------------------------- */
+
+    for (i = 0 ; i < n ; i++)
+    {
+	W [i] = 0 ;		/* # of nonzeros in row i (excl duplicates) */
+	Flag [i] = EMPTY ;	/* Flag [i] = j if i appears in column j */
+    }
+    for (j = 0 ; j < n ; j++)
+    {
+	p2 = Ap [j+1] ;
+	for (p = Ap [j] ; p < p2 ; p++)
+	{
+	    i = Ai [p] ;
+	    if (Flag [i] != j)
+	    {
+		/* row index i has not yet appeared in column j */
+		W [i]++ ;	    /* one more entry in row i */
+		Flag [i] = j ;	    /* flag row index i as appearing in col j*/
+	    }
+	}
+    }
+
+    /* --------------------------------------------------------------------- */
+    /* compute the row pointers for R */
+    /* --------------------------------------------------------------------- */
+
+    Rp [0] = 0 ;
+    for (i = 0 ; i < n ; i++)
+    {
+	Rp [i+1] = Rp [i] + W [i] ;
+    }
+    for (i = 0 ; i < n ; i++)
+    {
+	W [i] = Rp [i] ;
+	Flag [i] = EMPTY ;
+    }
+
+    /* --------------------------------------------------------------------- */
+    /* construct the row form matrix R */
+    /* --------------------------------------------------------------------- */
+
+    /* R = row form of pattern of A */
+    for (j = 0 ; j < n ; j++)
+    {
+	p2 = Ap [j+1] ;
+	for (p = Ap [j] ; p < p2 ; p++)
+	{
+	    i = Ai [p] ;
+	    if (Flag [i] != j)
+	    {
+		/* row index i has not yet appeared in column j */
+		Ri [W [i]++] = j ;  /* put col j in row i */
+		Flag [i] = j ;	    /* flag row index i as appearing in col j*/
+	    }
+	}
+    }
+
+#ifndef NDEBUG
+    ASSERT (AMD_valid (n, n, Rp, Ri) == AMD_OK) ;
+    for (j = 0 ; j < n ; j++)
+    {
+	ASSERT (W [j] == Rp [j+1]) ;
+    }
+#endif
+}
diff --git a/src/maths/KLU/amd_valid.c b/src/maths/KLU/amd_valid.c
new file mode 100644
index 000000000..4d05925c2
--- /dev/null
+++ b/src/maths/KLU/amd_valid.c
@@ -0,0 +1,93 @@
+/* ========================================================================= */
+/* === AMD_valid =========================================================== */
+/* ========================================================================= */
+
+/* ------------------------------------------------------------------------- */
+/* AMD, Copyright (c) Timothy A. Davis,					     */
+/* Patrick R. Amestoy, and Iain S. Duff.  See ../README.txt for License.     */
+/* email: davis at cise.ufl.edu    CISE Department, Univ. of Florida.        */
+/* web: http://www.cise.ufl.edu/research/sparse/amd                          */
+/* ------------------------------------------------------------------------- */
+
+/* Check if a column-form matrix is valid or not.  The matrix A is
+ * n_row-by-n_col.  The row indices of entries in column j are in
+ * Ai [Ap [j] ... Ap [j+1]-1].  Required conditions are:
+ *
+ *	n_row >= 0
+ *	n_col >= 0
+ *	nz = Ap [n_col] >= 0	    number of entries in the matrix
+ *	Ap [0] == 0
+ *	Ap [j] <= Ap [j+1] for all j in the range 0 to n_col.
+ *      Ai [0 ... nz-1] must be in the range 0 to n_row-1.
+ *
+ * If any of the above conditions hold, AMD_INVALID is returned.  If the
+ * following condition holds, AMD_OK_BUT_JUMBLED is returned (a warning,
+ * not an error):
+ *
+ *	row indices in Ai [Ap [j] ... Ap [j+1]-1] are not sorted in ascending
+ *	    order, and/or duplicate entries exist.
+ *
+ * Otherwise, AMD_OK is returned.
+ *
+ * In v1.2 and earlier, this function returned TRUE if the matrix was valid
+ * (now returns AMD_OK), or FALSE otherwise (now returns AMD_INVALID or
+ * AMD_OK_BUT_JUMBLED).
+ */
+
+#include "amd_internal.h"
+
+GLOBAL Int AMD_valid
+(
+    /* inputs, not modified on output: */
+    Int n_row,		/* A is n_row-by-n_col */
+    Int n_col,
+    const Int Ap [ ],	/* column pointers of A, of size n_col+1 */
+    const Int Ai [ ]	/* row indices of A, of size nz = Ap [n_col] */
+)
+{
+    Int nz, j, p1, p2, ilast, i, p, result = AMD_OK ;
+
+    if (n_row < 0 || n_col < 0 || Ap == NULL || Ai == NULL)
+    {
+	return (AMD_INVALID) ;
+    }
+    nz = Ap [n_col] ;
+    if (Ap [0] != 0 || nz < 0)
+    {
+	/* column pointers must start at Ap [0] = 0, and Ap [n] must be >= 0 */
+	AMD_DEBUG0 (("column 0 pointer bad or nz < 0\n")) ;
+	return (AMD_INVALID) ;
+    }
+    for (j = 0 ; j < n_col ; j++)
+    {
+	p1 = Ap [j] ;
+	p2 = Ap [j+1] ;
+	AMD_DEBUG2 (("\nColumn: "ID" p1: "ID" p2: "ID"\n", j, p1, p2)) ;
+	if (p1 > p2)
+	{
+	    /* column pointers must be ascending */
+	    AMD_DEBUG0 (("column "ID" pointer bad\n", j)) ;
+	    return (AMD_INVALID) ;
+	}
+	ilast = EMPTY ;
+	for (p = p1 ; p < p2 ; p++)
+	{
+	    i = Ai [p] ;
+	    AMD_DEBUG3 (("row: "ID"\n", i)) ;
+	    if (i < 0 || i >= n_row)
+	    {
+		/* row index out of range */
+		AMD_DEBUG0 (("index out of range, col "ID" row "ID"\n", j, i));
+		return (AMD_INVALID) ;
+	    }
+	    if (i <= ilast)
+	    {
+		/* row index unsorted, or duplicate entry present */
+		AMD_DEBUG1 (("index unsorted/dupl col "ID" row "ID"\n", j, i));
+		result = AMD_OK_BUT_JUMBLED ;
+	    }
+	    ilast = i ;
+	}
+    }
+    return (result) ;
+}
diff --git a/src/maths/KLU/btf.h b/src/maths/KLU/btf.h
new file mode 100644
index 000000000..32be79cf9
--- /dev/null
+++ b/src/maths/KLU/btf.h
@@ -0,0 +1,263 @@
+/* ========================================================================== */
+/* === BTF package ========================================================== */
+/* ========================================================================== */
+
+/* BTF_MAXTRANS:  find a column permutation Q to give A*Q a zero-free diagonal
+ * BTF_STRONGCOMP:  find a symmetric permutation P to put P*A*P' into block
+ *      upper triangular form.
+ * BTF_ORDER: do both of the above (btf_maxtrans then btf_strongcomp).
+ *
+ * Copyright (c) 2004-2007.  Tim Davis, University of Florida,
+ * with support from Sandia National Laboratories.  All Rights Reserved.
+ */
+
+
+/* ========================================================================== */
+/* === BTF_MAXTRANS ========================================================= */
+/* ========================================================================== */
+
+/* BTF_MAXTRANS: finds a permutation of the columns of a matrix so that it has a
+ * zero-free diagonal.  The input is an m-by-n sparse matrix in compressed
+ * column form.  The array Ap of size n+1 gives the starting and ending
+ * positions of the columns in the array Ai.  Ap[0] must be zero. The array Ai
+ * contains the row indices of the nonzeros of the matrix A, and is of size
+ * Ap[n].  The row indices of column j are located in Ai[Ap[j] ... Ap[j+1]-1].
+ * Row indices must be in the range 0 to m-1.  Duplicate entries may be present
+ * in any given column.  The input matrix  is not checked for validity (row
+ * indices out of the range 0 to m-1 will lead to an undeterminate result -
+ * possibly a core dump, for example).  Row indices in any given column need
+ * not be in sorted order.  However, if they are sorted and the matrix already
+ * has a zero-free diagonal, then the identity permutation is returned.
+ *
+ * The output of btf_maxtrans is an array Match of size n.  If row i is matched
+ * with column j, then A(i,j) is nonzero, and then Match[i] = j.  If the matrix
+ * is structurally nonsingular, all entries in the Match array are unique, and
+ * Match can be viewed as a column permutation if A is square.  That is, column
+ * k of the original matrix becomes column Match[k] of the permuted matrix.  In
+ * MATLAB, this can be expressed as (for non-structurally singular matrices):
+ *
+ *      Match = maxtrans (A) ;
+ *      B = A (:, Match) ;
+ *
+ * except of course here the A matrix and Match vector are all 0-based (rows
+ * and columns in the range 0 to n-1), not 1-based (rows/cols in range 1 to n).
+ * The MATLAB dmperm routine returns a row permutation.  See the maxtrans
+ * mexFunction for more details.
+ *
+ * If row i is not matched to any column, then Match[i] is == -1.  The
+ * btf_maxtrans routine returns the number of nonzeros on diagonal of the
+ * permuted matrix.
+ *
+ * In the MATLAB mexFunction interface to btf_maxtrans, 1 is added to the Match
+ * array to obtain a 1-based permutation.  Thus, in MATLAB where A is m-by-n:
+ *
+ *      q = maxtrans (A) ;      % has entries in the range 0:n
+ *      q                       % a column permutation (only if sprank(A)==n)
+ *      B = A (:, q) ;          % permuted matrix (only if sprank(A)==n)
+ *      sum (q > 0) ;           % same as "sprank (A)"
+ *
+ * This behaviour differs from p = dmperm (A) in MATLAB, which returns the
+ * matching as p(j)=i if row i and column j are matched, and p(j)=0 if column j
+ * is unmatched.
+ *
+ *      p = dmperm (A) ;        % has entries in the range 0:m
+ *      p                       % a row permutation (only if sprank(A)==m)
+ *      B = A (p, :) ;          % permuted matrix (only if sprank(A)==m)
+ *      sum (p > 0) ;           % definition of sprank (A)
+ *
+ * This algorithm is based on the paper "On Algorithms for obtaining a maximum
+ * transversal" by Iain Duff, ACM Trans. Mathematical Software, vol 7, no. 1,
+ * pp. 315-330, and "Algorithm 575: Permutations for a zero-free diagonal",
+ * same issue, pp. 387-390.  Algorithm 575 is MC21A in the Harwell Subroutine
+ * Library.  This code is not merely a translation of the Fortran code into C.
+ * It is a completely new implementation of the basic underlying method (depth
+ * first search over a subgraph with nodes corresponding to columns matched so
+ * far, and cheap matching).  This code was written with minimal observation of
+ * the MC21A/B code itself.  See comments below for a comparison between the
+ * maxtrans and MC21A/B codes.
+ *
+ * This routine operates on a column-form matrix and produces a column
+ * permutation.  MC21A uses a row-form matrix and produces a row permutation.
+ * The difference is merely one of convention in the comments and interpretation
+ * of the inputs and outputs.  If you want a row permutation, simply pass a
+ * compressed-row sparse matrix to this routine and you will get a row
+ * permutation (just like MC21A).  Similarly, you can pass a column-oriented
+ * matrix to MC21A and it will happily return a column permutation.
+ */
+
+#ifndef _BTF_H
+#define _BTF_H
+
+/* make it easy for C++ programs to include BTF */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "UFconfig.h"
+
+int btf_maxtrans    /* returns # of columns matched */
+(
+    /* --- input, not modified: --- */
+    int nrow,       /* A is nrow-by-ncol in compressed column form */
+    int ncol,
+    int Ap [ ],     /* size ncol+1 */
+    int Ai [ ],     /* size nz = Ap [ncol] */
+    double maxwork, /* maximum amount of work to do is maxwork*nnz(A); no limit
+                     * if <= 0 */
+
+    /* --- output, not defined on input --- */
+    double *work,   /* work = -1 if maxwork > 0 and the total work performed
+                     * reached the maximum of maxwork*nnz(A).
+                     * Otherwise, work = the total work performed. */
+
+    int Match [ ],  /* size nrow.  Match [i] = j if column j matched to row i
+                     * (see above for the singular-matrix case) */
+
+    /* --- workspace, not defined on input or output --- */
+    int Work [ ]    /* size 5*ncol */
+) ;
+
+/* long integer version (all "int" parameters become "UF_long") */
+UF_long btf_l_maxtrans (UF_long, UF_long, UF_long *, UF_long *, double,
+    double *, UF_long *, UF_long *) ;
+
+
+/* ========================================================================== */
+/* === BTF_STRONGCOMP ======================================================= */
+/* ========================================================================== */
+
+/* BTF_STRONGCOMP finds the strongly connected components of a graph, returning
+ * a symmetric permutation.  The matrix A must be square, and is provided on
+ * input in compressed-column form (see BTF_MAXTRANS, above).  The diagonal of
+ * the input matrix A (or A*Q if Q is provided on input) is ignored.
+ *
+ * If Q is not NULL on input, then the strongly connected components of A*Q are
+ * found.  Q may be flagged on input, where Q[k] < 0 denotes a flagged column k.
+ * The permutation is j = BTF_UNFLIP (Q [k]).  On output, Q is modified (the
+ * flags are preserved) so that P*A*Q is in block upper triangular form.
+ *
+ * If Q is NULL, then the permutation P is returned so that P*A*P' is in upper
+ * block triangular form.
+ *
+ * The vector R gives the block boundaries, where block b is in rows/columns
+ * R[b] to R[b+1]-1 of the permuted matrix, and where b ranges from 1 to the
+ * number of strongly connected components found.
+ */
+
+int btf_strongcomp  /* return # of strongly connected components */
+(
+    /* input, not modified: */
+    int n,          /* A is n-by-n in compressed column form */
+    int Ap [ ],     /* size n+1 */
+    int Ai [ ],     /* size nz = Ap [n] */
+
+    /* optional input, modified (if present) on output: */
+    int Q [ ],      /* size n, input column permutation */
+
+    /* output, not defined on input */
+    int P [ ],      /* size n.  P [k] = j if row and column j are kth row/col
+                     * in permuted matrix. */
+
+    int R [ ],      /* size n+1.  block b is in rows/cols R[b] ... R[b+1]-1 */
+
+    /* workspace, not defined on input or output */
+    int Work [ ]    /* size 4n */
+) ;
+
+UF_long btf_l_strongcomp (UF_long, UF_long *, UF_long *, UF_long *, UF_long *,
+    UF_long *, UF_long *) ;
+
+
+/* ========================================================================== */
+/* === BTF_ORDER ============================================================ */
+/* ========================================================================== */
+
+/* BTF_ORDER permutes a square matrix into upper block triangular form.  It
+ * does this by first finding a maximum matching (or perhaps a limited matching
+ * if the work is limited), via the btf_maxtrans function.  If a complete
+ * matching is not found, BTF_ORDER completes the permutation, but flags the
+ * columns of P*A*Q to denote which columns are not matched.  If the matrix is
+ * structurally rank deficient, some of the entries on the diagonal of the
+ * permuted matrix will be zero.  BTF_ORDER then calls btf_strongcomp to find
+ * the strongly-connected components.
+ *
+ * On output, P and Q are the row and column permutations, where i = P[k] if
+ * row i of A is the kth row of P*A*Q, and j = BTF_UNFLIP(Q[k]) if column j of
+ * A is the kth column of P*A*Q.  If Q[k] < 0, then the (k,k)th entry in P*A*Q
+ * is structurally zero.
+ *
+ * The vector R gives the block boundaries, where block b is in rows/columns
+ * R[b] to R[b+1]-1 of the permuted matrix, and where b ranges from 1 to the
+ * number of strongly connected components found.
+ */
+
+int btf_order       /* returns number of blocks found */
+(
+    /* --- input, not modified: --- */
+    int n,          /* A is n-by-n in compressed column form */
+    int Ap [ ],     /* size n+1 */
+    int Ai [ ],     /* size nz = Ap [n] */
+    double maxwork, /* do at most maxwork*nnz(A) work in the maximum
+                     * transversal; no limit if <= 0 */
+
+    /* --- output, not defined on input --- */
+    double *work,   /* return value from btf_maxtrans */
+    int P [ ],      /* size n, row permutation */
+    int Q [ ],      /* size n, column permutation */
+    int R [ ],      /* size n+1.  block b is in rows/cols R[b] ... R[b+1]-1 */
+    int *nmatch,    /* # nonzeros on diagonal of P*A*Q */
+
+    /* --- workspace, not defined on input or output --- */
+    int Work [ ]    /* size 5n */
+) ;
+
+UF_long btf_l_order (UF_long, UF_long *, UF_long *, double , double *,
+    UF_long *, UF_long *, UF_long *, UF_long *, UF_long *) ;
+
+
+/* ========================================================================== */
+/* === BTF marking of singular columns ====================================== */
+/* ========================================================================== */
+
+/* BTF_FLIP is a "negation about -1", and is used to mark an integer j
+ * that is normally non-negative.  BTF_FLIP (-1) is -1.  BTF_FLIP of
+ * a number > -1 is negative, and BTF_FLIP of a number < -1 is positive.
+ * BTF_FLIP (BTF_FLIP (j)) = j for all integers j.  UNFLIP (j) acts
+ * like an "absolute value" operation, and is always >= -1.  You can test
+ * whether or not an integer j is "flipped" with the BTF_ISFLIPPED (j)
+ * macro.
+ */
+
+#define BTF_FLIP(j) (-(j)-2)
+#define BTF_ISFLIPPED(j) ((j) < -1)
+#define BTF_UNFLIP(j) ((BTF_ISFLIPPED (j)) ? BTF_FLIP (j) : (j))
+
+/* ========================================================================== */
+/* === BTF version ========================================================== */
+/* ========================================================================== */
+
+/* All versions of BTF include these definitions.
+ * As an example, to test if the version you are using is 1.2 or later:
+ *
+ *      if (BTF_VERSION >= BTF_VERSION_CODE (1,2)) ...
+ *
+ * This also works during compile-time:
+ *
+ *      #if (BTF >= BTF_VERSION_CODE (1,2))
+ *          printf ("This is version 1.2 or later\n") ;
+ *      #else
+ *          printf ("This is an early version\n") ;
+ *      #endif
+ */
+
+#define BTF_DATE "Dec 7, 2011"
+#define BTF_VERSION_CODE(main,sub) ((main) * 1000 + (sub))
+#define BTF_MAIN_VERSION 1
+#define BTF_SUB_VERSION 1
+#define BTF_SUBSUB_VERSION 3
+#define BTF_VERSION BTF_VERSION_CODE(BTF_MAIN_VERSION,BTF_SUB_VERSION)
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/maths/KLU/btf_internal.h b/src/maths/KLU/btf_internal.h
new file mode 100644
index 000000000..83f76f1ec
--- /dev/null
+++ b/src/maths/KLU/btf_internal.h
@@ -0,0 +1,64 @@
+/* ========================================================================== */
+/* === btf_internal include file ============================================ */
+/* ========================================================================== */
+
+#ifndef _BTF_INTERNAL_H
+#define _BTF_INTERNAL_H
+
+/*
+ * Copyright (c) 2004-2007.  Tim Davis, University of Florida,
+ * with support from Sandia National Laboratories.  All Rights Reserved.
+ */
+
+/* Not to be included in any user program. */
+
+#ifdef DLONG
+#define Int UF_long
+#define Int_id UF_long_id
+#define BTF(name) btf_l_ ## name
+#else
+#define Int int
+#define Int_id "%d"
+#define BTF(name) btf_ ## name
+#endif
+
+/* ========================================================================== */
+/* make sure debugging and printing is turned off */
+
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+#ifndef NPRINT
+#define NPRINT
+#endif
+
+/* To enable debugging and assertions, uncomment this line: 
+ #undef NDEBUG
+*/
+/* To enable diagnostic printing, uncomment this line: 
+ #undef NPRINT
+*/
+
+/* ========================================================================== */
+
+#include <stdio.h>
+#include <assert.h>
+#define ASSERT(a) assert(a)
+
+#undef TRUE
+#undef FALSE
+#undef PRINTF
+#undef MIN
+
+#ifndef NPRINT
+#define PRINTF(s) { printf s ; } ;
+#else
+#define PRINTF(s)
+#endif
+
+#define TRUE 1
+#define FALSE 0
+#define EMPTY (-1)
+#define MIN(a,b) (((a) < (b)) ?  (a) : (b))
+
+#endif
diff --git a/src/maths/KLU/btf_maxtrans.c b/src/maths/KLU/btf_maxtrans.c
new file mode 100644
index 000000000..ab3a26ea0
--- /dev/null
+++ b/src/maths/KLU/btf_maxtrans.c
@@ -0,0 +1,387 @@
+/* ========================================================================== */
+/* === BTF_MAXTRANS ========================================================= */
+/* ========================================================================== */
+
+/* Finds a column permutation that maximizes the number of entries on the
+ * diagonal of a sparse matrix.  See btf.h for more information.
+ *
+ * This function is identical to cs_maxtrans in CSparse, with the following
+ * exceptions:
+ *
+ *  (1) cs_maxtrans finds both jmatch and imatch, where jmatch [i] = j and
+ *      imatch [j] = i if row i is matched to column j.  This function returns
+ *      just jmatch (the Match array).  The MATLAB interface to cs_maxtrans
+ *      (the single-output cs_dmperm) returns imatch, not jmatch to the MATLAB
+ *      caller.
+ *
+ *  (2) cs_maxtrans includes a pre-pass that counts the number of non-empty
+ *      rows and columns (m2 and n2, respectively), and computes the matching
+ *      using the transpose of A if m2 < n2.  cs_maxtrans also returns quickly
+ *      if the diagonal of the matrix is already zero-free.  This pre-pass
+ *      allows cs_maxtrans to be much faster than maxtrans, if the use of the
+ *      transpose is warranted.
+ *
+ *      However, for square structurally non-singular matrices with one or more
+ *      zeros on the diagonal, the pre-pass is a waste of time, and for these
+ *      matrices, maxtrans can be twice as fast as cs_maxtrans.  Since the
+ *      maxtrans function is intended primarily for square matrices that are
+ *      typically structurally nonsingular, the pre-pass is not included here.
+ *      If this maxtrans function is used on a matrix with many more columns
+ *      than rows, consider passing the transpose to this function, or use
+ *      cs_maxtrans instead.
+ *
+ *  (3) cs_maxtrans can operate as a randomized algorithm, to help avoid
+ *      rare cases of excessive run-time.
+ *
+ *  (4) this maxtrans function includes an option that limits the total work
+ *      performed.  If this limit is reached, the maximum transveral might not
+ *      be found.
+ *
+ * Thus, for general usage, cs_maxtrans is preferred.  For square matrices that
+ * are typically structurally non-singular, maxtrans is preferred.  A partial
+ * maxtrans can still be very useful when solving a sparse linear system.
+ *
+ * Copyright (c) 2004-2007.  Tim Davis, University of Florida,
+ * with support from Sandia National Laboratories.  All Rights Reserved.
+ */ 
+
+#include "btf.h"
+#include "btf_internal.h"
+
+
+/* ========================================================================== */
+/* === augment ============================================================== */
+/* ========================================================================== */
+
+/* Perform a depth-first-search starting at column k, to find an augmenting
+ * path.  An augmenting path is a sequence of row/column pairs (i1,k), (i2,j1),
+ * (i3,j2), ..., (i(s+1), js), such that all of the following properties hold:
+ *
+ *      * column k is not matched to any row
+ *      * entries in the path are nonzero
+ *      * the pairs (i1,j1), (i2,j2), (i3,j3) ..., (is,js) have been 
+ *          previously matched to each other
+ *      * (i(s+1), js) is nonzero, and row i(s+1) is not matched to any column
+ *
+ * Once this path is found, the matching can be changed to the set of pairs
+ * path.  An augmenting path is a sequence of row/column pairs
+ *
+ *      (i1,k), (i2,j1), (i3,j2), ..., (i(s+1), js)
+ *
+ * Once a row is matched with a column it remains matched with some column, but
+ * not necessarily the column it was first matched with.
+ *
+ * In the worst case, this function can examine every nonzero in A.  Since it
+ * is called n times by maxtrans, the total time of maxtrans can be as high as
+ * O(n*nnz(A)).  To limit this work, pass a value of maxwork > 0.  Then at
+ * most O((maxwork+1)*nnz(A)) work will be performed; the maximum matching might
+ * not be found, however.
+ *
+ * This routine is very similar to the dfs routine in klu_kernel.c, in the
+ * KLU sparse LU factorization package.  It is essentially identical to the
+ * cs_augment routine in CSparse, and its recursive version (augment function
+ * in cs_maxtransr_mex.c), except that this routine allows for the search to be
+ * terminated early if too much work is being performed.
+ *
+ * The algorithm is based on the paper "On Algorithms for obtaining a maximum
+ * transversal" by Iain Duff, ACM Trans. Mathematical Software, vol 7, no. 1,
+ * pp. 315-330, and "Algorithm 575: Permutations for a zero-free diagonal",
+ * same issue, pp. 387-390.  The code here is a new implementation of that
+ * algorithm, with different data structures and control flow.  After writing
+ * this code, I carefully compared my algorithm with MC21A/B (ACM Algorithm 575)
+ * Some of the comparisons are partial because I didn't dig deeply into all of
+ * the details of MC21A/B, such as how the stack is maintained.  The following
+ * arguments are essentially identical between this code and MC21A:
+ *
+ * maxtrans     MC21A,B
+ * --------     -------
+ * n            N           identical
+ * k            JORD        identical
+ * Ap           IP          column / row pointers
+ * Ai           ICN         row / column indices
+ * Ap[n]        LICN        length of index array (# of nonzeros in A)
+ * Match        IPERM       output column / row permutation
+ * nmatch       NUMNZ       # of nonzeros on diagonal of permuted matrix
+ * Flag         CV          mark a node as visited by the depth-first-search
+ *
+ * The following are different, but analogous:
+ *
+ * Cheap        ARP         indicates what part of the a column / row has
+ *                          already been matched.
+ *
+ * The following arguments are very different:
+ *
+ * -            LENR        # of entries in each row/column (unused in maxtrans)
+ * Pstack       OUT         Pstack keeps track of where we are in the depth-
+ *                          first-search scan of column j.  I think that OUT
+ *                          plays a similar role in MC21B, but I'm unsure.
+ * Istack       PR          keeps track of the rows in the path.  PR is a link
+ *                          list, though, whereas Istack is a stack.  Maxtrans
+ *                          does not use any link lists.
+ * Jstack       OUT? PR?    the stack for nodes in the path (unsure)
+ *
+ * The following control structures are roughly comparable:
+ *
+ * maxtrans                     MC21B
+ * --------                     -----
+ * for (k = 0 ; k < n ; k++)    DO 100 JORD=1,N
+ * while (head >= 0)            DO 70 K=1,JORD
+ * for (p = Cheap [j] ; ...)    DO 20 II=IN1,IN2
+ * for (p = head ; ...)         DO 90 K=1,JORD
+ */
+
+static Int augment
+(
+    Int k,              /* which stage of the main loop we're in */
+    Int Ap [ ],         /* column pointers, size n+1 */
+    Int Ai [ ],         /* row indices, size nz = Ap [n] */
+    Int Match [ ],      /* size n,  Match [i] = j if col j matched to i */
+    Int Cheap [ ],      /* rows Ai [Ap [j] .. Cheap [j]-1] alread matched */
+    Int Flag [ ],       /* Flag [j] = k if j already visited this stage */
+    Int Istack [ ],     /* size n.  Row index stack. */
+    Int Jstack [ ],     /* size n.  Column index stack. */
+    Int Pstack [ ],     /* size n.  Keeps track of position in adjacency list */
+    double *work,       /* work performed by the depth-first-search */
+    double maxwork      /* maximum work allowed */
+)
+{
+    /* local variables, but "global" to all DFS levels: */
+    Int found ; /* true if match found.  */
+    Int head ;  /* top of stack */
+
+    /* variables that are purely local to any one DFS level: */
+    Int j2 ;    /* the next DFS goes to node j2 */
+    Int pend ;  /* one past the end of the adjacency list for node j */
+    Int pstart ;
+    Int quick ;
+
+    /* variables that need to be pushed then popped from the stack: */
+    Int i ;     /* the row tentatively matched to i if DFS successful */
+    Int j ;     /* the DFS is at the current node j */
+    Int p ;     /* current index into the adj. list for node j */
+    /* the variables i, j, and p are stacked in Istack, Jstack, and Pstack */
+
+    quick = (maxwork > 0) ;
+
+    /* start a DFS to find a match for column k */
+    found = FALSE ;
+    i = EMPTY ;
+    head = 0 ;
+    Jstack [0] = k ;
+    ASSERT (Flag [k] != k) ;
+
+    while (head >= 0)
+    {
+        j = Jstack [head] ;
+        pend = Ap [j+1] ;
+
+        if (Flag [j] != k)          /* a node is not yet visited */
+        {
+
+            /* -------------------------------------------------------------- */
+            /* prework for node j */
+            /* -------------------------------------------------------------- */
+
+            /* first time that j has been visited */
+            Flag [j] = k ;
+            /* cheap assignment: find the next unmatched row in col j.  This
+             * loop takes at most O(nnz(A)) time for the sum total of all
+             * calls to augment. */
+            for (p = Cheap [j] ; p < pend && !found ; p++)
+            {
+                i = Ai [p] ;
+                found = (Match [i] == EMPTY) ;
+            }
+            Cheap [j] = p ;
+
+            /* -------------------------------------------------------------- */
+
+            /* prepare for DFS */
+            if (found)
+            {
+                /* end of augmenting path, column j matched with row i */
+                Istack [head] = i ;
+                break ;
+            }
+            /* set Pstack [head] to the first entry in column j to scan */
+            Pstack [head] = Ap [j] ;
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* quick return if too much work done */
+        /* ------------------------------------------------------------------ */
+
+        if (quick && *work > maxwork)
+        {
+            /* too much work has been performed; abort the search */
+            return (EMPTY) ;
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* DFS for nodes adjacent to j */
+        /* ------------------------------------------------------------------ */
+
+        /* If cheap assignment not made, continue the depth-first search.  All
+         * rows in column j are already matched.  Add the adjacent nodes to the
+         * stack by iterating through until finding another non-visited node.
+         *
+         * It is the following loop that can force maxtrans to take
+         * O(n*nnz(A)) time. */
+
+        pstart = Pstack [head] ;
+        for (p = pstart ; p < pend ; p++)
+        {
+            i = Ai [p] ;
+            j2 = Match [i] ;
+            ASSERT (j2 != EMPTY) ;
+            if (Flag [j2] != k)
+            {
+                /* Node j2 is not yet visited, start a depth-first search on
+                 * node j2.  Keep track of where we left off in the scan of adj
+                 * list of node j so we can restart j where we left off. */
+                Pstack [head] = p + 1 ;
+                /* Push j2 onto the stack and immediately break so we can
+                 * recurse on node j2.  Also keep track of row i which (if this
+                 * search for an augmenting path works) will be matched with the
+                 * current node j. */
+                Istack [head] = i ;
+                Jstack [++head] = j2 ;
+                break ;
+            }
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* determine how much work was just performed */
+        /* ------------------------------------------------------------------ */
+
+        *work += (p - pstart + 1) ;
+
+        /* ------------------------------------------------------------------ */
+        /* node j is done, but the postwork is postponed - see below */
+        /* ------------------------------------------------------------------ */
+
+        if (p == pend)
+        {
+            /* If all adjacent nodes of j are already visited, pop j from
+             * stack and continue.  We failed to find a match. */
+            head-- ;
+        }
+    }
+
+    /* postwork for all nodes j in the stack */
+    /* unwind the path and make the corresponding matches */
+    if (found)
+    {
+        for (p = head ; p >= 0 ; p--)
+        {
+            j = Jstack [p] ;
+            i = Istack [p] ;
+
+            /* -------------------------------------------------------------- */
+            /* postwork for node j */
+            /* -------------------------------------------------------------- */
+            /* if found, match row i with column j */
+            Match [i] = j ;
+        }
+    }
+    return (found) ;
+}
+
+
+/* ========================================================================== */
+/* === maxtrans ============================================================= */
+/* ========================================================================== */
+
+Int BTF(maxtrans)   /* returns # of columns in the matching */
+(
+    /* --- input --- */
+    Int nrow,       /* A is nrow-by-ncol in compressed column form */
+    Int ncol,
+    Int Ap [ ],     /* size ncol+1 */
+    Int Ai [ ],     /* size nz = Ap [ncol] */
+    double maxwork, /* do at most maxwork*nnz(A) work; no limit if <= 0.  This
+                     * work limit excludes the O(nnz(A)) cheap-match phase. */
+
+    /* --- output --- */
+    double *work,   /* work = -1 if maxwork > 0 and the total work performed
+                     * reached the maximum of maxwork*nnz(A)).
+                     * Otherwise, work = the total work performed. */
+
+    Int Match [ ],  /* size nrow.  Match [i] = j if column j matched to row i */
+
+    /* --- workspace --- */
+    Int Work [ ]    /* size 5*ncol */
+)
+{
+    Int *Cheap, *Flag, *Istack, *Jstack, *Pstack ;
+    Int i, j, k, nmatch, work_limit_reached, result ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get workspace and initialize */
+    /* ---------------------------------------------------------------------- */
+
+    Cheap  = Work ; Work += ncol ;
+    Flag   = Work ; Work += ncol ;
+
+    /* stack for non-recursive depth-first search in augment function */
+    Istack = Work ; Work += ncol ;
+    Jstack = Work ; Work += ncol ;
+    Pstack = Work ;
+
+    /* in column j, rows Ai [Ap [j] .. Cheap [j]-1] are known to be matched */
+    for (j = 0 ; j < ncol ; j++)
+    {
+        Cheap [j] = Ap [j] ;
+        Flag [j] = EMPTY ; 
+    }
+
+    /* all rows and columns are currently unmatched */
+    for (i = 0 ; i < nrow ; i++)
+    {
+        Match [i] = EMPTY ;
+    }
+
+    if (maxwork > 0)
+    {
+        maxwork *= Ap [ncol] ;
+    }
+    *work = 0 ;
+
+    /* ---------------------------------------------------------------------- */
+    /* find a matching row for each column k */
+    /* ---------------------------------------------------------------------- */
+
+    nmatch = 0 ;
+    work_limit_reached = FALSE ;
+    for (k = 0 ; k < ncol ; k++)
+    {
+        /* find an augmenting path to match some row i to column k */
+        result = augment (k, Ap, Ai, Match, Cheap, Flag, Istack, Jstack, Pstack,
+            work, maxwork) ;
+        if (result == TRUE)
+        {
+            /* we found it.  Match [i] = k for some row i has been done. */
+            nmatch++ ;
+        }
+        else if (result == EMPTY)
+        {
+            /* augment gave up because of too much work, and no match found */
+            work_limit_reached = TRUE ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* return the Match, and the # of matches made */
+    /* ---------------------------------------------------------------------- */
+
+    /* At this point, row i is matched to j = Match [i] if j >= 0.  i is an
+     * unmatched row if Match [i] == EMPTY. */
+
+    if (work_limit_reached)
+    {
+        /* return -1 if the work limit of maxwork*nnz(A) was reached */
+        *work = EMPTY ;
+    }
+
+    return (nmatch) ;
+}
diff --git a/src/maths/KLU/btf_order.c b/src/maths/KLU/btf_order.c
new file mode 100644
index 000000000..7198d0cdd
--- /dev/null
+++ b/src/maths/KLU/btf_order.c
@@ -0,0 +1,132 @@
+/* ========================================================================== */
+/* === BTF_ORDER ============================================================ */
+/* ========================================================================== */
+
+/* Find a permutation P and Q to permute a square sparse matrix into upper block
+ * triangular form.  A(P,Q) will contain a zero-free diagonal if A has
+ * structural full-rank.  Otherwise, the number of nonzeros on the diagonal of
+ * A(P,Q) will be maximized, and will equal the structural rank of A.
+ *
+ * Q[k] will be "flipped" if a zero-free diagonal was not found.  Q[k] will be
+ * negative, and j = BTF_UNFLIP (Q [k]) gives the corresponding permutation.
+ *
+ * R defines the block boundaries of A(P,Q).  The kth block consists of rows
+ * and columns R[k] to R[k+1]-1.
+ *
+ * If maxwork > 0 on input, then the work performed in btf_maxtrans is limited
+ * to maxwork*nnz(A) (excluding the "cheap match" phase, which can take another
+ * nnz(A) work).  On output, the work parameter gives the actual work performed,
+ * or -1 if the limit was reached.  In the latter case, the diagonal of A(P,Q)
+ * might not be zero-free, and the number of nonzeros on the diagonal of A(P,Q)
+ * might not be equal to the structural rank.
+ *
+ * See btf.h for more details.
+ *
+ * Copyright (c) 2004-2007.  Tim Davis, University of Florida,
+ * with support from Sandia National Laboratories.  All Rights Reserved.
+ */
+
+#include "btf.h"
+#include "btf_internal.h"
+
+/* This function only operates on square matrices (either structurally full-
+ * rank, or structurally rank deficient). */
+
+Int BTF(order)      /* returns number of blocks found */
+(
+    /* input, not modified: */
+    Int n,          /* A is n-by-n in compressed column form */
+    Int Ap [ ],     /* size n+1 */
+    Int Ai [ ],     /* size nz = Ap [n] */
+    double maxwork, /* do at most maxwork*nnz(A) work in the maximum
+                     * transversal; no limit if <= 0 */
+
+    /* output, not defined on input */
+    double *work,   /* work performed in maxtrans, or -1 if limit reached */
+    Int P [ ],      /* size n, row permutation */
+    Int Q [ ],      /* size n, column permutation */
+    Int R [ ],      /* size n+1.  block b is in rows/cols R[b] ... R[b+1]-1 */
+    Int *nmatch,    /* # nonzeros on diagonal of P*A*Q */
+
+    /* workspace, not defined on input or output */
+    Int Work [ ]    /* size 5n */
+)
+{
+    Int *Flag ;
+    Int nblocks, i, j, nbadcol ;
+
+    /* ---------------------------------------------------------------------- */
+    /* compute the maximum matching */
+    /* ---------------------------------------------------------------------- */
+
+    /* if maxwork > 0, then a maximum matching might not be found */
+
+    *nmatch = BTF(maxtrans) (n, n, Ap, Ai, maxwork, work, Q, Work) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* complete permutation if the matrix is structurally singular */
+    /* ---------------------------------------------------------------------- */
+
+    /* Since the matrix is square, ensure BTF_UNFLIP(Q[0..n-1]) is a
+     * permutation of the columns of A so that A has as many nonzeros on the
+     * diagonal as possible.
+     */
+
+    if (*nmatch < n)
+    {
+        /* get a size-n work array */
+        Flag = Work + n ;
+        for (j = 0 ; j < n ; j++)
+        {
+            Flag [j] = 0 ;
+        }
+
+        /* flag all matched columns */
+        for (i = 0 ; i < n ; i++)
+        {
+            j = Q [i] ;
+            if (j != EMPTY)
+            {
+                /* row i and column j are matched to each other */
+                Flag [j] = 1 ;
+            }
+        }
+
+        /* make a list of all unmatched columns, in Work [0..nbadcol-1]  */
+        nbadcol = 0 ;
+        for (j = n-1 ; j >= 0 ; j--)
+        {
+            if (!Flag [j])
+            {
+                /* j is matched to nobody */
+                Work [nbadcol++] = j ;
+            }
+        }
+        ASSERT (*nmatch + nbadcol == n) ;
+
+        /* make an assignment for each unmatched row */
+        for (i = 0 ; i < n ; i++)
+        {
+            if (Q [i] == EMPTY && nbadcol > 0)
+            {
+                /* get an unmatched column j */
+                j = Work [--nbadcol] ;
+                /* assign j to row i and flag the entry by "flipping" it */
+                Q [i] = BTF_FLIP (j) ;
+            }
+        }
+    }
+
+    /* The permutation of a square matrix can be recovered as follows: Row i is
+     * matched with column j, where j = BTF_UNFLIP (Q [i]) and where j
+     * will always be in the valid range 0 to n-1.  The entry A(i,j) is zero
+     * if BTF_ISFLIPPED (Q [i]) is true, and nonzero otherwise.  nmatch
+     * is the number of entries in the Q array that are non-negative. */
+
+    /* ---------------------------------------------------------------------- */
+    /* find the strongly connected components */
+    /* ---------------------------------------------------------------------- */
+
+    nblocks = BTF(strongcomp) (n, Ap, Ai, Q, P, R, Work) ;
+    return (nblocks) ;
+}
diff --git a/src/maths/KLU/btf_strongcomp.c b/src/maths/KLU/btf_strongcomp.c
new file mode 100644
index 000000000..aeb555ad8
--- /dev/null
+++ b/src/maths/KLU/btf_strongcomp.c
@@ -0,0 +1,593 @@
+/* ========================================================================== */
+/* === BTF_STRONGCOMP ======================================================= */
+/* ========================================================================== */
+
+/* Finds the strongly connected components of a graph, or equivalently, permutes
+ * the matrix into upper block triangular form.  See btf.h for more details.
+ * Input matrix and Q are not checked on input.
+ *
+ * Copyright (c) 2004-2007.  Tim Davis, University of Florida,
+ * with support from Sandia National Laboratories.  All Rights Reserved.
+ */
+
+#include "btf.h"
+#include "btf_internal.h"
+
+#define UNVISITED (-2)      /* Flag [j] = UNVISITED if node j not visited yet */
+#define UNASSIGNED (-1)     /* Flag [j] = UNASSIGNED if node j has been visited,
+                             * but not yet assigned to a strongly-connected
+                             * component (aka block).  Flag [j] = k (k in the
+                             * range 0 to nblocks-1) if node j has been visited
+                             * (and completed, with its postwork done) and
+                             * assigned to component k. */
+
+/* This file contains two versions of the depth-first-search, a recursive one
+ * and a non-recursive one.  By default, the non-recursive one is used. */
+
+#ifndef RECURSIVE
+
+/* ========================================================================== */
+/* === dfs: non-recursive version (default) ================================= */
+/* ========================================================================== */
+
+/* Perform a depth-first-search of a graph, stored in an adjacency-list form.
+ * The row indices of column j (equivalently, the out-adjacency list of node j)
+ * are stored in Ai [Ap[j] ... Ap[j+1]-1].  Self-edge (diagonal entries) are
+ * ignored.  Ap[0] must be zero, and thus nz = Ap[n] is the number of entries
+ * in the matrix (or edges in the graph).  The row indices in each column need
+ * not be in any particular order.  If an input column permutation is given,
+ * node j (in the permuted matrix A*Q) is located in
+ * Ai [Ap[Q[j]] ... Ap[Q[j]+1]-1].  This Q can be the same as the Match array
+ * output from the maxtrans routine, for a square matrix that is structurally
+ * full rank.
+ *
+ * The algorithm is from the paper by Robert E. Tarjan, "Depth-first search and
+ * linear graph algorithms," SIAM Journal on Computing, vol. 1, no. 2,
+ * pp. 146-160, 1972.  The time taken by strongcomp is O(nnz(A)).
+ *
+ * See also MC13A/B in the Harwell subroutine library (Iain S. Duff and John
+ * K. Reid, "Algorithm 529: permutations to block triangular form," ACM Trans.
+ * on Mathematical Software, vol. 4, no. 2, pp. 189-192, 1978, and "An
+ * implementation of Tarjan's algorithm for the block triangular form of a
+ * matrix," same journal, pp. 137-147.  This code is implements the same
+ * algorithm as MC13A/B, except that the data structures are very different.
+ * Also, unlike MC13A/B, the output permutation preserves the natural ordering
+ * within each block.
+ */
+
+static void dfs
+(
+    /* inputs, not modified on output: */
+    Int j,              /* start the DFS at node j */
+    Int Ap [ ],         /* size n+1, column pointers for the matrix A */
+    Int Ai [ ],         /* row indices, size nz = Ap [n] */
+    Int Q [ ],          /* input column permutation */
+
+    /* inputs, modified on output (each array is of size n): */
+    Int Time [ ],       /* Time [j] = "time" that node j was first visited */
+    Int Flag [ ],       /* Flag [j]: see above */
+    Int Low [ ],        /* Low [j]: see definition below */
+    Int *p_nblocks,     /* number of blocks (aka strongly-connected-comp.)*/
+    Int *p_timestamp,   /* current "time" */
+
+    /* workspace, not defined on input or output: */
+    Int Cstack [ ],     /* size n, output stack to hold nodes of components */
+    Int Jstack [ ],     /* size n, stack for the variable j */
+    Int Pstack [ ]      /* size n, stack for the variable p */
+)
+{
+    /* ---------------------------------------------------------------------- */
+    /* local variables, and initializations */
+    /* ---------------------------------------------------------------------- */
+
+    /* local variables, but "global" to all DFS levels: */
+    Int chead ;     /* top of Cstack */
+    Int jhead ;     /* top of Jstack and Pstack */
+
+    /* variables that are purely local to any one DFS level: */
+    Int i ;         /* edge (j,i) considered; i can be next node to traverse */
+    Int parent ;    /* parent of node j in the DFS tree */
+    Int pend ;      /* one past the end of the adjacency list for node j */
+    Int jj ;        /* column j of A*Q is column jj of the input matrix A */
+
+    /* variables that need to be pushed then popped from the stack: */
+    Int p ;         /* current index into the adj. list for node j */
+    /* the variables j and p are stacked in Jstack and Pstack */
+
+    /* local copies of variables in the calling routine */
+    Int nblocks   = *p_nblocks ;
+    Int timestamp = *p_timestamp ;
+
+    /* ---------------------------------------------------------------------- */
+    /* start a DFS at node j (same as the recursive call dfs (EMPTY, j)) */
+    /* ---------------------------------------------------------------------- */
+
+    chead = 0 ;             /* component stack is empty */
+    jhead = 0 ;             /* Jstack and Pstack are empty */
+    Jstack [0] = j ;        /* put the first node j on the Jstack */
+    ASSERT (Flag [j] == UNVISITED) ;
+
+    while (jhead >= 0)
+    {
+        j = Jstack [jhead] ;        /* grab the node j from the top of Jstack */
+
+        /* determine which column jj of the A is column j of A*Q */
+        jj = (Q == (Int *) NULL) ? (j) : (BTF_UNFLIP (Q [j])) ;
+        pend = Ap [jj+1] ;          /* j's row index list ends at Ai [pend-1] */
+
+        if (Flag [j] == UNVISITED)
+        {
+
+            /* -------------------------------------------------------------- */
+            /* prework at node j */
+            /* -------------------------------------------------------------- */
+
+            /* node j is being visited for the first time */
+            Cstack [++chead] = j ;          /* push j onto the stack */
+            timestamp++ ;                   /* get a timestamp */
+            Time [j] = timestamp ;          /* give the timestamp to node j */
+            Low [j] = timestamp ;
+            Flag [j] = UNASSIGNED ;         /* flag node j as visited */
+
+            /* -------------------------------------------------------------- */
+            /* set Pstack [jhead] to the first entry in column j to scan */
+            /* -------------------------------------------------------------- */
+
+            Pstack [jhead] = Ap [jj] ;
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* DFS rooted at node j (start it, or continue where left off) */
+        /* ------------------------------------------------------------------ */
+
+        for (p = Pstack [jhead] ; p < pend ; p++)
+        {
+            i = Ai [p] ;    /* examine the edge from node j to node i */
+            if (Flag [i] == UNVISITED)
+            {
+                /* Node i has not been visited - start a DFS at node i.
+                 * Keep track of where we left off in the scan of adjacency list
+                 * of node j so we can restart j where we left off. */
+                Pstack [jhead] = p + 1 ;
+                /* Push i onto the stack and immediately break
+                 * so we can recurse on node i. */
+                Jstack [++jhead] = i ;
+                ASSERT (Time [i] == EMPTY) ;
+                ASSERT (Low [i] == EMPTY) ;
+                /* break here to do what the recursive call dfs (j,i) does */
+                break ;
+            }
+            else if (Flag [i] == UNASSIGNED)
+            {
+                /* Node i has been visited, but still unassigned to a block
+                 * this is a back or cross edge if Time [i] < Time [j].
+                 * Note that i might equal j, in which case this code does
+                 * nothing. */
+                ASSERT (Time [i] > 0) ;
+                ASSERT (Low [i] > 0) ;
+                Low [j] = MIN (Low [j], Time [i]) ;
+            }
+        }
+
+        if (p == pend)
+        {
+            /* If all adjacent nodes of j are already visited, pop j from
+             * Jstack and do the post work for node j.  This also pops p
+             * from the Pstack. */
+            jhead-- ;
+
+            /* -------------------------------------------------------------- */
+            /* postwork at node j */
+            /* -------------------------------------------------------------- */
+
+            /* determine if node j is the head of a component */
+            if (Low [j] == Time [j])
+            {
+                /* pop all nodes in this SCC from Cstack */
+                while (TRUE)
+                {
+                    ASSERT (chead >= 0) ;       /* stack not empty (j in it) */
+                    i = Cstack [chead--] ;      /* pop a node from the Cstack */
+                    ASSERT (i >= 0) ;
+                    ASSERT (Flag [i] == UNASSIGNED) ;
+                    Flag [i] = nblocks ;        /* assign i to current block */
+                    if (i == j) break ;         /* current block ends at j */
+                }
+                nblocks++ ;     /* one more block has been found */
+            }
+            /* update Low [parent], if the parent exists */
+            if (jhead >= 0)
+            {
+                parent = Jstack [jhead] ;
+                Low [parent] = MIN (Low [parent], Low [j]) ;
+            }
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* cleanup: update timestamp and nblocks */
+    /* ---------------------------------------------------------------------- */
+
+    *p_timestamp = timestamp ;
+    *p_nblocks   = nblocks ;
+}
+
+#else
+
+/* ========================================================================== */
+/* === dfs: recursive version (only for illustration) ======================= */
+/* ========================================================================== */
+
+/* The following is a recursive version of dfs, which computes identical results
+ * as the non-recursive dfs.  It is included here because it is easier to read.
+ * Compare the comments in the code below with the identical comments in the
+ * non-recursive code above, and that will help you see the correlation between
+ * the two routines.
+ *
+ * This routine can cause stack overflow, and is thus not recommended for heavy
+ * usage, particularly for large matrices.  To help in delaying stack overflow,
+ * global variables are used, reducing the amount of information each call to
+ * dfs places on the call/return stack (the integers i, j, p, parent, and the
+ * return address).  Note that this means the recursive code is not thread-safe.
+ * To try this version, compile the code with -DRECURSIVE or include the
+ * following line at the top of this file:
+
+#define RECURSIVE
+
+ */
+
+static Int  /* for recursive illustration only, not for production use */
+    chead, timestamp, nblocks, n, *Ap, *Ai, *Flag, *Cstack, *Time, *Low,
+    *P, *R, *Q ;
+
+static void dfs
+(
+    Int parent,         /* came from parent node */
+    Int j               /* at node j in the DFS */
+)
+{
+    Int p ;         /* current index into the adj. list for node j */
+    Int i ;         /* edge (j,i) considered; i can be next node to traverse */
+    Int jj ;        /* column j of A*Q is column jj of the input matrix A */
+
+    /* ---------------------------------------------------------------------- */
+    /* prework at node j */
+    /* ---------------------------------------------------------------------- */
+
+    /* node j is being visited for the first time */
+    Cstack [++chead] = j ;          /* push j onto the stack */
+    timestamp++ ;                   /* get a timestamp */
+    Time [j] = timestamp ;          /* give the timestamp to node j */
+    Low [j] = timestamp ;
+    Flag [j] = UNASSIGNED ;         /* flag node j as visited */
+
+    /* ---------------------------------------------------------------------- */
+    /* DFS rooted at node j */
+    /* ---------------------------------------------------------------------- */
+
+    /* determine which column jj of the A is column j of A*Q */
+    jj = (Q == (Int *) NULL) ? (j) : (BTF_UNFLIP (Q [j])) ;
+    for (p = Ap [jj] ; p < Ap [jj+1] ; p++)
+    {
+        i = Ai [p] ;    /* examine the edge from node j to node i */
+        if (Flag [i] == UNVISITED)
+        {
+            /* Node i has not been visited - start a DFS at node i. */
+            dfs (j, i) ;
+        }
+        else if (Flag [i] == UNASSIGNED)
+        {
+            /* Node i has been visited, but still unassigned to a block
+             * this is a back or cross edge if Time [i] < Time [j].
+             * Note that i might equal j, in which case this code does
+             * nothing. */
+            Low [j] = MIN (Low [j], Time [i]) ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* postwork at node j */
+    /* ---------------------------------------------------------------------- */
+
+    /* determine if node j is the head of a component */
+    if (Low [j] == Time [j])
+    {
+        /* pop all nodes in this strongly connected component from Cstack */
+        while (TRUE)
+        {
+            i = Cstack [chead--] ;      /* pop a node from the Cstack */
+            Flag [i] = nblocks ;        /* assign node i to current block */
+            if (i == j) break ;         /* current block ends at node j */
+        }
+        nblocks++ ;     /* one more block has been found */
+    }
+    /* update Low [parent] */
+    if (parent != EMPTY)
+    {
+        /* Note that this could be done with Low[j] = MIN(Low[j],Low[i]) just
+         * after the dfs (j,i) statement above, and then parent would not have
+         * to be an input argument.  Putting it here places all the postwork
+         * for node j in one place, thus making the non-recursive DFS easier. */
+        Low [parent] = MIN (Low [parent], Low [j]) ;
+    }
+}
+
+#endif
+
+/* ========================================================================== */
+/* === btf_strongcomp ======================================================= */
+/* ========================================================================== */
+
+#ifndef RECURSIVE
+
+Int BTF(strongcomp) /* return # of strongly connected components */
+(
+    /* input, not modified: */
+    Int n,          /* A is n-by-n in compressed column form */
+    Int Ap [ ],     /* size n+1 */
+    Int Ai [ ],     /* size nz = Ap [n] */
+
+    /* optional input, modified (if present) on output: */
+    Int Q [ ],      /* size n, input column permutation.  The permutation Q can
+                     * include a flag which indicates an unmatched row.
+                     * jold = BTF_UNFLIP (Q [jnew]) is the permutation;
+                     * this function ingnores these flags.  On output, it is
+                     * modified according to the permutation P. */
+
+    /* output, not defined on input: */
+    Int P [ ],      /* size n.  P [k] = j if row and column j are kth row/col
+                     * in permuted matrix. */
+    Int R [ ],      /* size n+1.  kth block is in rows/cols R[k] ... R[k+1]-1
+                     * of the permuted matrix. */
+
+    /* workspace, not defined on input or output: */
+    Int Work [ ]    /* size 4n */
+)
+
+#else
+
+Int BTF(strongcomp) /* recursive version - same as above except for Work size */
+(
+    Int n_in,
+    Int Ap_in [ ],
+    Int Ai_in [ ],
+    Int Q_in [ ],
+    Int P_in [ ],
+    Int R_in [ ],
+    Int Work [ ]    /* size 2n */
+)
+
+#endif
+
+{
+    Int j, k, b ;
+
+#ifndef RECURSIVE
+    Int timestamp, nblocks, *Flag, *Cstack, *Time, *Low, *Jstack, *Pstack ;
+#else
+    n = n_in ;
+    Ap = Ap_in ;
+    Ai = Ai_in ;
+    Q = Q_in ;
+    P = P_in ;
+    R = R_in ;
+    chead = EMPTY ;
+#endif
+
+    /* ---------------------------------------------------------------------- */
+    /* get and initialize workspace */
+    /* ---------------------------------------------------------------------- */
+
+    /* timestamp is incremented each time a new node is visited.
+     *
+     * Time [j] is the timestamp given to node j.
+     *
+     * Low [j] is the lowest timestamp of any node reachable from j via either
+     * a path to any descendent of j in the DFS tree, or via a single edge to
+     * an either an ancestor (a back edge) or another node that's neither an
+     * ancestor nor a descendant (a cross edge).  If Low [j] is equal to
+     * the timestamp of node j (Time [j]), then node j is the "head" of a
+     * strongly connected component (SCC).  That is, it is the first node
+     * visited in its strongly connected component, and the DFS subtree rooted
+     * at node j spans all the nodes of the strongly connected component.
+     *
+     * The term "block" and "component" are used interchangebly in this code;
+     * "block" being a matrix term and "component" being a graph term for the
+     * same thing.
+     *
+     * When a node is visited, it is placed on the Cstack (for "component"
+     * stack).  When node j is found to be an SCC head, all the nodes from the
+     * top of the stack to node j itself form the nodes in the SCC.  This Cstack
+     * is used for both the recursive and non-recursive versions.
+     */
+
+    Time   = Work ; Work += n ;
+    Flag   = Work ; Work += n ;
+    Low    = P ;                /* use output array P as workspace for Low */
+    Cstack = R ;                /* use output array R as workspace for Cstack */
+
+#ifndef RECURSIVE
+    /* stack for non-recursive dfs */
+    Jstack = Work ; Work += n ;     /* stack for j */
+    Pstack = Work ;                 /* stack for p */
+#endif
+
+    for (j = 0 ; j < n ; j++)
+    {
+        Flag [j] = UNVISITED ;
+        Low [j] = EMPTY ;
+        Time [j] = EMPTY ;
+#ifndef NDEBUG
+        Cstack [j] = EMPTY ;
+#ifndef RECURSIVE
+        Jstack [j] = EMPTY ;
+        Pstack [j] = EMPTY ;
+#endif
+#endif
+    }
+
+    timestamp = 0 ;     /* each node given a timestamp when it is visited */
+    nblocks = 0 ;       /* number of blocks found so far */
+
+    /* ---------------------------------------------------------------------- */
+    /* find the connected components via a depth-first-search */
+    /* ---------------------------------------------------------------------- */
+
+    for (j = 0 ; j < n ; j++)
+    {
+        /* node j is unvisited or assigned to a block. Cstack is empty. */
+        ASSERT (Flag [j] == UNVISITED || (Flag [j] >= 0 && Flag [j] < nblocks));
+        if (Flag [j] == UNVISITED)
+        {
+#ifndef RECURSIVE
+            /* non-recursive dfs (default) */
+            dfs (j, Ap, Ai, Q, Time, Flag, Low, &nblocks, &timestamp,
+                    Cstack, Jstack, Pstack) ;
+#else
+            /* recursive dfs (for illustration only) */
+            ASSERT (chead == EMPTY) ;
+            dfs (EMPTY, j) ;
+            ASSERT (chead == EMPTY) ;
+#endif
+        }
+    }
+    ASSERT (timestamp == n) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* construct the block boundary array, R */
+    /* ---------------------------------------------------------------------- */
+
+    for (b = 0 ; b < nblocks ; b++)
+    {
+        R [b] = 0 ;
+    }
+    for (j = 0 ; j < n ; j++)
+    {
+        /* node j has been assigned to block b = Flag [j] */
+        ASSERT (Time [j] > 0 && Time [j] <= n) ;
+        ASSERT (Low [j] > 0 && Low [j] <= n) ;
+        ASSERT (Flag [j] >= 0 && Flag [j] < nblocks) ;
+        R [Flag [j]]++ ;
+    }
+    /* R [b] is now the number of nodes in block b.  Compute cumulative sum
+     * of R, using Time [0 ... nblocks-1] as workspace. */
+    Time [0] = 0 ;
+    for (b = 1 ; b < nblocks ; b++)
+    {
+        Time [b] = Time [b-1] + R [b-1] ;
+    }
+    for (b = 0 ; b < nblocks ; b++)
+    {
+        R [b] = Time [b] ;
+    }
+    R [nblocks] = n ;
+
+    /* ---------------------------------------------------------------------- */
+    /* construct the permutation, preserving the natural order */
+    /* ---------------------------------------------------------------------- */
+
+#ifndef NDEBUG
+    for (k = 0 ; k < n ; k++)
+    {
+        P [k] = EMPTY ;
+    }
+#endif
+
+    for (j = 0 ; j < n ; j++)
+    {
+        /* place column j in the permutation */
+        P [Time [Flag [j]]++] = j ;
+    }
+
+#ifndef NDEBUG
+    for (k = 0 ; k < n ; k++)
+    {
+        ASSERT (P [k] != EMPTY) ;
+    }
+#endif
+
+    /* Now block b consists of the nodes k1 to k2-1 in the permuted matrix,
+     * where k1 = R [b] and k2 = R [b+1].  Row and column j of the original
+     * matrix becomes row and column P [k] of the permuted matrix.  The set of
+     * of rows/columns (nodes) in block b is given by P [k1 ... k2-1], and this
+     * set is sorted in ascending order.  Thus, if the matrix consists of just
+     * one block, P is the identity permutation. */
+
+    /* ---------------------------------------------------------------------- */
+    /* if Q is present on input, set Q = Q*P' */
+    /* ---------------------------------------------------------------------- */
+
+    if (Q != (Int *) NULL)
+    {
+        /* We found a symmetric permutation P for the matrix A*Q.  The overall
+         * permutation is thus P*(A*Q)*P'.  Set Q=Q*P' so that the final
+         * permutation is P*A*Q.  Use Time as workspace.  Note that this
+         * preserves the negative values of Q if the matrix is structurally
+         * singular. */
+        for (k = 0 ; k < n ; k++)
+        {
+            Time [k] = Q [P [k]] ;
+        }
+        for (k = 0 ; k < n ; k++)
+        {
+            Q [k] = Time [k] ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* how to traverse the permuted matrix */
+    /* ---------------------------------------------------------------------- */
+
+    /* If Q is not present, the following code can be used to traverse the
+     * permuted matrix P*A*P'
+     *
+     *      // compute the inverse of P
+     *      for (knew = 0 ; knew < n ; knew++)
+     *      {
+     *          // row and column kold in the old matrix is row/column knew
+     *          // in the permuted matrix P*A*P'
+     *          kold = P [knew] ;
+     *          Pinv [kold] = knew ;
+     *      }
+     *      for (b = 0 ; b < nblocks ; b++)
+     *      {
+     *          // traverse block b of the permuted matrix P*A*P'
+     *          k1 = R [b] ;
+     *          k2 = R [b+1] ;
+     *          nk = k2 - k1 ;
+     *          for (jnew = k1 ; jnew < k2 ; jnew++)
+     *          {
+     *              jold = P [jnew] ;
+     *              for (p = Ap [jold] ; p < Ap [jold+1] ; p++)
+     *              {
+     *                  iold = Ai [p] ;
+     *                  inew = Pinv [iold] ;
+     *                  // Entry in the old matrix is A (iold, jold), and its
+     *                  // position in the new matrix P*A*P' is (inew, jnew).
+     *                  // Let B be the bth diagonal block of the permuted
+     *                  // matrix.  If inew >= k1, then this entry is in row/
+     *                  // column (inew-k1, jnew-k1) of the nk-by-nk matrix B.
+     *                  // Otherwise, the entry is in the upper block triangular
+     *                  // part, not in any diagonal block.
+     *              }
+     *          }
+     *      }
+     *
+     * If Q is present replace the above statement
+     *          jold = P [jnew] ;
+     * with
+     *          jold = Q [jnew] ;
+     * or
+     *          jold = BTF_UNFLIP (Q [jnew]) ;
+     *
+     * then entry A (iold,jold) in the old (unpermuted) matrix is at (inew,jnew)
+     * in the permuted matrix P*A*Q.  Everything else remains the same as the
+     * above (simply replace P*A*P' with P*A*Q in the above comments).
+     */
+
+    /* ---------------------------------------------------------------------- */
+    /* return # of blocks / # of strongly connected components */
+    /* ---------------------------------------------------------------------- */
+
+    return (nblocks) ;
+}
diff --git a/src/maths/KLU/colamd.c b/src/maths/KLU/colamd.c
new file mode 100644
index 000000000..5fe20d628
--- /dev/null
+++ b/src/maths/KLU/colamd.c
@@ -0,0 +1,3611 @@
+/* ========================================================================== */
+/* === colamd/symamd - a sparse matrix column ordering algorithm ============ */
+/* ========================================================================== */
+
+/* COLAMD / SYMAMD
+
+    colamd:  an approximate minimum degree column ordering algorithm,
+    	for LU factorization of symmetric or unsymmetric matrices,
+	QR factorization, least squares, interior point methods for
+	linear programming problems, and other related problems.
+
+    symamd:  an approximate minimum degree ordering algorithm for Cholesky
+    	factorization of symmetric matrices.
+
+    Purpose:
+
+	Colamd computes a permutation Q such that the Cholesky factorization of
+	(AQ)'(AQ) has less fill-in and requires fewer floating point operations
+	than A'A.  This also provides a good ordering for sparse partial
+	pivoting methods, P(AQ) = LU, where Q is computed prior to numerical
+	factorization, and P is computed during numerical factorization via
+	conventional partial pivoting with row interchanges.  Colamd is the
+	column ordering method used in SuperLU, part of the ScaLAPACK library.
+	It is also available as built-in function in MATLAB Version 6,
+	available from MathWorks, Inc. (http://www.mathworks.com).  This
+	routine can be used in place of colmmd in MATLAB.
+
+    	Symamd computes a permutation P of a symmetric matrix A such that the
+	Cholesky factorization of PAP' has less fill-in and requires fewer
+	floating point operations than A.  Symamd constructs a matrix M such
+	that M'M has the same nonzero pattern of A, and then orders the columns
+	of M using colmmd.  The column ordering of M is then returned as the
+	row and column ordering P of A. 
+
+    Authors:
+
+	The authors of the code itself are Stefan I. Larimore and Timothy A.
+	Davis (davis at cise.ufl.edu), University of Florida.  The algorithm was
+	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+	Ng, Oak Ridge National Laboratory.
+
+    Acknowledgements:
+
+	This work was supported by the National Science Foundation, under
+	grants DMS-9504974 and DMS-9803599.
+
+    Copyright and License:
+
+	Copyright (c) 1998-2007, Timothy A. Davis, All Rights Reserved.
+	COLAMD is also available under alternate licenses, contact T. Davis
+	for details.
+
+	This library is free software; you can redistribute it and/or
+	modify it under the terms of the GNU Lesser General Public
+	License as published by the Free Software Foundation; either
+	version 2.1 of the License, or (at your option) any later version.
+
+	This library is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+	Lesser General Public License for more details.
+
+	You should have received a copy of the GNU Lesser General Public
+	License along with this library; if not, write to the Free Software
+	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+	USA
+
+	Permission is hereby granted to use or copy this program under the
+	terms of the GNU LGPL, provided that the Copyright, this License,
+	and the Availability of the original version is retained on all copies.
+	User documentation of any code that uses this code or any modified
+	version of this code must cite the Copyright, this License, the
+	Availability note, and "Used by permission." Permission to modify
+	the code and to distribute modified code is granted, provided the
+	Copyright, this License, and the Availability note are retained,
+	and a notice that the code was modified is included.
+
+    Availability:
+
+	The colamd/symamd library is available at
+
+	    http://www.cise.ufl.edu/research/sparse/colamd/
+
+	This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.c
+	file.  It requires the colamd.h file.  It is required by the colamdmex.c
+	and symamdmex.c files, for the MATLAB interface to colamd and symamd.
+	Appears as ACM Algorithm 836.
+
+    See the ChangeLog file for changes since Version 1.0.
+
+    References:
+
+	T. A. Davis, J. R. Gilbert, S. Larimore, E. Ng, An approximate column
+	minimum degree ordering algorithm, ACM Transactions on Mathematical
+	Software, vol. 30, no. 3., pp. 353-376, 2004.
+
+	T. A. Davis, J. R. Gilbert, S. Larimore, E. Ng, Algorithm 836: COLAMD,
+	an approximate column minimum degree ordering algorithm, ACM
+	Transactions on Mathematical Software, vol. 30, no. 3., pp. 377-380,
+	2004.
+
+*/
+
+/* ========================================================================== */
+/* === Description of user-callable routines ================================ */
+/* ========================================================================== */
+
+/* COLAMD includes both int and UF_long versions of all its routines.  The
+ * description below is for the int version.  For UF_long, all int arguments
+ * become UF_long.  UF_long is normally defined as long, except for WIN64.
+
+    ----------------------------------------------------------------------------
+    colamd_recommended:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    size_t colamd_recommended (int nnz, int n_row, int n_col) ;
+	    size_t colamd_l_recommended (UF_long nnz, UF_long n_row,
+		UF_long n_col) ;
+
+	Purpose:
+
+	    Returns recommended value of Alen for use by colamd.  Returns 0
+	    if any input argument is negative.  The use of this routine
+	    is optional.  Not needed for symamd, which dynamically allocates
+	    its own memory.
+
+	    Note that in v2.4 and earlier, these routines returned int or long.
+	    They now return a value of type size_t.
+
+	Arguments (all input arguments):
+
+	    int nnz ;		Number of nonzeros in the matrix A.  This must
+				be the same value as p [n_col] in the call to
+				colamd - otherwise you will get a wrong value
+				of the recommended memory to use.
+
+	    int n_row ;		Number of rows in the matrix A.
+
+	    int n_col ;		Number of columns in the matrix A.
+
+    ----------------------------------------------------------------------------
+    colamd_set_defaults:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    colamd_set_defaults (double knobs [COLAMD_KNOBS]) ;
+	    colamd_l_set_defaults (double knobs [COLAMD_KNOBS]) ;
+
+	Purpose:
+
+	    Sets the default parameters.  The use of this routine is optional.
+
+	Arguments:
+
+	    double knobs [COLAMD_KNOBS] ;	Output only.
+
+		NOTE: the meaning of the dense row/col knobs has changed in v2.4
+
+		knobs [0] and knobs [1] control dense row and col detection:
+
+		Colamd: rows with more than
+		max (16, knobs [COLAMD_DENSE_ROW] * sqrt (n_col))
+		entries are removed prior to ordering.  Columns with more than
+		max (16, knobs [COLAMD_DENSE_COL] * sqrt (MIN (n_row,n_col)))
+		entries are removed prior to
+		ordering, and placed last in the output column ordering. 
+
+		Symamd: uses only knobs [COLAMD_DENSE_ROW], which is knobs [0].
+		Rows and columns with more than
+		max (16, knobs [COLAMD_DENSE_ROW] * sqrt (n))
+		entries are removed prior to ordering, and placed last in the
+		output ordering.
+
+		COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
+		respectively, in colamd.h.  Default values of these two knobs
+		are both 10.  Currently, only knobs [0] and knobs [1] are
+		used, but future versions may use more knobs.  If so, they will
+		be properly set to their defaults by the future version of
+		colamd_set_defaults, so that the code that calls colamd will
+		not need to change, assuming that you either use
+		colamd_set_defaults, or pass a (double *) NULL pointer as the
+		knobs array to colamd or symamd.
+
+	    knobs [2]: aggressive absorption
+
+	        knobs [COLAMD_AGGRESSIVE] controls whether or not to do
+	        aggressive absorption during the ordering.  Default is TRUE.
+
+
+    ----------------------------------------------------------------------------
+    colamd:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    int colamd (int n_row, int n_col, int Alen, int *A, int *p,
+	    	double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS]) ;
+	    UF_long colamd_l (UF_long n_row, UF_long n_col, UF_long Alen,
+		UF_long *A, UF_long *p, double knobs [COLAMD_KNOBS],
+		UF_long stats [COLAMD_STATS]) ;
+
+	Purpose:
+
+	    Computes a column ordering (Q) of A such that P(AQ)=LU or
+	    (AQ)'AQ=LL' have less fill-in and require fewer floating point
+	    operations than factorizing the unpermuted matrix A or A'A,
+	    respectively.
+	    
+	Returns:
+
+	    TRUE (1) if successful, FALSE (0) otherwise.
+
+	Arguments:
+
+	    int n_row ;		Input argument.
+
+		Number of rows in the matrix A.
+		Restriction:  n_row >= 0.
+		Colamd returns FALSE if n_row is negative.
+
+	    int n_col ;		Input argument.
+
+		Number of columns in the matrix A.
+		Restriction:  n_col >= 0.
+		Colamd returns FALSE if n_col is negative.
+
+	    int Alen ;		Input argument.
+
+		Restriction (see note):
+		Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col
+		Colamd returns FALSE if these conditions are not met.
+
+		Note:  this restriction makes an modest assumption regarding
+		the size of the two typedef's structures in colamd.h.
+		We do, however, guarantee that
+
+			Alen >= colamd_recommended (nnz, n_row, n_col)
+
+		will be sufficient.  Note: the macro version does not check
+		for integer overflow, and thus is not recommended.  Use
+		the colamd_recommended routine instead.
+
+	    int A [Alen] ;	Input argument, undefined on output.
+
+		A is an integer array of size Alen.  Alen must be at least as
+		large as the bare minimum value given above, but this is very
+		low, and can result in excessive run time.  For best
+		performance, we recommend that Alen be greater than or equal to
+		colamd_recommended (nnz, n_row, n_col), which adds
+		nnz/5 to the bare minimum value given above.
+
+		On input, the row indices of the entries in column c of the
+		matrix are held in A [(p [c]) ... (p [c+1]-1)].  The row indices
+		in a given column c need not be in ascending order, and
+		duplicate row indices may be be present.  However, colamd will
+		work a little faster if both of these conditions are met
+		(Colamd puts the matrix into this format, if it finds that the
+		the conditions are not met).
+
+		The matrix is 0-based.  That is, rows are in the range 0 to
+		n_row-1, and columns are in the range 0 to n_col-1.  Colamd
+		returns FALSE if any row index is out of range.
+
+		The contents of A are modified during ordering, and are
+		undefined on output.
+
+	    int p [n_col+1] ;	Both input and output argument.
+
+		p is an integer array of size n_col+1.  On input, it holds the
+		"pointers" for the column form of the matrix A.  Column c of
+		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
+		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+		for all c in the range 0 to n_col-1.  The value p [n_col] is
+		thus the total number of entries in the pattern of the matrix A.
+		Colamd returns FALSE if these conditions are not met.
+
+		On output, if colamd returns TRUE, the array p holds the column
+		permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is
+		the first column index in the new ordering, and p [n_col-1] is
+		the last.  That is, p [k] = j means that column j of A is the
+		kth pivot column, in AQ, where k is in the range 0 to n_col-1
+		(p [0] = j means that column j of A is the first column in AQ).
+
+		If colamd returns FALSE, then no permutation is returned, and
+		p is undefined on output.
+
+	    double knobs [COLAMD_KNOBS] ;	Input argument.
+
+		See colamd_set_defaults for a description.
+
+	    int stats [COLAMD_STATS] ;		Output argument.
+
+		Statistics on the ordering, and error status.
+		See colamd.h for related definitions.
+		Colamd returns FALSE if stats is not present.
+
+		stats [0]:  number of dense or empty rows ignored.
+
+		stats [1]:  number of dense or empty columns ignored (and
+				ordered last in the output permutation p)
+				Note that a row can become "empty" if it
+				contains only "dense" and/or "empty" columns,
+				and similarly a column can become "empty" if it
+				only contains "dense" and/or "empty" rows.
+
+		stats [2]:  number of garbage collections performed.
+				This can be excessively high if Alen is close
+				to the minimum required value.
+
+		stats [3]:  status code.  < 0 is an error code.
+			    > 1 is a warning or notice.
+
+			0	OK.  Each column of the input matrix contained
+				row indices in increasing order, with no
+				duplicates.
+
+			1	OK, but columns of input matrix were jumbled
+				(unsorted columns or duplicate entries).  Colamd
+				had to do some extra work to sort the matrix
+				first and remove duplicate entries, but it
+				still was able to return a valid permutation
+				(return value of colamd was TRUE).
+
+					stats [4]: highest numbered column that
+						is unsorted or has duplicate
+						entries.
+					stats [5]: last seen duplicate or
+						unsorted row index.
+					stats [6]: number of duplicate or
+						unsorted row indices.
+
+			-1	A is a null pointer
+
+			-2	p is a null pointer
+
+			-3 	n_row is negative
+
+					stats [4]: n_row
+
+			-4	n_col is negative
+
+					stats [4]: n_col
+
+			-5	number of nonzeros in matrix is negative
+
+					stats [4]: number of nonzeros, p [n_col]
+
+			-6	p [0] is nonzero
+
+					stats [4]: p [0]
+
+			-7	A is too small
+
+					stats [4]: required size
+					stats [5]: actual size (Alen)
+
+			-8	a column has a negative number of entries
+
+					stats [4]: column with < 0 entries
+					stats [5]: number of entries in col
+
+			-9	a row index is out of bounds
+
+					stats [4]: column with bad row index
+					stats [5]: bad row index
+					stats [6]: n_row, # of rows of matrx
+
+			-10	(unused; see symamd.c)
+
+			-999	(unused; see symamd.c)
+
+		Future versions may return more statistics in the stats array.
+
+	Example:
+	
+	    See http://www.cise.ufl.edu/research/sparse/colamd/example.c
+	    for a complete example.
+
+	    To order the columns of a 5-by-4 matrix with 11 nonzero entries in
+	    the following nonzero pattern
+
+	    	x 0 x 0
+		x 0 x x
+		0 x x 0
+		0 0 x x
+		x x 0 0
+
+	    with default knobs and no output statistics, do the following:
+
+		#include "colamd.h"
+		#define ALEN 100
+		int A [ALEN] = {0, 1, 4, 2, 4, 0, 1, 2, 3, 1, 3} ;
+		int p [ ] = {0, 3, 5, 9, 11} ;
+		int stats [COLAMD_STATS] ;
+		colamd (5, 4, ALEN, A, p, (double *) NULL, stats) ;
+
+	    The permutation is returned in the array p, and A is destroyed.
+
+    ----------------------------------------------------------------------------
+    symamd:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    int symamd (int n, int *A, int *p, int *perm,
+	    	double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS],
+		void (*allocate) (size_t, size_t), void (*release) (void *)) ;
+	    UF_long symamd_l (UF_long n, UF_long *A, UF_long *p, UF_long *perm,
+	    	double knobs [COLAMD_KNOBS], UF_long stats [COLAMD_STATS],
+		void (*allocate) (size_t, size_t), void (*release) (void *)) ;
+
+	Purpose:
+
+    	    The symamd routine computes an ordering P of a symmetric sparse
+	    matrix A such that the Cholesky factorization PAP' = LL' remains
+	    sparse.  It is based on a column ordering of a matrix M constructed
+	    so that the nonzero pattern of M'M is the same as A.  The matrix A
+	    is assumed to be symmetric; only the strictly lower triangular part
+	    is accessed.  You must pass your selected memory allocator (usually
+	    calloc/free or mxCalloc/mxFree) to symamd, for it to allocate
+	    memory for the temporary matrix M.
+
+	Returns:
+
+	    TRUE (1) if successful, FALSE (0) otherwise.
+
+	Arguments:
+
+	    int n ;		Input argument.
+
+	    	Number of rows and columns in the symmetrix matrix A.
+		Restriction:  n >= 0.
+		Symamd returns FALSE if n is negative.
+
+	    int A [nnz] ;	Input argument.
+
+	    	A is an integer array of size nnz, where nnz = p [n].
+		
+		The row indices of the entries in column c of the matrix are
+		held in A [(p [c]) ... (p [c+1]-1)].  The row indices in a
+		given column c need not be in ascending order, and duplicate
+		row indices may be present.  However, symamd will run faster
+		if the columns are in sorted order with no duplicate entries. 
+
+		The matrix is 0-based.  That is, rows are in the range 0 to
+		n-1, and columns are in the range 0 to n-1.  Symamd
+		returns FALSE if any row index is out of range.
+
+		The contents of A are not modified.
+
+	    int p [n+1] ;   	Input argument.
+
+		p is an integer array of size n+1.  On input, it holds the
+		"pointers" for the column form of the matrix A.  Column c of
+		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
+		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+		for all c in the range 0 to n-1.  The value p [n] is
+		thus the total number of entries in the pattern of the matrix A.
+		Symamd returns FALSE if these conditions are not met.
+
+		The contents of p are not modified.
+
+	    int perm [n+1] ;   	Output argument.
+
+		On output, if symamd returns TRUE, the array perm holds the
+		permutation P, where perm [0] is the first index in the new
+		ordering, and perm [n-1] is the last.  That is, perm [k] = j
+		means that row and column j of A is the kth column in PAP',
+		where k is in the range 0 to n-1 (perm [0] = j means
+		that row and column j of A are the first row and column in
+		PAP').  The array is used as a workspace during the ordering,
+		which is why it must be of length n+1, not just n.
+
+	    double knobs [COLAMD_KNOBS] ;	Input argument.
+
+		See colamd_set_defaults for a description.
+
+	    int stats [COLAMD_STATS] ;		Output argument.
+
+		Statistics on the ordering, and error status.
+		See colamd.h for related definitions.
+		Symamd returns FALSE if stats is not present.
+
+		stats [0]:  number of dense or empty row and columns ignored
+				(and ordered last in the output permutation 
+				perm).  Note that a row/column can become
+				"empty" if it contains only "dense" and/or
+				"empty" columns/rows.
+
+		stats [1]:  (same as stats [0])
+
+		stats [2]:  number of garbage collections performed.
+
+		stats [3]:  status code.  < 0 is an error code.
+			    > 1 is a warning or notice.
+
+			0	OK.  Each column of the input matrix contained
+				row indices in increasing order, with no
+				duplicates.
+
+			1	OK, but columns of input matrix were jumbled
+				(unsorted columns or duplicate entries).  Symamd
+				had to do some extra work to sort the matrix
+				first and remove duplicate entries, but it
+				still was able to return a valid permutation
+				(return value of symamd was TRUE).
+
+					stats [4]: highest numbered column that
+						is unsorted or has duplicate
+						entries.
+					stats [5]: last seen duplicate or
+						unsorted row index.
+					stats [6]: number of duplicate or
+						unsorted row indices.
+
+			-1	A is a null pointer
+
+			-2	p is a null pointer
+
+			-3	(unused, see colamd.c)
+
+			-4 	n is negative
+
+					stats [4]: n
+
+			-5	number of nonzeros in matrix is negative
+
+					stats [4]: # of nonzeros (p [n]).
+
+			-6	p [0] is nonzero
+
+					stats [4]: p [0]
+
+			-7	(unused)
+
+			-8	a column has a negative number of entries
+
+					stats [4]: column with < 0 entries
+					stats [5]: number of entries in col
+
+			-9	a row index is out of bounds
+
+					stats [4]: column with bad row index
+					stats [5]: bad row index
+					stats [6]: n_row, # of rows of matrx
+
+			-10	out of memory (unable to allocate temporary
+				workspace for M or count arrays using the
+				"allocate" routine passed into symamd).
+
+		Future versions may return more statistics in the stats array.
+
+	    void * (*allocate) (size_t, size_t)
+
+	    	A pointer to a function providing memory allocation.  The
+		allocated memory must be returned initialized to zero.  For a
+		C application, this argument should normally be a pointer to
+		calloc.  For a MATLAB mexFunction, the routine mxCalloc is
+		passed instead.
+
+	    void (*release) (size_t, size_t)
+
+	    	A pointer to a function that frees memory allocated by the
+		memory allocation routine above.  For a C application, this
+		argument should normally be a pointer to free.  For a MATLAB
+		mexFunction, the routine mxFree is passed instead.
+
+
+    ----------------------------------------------------------------------------
+    colamd_report:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    colamd_report (int stats [COLAMD_STATS]) ;
+	    colamd_l_report (UF_long stats [COLAMD_STATS]) ;
+
+	Purpose:
+
+	    Prints the error status and statistics recorded in the stats
+	    array on the standard error output (for a standard C routine)
+	    or on the MATLAB output (for a mexFunction).
+
+	Arguments:
+
+	    int stats [COLAMD_STATS] ;	Input only.  Statistics from colamd.
+
+
+    ----------------------------------------------------------------------------
+    symamd_report:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    symamd_report (int stats [COLAMD_STATS]) ;
+	    symamd_l_report (UF_long stats [COLAMD_STATS]) ;
+
+	Purpose:
+
+	    Prints the error status and statistics recorded in the stats
+	    array on the standard error output (for a standard C routine)
+	    or on the MATLAB output (for a mexFunction).
+
+	Arguments:
+
+	    int stats [COLAMD_STATS] ;	Input only.  Statistics from symamd.
+
+
+*/
+
+/* ========================================================================== */
+/* === Scaffolding code definitions  ======================================== */
+/* ========================================================================== */
+
+/* Ensure that debugging is turned off: */
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+
+/* turn on debugging by uncommenting the following line
+ #undef NDEBUG
+*/
+
+/*
+   Our "scaffolding code" philosophy:  In our opinion, well-written library
+   code should keep its "debugging" code, and just normally have it turned off
+   by the compiler so as not to interfere with performance.  This serves
+   several purposes:
+
+   (1) assertions act as comments to the reader, telling you what the code
+	expects at that point.  All assertions will always be true (unless
+	there really is a bug, of course).
+
+   (2) leaving in the scaffolding code assists anyone who would like to modify
+	the code, or understand the algorithm (by reading the debugging output,
+	one can get a glimpse into what the code is doing).
+
+   (3) (gasp!) for actually finding bugs.  This code has been heavily tested
+	and "should" be fully functional and bug-free ... but you never know...
+
+    The code will become outrageously slow when debugging is
+    enabled.  To control the level of debugging output, set an environment
+    variable D to 0 (little), 1 (some), 2, 3, or 4 (lots).  When debugging,
+    you should see the following message on the standard output:
+
+    	colamd: debug version, D = 1 (THIS WILL BE SLOW!)
+
+    or a similar message for symamd.  If you don't, then debugging has not
+    been enabled.
+
+*/
+
+/* ========================================================================== */
+/* === Include files ======================================================== */
+/* ========================================================================== */
+
+#include "colamd.h"
+#include <limits.h>
+#include <math.h>
+
+#ifdef MATLAB_MEX_FILE
+#include "mex.h"
+#include "matrix.h"
+#endif /* MATLAB_MEX_FILE */
+
+#if !defined (NPRINT) || !defined (NDEBUG)
+#include <stdio.h>
+#endif
+
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+/* ========================================================================== */
+/* === int or UF_long ======================================================= */
+/* ========================================================================== */
+
+/* define UF_long */
+#include "UFconfig.h"
+
+#ifdef DLONG
+
+#define Int UF_long
+#define ID  UF_long_id
+#define Int_MAX UF_long_max
+
+#define COLAMD_recommended colamd_l_recommended
+#define COLAMD_set_defaults colamd_l_set_defaults
+#define COLAMD_MAIN colamd_l
+#define SYMAMD_MAIN symamd_l
+#define COLAMD_report colamd_l_report
+#define SYMAMD_report symamd_l_report
+
+#else
+
+#define Int int
+#define ID "%d"
+#define Int_MAX INT_MAX
+
+#define COLAMD_recommended colamd_recommended
+#define COLAMD_set_defaults colamd_set_defaults
+#define COLAMD_MAIN colamd
+#define SYMAMD_MAIN symamd
+#define COLAMD_report colamd_report
+#define SYMAMD_report symamd_report
+
+#endif
+
+/* ========================================================================== */
+/* === Row and Column structures ============================================ */
+/* ========================================================================== */
+
+/* User code that makes use of the colamd/symamd routines need not directly */
+/* reference these structures.  They are used only for colamd_recommended. */
+
+typedef struct Colamd_Col_struct
+{
+    Int start ;		/* index for A of first row in this column, or DEAD */
+			/* if column is dead */
+    Int length ;	/* number of rows in this column */
+    union
+    {
+	Int thickness ;	/* number of original columns represented by this */
+			/* col, if the column is alive */
+	Int parent ;	/* parent in parent tree super-column structure, if */
+			/* the column is dead */
+    } shared1 ;
+    union
+    {
+	Int score ;	/* the score used to maintain heap, if col is alive */
+	Int order ;	/* pivot ordering of this column, if col is dead */
+    } shared2 ;
+    union
+    {
+	Int headhash ;	/* head of a hash bucket, if col is at the head of */
+			/* a degree list */
+	Int hash ;	/* hash value, if col is not in a degree list */
+	Int prev ;	/* previous column in degree list, if col is in a */
+			/* degree list (but not at the head of a degree list) */
+    } shared3 ;
+    union
+    {
+	Int degree_next ;	/* next column, if col is in a degree list */
+	Int hash_next ;		/* next column, if col is in a hash list */
+    } shared4 ;
+
+} Colamd_Col ;
+
+typedef struct Colamd_Row_struct
+{
+    Int start ;		/* index for A of first col in this row */
+    Int length ;	/* number of principal columns in this row */
+    union
+    {
+	Int degree ;	/* number of principal & non-principal columns in row */
+	Int p ;		/* used as a row pointer in init_rows_cols () */
+    } shared1 ;
+    union
+    {
+	Int mark ;	/* for computing set differences and marking dead rows*/
+	Int first_column ;/* first column in row (used in garbage collection) */
+    } shared2 ;
+
+} Colamd_Row ;
+
+/* ========================================================================== */
+/* === Definitions ========================================================== */
+/* ========================================================================== */
+
+/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */
+#define PUBLIC
+#define PRIVATE static
+
+#define DENSE_DEGREE(alpha,n) \
+    ((Int) MAX (16.0, (alpha) * sqrt ((double) (n))))
+
+#define MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+#define ONES_COMPLEMENT(r) (-(r)-1)
+
+/* -------------------------------------------------------------------------- */
+/* Change for version 2.1:  define TRUE and FALSE only if not yet defined */  
+/* -------------------------------------------------------------------------- */
+
+#ifndef TRUE
+#define TRUE (1)
+#endif
+
+#ifndef FALSE
+#define FALSE (0)
+#endif
+
+/* -------------------------------------------------------------------------- */
+
+#define EMPTY	(-1)
+
+/* Row and column status */
+#define ALIVE	(0)
+#define DEAD	(-1)
+
+/* Column status */
+#define DEAD_PRINCIPAL		(-1)
+#define DEAD_NON_PRINCIPAL	(-2)
+
+/* Macros for row and column status update and checking. */
+#define ROW_IS_DEAD(r)			ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
+#define ROW_IS_MARKED_DEAD(row_mark)	(row_mark < ALIVE)
+#define ROW_IS_ALIVE(r)			(Row [r].shared2.mark >= ALIVE)
+#define COL_IS_DEAD(c)			(Col [c].start < ALIVE)
+#define COL_IS_ALIVE(c)			(Col [c].start >= ALIVE)
+#define COL_IS_DEAD_PRINCIPAL(c)	(Col [c].start == DEAD_PRINCIPAL)
+#define KILL_ROW(r)			{ Row [r].shared2.mark = DEAD ; }
+#define KILL_PRINCIPAL_COL(c)		{ Col [c].start = DEAD_PRINCIPAL ; }
+#define KILL_NON_PRINCIPAL_COL(c)	{ Col [c].start = DEAD_NON_PRINCIPAL ; }
+
+/* ========================================================================== */
+/* === Colamd reporting mechanism =========================================== */
+/* ========================================================================== */
+
+#if defined (MATLAB_MEX_FILE) || defined (MATHWORKS)
+/* In MATLAB, matrices are 1-based to the user, but 0-based internally */
+#define INDEX(i) ((i)+1)
+#else
+/* In C, matrices are 0-based and indices are reported as such in *_report */
+#define INDEX(i) (i)
+#endif
+
+/* All output goes through the PRINTF macro.  */
+#define PRINTF(params) { if (colamd_printf != NULL) (void) colamd_printf params ; }
+
+/* ========================================================================== */
+/* === Prototypes of PRIVATE routines ======================================= */
+/* ========================================================================== */
+
+PRIVATE Int init_rows_cols
+(
+    Int n_row,
+    Int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    Int A [],
+    Int p [],
+    Int stats [COLAMD_STATS]
+) ;
+
+PRIVATE void init_scoring
+(
+    Int n_row,
+    Int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    Int A [],
+    Int head [],
+    double knobs [COLAMD_KNOBS],
+    Int *p_n_row2,
+    Int *p_n_col2,
+    Int *p_max_deg
+) ;
+
+PRIVATE Int find_ordering
+(
+    Int n_row,
+    Int n_col,
+    Int Alen,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    Int A [],
+    Int head [],
+    Int n_col2,
+    Int max_deg,
+    Int pfree,
+    Int aggressive
+) ;
+
+PRIVATE void order_children
+(
+    Int n_col,
+    Colamd_Col Col [],
+    Int p []
+) ;
+
+PRIVATE void detect_super_cols
+(
+
+#ifndef NDEBUG
+    Int n_col,
+    Colamd_Row Row [],
+#endif /* NDEBUG */
+
+    Colamd_Col Col [],
+    Int A [],
+    Int head [],
+    Int row_start,
+    Int row_length
+) ;
+
+PRIVATE Int garbage_collection
+(
+    Int n_row,
+    Int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    Int A [],
+    Int *pfree
+) ;
+
+PRIVATE Int clear_mark
+(
+    Int tag_mark,
+    Int max_mark,
+    Int n_row,
+    Colamd_Row Row []
+) ;
+
+PRIVATE void print_report
+(
+    char *method,
+    Int stats [COLAMD_STATS]
+) ;
+
+/* ========================================================================== */
+/* === Debugging prototypes and definitions ================================= */
+/* ========================================================================== */
+
+#ifndef NDEBUG
+
+#include <assert.h>
+
+/* colamd_debug is the *ONLY* global variable, and is only */
+/* present when debugging */
+
+PRIVATE Int colamd_debug = 0 ;	/* debug print level */
+
+#define DEBUG0(params) { PRINTF (params) ; }
+#define DEBUG1(params) { if (colamd_debug >= 1) PRINTF (params) ; }
+#define DEBUG2(params) { if (colamd_debug >= 2) PRINTF (params) ; }
+#define DEBUG3(params) { if (colamd_debug >= 3) PRINTF (params) ; }
+#define DEBUG4(params) { if (colamd_debug >= 4) PRINTF (params) ; }
+
+#ifdef MATLAB_MEX_FILE
+#define ASSERT(expression) (mxAssert ((expression), ""))
+#else
+#define ASSERT(expression) (assert (expression))
+#endif /* MATLAB_MEX_FILE */
+
+PRIVATE void colamd_get_debug	/* gets the debug print level from getenv */
+(
+    char *method
+) ;
+
+PRIVATE void debug_deg_lists
+(
+    Int n_row,
+    Int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    Int head [],
+    Int min_score,
+    Int should,
+    Int max_deg
+) ;
+
+PRIVATE void debug_mark
+(
+    Int n_row,
+    Colamd_Row Row [],
+    Int tag_mark,
+    Int max_mark
+) ;
+
+PRIVATE void debug_matrix
+(
+    Int n_row,
+    Int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    Int A []
+) ;
+
+PRIVATE void debug_structures
+(
+    Int n_row,
+    Int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    Int A [],
+    Int n_col2
+) ;
+
+#else /* NDEBUG */
+
+/* === No debugging ========================================================= */
+
+#define DEBUG0(params) ;
+#define DEBUG1(params) ;
+#define DEBUG2(params) ;
+#define DEBUG3(params) ;
+#define DEBUG4(params) ;
+
+#define ASSERT(expression)
+
+#endif /* NDEBUG */
+
+/* ========================================================================== */
+/* === USER-CALLABLE ROUTINES: ============================================== */
+/* ========================================================================== */
+
+/* ========================================================================== */
+/* === colamd_recommended =================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd_recommended routine returns the suggested size for Alen.  This
+    value has been determined to provide good balance between the number of
+    garbage collections and the memory requirements for colamd.  If any
+    argument is negative, or if integer overflow occurs, a 0 is returned as an
+    error condition.  2*nnz space is required for the row and column
+    indices of the matrix. COLAMD_C (n_col) + COLAMD_R (n_row) space is
+    required for the Col and Row arrays, respectively, which are internal to
+    colamd (roughly 6*n_col + 4*n_row).  An additional n_col space is the
+    minimal amount of "elbow room", and nnz/5 more space is recommended for
+    run time efficiency.
+
+    Alen is approximately 2.2*nnz + 7*n_col + 4*n_row + 10.
+
+    This function is not needed when using symamd.
+*/
+
+/* add two values of type size_t, and check for integer overflow */
+static size_t t_add (size_t a, size_t b, int *ok)
+{
+    (*ok) = (*ok) && ((a + b) >= MAX (a,b)) ;
+    return ((*ok) ? (a + b) : 0) ;
+}
+
+/* compute a*k where k is a small integer, and check for integer overflow */
+static size_t t_mult (size_t a, size_t k, int *ok)
+{
+    size_t i, s = 0 ;
+    for (i = 0 ; i < k ; i++)
+    {
+	s = t_add (s, a, ok) ;
+    }
+    return (s) ;
+}
+
+/* size of the Col and Row structures */
+#define COLAMD_C(n_col,ok) \
+    ((t_mult (t_add (n_col, 1, ok), sizeof (Colamd_Col), ok) / sizeof (Int)))
+
+#define COLAMD_R(n_row,ok) \
+    ((t_mult (t_add (n_row, 1, ok), sizeof (Colamd_Row), ok) / sizeof (Int)))
+
+
+PUBLIC size_t COLAMD_recommended	/* returns recommended value of Alen. */
+(
+    /* === Parameters ======================================================= */
+
+    Int nnz,			/* number of nonzeros in A */
+    Int n_row,			/* number of rows in A */
+    Int n_col			/* number of columns in A */
+)
+{
+    size_t s, c, r ;
+    int ok = TRUE ;
+    if (nnz < 0 || n_row < 0 || n_col < 0)
+    {
+	return (0) ;
+    }
+    s = t_mult (nnz, 2, &ok) ;	    /* 2*nnz */
+    c = COLAMD_C (n_col, &ok) ;	    /* size of column structures */
+    r = COLAMD_R (n_row, &ok) ;	    /* size of row structures */
+    s = t_add (s, c, &ok) ;
+    s = t_add (s, r, &ok) ;
+    s = t_add (s, n_col, &ok) ;	    /* elbow room */
+    s = t_add (s, nnz/5, &ok) ;	    /* elbow room */
+    ok = ok && (s < Int_MAX) ;
+    return (ok ? s : 0) ;
+}
+
+
+/* ========================================================================== */
+/* === colamd_set_defaults ================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd_set_defaults routine sets the default values of the user-
+    controllable parameters for colamd and symamd:
+
+	Colamd: rows with more than max (16, knobs [0] * sqrt (n_col))
+	entries are removed prior to ordering.  Columns with more than
+	max (16, knobs [1] * sqrt (MIN (n_row,n_col))) entries are removed
+	prior to ordering, and placed last in the output column ordering. 
+
+	Symamd: Rows and columns with more than max (16, knobs [0] * sqrt (n))
+	entries are removed prior to ordering, and placed last in the
+	output ordering.
+
+	knobs [0]	dense row control
+
+	knobs [1]	dense column control
+
+	knobs [2]	if nonzero, do aggresive absorption
+
+	knobs [3..19]	unused, but future versions might use this
+
+*/
+
+PUBLIC void COLAMD_set_defaults
+(
+    /* === Parameters ======================================================= */
+
+    double knobs [COLAMD_KNOBS]		/* knob array */
+)
+{
+    /* === Local variables ================================================== */
+
+    Int i ;
+
+    if (!knobs)
+    {
+	return ;			/* no knobs to initialize */
+    }
+    for (i = 0 ; i < COLAMD_KNOBS ; i++)
+    {
+	knobs [i] = 0 ;
+    }
+    knobs [COLAMD_DENSE_ROW] = 10 ;
+    knobs [COLAMD_DENSE_COL] = 10 ;
+    knobs [COLAMD_AGGRESSIVE] = TRUE ;	/* default: do aggressive absorption*/
+}
+
+
+/* ========================================================================== */
+/* === symamd =============================================================== */
+/* ========================================================================== */
+
+PUBLIC Int SYMAMD_MAIN			/* return TRUE if OK, FALSE otherwise */
+(
+    /* === Parameters ======================================================= */
+
+    Int n,				/* number of rows and columns of A */
+    Int A [],				/* row indices of A */
+    Int p [],				/* column pointers of A */
+    Int perm [],			/* output permutation, size n+1 */
+    double knobs [COLAMD_KNOBS],	/* parameters (uses defaults if NULL) */
+    Int stats [COLAMD_STATS],		/* output statistics and error codes */
+    void * (*allocate) (size_t, size_t),
+    					/* pointer to calloc (ANSI C) or */
+					/* mxCalloc (for MATLAB mexFunction) */
+    void (*release) (void *)
+    					/* pointer to free (ANSI C) or */
+    					/* mxFree (for MATLAB mexFunction) */
+)
+{
+    /* === Local variables ================================================== */
+
+    Int *count ;		/* length of each column of M, and col pointer*/
+    Int *mark ;			/* mark array for finding duplicate entries */
+    Int *M ;			/* row indices of matrix M */
+    size_t Mlen ;		/* length of M */
+    Int n_row ;			/* number of rows in M */
+    Int nnz ;			/* number of entries in A */
+    Int i ;			/* row index of A */
+    Int j ;			/* column index of A */
+    Int k ;			/* row index of M */ 
+    Int mnz ;			/* number of nonzeros in M */
+    Int pp ;			/* index into a column of A */
+    Int last_row ;		/* last row seen in the current column */
+    Int length ;		/* number of nonzeros in a column */
+
+    double cknobs [COLAMD_KNOBS] ;		/* knobs for colamd */
+    double default_knobs [COLAMD_KNOBS] ;	/* default knobs for colamd */
+
+#ifndef NDEBUG
+    colamd_get_debug ("symamd") ;
+#endif /* NDEBUG */
+
+    /* === Check the input arguments ======================================== */
+
+    if (!stats)
+    {
+	DEBUG0 (("symamd: stats not present\n")) ;
+	return (FALSE) ;
+    }
+    for (i = 0 ; i < COLAMD_STATS ; i++)
+    {
+	stats [i] = 0 ;
+    }
+    stats [COLAMD_STATUS] = COLAMD_OK ;
+    stats [COLAMD_INFO1] = -1 ;
+    stats [COLAMD_INFO2] = -1 ;
+
+    if (!A)
+    {
+    	stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
+	DEBUG0 (("symamd: A not present\n")) ;
+	return (FALSE) ;
+    }
+
+    if (!p)		/* p is not present */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
+	DEBUG0 (("symamd: p not present\n")) ;
+    	return (FALSE) ;
+    }
+
+    if (n < 0)		/* n must be >= 0 */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
+	stats [COLAMD_INFO1] = n ;
+	DEBUG0 (("symamd: n negative %d\n", n)) ;
+    	return (FALSE) ;
+    }
+
+    nnz = p [n] ;
+    if (nnz < 0)	/* nnz must be >= 0 */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
+	stats [COLAMD_INFO1] = nnz ;
+	DEBUG0 (("symamd: number of entries negative %d\n", nnz)) ;
+	return (FALSE) ;
+    }
+
+    if (p [0] != 0)
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ;
+	stats [COLAMD_INFO1] = p [0] ;
+	DEBUG0 (("symamd: p[0] not zero %d\n", p [0])) ;
+	return (FALSE) ;
+    }
+
+    /* === If no knobs, set default knobs =================================== */
+
+    if (!knobs)
+    {
+	COLAMD_set_defaults (default_knobs) ;
+	knobs = default_knobs ;
+    }
+
+    /* === Allocate count and mark ========================================== */
+
+    count = (Int *) ((*allocate) (n+1, sizeof (Int))) ;
+    if (!count)
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
+	DEBUG0 (("symamd: allocate count (size %d) failed\n", n+1)) ;
+	return (FALSE) ;
+    }
+
+    mark = (Int *) ((*allocate) (n+1, sizeof (Int))) ;
+    if (!mark)
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
+	(*release) ((void *) count) ;
+	DEBUG0 (("symamd: allocate mark (size %d) failed\n", n+1)) ;
+	return (FALSE) ;
+    }
+
+    /* === Compute column counts of M, check if A is valid ================== */
+
+    stats [COLAMD_INFO3] = 0 ;  /* number of duplicate or unsorted row indices*/
+
+    for (i = 0 ; i < n ; i++)
+    {
+    	mark [i] = -1 ;
+    }
+
+    for (j = 0 ; j < n ; j++)
+    {
+	last_row = -1 ;
+
+	length = p [j+1] - p [j] ;
+	if (length < 0)
+	{
+	    /* column pointers must be non-decreasing */
+	    stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
+	    stats [COLAMD_INFO1] = j ;
+	    stats [COLAMD_INFO2] = length ;
+	    (*release) ((void *) count) ;
+	    (*release) ((void *) mark) ;
+	    DEBUG0 (("symamd: col %d negative length %d\n", j, length)) ;
+	    return (FALSE) ;
+	}
+
+	for (pp = p [j] ; pp < p [j+1] ; pp++)
+	{
+	    i = A [pp] ;
+	    if (i < 0 || i >= n)
+	    {
+		/* row index i, in column j, is out of bounds */
+		stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
+		stats [COLAMD_INFO1] = j ;
+		stats [COLAMD_INFO2] = i ;
+		stats [COLAMD_INFO3] = n ;
+		(*release) ((void *) count) ;
+		(*release) ((void *) mark) ;
+		DEBUG0 (("symamd: row %d col %d out of bounds\n", i, j)) ;
+		return (FALSE) ;
+	    }
+
+	    if (i <= last_row || mark [i] == j)
+	    {
+		/* row index is unsorted or repeated (or both), thus col */
+		/* is jumbled.  This is a notice, not an error condition. */
+		stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
+		stats [COLAMD_INFO1] = j ;
+		stats [COLAMD_INFO2] = i ;
+		(stats [COLAMD_INFO3]) ++ ;
+		DEBUG1 (("symamd: row %d col %d unsorted/duplicate\n", i, j)) ;
+	    }
+
+	    if (i > j && mark [i] != j)
+	    {
+		/* row k of M will contain column indices i and j */
+		count [i]++ ;
+		count [j]++ ;
+	    }
+
+	    /* mark the row as having been seen in this column */
+	    mark [i] = j ;
+
+	    last_row = i ;
+	}
+    }
+
+    /* v2.4: removed free(mark) */
+
+    /* === Compute column pointers of M ===================================== */
+
+    /* use output permutation, perm, for column pointers of M */
+    perm [0] = 0 ;
+    for (j = 1 ; j <= n ; j++)
+    {
+	perm [j] = perm [j-1] + count [j-1] ;
+    }
+    for (j = 0 ; j < n ; j++)
+    {
+	count [j] = perm [j] ;
+    }
+
+    /* === Construct M ====================================================== */
+
+    mnz = perm [n] ;
+    n_row = mnz / 2 ;
+    Mlen = COLAMD_recommended (mnz, n_row, n) ;
+    M = (Int *) ((*allocate) (Mlen, sizeof (Int))) ;
+    DEBUG0 (("symamd: M is %d-by-%d with %d entries, Mlen = %g\n",
+    	n_row, n, mnz, (double) Mlen)) ;
+
+    if (!M)
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
+	(*release) ((void *) count) ;
+	(*release) ((void *) mark) ;
+	DEBUG0 (("symamd: allocate M (size %g) failed\n", (double) Mlen)) ;
+	return (FALSE) ;
+    }
+
+    k = 0 ;
+
+    if (stats [COLAMD_STATUS] == COLAMD_OK)
+    {
+	/* Matrix is OK */
+	for (j = 0 ; j < n ; j++)
+	{
+	    ASSERT (p [j+1] - p [j] >= 0) ;
+	    for (pp = p [j] ; pp < p [j+1] ; pp++)
+	    {
+		i = A [pp] ;
+		ASSERT (i >= 0 && i < n) ;
+		if (i > j)
+		{
+		    /* row k of M contains column indices i and j */
+		    M [count [i]++] = k ;
+		    M [count [j]++] = k ;
+		    k++ ;
+		}
+	    }
+	}
+    }
+    else
+    {
+	/* Matrix is jumbled.  Do not add duplicates to M.  Unsorted cols OK. */
+	DEBUG0 (("symamd: Duplicates in A.\n")) ;
+	for (i = 0 ; i < n ; i++)
+	{
+	    mark [i] = -1 ;
+	}
+	for (j = 0 ; j < n ; j++)
+	{
+	    ASSERT (p [j+1] - p [j] >= 0) ;
+	    for (pp = p [j] ; pp < p [j+1] ; pp++)
+	    {
+		i = A [pp] ;
+		ASSERT (i >= 0 && i < n) ;
+		if (i > j && mark [i] != j)
+		{
+		    /* row k of M contains column indices i and j */
+		    M [count [i]++] = k ;
+		    M [count [j]++] = k ;
+		    k++ ;
+		    mark [i] = j ;
+		}
+	    }
+	}
+	/* v2.4: free(mark) moved below */
+    }
+
+    /* count and mark no longer needed */
+    (*release) ((void *) count) ;
+    (*release) ((void *) mark) ;	/* v2.4: free (mark) moved here */
+    ASSERT (k == n_row) ;
+
+    /* === Adjust the knobs for M =========================================== */
+
+    for (i = 0 ; i < COLAMD_KNOBS ; i++)
+    {
+	cknobs [i] = knobs [i] ;
+    }
+
+    /* there are no dense rows in M */
+    cknobs [COLAMD_DENSE_ROW] = -1 ;
+    cknobs [COLAMD_DENSE_COL] = knobs [COLAMD_DENSE_ROW] ;
+
+    /* === Order the columns of M =========================================== */
+
+    /* v2.4: colamd cannot fail here, so the error check is removed */
+    (void) COLAMD_MAIN (n_row, n, (Int) Mlen, M, perm, cknobs, stats) ;
+
+    /* Note that the output permutation is now in perm */
+
+    /* === get the statistics for symamd from colamd ======================== */
+
+    /* a dense column in colamd means a dense row and col in symamd */
+    stats [COLAMD_DENSE_ROW] = stats [COLAMD_DENSE_COL] ;
+
+    /* === Free M =========================================================== */
+
+    (*release) ((void *) M) ;
+    DEBUG0 (("symamd: done.\n")) ;
+    return (TRUE) ;
+
+}
+
+/* ========================================================================== */
+/* === colamd =============================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd routine computes a column ordering Q of a sparse matrix
+    A such that the LU factorization P(AQ) = LU remains sparse, where P is
+    selected via partial pivoting.   The routine can also be viewed as
+    providing a permutation Q such that the Cholesky factorization
+    (AQ)'(AQ) = LL' remains sparse.
+*/
+
+PUBLIC Int COLAMD_MAIN		/* returns TRUE if successful, FALSE otherwise*/
+(
+    /* === Parameters ======================================================= */
+
+    Int n_row,			/* number of rows in A */
+    Int n_col,			/* number of columns in A */
+    Int Alen,			/* length of A */
+    Int A [],			/* row indices of A */
+    Int p [],			/* pointers to columns in A */
+    double knobs [COLAMD_KNOBS],/* parameters (uses defaults if NULL) */
+    Int stats [COLAMD_STATS]	/* output statistics and error codes */
+)
+{
+    /* === Local variables ================================================== */
+
+    Int i ;			/* loop index */
+    Int nnz ;			/* nonzeros in A */
+    size_t Row_size ;		/* size of Row [], in integers */
+    size_t Col_size ;		/* size of Col [], in integers */
+    size_t need ;		/* minimum required length of A */
+    Colamd_Row *Row ;		/* pointer into A of Row [0..n_row] array */
+    Colamd_Col *Col ;		/* pointer into A of Col [0..n_col] array */
+    Int n_col2 ;		/* number of non-dense, non-empty columns */
+    Int n_row2 ;		/* number of non-dense, non-empty rows */
+    Int ngarbage ;		/* number of garbage collections performed */
+    Int max_deg ;		/* maximum row degree */
+    double default_knobs [COLAMD_KNOBS] ;	/* default knobs array */
+    Int aggressive ;		/* do aggressive absorption */
+    int ok ;
+
+#ifndef NDEBUG
+    colamd_get_debug ("colamd") ;
+#endif /* NDEBUG */
+
+    /* === Check the input arguments ======================================== */
+
+    if (!stats)
+    {
+	DEBUG0 (("colamd: stats not present\n")) ;
+	return (FALSE) ;
+    }
+    for (i = 0 ; i < COLAMD_STATS ; i++)
+    {
+	stats [i] = 0 ;
+    }
+    stats [COLAMD_STATUS] = COLAMD_OK ;
+    stats [COLAMD_INFO1] = -1 ;
+    stats [COLAMD_INFO2] = -1 ;
+
+    if (!A)		/* A is not present */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
+	DEBUG0 (("colamd: A not present\n")) ;
+	return (FALSE) ;
+    }
+
+    if (!p)		/* p is not present */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
+	DEBUG0 (("colamd: p not present\n")) ;
+    	return (FALSE) ;
+    }
+
+    if (n_row < 0)	/* n_row must be >= 0 */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ;
+	stats [COLAMD_INFO1] = n_row ;
+	DEBUG0 (("colamd: nrow negative %d\n", n_row)) ;
+    	return (FALSE) ;
+    }
+
+    if (n_col < 0)	/* n_col must be >= 0 */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
+	stats [COLAMD_INFO1] = n_col ;
+	DEBUG0 (("colamd: ncol negative %d\n", n_col)) ;
+    	return (FALSE) ;
+    }
+
+    nnz = p [n_col] ;
+    if (nnz < 0)	/* nnz must be >= 0 */
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
+	stats [COLAMD_INFO1] = nnz ;
+	DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ;
+	return (FALSE) ;
+    }
+
+    if (p [0] != 0)
+    {
+	stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero	;
+	stats [COLAMD_INFO1] = p [0] ;
+	DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ;
+	return (FALSE) ;
+    }
+
+    /* === If no knobs, set default knobs =================================== */
+
+    if (!knobs)
+    {
+	COLAMD_set_defaults (default_knobs) ;
+	knobs = default_knobs ;
+    }
+
+    aggressive = (knobs [COLAMD_AGGRESSIVE] != FALSE) ;
+
+    /* === Allocate the Row and Col arrays from array A ===================== */
+
+    ok = TRUE ;
+    Col_size = COLAMD_C (n_col, &ok) ;	    /* size of Col array of structs */
+    Row_size = COLAMD_R (n_row, &ok) ;	    /* size of Row array of structs */
+
+    /* need = 2*nnz + n_col + Col_size + Row_size ; */
+    need = t_mult (nnz, 2, &ok) ;
+    need = t_add (need, n_col, &ok) ;
+    need = t_add (need, Col_size, &ok) ;
+    need = t_add (need, Row_size, &ok) ;
+
+    if (!ok || need > (size_t) Alen || need > Int_MAX)
+    {
+	/* not enough space in array A to perform the ordering */
+	stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ;
+	stats [COLAMD_INFO1] = need ;
+	stats [COLAMD_INFO2] = Alen ;
+	DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen));
+	return (FALSE) ;
+    }
+
+    Alen -= Col_size + Row_size ;
+    Col = (Colamd_Col *) &A [Alen] ;
+    Row = (Colamd_Row *) &A [Alen + Col_size] ;
+
+    /* === Construct the row and column data structures ===================== */
+
+    if (!init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
+    {
+	/* input matrix is invalid */
+	DEBUG0 (("colamd: Matrix invalid\n")) ;
+	return (FALSE) ;
+    }
+
+    /* === Initialize scores, kill dense rows/columns ======================= */
+
+    init_scoring (n_row, n_col, Row, Col, A, p, knobs,
+	&n_row2, &n_col2, &max_deg) ;
+
+    /* === Order the supercolumns =========================================== */
+
+    ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p,
+	n_col2, max_deg, 2*nnz, aggressive) ;
+
+    /* === Order the non-principal columns ================================== */
+
+    order_children (n_col, Col, p) ;
+
+    /* === Return statistics in stats ======================================= */
+
+    stats [COLAMD_DENSE_ROW] = n_row - n_row2 ;
+    stats [COLAMD_DENSE_COL] = n_col - n_col2 ;
+    stats [COLAMD_DEFRAG_COUNT] = ngarbage ;
+    DEBUG0 (("colamd: done.\n")) ; 
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === colamd_report ======================================================== */
+/* ========================================================================== */
+
+PUBLIC void COLAMD_report
+(
+    Int stats [COLAMD_STATS]
+)
+{
+    print_report ("colamd", stats) ;
+}
+
+
+/* ========================================================================== */
+/* === symamd_report ======================================================== */
+/* ========================================================================== */
+
+PUBLIC void SYMAMD_report
+(
+    Int stats [COLAMD_STATS]
+)
+{
+    print_report ("symamd", stats) ;
+}
+
+
+
+/* ========================================================================== */
+/* === NON-USER-CALLABLE ROUTINES: ========================================== */
+/* ========================================================================== */
+
+/* There are no user-callable routines beyond this point in the file */
+
+
+/* ========================================================================== */
+/* === init_rows_cols ======================================================= */
+/* ========================================================================== */
+
+/*
+    Takes the column form of the matrix in A and creates the row form of the
+    matrix.  Also, row and column attributes are stored in the Col and Row
+    structs.  If the columns are un-sorted or contain duplicate row indices,
+    this routine will also sort and remove duplicate row indices from the
+    column form of the matrix.  Returns FALSE if the matrix is invalid,
+    TRUE otherwise.  Not user-callable.
+*/
+
+PRIVATE Int init_rows_cols	/* returns TRUE if OK, or FALSE otherwise */
+(
+    /* === Parameters ======================================================= */
+
+    Int n_row,			/* number of rows of A */
+    Int n_col,			/* number of columns of A */
+    Colamd_Row Row [],		/* of size n_row+1 */
+    Colamd_Col Col [],		/* of size n_col+1 */
+    Int A [],			/* row indices of A, of size Alen */
+    Int p [],			/* pointers to columns in A, of size n_col+1 */
+    Int stats [COLAMD_STATS]	/* colamd statistics */ 
+)
+{
+    /* === Local variables ================================================== */
+
+    Int col ;			/* a column index */
+    Int row ;			/* a row index */
+    Int *cp ;			/* a column pointer */
+    Int *cp_end ;		/* a pointer to the end of a column */
+    Int *rp ;			/* a row pointer */
+    Int *rp_end ;		/* a pointer to the end of a row */
+    Int last_row ;		/* previous row */
+
+    /* === Initialize columns, and check column pointers ==================== */
+
+    for (col = 0 ; col < n_col ; col++)
+    {
+	Col [col].start = p [col] ;
+	Col [col].length = p [col+1] - p [col] ;
+
+	if (Col [col].length < 0)
+	{
+	    /* column pointers must be non-decreasing */
+	    stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
+	    stats [COLAMD_INFO1] = col ;
+	    stats [COLAMD_INFO2] = Col [col].length ;
+	    DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ;
+	    return (FALSE) ;
+	}
+
+	Col [col].shared1.thickness = 1 ;
+	Col [col].shared2.score = 0 ;
+	Col [col].shared3.prev = EMPTY ;
+	Col [col].shared4.degree_next = EMPTY ;
+    }
+
+    /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
+
+    /* === Scan columns, compute row degrees, and check row indices ========= */
+
+    stats [COLAMD_INFO3] = 0 ;	/* number of duplicate or unsorted row indices*/
+
+    for (row = 0 ; row < n_row ; row++)
+    {
+	Row [row].length = 0 ;
+	Row [row].shared2.mark = -1 ;
+    }
+
+    for (col = 0 ; col < n_col ; col++)
+    {
+	last_row = -1 ;
+
+	cp = &A [p [col]] ;
+	cp_end = &A [p [col+1]] ;
+
+	while (cp < cp_end)
+	{
+	    row = *cp++ ;
+
+	    /* make sure row indices within range */
+	    if (row < 0 || row >= n_row)
+	    {
+		stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
+		stats [COLAMD_INFO1] = col ;
+		stats [COLAMD_INFO2] = row ;
+		stats [COLAMD_INFO3] = n_row ;
+		DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ;
+		return (FALSE) ;
+	    }
+
+	    if (row <= last_row || Row [row].shared2.mark == col)
+	    {
+		/* row index are unsorted or repeated (or both), thus col */
+		/* is jumbled.  This is a notice, not an error condition. */
+		stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
+		stats [COLAMD_INFO1] = col ;
+		stats [COLAMD_INFO2] = row ;
+		(stats [COLAMD_INFO3]) ++ ;
+		DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col));
+	    }
+
+	    if (Row [row].shared2.mark != col)
+	    {
+		Row [row].length++ ;
+	    }
+	    else
+	    {
+		/* this is a repeated entry in the column, */
+		/* it will be removed */
+		Col [col].length-- ;
+	    }
+
+	    /* mark the row as having been seen in this column */
+	    Row [row].shared2.mark = col ;
+
+	    last_row = row ;
+	}
+    }
+
+    /* === Compute row pointers ============================================= */
+
+    /* row form of the matrix starts directly after the column */
+    /* form of matrix in A */
+    Row [0].start = p [n_col] ;
+    Row [0].shared1.p = Row [0].start ;
+    Row [0].shared2.mark = -1 ;
+    for (row = 1 ; row < n_row ; row++)
+    {
+	Row [row].start = Row [row-1].start + Row [row-1].length ;
+	Row [row].shared1.p = Row [row].start ;
+	Row [row].shared2.mark = -1 ;
+    }
+
+    /* === Create row form ================================================== */
+
+    if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
+    {
+	/* if cols jumbled, watch for repeated row indices */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    cp = &A [p [col]] ;
+	    cp_end = &A [p [col+1]] ;
+	    while (cp < cp_end)
+	    {
+		row = *cp++ ;
+		if (Row [row].shared2.mark != col)
+		{
+		    A [(Row [row].shared1.p)++] = col ;
+		    Row [row].shared2.mark = col ;
+		}
+	    }
+	}
+    }
+    else
+    {
+	/* if cols not jumbled, we don't need the mark (this is faster) */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    cp = &A [p [col]] ;
+	    cp_end = &A [p [col+1]] ;
+	    while (cp < cp_end)
+	    {
+		A [(Row [*cp++].shared1.p)++] = col ;
+	    }
+	}
+    }
+
+    /* === Clear the row marks and set row degrees ========================== */
+
+    for (row = 0 ; row < n_row ; row++)
+    {
+	Row [row].shared2.mark = 0 ;
+	Row [row].shared1.degree = Row [row].length ;
+    }
+
+    /* === See if we need to re-create columns ============================== */
+
+    if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
+    {
+    	DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ;
+
+#ifndef NDEBUG
+	/* make sure column lengths are correct */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    p [col] = Col [col].length ;
+	}
+	for (row = 0 ; row < n_row ; row++)
+	{
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		p [*rp++]-- ;
+	    }
+	}
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    ASSERT (p [col] == 0) ;
+	}
+	/* now p is all zero (different than when debugging is turned off) */
+#endif /* NDEBUG */
+
+	/* === Compute col pointers ========================================= */
+
+	/* col form of the matrix starts at A [0]. */
+	/* Note, we may have a gap between the col form and the row */
+	/* form if there were duplicate entries, if so, it will be */
+	/* removed upon the first garbage collection */
+	Col [0].start = 0 ;
+	p [0] = Col [0].start ;
+	for (col = 1 ; col < n_col ; col++)
+	{
+	    /* note that the lengths here are for pruned columns, i.e. */
+	    /* no duplicate row indices will exist for these columns */
+	    Col [col].start = Col [col-1].start + Col [col-1].length ;
+	    p [col] = Col [col].start ;
+	}
+
+	/* === Re-create col form =========================================== */
+
+	for (row = 0 ; row < n_row ; row++)
+	{
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		A [(p [*rp++])++] = row ;
+	    }
+	}
+    }
+
+    /* === Done.  Matrix is not (or no longer) jumbled ====================== */
+
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === init_scoring ========================================================= */
+/* ========================================================================== */
+
+/*
+    Kills dense or empty columns and rows, calculates an initial score for
+    each column, and places all columns in the degree lists.  Not user-callable.
+*/
+
+PRIVATE void init_scoring
+(
+    /* === Parameters ======================================================= */
+
+    Int n_row,			/* number of rows of A */
+    Int n_col,			/* number of columns of A */
+    Colamd_Row Row [],		/* of size n_row+1 */
+    Colamd_Col Col [],		/* of size n_col+1 */
+    Int A [],			/* column form and row form of A */
+    Int head [],		/* of size n_col+1 */
+    double knobs [COLAMD_KNOBS],/* parameters */
+    Int *p_n_row2,		/* number of non-dense, non-empty rows */
+    Int *p_n_col2,		/* number of non-dense, non-empty columns */
+    Int *p_max_deg		/* maximum row degree */
+)
+{
+    /* === Local variables ================================================== */
+
+    Int c ;			/* a column index */
+    Int r, row ;		/* a row index */
+    Int *cp ;			/* a column pointer */
+    Int deg ;			/* degree of a row or column */
+    Int *cp_end ;		/* a pointer to the end of a column */
+    Int *new_cp ;		/* new column pointer */
+    Int col_length ;		/* length of pruned column */
+    Int score ;			/* current column score */
+    Int n_col2 ;		/* number of non-dense, non-empty columns */
+    Int n_row2 ;		/* number of non-dense, non-empty rows */
+    Int dense_row_count ;	/* remove rows with more entries than this */
+    Int dense_col_count ;	/* remove cols with more entries than this */
+    Int min_score ;		/* smallest column score */
+    Int max_deg ;		/* maximum row degree */
+    Int next_col ;		/* Used to add to degree list.*/
+
+#ifndef NDEBUG
+    Int debug_count ;		/* debug only. */
+#endif /* NDEBUG */
+
+    /* === Extract knobs ==================================================== */
+
+    /* Note: if knobs contains a NaN, this is undefined: */
+    if (knobs [COLAMD_DENSE_ROW] < 0)
+    {
+	/* only remove completely dense rows */
+	dense_row_count = n_col-1 ;
+    }
+    else
+    {
+	dense_row_count = DENSE_DEGREE (knobs [COLAMD_DENSE_ROW], n_col) ;
+    }
+    if (knobs [COLAMD_DENSE_COL] < 0)
+    {
+	/* only remove completely dense columns */
+	dense_col_count = n_row-1 ;
+    }
+    else
+    {
+	dense_col_count =
+	    DENSE_DEGREE (knobs [COLAMD_DENSE_COL], MIN (n_row, n_col)) ;
+    }
+
+    DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
+    max_deg = 0 ;
+    n_col2 = n_col ;
+    n_row2 = n_row ;
+
+    /* === Kill empty columns =============================================== */
+
+    /* Put the empty columns at the end in their natural order, so that LU */
+    /* factorization can proceed as far as possible. */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	deg = Col [c].length ;
+	if (deg == 0)
+	{
+	    /* this is a empty column, kill and order it last */
+	    Col [c].shared2.order = --n_col2 ;
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+    }
+    DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ;
+
+    /* === Kill dense columns =============================================== */
+
+    /* Put the dense columns at the end, in their natural order */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* skip any dead columns */
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	deg = Col [c].length ;
+	if (deg > dense_col_count)
+	{
+	    /* this is a dense column, kill and order it last */
+	    Col [c].shared2.order = --n_col2 ;
+	    /* decrement the row degrees */
+	    cp = &A [Col [c].start] ;
+	    cp_end = cp + Col [c].length ;
+	    while (cp < cp_end)
+	    {
+		Row [*cp++].shared1.degree-- ;
+	    }
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+    }
+    DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ;
+
+    /* === Kill dense and empty rows ======================================== */
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	deg = Row [r].shared1.degree ;
+	ASSERT (deg >= 0 && deg <= n_col) ;
+	if (deg > dense_row_count || deg == 0)
+	{
+	    /* kill a dense or empty row */
+	    KILL_ROW (r) ;
+	    --n_row2 ;
+	}
+	else
+	{
+	    /* keep track of max degree of remaining rows */
+	    max_deg = MAX (max_deg, deg) ;
+	}
+    }
+    DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ;
+
+    /* === Compute initial column scores ==================================== */
+
+    /* At this point the row degrees are accurate.  They reflect the number */
+    /* of "live" (non-dense) columns in each row.  No empty rows exist. */
+    /* Some "live" columns may contain only dead rows, however.  These are */
+    /* pruned in the code below. */
+
+    /* now find the initial matlab score for each column */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* skip dead column */
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	score = 0 ;
+	cp = &A [Col [c].start] ;
+	new_cp = cp ;
+	cp_end = cp + Col [c].length ;
+	while (cp < cp_end)
+	{
+	    /* get a row */
+	    row = *cp++ ;
+	    /* skip if dead */
+	    if (ROW_IS_DEAD (row))
+	    {
+		continue ;
+	    }
+	    /* compact the column */
+	    *new_cp++ = row ;
+	    /* add row's external degree */
+	    score += Row [row].shared1.degree - 1 ;
+	    /* guard against integer overflow */
+	    score = MIN (score, n_col) ;
+	}
+	/* determine pruned column length */
+	col_length = (Int) (new_cp - &A [Col [c].start]) ;
+	if (col_length == 0)
+	{
+	    /* a newly-made null column (all rows in this col are "dense" */
+	    /* and have already been killed) */
+	    DEBUG2 (("Newly null killed: %d\n", c)) ;
+	    Col [c].shared2.order = --n_col2 ;
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+	else
+	{
+	    /* set column length and set score */
+	    ASSERT (score >= 0) ;
+	    ASSERT (score <= n_col) ;
+	    Col [c].length = col_length ;
+	    Col [c].shared2.score = score ;
+	}
+    }
+    DEBUG1 (("colamd: Dense, null, and newly-null columns killed: %d\n",
+    	n_col-n_col2)) ;
+
+    /* At this point, all empty rows and columns are dead.  All live columns */
+    /* are "clean" (containing no dead rows) and simplicial (no supercolumns */
+    /* yet).  Rows may contain dead columns, but all live rows contain at */
+    /* least one live column. */
+
+#ifndef NDEBUG
+    debug_structures (n_row, n_col, Row, Col, A, n_col2) ;
+#endif /* NDEBUG */
+
+    /* === Initialize degree lists ========================================== */
+
+#ifndef NDEBUG
+    debug_count = 0 ;
+#endif /* NDEBUG */
+
+    /* clear the hash buckets */
+    for (c = 0 ; c <= n_col ; c++)
+    {
+	head [c] = EMPTY ;
+    }
+    min_score = n_col ;
+    /* place in reverse order, so low column indices are at the front */
+    /* of the lists.  This is to encourage natural tie-breaking */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* only add principal columns to degree lists */
+	if (COL_IS_ALIVE (c))
+	{
+	    DEBUG4 (("place %d score %d minscore %d ncol %d\n",
+		c, Col [c].shared2.score, min_score, n_col)) ;
+
+	    /* === Add columns score to DList =============================== */
+
+	    score = Col [c].shared2.score ;
+
+	    ASSERT (min_score >= 0) ;
+	    ASSERT (min_score <= n_col) ;
+	    ASSERT (score >= 0) ;
+	    ASSERT (score <= n_col) ;
+	    ASSERT (head [score] >= EMPTY) ;
+
+	    /* now add this column to dList at proper score location */
+	    next_col = head [score] ;
+	    Col [c].shared3.prev = EMPTY ;
+	    Col [c].shared4.degree_next = next_col ;
+
+	    /* if there already was a column with the same score, set its */
+	    /* previous pointer to this new column */
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = c ;
+	    }
+	    head [score] = c ;
+
+	    /* see if this score is less than current min */
+	    min_score = MIN (min_score, score) ;
+
+#ifndef NDEBUG
+	    debug_count++ ;
+#endif /* NDEBUG */
+
+	}
+    }
+
+#ifndef NDEBUG
+    DEBUG1 (("colamd: Live cols %d out of %d, non-princ: %d\n",
+	debug_count, n_col, n_col-debug_count)) ;
+    ASSERT (debug_count == n_col2) ;
+    debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ;
+#endif /* NDEBUG */
+
+    /* === Return number of remaining columns, and max row degree =========== */
+
+    *p_n_col2 = n_col2 ;
+    *p_n_row2 = n_row2 ;
+    *p_max_deg = max_deg ;
+}
+
+
+/* ========================================================================== */
+/* === find_ordering ======================================================== */
+/* ========================================================================== */
+
+/*
+    Order the principal columns of the supercolumn form of the matrix
+    (no supercolumns on input).  Uses a minimum approximate column minimum
+    degree ordering method.  Not user-callable.
+*/
+
+PRIVATE Int find_ordering	/* return the number of garbage collections */
+(
+    /* === Parameters ======================================================= */
+
+    Int n_row,			/* number of rows of A */
+    Int n_col,			/* number of columns of A */
+    Int Alen,			/* size of A, 2*nnz + n_col or larger */
+    Colamd_Row Row [],		/* of size n_row+1 */
+    Colamd_Col Col [],		/* of size n_col+1 */
+    Int A [],			/* column form and row form of A */
+    Int head [],		/* of size n_col+1 */
+    Int n_col2,			/* Remaining columns to order */
+    Int max_deg,		/* Maximum row degree */
+    Int pfree,			/* index of first free slot (2*nnz on entry) */
+    Int aggressive
+)
+{
+    /* === Local variables ================================================== */
+
+    Int k ;			/* current pivot ordering step */
+    Int pivot_col ;		/* current pivot column */
+    Int *cp ;			/* a column pointer */
+    Int *rp ;			/* a row pointer */
+    Int pivot_row ;		/* current pivot row */
+    Int *new_cp ;		/* modified column pointer */
+    Int *new_rp ;		/* modified row pointer */
+    Int pivot_row_start ;	/* pointer to start of pivot row */
+    Int pivot_row_degree ;	/* number of columns in pivot row */
+    Int pivot_row_length ;	/* number of supercolumns in pivot row */
+    Int pivot_col_score ;	/* score of pivot column */
+    Int needed_memory ;		/* free space needed for pivot row */
+    Int *cp_end ;		/* pointer to the end of a column */
+    Int *rp_end ;		/* pointer to the end of a row */
+    Int row ;			/* a row index */
+    Int col ;			/* a column index */
+    Int max_score ;		/* maximum possible score */
+    Int cur_score ;		/* score of current column */
+    unsigned Int hash ;		/* hash value for supernode detection */
+    Int head_column ;		/* head of hash bucket */
+    Int first_col ;		/* first column in hash bucket */
+    Int tag_mark ;		/* marker value for mark array */
+    Int row_mark ;		/* Row [row].shared2.mark */
+    Int set_difference ;	/* set difference size of row with pivot row */
+    Int min_score ;		/* smallest column score */
+    Int col_thickness ;		/* "thickness" (no. of columns in a supercol) */
+    Int max_mark ;		/* maximum value of tag_mark */
+    Int pivot_col_thickness ;	/* number of columns represented by pivot col */
+    Int prev_col ;		/* Used by Dlist operations. */
+    Int next_col ;		/* Used by Dlist operations. */
+    Int ngarbage ;		/* number of garbage collections performed */
+
+#ifndef NDEBUG
+    Int debug_d ;		/* debug loop counter */
+    Int debug_step = 0 ;	/* debug loop counter */
+#endif /* NDEBUG */
+
+    /* === Initialization and clear mark ==================================== */
+
+    max_mark = INT_MAX - n_col ;	/* INT_MAX defined in <limits.h> */
+    tag_mark = clear_mark (0, max_mark, n_row, Row) ;
+    min_score = 0 ;
+    ngarbage = 0 ;
+    DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ;
+
+    /* === Order the columns ================================================ */
+
+    for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */)
+    {
+
+#ifndef NDEBUG
+	if (debug_step % 100 == 0)
+	{
+	    DEBUG2 (("\n...       Step k: %d out of n_col2: %d\n", k, n_col2)) ;
+	}
+	else
+	{
+	    DEBUG3 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ;
+	}
+	debug_step++ ;
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k, max_deg) ;
+	debug_matrix (n_row, n_col, Row, Col, A) ;
+#endif /* NDEBUG */
+
+	/* === Select pivot column, and order it ============================ */
+
+	/* make sure degree list isn't empty */
+	ASSERT (min_score >= 0) ;
+	ASSERT (min_score <= n_col) ;
+	ASSERT (head [min_score] >= EMPTY) ;
+
+#ifndef NDEBUG
+	for (debug_d = 0 ; debug_d < min_score ; debug_d++)
+	{
+	    ASSERT (head [debug_d] == EMPTY) ;
+	}
+#endif /* NDEBUG */
+
+	/* get pivot column from head of minimum degree list */
+	while (head [min_score] == EMPTY && min_score < n_col)
+	{
+	    min_score++ ;
+	}
+	pivot_col = head [min_score] ;
+	ASSERT (pivot_col >= 0 && pivot_col <= n_col) ;
+	next_col = Col [pivot_col].shared4.degree_next ;
+	head [min_score] = next_col ;
+	if (next_col != EMPTY)
+	{
+	    Col [next_col].shared3.prev = EMPTY ;
+	}
+
+	ASSERT (COL_IS_ALIVE (pivot_col)) ;
+
+	/* remember score for defrag check */
+	pivot_col_score = Col [pivot_col].shared2.score ;
+
+	/* the pivot column is the kth column in the pivot order */
+	Col [pivot_col].shared2.order = k ;
+
+	/* increment order count by column thickness */
+	pivot_col_thickness = Col [pivot_col].shared1.thickness ;
+	k += pivot_col_thickness ;
+	ASSERT (pivot_col_thickness > 0) ;
+	DEBUG3 (("Pivot col: %d thick %d\n", pivot_col, pivot_col_thickness)) ;
+
+	/* === Garbage_collection, if necessary ============================= */
+
+	needed_memory = MIN (pivot_col_score, n_col - k) ;
+	if (pfree + needed_memory >= Alen)
+	{
+	    pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
+	    ngarbage++ ;
+	    /* after garbage collection we will have enough */
+	    ASSERT (pfree + needed_memory < Alen) ;
+	    /* garbage collection has wiped out the Row[].shared2.mark array */
+	    tag_mark = clear_mark (0, max_mark, n_row, Row) ;
+
+#ifndef NDEBUG
+	    debug_matrix (n_row, n_col, Row, Col, A) ;
+#endif /* NDEBUG */
+	}
+
+	/* === Compute pivot row pattern ==================================== */
+
+	/* get starting location for this new merged row */
+	pivot_row_start = pfree ;
+
+	/* initialize new row counts to zero */
+	pivot_row_degree = 0 ;
+
+	/* tag pivot column as having been visited so it isn't included */
+	/* in merged pivot row */
+	Col [pivot_col].shared1.thickness = -pivot_col_thickness ;
+
+	/* pivot row is the union of all rows in the pivot column pattern */
+	cp = &A [Col [pivot_col].start] ;
+	cp_end = cp + Col [pivot_col].length ;
+	while (cp < cp_end)
+	{
+	    /* get a row */
+	    row = *cp++ ;
+	    DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
+	    /* skip if row is dead */
+	    if (ROW_IS_ALIVE (row))
+	    {
+		rp = &A [Row [row].start] ;
+		rp_end = rp + Row [row].length ;
+		while (rp < rp_end)
+		{
+		    /* get a column */
+		    col = *rp++ ;
+		    /* add the column, if alive and untagged */
+		    col_thickness = Col [col].shared1.thickness ;
+		    if (col_thickness > 0 && COL_IS_ALIVE (col))
+		    {
+			/* tag column in pivot row */
+			Col [col].shared1.thickness = -col_thickness ;
+			ASSERT (pfree < Alen) ;
+			/* place column in pivot row */
+			A [pfree++] = col ;
+			pivot_row_degree += col_thickness ;
+		    }
+		}
+	    }
+	}
+
+	/* clear tag on pivot column */
+	Col [pivot_col].shared1.thickness = pivot_col_thickness ;
+	max_deg = MAX (max_deg, pivot_row_degree) ;
+
+#ifndef NDEBUG
+	DEBUG3 (("check2\n")) ;
+	debug_mark (n_row, Row, tag_mark, max_mark) ;
+#endif /* NDEBUG */
+
+	/* === Kill all rows used to construct pivot row ==================== */
+
+	/* also kill pivot row, temporarily */
+	cp = &A [Col [pivot_col].start] ;
+	cp_end = cp + Col [pivot_col].length ;
+	while (cp < cp_end)
+	{
+	    /* may be killing an already dead row */
+	    row = *cp++ ;
+	    DEBUG3 (("Kill row in pivot col: %d\n", row)) ;
+	    KILL_ROW (row) ;
+	}
+
+	/* === Select a row index to use as the new pivot row =============== */
+
+	pivot_row_length = pfree - pivot_row_start ;
+	if (pivot_row_length > 0)
+	{
+	    /* pick the "pivot" row arbitrarily (first row in col) */
+	    pivot_row = A [Col [pivot_col].start] ;
+	    DEBUG3 (("Pivotal row is %d\n", pivot_row)) ;
+	}
+	else
+	{
+	    /* there is no pivot row, since it is of zero length */
+	    pivot_row = EMPTY ;
+	    ASSERT (pivot_row_length == 0) ;
+	}
+	ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
+
+	/* === Approximate degree computation =============================== */
+
+	/* Here begins the computation of the approximate degree.  The column */
+	/* score is the sum of the pivot row "length", plus the size of the */
+	/* set differences of each row in the column minus the pattern of the */
+	/* pivot row itself.  The column ("thickness") itself is also */
+	/* excluded from the column score (we thus use an approximate */
+	/* external degree). */
+
+	/* The time taken by the following code (compute set differences, and */
+	/* add them up) is proportional to the size of the data structure */
+	/* being scanned - that is, the sum of the sizes of each column in */
+	/* the pivot row.  Thus, the amortized time to compute a column score */
+	/* is proportional to the size of that column (where size, in this */
+	/* context, is the column "length", or the number of row indices */
+	/* in that column).  The number of row indices in a column is */
+	/* monotonically non-decreasing, from the length of the original */
+	/* column on input to colamd. */
+
+	/* === Compute set differences ====================================== */
+
+	DEBUG3 (("** Computing set differences phase. **\n")) ;
+
+	/* pivot row is currently dead - it will be revived later. */
+
+	DEBUG3 (("Pivot row: ")) ;
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    col = *rp++ ;
+	    ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+	    DEBUG3 (("Col: %d\n", col)) ;
+
+	    /* clear tags used to construct pivot row pattern */
+	    col_thickness = -Col [col].shared1.thickness ;
+	    ASSERT (col_thickness > 0) ;
+	    Col [col].shared1.thickness = col_thickness ;
+
+	    /* === Remove column from degree list =========================== */
+
+	    cur_score = Col [col].shared2.score ;
+	    prev_col = Col [col].shared3.prev ;
+	    next_col = Col [col].shared4.degree_next ;
+	    ASSERT (cur_score >= 0) ;
+	    ASSERT (cur_score <= n_col) ;
+	    ASSERT (cur_score >= EMPTY) ;
+	    if (prev_col == EMPTY)
+	    {
+		head [cur_score] = next_col ;
+	    }
+	    else
+	    {
+		Col [prev_col].shared4.degree_next = next_col ;
+	    }
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = prev_col ;
+	    }
+
+	    /* === Scan the column ========================================== */
+
+	    cp = &A [Col [col].start] ;
+	    cp_end = cp + Col [col].length ;
+	    while (cp < cp_end)
+	    {
+		/* get a row */
+		row = *cp++ ;
+		row_mark = Row [row].shared2.mark ;
+		/* skip if dead */
+		if (ROW_IS_MARKED_DEAD (row_mark))
+		{
+		    continue ;
+		}
+		ASSERT (row != pivot_row) ;
+		set_difference = row_mark - tag_mark ;
+		/* check if the row has been seen yet */
+		if (set_difference < 0)
+		{
+		    ASSERT (Row [row].shared1.degree <= max_deg) ;
+		    set_difference = Row [row].shared1.degree ;
+		}
+		/* subtract column thickness from this row's set difference */
+		set_difference -= col_thickness ;
+		ASSERT (set_difference >= 0) ;
+		/* absorb this row if the set difference becomes zero */
+		if (set_difference == 0 && aggressive)
+		{
+		    DEBUG3 (("aggressive absorption. Row: %d\n", row)) ;
+		    KILL_ROW (row) ;
+		}
+		else
+		{
+		    /* save the new mark */
+		    Row [row].shared2.mark = set_difference + tag_mark ;
+		}
+	    }
+	}
+
+#ifndef NDEBUG
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k-pivot_row_degree, max_deg) ;
+#endif /* NDEBUG */
+
+	/* === Add up set differences for each column ======================= */
+
+	DEBUG3 (("** Adding set differences phase. **\n")) ;
+
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    /* get a column */
+	    col = *rp++ ;
+	    ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+	    hash = 0 ;
+	    cur_score = 0 ;
+	    cp = &A [Col [col].start] ;
+	    /* compact the column */
+	    new_cp = cp ;
+	    cp_end = cp + Col [col].length ;
+
+	    DEBUG4 (("Adding set diffs for Col: %d.\n", col)) ;
+
+	    while (cp < cp_end)
+	    {
+		/* get a row */
+		row = *cp++ ;
+		ASSERT(row >= 0 && row < n_row) ;
+		row_mark = Row [row].shared2.mark ;
+		/* skip if dead */
+		if (ROW_IS_MARKED_DEAD (row_mark))
+		{
+		    DEBUG4 ((" Row %d, dead\n", row)) ;
+		    continue ;
+		}
+		DEBUG4 ((" Row %d, set diff %d\n", row, row_mark-tag_mark));
+		ASSERT (row_mark >= tag_mark) ;
+		/* compact the column */
+		*new_cp++ = row ;
+		/* compute hash function */
+		hash += row ;
+		/* add set difference */
+		cur_score += row_mark - tag_mark ;
+		/* integer overflow... */
+		cur_score = MIN (cur_score, n_col) ;
+	    }
+
+	    /* recompute the column's length */
+	    Col [col].length = (Int) (new_cp - &A [Col [col].start]) ;
+
+	    /* === Further mass elimination ================================= */
+
+	    if (Col [col].length == 0)
+	    {
+		DEBUG4 (("further mass elimination. Col: %d\n", col)) ;
+		/* nothing left but the pivot row in this column */
+		KILL_PRINCIPAL_COL (col) ;
+		pivot_row_degree -= Col [col].shared1.thickness ;
+		ASSERT (pivot_row_degree >= 0) ;
+		/* order it */
+		Col [col].shared2.order = k ;
+		/* increment order count by column thickness */
+		k += Col [col].shared1.thickness ;
+	    }
+	    else
+	    {
+		/* === Prepare for supercolumn detection ==================== */
+
+		DEBUG4 (("Preparing supercol detection for Col: %d.\n", col)) ;
+
+		/* save score so far */
+		Col [col].shared2.score = cur_score ;
+
+		/* add column to hash table, for supercolumn detection */
+		hash %= n_col + 1 ;
+
+		DEBUG4 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ;
+		ASSERT (((Int) hash) <= n_col) ;
+
+		head_column = head [hash] ;
+		if (head_column > EMPTY)
+		{
+		    /* degree list "hash" is non-empty, use prev (shared3) of */
+		    /* first column in degree list as head of hash bucket */
+		    first_col = Col [head_column].shared3.headhash ;
+		    Col [head_column].shared3.headhash = col ;
+		}
+		else
+		{
+		    /* degree list "hash" is empty, use head as hash bucket */
+		    first_col = - (head_column + 2) ;
+		    head [hash] = - (col + 2) ;
+		}
+		Col [col].shared4.hash_next = first_col ;
+
+		/* save hash function in Col [col].shared3.hash */
+		Col [col].shared3.hash = (Int) hash ;
+		ASSERT (COL_IS_ALIVE (col)) ;
+	    }
+	}
+
+	/* The approximate external column degree is now computed.  */
+
+	/* === Supercolumn detection ======================================== */
+
+	DEBUG3 (("** Supercolumn detection phase. **\n")) ;
+
+	detect_super_cols (
+
+#ifndef NDEBUG
+		n_col, Row,
+#endif /* NDEBUG */
+
+		Col, A, head, pivot_row_start, pivot_row_length) ;
+
+	/* === Kill the pivotal column ====================================== */
+
+	KILL_PRINCIPAL_COL (pivot_col) ;
+
+	/* === Clear mark =================================================== */
+
+	tag_mark = clear_mark (tag_mark+max_deg+1, max_mark, n_row, Row) ;
+
+#ifndef NDEBUG
+	DEBUG3 (("check3\n")) ;
+	debug_mark (n_row, Row, tag_mark, max_mark) ;
+#endif /* NDEBUG */
+
+	/* === Finalize the new pivot row, and column scores ================ */
+
+	DEBUG3 (("** Finalize scores phase. **\n")) ;
+
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	/* compact the pivot row */
+	new_rp = rp ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    col = *rp++ ;
+	    /* skip dead columns */
+	    if (COL_IS_DEAD (col))
+	    {
+		continue ;
+	    }
+	    *new_rp++ = col ;
+	    /* add new pivot row to column */
+	    A [Col [col].start + (Col [col].length++)] = pivot_row ;
+
+	    /* retrieve score so far and add on pivot row's degree. */
+	    /* (we wait until here for this in case the pivot */
+	    /* row's degree was reduced due to mass elimination). */
+	    cur_score = Col [col].shared2.score + pivot_row_degree ;
+
+	    /* calculate the max possible score as the number of */
+	    /* external columns minus the 'k' value minus the */
+	    /* columns thickness */
+	    max_score = n_col - k - Col [col].shared1.thickness ;
+
+	    /* make the score the external degree of the union-of-rows */
+	    cur_score -= Col [col].shared1.thickness ;
+
+	    /* make sure score is less or equal than the max score */
+	    cur_score = MIN (cur_score, max_score) ;
+	    ASSERT (cur_score >= 0) ;
+
+	    /* store updated score */
+	    Col [col].shared2.score = cur_score ;
+
+	    /* === Place column back in degree list ========================= */
+
+	    ASSERT (min_score >= 0) ;
+	    ASSERT (min_score <= n_col) ;
+	    ASSERT (cur_score >= 0) ;
+	    ASSERT (cur_score <= n_col) ;
+	    ASSERT (head [cur_score] >= EMPTY) ;
+	    next_col = head [cur_score] ;
+	    Col [col].shared4.degree_next = next_col ;
+	    Col [col].shared3.prev = EMPTY ;
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = col ;
+	    }
+	    head [cur_score] = col ;
+
+	    /* see if this score is less than current min */
+	    min_score = MIN (min_score, cur_score) ;
+
+	}
+
+#ifndef NDEBUG
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k, max_deg) ;
+#endif /* NDEBUG */
+
+	/* === Resurrect the new pivot row ================================== */
+
+	if (pivot_row_degree > 0)
+	{
+	    /* update pivot row length to reflect any cols that were killed */
+	    /* during super-col detection and mass elimination */
+	    Row [pivot_row].start  = pivot_row_start ;
+	    Row [pivot_row].length = (Int) (new_rp - &A[pivot_row_start]) ;
+	    ASSERT (Row [pivot_row].length > 0) ;
+	    Row [pivot_row].shared1.degree = pivot_row_degree ;
+	    Row [pivot_row].shared2.mark = 0 ;
+	    /* pivot row is no longer dead */
+
+	    DEBUG1 (("Resurrect Pivot_row %d deg: %d\n",
+			pivot_row, pivot_row_degree)) ;
+	}
+    }
+
+    /* === All principal columns have now been ordered ====================== */
+
+    return (ngarbage) ;
+}
+
+
+/* ========================================================================== */
+/* === order_children ======================================================= */
+/* ========================================================================== */
+
+/*
+    The find_ordering routine has ordered all of the principal columns (the
+    representatives of the supercolumns).  The non-principal columns have not
+    yet been ordered.  This routine orders those columns by walking up the
+    parent tree (a column is a child of the column which absorbed it).  The
+    final permutation vector is then placed in p [0 ... n_col-1], with p [0]
+    being the first column, and p [n_col-1] being the last.  It doesn't look
+    like it at first glance, but be assured that this routine takes time linear
+    in the number of columns.  Although not immediately obvious, the time
+    taken by this routine is O (n_col), that is, linear in the number of
+    columns.  Not user-callable.
+*/
+
+PRIVATE void order_children
+(
+    /* === Parameters ======================================================= */
+
+    Int n_col,			/* number of columns of A */
+    Colamd_Col Col [],		/* of size n_col+1 */
+    Int p []			/* p [0 ... n_col-1] is the column permutation*/
+)
+{
+    /* === Local variables ================================================== */
+
+    Int i ;			/* loop counter for all columns */
+    Int c ;			/* column index */
+    Int parent ;		/* index of column's parent */
+    Int order ;			/* column's order */
+
+    /* === Order each non-principal column ================================== */
+
+    for (i = 0 ; i < n_col ; i++)
+    {
+	/* find an un-ordered non-principal column */
+	ASSERT (COL_IS_DEAD (i)) ;
+	if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY)
+	{
+	    parent = i ;
+	    /* once found, find its principal parent */
+	    do
+	    {
+		parent = Col [parent].shared1.parent ;
+	    } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
+
+	    /* now, order all un-ordered non-principal columns along path */
+	    /* to this parent.  collapse tree at the same time */
+	    c = i ;
+	    /* get order of parent */
+	    order = Col [parent].shared2.order ;
+
+	    do
+	    {
+		ASSERT (Col [c].shared2.order == EMPTY) ;
+
+		/* order this column */
+		Col [c].shared2.order = order++ ;
+		/* collaps tree */
+		Col [c].shared1.parent = parent ;
+
+		/* get immediate parent of this column */
+		c = Col [c].shared1.parent ;
+
+		/* continue until we hit an ordered column.  There are */
+		/* guarranteed not to be anymore unordered columns */
+		/* above an ordered column */
+	    } while (Col [c].shared2.order == EMPTY) ;
+
+	    /* re-order the super_col parent to largest order for this group */
+	    Col [parent].shared2.order = order ;
+	}
+    }
+
+    /* === Generate the permutation ========================================= */
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	p [Col [c].shared2.order] = c ;
+    }
+}
+
+
+/* ========================================================================== */
+/* === detect_super_cols ==================================================== */
+/* ========================================================================== */
+
+/*
+    Detects supercolumns by finding matches between columns in the hash buckets.
+    Check amongst columns in the set A [row_start ... row_start + row_length-1].
+    The columns under consideration are currently *not* in the degree lists,
+    and have already been placed in the hash buckets.
+
+    The hash bucket for columns whose hash function is equal to h is stored
+    as follows:
+
+	if head [h] is >= 0, then head [h] contains a degree list, so:
+
+		head [h] is the first column in degree bucket h.
+		Col [head [h]].headhash gives the first column in hash bucket h.
+
+	otherwise, the degree list is empty, and:
+
+		-(head [h] + 2) is the first column in hash bucket h.
+
+    For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous
+    column" pointer.  Col [c].shared3.hash is used instead as the hash number
+    for that column.  The value of Col [c].shared4.hash_next is the next column
+    in the same hash bucket.
+
+    Assuming no, or "few" hash collisions, the time taken by this routine is
+    linear in the sum of the sizes (lengths) of each column whose score has
+    just been computed in the approximate degree computation.
+    Not user-callable.
+*/
+
+PRIVATE void detect_super_cols
+(
+    /* === Parameters ======================================================= */
+
+#ifndef NDEBUG
+    /* these two parameters are only needed when debugging is enabled: */
+    Int n_col,			/* number of columns of A */
+    Colamd_Row Row [],		/* of size n_row+1 */
+#endif /* NDEBUG */
+
+    Colamd_Col Col [],		/* of size n_col+1 */
+    Int A [],			/* row indices of A */
+    Int head [],		/* head of degree lists and hash buckets */
+    Int row_start,		/* pointer to set of columns to check */
+    Int row_length		/* number of columns to check */
+)
+{
+    /* === Local variables ================================================== */
+
+    Int hash ;			/* hash value for a column */
+    Int *rp ;			/* pointer to a row */
+    Int c ;			/* a column index */
+    Int super_c ;		/* column index of the column to absorb into */
+    Int *cp1 ;			/* column pointer for column super_c */
+    Int *cp2 ;			/* column pointer for column c */
+    Int length ;		/* length of column super_c */
+    Int prev_c ;		/* column preceding c in hash bucket */
+    Int i ;			/* loop counter */
+    Int *rp_end ;		/* pointer to the end of the row */
+    Int col ;			/* a column index in the row to check */
+    Int head_column ;		/* first column in hash bucket or degree list */
+    Int first_col ;		/* first column in hash bucket */
+
+    /* === Consider each column in the row ================================== */
+
+    rp = &A [row_start] ;
+    rp_end = rp + row_length ;
+    while (rp < rp_end)
+    {
+	col = *rp++ ;
+	if (COL_IS_DEAD (col))
+	{
+	    continue ;
+	}
+
+	/* get hash number for this column */
+	hash = Col [col].shared3.hash ;
+	ASSERT (hash <= n_col) ;
+
+	/* === Get the first column in this hash bucket ===================== */
+
+	head_column = head [hash] ;
+	if (head_column > EMPTY)
+	{
+	    first_col = Col [head_column].shared3.headhash ;
+	}
+	else
+	{
+	    first_col = - (head_column + 2) ;
+	}
+
+	/* === Consider each column in the hash bucket ====================== */
+
+	for (super_c = first_col ; super_c != EMPTY ;
+	    super_c = Col [super_c].shared4.hash_next)
+	{
+	    ASSERT (COL_IS_ALIVE (super_c)) ;
+	    ASSERT (Col [super_c].shared3.hash == hash) ;
+	    length = Col [super_c].length ;
+
+	    /* prev_c is the column preceding column c in the hash bucket */
+	    prev_c = super_c ;
+
+	    /* === Compare super_c with all columns after it ================ */
+
+	    for (c = Col [super_c].shared4.hash_next ;
+		 c != EMPTY ; c = Col [c].shared4.hash_next)
+	    {
+		ASSERT (c != super_c) ;
+		ASSERT (COL_IS_ALIVE (c)) ;
+		ASSERT (Col [c].shared3.hash == hash) ;
+
+		/* not identical if lengths or scores are different */
+		if (Col [c].length != length ||
+		    Col [c].shared2.score != Col [super_c].shared2.score)
+		{
+		    prev_c = c ;
+		    continue ;
+		}
+
+		/* compare the two columns */
+		cp1 = &A [Col [super_c].start] ;
+		cp2 = &A [Col [c].start] ;
+
+		for (i = 0 ; i < length ; i++)
+		{
+		    /* the columns are "clean" (no dead rows) */
+		    ASSERT (ROW_IS_ALIVE (*cp1))  ;
+		    ASSERT (ROW_IS_ALIVE (*cp2))  ;
+		    /* row indices will same order for both supercols, */
+		    /* no gather scatter nessasary */
+		    if (*cp1++ != *cp2++)
+		    {
+			break ;
+		    }
+		}
+
+		/* the two columns are different if the for-loop "broke" */
+		if (i != length)
+		{
+		    prev_c = c ;
+		    continue ;
+		}
+
+		/* === Got it!  two columns are identical =================== */
+
+		ASSERT (Col [c].shared2.score == Col [super_c].shared2.score) ;
+
+		Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
+		Col [c].shared1.parent = super_c ;
+		KILL_NON_PRINCIPAL_COL (c) ;
+		/* order c later, in order_children() */
+		Col [c].shared2.order = EMPTY ;
+		/* remove c from hash bucket */
+		Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
+	    }
+	}
+
+	/* === Empty this hash bucket ======================================= */
+
+	if (head_column > EMPTY)
+	{
+	    /* corresponding degree list "hash" is not empty */
+	    Col [head_column].shared3.headhash = EMPTY ;
+	}
+	else
+	{
+	    /* corresponding degree list "hash" is empty */
+	    head [hash] = EMPTY ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === garbage_collection =================================================== */
+/* ========================================================================== */
+
+/*
+    Defragments and compacts columns and rows in the workspace A.  Used when
+    all avaliable memory has been used while performing row merging.  Returns
+    the index of the first free position in A, after garbage collection.  The
+    time taken by this routine is linear is the size of the array A, which is
+    itself linear in the number of nonzeros in the input matrix.
+    Not user-callable.
+*/
+
+PRIVATE Int garbage_collection  /* returns the new value of pfree */
+(
+    /* === Parameters ======================================================= */
+
+    Int n_row,			/* number of rows */
+    Int n_col,			/* number of columns */
+    Colamd_Row Row [],		/* row info */
+    Colamd_Col Col [],		/* column info */
+    Int A [],			/* A [0 ... Alen-1] holds the matrix */
+    Int *pfree			/* &A [0] ... pfree is in use */
+)
+{
+    /* === Local variables ================================================== */
+
+    Int *psrc ;			/* source pointer */
+    Int *pdest ;		/* destination pointer */
+    Int j ;			/* counter */
+    Int r ;			/* a row index */
+    Int c ;			/* a column index */
+    Int length ;		/* length of a row or column */
+
+#ifndef NDEBUG
+    Int debug_rows ;
+    DEBUG2 (("Defrag..\n")) ;
+    for (psrc = &A[0] ; psrc < pfree ; psrc++) ASSERT (*psrc >= 0) ;
+    debug_rows = 0 ;
+#endif /* NDEBUG */
+
+    /* === Defragment the columns =========================================== */
+
+    pdest = &A[0] ;
+    for (c = 0 ; c < n_col ; c++)
+    {
+	if (COL_IS_ALIVE (c))
+	{
+	    psrc = &A [Col [c].start] ;
+
+	    /* move and compact the column */
+	    ASSERT (pdest <= psrc) ;
+	    Col [c].start = (Int) (pdest - &A [0]) ;
+	    length = Col [c].length ;
+	    for (j = 0 ; j < length ; j++)
+	    {
+		r = *psrc++ ;
+		if (ROW_IS_ALIVE (r))
+		{
+		    *pdest++ = r ;
+		}
+	    }
+	    Col [c].length = (Int) (pdest - &A [Col [c].start]) ;
+	}
+    }
+
+    /* === Prepare to defragment the rows =================================== */
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	if (ROW_IS_DEAD (r) || (Row [r].length == 0))
+	{
+	    /* This row is already dead, or is of zero length.  Cannot compact
+	     * a row of zero length, so kill it.  NOTE: in the current version,
+	     * there are no zero-length live rows.  Kill the row (for the first
+	     * time, or again) just to be safe. */
+	    KILL_ROW (r) ;
+	}
+	else
+	{
+	    /* save first column index in Row [r].shared2.first_column */
+	    psrc = &A [Row [r].start] ;
+	    Row [r].shared2.first_column = *psrc ;
+	    ASSERT (ROW_IS_ALIVE (r)) ;
+	    /* flag the start of the row with the one's complement of row */
+	    *psrc = ONES_COMPLEMENT (r) ;
+#ifndef NDEBUG
+	    debug_rows++ ;
+#endif /* NDEBUG */
+	}
+    }
+
+    /* === Defragment the rows ============================================== */
+
+    psrc = pdest ;
+    while (psrc < pfree)
+    {
+	/* find a negative number ... the start of a row */
+	if (*psrc++ < 0)
+	{
+	    psrc-- ;
+	    /* get the row index */
+	    r = ONES_COMPLEMENT (*psrc) ;
+	    ASSERT (r >= 0 && r < n_row) ;
+	    /* restore first column index */
+	    *psrc = Row [r].shared2.first_column ;
+	    ASSERT (ROW_IS_ALIVE (r)) ;
+	    ASSERT (Row [r].length > 0) ;
+	    /* move and compact the row */
+	    ASSERT (pdest <= psrc) ;
+	    Row [r].start = (Int) (pdest - &A [0]) ;
+	    length = Row [r].length ;
+	    for (j = 0 ; j < length ; j++)
+	    {
+		c = *psrc++ ;
+		if (COL_IS_ALIVE (c))
+		{
+		    *pdest++ = c ;
+		}
+	    }
+	    Row [r].length = (Int) (pdest - &A [Row [r].start]) ;
+	    ASSERT (Row [r].length > 0) ;
+#ifndef NDEBUG
+	    debug_rows-- ;
+#endif /* NDEBUG */
+	}
+    }
+    /* ensure we found all the rows */
+    ASSERT (debug_rows == 0) ;
+
+    /* === Return the new value of pfree ==================================== */
+
+    return ((Int) (pdest - &A [0])) ;
+}
+
+
+/* ========================================================================== */
+/* === clear_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+    Clears the Row [].shared2.mark array, and returns the new tag_mark.
+    Return value is the new tag_mark.  Not user-callable.
+*/
+
+PRIVATE Int clear_mark	/* return the new value for tag_mark */
+(
+    /* === Parameters ======================================================= */
+
+    Int tag_mark,	/* new value of tag_mark */
+    Int max_mark,	/* max allowed value of tag_mark */
+
+    Int n_row,		/* number of rows in A */
+    Colamd_Row Row []	/* Row [0 ... n_row-1].shared2.mark is set to zero */
+)
+{
+    /* === Local variables ================================================== */
+
+    Int r ;
+
+    if (tag_mark <= 0 || tag_mark >= max_mark)
+    {
+	for (r = 0 ; r < n_row ; r++)
+	{
+	    if (ROW_IS_ALIVE (r))
+	    {
+		Row [r].shared2.mark = 0 ;
+	    }
+	}
+	tag_mark = 1 ;
+    }
+
+    return (tag_mark) ;
+}
+
+
+/* ========================================================================== */
+/* === print_report ========================================================= */
+/* ========================================================================== */
+
+PRIVATE void print_report
+(
+    char *method,
+    Int stats [COLAMD_STATS]
+)
+{
+
+    Int i1, i2, i3 ;
+
+    PRINTF (("\n%s version %d.%d, %s: ", method,
+	    COLAMD_MAIN_VERSION, COLAMD_SUB_VERSION, COLAMD_DATE)) ;
+
+    if (!stats)
+    {
+    	PRINTF (("No statistics available.\n")) ;
+	return ;
+    }
+
+    i1 = stats [COLAMD_INFO1] ;
+    i2 = stats [COLAMD_INFO2] ;
+    i3 = stats [COLAMD_INFO3] ;
+
+    if (stats [COLAMD_STATUS] >= 0)
+    {
+    	PRINTF (("OK.  ")) ;
+    }
+    else
+    {
+    	PRINTF (("ERROR.  ")) ;
+    }
+
+    switch (stats [COLAMD_STATUS])
+    {
+
+	case COLAMD_OK_BUT_JUMBLED:
+
+	    PRINTF(("Matrix has unsorted or duplicate row indices.\n")) ;
+
+	    PRINTF(("%s: number of duplicate or out-of-order row indices: %d\n",
+	    method, i3)) ;
+
+	    PRINTF(("%s: last seen duplicate or out-of-order row index:   %d\n",
+	    method, INDEX (i2))) ;
+
+	    PRINTF(("%s: last seen in column:                             %d",
+	    method, INDEX (i1))) ;
+
+	    /* no break - fall through to next case instead */
+
+	case COLAMD_OK:
+
+	    PRINTF(("\n")) ;
+
+ 	    PRINTF(("%s: number of dense or empty rows ignored:           %d\n",
+	    method, stats [COLAMD_DENSE_ROW])) ;
+
+	    PRINTF(("%s: number of dense or empty columns ignored:        %d\n",
+	    method, stats [COLAMD_DENSE_COL])) ;
+
+	    PRINTF(("%s: number of garbage collections performed:         %d\n",
+	    method, stats [COLAMD_DEFRAG_COUNT])) ;
+	    break ;
+
+	case COLAMD_ERROR_A_not_present:
+
+	    PRINTF(("Array A (row indices of matrix) not present.\n")) ;
+	    break ;
+
+	case COLAMD_ERROR_p_not_present:
+
+	    PRINTF(("Array p (column pointers for matrix) not present.\n")) ;
+	    break ;
+
+	case COLAMD_ERROR_nrow_negative:
+
+	    PRINTF(("Invalid number of rows (%d).\n", i1)) ;
+	    break ;
+
+	case COLAMD_ERROR_ncol_negative:
+
+	    PRINTF(("Invalid number of columns (%d).\n", i1)) ;
+	    break ;
+
+	case COLAMD_ERROR_nnz_negative:
+
+	    PRINTF(("Invalid number of nonzero entries (%d).\n", i1)) ;
+	    break ;
+
+	case COLAMD_ERROR_p0_nonzero:
+
+	    PRINTF(("Invalid column pointer, p [0] = %d, must be zero.\n", i1));
+	    break ;
+
+	case COLAMD_ERROR_A_too_small:
+
+	    PRINTF(("Array A too small.\n")) ;
+	    PRINTF(("        Need Alen >= %d, but given only Alen = %d.\n",
+	    i1, i2)) ;
+	    break ;
+
+	case COLAMD_ERROR_col_length_negative:
+
+	    PRINTF
+	    (("Column %d has a negative number of nonzero entries (%d).\n",
+	    INDEX (i1), i2)) ;
+	    break ;
+
+	case COLAMD_ERROR_row_index_out_of_bounds:
+
+	    PRINTF
+	    (("Row index (row %d) out of bounds (%d to %d) in column %d.\n",
+	    INDEX (i2), INDEX (0), INDEX (i3-1), INDEX (i1))) ;
+	    break ;
+
+	case COLAMD_ERROR_out_of_memory:
+
+	    PRINTF(("Out of memory.\n")) ;
+	    break ;
+
+	/* v2.4: internal-error case deleted */
+    }
+}
+
+
+
+
+/* ========================================================================== */
+/* === colamd debugging routines ============================================ */
+/* ========================================================================== */
+
+/* When debugging is disabled, the remainder of this file is ignored. */
+
+#ifndef NDEBUG
+
+
+/* ========================================================================== */
+/* === debug_structures ===================================================== */
+/* ========================================================================== */
+
+/*
+    At this point, all empty rows and columns are dead.  All live columns
+    are "clean" (containing no dead rows) and simplicial (no supercolumns
+    yet).  Rows may contain dead columns, but all live rows contain at
+    least one live column.
+*/
+
+PRIVATE void debug_structures
+(
+    /* === Parameters ======================================================= */
+
+    Int n_row,
+    Int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    Int A [],
+    Int n_col2
+)
+{
+    /* === Local variables ================================================== */
+
+    Int i ;
+    Int c ;
+    Int *cp ;
+    Int *cp_end ;
+    Int len ;
+    Int score ;
+    Int r ;
+    Int *rp ;
+    Int *rp_end ;
+    Int deg ;
+
+    /* === Check A, Row, and Col ============================================ */
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	if (COL_IS_ALIVE (c))
+	{
+	    len = Col [c].length ;
+	    score = Col [c].shared2.score ;
+	    DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ;
+	    ASSERT (len > 0) ;
+	    ASSERT (score >= 0) ;
+	    ASSERT (Col [c].shared1.thickness == 1) ;
+	    cp = &A [Col [c].start] ;
+	    cp_end = cp + len ;
+	    while (cp < cp_end)
+	    {
+		r = *cp++ ;
+		ASSERT (ROW_IS_ALIVE (r)) ;
+	    }
+	}
+	else
+	{
+	    i = Col [c].shared2.order ;
+	    ASSERT (i >= n_col2 && i < n_col) ;
+	}
+    }
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	if (ROW_IS_ALIVE (r))
+	{
+	    i = 0 ;
+	    len = Row [r].length ;
+	    deg = Row [r].shared1.degree ;
+	    ASSERT (len > 0) ;
+	    ASSERT (deg > 0) ;
+	    rp = &A [Row [r].start] ;
+	    rp_end = rp + len ;
+	    while (rp < rp_end)
+	    {
+		c = *rp++ ;
+		if (COL_IS_ALIVE (c))
+		{
+		    i++ ;
+		}
+	    }
+	    ASSERT (i > 0) ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_deg_lists ====================================================== */
+/* ========================================================================== */
+
+/*
+    Prints the contents of the degree lists.  Counts the number of columns
+    in the degree list and compares it to the total it should have.  Also
+    checks the row degrees.
+*/
+
+PRIVATE void debug_deg_lists
+(
+    /* === Parameters ======================================================= */
+
+    Int n_row,
+    Int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    Int head [],
+    Int min_score,
+    Int should,
+    Int max_deg
+)
+{
+    /* === Local variables ================================================== */
+
+    Int deg ;
+    Int col ;
+    Int have ;
+    Int row ;
+
+    /* === Check the degree lists =========================================== */
+
+    if (n_col > 10000 && colamd_debug <= 0)
+    {
+	return ;
+    }
+    have = 0 ;
+    DEBUG4 (("Degree lists: %d\n", min_score)) ;
+    for (deg = 0 ; deg <= n_col ; deg++)
+    {
+	col = head [deg] ;
+	if (col == EMPTY)
+	{
+	    continue ;
+	}
+	DEBUG4 (("%d:", deg)) ;
+	while (col != EMPTY)
+	{
+	    DEBUG4 ((" %d", col)) ;
+	    have += Col [col].shared1.thickness ;
+	    ASSERT (COL_IS_ALIVE (col)) ;
+	    col = Col [col].shared4.degree_next ;
+	}
+	DEBUG4 (("\n")) ;
+    }
+    DEBUG4 (("should %d have %d\n", should, have)) ;
+    ASSERT (should == have) ;
+
+    /* === Check the row degrees ============================================ */
+
+    if (n_row > 10000 && colamd_debug <= 0)
+    {
+	return ;
+    }
+    for (row = 0 ; row < n_row ; row++)
+    {
+	if (ROW_IS_ALIVE (row))
+	{
+	    ASSERT (Row [row].shared1.degree <= max_deg) ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+    Ensures that the tag_mark is less that the maximum and also ensures that
+    each entry in the mark array is less than the tag mark.
+*/
+
+PRIVATE void debug_mark
+(
+    /* === Parameters ======================================================= */
+
+    Int n_row,
+    Colamd_Row Row [],
+    Int tag_mark,
+    Int max_mark
+)
+{
+    /* === Local variables ================================================== */
+
+    Int r ;
+
+    /* === Check the Row marks ============================================== */
+
+    ASSERT (tag_mark > 0 && tag_mark <= max_mark) ;
+    if (n_row > 10000 && colamd_debug <= 0)
+    {
+	return ;
+    }
+    for (r = 0 ; r < n_row ; r++)
+    {
+	ASSERT (Row [r].shared2.mark < tag_mark) ;
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_matrix ========================================================= */
+/* ========================================================================== */
+
+/*
+    Prints out the contents of the columns and the rows.
+*/
+
+PRIVATE void debug_matrix
+(
+    /* === Parameters ======================================================= */
+
+    Int n_row,
+    Int n_col,
+    Colamd_Row Row [],
+    Colamd_Col Col [],
+    Int A []
+)
+{
+    /* === Local variables ================================================== */
+
+    Int r ;
+    Int c ;
+    Int *rp ;
+    Int *rp_end ;
+    Int *cp ;
+    Int *cp_end ;
+
+    /* === Dump the rows and columns of the matrix ========================== */
+
+    if (colamd_debug < 3)
+    {
+	return ;
+    }
+    DEBUG3 (("DUMP MATRIX:\n")) ;
+    for (r = 0 ; r < n_row ; r++)
+    {
+	DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ;
+	if (ROW_IS_DEAD (r))
+	{
+	    continue ;
+	}
+	DEBUG3 (("start %d length %d degree %d\n",
+		Row [r].start, Row [r].length, Row [r].shared1.degree)) ;
+	rp = &A [Row [r].start] ;
+	rp_end = rp + Row [r].length ;
+	while (rp < rp_end)
+	{
+	    c = *rp++ ;
+	    DEBUG4 (("	%d col %d\n", COL_IS_ALIVE (c), c)) ;
+	}
+    }
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ;
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	DEBUG3 (("start %d length %d shared1 %d shared2 %d\n",
+		Col [c].start, Col [c].length,
+		Col [c].shared1.thickness, Col [c].shared2.score)) ;
+	cp = &A [Col [c].start] ;
+	cp_end = cp + Col [c].length ;
+	while (cp < cp_end)
+	{
+	    r = *cp++ ;
+	    DEBUG4 (("	%d row %d\n", ROW_IS_ALIVE (r), r)) ;
+	}
+    }
+}
+
+PRIVATE void colamd_get_debug
+(
+    char *method
+)
+{
+    FILE *f ;
+    colamd_debug = 0 ;		/* no debug printing */
+    f = fopen ("debug", "r") ;
+    if (f == (FILE *) NULL)
+    {
+	colamd_debug = 0 ;
+    }
+    else
+    {
+	fscanf (f, "%d", &colamd_debug) ;
+	fclose (f) ;
+    }
+    DEBUG0 (("%s: debug version, D = %d (THIS WILL BE SLOW!)\n",
+    	method, colamd_debug)) ;
+}
+
+#endif /* NDEBUG */
diff --git a/src/maths/KLU/colamd.h b/src/maths/KLU/colamd.h
new file mode 100644
index 000000000..91665cef7
--- /dev/null
+++ b/src/maths/KLU/colamd.h
@@ -0,0 +1,255 @@
+/* ========================================================================== */
+/* === colamd/symamd prototypes and definitions ============================= */
+/* ========================================================================== */
+
+/* COLAMD / SYMAMD include file
+
+    You must include this file (colamd.h) in any routine that uses colamd,
+    symamd, or the related macros and definitions.
+
+    Authors:
+
+	The authors of the code itself are Stefan I. Larimore and Timothy A.
+	Davis (davis at cise.ufl.edu), University of Florida.  The algorithm was
+	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+	Ng, Oak Ridge National Laboratory.
+
+    Acknowledgements:
+
+	This work was supported by the National Science Foundation, under
+	grants DMS-9504974 and DMS-9803599.
+
+    Notice:
+
+	Copyright (c) 1998-2007, Timothy A. Davis, All Rights Reserved.
+
+	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+
+	Permission is hereby granted to use, copy, modify, and/or distribute
+	this program, provided that the Copyright, this License, and the
+	Availability of the original version is retained on all copies and made
+	accessible to the end-user of any code or package that includes COLAMD
+	or any modified version of COLAMD. 
+
+    Availability:
+
+	The colamd/symamd library is available at
+
+	    http://www.cise.ufl.edu/research/sparse/colamd/
+
+	This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
+	file.  It is required by the colamd.c, colamdmex.c, and symamdmex.c
+	files, and by any C code that calls the routines whose prototypes are
+	listed below, or that uses the colamd/symamd definitions listed below.
+
+*/
+
+#ifndef COLAMD_H
+#define COLAMD_H
+
+/* make it easy for C++ programs to include COLAMD */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ========================================================================== */
+/* === Include files ======================================================== */
+/* ========================================================================== */
+
+#include <stdlib.h>
+
+/* ========================================================================== */
+/* === COLAMD version ======================================================= */
+/* ========================================================================== */
+
+/* COLAMD Version 2.4 and later will include the following definitions.
+ * As an example, to test if the version you are using is 2.4 or later:
+ *
+ * #ifdef COLAMD_VERSION
+ *	if (COLAMD_VERSION >= COLAMD_VERSION_CODE (2,4)) ...
+ * #endif
+ *
+ * This also works during compile-time:
+ *
+ *  #if defined(COLAMD_VERSION) && (COLAMD_VERSION >= COLAMD_VERSION_CODE (2,4))
+ *    printf ("This is version 2.4 or later\n") ;
+ *  #else
+ *    printf ("This is an early version\n") ;
+ *  #endif
+ *
+ * Versions 2.3 and earlier of COLAMD do not include a #define'd version number.
+ */
+
+#define COLAMD_DATE "Dec 7, 2011"
+#define COLAMD_VERSION_CODE(main,sub) ((main) * 1000 + (sub))
+#define COLAMD_MAIN_VERSION 2
+#define COLAMD_SUB_VERSION 7
+#define COLAMD_SUBSUB_VERSION 4
+#define COLAMD_VERSION \
+	COLAMD_VERSION_CODE(COLAMD_MAIN_VERSION,COLAMD_SUB_VERSION)
+
+/* ========================================================================== */
+/* === Knob and statistics definitions ====================================== */
+/* ========================================================================== */
+
+/* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
+#define COLAMD_KNOBS 20
+
+/* number of output statistics.  Only stats [0..6] are currently used. */
+#define COLAMD_STATS 20
+
+/* knobs [0] and stats [0]: dense row knob and output statistic. */
+#define COLAMD_DENSE_ROW 0
+
+/* knobs [1] and stats [1]: dense column knob and output statistic. */
+#define COLAMD_DENSE_COL 1
+
+/* knobs [2]: aggressive absorption */
+#define COLAMD_AGGRESSIVE 2
+
+/* stats [2]: memory defragmentation count output statistic */
+#define COLAMD_DEFRAG_COUNT 2
+
+/* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
+#define COLAMD_STATUS 3
+
+/* stats [4..6]: error info, or info on jumbled columns */ 
+#define COLAMD_INFO1 4
+#define COLAMD_INFO2 5
+#define COLAMD_INFO3 6
+
+/* error codes returned in stats [3]: */
+#define COLAMD_OK				(0)
+#define COLAMD_OK_BUT_JUMBLED			(1)
+#define COLAMD_ERROR_A_not_present		(-1)
+#define COLAMD_ERROR_p_not_present		(-2)
+#define COLAMD_ERROR_nrow_negative		(-3)
+#define COLAMD_ERROR_ncol_negative		(-4)
+#define COLAMD_ERROR_nnz_negative		(-5)
+#define COLAMD_ERROR_p0_nonzero			(-6)
+#define COLAMD_ERROR_A_too_small		(-7)
+#define COLAMD_ERROR_col_length_negative	(-8)
+#define COLAMD_ERROR_row_index_out_of_bounds	(-9)
+#define COLAMD_ERROR_out_of_memory		(-10)
+#define COLAMD_ERROR_internal_error		(-999)
+
+
+/* ========================================================================== */
+/* === Prototypes of user-callable routines ================================= */
+/* ========================================================================== */
+
+/* define UF_long */
+#include "UFconfig.h"
+
+size_t colamd_recommended	/* returns recommended value of Alen, */
+				/* or 0 if input arguments are erroneous */
+(
+    int nnz,			/* nonzeros in A */
+    int n_row,			/* number of rows in A */
+    int n_col			/* number of columns in A */
+) ;
+
+size_t colamd_l_recommended	/* returns recommended value of Alen, */
+				/* or 0 if input arguments are erroneous */
+(
+    UF_long nnz,		/* nonzeros in A */
+    UF_long n_row,		/* number of rows in A */
+    UF_long n_col		/* number of columns in A */
+) ;
+
+void colamd_set_defaults	/* sets default parameters */
+(				/* knobs argument is modified on output */
+    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
+) ;
+
+void colamd_l_set_defaults	/* sets default parameters */
+(				/* knobs argument is modified on output */
+    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
+) ;
+
+int colamd			/* returns (1) if successful, (0) otherwise*/
+(				/* A and p arguments are modified on output */
+    int n_row,			/* number of rows in A */
+    int n_col,			/* number of columns in A */
+    int Alen,			/* size of the array A */
+    int A [],			/* row indices of A, of size Alen */
+    int p [],			/* column pointers of A, of size n_col+1 */
+    double knobs [COLAMD_KNOBS],/* parameter settings for colamd */
+    int stats [COLAMD_STATS]	/* colamd output statistics and error codes */
+) ;
+
+UF_long colamd_l		/* returns (1) if successful, (0) otherwise*/
+(				/* A and p arguments are modified on output */
+    UF_long n_row,		/* number of rows in A */
+    UF_long n_col,		/* number of columns in A */
+    UF_long Alen,		/* size of the array A */
+    UF_long A [],		/* row indices of A, of size Alen */
+    UF_long p [],		/* column pointers of A, of size n_col+1 */
+    double knobs [COLAMD_KNOBS],/* parameter settings for colamd */
+    UF_long stats [COLAMD_STATS]/* colamd output statistics and error codes */
+) ;
+
+int symamd				/* return (1) if OK, (0) otherwise */
+(
+    int n,				/* number of rows and columns of A */
+    int A [],				/* row indices of A */
+    int p [],				/* column pointers of A */
+    int perm [],			/* output permutation, size n_col+1 */
+    double knobs [COLAMD_KNOBS],	/* parameters (uses defaults if NULL) */
+    int stats [COLAMD_STATS],		/* output statistics and error codes */
+    void * (*allocate) (size_t, size_t),
+    					/* pointer to calloc (ANSI C) or */
+					/* mxCalloc (for MATLAB mexFunction) */
+    void (*release) (void *)
+    					/* pointer to free (ANSI C) or */
+    					/* mxFree (for MATLAB mexFunction) */
+) ;
+
+UF_long symamd_l			/* return (1) if OK, (0) otherwise */
+(
+    UF_long n,				/* number of rows and columns of A */
+    UF_long A [],			/* row indices of A */
+    UF_long p [],			/* column pointers of A */
+    UF_long perm [],			/* output permutation, size n_col+1 */
+    double knobs [COLAMD_KNOBS],	/* parameters (uses defaults if NULL) */
+    UF_long stats [COLAMD_STATS],	/* output statistics and error codes */
+    void * (*allocate) (size_t, size_t),
+    					/* pointer to calloc (ANSI C) or */
+					/* mxCalloc (for MATLAB mexFunction) */
+    void (*release) (void *)
+    					/* pointer to free (ANSI C) or */
+    					/* mxFree (for MATLAB mexFunction) */
+) ;
+
+void colamd_report
+(
+    int stats [COLAMD_STATS]
+) ;
+
+void colamd_l_report
+(
+    UF_long stats [COLAMD_STATS]
+) ;
+
+void symamd_report
+(
+    int stats [COLAMD_STATS]
+) ;
+
+void symamd_l_report
+(
+    UF_long stats [COLAMD_STATS]
+) ;
+
+#ifndef EXTERN
+#define EXTERN extern
+#endif
+
+EXTERN int (*colamd_printf) (const char *, ...) ;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* COLAMD_H */
diff --git a/src/maths/KLU/colamd_global.c b/src/maths/KLU/colamd_global.c
new file mode 100644
index 000000000..4d1ae2230
--- /dev/null
+++ b/src/maths/KLU/colamd_global.c
@@ -0,0 +1,24 @@
+/* ========================================================================== */
+/* === colamd_global.c ====================================================== */
+/* ========================================================================== */
+
+/* ----------------------------------------------------------------------------
+ * COLAMD, Copyright (C) 2007, Timothy A. Davis.
+ * See License.txt for the Version 2.1 of the GNU Lesser General Public License
+ * http://www.cise.ufl.edu/research/sparse
+ * -------------------------------------------------------------------------- */
+
+/* Global variables for COLAMD */
+
+#ifndef NPRINT
+#ifdef MATLAB_MEX_FILE
+#include "mex.h"
+int (*colamd_printf) (const char *, ...) = mexPrintf ;
+#else
+#include <stdio.h>
+int (*colamd_printf) (const char *, ...) = printf ;
+#endif
+#else
+int (*colamd_printf) (const char *, ...) = ((void *) 0) ;
+#endif
+
diff --git a/src/maths/KLU/klu.c b/src/maths/KLU/klu.c
new file mode 100644
index 000000000..eb85884c3
--- /dev/null
+++ b/src/maths/KLU/klu.c
@@ -0,0 +1,773 @@
+/* ========================================================================== */
+/* === klu ================================================================== */
+/* ========================================================================== */
+
+/* KLU: factorizes P*A into L*U, using the Gilbert-Peierls algorithm [1], with
+ * optional symmetric pruning by Eisenstat and Liu [2].  The code is by Tim
+ * Davis.  This algorithm is what appears as the default sparse LU routine in
+ * MATLAB version 6.0, and still appears in MATLAB 6.5 as [L,U,P] = lu (A).
+ * Note that no column ordering is provided (see COLAMD or AMD for suitable
+ * orderings).  SuperLU is based on this algorithm, except that it adds the
+ * use of dense matrix operations on "supernodes" (adjacent columns with
+ * identical).  This code doesn't use supernodes, thus its name ("Kent" LU,
+ * as in "Clark Kent", in contrast with Super-LU...).  This algorithm is slower
+ * than SuperLU and UMFPACK for large matrices with lots of nonzeros in their
+ * factors (such as for most finite-element problems).  However, for matrices
+ * with very sparse LU factors, this algorithm is typically faster than both
+ * SuperLU and UMFPACK, since in this case there is little chance to exploit
+ * dense matrix kernels (the BLAS).
+ *
+ * Only one block of A is factorized, in the BTF form.  The input n is the
+ * size of the block; k1 is the first row and column in the block.
+ *
+ * NOTE: no error checking is done on the inputs.  This version is not meant to
+ * be called directly by the user.  Use klu_factor instead.
+ *
+ * No fill-reducing ordering is provided.  The ordering quality of
+ * klu_kernel_factor is the responsibility of the caller.  The input A must
+ * pre-permuted to reduce fill-in, or fill-reducing input permutation Q must
+ * be provided.
+ *
+ * The input matrix A must be in compressed-column form, with either sorted
+ * or unsorted row indices.  Row indices for column j of A is in
+ * Ai [Ap [j] ... Ap [j+1]-1] and the same range of indices in Ax holds the
+ * numerical values.  No duplicate entries are allowed.
+ *
+ * Copyright 2004-2009, Tim Davis.  All rights reserved.  See the README
+ * file for details on permitted use.  Note that no code from The MathWorks,
+ * Inc, or from SuperLU, or from any other source appears here.  The code is
+ * written from scratch, from the algorithmic description in Gilbert & Peierls'
+ * and Eisenstat & Liu's journal papers [1,2].
+ *
+ * If an input permutation Q is provided, the factorization L*U = A (P,Q)
+ * is computed, where P is determined by partial pivoting, and Q is the input
+ * ordering.  If the pivot tolerance is less than 1, the "diagonal" entry that
+ * KLU attempts to choose is the diagonal of A (Q,Q).  In other words, the
+ * input permutation is applied symmetrically to the input matrix.  The output
+ * permutation P includes both the partial pivoting ordering and the input
+ * permutation.  If Q is NULL, then it is assumed to be the identity
+ * permutation.  Q is not modified.
+ *
+ * [1] Gilbert, J. R. and Peierls, T., "Sparse Partial Pivoting in Time
+ *      Proportional to Arithmetic Operations," SIAM J. Sci. Stat. Comp.,
+ *      vol 9, pp.  862-874, 1988.
+ * [2] Eisenstat, S. C. and Liu, J. W. H., "Exploiting Structural Symmetry in
+ *      Unsymmetric Sparse Symbolic Factorization," SIAM J. Matrix Analysis &
+ *      Applic., vol 13, pp.  202-211, 1992.
+ */
+
+/* ========================================================================== */
+
+#include "klu_internal.h"
+
+size_t KLU_kernel_factor            /* 0 if failure, size of LU if OK */
+(
+    /* inputs, not modified */
+    Int n,          /* A is n-by-n. n must be > 0. */
+    Int Ap [ ],     /* size n+1, column pointers for A */
+    Int Ai [ ],     /* size nz = Ap [n], row indices for A */
+    Entry Ax [ ],   /* size nz, values of A */
+    Int Q [ ],      /* size n, optional column permutation */
+    double Lsize,   /* estimate of number of nonzeros in L */
+
+    /* outputs, not defined on input */
+    Unit **p_LU,        /* row indices and values of L and U */
+    Entry Udiag [ ],    /* size n, diagonal of U */
+    Int Llen [ ],       /* size n, column length of L */
+    Int Ulen [ ],       /* size n, column length of U */
+    Int Lip [ ],        /* size n, column pointers for L */
+    Int Uip [ ],        /* size n, column pointers for U */
+    Int P [ ],          /* row permutation, size n */
+    Int *lnz,           /* size of L */
+    Int *unz,           /* size of U */
+
+    /* workspace, undefined on input */
+    Entry *X,       /* size n double's, zero on output */
+    Int *Work,      /* size 5n Int's */
+
+    /* inputs, not modified on output */
+    Int k1,             /* the block of A is from k1 to k2-1 */
+    Int PSinv [ ],      /* inverse of P from symbolic factorization */
+    double Rs [ ],      /* scale factors for A */
+
+    /* inputs, modified on output */
+    Int Offp [ ],   /* off-diagonal matrix (modified by this routine) */
+    Int Offi [ ],
+    Entry Offx [ ],
+    /* --------------- */
+    KLU_common *Common
+)
+{
+    double maxlnz, dunits ;
+    Unit *LU ;
+    Int *Pinv, *Lpend, *Stack, *Flag, *Ap_pos, *W ;
+    Int lsize, usize, anz, ok ;
+    size_t lusize ;
+    ASSERT (Common != NULL) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get control parameters, or use defaults */
+    /* ---------------------------------------------------------------------- */
+
+    n = MAX (1, n) ;
+    anz = Ap [n+k1] - Ap [k1] ;
+
+    if (Lsize <= 0)
+    {
+        Lsize = -Lsize ;
+        Lsize = MAX (Lsize, 1.0) ;
+        lsize = Lsize * anz + n ;
+    }
+    else
+    {
+        lsize = Lsize ;
+    }
+
+    usize = lsize ;
+
+    lsize  = MAX (n+1, lsize) ;
+    usize  = MAX (n+1, usize) ;
+
+    maxlnz = (((double) n) * ((double) n) + ((double) n)) / 2. ;
+    maxlnz = MIN (maxlnz, ((double) INT_MAX)) ;
+    lsize  = MIN (maxlnz, lsize) ;
+    usize  = MIN (maxlnz, usize) ;
+
+    PRINTF (("Welcome to klu: n %d anz %d k1 %d lsize %d usize %d maxlnz %g\n",
+        n, anz, k1, lsize, usize, maxlnz)) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* allocate workspace and outputs */
+    /* ---------------------------------------------------------------------- */
+
+    /* return arguments are not yet assigned */
+    *p_LU = (Unit *) NULL ;
+
+    /* these computations are safe from size_t overflow */
+    W = Work ;
+    Pinv = (Int *) W ;      W += n ;
+    Stack = (Int *) W ;     W += n ;
+    Flag = (Int *) W ;      W += n ;
+    Lpend = (Int *) W ;     W += n ;
+    Ap_pos = (Int *) W ;    W += n ;
+
+    dunits = DUNITS (Int, lsize) + DUNITS (Entry, lsize) +
+             DUNITS (Int, usize) + DUNITS (Entry, usize) ;
+    lusize = (size_t) dunits ;
+    ok = !INT_OVERFLOW (dunits) ; 
+    LU = ok ? KLU_malloc (lusize, sizeof (Unit), Common) : NULL ;
+    if (LU == NULL)
+    {
+        /* out of memory, or problem too large */
+        Common->status = KLU_OUT_OF_MEMORY ;
+        lusize = 0 ;
+        return (lusize) ;
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* factorize */
+    /* ---------------------------------------------------------------------- */
+
+    /* with pruning, and non-recursive depth-first-search */
+    lusize = KLU_kernel (n, Ap, Ai, Ax, Q, lusize,
+            Pinv, P, &LU, Udiag, Llen, Ulen, Lip, Uip, lnz, unz,
+            X, Stack, Flag, Ap_pos, Lpend,
+            k1, PSinv, Rs, Offp, Offi, Offx, Common) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* return LU factors, or return nothing if an error occurred */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common->status < KLU_OK)
+    {
+        LU = KLU_free (LU, lusize, sizeof (Unit), Common) ;
+        lusize = 0 ;
+    }
+    *p_LU = LU ;
+    PRINTF ((" in klu noffdiag %d\n", Common->noffdiag)) ;
+    return (lusize) ;
+}
+
+
+/* ========================================================================== */
+/* === KLU_lsolve =========================================================== */
+/* ========================================================================== */
+
+/* Solve Lx=b.  Assumes L is unit lower triangular and where the unit diagonal
+ * entry is NOT stored.  Overwrites B  with the solution X.  B is n-by-nrhs
+ * and is stored in ROW form with row dimension nrhs.  nrhs must be in the
+ * range 1 to 4. */
+
+void KLU_lsolve
+(
+    /* inputs, not modified: */
+    Int n,
+    Int Lip [ ],
+    Int Llen [ ],
+    Unit LU [ ],
+    Int nrhs,
+    /* right-hand-side on input, solution to Lx=b on output */
+    Entry X [ ]
+)
+{
+    Entry x [4], lik ;
+    Int *Li ;
+    Entry *Lx ;
+    Int k, p, len, i ;
+
+    switch (nrhs)
+    {
+
+        case 1:
+            for (k = 0 ; k < n ; k++)
+            {
+                x [0] = X [k] ;
+                GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+                /* unit diagonal of L is not stored*/
+                for (p = 0 ; p < len ; p++)
+                {
+                    /* X [Li [p]] -= Lx [p] * x [0] ; */
+                    MULT_SUB (X [Li [p]], Lx [p], x [0]) ;
+                }
+            }
+            break ;
+
+        case 2:
+
+            for (k = 0 ; k < n ; k++)
+            {
+                x [0] = X [2*k    ] ;
+                x [1] = X [2*k + 1] ;
+                GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Li [p] ;
+                    lik = Lx [p] ;
+                    MULT_SUB (X [2*i], lik, x [0]) ;
+                    MULT_SUB (X [2*i + 1], lik, x [1]) ;
+                }
+            }
+            break ;
+
+        case 3:
+
+            for (k = 0 ; k < n ; k++)
+            {
+                x [0] = X [3*k    ] ;
+                x [1] = X [3*k + 1] ;
+                x [2] = X [3*k + 2] ;
+                GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Li [p] ;
+                    lik = Lx [p] ;
+                    MULT_SUB (X [3*i], lik, x [0]) ;
+                    MULT_SUB (X [3*i + 1], lik, x [1]) ;
+                    MULT_SUB (X [3*i + 2], lik, x [2]) ;
+                }
+            }
+            break ;
+
+        case 4:
+
+            for (k = 0 ; k < n ; k++)
+            {
+                x [0] = X [4*k    ] ;
+                x [1] = X [4*k + 1] ;
+                x [2] = X [4*k + 2] ;
+                x [3] = X [4*k + 3] ;
+                GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Li [p] ;
+                    lik = Lx [p] ;
+                    MULT_SUB (X [4*i], lik, x [0]) ;
+                    MULT_SUB (X [4*i + 1], lik, x [1]) ;
+                    MULT_SUB (X [4*i + 2], lik, x [2]) ;
+                    MULT_SUB (X [4*i + 3], lik, x [3]) ;
+                }
+            }
+            break ;
+
+    }
+}
+
+/* ========================================================================== */
+/* === KLU_usolve =========================================================== */
+/* ========================================================================== */
+
+/* Solve Ux=b.  Assumes U is non-unit upper triangular and where the diagonal
+ * entry is NOT stored.  Overwrites B with the solution X.  B is n-by-nrhs
+ * and is stored in ROW form with row dimension nrhs.  nrhs must be in the
+ * range 1 to 4. */
+
+void KLU_usolve
+(
+    /* inputs, not modified: */
+    Int n,
+    Int Uip [ ],
+    Int Ulen [ ],
+    Unit LU [ ],
+    Entry Udiag [ ],
+    Int nrhs,
+    /* right-hand-side on input, solution to Ux=b on output */
+    Entry X [ ]
+)
+{
+    Entry x [4], uik, ukk ;
+    Int *Ui ;
+    Entry *Ux ;
+    Int k, p, len, i ;
+
+    switch (nrhs)
+    {
+
+        case 1:
+
+            for (k = n-1 ; k >= 0 ; k--)
+            {
+                GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
+                /* x [0] = X [k] / Udiag [k] ; */
+                DIV (x [0], X [k], Udiag [k]) ;
+                X [k] = x [0] ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    /* X [Ui [p]] -= Ux [p] * x [0] ; */
+                    MULT_SUB (X [Ui [p]], Ux [p], x [0]) ;
+
+                }
+            }
+
+            break ;
+
+        case 2:
+
+            for (k = n-1 ; k >= 0 ; k--)
+            {
+                GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
+                ukk = Udiag [k] ;
+                /* x [0] = X [2*k    ] / ukk ;
+                x [1] = X [2*k + 1] / ukk ; */
+                DIV (x [0], X [2*k], ukk) ;
+                DIV (x [1], X [2*k + 1], ukk) ;
+
+                X [2*k    ] = x [0] ;
+                X [2*k + 1] = x [1] ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Ui [p] ;
+                    uik = Ux [p] ;
+                    /* X [2*i    ] -= uik * x [0] ;
+                    X [2*i + 1] -= uik * x [1] ; */
+                    MULT_SUB (X [2*i], uik, x [0]) ;
+                    MULT_SUB (X [2*i + 1], uik, x [1]) ;
+                }
+            }
+
+            break ;
+
+        case 3:
+
+            for (k = n-1 ; k >= 0 ; k--)
+            {
+                GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
+                ukk = Udiag [k] ;
+
+                DIV (x [0], X [3*k], ukk) ;
+                DIV (x [1], X [3*k + 1], ukk) ;
+                DIV (x [2], X [3*k + 2], ukk) ;
+
+                X [3*k    ] = x [0] ;
+                X [3*k + 1] = x [1] ;
+                X [3*k + 2] = x [2] ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Ui [p] ;
+                    uik = Ux [p] ;
+                    MULT_SUB (X [3*i], uik, x [0]) ;
+                    MULT_SUB (X [3*i + 1], uik, x [1]) ;
+                    MULT_SUB (X [3*i + 2], uik, x [2]) ;
+                }
+            }
+
+            break ;
+
+        case 4:
+
+            for (k = n-1 ; k >= 0 ; k--)
+            {
+                GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
+                ukk = Udiag [k] ;
+
+                DIV (x [0], X [4*k], ukk) ;
+                DIV (x [1], X [4*k + 1], ukk) ;
+                DIV (x [2], X [4*k + 2], ukk) ;
+                DIV (x [3], X [4*k + 3], ukk) ;
+
+                X [4*k    ] = x [0] ;
+                X [4*k + 1] = x [1] ;
+                X [4*k + 2] = x [2] ;
+                X [4*k + 3] = x [3] ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Ui [p] ;
+                    uik = Ux [p] ;
+
+                    MULT_SUB (X [4*i], uik, x [0]) ;
+                    MULT_SUB (X [4*i + 1], uik, x [1]) ;
+                    MULT_SUB (X [4*i + 2], uik, x [2]) ;
+                    MULT_SUB (X [4*i + 3], uik, x [3]) ;
+                }
+            }
+
+            break ;
+
+    }
+}
+
+
+/* ========================================================================== */
+/* === KLU_ltsolve ========================================================== */
+/* ========================================================================== */
+
+/* Solve L'x=b.  Assumes L is unit lower triangular and where the unit diagonal
+ * entry is NOT stored.  Overwrites B with the solution X.  B is n-by-nrhs
+ * and is stored in ROW form with row dimension nrhs.  nrhs must in the
+ * range 1 to 4. */
+
+void KLU_ltsolve
+(
+    /* inputs, not modified: */
+    Int n,
+    Int Lip [ ],
+    Int Llen [ ],
+    Unit LU [ ],
+    Int nrhs,
+#ifdef COMPLEX
+    Int conj_solve,
+#endif
+    /* right-hand-side on input, solution to L'x=b on output */
+    Entry X [ ]
+)
+{
+    Entry x [4], lik ;
+    Int *Li ;
+    Entry *Lx ;
+    Int k, p, len, i ;
+
+    switch (nrhs)
+    {
+
+        case 1:
+
+            for (k = n-1 ; k >= 0 ; k--)
+            {
+                GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+                x [0] = X [k] ;
+                for (p = 0 ; p < len ; p++)
+                {
+#ifdef COMPLEX
+                    if (conj_solve)
+                    {
+                        /* x [0] -= CONJ (Lx [p]) * X [Li [p]] ; */
+                        MULT_SUB_CONJ (x [0], X [Li [p]], Lx [p]) ;
+                    }
+                    else
+#endif
+                    {
+                        /*x [0] -= Lx [p] * X [Li [p]] ;*/
+                        MULT_SUB (x [0], Lx [p], X [Li [p]]) ;
+                    }
+                }
+                X [k] = x [0] ;
+            }
+            break ;
+
+        case 2:
+
+            for (k = n-1 ; k >= 0 ; k--)
+            {
+                x [0] = X [2*k    ] ;
+                x [1] = X [2*k + 1] ;
+                GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Li [p] ;
+#ifdef COMPLEX
+                    if (conj_solve)
+                    {
+                        CONJ (lik, Lx [p]) ;
+                    }
+                    else
+#endif
+                    {
+                        lik = Lx [p] ;
+                    }
+                    MULT_SUB (x [0], lik, X [2*i]) ;
+                    MULT_SUB (x [1], lik, X [2*i + 1]) ;
+                }
+                X [2*k    ] = x [0] ;
+                X [2*k + 1] = x [1] ;
+            }
+            break ;
+
+        case 3:
+
+            for (k = n-1 ; k >= 0 ; k--)
+            {
+                x [0] = X [3*k    ] ;
+                x [1] = X [3*k + 1] ;
+                x [2] = X [3*k + 2] ;
+                GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Li [p] ;
+#ifdef COMPLEX
+                    if (conj_solve)
+                    {
+                        CONJ (lik, Lx [p]) ;
+                    }
+                    else
+#endif
+                    {
+                        lik = Lx [p] ;
+                    }
+                    MULT_SUB (x [0], lik, X [3*i]) ;
+                    MULT_SUB (x [1], lik, X [3*i + 1]) ;
+                    MULT_SUB (x [2], lik, X [3*i + 2]) ;
+                }
+                X [3*k    ] = x [0] ;
+                X [3*k + 1] = x [1] ;
+                X [3*k + 2] = x [2] ;
+            }
+            break ;
+
+        case 4:
+
+            for (k = n-1 ; k >= 0 ; k--)
+            {
+                x [0] = X [4*k    ] ;
+                x [1] = X [4*k + 1] ;
+                x [2] = X [4*k + 2] ;
+                x [3] = X [4*k + 3] ;
+                GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Li [p] ;
+#ifdef COMPLEX
+                    if (conj_solve)
+                    {
+                        CONJ (lik, Lx [p]) ;
+                    }
+                    else
+#endif
+                    {
+                        lik = Lx [p] ;
+                    }
+                    MULT_SUB (x [0], lik, X [4*i]) ;
+                    MULT_SUB (x [1], lik, X [4*i + 1]) ;
+                    MULT_SUB (x [2], lik, X [4*i + 2]) ;
+                    MULT_SUB (x [3], lik, X [4*i + 3]) ;
+                }
+                X [4*k    ] = x [0] ;
+                X [4*k + 1] = x [1] ;
+                X [4*k + 2] = x [2] ;
+                X [4*k + 3] = x [3] ;
+            }
+            break ;
+    }
+}
+
+
+/* ========================================================================== */
+/* === KLU_utsolve ========================================================== */
+/* ========================================================================== */
+
+/* Solve U'x=b.  Assumes U is non-unit upper triangular and where the diagonal
+ * entry is stored (and appears last in each column of U).  Overwrites B
+ * with the solution X.  B is n-by-nrhs and is stored in ROW form with row
+ * dimension nrhs.  nrhs must be in the range 1 to 4. */
+
+void KLU_utsolve
+(
+    /* inputs, not modified: */
+    Int n,
+    Int Uip [ ],
+    Int Ulen [ ],
+    Unit LU [ ],
+    Entry Udiag [ ],
+    Int nrhs,
+#ifdef COMPLEX
+    Int conj_solve,
+#endif
+    /* right-hand-side on input, solution to Ux=b on output */
+    Entry X [ ]
+)
+{
+    Entry x [4], uik, ukk ;
+    Int k, p, len, i ;
+    Int *Ui ;
+    Entry *Ux ;
+
+    switch (nrhs)
+    {
+
+        case 1:
+
+            for (k = 0 ; k < n ; k++)
+            {
+                GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
+                x [0] = X [k] ;
+                for (p = 0 ; p < len ; p++)
+                {
+#ifdef COMPLEX
+                    if (conj_solve)
+                    {
+                        /* x [0] -= CONJ (Ux [p]) * X [Ui [p]] ; */
+                        MULT_SUB_CONJ (x [0], X [Ui [p]], Ux [p]) ;
+                    }
+                    else
+#endif
+                    {
+                        /* x [0] -= Ux [p] * X [Ui [p]] ; */
+                        MULT_SUB (x [0], Ux [p], X [Ui [p]]) ;
+                    }
+                }
+#ifdef COMPLEX
+                if (conj_solve)
+                {
+                    CONJ (ukk, Udiag [k]) ;
+                }
+                else
+#endif
+                {
+                    ukk = Udiag [k] ;
+                }
+                DIV (X [k], x [0], ukk) ;
+            }
+            break ;
+
+        case 2:
+
+            for (k = 0 ; k < n ; k++)
+            {
+                GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
+                x [0] = X [2*k    ] ;
+                x [1] = X [2*k + 1] ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Ui [p] ;
+#ifdef COMPLEX
+                    if (conj_solve)
+                    {
+                        CONJ (uik, Ux [p]) ;
+                    }
+                    else
+#endif
+                    {
+                        uik = Ux [p] ;
+                    }
+                    MULT_SUB (x [0], uik, X [2*i]) ;
+                    MULT_SUB (x [1], uik, X [2*i + 1]) ;
+                }
+#ifdef COMPLEX
+                if (conj_solve)
+                {
+                    CONJ (ukk, Udiag [k]) ;
+                }
+                else
+#endif
+                {
+                    ukk = Udiag [k] ;
+                }
+                DIV (X [2*k], x [0], ukk) ;
+                DIV (X [2*k + 1], x [1], ukk) ;
+            }
+            break ;
+
+        case 3:
+
+            for (k = 0 ; k < n ; k++)
+            {
+                GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
+                x [0] = X [3*k    ] ;
+                x [1] = X [3*k + 1] ;
+                x [2] = X [3*k + 2] ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Ui [p] ;
+#ifdef COMPLEX
+                    if (conj_solve)
+                    {
+                        CONJ (uik, Ux [p]) ;
+                    }
+                    else
+#endif
+                    {
+                        uik = Ux [p] ;
+                    }
+                    MULT_SUB (x [0], uik, X [3*i]) ;
+                    MULT_SUB (x [1], uik, X [3*i + 1]) ;
+                    MULT_SUB (x [2], uik, X [3*i + 2]) ;
+                }
+#ifdef COMPLEX
+                if (conj_solve)
+                {
+                    CONJ (ukk, Udiag [k]) ;
+                }
+                else
+#endif
+                {
+                    ukk = Udiag [k] ;
+                }
+                DIV (X [3*k], x [0], ukk) ;
+                DIV (X [3*k + 1], x [1], ukk) ;
+                DIV (X [3*k + 2], x [2], ukk) ;
+            }
+            break ;
+
+        case 4:
+
+            for (k = 0 ; k < n ; k++)
+            {
+                GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
+                x [0] = X [4*k    ] ;
+                x [1] = X [4*k + 1] ;
+                x [2] = X [4*k + 2] ;
+                x [3] = X [4*k + 3] ;
+                for (p = 0 ; p < len ; p++)
+                {
+                    i = Ui [p] ;
+#ifdef COMPLEX
+                    if (conj_solve)
+                    {
+                        CONJ (uik, Ux [p]) ;
+                    }
+                    else
+#endif
+                    {
+                        uik = Ux [p] ;
+                    }
+                    MULT_SUB (x [0], uik, X [4*i]) ;
+                    MULT_SUB (x [1], uik, X [4*i + 1]) ;
+                    MULT_SUB (x [2], uik, X [4*i + 2]) ;
+                    MULT_SUB (x [3], uik, X [4*i + 3]) ;
+                }
+#ifdef COMPLEX
+                if (conj_solve)
+                {
+                    CONJ (ukk, Udiag [k]) ;
+                }
+                else
+#endif
+                {
+                    ukk = Udiag [k] ;
+                }
+                DIV (X [4*k], x [0], ukk) ;
+                DIV (X [4*k + 1], x [1], ukk) ;
+                DIV (X [4*k + 2], x [2], ukk) ;
+                DIV (X [4*k + 3], x [3], ukk) ;
+            }
+            break ;
+    }
+}
diff --git a/src/maths/KLU/klu.h b/src/maths/KLU/klu.h
new file mode 100644
index 000000000..78d6c5776
--- /dev/null
+++ b/src/maths/KLU/klu.h
@@ -0,0 +1,831 @@
+/* ========================================================================== */
+/* === klu include file ===================================================== */
+/* ========================================================================== */
+
+/* Include file for user programs that call klu_* routines */
+
+#ifndef _KLU_H
+#define _KLU_H
+
+/* make it easy for C++ programs to include KLU */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "amd.h"
+#include "colamd.h"
+#include "btf.h"
+
+/* -------------------------------------------------------------------------- */
+/* Symbolic object - contains the pre-ordering computed by klu_analyze */
+/* -------------------------------------------------------------------------- */
+
+typedef struct
+{
+    /* A (P,Q) is in upper block triangular form.  The kth block goes from
+     * row/col index R [k] to R [k+1]-1.  The estimated number of nonzeros
+     * in the L factor of the kth block is Lnz [k]. 
+     */
+
+    /* only computed if the AMD ordering is chosen: */
+    double symmetry ;   /* symmetry of largest block */
+    double est_flops ;  /* est. factorization flop count */
+    double lnz, unz ;   /* estimated nz in L and U, including diagonals */
+    double *Lnz ;       /* size n, but only Lnz [0..nblocks-1] is used */
+
+    /* computed for all orderings: */
+    int
+        n,              /* input matrix A is n-by-n */
+        nz,             /* # entries in input matrix */
+        *P,             /* size n */
+        *Q,             /* size n */
+        *R,             /* size n+1, but only R [0..nblocks] is used */
+        nzoff,          /* nz in off-diagonal blocks */
+        nblocks,        /* number of blocks */
+        maxblock,       /* size of largest block */
+        ordering,       /* ordering used (AMD, COLAMD, or GIVEN) */
+        do_btf ;        /* whether or not BTF preordering was requested */
+
+    /* only computed if BTF preordering requested */
+    int structural_rank ;   /* 0 to n-1 if the matrix is structurally rank
+                        * deficient.  -1 if not computed.  n if the matrix has
+                        * full structural rank */
+
+} klu_symbolic ;
+
+typedef struct          /* 64-bit version (otherwise same as above) */
+{
+    double symmetry, est_flops, lnz, unz ;
+    double *Lnz ;
+    UF_long n, nz, *P, *Q, *R, nzoff, nblocks, maxblock, ordering, do_btf,
+        structural_rank ;
+
+} klu_l_symbolic ;
+
+/* -------------------------------------------------------------------------- */
+/* Numeric object - contains the factors computed by klu_factor */
+/* -------------------------------------------------------------------------- */
+
+typedef struct
+{
+    /* LU factors of each block, the pivot row permutation, and the
+     * entries in the off-diagonal blocks */
+
+    int n ;             /* A is n-by-n */
+    int nblocks ;       /* number of diagonal blocks */
+    int lnz ;           /* actual nz in L, including diagonal */
+    int unz ;           /* actual nz in U, including diagonal */
+    int max_lnz_block ; /* max actual nz in L in any one block, incl. diag */
+    int max_unz_block ; /* max actual nz in U in any one block, incl. diag */
+    int *Pnum ;         /* size n. final pivot permutation */
+    int *Pinv ;         /* size n. inverse of final pivot permutation */
+
+    /* LU factors of each block */
+    int *Lip ;          /* size n. pointers into LUbx[block] for L */
+    int *Uip ;          /* size n. pointers into LUbx[block] for U */
+    int *Llen ;         /* size n. Llen [k] = # of entries in kth column of L */
+    int *Ulen ;         /* size n. Ulen [k] = # of entries in kth column of U */
+    void **LUbx ;       /* L and U indices and entries (excl. diagonal of U) */
+    size_t *LUsize ;    /* size of each LUbx [block], in sizeof (Unit) */
+    void *Udiag ;       /* diagonal of U */
+
+    /* scale factors; can be NULL if no scaling */
+    double *Rs ;        /* size n. Rs [i] is scale factor for row i */
+
+    /* permanent workspace for factorization and solve */
+    size_t worksize ;   /* size (in bytes) of Work */
+    void *Work ;        /* workspace */
+    void *Xwork ;       /* alias into Numeric->Work */
+    int *Iwork ;        /* alias into Numeric->Work */
+
+    /* off-diagonal entries in a conventional compressed-column sparse matrix */
+    int *Offp ;         /* size n+1, column pointers */
+    int *Offi ;         /* size nzoff, row indices */
+    void *Offx ;        /* size nzoff, numerical values */
+    int nzoff ;
+
+} klu_numeric ;
+
+typedef struct          /* 64-bit version (otherwise same as above) */
+{
+    UF_long n, nblocks, lnz, unz, max_lnz_block, max_unz_block, *Pnum, *Pinv,
+        *Lip, *Uip, *Llen, *Ulen ;
+    void **LUbx ;
+    size_t *LUsize ;
+    void *Udiag ;
+    double *Rs ;
+    size_t worksize ;
+    void *Work, *Xwork ;
+    UF_long *Iwork ;
+    UF_long *Offp, *Offi ;
+    void *Offx ;
+    UF_long nzoff ;
+
+} klu_l_numeric ;
+
+/* -------------------------------------------------------------------------- */
+/* KLU control parameters and statistics */
+/* -------------------------------------------------------------------------- */
+
+/* Common->status values */
+#define KLU_OK 0
+#define KLU_SINGULAR (1)            /* status > 0 is a warning, not an error */
+#define KLU_OUT_OF_MEMORY (-2)
+#define KLU_INVALID (-3)
+#define KLU_TOO_LARGE (-4)          /* integer overflow has occured */
+
+typedef struct klu_common_struct
+{
+
+    /* ---------------------------------------------------------------------- */
+    /* parameters */
+    /* ---------------------------------------------------------------------- */
+
+    double tol ;            /* pivot tolerance for diagonal preference */
+    double memgrow ;        /* realloc memory growth size for LU factors */
+    double initmem_amd ;    /* init. memory size with AMD: c*nnz(L) + n */
+    double initmem ;        /* init. memory size: c*nnz(A) + n */
+    double maxwork ;        /* maxwork for BTF, <= 0 if no limit */
+
+    int btf ;               /* use BTF pre-ordering, or not */
+    int ordering ;          /* 0: AMD, 1: COLAMD, 2: user P and Q,
+                             * 3: user function */
+    int scale ;             /* row scaling: -1: none (and no error check),
+                             * 0: none, 1: sum, 2: max */
+
+    /* memory management routines */
+    void *(*malloc_memory) (size_t) ;           /* pointer to malloc */
+    void *(*realloc_memory) (void *, size_t) ;  /* pointer to realloc */
+    void (*free_memory) (void *) ;              /* pointer to free */
+    void *(*calloc_memory) (size_t, size_t) ;   /* pointer to calloc */
+
+    /* pointer to user ordering function */
+    int (*user_order) (int, int *, int *, int *, struct klu_common_struct *) ;
+
+    /* pointer to user data, passed unchanged as the last parameter to the
+     * user ordering function (optional, the user function need not use this
+     * information). */
+    void *user_data ;
+
+    int halt_if_singular ;      /* how to handle a singular matrix:
+        * FALSE: keep going.  Return a Numeric object with a zero U(k,k).  A
+        *   divide-by-zero may occur when computing L(:,k).  The Numeric object
+        *   can be passed to klu_solve (a divide-by-zero will occur).  It can
+        *   also be safely passed to klu_refactor.
+        * TRUE: stop quickly.  klu_factor will free the partially-constructed
+        *   Numeric object.  klu_refactor will not free it, but will leave the
+        *   numerical values only partially defined.  This is the default. */
+
+    /* ---------------------------------------------------------------------- */
+    /* statistics */
+    /* ---------------------------------------------------------------------- */
+
+    int status ;                /* KLU_OK if OK, < 0 if error */
+    int nrealloc ;              /* # of reallocations of L and U */
+
+    int structural_rank ;       /* 0 to n-1 if the matrix is structurally rank
+        * deficient (as determined by maxtrans).  -1 if not computed.  n if the
+        * matrix has full structural rank.  This is computed by klu_analyze
+        * if a BTF preordering is requested. */
+
+    int numerical_rank ;        /* First k for which a zero U(k,k) was found,
+        * if the matrix was singular (in the range 0 to n-1).  n if the matrix
+        * has full rank. This is not a true rank-estimation.  It just reports
+        * where the first zero pivot was found.  -1 if not computed.
+        * Computed by klu_factor and klu_refactor. */
+
+    int singular_col ;          /* n if the matrix is not singular.  If in the
+        * range 0 to n-1, this is the column index of the original matrix A that
+        * corresponds to the column of U that contains a zero diagonal entry.
+        * -1 if not computed.  Computed by klu_factor and klu_refactor. */
+
+    int noffdiag ;      /* # of off-diagonal pivots, -1 if not computed */
+
+    double flops ;      /* actual factorization flop count, from klu_flops */
+    double rcond ;      /* crude reciprocal condition est., from klu_rcond */
+    double condest ;    /* accurate condition est., from klu_condest */
+    double rgrowth ;    /* reciprocal pivot rgrowth, from klu_rgrowth */
+    double work ;       /* actual work done in BTF, in klu_analyze */
+
+    size_t memusage ;   /* current memory usage, in bytes */
+    size_t mempeak ;    /* peak memory usage, in bytes */
+
+} klu_common ;
+
+typedef struct klu_l_common_struct /* 64-bit version (otherwise same as above)*/
+{
+
+    double tol, memgrow, initmem_amd, initmem, maxwork ;
+    UF_long btf, ordering, scale ;
+    void *(*malloc_memory) (size_t) ;
+    void *(*realloc_memory) (void *, size_t) ;
+    void (*free_memory) (void *) ;
+    void *(*calloc_memory) (size_t, size_t) ;
+    UF_long (*user_order) (UF_long, UF_long *, UF_long *, UF_long *,
+        struct klu_l_common_struct *) ;
+    void *user_data ;
+    UF_long halt_if_singular ;
+    UF_long status, nrealloc, structural_rank, numerical_rank, singular_col,
+        noffdiag ;
+    double flops, rcond, condest, rgrowth, work ;
+    size_t memusage, mempeak ;
+
+} klu_l_common ;
+
+/* -------------------------------------------------------------------------- */
+/* klu_defaults: sets default control parameters */
+/* -------------------------------------------------------------------------- */
+
+int klu_defaults
+(
+    klu_common *Common
+) ;
+
+UF_long klu_l_defaults (klu_l_common *Common) ;
+
+/* -------------------------------------------------------------------------- */
+/* klu_analyze:  orders and analyzes a matrix */
+/* -------------------------------------------------------------------------- */
+
+/* Order the matrix with BTF (or not), then order each block with AMD, COLAMD,
+ * a natural ordering, or with a user-provided ordering function */
+
+klu_symbolic *klu_analyze
+(
+    /* inputs, not modified */
+    int n,              /* A is n-by-n */
+    int Ap [ ],         /* size n+1, column pointers */
+    int Ai [ ],         /* size nz, row indices */
+    klu_common *Common
+) ;
+
+klu_l_symbolic *klu_l_analyze (UF_long, UF_long *, UF_long *,
+    klu_l_common *Common) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_analyze_given: analyzes a matrix using given P and Q */
+/* -------------------------------------------------------------------------- */
+
+/* Order the matrix with BTF (or not), then use natural or given ordering
+ * P and Q on the blocks.  P and Q are interpretted as identity
+ * if NULL. */
+
+klu_symbolic *klu_analyze_given
+(
+    /* inputs, not modified */
+    int n,              /* A is n-by-n */
+    int Ap [ ],         /* size n+1, column pointers */
+    int Ai [ ],         /* size nz, row indices */
+    int P [ ],          /* size n, user's row permutation (may be NULL) */
+    int Q [ ],          /* size n, user's column permutation (may be NULL) */
+    klu_common *Common
+) ;
+
+klu_l_symbolic *klu_l_analyze_given (UF_long, UF_long *, UF_long *, UF_long *,
+    UF_long *, klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_factor:  factors a matrix using the klu_analyze results */
+/* -------------------------------------------------------------------------- */
+
+klu_numeric *klu_factor /* returns KLU_OK if OK, < 0 if error */
+(
+    /* inputs, not modified */
+    int Ap [ ],         /* size n+1, column pointers */
+    int Ai [ ],         /* size nz, row indices */
+    double Ax [ ],      /* size nz, numerical values */
+    klu_symbolic *Symbolic,
+    klu_common *Common
+) ;
+
+klu_numeric *klu_z_factor      /* returns KLU_OK if OK, < 0 if error */
+(
+     /* inputs, not modified */
+     int Ap [ ],        /* size n+1, column pointers */
+     int Ai [ ],        /* size nz, row indices */
+     double Ax [ ],     /* size 2*nz, numerical values (real,imag pairs) */
+     klu_symbolic *Symbolic,
+     klu_common *Common
+) ;
+
+/* long / real version */
+klu_l_numeric *klu_l_factor (UF_long *, UF_long *, double *, klu_l_symbolic *,
+    klu_l_common *) ;
+
+/* long / complex version */
+klu_l_numeric *klu_zl_factor (UF_long *, UF_long *, double *, klu_l_symbolic *,
+    klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_solve: solves Ax=b using the Symbolic and Numeric objects */
+/* -------------------------------------------------------------------------- */
+
+int klu_solve
+(
+    /* inputs, not modified */
+    klu_symbolic *Symbolic,
+    klu_numeric *Numeric,
+    int ldim,               /* leading dimension of B */
+    int nrhs,               /* number of right-hand-sides */
+
+    /* right-hand-side on input, overwritten with solution to Ax=b on output */
+    double B [ ],           /* size ldim*nrhs */
+    klu_common *Common
+) ;
+
+int klu_z_solve
+(
+     /* inputs, not modified */
+     klu_symbolic *Symbolic,
+     klu_numeric *Numeric,
+     int ldim,               /* leading dimension of B */
+     int nrhs,               /* number of right-hand-sides */
+
+     /* right-hand-side on input, overwritten with solution to Ax=b on output */
+     double B [ ],          /* size 2*ldim*nrhs */
+     klu_common *Common
+) ;
+
+UF_long klu_l_solve (klu_l_symbolic *, klu_l_numeric *, UF_long, UF_long,
+    double *, klu_l_common *) ;
+
+UF_long klu_zl_solve (klu_l_symbolic *, klu_l_numeric *, UF_long, UF_long,
+    double *, klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_tsolve: solves A'x=b using the Symbolic and Numeric objects */
+/* -------------------------------------------------------------------------- */
+
+int klu_tsolve
+(
+    /* inputs, not modified */
+    klu_symbolic *Symbolic,
+    klu_numeric *Numeric,
+    int ldim,               /* leading dimension of B */
+    int nrhs,               /* number of right-hand-sides */
+
+    /* right-hand-side on input, overwritten with solution to Ax=b on output */
+    double B [ ],           /* size ldim*nrhs */
+    klu_common *Common
+) ;
+
+int klu_z_tsolve
+(
+    /* inputs, not modified */
+    klu_symbolic *Symbolic,
+    klu_numeric *Numeric,
+    int ldim,               /* leading dimension of B */
+    int nrhs,               /* number of right-hand-sides */
+
+    /* right-hand-side on input, overwritten with solution to Ax=b on output */
+    double B [ ],           /* size 2*ldim*nrhs */
+    int conj_solve,         /* TRUE: conjugate solve, FALSE: solve A.'x=b */
+    klu_common *Common
+     
+) ;
+
+UF_long klu_l_tsolve (klu_l_symbolic *, klu_l_numeric *, UF_long, UF_long,
+    double *, klu_l_common *) ;
+
+UF_long klu_zl_tsolve (klu_l_symbolic *, klu_l_numeric *, UF_long, UF_long,
+    double *, UF_long, klu_l_common * ) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_refactor: refactorizes matrix with same ordering as klu_factor */
+/* -------------------------------------------------------------------------- */
+
+int klu_refactor            /* return TRUE if successful, FALSE otherwise */
+(
+    /* inputs, not modified */
+    int Ap [ ],         /* size n+1, column pointers */
+    int Ai [ ],         /* size nz, row indices */
+    double Ax [ ],      /* size nz, numerical values */
+    klu_symbolic *Symbolic,
+    /* input, and numerical values modified on output */
+    klu_numeric *Numeric,
+    klu_common *Common
+) ;
+
+int klu_z_refactor          /* return TRUE if successful, FALSE otherwise */
+(
+     /* inputs, not modified */
+     int Ap [ ],        /* size n+1, column pointers */
+     int Ai [ ],        /* size nz, row indices */
+     double Ax [ ],     /* size 2*nz, numerical values */
+     klu_symbolic *Symbolic,
+     /* input, and numerical values modified on output */
+     klu_numeric *Numeric,
+     klu_common *Common
+) ;
+
+UF_long klu_l_refactor (UF_long *, UF_long *, double *, klu_l_symbolic *,
+    klu_l_numeric *, klu_l_common *) ;
+
+UF_long klu_zl_refactor (UF_long *, UF_long *, double *, klu_l_symbolic *,
+    klu_l_numeric *, klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_free_symbolic: destroys the Symbolic object */
+/* -------------------------------------------------------------------------- */
+
+int klu_free_symbolic
+(
+    klu_symbolic **Symbolic,
+    klu_common *Common
+) ;
+
+UF_long klu_l_free_symbolic (klu_l_symbolic **, klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_free_numeric: destroys the Numeric object */
+/* -------------------------------------------------------------------------- */
+
+/* Note that klu_free_numeric and klu_z_free_numeric are identical; each can
+ * free both kinds of Numeric objects (real and complex) */
+
+int klu_free_numeric
+(
+    klu_numeric **Numeric,
+    klu_common *Common
+) ;
+
+int klu_z_free_numeric
+(
+     klu_numeric **Numeric,
+     klu_common *Common
+) ;
+
+UF_long klu_l_free_numeric (klu_l_numeric **, klu_l_common *) ;
+UF_long klu_zl_free_numeric (klu_l_numeric **, klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_sort: sorts the columns of the LU factorization */
+/* -------------------------------------------------------------------------- */
+
+/* this is not needed except for the MATLAB interface */
+
+int klu_sort
+(
+    /* inputs, not modified */
+    klu_symbolic *Symbolic,
+    /* input/output */
+    klu_numeric *Numeric,
+    klu_common *Common
+) ;
+
+int klu_z_sort
+(
+    /* inputs, not modified */
+    klu_symbolic *Symbolic,
+    /* input/output */
+    klu_numeric *Numeric,
+    klu_common *Common
+) ;
+
+UF_long klu_l_sort (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
+UF_long klu_zl_sort (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_flops: determines # of flops performed in numeric factorzation */
+/* -------------------------------------------------------------------------- */
+
+int klu_flops
+(
+    /* inputs, not modified */
+    klu_symbolic *Symbolic,
+    klu_numeric *Numeric,
+    /* input/output */
+    klu_common *Common
+) ;
+
+int klu_z_flops
+(
+    /* inputs, not modified */
+    klu_symbolic *Symbolic,
+    klu_numeric *Numeric,
+    /* input/output */
+    klu_common *Common
+) ;
+
+UF_long klu_l_flops (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
+UF_long klu_zl_flops (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
+
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_rgrowth : compute the reciprocal pivot growth */
+/* -------------------------------------------------------------------------- */
+
+/* Pivot growth is computed after the input matrix is permuted, scaled, and
+ * off-diagonal entries pruned.  This is because the LU factorization of each
+ * block takes as input the scaled diagonal blocks of the BTF form.  The
+ * reciprocal pivot growth in column j of an LU factorization of a matrix C
+ * is the largest entry in C divided by the largest entry in U; then the overall
+ * reciprocal pivot growth is the smallest such value for all columns j.  Note
+ * that the off-diagonal entries are not scaled, since they do not take part in
+ * the LU factorization of the diagonal blocks.
+ *
+ * In MATLAB notation:
+ *
+ * rgrowth = min (max (abs ((R \ A(p,q)) - F)) ./ max (abs (U))) */
+
+int klu_rgrowth
+(
+    int Ap [ ],
+    int Ai [ ],
+    double Ax [ ],
+    klu_symbolic *Symbolic,
+    klu_numeric *Numeric,
+    klu_common *Common          /* Common->rgrowth = reciprocal pivot growth */
+) ;
+
+int klu_z_rgrowth
+(
+    int Ap [ ],
+    int Ai [ ],
+    double Ax [ ],
+    klu_symbolic *Symbolic,
+    klu_numeric *Numeric,
+    klu_common *Common          /* Common->rgrowth = reciprocal pivot growth */
+) ;
+
+UF_long klu_l_rgrowth (UF_long *, UF_long *, double *, klu_l_symbolic *,
+    klu_l_numeric *, klu_l_common *) ;
+
+UF_long klu_zl_rgrowth (UF_long *, UF_long *, double *, klu_l_symbolic *,
+    klu_l_numeric *, klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_condest */
+/* -------------------------------------------------------------------------- */
+
+/* Computes a reasonably accurate estimate of the 1-norm condition number, using
+ * Hager's method, as modified by Higham and Tisseur (same method as used in
+ * MATLAB's condest */
+
+int klu_condest
+(
+    int Ap [ ],             /* size n+1, column pointers, not modified */
+    double Ax [ ],          /* size nz = Ap[n], numerical values, not modified*/
+    klu_symbolic *Symbolic, /* symbolic analysis, not modified */
+    klu_numeric *Numeric,   /* numeric factorization, not modified */
+    klu_common *Common      /* result returned in Common->condest */
+) ;
+
+int klu_z_condest
+(
+    int Ap [ ],
+    double Ax [ ],          /* size 2*nz */
+    klu_symbolic *Symbolic,
+    klu_numeric *Numeric,
+    klu_common *Common      /* result returned in Common->condest */
+) ;
+
+UF_long klu_l_condest (UF_long *, double *, klu_l_symbolic *, klu_l_numeric *,
+    klu_l_common *) ;
+
+UF_long klu_zl_condest (UF_long *, double *, klu_l_symbolic *, klu_l_numeric *,
+    klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_rcond: compute min(abs(diag(U))) / max(abs(diag(U))) */
+/* -------------------------------------------------------------------------- */
+
+int klu_rcond
+(
+    klu_symbolic *Symbolic,         /* input, not modified */
+    klu_numeric *Numeric,           /* input, not modified */
+    klu_common *Common              /* result in Common->rcond */
+) ;
+
+int klu_z_rcond
+(
+    klu_symbolic *Symbolic,         /* input, not modified */
+    klu_numeric *Numeric,           /* input, not modified */
+    klu_common *Common              /* result in Common->rcond */
+) ;
+
+UF_long klu_l_rcond (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
+
+UF_long klu_zl_rcond (klu_l_symbolic *, klu_l_numeric *, klu_l_common *) ;
+
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_scale */
+/* -------------------------------------------------------------------------- */
+
+int klu_scale           /* return TRUE if successful, FALSE otherwise */
+(
+    /* inputs, not modified */
+    int scale,          /* <0: none, no error check; 0: none, 1: sum, 2: max */
+    int n,
+    int Ap [ ],         /* size n+1, column pointers */
+    int Ai [ ],         /* size nz, row indices */
+    double Ax [ ],
+    /* outputs, not defined on input */
+    double Rs [ ],
+    /* workspace, not defined on input or output */
+    int W [ ],          /* size n, can be NULL */
+    klu_common *Common
+) ;
+
+int klu_z_scale         /* return TRUE if successful, FALSE otherwise */
+(
+    /* inputs, not modified */
+    int scale,          /* <0: none, no error check; 0: none, 1: sum, 2: max */
+    int n,
+    int Ap [ ],         /* size n+1, column pointers */
+    int Ai [ ],         /* size nz, row indices */
+    double Ax [ ],
+    /* outputs, not defined on input */
+    double Rs [ ],
+    /* workspace, not defined on input or output */
+    int W [ ],          /* size n, can be NULL */
+    klu_common *Common
+) ;
+
+UF_long klu_l_scale (UF_long, UF_long, UF_long *, UF_long *, double *,
+    double *, UF_long *, klu_l_common *) ;
+
+UF_long klu_zl_scale (UF_long, UF_long, UF_long *, UF_long *, double *,
+    double *, UF_long *, klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* klu_extract  */
+/* -------------------------------------------------------------------------- */
+
+int klu_extract     /* returns TRUE if successful, FALSE otherwise */
+(
+    /* inputs: */
+    klu_numeric *Numeric,
+    klu_symbolic *Symbolic,
+
+    /* outputs, either allocated on input, or ignored otherwise */
+
+    /* L */
+    int *Lp,        /* size n+1 */
+    int *Li,        /* size Numeric->lnz */
+    double *Lx,     /* size Numeric->lnz */
+
+    /* U */
+    int *Up,        /* size n+1 */
+    int *Ui,        /* size Numeric->unz */
+    double *Ux,     /* size Numeric->unz */
+
+    /* F */
+    int *Fp,        /* size n+1 */
+    int *Fi,        /* size Numeric->nzoff */
+    double *Fx,     /* size Numeric->nzoff */
+
+    /* P, row permutation */
+    int *P,         /* size n */
+
+    /* Q, column permutation */
+    int *Q,         /* size n */
+
+    /* Rs, scale factors */
+    double *Rs,     /* size n */
+
+    /* R, block boundaries */
+    int *R,         /* size Symbolic->nblocks+1 (nblocks is at most n) */
+
+    klu_common *Common
+) ;
+
+
+int klu_z_extract           /* returns TRUE if successful, FALSE otherwise */
+(
+    /* inputs: */
+    klu_numeric *Numeric,
+    klu_symbolic *Symbolic,
+
+    /* outputs, all of which must be allocated on input */
+
+    /* L */
+    int *Lp,        /* size n+1 */
+    int *Li,        /* size nnz(L) */
+    double *Lx,     /* size nnz(L) */
+    double *Lz,     /* size nnz(L) for the complex case, ignored if real */
+
+    /* U */
+    int *Up,        /* size n+1 */
+    int *Ui,        /* size nnz(U) */
+    double *Ux,     /* size nnz(U) */
+    double *Uz,     /* size nnz(U) for the complex case, ignored if real */
+
+    /* F */
+    int *Fp,        /* size n+1 */
+    int *Fi,        /* size nnz(F) */
+    double *Fx,     /* size nnz(F) */
+    double *Fz,     /* size nnz(F) for the complex case, ignored if real */
+
+    /* P, row permutation */
+    int *P,         /* size n */
+
+    /* Q, column permutation */
+    int *Q,         /* size n */
+
+    /* Rs, scale factors */
+    double *Rs,     /* size n */
+
+    /* R, block boundaries */
+    int *R,         /* size Symbolic->nblocks+1 (nblocks is at most n) */
+
+    klu_common *Common
+) ;
+
+UF_long klu_l_extract (klu_l_numeric *, klu_l_symbolic *,
+    UF_long *, UF_long *, double *,
+    UF_long *, UF_long *, double *,
+    UF_long *, UF_long *, double *,
+    UF_long *, UF_long *, double *, UF_long *, klu_l_common *) ;
+
+UF_long klu_zl_extract (klu_l_numeric *, klu_l_symbolic *,
+    UF_long *, UF_long *, double *, double *,
+    UF_long *, UF_long *, double *, double *,
+    UF_long *, UF_long *, double *, double *,
+    UF_long *, UF_long *, double *, UF_long *, klu_l_common *) ;
+
+
+/* -------------------------------------------------------------------------- */
+/* KLU memory management routines */
+/* -------------------------------------------------------------------------- */
+
+void *klu_malloc        /* returns pointer to the newly malloc'd block */
+(
+    /* ---- input ---- */
+    size_t n,           /* number of items */
+    size_t size,        /* size of each item */
+    /* --------------- */
+    klu_common *Common
+) ;
+
+void *klu_free          /* always returns NULL */
+(
+    /* ---- in/out --- */
+    void *p,            /* block of memory to free */
+    size_t n,           /* number of items */
+    size_t size,        /* size of each item */
+    /* --------------- */
+    klu_common *Common
+) ;
+
+void *klu_realloc       /* returns pointer to reallocated block */
+(
+    /* ---- input ---- */
+    size_t nnew,        /* requested # of items in reallocated block */
+    size_t nold,        /* current size of block, in # of items */
+    size_t size,        /* size of each item */
+    /* ---- in/out --- */
+    void *p,            /* block of memory to realloc */
+    /* --------------- */
+    klu_common *Common
+) ;
+
+void *klu_l_malloc (size_t, size_t, klu_l_common *) ;
+void *klu_l_free (void *, size_t, size_t, klu_l_common *) ;
+void *klu_l_realloc (size_t, size_t, size_t, void *, klu_l_common *) ;
+
+
+/* ========================================================================== */
+/* === KLU version ========================================================== */
+/* ========================================================================== */
+
+/* All versions of KLU include these definitions.
+ * As an example, to test if the version you are using is 1.2 or later:
+ *
+ *      if (KLU_VERSION >= KLU_VERSION_CODE (1,2)) ...
+ *
+ * This also works during compile-time:
+ *
+ *      #if (KLU >= KLU_VERSION_CODE (1,2))
+ *          printf ("This is version 1.2 or later\n") ;
+ *      #else
+ *          printf ("This is an early version\n") ;
+ *      #endif
+ */
+
+#define KLU_DATE "Dec 7, 2011"
+#define KLU_VERSION_CODE(main,sub) ((main) * 1000 + (sub))
+#define KLU_MAIN_VERSION 1
+#define KLU_SUB_VERSION 1
+#define KLU_SUBSUB_VERSION 3
+#define KLU_VERSION KLU_VERSION_CODE(KLU_MAIN_VERSION,KLU_SUB_VERSION)
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/maths/KLU/klu_analyze.c b/src/maths/KLU/klu_analyze.c
new file mode 100644
index 000000000..504b3a1af
--- /dev/null
+++ b/src/maths/KLU/klu_analyze.c
@@ -0,0 +1,488 @@
+/* ========================================================================== */
+/* === klu_analyze ========================================================== */
+/* ========================================================================== */
+
+/* Order the matrix using BTF (or not), and then AMD, COLAMD, the natural
+ * ordering, or the user-provided-function on the blocks.  Does not support
+ * using a given ordering (use klu_analyze_given for that case). */
+
+#include "klu_internal.h"
+
+/* ========================================================================== */
+/* === analyze_worker ======================================================= */
+/* ========================================================================== */
+
+static Int analyze_worker       /* returns KLU_OK or < 0 if error */
+(
+    /* inputs, not modified */
+    Int n,              /* A is n-by-n */
+    Int Ap [ ],         /* size n+1, column pointers */
+    Int Ai [ ],         /* size nz, row indices */
+    Int nblocks,        /* # of blocks */
+    Int Pbtf [ ],       /* BTF row permutation */
+    Int Qbtf [ ],       /* BTF col permutation */
+    Int R [ ],          /* size n+1, but only Rbtf [0..nblocks] is used */
+    Int ordering,       /* what ordering to use (0, 1, or 3 for this routine) */
+
+    /* output only, not defined on input */
+    Int P [ ],          /* size n */
+    Int Q [ ],          /* size n */
+    double Lnz [ ],     /* size n, but only Lnz [0..nblocks-1] is used */
+
+    /* workspace, not defined on input or output */
+    Int Pblk [ ],       /* size maxblock */
+    Int Cp [ ],         /* size maxblock+1 */
+    Int Ci [ ],         /* size MAX (nz+1, Cilen) */
+    Int Cilen,          /* nz+1, or COLAMD_recommend(nz,n,n) for COLAMD */
+    Int Pinv [ ],       /* size maxblock */
+
+    /* input/output */
+    KLU_symbolic *Symbolic,
+    KLU_common *Common
+)
+{
+    double amd_Info [AMD_INFO], lnz, lnz1, flops, flops1 ;
+    Int k1, k2, nk, k, block, oldcol, pend, newcol, result, pc, p, newrow,
+        maxnz, nzoff, cstats [COLAMD_STATS], ok, err = KLU_INVALID ;
+
+    /* ---------------------------------------------------------------------- */
+    /* initializations */
+    /* ---------------------------------------------------------------------- */
+
+    /* compute the inverse of Pbtf */
+#ifndef NDEBUG
+    for (k = 0 ; k < n ; k++)
+    {
+        P [k] = EMPTY ;
+        Q [k] = EMPTY ;
+        Pinv [k] = EMPTY ;
+    }
+#endif
+    for (k = 0 ; k < n ; k++)
+    {
+        ASSERT (Pbtf [k] >= 0 && Pbtf [k] < n) ;
+        Pinv [Pbtf [k]] = k ;
+    }
+#ifndef NDEBUG
+    for (k = 0 ; k < n ; k++) ASSERT (Pinv [k] != EMPTY) ;
+#endif
+    nzoff = 0 ;
+    lnz = 0 ;
+    maxnz = 0 ;
+    flops = 0 ;
+    Symbolic->symmetry = EMPTY ;        /* only computed by AMD */
+
+    /* ---------------------------------------------------------------------- */
+    /* order each block */
+    /* ---------------------------------------------------------------------- */
+
+    for (block = 0 ; block < nblocks ; block++)
+    {
+
+        /* ------------------------------------------------------------------ */
+        /* the block is from rows/columns k1 to k2-1 */
+        /* ------------------------------------------------------------------ */
+
+        k1 = R [block] ;
+        k2 = R [block+1] ;
+        nk = k2 - k1 ;
+        PRINTF (("BLOCK %d, k1 %d k2-1 %d nk %d\n", block, k1, k2-1, nk)) ;
+
+        /* ------------------------------------------------------------------ */
+        /* construct the kth block, C */
+        /* ------------------------------------------------------------------ */
+
+        Lnz [block] = EMPTY ;
+        pc = 0 ;
+        for (k = k1 ; k < k2 ; k++)
+        {
+            newcol = k-k1 ;
+            Cp [newcol] = pc ;
+            oldcol = Qbtf [k] ;
+            pend = Ap [oldcol+1] ;
+            for (p = Ap [oldcol] ; p < pend ; p++)
+            {
+                newrow = Pinv [Ai [p]] ;
+                if (newrow < k1)
+                {
+                    nzoff++ ;
+                }
+                else
+                {
+                    /* (newrow,newcol) is an entry in the block */
+                    ASSERT (newrow < k2) ;
+                    newrow -= k1 ;
+                    Ci [pc++] = newrow ;
+                }
+            }
+        }
+        Cp [nk] = pc ;
+        maxnz = MAX (maxnz, pc) ;
+        ASSERT (KLU_valid (nk, Cp, Ci, NULL)) ;
+
+        /* ------------------------------------------------------------------ */
+        /* order the block C */
+        /* ------------------------------------------------------------------ */
+
+        if (nk <= 3)
+        {
+
+            /* -------------------------------------------------------------- */
+            /* use natural ordering for tiny blocks (3-by-3 or less) */
+            /* -------------------------------------------------------------- */
+
+            for (k = 0 ; k < nk ; k++)
+            {
+                Pblk [k] = k ;
+            }
+            lnz1 = nk * (nk + 1) / 2 ;
+            flops1 = nk * (nk - 1) / 2 + (nk-1)*nk*(2*nk-1) / 6 ;
+            ok = TRUE ;
+
+        }
+        else if (ordering == 0)
+        {
+
+            /* -------------------------------------------------------------- */
+            /* order the block with AMD (C+C') */
+            /* -------------------------------------------------------------- */
+
+            result = AMD_order (nk, Cp, Ci, Pblk, NULL, amd_Info) ;
+            ok = (result >= AMD_OK) ;
+            if (result == AMD_OUT_OF_MEMORY)
+            {
+                err = KLU_OUT_OF_MEMORY ;
+            }
+
+            /* account for memory usage in AMD */
+            Common->mempeak = MAX (Common->mempeak,
+                Common->memusage + amd_Info [AMD_MEMORY]) ;
+
+            /* get the ordering statistics from AMD */
+            lnz1 = (Int) (amd_Info [AMD_LNZ]) + nk ;
+            flops1 = 2 * amd_Info [AMD_NMULTSUBS_LU] + amd_Info [AMD_NDIV] ;
+            if (pc == maxnz)
+            {
+                /* get the symmetry of the biggest block */
+                Symbolic->symmetry = amd_Info [AMD_SYMMETRY] ;
+            }
+
+        }
+        else if (ordering == 1)
+        {
+
+            /* -------------------------------------------------------------- */
+            /* order the block with COLAMD (C) */
+            /* -------------------------------------------------------------- */
+
+            /* order (and destroy) Ci, returning column permutation in Cp.
+             * COLAMD "cannot" fail since the matrix has already been checked,
+             * and Ci allocated. */
+
+            ok = COLAMD (nk, nk, Cilen, Ci, Cp, NULL, cstats) ;
+            lnz1 = EMPTY ;
+            flops1 = EMPTY ;
+
+            /* copy the permutation from Cp to Pblk */
+            for (k = 0 ; k < nk ; k++)
+            {
+                Pblk [k] = Cp [k] ;
+            }
+
+        }
+        else
+        {
+
+            /* -------------------------------------------------------------- */
+            /* pass the block to the user-provided ordering function */
+            /* -------------------------------------------------------------- */
+
+            lnz1 = (Common->user_order) (nk, Cp, Ci, Pblk, Common) ;
+            flops1 = EMPTY ;
+            ok = (lnz1 != 0) ;
+        }
+
+        if (!ok)
+        {
+            return (err) ;  /* ordering method failed */
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* keep track of nnz(L) and flops statistics */
+        /* ------------------------------------------------------------------ */
+
+        Lnz [block] = lnz1 ;
+        lnz = (lnz == EMPTY || lnz1 == EMPTY) ? EMPTY : (lnz + lnz1) ;
+        flops = (flops == EMPTY || flops1 == EMPTY) ? EMPTY : (flops + flops1) ;
+
+        /* ------------------------------------------------------------------ */
+        /* combine the preordering with the BTF ordering */
+        /* ------------------------------------------------------------------ */
+
+        PRINTF (("Pblk, 1-based:\n")) ;
+        for (k = 0 ; k < nk ; k++)
+        {
+            ASSERT (k + k1 < n) ;
+            ASSERT (Pblk [k] + k1 < n) ;
+            Q [k + k1] = Qbtf [Pblk [k] + k1] ;
+        }
+        for (k = 0 ; k < nk ; k++)
+        {
+            ASSERT (k + k1 < n) ;
+            ASSERT (Pblk [k] + k1 < n) ;
+            P [k + k1] = Pbtf [Pblk [k] + k1] ;
+        }
+    }
+
+    PRINTF (("nzoff %d  Ap[n] %d\n", nzoff, Ap [n])) ;
+    ASSERT (nzoff >= 0 && nzoff <= Ap [n]) ;
+
+    /* return estimates of # of nonzeros in L including diagonal */
+    Symbolic->lnz = lnz ;           /* EMPTY if COLAMD used */
+    Symbolic->unz = lnz ;
+    Symbolic->nzoff = nzoff ;
+    Symbolic->est_flops = flops ;   /* EMPTY if COLAMD or user-ordering used */
+    return (KLU_OK) ;
+}
+
+
+/* ========================================================================== */
+/* === order_and_analyze ==================================================== */
+/* ========================================================================== */
+
+/* Orders the matrix with or with BTF, then orders each block with AMD, COLAMD,
+ * or the user ordering function.  Does not handle the natural or given
+ * ordering cases. */
+
+static KLU_symbolic *order_and_analyze  /* returns NULL if error, or a valid
+                                           KLU_symbolic object if successful */
+(
+    /* inputs, not modified */
+    Int n,              /* A is n-by-n */
+    Int Ap [ ],         /* size n+1, column pointers */
+    Int Ai [ ],         /* size nz, row indices */
+    /* --------------------- */
+    KLU_common *Common
+)
+{
+    double work ;
+    KLU_symbolic *Symbolic ;
+    double *Lnz ;
+    Int *Qbtf, *Cp, *Ci, *Pinv, *Pblk, *Pbtf, *P, *Q, *R ;
+    Int nblocks, nz, block, maxblock, k1, k2, nk, do_btf, ordering, k, Cilen,
+        *Work ;
+
+    /* ---------------------------------------------------------------------- */
+    /* allocate the Symbolic object, and check input matrix */
+    /* ---------------------------------------------------------------------- */
+
+    Symbolic = KLU_alloc_symbolic (n, Ap, Ai, Common) ;
+    if (Symbolic == NULL)
+    {
+        return (NULL) ;
+    }
+    P = Symbolic->P ;
+    Q = Symbolic->Q ;
+    R = Symbolic->R ;
+    Lnz = Symbolic->Lnz ;
+    nz = Symbolic->nz ;
+
+    ordering = Common->ordering ;
+    if (ordering == 1)
+    {
+        /* COLAMD */
+        Cilen = COLAMD_recommended (nz, n, n) ;
+    }
+    else if (ordering == 0 || (ordering == 3 && Common->user_order != NULL))
+    {
+        /* AMD or user ordering function */
+        Cilen = nz+1 ;
+    }
+    else
+    {
+        /* invalid ordering */
+        Common->status = KLU_INVALID ;
+        KLU_free_symbolic (&Symbolic, Common) ;
+        return (NULL) ;
+    }
+
+    /* AMD memory management routines */
+    amd_malloc  = Common->malloc_memory ;
+    amd_free    = Common->free_memory ;
+    amd_calloc  = Common->calloc_memory ;
+    amd_realloc = Common->realloc_memory ;
+
+    /* ---------------------------------------------------------------------- */
+    /* allocate workspace for BTF permutation */
+    /* ---------------------------------------------------------------------- */
+
+    Pbtf = KLU_malloc (n, sizeof (Int), Common) ;
+    Qbtf = KLU_malloc (n, sizeof (Int), Common) ;
+    if (Common->status < KLU_OK)
+    {
+        KLU_free (Pbtf, n, sizeof (Int), Common) ;
+        KLU_free (Qbtf, n, sizeof (Int), Common) ;
+        KLU_free_symbolic (&Symbolic, Common) ;
+        return (NULL) ;
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* get the common parameters for BTF and ordering method */
+    /* ---------------------------------------------------------------------- */
+
+    do_btf = Common->btf ;
+    do_btf = (do_btf) ? TRUE : FALSE ;
+    Symbolic->ordering = ordering ;
+    Symbolic->do_btf = do_btf ;
+    Symbolic->structural_rank = EMPTY ;
+
+    /* ---------------------------------------------------------------------- */
+    /* find the block triangular form (if requested) */
+    /* ---------------------------------------------------------------------- */
+
+    Common->work = 0 ;
+
+    if (do_btf)
+    {
+        Work = KLU_malloc (5*n, sizeof (Int), Common) ;
+        if (Common->status < KLU_OK)
+        {
+            /* out of memory */
+            KLU_free (Pbtf, n, sizeof (Int), Common) ;
+            KLU_free (Qbtf, n, sizeof (Int), Common) ;
+            KLU_free_symbolic (&Symbolic, Common) ;
+            return (NULL) ;
+        }
+
+        nblocks = BTF_order (n, Ap, Ai, Common->maxwork, &work, Pbtf, Qbtf, R,
+                &(Symbolic->structural_rank), Work) ;
+        Common->structural_rank = Symbolic->structural_rank ;
+        Common->work += work ;
+
+        KLU_free (Work, 5*n, sizeof (Int), Common) ;
+
+        /* unflip Qbtf if the matrix does not have full structural rank */
+        if (Symbolic->structural_rank < n)
+        {
+            for (k = 0 ; k < n ; k++)
+            {
+                Qbtf [k] = BTF_UNFLIP (Qbtf [k]) ;
+            }
+        }
+
+        /* find the size of the largest block */
+        maxblock = 1 ;
+        for (block = 0 ; block < nblocks ; block++)
+        {
+            k1 = R [block] ;
+            k2 = R [block+1] ;
+            nk = k2 - k1 ;
+            PRINTF (("block %d size %d\n", block, nk)) ;
+            maxblock = MAX (maxblock, nk) ;
+        }
+    }
+    else
+    {
+        /* BTF not requested */
+        nblocks = 1 ;
+        maxblock = n ;
+        R [0] = 0 ;
+        R [1] = n ;
+        for (k = 0 ; k < n ; k++)
+        {
+            Pbtf [k] = k ;
+            Qbtf [k] = k ;
+        }
+    }
+
+    Symbolic->nblocks = nblocks ;
+
+    PRINTF (("maxblock size %d\n", maxblock)) ;
+    Symbolic->maxblock = maxblock ;
+
+    /* ---------------------------------------------------------------------- */
+    /* allocate more workspace, for analyze_worker */
+    /* ---------------------------------------------------------------------- */
+
+    Pblk = KLU_malloc (maxblock, sizeof (Int), Common) ;
+    Cp   = KLU_malloc (maxblock + 1, sizeof (Int), Common) ;
+    Ci   = KLU_malloc (MAX (Cilen, nz+1), sizeof (Int), Common) ;
+    Pinv = KLU_malloc (n, sizeof (Int), Common) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* order each block of the BTF ordering, and a fill-reducing ordering */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common->status == KLU_OK)
+    {
+        PRINTF (("calling analyze_worker\n")) ;
+        Common->status = analyze_worker (n, Ap, Ai, nblocks, Pbtf, Qbtf, R,
+            ordering, P, Q, Lnz, Pblk, Cp, Ci, Cilen, Pinv, Symbolic, Common) ;
+        PRINTF (("analyze_worker done\n")) ;
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* free all workspace */
+    /* ---------------------------------------------------------------------- */
+
+    KLU_free (Pblk, maxblock, sizeof (Int), Common) ;
+    KLU_free (Cp, maxblock+1, sizeof (Int), Common) ;
+    KLU_free (Ci, MAX (Cilen, nz+1), sizeof (Int), Common) ;
+    KLU_free (Pinv, n, sizeof (Int), Common) ;
+    KLU_free (Pbtf, n, sizeof (Int), Common) ;
+    KLU_free (Qbtf, n, sizeof (Int), Common) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* return the symbolic object */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common->status < KLU_OK)
+    {
+        KLU_free_symbolic (&Symbolic, Common) ;
+    }
+    return (Symbolic) ;
+}
+
+
+/* ========================================================================== */
+/* === KLU_analyze ========================================================== */
+/* ========================================================================== */
+
+KLU_symbolic *KLU_analyze       /* returns NULL if error, or a valid
+                                   KLU_symbolic object if successful */
+(
+    /* inputs, not modified */
+    Int n,              /* A is n-by-n */
+    Int Ap [ ],         /* size n+1, column pointers */
+    Int Ai [ ],         /* size nz, row indices */
+    /* -------------------- */
+    KLU_common *Common
+)
+{
+
+    /* ---------------------------------------------------------------------- */
+    /* get the control parameters for BTF and ordering method */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common == NULL)
+    {
+        return (NULL) ;
+    }
+    Common->status = KLU_OK ;
+    Common->structural_rank = EMPTY ;
+
+    /* ---------------------------------------------------------------------- */
+    /* order and analyze */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common->ordering == 2)
+    {
+        /* natural ordering */
+        return (KLU_analyze_given (n, Ap, Ai, NULL, NULL, Common)) ;
+    }
+    else
+    {
+        /* order with P and Q */
+        return (order_and_analyze (n, Ap, Ai, Common)) ;
+    }
+}
diff --git a/src/maths/KLU/klu_analyze_given.c b/src/maths/KLU/klu_analyze_given.c
new file mode 100644
index 000000000..bee547345
--- /dev/null
+++ b/src/maths/KLU/klu_analyze_given.c
@@ -0,0 +1,369 @@
+/* ========================================================================== */
+/* === klu_analyze_given ==================================================== */
+/* ========================================================================== */
+
+/* Given an input permutation P and Q, create the Symbolic object.  BTF can
+ * be done to modify the user's P and Q (does not perform the max transversal;
+ * just finds the strongly-connected components). */
+
+#include "klu_internal.h"
+
+/* ========================================================================== */
+/* === klu_alloc_symbolic =================================================== */
+/* ========================================================================== */
+
+/* Allocate Symbolic object, and check input matrix.  Not user callable. */
+
+KLU_symbolic *KLU_alloc_symbolic
+(
+    Int n,
+    Int *Ap,
+    Int *Ai,
+    KLU_common *Common
+)
+{
+    KLU_symbolic *Symbolic ;
+    Int *P, *Q, *R ;
+    double *Lnz ;
+    Int nz, i, j, p, pend ;
+
+    if (Common == NULL)
+    {
+        return (NULL) ;
+    }
+    Common->status = KLU_OK ;
+
+    /* A is n-by-n, with n > 0.  Ap [0] = 0 and nz = Ap [n] >= 0 required.
+     * Ap [j] <= Ap [j+1] must hold for all j = 0 to n-1.  Row indices in Ai
+     * must be in the range 0 to n-1, and no duplicate entries can be present.
+     * The list of row indices in each column of A need not be sorted.
+     */
+
+    if (n <= 0 || Ap == NULL || Ai == NULL)
+    {
+        /* Ap and Ai must be present, and n must be > 0 */
+        Common->status = KLU_INVALID ;
+        return (NULL) ;
+    }
+
+    nz = Ap [n] ;
+    if (Ap [0] != 0 || nz < 0)
+    {
+        /* nz must be >= 0 and Ap [0] must equal zero */
+        Common->status = KLU_INVALID ;
+        return (NULL) ;
+    }
+
+    for (j = 0 ; j < n ; j++)
+    {
+        if (Ap [j] > Ap [j+1])
+        {
+            /* column pointers must be non-decreasing */
+            Common->status = KLU_INVALID ;
+            return (NULL) ;
+        }
+    }
+    P = KLU_malloc (n, sizeof (Int), Common) ;
+    if (Common->status < KLU_OK)
+    {
+        /* out of memory */
+        Common->status = KLU_OUT_OF_MEMORY ;
+        return (NULL) ;
+    }
+    for (i = 0 ; i < n ; i++)
+    {
+        P [i] = EMPTY ;
+    }
+    for (j = 0 ; j < n ; j++)
+    {
+        pend = Ap [j+1] ;
+        for (p = Ap [j] ; p < pend ; p++)
+        {
+            i = Ai [p] ;
+            if (i < 0 || i >= n || P [i] == j)
+            {
+                /* row index out of range, or duplicate entry */
+                KLU_free (P, n, sizeof (Int), Common) ;
+                Common->status = KLU_INVALID ;
+                return (NULL) ;
+            }
+            /* flag row i as appearing in column j */
+            P [i] = j ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* allocate the Symbolic object */
+    /* ---------------------------------------------------------------------- */
+
+    Symbolic = KLU_malloc (sizeof (KLU_symbolic), 1, Common) ;
+    if (Common->status < KLU_OK)
+    {
+        /* out of memory */
+        KLU_free (P, n, sizeof (Int), Common) ;
+        Common->status = KLU_OUT_OF_MEMORY ;
+        return (NULL) ;
+    }
+
+    Q = KLU_malloc (n, sizeof (Int), Common) ;
+    R = KLU_malloc (n+1, sizeof (Int), Common) ;
+    Lnz = KLU_malloc (n, sizeof (double), Common) ;
+
+    Symbolic->n = n ;
+    Symbolic->nz = nz ;
+    Symbolic->P = P ;
+    Symbolic->Q = Q ;
+    Symbolic->R = R ;
+    Symbolic->Lnz = Lnz ;
+
+    if (Common->status < KLU_OK)
+    {
+        /* out of memory */
+        KLU_free_symbolic (&Symbolic, Common) ;
+        Common->status = KLU_OUT_OF_MEMORY ;
+        return (NULL) ;
+    }
+
+    return (Symbolic) ;
+}
+
+
+/* ========================================================================== */
+/* === KLU_analyze_given ==================================================== */
+/* ========================================================================== */
+
+KLU_symbolic *KLU_analyze_given     /* returns NULL if error, or a valid
+                                       KLU_symbolic object if successful */
+(
+    /* inputs, not modified */
+    Int n,              /* A is n-by-n */
+    Int Ap [ ],         /* size n+1, column pointers */
+    Int Ai [ ],         /* size nz, row indices */
+    Int Puser [ ],      /* size n, user's row permutation (may be NULL) */
+    Int Quser [ ],      /* size n, user's column permutation (may be NULL) */
+    /* -------------------- */
+    KLU_common *Common
+)
+{
+    KLU_symbolic *Symbolic ;
+    double *Lnz ;
+    Int nblocks, nz, block, maxblock, *P, *Q, *R, nzoff, p, pend, do_btf, k ;
+
+    /* ---------------------------------------------------------------------- */
+    /* determine if input matrix is valid, and get # of nonzeros */
+    /* ---------------------------------------------------------------------- */
+
+    Symbolic = KLU_alloc_symbolic (n, Ap, Ai, Common) ;
+    if (Symbolic == NULL)
+    {
+        return (NULL) ;
+    }
+    P = Symbolic->P ;
+    Q = Symbolic->Q ;
+    R = Symbolic->R ;
+    Lnz = Symbolic->Lnz ;
+    nz = Symbolic->nz ;
+
+    /* ---------------------------------------------------------------------- */
+    /* Q = Quser, or identity if Quser is NULL */
+    /* ---------------------------------------------------------------------- */
+
+    if (Quser == (Int *) NULL)
+    {
+        for (k = 0 ; k < n ; k++)
+        {
+            Q [k] = k ;
+        }
+    }
+    else
+    {
+        for (k = 0 ; k < n ; k++)
+        {
+            Q [k] = Quser [k] ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* get the control parameters for BTF and ordering method */
+    /* ---------------------------------------------------------------------- */
+
+    do_btf = Common->btf ;
+    do_btf = (do_btf) ? TRUE : FALSE ;
+    Symbolic->ordering = 2 ;
+    Symbolic->do_btf = do_btf ;
+
+    /* ---------------------------------------------------------------------- */
+    /* find the block triangular form, if requested */
+    /* ---------------------------------------------------------------------- */
+
+    if (do_btf)
+    {
+
+        /* ------------------------------------------------------------------ */
+        /* get workspace for BTF_strongcomp */
+        /* ------------------------------------------------------------------ */
+
+        Int *Pinv, *Work, *Bi, k1, k2, nk, oldcol ;
+
+        Work = KLU_malloc (4*n, sizeof (Int), Common) ;
+        Pinv = KLU_malloc (n, sizeof (Int), Common) ;
+        if (Puser != (Int *) NULL)
+        {
+            Bi = KLU_malloc (nz+1, sizeof (Int), Common) ;
+        }
+        else
+        {
+            Bi = Ai ;
+        }
+
+        if (Common->status < KLU_OK)
+        {
+            /* out of memory */
+            KLU_free (Work, 4*n, sizeof (Int), Common) ;
+            KLU_free (Pinv, n, sizeof (Int), Common) ;
+            if (Puser != (Int *) NULL)
+            {
+                KLU_free (Bi, nz+1, sizeof (Int), Common) ;
+            }
+            KLU_free_symbolic (&Symbolic, Common) ;
+            Common->status = KLU_OUT_OF_MEMORY ;
+            return (NULL) ;
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* B = Puser * A */
+        /* ------------------------------------------------------------------ */
+
+        if (Puser != (Int *) NULL)
+        {
+            for (k = 0 ; k < n ; k++)
+            {
+                Pinv [Puser [k]] = k ;
+            }
+            for (p = 0 ; p < nz ; p++)
+            {
+                Bi [p] = Pinv [Ai [p]] ;
+            }
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* find the strongly-connected components */
+        /* ------------------------------------------------------------------ */
+
+        /* modifies Q, and determines P and R */
+        nblocks = BTF_strongcomp (n, Ap, Bi, Q, P, R, Work) ;
+
+        /* ------------------------------------------------------------------ */
+        /* P = P * Puser */
+        /* ------------------------------------------------------------------ */
+
+        if (Puser != (Int *) NULL)
+        {
+            for (k = 0 ; k < n ; k++)
+            {
+                Work [k] = Puser [P [k]] ;
+            }
+            for (k = 0 ; k < n ; k++)
+            {
+                P [k] = Work [k] ;
+            }
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* Pinv = inverse of P */
+        /* ------------------------------------------------------------------ */
+
+        for (k = 0 ; k < n ; k++)
+        {
+            Pinv [P [k]] = k ;
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* analyze each block */
+        /* ------------------------------------------------------------------ */
+
+        nzoff = 0 ;         /* nz in off-diagonal part */
+        maxblock = 1 ;      /* size of the largest block */
+
+        for (block = 0 ; block < nblocks ; block++)
+        {
+
+            /* -------------------------------------------------------------- */
+            /* the block is from rows/columns k1 to k2-1 */
+            /* -------------------------------------------------------------- */
+
+            k1 = R [block] ;
+            k2 = R [block+1] ;
+            nk = k2 - k1 ;
+            PRINTF (("BLOCK %d, k1 %d k2-1 %d nk %d\n", block, k1, k2-1, nk)) ;
+            maxblock = MAX (maxblock, nk) ;
+
+            /* -------------------------------------------------------------- */
+            /* scan the kth block, C */
+            /* -------------------------------------------------------------- */
+
+            for (k = k1 ; k < k2 ; k++)
+            {
+                oldcol = Q [k] ;
+                pend = Ap [oldcol+1] ;
+                for (p = Ap [oldcol] ; p < pend ; p++)
+                {
+                    if (Pinv [Ai [p]] < k1)
+                    {
+                        nzoff++ ;
+                    }
+                }
+            }
+
+            /* fill-in not estimated */
+            Lnz [block] = EMPTY ;
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* free all workspace */
+        /* ------------------------------------------------------------------ */
+
+        KLU_free (Work, 4*n, sizeof (Int), Common) ;
+        KLU_free (Pinv, n, sizeof (Int), Common) ;
+        if (Puser != (Int *) NULL)
+        {
+            KLU_free (Bi, nz+1, sizeof (Int), Common) ;
+        }
+
+    }
+    else
+    {
+
+        /* ------------------------------------------------------------------ */
+        /* BTF not requested */
+        /* ------------------------------------------------------------------ */
+
+        nzoff = 0 ;
+        nblocks = 1 ;
+        maxblock = n ;
+        R [0] = 0 ;
+        R [1] = n ;
+        Lnz [0] = EMPTY ;
+
+        /* ------------------------------------------------------------------ */
+        /* P = Puser, or identity if Puser is NULL */
+        /* ------------------------------------------------------------------ */
+
+        for (k = 0 ; k < n ; k++)
+        {
+            P [k] = (Puser == NULL) ? k : Puser [k] ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* return the symbolic object */
+    /* ---------------------------------------------------------------------- */
+
+    Symbolic->nblocks = nblocks ;
+    Symbolic->maxblock = maxblock ;
+    Symbolic->lnz = EMPTY ;
+    Symbolic->unz = EMPTY ;
+    Symbolic->nzoff = nzoff ;
+
+    return (Symbolic) ;
+}
diff --git a/src/maths/KLU/klu_defaults.c b/src/maths/KLU/klu_defaults.c
new file mode 100644
index 000000000..0369a37f3
--- /dev/null
+++ b/src/maths/KLU/klu_defaults.c
@@ -0,0 +1,60 @@
+/* ========================================================================== */
+/* === KLU_defaults ========================================================= */
+/* ========================================================================== */
+
+/* Sets default parameters for KLU */
+
+#include "klu_internal.h"
+
+Int KLU_defaults
+(
+    KLU_common *Common
+)
+{
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+
+    /* parameters */
+    Common->tol = 0.001 ;       /* pivot tolerance for diagonal */
+    Common->memgrow = 1.2;      /* realloc size ratio increase for LU factors */
+    Common->initmem_amd = 1.2 ; /* init. mem with AMD:  c*nnz(L) + n */
+    Common->initmem = 10 ;      /* init. mem otherwise: c*nnz(A) + n */
+    Common->btf = TRUE ;        /* use BTF pre-ordering, or not */
+    Common->maxwork = 0 ;       /* no limit to work done by btf_order */
+    Common->ordering = 0 ;      /* 0: AMD, 1: COLAMD, 2: user-provided P and Q,
+                                 * 3: user-provided function */
+    Common->scale = 2 ;         /* scale: -1: none, and do not check for errors
+                                 * in the input matrix in KLU_refactor.
+                                 * 0: none, but check for errors,
+                                 * 1: sum, 2: max */
+    Common->halt_if_singular = TRUE ;   /* quick halt if matrix is singular */
+
+    /* memory management routines */
+    Common->malloc_memory  = malloc ;
+    Common->calloc_memory  = calloc ;
+    Common->free_memory    = free ;
+    Common->realloc_memory = realloc ;
+
+    /* user ordering function and optional argument */
+    Common->user_order = NULL ;
+    Common->user_data = NULL ;
+
+    /* statistics */
+    Common->status = KLU_OK ;
+    Common->nrealloc = 0 ;
+    Common->structural_rank = EMPTY ;
+    Common->numerical_rank = EMPTY ;
+    Common->noffdiag = EMPTY ;
+    Common->flops = EMPTY ;
+    Common->rcond = EMPTY ;
+    Common->condest = EMPTY ;
+    Common->rgrowth = EMPTY ;
+    Common->work = 0 ;          /* work done by btf_order */
+
+    Common->memusage = 0 ;
+    Common->mempeak = 0 ;
+
+    return (TRUE) ;
+}
diff --git a/src/maths/KLU/klu_diagnostics.c b/src/maths/KLU/klu_diagnostics.c
new file mode 100644
index 000000000..45cb62764
--- /dev/null
+++ b/src/maths/KLU/klu_diagnostics.c
@@ -0,0 +1,570 @@
+/* ========================================================================== */
+/* === KLU_diagnostics ====================================================== */
+/* ========================================================================== */
+
+/* Linear algebraic diagnostics:
+ * KLU_rgrowth: reciprocal pivot growth, takes O(|A|+|U|) time
+ * KLU_condest: condition number estimator, takes about O(|A|+5*(|L|+|U|)) time
+ * KLU_flops:   compute # flops required to factorize A into L*U
+ * KLU_rcond:   compute a really cheap estimate of the reciprocal of the
+ *              condition number, min(abs(diag(U))) / max(abs(diag(U))).
+ *              Takes O(n) time.
+ */
+
+#include "klu_internal.h"
+
+/* ========================================================================== */
+/* === KLU_rgrowth ========================================================== */
+/* ========================================================================== */
+
+/* Compute the reciprocal pivot growth factor.  In MATLAB notation:
+ *
+ *   rgrowth = min (max (abs ((R \ A (p,q)) - F))) ./ max (abs (U)))
+ */
+
+Int KLU_rgrowth         /* return TRUE if successful, FALSE otherwise */
+(
+    Int *Ap,
+    Int *Ai,
+    double *Ax,
+    KLU_symbolic *Symbolic,
+    KLU_numeric *Numeric,
+    KLU_common *Common
+)
+{
+    double temp, max_ai, max_ui, min_block_rgrowth ;
+    Entry aik ;
+    Int *Q, *Ui, *Uip, *Ulen, *Pinv ;
+    Unit *LU ;
+    Entry *Aentry, *Ux, *Ukk ;
+    double *Rs ;
+    Int i, newrow, oldrow, k1, k2, nk, j, oldcol, k, pend, len ;
+
+    /* ---------------------------------------------------------------------- */
+    /* check inputs */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+
+    if (Symbolic == NULL || Ap == NULL || Ai == NULL || Ax == NULL)
+    {
+        Common->status = KLU_INVALID ;
+        return (FALSE) ;
+    }
+
+    if (Numeric == NULL)
+    {
+        /* treat this as a singular matrix */
+        Common->rgrowth = 0 ;
+        Common->status = KLU_SINGULAR ;
+        return (TRUE) ;
+    }
+    Common->status = KLU_OK ;
+
+    /* ---------------------------------------------------------------------- */
+    /* compute the reciprocal pivot growth */
+    /* ---------------------------------------------------------------------- */
+
+    Aentry = (Entry *) Ax ;
+    Pinv = Numeric->Pinv ;
+    Rs = Numeric->Rs ;
+    Q = Symbolic->Q ;
+    Common->rgrowth = 1 ;
+
+    for (i = 0 ; i < Symbolic->nblocks ; i++)
+    {
+        k1 = Symbolic->R[i] ;
+        k2 = Symbolic->R[i+1] ;
+        nk = k2 - k1 ;
+        if (nk == 1)
+        {
+            continue ;      /* skip singleton blocks */
+        }
+        LU = (Unit *) Numeric->LUbx[i] ;
+        Uip = Numeric->Uip + k1 ;
+        Ulen = Numeric->Ulen + k1 ;
+        Ukk = ((Entry *) Numeric->Udiag) + k1 ;
+        min_block_rgrowth = 1 ;
+        for (j = 0 ; j < nk ; j++)
+        {
+            max_ai = 0 ;
+            max_ui = 0 ;
+            oldcol = Q[j + k1] ;
+            pend = Ap [oldcol + 1] ;
+            for (k = Ap [oldcol] ; k < pend ; k++)
+            {
+                oldrow = Ai [k] ;
+                newrow = Pinv [oldrow] ;
+                if (newrow < k1)
+                {
+                    continue ;  /* skip entry outside the block */
+                }
+                ASSERT (newrow < k2) ;
+                if (Rs != NULL)
+                {
+                    /* aik = Aentry [k] / Rs [oldrow] */
+                    SCALE_DIV_ASSIGN (aik, Aentry [k], Rs [newrow]) ;
+                }
+                else
+                {
+                    aik = Aentry [k] ;
+                }
+                /* temp = ABS (aik) */
+                ABS (temp, aik) ;
+                if (temp > max_ai)
+                {
+                    max_ai = temp ;
+                }
+            }
+
+            GET_POINTER (LU, Uip, Ulen, Ui, Ux, j, len) ;
+            for (k = 0 ; k < len ; k++)
+            {
+                /* temp = ABS (Ux [k]) */
+                ABS (temp, Ux [k]) ;
+                if (temp > max_ui)
+                {
+                    max_ui = temp ;
+                }
+            }
+            /* consider the diagonal element */
+            ABS (temp, Ukk [j]) ;
+            if (temp > max_ui)
+            {
+                max_ui = temp ;
+            }
+
+            /* if max_ui is 0, skip the column */
+            if (SCALAR_IS_ZERO (max_ui))
+            {
+                continue ;
+            }
+            temp = max_ai / max_ui ;
+            if (temp < min_block_rgrowth)
+            {
+                min_block_rgrowth = temp ;
+            }
+        }
+
+        if (min_block_rgrowth < Common->rgrowth)
+        {
+            Common->rgrowth = min_block_rgrowth ;
+        }
+    }
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === KLU_condest ========================================================== */
+/* ========================================================================== */
+
+/* Estimate the condition number.  Uses Higham and Tisseur's algorithm
+ * (A block algorithm for matrix 1-norm estimation, with applications to
+ * 1-norm pseudospectra, SIAM J. Matrix Anal. Appl., 21(4):1185-1201, 2000.
+ */
+
+Int KLU_condest         /* return TRUE if successful, FALSE otherwise */
+(
+    Int Ap [ ],
+    double Ax [ ],
+    KLU_symbolic *Symbolic,
+    KLU_numeric *Numeric,
+    KLU_common *Common
+)
+{
+    double xj, Xmax, csum, anorm, ainv_norm, est_old, est_new, abs_value ;
+    Entry *Udiag, *Aentry, *X, *S ;
+    Int *R ;
+    Int nblocks, i, j, jmax, jnew, pend, n ;
+#ifndef COMPLEX
+    Int unchanged ;
+#endif
+
+    /* ---------------------------------------------------------------------- */
+    /* check inputs */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+    if (Symbolic == NULL || Ap == NULL || Ax == NULL)
+    {
+        Common->status = KLU_INVALID ;
+        return (FALSE) ;
+    }
+    abs_value = 0 ;
+    if (Numeric == NULL)
+    {
+        /* treat this as a singular matrix */
+        Common->condest = 1 / abs_value ;
+        Common->status = KLU_SINGULAR ;
+        return (TRUE) ;
+    }
+    Common->status = KLU_OK ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get inputs */
+    /* ---------------------------------------------------------------------- */
+
+    n = Symbolic->n ;
+    nblocks = Symbolic->nblocks ;
+    R = Symbolic->R ;
+    Udiag = Numeric->Udiag ;
+
+    /* ---------------------------------------------------------------------- */
+    /* check if diagonal of U has a zero on it */
+    /* ---------------------------------------------------------------------- */
+
+    for (i = 0 ; i < n ; i++)
+    {
+        ABS (abs_value, Udiag [i]) ;
+        if (SCALAR_IS_ZERO (abs_value))
+        {
+            Common->condest = 1 / abs_value ;
+            Common->status = KLU_SINGULAR ;
+            return (TRUE) ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* compute 1-norm (maximum column sum) of the matrix */
+    /* ---------------------------------------------------------------------- */
+
+    anorm =  0.0 ;
+    Aentry = (Entry *) Ax ;
+    for (i = 0 ; i < n ; i++)
+    {
+        pend = Ap [i + 1] ;
+        csum = 0.0 ;
+        for (j = Ap [i] ; j < pend ; j++)
+        {
+            ABS (abs_value, Aentry [j]) ;
+            csum += abs_value ;
+        }
+        if (csum > anorm)
+        {
+            anorm = csum ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* compute estimate of 1-norm of inv (A) */
+    /* ---------------------------------------------------------------------- */
+
+    /* get workspace (size 2*n Entry's) */
+    X = Numeric->Xwork ;            /* size n space used in KLU_solve, tsolve */
+    X += n ;                        /* X is size n */
+    S = X + n ;                     /* S is size n */
+
+    for (i = 0 ; i < n ; i++)
+    {
+        CLEAR (S [i]) ;
+        CLEAR (X [i]) ;
+        REAL (X [i]) = 1.0 / ((double) n) ;
+    }
+    jmax = 0 ;
+
+    ainv_norm = 0.0 ;
+    for (i = 0 ; i < 5 ; i++)
+    {
+        if (i > 0)
+        {
+            /* X [jmax] is the largest entry in X */
+            for (j = 0 ; j < n ; j++)
+            {
+                /* X [j] = 0 ;*/
+                CLEAR (X [j]) ;
+            }
+            REAL (X [jmax]) = 1 ;
+        }
+
+        KLU_solve (Symbolic, Numeric, n, 1, (double *) X, Common) ;
+        est_old = ainv_norm ;
+        ainv_norm = 0.0 ;
+
+        for (j = 0 ; j < n ; j++)
+        {
+            /* ainv_norm += ABS (X [j]) ;*/
+            ABS (abs_value, X [j]) ;
+            ainv_norm += abs_value ;
+        }
+
+#ifndef COMPLEX
+        unchanged = TRUE ;
+
+        for (j = 0 ; j < n ; j++)
+        {
+            double s = (X [j] >= 0) ? 1 : -1 ;
+            if (s != (Int) REAL (S [j]))
+            {
+                S [j] = s ;
+                unchanged = FALSE ;
+            }
+        }
+
+        if (i > 0 && (ainv_norm <= est_old || unchanged))
+        {
+            break ;
+        }
+#else
+        for (j = 0 ; j < n ; j++)
+        {
+            if (IS_NONZERO (X [j]))
+            {
+                ABS (abs_value, X [j]) ;
+                SCALE_DIV_ASSIGN (S [j], X [j], abs_value) ;
+            }
+            else
+            {
+                CLEAR (S [j]) ;
+                REAL (S [j]) = 1 ;
+            }
+        }
+
+        if (i > 0 && ainv_norm <= est_old)
+        {
+            break ;
+        }
+#endif
+
+        for (j = 0 ; j < n ; j++)
+        {
+            X [j] = S [j] ;
+        }
+
+#ifndef COMPLEX
+        /* do a transpose solve */
+        KLU_tsolve (Symbolic, Numeric, n, 1, X, Common) ;
+#else
+        /* do a conjugate transpose solve */
+        KLU_tsolve (Symbolic, Numeric, n, 1, (double *) X, 1, Common) ;
+#endif
+
+        /* jnew = the position of the largest entry in X */
+        jnew = 0 ;
+        Xmax = 0 ;
+        for (j = 0 ; j < n ; j++)
+        {
+            /* xj = ABS (X [j]) ;*/
+            ABS (xj, X [j]) ;
+            if (xj > Xmax)
+            {
+                Xmax = xj ;
+                jnew = j ;
+            }
+        }
+        if (i > 0 && jnew == jmax)
+        {
+            /* the position of the largest entry did not change
+             * from the previous iteration */
+            break ;
+        }
+        jmax = jnew ;
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* compute another estimate of norm(inv(A),1), and take the largest one */
+    /* ---------------------------------------------------------------------- */
+
+    for (j = 0 ; j < n ; j++)
+    {
+        CLEAR (X [j]) ;
+        if (j % 2)
+        {
+            REAL (X [j]) = 1 + ((double) j) / ((double) (n-1)) ;
+        }
+        else
+        {
+            REAL (X [j]) = -1 - ((double) j) / ((double) (n-1)) ;
+        }
+    }
+
+    KLU_solve (Symbolic, Numeric, n, 1, (double *) X, Common) ;
+
+    est_new = 0.0 ;
+    for (j = 0 ; j < n ; j++)
+    {
+        /* est_new += ABS (X [j]) ;*/
+        ABS (abs_value, X [j]) ;
+        est_new += abs_value ;
+    }
+    est_new = 2 * est_new / (3 * n) ;
+    ainv_norm = MAX (est_new, ainv_norm) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* compute estimate of condition number */
+    /* ---------------------------------------------------------------------- */
+
+    Common->condest = ainv_norm * anorm ;
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === KLU_flops ============================================================ */
+/* ========================================================================== */
+
+/* Compute the flop count for the LU factorization (in Common->flops) */
+
+Int KLU_flops           /* return TRUE if successful, FALSE otherwise */
+(
+    KLU_symbolic *Symbolic,
+    KLU_numeric *Numeric,
+    KLU_common *Common
+)
+{
+    double flops = 0 ;
+    Int *R, *Ui, *Uip, *Llen, *Ulen ;
+    Unit **LUbx ;
+    Unit *LU ;
+    Int k, ulen, p, n, nk, block, nblocks, k1 ;
+
+    /* ---------------------------------------------------------------------- */
+    /* check inputs */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+    Common->flops = EMPTY ;
+    if (Numeric == NULL || Symbolic == NULL)
+    {
+        Common->status = KLU_INVALID ;
+        return (FALSE) ;
+    }
+    Common->status = KLU_OK ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get the contents of the Symbolic object */
+    /* ---------------------------------------------------------------------- */
+
+    n = Symbolic->n ;
+    R = Symbolic->R ;
+    nblocks = Symbolic->nblocks ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get the contents of the Numeric object */
+    /* ---------------------------------------------------------------------- */
+
+    LUbx = (Unit **) Numeric->LUbx ;
+
+    /* ---------------------------------------------------------------------- */
+    /* compute the flop count */
+    /* ---------------------------------------------------------------------- */
+
+    for (block = 0 ; block < nblocks ; block++)
+    {
+        k1 = R [block] ;
+        nk = R [block+1] - k1 ;
+        if (nk > 1)
+        {
+            Llen = Numeric->Llen + k1 ;
+            Uip  = Numeric->Uip  + k1 ;
+            Ulen = Numeric->Ulen + k1 ;
+            LU = LUbx [block] ;
+            for (k = 0 ; k < nk ; k++)
+            {
+                /* compute kth column of U, and update kth column of A */
+                GET_I_POINTER (LU, Uip, Ui, k) ;
+                ulen = Ulen [k] ;
+                for (p = 0 ; p < ulen ; p++)
+                {
+                    flops += 2 * Llen [Ui [p]] ;
+                }
+                /* gather and divide by pivot to get kth column of L */
+                flops += Llen [k] ;
+            }
+        }
+    }
+    Common->flops = flops ;
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === KLU_rcond ============================================================ */
+/* ========================================================================== */
+
+/* Compute a really cheap estimate of the reciprocal of the condition number,
+ * condition number, min(abs(diag(U))) / max(abs(diag(U))).  If U has a zero
+ * pivot, or a NaN pivot, rcond will be zero.  Takes O(n) time.
+ */   
+
+Int KLU_rcond           /* return TRUE if successful, FALSE otherwise */
+(
+    KLU_symbolic *Symbolic,     /* input, not modified */
+    KLU_numeric *Numeric,       /* input, not modified */
+    KLU_common *Common          /* result in Common->rcond */
+)
+{
+    double ukk, umin = 0, umax = 0 ;
+    Entry *Udiag ;
+    Int j, n ;
+
+    /* ---------------------------------------------------------------------- */
+    /* check inputs */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+    if (Symbolic == NULL)
+    {
+        Common->status = KLU_INVALID ;
+        return (FALSE) ;
+    }
+    if (Numeric == NULL)
+    {
+        Common->rcond = 0 ;
+        Common->status = KLU_SINGULAR ;
+        return (TRUE) ;
+    }
+    Common->status = KLU_OK ;
+
+    /* ---------------------------------------------------------------------- */
+    /* compute rcond */
+    /* ---------------------------------------------------------------------- */
+
+    n = Symbolic->n ;
+    Udiag = Numeric->Udiag ;
+    for (j = 0 ; j < n ; j++)
+    {
+        /* get the magnitude of the pivot */
+        ABS (ukk, Udiag [j]) ;
+        if (SCALAR_IS_NAN (ukk) || SCALAR_IS_ZERO (ukk))
+        {
+            /* if NaN, or zero, the rcond is zero */
+            Common->rcond = 0 ;
+            Common->status = KLU_SINGULAR ;
+            return (TRUE) ;
+        }
+        if (j == 0)
+        {
+            /* first pivot entry */
+            umin = ukk ;
+            umax = ukk ;
+        }
+        else
+        {
+            /* subsequent pivots */
+            umin = MIN (umin, ukk) ;
+            umax = MAX (umax, ukk) ;
+        }
+    }
+
+    Common->rcond = umin / umax ;
+    if (SCALAR_IS_NAN (Common->rcond) || SCALAR_IS_ZERO (Common->rcond))
+    {
+        /* this can occur if umin or umax are Inf or NaN */
+        Common->rcond = 0 ;
+        Common->status = KLU_SINGULAR ;
+    }
+    return (TRUE) ;
+}
diff --git a/src/maths/KLU/klu_dump.c b/src/maths/KLU/klu_dump.c
new file mode 100644
index 000000000..63170bc4c
--- /dev/null
+++ b/src/maths/KLU/klu_dump.c
@@ -0,0 +1,142 @@
+/* ========================================================================== */
+/* === KLU_dump ============================================================= */
+/* ========================================================================== */
+
+/* Debug routines for klu.  Only used when NDEBUG is not defined at
+ * compile-time.
+ */
+
+#include "klu_internal.h"
+
+#ifndef NDEBUG
+
+/* ========================================================================== */
+/* === KLU_valid ============================================================ */
+/* ========================================================================== */
+
+/* Check if a column-form matrix is valid or not.  The matrix A is
+ * n-by-n.  The row indices of entries in column j are in
+ * Ai [Ap [j] ... Ap [j+1]-1].  Required conditions are:
+ *
+ *      n >= 0
+ *      nz = Ap [n_col] >= 0        number of entries in the matrix
+ *      Ap [0] == 0
+ *      Ap [j] <= Ap [j+1] for all j in the range 0 to n_col.
+ *      row indices in Ai [Ap [j] ... Ap [j+1]-1]
+ *          must be in the range 0 to n_row-1,
+ *          and no duplicate entries can exist (duplicates not checked here).
+ *
+ * Not user-callable.  Only used when debugging.
+ */
+
+Int KLU_valid (Int n, Int Ap [ ], Int Ai [ ], Entry Ax [ ])
+{
+    Int nz, j, p1, p2, i, p ;
+    PRINTF (("\ncolumn oriented matrix, n = %d\n", n)) ;
+    if (n <= 0)
+    {
+        PRINTF (("n must be >= 0: %d\n", n)) ;
+        return (FALSE) ;
+    }
+    nz = Ap [n] ;
+    if (Ap [0] != 0 || nz < 0)
+    {
+        /* column pointers must start at Ap [0] = 0, and Ap [n] must be >= 0 */
+        PRINTF (("column 0 pointer bad or nz < 0\n")) ;
+        return (FALSE) ;
+    }
+    for (j = 0 ; j < n ; j++)
+    {
+        p1 = Ap [j] ;
+        p2 = Ap [j+1] ;
+        PRINTF (("\nColumn: %d p1: %d p2: %d\n", j, p1, p2)) ;
+        if (p1 > p2)
+        {
+            /* column pointers must be ascending */
+            PRINTF (("column %d pointer bad\n", j)) ;
+            return (FALSE) ;
+        }
+        for (p = p1 ; p < p2 ; p++)
+        {
+            i = Ai [p] ;
+            PRINTF (("row: %d", i)) ;
+            if (i < 0 || i >= n)
+            {
+                /* row index out of range */
+                PRINTF (("index out of range, col %d row %d\n", j, i)) ;
+                return (FALSE) ;
+            }
+            if (Ax != (Entry *) NULL)
+            {
+                PRINT_ENTRY (Ax [p]) ;
+            }
+            PRINTF (("\n")) ;
+        }
+    }
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === KLU_valid_LU ========================================================= */
+/* ========================================================================== */
+
+/* This function does the same validity tests as KLU_valid but for the
+ * LU factor storage format. The flag flag_test_start_ptr is used to
+ * test if Xip [0] = 0. This is not applicable for U. So when calling this
+ * function for U, the flag should be set to false.  Only used when debugging.
+ */
+
+Int KLU_valid_LU (Int n, Int flag_test_start_ptr, Int Xip [ ],
+                   Int Xlen [ ],  Unit LU [ ])
+{
+    Int *Xi ;
+    Entry *Xx ;
+    Int j, p1, p2, i, p, len ;
+
+    PRINTF (("\ncolumn oriented matrix, n = %d\n", n)) ;
+    if (n <= 0)
+    {
+        PRINTF (("n must be >= 0: %d\n", n)) ;
+        return (FALSE) ;
+    }
+    if (flag_test_start_ptr && Xip [0] != 0)
+    {
+        /* column pointers must start at Xip [0] = 0*/
+        PRINTF (("column 0 pointer bad\n")) ;
+        return (FALSE) ;
+    }
+
+    for (j = 0 ; j < n ; j++)
+    {
+        p1 = Xip [j] ;
+        p2 = Xip [j+1] ;
+        PRINTF (("\nColumn: %d p1: %d p2: %d\n", j, p1, p2)) ;
+        if (p1 > p2)
+        {
+            /* column pointers must be ascending */
+            PRINTF (("column %d pointer bad\n", j)) ;
+            return (FALSE) ;
+        }
+        GET_POINTER (LU, Xip, Xlen, Xi, Xx, j, len) ;
+        for (p = 0 ; p < len ; p++)
+        {
+            i = Xi [p] ;
+            PRINTF (("row: %d", i)) ;
+            if (i < 0 || i >= n)
+            {
+                /* row index out of range */
+                PRINTF (("index out of range, col %d row %d\n", j, i)) ;
+                return (FALSE) ;
+            }
+            if (Xx != (Entry *) NULL)
+            {
+                PRINT_ENTRY (Xx [p]) ;
+            }
+            PRINTF (("\n")) ;
+        }
+    }
+
+    return (TRUE) ;
+}
+#endif
diff --git a/src/maths/KLU/klu_extract.c b/src/maths/KLU/klu_extract.c
new file mode 100644
index 000000000..b009828a7
--- /dev/null
+++ b/src/maths/KLU/klu_extract.c
@@ -0,0 +1,290 @@
+/* ========================================================================== */
+/* === KLU_extract ========================================================== */
+/* ========================================================================== */
+
+/* Extract KLU factorization into conventional compressed-column matrices.
+ * If any output array is NULL, that part of the LU factorization is not
+ * extracted (this is not an error condition).
+ *
+ * nnz(L) = Numeric->lnz, nnz(U) = Numeric->unz, and nnz(F) = Numeric->Offp [n]
+ */
+
+#include "klu_internal.h"
+
+Int KLU_extract     /* returns TRUE if successful, FALSE otherwise */
+(
+    /* inputs: */
+    KLU_numeric *Numeric,
+    KLU_symbolic *Symbolic,
+
+    /* outputs, all of which must be allocated on input */
+
+    /* L */
+    Int *Lp,        /* size n+1 */
+    Int *Li,        /* size nnz(L) */
+    double *Lx,     /* size nnz(L) */
+#ifdef COMPLEX
+    double *Lz,     /* size nnz(L) for the complex case, ignored if real */
+#endif
+
+    /* U */
+    Int *Up,        /* size n+1 */
+    Int *Ui,        /* size nnz(U) */
+    double *Ux,     /* size nnz(U) */
+#ifdef COMPLEX
+    double *Uz,     /* size nnz(U) for the complex case, ignored if real */
+#endif
+
+    /* F */
+    Int *Fp,        /* size n+1 */
+    Int *Fi,        /* size nnz(F) */
+    double *Fx,     /* size nnz(F) */
+#ifdef COMPLEX
+    double *Fz,     /* size nnz(F) for the complex case, ignored if real */
+#endif
+
+    /* P, row permutation */
+    Int *P,         /* size n */
+
+    /* Q, column permutation */
+    Int *Q,         /* size n */
+
+    /* Rs, scale factors */
+    double *Rs,     /* size n */
+
+    /* R, block boundaries */
+    Int *R,         /* size nblocks+1 */
+
+    KLU_common *Common
+)
+{
+    Int *Lip, *Llen, *Uip, *Ulen, *Li2, *Ui2 ;
+    Unit *LU ;
+    Entry *Lx2, *Ux2, *Ukk ;
+    Int i, k, block, nblocks, n, nz, k1, k2, nk, len, kk, p ;
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+
+    if (Symbolic == NULL || Numeric == NULL)
+    {
+        Common->status = KLU_INVALID ;
+        return (FALSE) ;
+    }
+
+    Common->status = KLU_OK ;
+    n = Symbolic->n ;
+    nblocks = Symbolic->nblocks ;
+
+    /* ---------------------------------------------------------------------- */
+    /* extract scale factors */
+    /* ---------------------------------------------------------------------- */
+
+    if (Rs != NULL)
+    {
+        if (Numeric->Rs != NULL)
+        {
+            for (i = 0 ; i < n ; i++)
+            {
+                Rs [i] = Numeric->Rs [i] ;
+            }
+        }
+        else
+        {
+            /* no scaling */
+            for (i = 0 ; i < n ; i++)
+            {
+                Rs [i] = 1 ;
+            }
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* extract block boundaries */
+    /* ---------------------------------------------------------------------- */
+
+    if (R != NULL)
+    {
+        for (block = 0 ; block <= nblocks ; block++)
+        {
+            R [block] = Symbolic->R [block] ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* extract final row permutation */
+    /* ---------------------------------------------------------------------- */
+
+    if (P != NULL)
+    {
+        for (k = 0 ; k < n ; k++)
+        {
+            P [k] = Numeric->Pnum [k] ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* extract column permutation */
+    /* ---------------------------------------------------------------------- */
+
+    if (Q != NULL)
+    {
+        for (k = 0 ; k < n ; k++)
+        {
+            Q [k] = Symbolic->Q [k] ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* extract each block of L */
+    /* ---------------------------------------------------------------------- */
+
+    if (Lp != NULL && Li != NULL && Lx != NULL
+#ifdef COMPLEX
+        && Lz != NULL
+#endif
+    )
+    {
+        nz = 0 ;
+        for (block = 0 ; block < nblocks ; block++)
+        {
+            k1 = Symbolic->R [block] ;
+            k2 = Symbolic->R [block+1] ;
+            nk = k2 - k1 ;
+            if (nk == 1)
+            {
+                /* singleton block */
+                Lp [k1] = nz ;
+                Li [nz] = k1 ;
+                Lx [nz] = 1 ;
+#ifdef COMPLEX
+                Lz [nz] = 0 ;
+#endif
+                nz++ ;
+            }
+            else
+            {
+                /* non-singleton block */
+                LU = Numeric->LUbx [block] ;
+                Lip = Numeric->Lip + k1 ;
+                Llen = Numeric->Llen + k1 ;
+                for (kk = 0 ; kk < nk ; kk++)
+                {
+                    Lp [k1+kk] = nz ;
+                    /* add the unit diagonal entry */
+                    Li [nz] = k1 + kk ;
+                    Lx [nz] = 1 ;
+#ifdef COMPLEX
+                    Lz [nz] = 0 ;
+#endif
+                    nz++ ;
+                    GET_POINTER (LU, Lip, Llen, Li2, Lx2, kk, len) ;
+                    for (p = 0 ; p < len ; p++)
+                    {
+                        Li [nz] = k1 + Li2 [p] ;
+                        Lx [nz] = REAL (Lx2 [p]) ;
+#ifdef COMPLEX
+                        Lz [nz] = IMAG (Lx2 [p]) ;
+#endif
+                        nz++ ;
+                    }
+                }
+            }
+        }
+        Lp [n] = nz ;
+        ASSERT (nz == Numeric->lnz) ;
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* extract each block of U */
+    /* ---------------------------------------------------------------------- */
+
+    if (Up != NULL && Ui != NULL && Ux != NULL
+#ifdef COMPLEX
+        && Uz != NULL
+#endif
+    )
+    {
+        nz = 0 ;
+        for (block = 0 ; block < nblocks ; block++)
+        {
+            k1 = Symbolic->R [block] ;
+            k2 = Symbolic->R [block+1] ;
+            nk = k2 - k1 ;
+            Ukk = ((Entry *) Numeric->Udiag) + k1 ;
+            if (nk == 1)
+            {
+                /* singleton block */
+                Up [k1] = nz ;
+                Ui [nz] = k1 ;
+                Ux [nz] = REAL (Ukk [0]) ;
+#ifdef COMPLEX
+                Uz [nz] = IMAG (Ukk [0]) ;
+#endif
+                nz++ ;
+            }
+            else
+            {
+                /* non-singleton block */
+                LU = Numeric->LUbx [block] ;
+                Uip = Numeric->Uip + k1 ;
+                Ulen = Numeric->Ulen + k1 ;
+                for (kk = 0 ; kk < nk ; kk++)
+                {
+                    Up [k1+kk] = nz ;
+                    GET_POINTER (LU, Uip, Ulen, Ui2, Ux2, kk, len) ;
+                    for (p = 0 ; p < len ; p++)
+                    {
+                        Ui [nz] = k1 + Ui2 [p] ;
+                        Ux [nz] = REAL (Ux2 [p]) ;
+#ifdef COMPLEX
+                        Uz [nz] = IMAG (Ux2 [p]) ;
+#endif
+                        nz++ ;
+                    }
+                    /* add the diagonal entry */
+                    Ui [nz] = k1 + kk ;
+                    Ux [nz] = REAL (Ukk [kk]) ;
+#ifdef COMPLEX
+                    Uz [nz] = IMAG (Ukk [kk]) ;
+#endif
+                    nz++ ;
+                }
+            }
+        }
+        Up [n] = nz ;
+        ASSERT (nz == Numeric->unz) ;
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* extract the off-diagonal blocks, F */
+    /* ---------------------------------------------------------------------- */
+
+    if (Fp != NULL && Fi != NULL && Fx != NULL
+#ifdef COMPLEX
+        && Fz != NULL
+#endif
+    )
+    {
+        for (k = 0 ; k <= n ; k++)
+        {
+            Fp [k] = Numeric->Offp [k] ;
+        }
+        nz = Fp [n] ;
+        for (k = 0 ; k < nz ; k++)
+        {
+            Fi [k] = Numeric->Offi [k] ;
+        }
+        for (k = 0 ; k < nz ; k++)
+        {
+            Fx [k] = REAL (((Entry *) Numeric->Offx) [k]) ;
+#ifdef COMPLEX
+            Fz [k] = IMAG (((Entry *) Numeric->Offx) [k]) ;
+#endif
+        }
+    }
+
+    return (TRUE) ;
+}
diff --git a/src/maths/KLU/klu_factor.c b/src/maths/KLU/klu_factor.c
new file mode 100644
index 000000000..50b52ea6d
--- /dev/null
+++ b/src/maths/KLU/klu_factor.c
@@ -0,0 +1,545 @@
+/* ========================================================================== */
+/* === KLU_factor =========================================================== */
+/* ========================================================================== */
+
+/* Factor the matrix, after ordering and analyzing it with KLU_analyze
+ * or KLU_analyze_given.
+ */
+
+#include "klu_internal.h"
+
+/* ========================================================================== */
+/* === KLU_factor2 ========================================================== */
+/* ========================================================================== */
+
+static void factor2
+(
+    /* inputs, not modified */
+    Int Ap [ ],         /* size n+1, column pointers */
+    Int Ai [ ],         /* size nz, row indices */
+    Entry Ax [ ],
+    KLU_symbolic *Symbolic,
+
+    /* inputs, modified on output: */
+    KLU_numeric *Numeric,
+    KLU_common *Common
+)
+{
+    double lsize ;
+    double *Lnz, *Rs ;
+    Int *P, *Q, *R, *Pnum, *Offp, *Offi, *Pblock, *Pinv, *Iwork,
+        *Lip, *Uip, *Llen, *Ulen ;
+    Entry *Offx, *X, s, *Udiag ;
+    Unit **LUbx ;
+    Int k1, k2, nk, k, block, oldcol, pend, oldrow, n, lnz, unz, p, newrow,
+        nblocks, poff, nzoff, lnz_block, unz_block, scale, max_lnz_block,
+        max_unz_block ;
+
+    /* ---------------------------------------------------------------------- */
+    /* initializations */
+    /* ---------------------------------------------------------------------- */
+
+    /* get the contents of the Symbolic object */
+    n = Symbolic->n ;
+    P = Symbolic->P ;
+    Q = Symbolic->Q ;
+    R = Symbolic->R ;
+    Lnz = Symbolic->Lnz ;
+    nblocks = Symbolic->nblocks ;
+    nzoff = Symbolic->nzoff ;
+
+    Pnum = Numeric->Pnum ;
+    Offp = Numeric->Offp ;
+    Offi = Numeric->Offi ;
+    Offx = (Entry *) Numeric->Offx ;
+
+    Lip = Numeric->Lip ;
+    Uip = Numeric->Uip ;
+    Llen = Numeric->Llen ;
+    Ulen = Numeric->Ulen ;
+    LUbx = (Unit **) Numeric->LUbx ;
+    Udiag = Numeric->Udiag ;
+
+    Rs = Numeric->Rs ;
+    Pinv = Numeric->Pinv ;
+    X = (Entry *) Numeric->Xwork ;              /* X is of size n */
+    Iwork = Numeric->Iwork ;                    /* 5*maxblock for KLU_factor */
+                                                /* 1*maxblock for Pblock */
+    Pblock = Iwork + 5*((size_t) Symbolic->maxblock) ;
+    Common->nrealloc = 0 ;
+    scale = Common->scale ;
+    max_lnz_block = 1 ;
+    max_unz_block = 1 ;
+
+    /* compute the inverse of P from symbolic analysis.  Will be updated to
+     * become the inverse of the numerical factorization when the factorization
+     * is done, for use in KLU_refactor */
+#ifndef NDEBUG
+    for (k = 0 ; k < n ; k++)
+    {
+        Pinv [k] = EMPTY ;
+    }
+#endif
+    for (k = 0 ; k < n ; k++)
+    {
+        ASSERT (P [k] >= 0 && P [k] < n) ;
+        Pinv [P [k]] = k ;
+    }
+#ifndef NDEBUG
+    for (k = 0 ; k < n ; k++) ASSERT (Pinv [k] != EMPTY) ;
+#endif
+
+    lnz = 0 ;
+    unz = 0 ;
+    Common->noffdiag = 0 ;
+    Offp [0] = 0 ;
+
+    /* ---------------------------------------------------------------------- */
+    /* optionally check input matrix and compute scale factors */
+    /* ---------------------------------------------------------------------- */
+
+    if (scale >= 0)
+    {
+        /* use Pnum as workspace. NOTE: scale factors are not yet permuted
+         * according to the final pivot row ordering, so Rs [oldrow] is the
+         * scale factor for A (oldrow,:), for the user's matrix A.  Pnum is
+         * used as workspace in KLU_scale.  When the factorization is done,
+         * the scale factors are permuted according to the final pivot row
+         * permutation, so that Rs [k] is the scale factor for the kth row of
+         * A(p,q) where p and q are the final row and column permutations. */
+        KLU_scale (scale, n, Ap, Ai, (double *) Ax, Rs, Pnum, Common) ;
+        if (Common->status < KLU_OK)
+        {
+            /* matrix is invalid */
+            return ;
+        }
+    }
+
+#ifndef NDEBUG
+    if (scale > 0)
+    {
+        for (k = 0 ; k < n ; k++) PRINTF (("Rs [%d] %g\n", k, Rs [k])) ;
+    }
+#endif
+
+    /* ---------------------------------------------------------------------- */
+    /* factor each block using klu */
+    /* ---------------------------------------------------------------------- */
+
+    for (block = 0 ; block < nblocks ; block++)
+    {
+
+        /* ------------------------------------------------------------------ */
+        /* the block is from rows/columns k1 to k2-1 */
+        /* ------------------------------------------------------------------ */
+
+        k1 = R [block] ;
+        k2 = R [block+1] ;
+        nk = k2 - k1 ;
+        PRINTF (("FACTOR BLOCK %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
+
+        if (nk == 1)
+        {
+
+            /* -------------------------------------------------------------- */
+            /* singleton case */
+            /* -------------------------------------------------------------- */
+
+            poff = Offp [k1] ;
+            oldcol = Q [k1] ;
+            pend = Ap [oldcol+1] ;
+            CLEAR (s) ;
+
+            if (scale <= 0)
+            {
+                /* no scaling */
+                for (p = Ap [oldcol] ; p < pend ; p++)
+                {
+                    oldrow = Ai [p] ;
+                    newrow = Pinv [oldrow] ;
+                    if (newrow < k1)
+                    {
+                        Offi [poff] = oldrow ;
+                        Offx [poff] = Ax [p] ;
+                        poff++ ;
+                    }
+                    else
+                    {
+                        ASSERT (newrow == k1) ;
+                        PRINTF (("singleton block %d", block)) ;
+                        PRINT_ENTRY (Ax [p]) ;
+                        s = Ax [p] ;
+                    }
+                }
+            }
+            else
+            {
+                /* row scaling.  NOTE: scale factors are not yet permuted
+                 * according to the pivot row permutation, so Rs [oldrow] is
+                 * used below.  When the factorization is done, the scale
+                 * factors are permuted, so that Rs [newrow] will be used in
+                 * klu_solve, klu_tsolve, and klu_rgrowth */
+                for (p = Ap [oldcol] ; p < pend ; p++)
+                {
+                    oldrow = Ai [p] ;
+                    newrow = Pinv [oldrow] ;
+                    if (newrow < k1)
+                    {
+                        Offi [poff] = oldrow ;
+                        /* Offx [poff] = Ax [p] / Rs [oldrow] ; */
+                        SCALE_DIV_ASSIGN (Offx [poff], Ax [p], Rs [oldrow]) ;
+                        poff++ ;
+                    }
+                    else
+                    {
+                        ASSERT (newrow == k1) ;
+                        PRINTF (("singleton block %d ", block)) ;
+                        PRINT_ENTRY (Ax[p]) ;
+                        SCALE_DIV_ASSIGN (s, Ax [p], Rs [oldrow]) ;
+                    }
+                }
+            }
+
+            Udiag [k1] = s ;
+
+            if (IS_ZERO (s))
+            {
+                /* singular singleton */
+                Common->status = KLU_SINGULAR ;
+                Common->numerical_rank = k1 ;
+                Common->singular_col = oldcol ;
+                if (Common->halt_if_singular)
+                {
+                    return ;
+                }
+            }
+
+            Offp [k1+1] = poff ;
+            Pnum [k1] = P [k1] ;
+            lnz++ ;
+            unz++ ;
+
+        }
+        else
+        {
+
+            /* -------------------------------------------------------------- */
+            /* construct and factorize the kth block */
+            /* -------------------------------------------------------------- */
+
+            if (Lnz [block] < 0)
+            {
+                /* COLAMD was used - no estimate of fill-in */
+                /* use 10 times the nnz in A, plus n */
+                lsize = -(Common->initmem) ;
+            }
+            else
+            {
+                lsize = Common->initmem_amd * Lnz [block] + nk ;
+            }
+
+            /* allocates 1 arrays: LUbx [block] */
+            Numeric->LUsize [block] = KLU_kernel_factor (nk, Ap, Ai, Ax, Q,
+                    lsize, &LUbx [block], Udiag + k1, Llen + k1, Ulen + k1,
+                    Lip + k1, Uip + k1, Pblock, &lnz_block, &unz_block,
+                    X, Iwork, k1, Pinv, Rs, Offp, Offi, Offx, Common) ;
+
+            if (Common->status < KLU_OK ||
+               (Common->status == KLU_SINGULAR && Common->halt_if_singular))
+            {
+                /* out of memory, invalid inputs, or singular */
+                return ;
+            }
+
+            PRINTF (("\n----------------------- L %d:\n", block)) ;
+            ASSERT (KLU_valid_LU (nk, TRUE, Lip+k1, Llen+k1, LUbx [block])) ;
+            PRINTF (("\n----------------------- U %d:\n", block)) ;
+            ASSERT (KLU_valid_LU (nk, FALSE, Uip+k1, Ulen+k1, LUbx [block])) ;
+
+            /* -------------------------------------------------------------- */
+            /* get statistics */
+            /* -------------------------------------------------------------- */
+
+            lnz += lnz_block ;
+            unz += unz_block ;
+            max_lnz_block = MAX (max_lnz_block, lnz_block) ;
+            max_unz_block = MAX (max_unz_block, unz_block) ;
+
+            if (Lnz [block] == EMPTY)
+            {
+                /* revise estimate for subsequent factorization */
+                Lnz [block] = MAX (lnz_block, unz_block) ;
+            }
+
+            /* -------------------------------------------------------------- */
+            /* combine the klu row ordering with the symbolic pre-ordering */
+            /* -------------------------------------------------------------- */
+
+            PRINTF (("Pnum, 1-based:\n")) ;
+            for (k = 0 ; k < nk ; k++)
+            {
+                ASSERT (k + k1 < n) ;
+                ASSERT (Pblock [k] + k1 < n) ;
+                Pnum [k + k1] = P [Pblock [k] + k1] ;
+                PRINTF (("Pnum (%d + %d + 1 = %d) = %d + 1 = %d\n",
+                    k, k1, k+k1+1, Pnum [k+k1], Pnum [k+k1]+1)) ;
+            }
+
+            /* the local pivot row permutation Pblock is no longer needed */
+        }
+    }
+    ASSERT (nzoff == Offp [n]) ;
+    PRINTF (("\n------------------- Off diagonal entries:\n")) ;
+    ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
+
+    Numeric->lnz = lnz ;
+    Numeric->unz = unz ;
+    Numeric->max_lnz_block = max_lnz_block ;
+    Numeric->max_unz_block = max_unz_block ;
+
+    /* compute the inverse of Pnum */
+#ifndef NDEBUG
+    for (k = 0 ; k < n ; k++)
+    {
+        Pinv [k] = EMPTY ;
+    }
+#endif
+    for (k = 0 ; k < n ; k++)
+    {
+        ASSERT (Pnum [k] >= 0 && Pnum [k] < n) ;
+        Pinv [Pnum [k]] = k ;
+    }
+#ifndef NDEBUG
+    for (k = 0 ; k < n ; k++) ASSERT (Pinv [k] != EMPTY) ;
+#endif
+
+    /* permute scale factors Rs according to pivotal row order */
+    if (scale > 0)
+    {
+        for (k = 0 ; k < n ; k++)
+        {
+            REAL (X [k]) = Rs [Pnum [k]] ;
+        }
+        for (k = 0 ; k < n ; k++)
+        {
+            Rs [k] = REAL (X [k]) ;
+        }
+    }
+
+    PRINTF (("\n------------------- Off diagonal entries, old:\n")) ;
+    ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
+
+    /* apply the pivot row permutations to the off-diagonal entries */
+    for (p = 0 ; p < nzoff ; p++)
+    {
+        ASSERT (Offi [p] >= 0 && Offi [p] < n) ;
+        Offi [p] = Pinv [Offi [p]] ;
+    }
+
+    PRINTF (("\n------------------- Off diagonal entries, new:\n")) ;
+    ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
+
+#ifndef NDEBUG
+    {
+        PRINTF (("\n ############# KLU_BTF_FACTOR done, nblocks %d\n",nblocks));
+        Entry ss, *Udiag = Numeric->Udiag ;
+        for (block = 0 ; block < nblocks && Common->status == KLU_OK ; block++)
+        {
+            k1 = R [block] ;
+            k2 = R [block+1] ;
+            nk = k2 - k1 ;
+            PRINTF (("\n======================KLU_factor output: k1 %d k2 %d nk %d\n",k1,k2,nk)) ;
+            if (nk == 1)
+            {
+                PRINTF (("singleton  ")) ;
+                /* ENTRY_PRINT (singleton [block]) ; */
+                ss = Udiag [k1] ;
+                PRINT_ENTRY (ss) ;
+            }
+            else
+            {
+                Int *Lip, *Uip, *Llen, *Ulen ;
+                Unit *LU ;
+                Lip = Numeric->Lip + k1 ;
+                Llen = Numeric->Llen + k1 ;
+                LU = (Unit *) Numeric->LUbx [block] ;
+                PRINTF (("\n---- L block %d\n", block));
+                ASSERT (KLU_valid_LU (nk, TRUE, Lip, Llen, LU)) ;
+                Uip = Numeric->Uip + k1 ;
+                Ulen = Numeric->Ulen + k1 ;
+                PRINTF (("\n---- U block %d\n", block)) ;
+                ASSERT (KLU_valid_LU (nk, FALSE, Uip, Ulen, LU)) ;
+            }
+        }
+    }
+#endif
+}
+
+
+
+/* ========================================================================== */
+/* === KLU_factor =========================================================== */
+/* ========================================================================== */
+
+KLU_numeric *KLU_factor         /* returns NULL if error, or a valid
+                                   KLU_numeric object if successful */
+(
+    /* --- inputs --- */
+    Int Ap [ ],         /* size n+1, column pointers */
+    Int Ai [ ],         /* size nz, row indices */
+    double Ax [ ],
+    KLU_symbolic *Symbolic,
+    /* -------------- */
+    KLU_common *Common
+)
+{
+    Int n, nzoff, nblocks, maxblock, k, ok = TRUE ;
+    Int *R ;
+    KLU_numeric *Numeric ;
+    size_t n1, nzoff1, s, b6, n3 ;
+
+    if (Common == NULL)
+    {
+        return (NULL) ;
+    }
+    Common->status = KLU_OK ;
+    Common->numerical_rank = EMPTY ;
+    Common->singular_col = EMPTY ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get the contents of the Symbolic object */
+    /* ---------------------------------------------------------------------- */
+
+    /* check for a valid Symbolic object */
+    if (Symbolic == NULL)
+    {
+        Common->status = KLU_INVALID ;
+        return (NULL) ;
+    }
+
+    n = Symbolic->n ;
+    nzoff = Symbolic->nzoff ;
+    nblocks = Symbolic->nblocks ;
+    maxblock = Symbolic->maxblock ;
+    R = Symbolic->R ;
+    PRINTF (("KLU_factor:  n %d nzoff %d nblocks %d maxblock %d\n",
+        n, nzoff, nblocks, maxblock)) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get control parameters and make sure they are in the proper range */
+    /* ---------------------------------------------------------------------- */
+
+    Common->initmem_amd = MAX (1.0, Common->initmem_amd) ;
+    Common->initmem = MAX (1.0, Common->initmem) ;
+    Common->tol = MIN (Common->tol, 1.0) ;
+    Common->tol = MAX (0.0, Common->tol) ;
+    Common->memgrow = MAX (1.0, Common->memgrow) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* allocate the Numeric object  */
+    /* ---------------------------------------------------------------------- */
+
+    /* this will not cause size_t overflow (already checked by KLU_symbolic) */
+    n1 = ((size_t) n) + 1 ;
+    nzoff1 = ((size_t) nzoff) + 1 ;
+
+    Numeric = KLU_malloc (sizeof (KLU_numeric), 1, Common) ;
+    if (Common->status < KLU_OK)
+    {
+        /* out of memory */
+        Common->status = KLU_OUT_OF_MEMORY ;
+        return (NULL) ;
+    }
+    Numeric->n = n ;
+    Numeric->nblocks = nblocks ;
+    Numeric->nzoff = nzoff ;
+    Numeric->Pnum = KLU_malloc (n, sizeof (Int), Common) ;
+    Numeric->Offp = KLU_malloc (n1, sizeof (Int), Common) ;
+    Numeric->Offi = KLU_malloc (nzoff1, sizeof (Int), Common) ;
+    Numeric->Offx = KLU_malloc (nzoff1, sizeof (Entry), Common) ;
+
+    Numeric->Lip  = KLU_malloc (n, sizeof (Int), Common) ;
+    Numeric->Uip  = KLU_malloc (n, sizeof (Int), Common) ;
+    Numeric->Llen = KLU_malloc (n, sizeof (Int), Common) ;
+    Numeric->Ulen = KLU_malloc (n, sizeof (Int), Common) ;
+
+    Numeric->LUsize = KLU_malloc (nblocks, sizeof (size_t), Common) ;
+
+    Numeric->LUbx = KLU_malloc (nblocks, sizeof (Unit *), Common) ;
+    if (Numeric->LUbx != NULL)
+    {
+        for (k = 0 ; k < nblocks ; k++)
+        {
+            Numeric->LUbx [k] = NULL ;
+        }
+    }
+
+    Numeric->Udiag = KLU_malloc (n, sizeof (Entry), Common) ;
+
+    if (Common->scale > 0)
+    {
+        Numeric->Rs = KLU_malloc (n, sizeof (double), Common) ;
+    }
+    else
+    {
+        /* no scaling */
+        Numeric->Rs = NULL ;
+    }
+
+    Numeric->Pinv = KLU_malloc (n, sizeof (Int), Common) ;
+
+    /* allocate permanent workspace for factorization and solve.  Note that the
+     * solver will use an Xwork of size 4n, whereas the factorization codes use
+     * an Xwork of size n and integer space (Iwork) of size 6n. KLU_condest
+     * uses an Xwork of size 2n.  Total size is:
+     *
+     *    n*sizeof(Entry) + max (6*maxblock*sizeof(Int), 3*n*sizeof(Entry))
+     */
+    s = KLU_mult_size_t (n, sizeof (Entry), &ok) ;
+    n3 = KLU_mult_size_t (n, 3 * sizeof (Entry), &ok) ;
+    b6 = KLU_mult_size_t (maxblock, 6 * sizeof (Int), &ok) ;
+    Numeric->worksize = KLU_add_size_t (s, MAX (n3, b6), &ok) ;
+    Numeric->Work = KLU_malloc (Numeric->worksize, 1, Common) ;
+    Numeric->Xwork = Numeric->Work ;
+    Numeric->Iwork = (Int *) ((Entry *) Numeric->Xwork + n) ;
+    if (!ok || Common->status < KLU_OK)
+    {
+        /* out of memory or problem too large */
+        Common->status = ok ? KLU_OUT_OF_MEMORY : KLU_TOO_LARGE ;
+        KLU_free_numeric (&Numeric, Common) ;
+        return (NULL) ;
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* factorize the blocks */
+    /* ---------------------------------------------------------------------- */
+
+    factor2 (Ap, Ai, (Entry *) Ax, Symbolic, Numeric, Common) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* return or free the Numeric object */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common->status < KLU_OK)
+    {
+        /* out of memory or inputs invalid */
+        KLU_free_numeric (&Numeric, Common) ;
+    }
+    else if (Common->status == KLU_SINGULAR)
+    {
+        if (Common->halt_if_singular)
+        {
+            /* Matrix is singular, and the Numeric object is only partially
+             * defined because we halted early.  This is the default case for
+             * a singular matrix. */
+            KLU_free_numeric (&Numeric, Common) ;
+        }
+    }
+    else if (Common->status == KLU_OK)
+    {
+        /* successful non-singular factorization */
+        Common->numerical_rank = n ;
+        Common->singular_col = n ;
+    }
+    return (Numeric) ;
+}
diff --git a/src/maths/KLU/klu_free_numeric.c b/src/maths/KLU/klu_free_numeric.c
new file mode 100644
index 000000000..cd4f3bdcd
--- /dev/null
+++ b/src/maths/KLU/klu_free_numeric.c
@@ -0,0 +1,71 @@
+/* ========================================================================== */
+/* === KLU_free_numeric ===================================================== */
+/* ========================================================================== */
+
+/* Free the KLU Numeric object. */
+
+#include "klu_internal.h"
+
+Int KLU_free_numeric
+(
+    KLU_numeric **NumericHandle,
+    KLU_common  *Common
+)
+{
+    KLU_numeric *Numeric ;
+    Unit **LUbx ;
+    size_t *LUsize ;
+    Int block, n, nzoff, nblocks ;
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+    if (NumericHandle == NULL || *NumericHandle == NULL)
+    {
+        return (TRUE) ;
+    }
+
+    Numeric = *NumericHandle ;
+
+    n = Numeric->n ;
+    nzoff = Numeric->nzoff ;
+    nblocks = Numeric->nblocks ;
+    LUsize = Numeric->LUsize ;
+
+    LUbx = (Unit **) Numeric->LUbx ;
+    if (LUbx != NULL)
+    {
+        for (block = 0 ; block < nblocks ; block++)
+        {
+            KLU_free (LUbx [block], LUsize ? LUsize [block] : 0,
+                sizeof (Unit), Common) ;
+        }
+    }
+
+    KLU_free (Numeric->Pnum, n, sizeof (Int), Common) ;
+    KLU_free (Numeric->Offp, n+1, sizeof (Int), Common) ;
+    KLU_free (Numeric->Offi, nzoff+1, sizeof (Int), Common) ;
+    KLU_free (Numeric->Offx, nzoff+1, sizeof (Entry), Common) ;
+
+    KLU_free (Numeric->Lip,  n, sizeof (Int), Common) ;
+    KLU_free (Numeric->Llen, n, sizeof (Int), Common) ;
+    KLU_free (Numeric->Uip,  n, sizeof (Int), Common) ;
+    KLU_free (Numeric->Ulen, n, sizeof (Int), Common) ;
+
+    KLU_free (Numeric->LUsize, nblocks, sizeof (size_t), Common) ;
+
+    KLU_free (Numeric->LUbx, nblocks, sizeof (Unit *), Common) ;
+
+    KLU_free (Numeric->Udiag, n, sizeof (Entry), Common) ;
+
+    KLU_free (Numeric->Rs,   n, sizeof (double), Common) ;
+    KLU_free (Numeric->Pinv, n, sizeof (Int), Common) ;
+
+    KLU_free (Numeric->Work, Numeric->worksize, 1, Common) ;
+
+    KLU_free (Numeric, 1, sizeof (KLU_numeric), Common) ;
+
+    *NumericHandle = NULL ;
+    return (TRUE) ;
+}
diff --git a/src/maths/KLU/klu_free_symbolic.c b/src/maths/KLU/klu_free_symbolic.c
new file mode 100644
index 000000000..20b4000e6
--- /dev/null
+++ b/src/maths/KLU/klu_free_symbolic.c
@@ -0,0 +1,34 @@
+/* ========================================================================== */
+/* === KLU_free_symbolic ==================================================== */
+/* ========================================================================== */
+
+/* Free the KLU Symbolic object. */
+
+#include "klu_internal.h"
+
+Int KLU_free_symbolic
+(
+    KLU_symbolic **SymbolicHandle,
+    KLU_common   *Common
+)
+{
+    KLU_symbolic *Symbolic ;
+    Int n ;
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+    if (SymbolicHandle == NULL || *SymbolicHandle == NULL)
+    {
+        return (TRUE) ;
+    }
+    Symbolic = *SymbolicHandle ;
+    n = Symbolic->n ;
+    KLU_free (Symbolic->P, n, sizeof (Int), Common) ;
+    KLU_free (Symbolic->Q, n, sizeof (Int), Common) ;
+    KLU_free (Symbolic->R, n+1, sizeof (Int), Common) ;
+    KLU_free (Symbolic->Lnz, n, sizeof (double), Common) ;
+    KLU_free (Symbolic, 1, sizeof (KLU_symbolic), Common) ;
+    *SymbolicHandle = NULL ;
+    return (TRUE) ;
+}
diff --git a/src/maths/KLU/klu_internal.h b/src/maths/KLU/klu_internal.h
new file mode 100644
index 000000000..662fa8ca2
--- /dev/null
+++ b/src/maths/KLU/klu_internal.h
@@ -0,0 +1,243 @@
+/* ========================================================================== */
+/* === KLU/Include/klu_internal.h =========================================== */
+/* ========================================================================== */
+
+/* For internal use in KLU routines only, not for user programs */
+
+#ifndef _KLU_INTERNAL_H
+#define _KLU_INTERNAL_H
+
+#include "klu.h" 
+#include "btf.h"
+#include "klu_version.h"
+
+/* ========================================================================== */
+/* make sure debugging and printing is turned off */
+
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+#ifndef NPRINT
+#define NPRINT
+#endif
+
+/* To enable debugging and assertions, uncomment this line:
+ #undef NDEBUG
+ */
+
+/* To enable diagnostic printing, uncomment this line:
+ #undef NPRINT
+ */
+
+/* ========================================================================== */
+
+#include <stdio.h>
+#include <assert.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <math.h>
+
+#undef ASSERT
+#ifndef NDEBUG
+#define ASSERT(a) assert(a)
+#else
+#define ASSERT(a)
+#endif
+
+#define SCALAR_IS_NAN(x) ((x) != (x))
+
+/* true if an integer (stored in double x) would overflow (or if x is NaN) */
+#define INT_OVERFLOW(x) ((!((x) * (1.0+1e-8) <= (double) INT_MAX)) \
+                        || SCALAR_IS_NAN (x))
+
+#undef TRUE
+#undef FALSE
+#undef MAX
+#undef MIN
+#undef PRINTF
+#undef FLIP
+
+#ifndef NPRINT
+#define PRINTF(s) { printf s ; } ;
+#else
+#define PRINTF(s)
+#endif
+
+#define TRUE 1
+#define FALSE 0
+#define MAX(a,b) (((a) > (b)) ?  (a) : (b))
+#define MIN(a,b) (((a) < (b)) ?  (a) : (b))
+
+/* FLIP is a "negation about -1", and is used to mark an integer i that is
+ * normally non-negative.  FLIP (EMPTY) is EMPTY.  FLIP of a number > EMPTY
+ * is negative, and FLIP of a number < EMTPY is positive.  FLIP (FLIP (i)) = i
+ * for all integers i.  UNFLIP (i) is >= EMPTY. */
+#define EMPTY (-1)
+#define FLIP(i) (-(i)-2)
+#define UNFLIP(i) (((i) < EMPTY) ? FLIP (i) : (i))
+
+
+size_t KLU_kernel   /* final size of LU on output */
+(
+    /* input, not modified */
+    Int n,              /* A is n-by-n */
+    Int Ap [ ],         /* size n+1, column pointers for A */
+    Int Ai [ ],         /* size nz = Ap [n], row indices for A */
+    Entry Ax [ ],       /* size nz, values of A */
+    Int Q [ ],          /* size n, optional input permutation */
+    size_t lusize,      /* initial size of LU */
+
+    /* output, not defined on input */
+    Int Pinv [ ],       /* size n */
+    Int P [ ],          /* size n */
+    Unit **p_LU,        /* size lusize on input, size Uxp[n] on output*/
+    Entry Udiag [ ],    /* size n, diagonal of U */
+    Int Llen [ ],       /* size n, column length of L */
+    Int Ulen [ ],       /* size n, column length of U */
+    Int Lip [ ],        /* size n+1 */
+    Int Uip [ ],        /* size n+1 */
+    Int *lnz,           /* size of L */
+    Int *unz,           /* size of U */
+
+    /* workspace, not defined on input */
+    Entry X [ ],   /* size n, zero on output */
+
+    /* workspace, not defined on input or output */
+    Int Stack [ ],  /* size n */
+    Int Flag [ ],   /* size n */
+    Int adj_pos [ ],    /* size n */
+    
+    /* workspace for pruning only */
+    Int Lpend [ ],      /* size n workspace */
+
+    /* inputs, not modified on output */
+    Int k1,             /* the block of A is from k1 to k2-1 */
+    Int PSinv [ ],      /* inverse of P from symbolic factorization */
+    double Rs [ ],      /* scale factors for A */
+
+    /* inputs, modified on output */
+    Int Offp [ ],   /* off-diagonal matrix (modified by this routine) */
+    Int Offi [ ],
+    Entry Offx [ ],
+    KLU_common *Common  /* the control input/output structure */
+) ;
+
+
+size_t KLU_kernel_factor            /* 0 if failure, size of LU if OK */
+(
+    /* inputs, not modified */
+    Int n,          /* A is n-by-n. n must be > 0. */
+    Int Ap [ ],     /* size n+1, column pointers for A */
+    Int Ai [ ],     /* size nz = Ap [n], row indices for A */
+    Entry Ax [ ],   /* size nz, values of A */
+    Int Q [ ],      /* size n, optional column permutation */
+    double Lsize,   /* initial size of L and U */
+
+    /* outputs, not defined on input */
+    Unit **p_LU,        /* row indices and values of L and U */
+    Entry Udiag [ ],    /* size n, diagonal of U */
+    Int Llen [ ],       /* size n, column length of L */
+    Int Ulen [ ],       /* size n, column length of U */
+    Int Lip [ ],        /* size n+1, column pointers of L */
+    Int Uip [ ],        /* size n+1, column pointers of U */
+    Int P [ ],          /* row permutation, size n */
+    Int *lnz,           /* size of L */
+    Int *unz,           /* size of U */
+
+    /* workspace, undefined on input */
+    Entry *X,       /* size n entries.  Zero on output */
+    Int *Work,      /* size 5n Int's */
+
+    /* inputs, not modified on output */
+    Int k1,             /* the block of A is from k1 to k2-1 */
+    Int PSinv [ ],      /* inverse of P from symbolic factorization */
+    double Rs [ ],      /* scale factors for A */
+
+    /* inputs, modified on output */
+    Int Offp [ ],   /* off-diagonal matrix (modified by this routine) */
+    Int Offi [ ],
+    Entry Offx [ ],
+    KLU_common *Common  /* the control input/output structure */
+) ;
+
+void KLU_lsolve
+(
+    /* inputs, not modified: */
+    Int n,
+    Int Lp [ ],
+    Int Li [ ],
+    Unit LU [ ],
+    Int nrhs,
+    /* right-hand-side on input, solution to Lx=b on output */
+    Entry X [ ]
+) ;
+
+void KLU_ltsolve
+(
+    /* inputs, not modified: */
+    Int n,
+    Int Lp [ ],
+    Int Li [ ],
+    Unit LU [ ],
+    Int nrhs,
+#ifdef COMPLEX
+    Int conj_solve,
+#endif
+    /* right-hand-side on input, solution to L'x=b on output */
+    Entry X [ ]
+) ;
+
+
+void KLU_usolve
+(
+    /* inputs, not modified: */
+    Int n,
+    Int Up [ ],
+    Int Ui [ ],
+    Unit LU [ ],
+    Entry Udiag [ ],
+    Int nrhs,
+    /* right-hand-side on input, solution to Ux=b on output */
+    Entry X [ ]
+) ;
+
+void KLU_utsolve
+(
+    /* inputs, not modified: */
+    Int n,
+    Int Up [ ],
+    Int Ui [ ],
+    Unit LU [ ],
+    Entry Udiag [ ],
+    Int nrhs,
+#ifdef COMPLEX
+    Int conj_solve,
+#endif
+    /* right-hand-side on input, solution to U'x=b on output */
+    Entry X [ ]
+) ;
+
+Int KLU_valid 
+(
+    Int n, 
+    Int Ap [ ], 
+    Int Ai [ ], 
+    Entry Ax [ ]
+) ;
+
+Int KLU_valid_LU 
+(
+    Int n, 
+    Int flag_test_start_ptr, 
+    Int Xip [ ],
+    Int Xlen [ ],  
+    Unit LU [ ]
+);
+
+size_t KLU_add_size_t (size_t a, size_t b, Int *ok) ;
+
+size_t KLU_mult_size_t (size_t a, size_t k, Int *ok) ;
+
+KLU_symbolic *KLU_alloc_symbolic (Int n, Int *Ap, Int *Ai, KLU_common *Common) ;
+
+#endif
diff --git a/src/maths/KLU/klu_kernel.c b/src/maths/KLU/klu_kernel.c
new file mode 100644
index 000000000..bfdbe80fe
--- /dev/null
+++ b/src/maths/KLU/klu_kernel.c
@@ -0,0 +1,1009 @@
+/* ========================================================================== */
+/* === KLU_kernel =========================================================== */
+/* ========================================================================== */
+
+/* Sparse left-looking LU factorization, with partial pivoting.  Based on
+ * Gilbert & Peierl's method, with a non-recursive DFS and with Eisenstat &
+ * Liu's symmetric pruning.  No user-callable routines are in this file.
+ */
+
+#include "klu_internal.h"
+
+/* ========================================================================== */
+/* === dfs ================================================================== */
+/* ========================================================================== */
+
+/* Does a depth-first-search, starting at node j. */
+
+static Int dfs
+(
+    /* input, not modified on output: */
+    Int j,              /* node at which to start the DFS */
+    Int k,              /* mark value, for the Flag array */
+    Int Pinv [ ],       /* Pinv [i] = k if row i is kth pivot row, or EMPTY if
+                         * row i is not yet pivotal.  */
+    Int Llen [ ],       /* size n, Llen [k] = # nonzeros in column k of L */
+    Int Lip [ ],        /* size n, Lip [k] is position in LU of column k of L */
+
+    /* workspace, not defined on input or output */
+    Int Stack [ ],      /* size n */
+
+    /* input/output: */
+    Int Flag [ ],       /* Flag [i] == k means i is marked */
+    Int Lpend [ ],      /* for symmetric pruning */
+    Int top,            /* top of stack on input*/
+    Unit LU [],
+    Int *Lik,           /* Li row index array of the kth column */
+    Int *plength,
+
+    /* other, not defined on input or output */
+    Int Ap_pos [ ]      /* keeps track of position in adj list during DFS */
+)
+{
+    Int i, pos, jnew, head, l_length ;
+    Int *Li ;
+
+    l_length = *plength ;
+
+    head = 0 ;
+    Stack [0] = j ;
+    ASSERT (Flag [j] != k) ;
+
+    while (head >= 0)
+    {
+        j = Stack [head] ;
+        jnew = Pinv [j] ;
+        ASSERT (jnew >= 0 && jnew < k) ;        /* j is pivotal */
+
+        if (Flag [j] != k)          /* a node is not yet visited */
+        {
+            /* first time that j has been visited */
+            Flag [j] = k ;
+            PRINTF (("[ start dfs at %d : new %d\n", j, jnew)) ;
+            /* set Ap_pos [head] to one past the last entry in col j to scan */
+            Ap_pos [head] =
+                (Lpend [jnew] == EMPTY) ?  Llen [jnew] : Lpend [jnew] ;
+        }
+
+        /* add the adjacent nodes to the recursive stack by iterating through
+         * until finding another non-visited pivotal node */
+        Li = (Int *) (LU + Lip [jnew]) ;
+        for (pos = --Ap_pos [head] ; pos >= 0 ; --pos)
+        {
+            i = Li [pos] ;
+            if (Flag [i] != k)
+            {
+                /* node i is not yet visited */
+                if (Pinv [i] >= 0)
+                {
+                    /* keep track of where we left off in the scan of the
+                     * adjacency list of node j so we can restart j where we
+                     * left off. */
+                    Ap_pos [head] = pos ;
+
+                    /* node i is pivotal; push it onto the recursive stack
+                     * and immediately break so we can recurse on node i. */
+                    Stack [++head] = i ;
+                    break ;
+                }
+                else
+                {
+                    /* node i is not pivotal (no outgoing edges). */
+                    /* Flag as visited and store directly into L,
+                     * and continue with current node j. */
+                    Flag [i] = k ;
+                    Lik [l_length] = i ;
+                    l_length++ ;
+                }
+            }
+        }
+
+        if (pos == -1)
+        {
+            /* if all adjacent nodes of j are already visited, pop j from
+             * recursive stack and push j onto output stack */
+            head-- ;
+            Stack[--top] = j ;
+            PRINTF (("  end   dfs at %d ] head : %d\n", j, head)) ;
+        }
+    }
+
+    *plength = l_length ;
+    return (top) ;
+}
+
+
+/* ========================================================================== */
+/* === lsolve_symbolic ====================================================== */
+/* ========================================================================== */
+
+/* Finds the pattern of x, for the solution of Lx=b */
+
+static Int lsolve_symbolic
+(
+    /* input, not modified on output: */
+    Int n,              /* L is n-by-n, where n >= 0 */
+    Int k,              /* also used as the mark value, for the Flag array */
+    Int Ap [ ],
+    Int Ai [ ],
+    Int Q [ ],
+    Int Pinv [ ],       /* Pinv [i] = k if i is kth pivot row, or EMPTY if row i
+                         * is not yet pivotal.  */
+
+    /* workspace, not defined on input or output */
+    Int Stack [ ],      /* size n */
+
+    /* workspace, defined on input and output */
+    Int Flag [ ],       /* size n.  Initially, all of Flag [0..n-1] < k.  After
+                         * lsolve_symbolic is done, Flag [i] == k if i is in
+                         * the pattern of the output, and Flag [0..n-1] <= k. */
+
+    /* other */
+    Int Lpend [ ],      /* for symmetric pruning */
+    Int Ap_pos [ ],     /* workspace used in dfs */
+
+    Unit LU [ ],        /* LU factors (pattern and values) */
+    Int lup,            /* pointer to free space in LU */
+    Int Llen [ ],       /* size n, Llen [k] = # nonzeros in column k of L */
+    Int Lip [ ],        /* size n, Lip [k] is position in LU of column k of L */
+
+    /* ---- the following are only used in the BTF case --- */
+
+    Int k1,             /* the block of A is from k1 to k2-1 */
+    Int PSinv [ ]       /* inverse of P from symbolic factorization */
+)
+{
+    Int *Lik ;
+    Int i, p, pend, oldcol, kglobal, top, l_length ;
+
+    top = n ;
+    l_length = 0 ;
+    Lik = (Int *) (LU + lup);
+
+    /* ---------------------------------------------------------------------- */
+    /* BTF factorization of A (k1:k2-1, k1:k2-1) */
+    /* ---------------------------------------------------------------------- */
+
+    kglobal = k + k1 ;  /* column k of the block is col kglobal of A */
+    oldcol = Q [kglobal] ;      /* Q must be present for BTF case */
+    pend = Ap [oldcol+1] ;
+    for (p = Ap [oldcol] ; p < pend ; p++)
+    {
+        i = PSinv [Ai [p]] - k1 ;
+        if (i < 0) continue ;   /* skip entry outside the block */
+
+        /* (i,k) is an entry in the block.  start a DFS at node i */
+        PRINTF (("\n ===== DFS at node %d in b, inew: %d\n", i, Pinv [i])) ;
+        if (Flag [i] != k)
+        {
+            if (Pinv [i] >= 0)
+            {
+                top = dfs (i, k, Pinv, Llen, Lip, Stack, Flag,
+                           Lpend, top, LU, Lik, &l_length, Ap_pos) ;
+            }
+            else
+            {
+                /* i is not pivotal, and not flagged. Flag and put in L */
+                Flag [i] = k ;
+                Lik [l_length] = i ;
+                l_length++;
+            }
+        }
+    }
+
+    /* If Llen [k] is zero, the matrix is structurally singular */
+    Llen [k] = l_length ;
+    return (top) ;
+}
+
+
+/* ========================================================================== */
+/* === construct_column ===================================================== */
+/* ========================================================================== */
+
+/* Construct the kth column of A, and the off-diagonal part, if requested.
+ * Scatter the numerical values into the workspace X, and construct the
+ * corresponding column of the off-diagonal matrix. */
+
+static void construct_column
+(
+    /* inputs, not modified on output */
+    Int k,          /* the column of A (or the column of the block) to get */
+    Int Ap [ ],
+    Int Ai [ ],
+    Entry Ax [ ],
+    Int Q [ ],      /* column pre-ordering */
+
+    /* zero on input, modified on output */
+    Entry X [ ],
+
+    /* ---- the following are only used in the BTF case --- */
+
+    /* inputs, not modified on output */
+    Int k1,         /* the block of A is from k1 to k2-1 */
+    Int PSinv [ ],  /* inverse of P from symbolic factorization */
+    double Rs [ ],  /* scale factors for A */
+    Int scale,      /* 0: no scaling, nonzero: scale the rows with Rs */
+
+    /* inputs, modified on output */
+    Int Offp [ ],   /* off-diagonal matrix (modified by this routine) */
+    Int Offi [ ],
+    Entry Offx [ ]
+)
+{
+    Entry aik ;
+    Int i, p, pend, oldcol, kglobal, poff, oldrow ;
+
+    /* ---------------------------------------------------------------------- */
+    /* Scale and scatter the column into X. */
+    /* ---------------------------------------------------------------------- */
+
+    kglobal = k + k1 ;          /* column k of the block is col kglobal of A */
+    poff = Offp [kglobal] ;     /* start of off-diagonal column */
+    oldcol = Q [kglobal] ;
+    pend = Ap [oldcol+1] ;
+
+    if (scale <= 0)
+    {
+        /* no scaling */
+        for (p = Ap [oldcol] ; p < pend ; p++)
+        {
+            oldrow = Ai [p] ;
+            i = PSinv [oldrow] - k1 ;
+            aik = Ax [p] ;
+            if (i < 0)
+            {
+                /* this is an entry in the off-diagonal part */
+                Offi [poff] = oldrow ;
+                Offx [poff] = aik ;
+                poff++ ;
+            }
+            else
+            {
+                /* (i,k) is an entry in the block.  scatter into X */
+                X [i] = aik ;
+            }
+        }
+    }
+    else
+    {
+        /* row scaling */
+        for (p = Ap [oldcol] ; p < pend ; p++)
+        {
+            oldrow = Ai [p] ;
+            i = PSinv [oldrow] - k1 ;
+            aik = Ax [p] ;
+            SCALE_DIV (aik, Rs [oldrow]) ;
+            if (i < 0)
+            {
+                /* this is an entry in the off-diagonal part */
+                Offi [poff] = oldrow ;
+                Offx [poff] = aik ;
+                poff++ ;
+            }
+            else
+            {
+                /* (i,k) is an entry in the block.  scatter into X */
+                X [i] = aik ;
+            }
+        }
+    }
+
+    Offp [kglobal+1] = poff ;   /* start of the next col of off-diag part */
+}
+
+
+/* ========================================================================== */
+/* === lsolve_numeric ======================================================= */
+/* ========================================================================== */
+
+/* Computes the numerical values of x, for the solution of Lx=b.  Note that x
+ * may include explicit zeros if numerical cancelation occurs.  L is assumed
+ * to be unit-diagonal, with possibly unsorted columns (but the first entry in
+ * the column must always be the diagonal entry). */
+
+static void lsolve_numeric
+(
+    /* input, not modified on output: */
+    Int Pinv [ ],       /* Pinv [i] = k if i is kth pivot row, or EMPTY if row i
+                         * is not yet pivotal.  */
+    Unit *LU,           /* LU factors (pattern and values) */
+    Int Stack [ ],      /* stack for dfs */
+    Int Lip [ ],        /* size n, Lip [k] is position in LU of column k of L */
+    Int top,            /* top of stack on input */
+    Int n,              /* A is n-by-n */
+    Int Llen [ ],       /* size n, Llen [k] = # nonzeros in column k of L */
+
+    /* output, must be zero on input: */
+    Entry X [ ] /* size n, initially zero.  On output,
+                 * X [Ui [up1..up-1]] and X [Li [lp1..lp-1]]
+                 * contains the solution. */
+
+)
+{
+    Entry xj ;
+    Entry *Lx ;
+    Int *Li ;
+    Int p, s, j, jnew, len ;
+
+    /* solve Lx=b */
+    for (s = top ; s < n ; s++)
+    {
+        /* forward solve with column j of L */
+        j = Stack [s] ;
+        jnew = Pinv [j] ;
+        ASSERT (jnew >= 0) ;
+        xj = X [j] ;
+        GET_POINTER (LU, Lip, Llen, Li, Lx, jnew, len) ;
+        ASSERT (Lip [jnew] <= Lip [jnew+1]) ;
+        for (p = 0 ; p < len ; p++)
+        {
+            /*X [Li [p]] -= Lx [p] * xj ; */
+            MULT_SUB (X [Li [p]], Lx [p], xj) ;
+        }
+    }
+}
+
+
+/* ========================================================================== */
+/* === lpivot =============================================================== */
+/* ========================================================================== */
+
+/* Find a pivot via partial pivoting, and scale the column of L. */
+
+static Int lpivot
+(
+    Int diagrow,
+    Int *p_pivrow,
+    Entry *p_pivot,
+    double *p_abs_pivot,
+    double tol,
+    Entry X [ ],
+    Unit *LU,           /* LU factors (pattern and values) */
+    Int Lip [ ],
+    Int Llen [ ],
+    Int k,
+    Int n,
+
+    Int Pinv [ ],       /* Pinv [i] = k if row i is kth pivot row, or EMPTY if
+                         * row i is not yet pivotal.  */
+
+    Int *p_firstrow,
+    KLU_common *Common
+)
+{
+    Entry x, pivot, *Lx ;
+    double abs_pivot, xabs ;
+    Int p, i, ppivrow, pdiag, pivrow, *Li, last_row_index, firstrow, len ;
+
+    pivrow = EMPTY ;
+    if (Llen [k] == 0)
+    {
+        /* matrix is structurally singular */
+        if (Common->halt_if_singular)
+        {
+            return (FALSE) ;
+        }
+        for (firstrow = *p_firstrow ; firstrow < n ; firstrow++)
+        {
+            PRINTF (("check %d\n", firstrow)) ;
+            if (Pinv [firstrow] < 0)
+            {
+                /* found the lowest-numbered non-pivotal row.  Pick it. */
+                pivrow = firstrow ;
+                PRINTF (("Got pivotal row: %d\n", pivrow)) ;
+                break ;
+            }
+        }
+        ASSERT (pivrow >= 0 && pivrow < n) ;
+        CLEAR (pivot) ;
+        *p_pivrow = pivrow ;
+        *p_pivot = pivot ;
+        *p_abs_pivot = 0 ;
+        *p_firstrow = firstrow ;
+        return (FALSE) ;
+    }
+
+    pdiag = EMPTY ;
+    ppivrow = EMPTY ;
+    abs_pivot = EMPTY ;
+    i = Llen [k] - 1 ;
+    GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+    last_row_index = Li [i] ;
+
+    /* decrement the length by 1 */
+    Llen [k] = i ;
+    GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+
+    /* look in Li [0 ..Llen [k] - 1 ] for a pivot row */
+    for (p = 0 ; p < len ; p++)
+    {
+        /* gather the entry from X and store in L */
+        i = Li [p] ;
+        x = X [i] ;
+        CLEAR (X [i]) ;
+
+        Lx [p] = x ;
+        /* xabs = ABS (x) ; */
+        ABS (xabs, x) ;
+
+        /* find the diagonal */
+        if (i == diagrow)
+        {
+            pdiag = p ;
+        }
+
+        /* find the partial-pivoting choice */
+        if (xabs > abs_pivot)
+        {
+            abs_pivot = xabs ;
+            ppivrow = p ;
+        }
+    }
+
+    /* xabs = ABS (X [last_row_index]) ;*/
+    ABS (xabs, X [last_row_index]) ;
+    if (xabs > abs_pivot)
+    {
+        abs_pivot = xabs ;
+        ppivrow = EMPTY ;
+    }
+
+    /* compare the diagonal with the largest entry */
+    if (last_row_index == diagrow)
+    {
+        if (xabs >= tol * abs_pivot)
+        {
+            abs_pivot = xabs ;
+            ppivrow = EMPTY ;
+        }
+    }
+    else if (pdiag != EMPTY)
+    {
+        /* xabs = ABS (Lx [pdiag]) ;*/
+        ABS (xabs, Lx [pdiag]) ;
+        if (xabs >= tol * abs_pivot)
+        {
+            /* the diagonal is large enough */
+            abs_pivot = xabs ;
+            ppivrow = pdiag ;
+        }
+    }
+
+    if (ppivrow != EMPTY)
+    {
+        pivrow = Li [ppivrow] ;
+        pivot  = Lx [ppivrow] ;
+        /* overwrite the ppivrow values with last index values */
+        Li [ppivrow] = last_row_index ;
+        Lx [ppivrow] = X [last_row_index] ;
+    }
+    else
+    {
+        pivrow = last_row_index ;
+        pivot = X [last_row_index] ;
+    }
+    CLEAR (X [last_row_index]) ;
+
+    *p_pivrow = pivrow ;
+    *p_pivot = pivot ;
+    *p_abs_pivot = abs_pivot ;
+    ASSERT (pivrow >= 0 && pivrow < n) ;
+
+    if (IS_ZERO (pivot) && Common->halt_if_singular)
+    {
+        /* numerically singular case */
+        return (FALSE) ;
+    }
+
+    /* divide L by the pivot value */
+    for (p = 0 ; p < Llen [k] ; p++)
+    {
+        /* Lx [p] /= pivot ; */
+        DIV (Lx [p], Lx [p], pivot) ;
+    }
+
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === prune ================================================================ */
+/* ========================================================================== */
+
+/* Prune the columns of L to reduce work in subsequent depth-first searches */
+static void prune
+(
+    /* input/output: */
+    Int Lpend [ ],      /* Lpend [j] marks symmetric pruning point for L(:,j) */
+
+    /* input: */
+    Int Pinv [ ],       /* Pinv [i] = k if row i is kth pivot row, or EMPTY if
+                         * row i is not yet pivotal.  */
+    Int k,              /* prune using column k of U */
+    Int pivrow,         /* current pivot row */
+
+    /* input/output: */
+    Unit *LU,           /* LU factors (pattern and values) */
+
+    /* input */
+    Int Uip [ ],        /* size n, column pointers for U */
+    Int Lip [ ],        /* size n, column pointers for L */
+    Int Ulen [ ],       /* size n, column length of U */
+    Int Llen [ ]        /* size n, column length of L */
+)
+{
+    Entry x ;
+    Entry *Lx, *Ux ;
+    Int *Li, *Ui ;
+    Int p, i, j, p2, phead, ptail, llen, ulen ;
+
+    /* check to see if any column of L can be pruned */
+    GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, ulen) ;
+    for (p = 0 ; p < ulen ; p++)
+    {
+        j = Ui [p] ;
+        ASSERT (j < k) ;
+        PRINTF (("%d is pruned: %d. Lpend[j] %d Lip[j+1] %d\n",
+            j, Lpend [j] != EMPTY, Lpend [j], Lip [j+1])) ;
+        if (Lpend [j] == EMPTY)
+        {
+            /* scan column j of L for the pivot row */
+            GET_POINTER (LU, Lip, Llen, Li, Lx, j, llen) ;
+            for (p2 = 0 ; p2 < llen ; p2++)
+            {
+                if (pivrow == Li [p2])
+                {
+                    /* found it!  This column can be pruned */
+#ifndef NDEBUG
+                    PRINTF (("==== PRUNE: col j %d of L\n", j)) ;
+                    {
+                        Int p3 ;
+                        for (p3 = 0 ; p3 < Llen [j] ; p3++)
+                        {
+                            PRINTF (("before: %i  pivotal: %d\n", Li [p3],
+                                        Pinv [Li [p3]] >= 0)) ;
+                        }
+                    }
+#endif
+
+                    /* partition column j of L.  The unit diagonal of L
+                     * is not stored in the column of L. */
+                    phead = 0 ;
+                    ptail = Llen [j] ;
+                    while (phead < ptail)
+                    {
+                        i = Li [phead] ;
+                        if (Pinv [i] >= 0)
+                        {
+                            /* leave at the head */
+                            phead++ ;
+                        }
+                        else
+                        {
+                            /* swap with the tail */
+                            ptail-- ;
+                            Li [phead] = Li [ptail] ;
+                            Li [ptail] = i ;
+                            x = Lx [phead] ;
+                            Lx [phead] = Lx [ptail] ;
+                            Lx [ptail] = x ;
+                        }
+                    }
+
+                    /* set Lpend to one past the last entry in the
+                     * first part of the column of L.  Entries in
+                     * Li [0 ... Lpend [j]-1] are the only part of
+                     * column j of L that needs to be scanned in the DFS.
+                     * Lpend [j] was EMPTY; setting it >= 0 also flags
+                     * column j as pruned. */
+                    Lpend [j] = ptail ;
+
+#ifndef NDEBUG
+                    {
+                        Int p3 ;
+                        for (p3 = 0 ; p3 < Llen [j] ; p3++)
+                        {
+                            if (p3 == Lpend [j]) PRINTF (("----\n")) ;
+                            PRINTF (("after: %i  pivotal: %d\n", Li [p3],
+                                        Pinv [Li [p3]] >= 0)) ;
+                        }
+                    }
+#endif
+
+                    break ;
+                }
+            }
+        }
+    }
+}
+
+
+/* ========================================================================== */
+/* === KLU_kernel =========================================================== */
+/* ========================================================================== */
+
+size_t KLU_kernel   /* final size of LU on output */
+(
+    /* input, not modified */
+    Int n,          /* A is n-by-n */
+    Int Ap [ ],     /* size n+1, column pointers for A */
+    Int Ai [ ],     /* size nz = Ap [n], row indices for A */
+    Entry Ax [ ],   /* size nz, values of A */
+    Int Q [ ],      /* size n, optional input permutation */
+    size_t lusize,  /* initial size of LU on input */
+
+    /* output, not defined on input */
+    Int Pinv [ ],   /* size n, inverse row permutation, where Pinv [i] = k if
+                     * row i is the kth pivot row */
+    Int P [ ],      /* size n, row permutation, where P [k] = i if row i is the
+                     * kth pivot row. */
+    Unit **p_LU,        /* LU array, size lusize on input */
+    Entry Udiag [ ],    /* size n, diagonal of U */
+    Int Llen [ ],       /* size n, column length of L */
+    Int Ulen [ ],       /* size n, column length of U */
+    Int Lip [ ],        /* size n, column pointers for L */
+    Int Uip [ ],        /* size n, column pointers for U */
+    Int *lnz,           /* size of L*/
+    Int *unz,           /* size of U*/
+    /* workspace, not defined on input */
+    Entry X [ ],    /* size n, undefined on input, zero on output */
+
+    /* workspace, not defined on input or output */
+    Int Stack [ ],  /* size n */
+    Int Flag [ ],   /* size n */
+    Int Ap_pos [ ],     /* size n */
+
+    /* other workspace: */
+    Int Lpend [ ],                  /* size n workspace, for pruning only */
+
+    /* inputs, not modified on output */
+    Int k1,             /* the block of A is from k1 to k2-1 */
+    Int PSinv [ ],      /* inverse of P from symbolic factorization */
+    double Rs [ ],      /* scale factors for A */
+
+    /* inputs, modified on output */
+    Int Offp [ ],   /* off-diagonal matrix (modified by this routine) */
+    Int Offi [ ],
+    Entry Offx [ ],
+    /* --------------- */
+    KLU_common *Common
+)
+{
+    Entry pivot ;
+    double abs_pivot, xsize, nunits, tol, memgrow ;
+    Entry *Ux ;
+    Int *Li, *Ui ;
+    Unit *LU ;          /* LU factors (pattern and values) */
+    Int k, p, i, j, pivrow = 0, kbar, diagrow, firstrow, lup, top, scale, len ;
+    size_t newlusize ;
+
+#ifndef NDEBUG
+    Entry *Lx ;
+#endif
+
+    ASSERT (Common != NULL) ;
+    scale = Common->scale ;
+    tol = Common->tol ;
+    memgrow = Common->memgrow ;
+    *lnz = 0 ;
+    *unz = 0 ;
+    CLEAR (pivot) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get initial Li, Lx, Ui, and Ux */
+    /* ---------------------------------------------------------------------- */
+
+    PRINTF (("input: lusize %d \n", lusize)) ;
+    ASSERT (lusize > 0) ;
+    LU = *p_LU ;
+
+    /* ---------------------------------------------------------------------- */
+    /* initializations */
+    /* ---------------------------------------------------------------------- */
+
+    firstrow = 0 ;
+    lup = 0 ;
+
+    for (k = 0 ; k < n ; k++)
+    {
+        /* X [k] = 0 ; */
+        CLEAR (X [k]) ;
+        Flag [k] = EMPTY ;
+        Lpend [k] = EMPTY ;     /* flag k as not pruned */
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* mark all rows as non-pivotal and determine initial diagonal mapping */
+    /* ---------------------------------------------------------------------- */
+
+    /* PSinv does the symmetric permutation, so don't do it here */
+    for (k = 0 ; k < n ; k++)
+    {
+        P [k] = k ;
+        Pinv [k] = FLIP (k) ;   /* mark all rows as non-pivotal */
+    }
+    /* initialize the construction of the off-diagonal matrix */
+    Offp [0] = 0 ;
+
+    /* P [k] = row means that UNFLIP (Pinv [row]) = k, and visa versa.
+     * If row is pivotal, then Pinv [row] >= 0.  A row is initially "flipped"
+     * (Pinv [k] < EMPTY), and then marked "unflipped" when it becomes
+     * pivotal. */
+
+#ifndef NDEBUG
+    for (k = 0 ; k < n ; k++)
+    {
+        PRINTF (("Initial P [%d] = %d\n", k, P [k])) ;
+    }
+#endif
+
+    /* ---------------------------------------------------------------------- */
+    /* factorize */
+    /* ---------------------------------------------------------------------- */
+
+    for (k = 0 ; k < n ; k++)
+    {
+
+        PRINTF (("\n\n==================================== k: %d\n", k)) ;
+
+        /* ------------------------------------------------------------------ */
+        /* determine if LU factors have grown too big */
+        /* ------------------------------------------------------------------ */
+
+        /* (n - k) entries for L and k entries for U */
+        nunits = DUNITS (Int, n - k) + DUNITS (Int, k) +
+                 DUNITS (Entry, n - k) + DUNITS (Entry, k) ;
+
+        /* LU can grow by at most 'nunits' entries if the column is dense */
+        PRINTF (("lup %d lusize %g lup+nunits: %g\n", lup, (double) lusize,
+            lup+nunits));
+        xsize = ((double) lup) + nunits ;
+        if (xsize > (double) lusize)
+        {
+            /* check here how much to grow */
+            xsize = (memgrow * ((double) lusize) + 4*n + 1) ;
+            if (INT_OVERFLOW (xsize))
+            {
+                PRINTF (("Matrix is too large (Int overflow)\n")) ;
+                Common->status = KLU_TOO_LARGE ;
+                return (lusize) ;
+            }
+            newlusize = memgrow * lusize + 2*n + 1 ;
+            /* Future work: retry mechanism in case of malloc failure */
+            LU = KLU_realloc (newlusize, lusize, sizeof (Unit), LU, Common) ;
+            Common->nrealloc++ ;
+            *p_LU = LU ;
+            if (Common->status == KLU_OUT_OF_MEMORY)
+            {
+                PRINTF (("Matrix is too large (LU)\n")) ;
+                return (lusize) ;
+            }
+            lusize = newlusize ;
+            PRINTF (("inc LU to %d done\n", lusize)) ;
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* start the kth column of L and U */
+        /* ------------------------------------------------------------------ */
+
+        Lip [k] = lup ;
+
+        /* ------------------------------------------------------------------ */
+        /* compute the nonzero pattern of the kth column of L and U */
+        /* ------------------------------------------------------------------ */
+
+#ifndef NDEBUG
+        for (i = 0 ; i < n ; i++)
+        {
+            ASSERT (Flag [i] < k) ;
+            /* ASSERT (X [i] == 0) ; */
+            ASSERT (IS_ZERO (X [i])) ;
+        }
+#endif
+
+        top = lsolve_symbolic (n, k, Ap, Ai, Q, Pinv, Stack, Flag,
+                    Lpend, Ap_pos, LU, lup, Llen, Lip, k1, PSinv) ;
+
+#ifndef NDEBUG
+        PRINTF (("--- in U:\n")) ;
+        for (p = top ; p < n ; p++)
+        {
+            PRINTF (("pattern of X for U: %d : %d pivot row: %d\n",
+                p, Stack [p], Pinv [Stack [p]])) ;
+            ASSERT (Flag [Stack [p]] == k) ;
+        }
+        PRINTF (("--- in L:\n")) ;
+        Li = (Int *) (LU + Lip [k]);
+        for (p = 0 ; p < Llen [k] ; p++)
+        {
+            PRINTF (("pattern of X in L: %d : %d pivot row: %d\n",
+                p, Li [p], Pinv [Li [p]])) ;
+            ASSERT (Flag [Li [p]] == k) ;
+        }
+        p = 0 ;
+        for (i = 0 ; i < n ; i++)
+        {
+            ASSERT (Flag [i] <= k) ;
+            if (Flag [i] == k) p++ ;
+        }
+#endif
+
+        /* ------------------------------------------------------------------ */
+        /* get the column of the matrix to factorize and scatter into X */
+        /* ------------------------------------------------------------------ */
+
+        construct_column (k, Ap, Ai, Ax, Q, X,
+            k1, PSinv, Rs, scale, Offp, Offi, Offx) ;
+
+        /* ------------------------------------------------------------------ */
+        /* compute the numerical values of the kth column (s = L \ A (:,k)) */
+        /* ------------------------------------------------------------------ */
+
+        lsolve_numeric (Pinv, LU, Stack, Lip, top, n, Llen, X) ;
+
+#ifndef NDEBUG
+        for (p = top ; p < n ; p++)
+        {
+            PRINTF (("X for U %d : ",  Stack [p])) ;
+            PRINT_ENTRY (X [Stack [p]]) ;
+        }
+        Li = (Int *) (LU + Lip [k]) ;
+        for (p = 0 ; p < Llen [k] ; p++)
+        {
+            PRINTF (("X for L %d : ", Li [p])) ;
+            PRINT_ENTRY (X [Li [p]]) ;
+        }
+#endif
+
+        /* ------------------------------------------------------------------ */
+        /* partial pivoting with diagonal preference */
+        /* ------------------------------------------------------------------ */
+
+        /* determine what the "diagonal" is */
+        diagrow = P [k] ;   /* might already be pivotal */
+        PRINTF (("k %d, diagrow = %d, UNFLIP (diagrow) = %d\n",
+            k, diagrow, UNFLIP (diagrow))) ;
+
+        /* find a pivot and scale the pivot column */
+        if (!lpivot (diagrow, &pivrow, &pivot, &abs_pivot, tol, X, LU, Lip,
+                    Llen, k, n, Pinv, &firstrow, Common))
+        {
+            /* matrix is structurally or numerically singular */
+            Common->status = KLU_SINGULAR ;
+            if (Common->numerical_rank == EMPTY)
+            {
+                Common->numerical_rank = k+k1 ;
+                Common->singular_col = Q [k+k1] ;
+            }
+            if (Common->halt_if_singular)
+            {
+                /* do not continue the factorization */
+                return (lusize) ;
+            }
+        }
+
+        /* we now have a valid pivot row, even if the column has NaN's or
+         * has no entries on or below the diagonal at all. */
+        PRINTF (("\nk %d : Pivot row %d : ", k, pivrow)) ;
+        PRINT_ENTRY (pivot) ;
+        ASSERT (pivrow >= 0 && pivrow < n) ;
+        ASSERT (Pinv [pivrow] < 0) ;
+
+        /* set the Uip pointer */
+        Uip [k] = Lip [k] + UNITS (Int, Llen [k]) + UNITS (Entry, Llen [k]) ;
+
+        /* move the lup pointer to the position where indices of U
+         * should be stored */
+        lup += UNITS (Int, Llen [k]) + UNITS (Entry, Llen [k]) ;
+
+        Ulen [k] = n - top ;
+
+        /* extract Stack [top..n-1] to Ui and the values to Ux and clear X */
+        GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
+        for (p = top, i = 0 ; p < n ; p++, i++)
+        {
+            j = Stack [p] ;
+            Ui [i] = Pinv [j] ;
+            Ux [i] = X [j] ;
+            CLEAR (X [j]) ;
+        }
+
+        /* position the lu index at the starting point for next column */
+        lup += UNITS (Int, Ulen [k]) + UNITS (Entry, Ulen [k]) ;
+
+        /* U(k,k) = pivot */
+        Udiag [k] = pivot ;
+
+        /* ------------------------------------------------------------------ */
+        /* log the pivot permutation */
+        /* ------------------------------------------------------------------ */
+
+        ASSERT (UNFLIP (Pinv [diagrow]) < n) ;
+        ASSERT (P [UNFLIP (Pinv [diagrow])] == diagrow) ;
+
+        if (pivrow != diagrow)
+        {
+            /* an off-diagonal pivot has been chosen */
+            Common->noffdiag++ ;
+            PRINTF ((">>>>>>>>>>>>>>>>> pivrow %d k %d off-diagonal\n",
+                        pivrow, k)) ;
+            if (Pinv [diagrow] < 0)
+            {
+                /* the former diagonal row index, diagrow, has not yet been
+                 * chosen as a pivot row.  Log this diagrow as the "diagonal"
+                 * entry in the column kbar for which the chosen pivot row,
+                 * pivrow, was originally logged as the "diagonal" */
+                kbar = FLIP (Pinv [pivrow]) ;
+                P [kbar] = diagrow ;
+                Pinv [diagrow] = FLIP (kbar) ;
+            }
+        }
+        P [k] = pivrow ;
+        Pinv [pivrow] = k ;
+
+#ifndef NDEBUG
+        for (i = 0 ; i < n ; i++) { ASSERT (IS_ZERO (X [i])) ;}
+        GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, len) ;
+        for (p = 0 ; p < len ; p++)
+        {
+            PRINTF (("Column %d of U: %d : ", k, Ui [p])) ;
+            PRINT_ENTRY (Ux [p]) ;
+        }
+        GET_POINTER (LU, Lip, Llen, Li, Lx, k, len) ;
+        for (p = 0 ; p < len ; p++)
+        {
+            PRINTF (("Column %d of L: %d : ", k, Li [p])) ;
+            PRINT_ENTRY (Lx [p]) ;
+        }
+#endif
+
+        /* ------------------------------------------------------------------ */
+        /* symmetric pruning */
+        /* ------------------------------------------------------------------ */
+
+        prune (Lpend, Pinv, k, pivrow, LU, Uip, Lip, Ulen, Llen) ;
+
+        *lnz += Llen [k] + 1 ; /* 1 added to lnz for diagonal */
+        *unz += Ulen [k] + 1 ; /* 1 added to unz for diagonal */
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* finalize column pointers for L and U, and put L in the pivotal order */
+    /* ---------------------------------------------------------------------- */
+
+    for (p = 0 ; p < n ; p++)
+    {
+        Li = (Int *) (LU + Lip [p]) ;
+        for (i = 0 ; i < Llen [p] ; i++)
+        {
+            Li [i] = Pinv [Li [i]] ;
+        }
+    }
+
+#ifndef NDEBUG
+    for (i = 0 ; i < n ; i++)
+    {
+        PRINTF (("P [%d] = %d   Pinv [%d] = %d\n", i, P [i], i, Pinv [i])) ;
+    }
+    for (i = 0 ; i < n ; i++)
+    {
+        ASSERT (Pinv [i] >= 0 && Pinv [i] < n) ;
+        ASSERT (P [i] >= 0 && P [i] < n) ;
+        ASSERT (P [Pinv [i]] == i) ;
+        ASSERT (IS_ZERO (X [i])) ;
+    }
+#endif
+
+    /* ---------------------------------------------------------------------- */
+    /* shrink the LU factors to just the required size */
+    /* ---------------------------------------------------------------------- */
+
+    newlusize = lup ;
+    ASSERT ((size_t) newlusize <= lusize) ;
+
+    /* this cannot fail, since the block is descreasing in size */
+    LU = KLU_realloc (newlusize, lusize, sizeof (Unit), LU, Common) ;
+    *p_LU = LU ;
+    return (newlusize) ;
+}
diff --git a/src/maths/KLU/klu_memory.c b/src/maths/KLU/klu_memory.c
new file mode 100644
index 000000000..6359941ad
--- /dev/null
+++ b/src/maths/KLU/klu_memory.c
@@ -0,0 +1,225 @@
+/* ========================================================================== */
+/* === KLU_memory =========================================================== */
+/* ========================================================================== */
+
+/* KLU memory management routines:
+ *
+ * KLU_malloc                   malloc wrapper
+ * KLU_free                     free wrapper
+ * KLU_realloc                  realloc wrapper
+ */
+
+#include "klu_internal.h"
+
+/* ========================================================================== */
+/* === KLU_add_size_t ======================================================= */
+/* ========================================================================== */
+
+/* Safely compute a+b, and check for size_t overflow */
+
+size_t KLU_add_size_t (size_t a, size_t b, Int *ok)
+{
+    (*ok) = (*ok) && ((a + b) >= MAX (a,b)) ;
+    return ((*ok) ? (a + b) : ((size_t) -1)) ;
+}
+
+/* ========================================================================== */
+/* === KLU_mult_size_t ====================================================== */
+/* ========================================================================== */
+
+/* Safely compute a*k, where k should be small, and check for size_t overflow */
+
+size_t KLU_mult_size_t (size_t a, size_t k, Int *ok)
+{
+    size_t i, s = 0 ;
+    for (i = 0 ; i < k ; i++)
+    {
+        s = KLU_add_size_t (s, a, ok) ;
+    }
+    return ((*ok) ? s : ((size_t) -1)) ;
+}
+
+/* ========================================================================== */
+/* === KLU_malloc =========================================================== */
+/* ========================================================================== */
+
+/* Wrapper around malloc routine (mxMalloc for a mexFunction).  Allocates
+ * space of size MAX(1,n)*size, where size is normally a sizeof (...).
+ *
+ * This routine and KLU_realloc do not set Common->status to KLU_OK on success,
+ * so that a sequence of KLU_malloc's or KLU_realloc's can be used.  If any of
+ * them fails, the Common->status will hold the most recent error status.
+ *
+ * Usage, for a pointer to Int:
+ *
+ *      p = KLU_malloc (n, sizeof (Int), Common)
+ *
+ * Uses a pointer to the malloc routine (or its equivalent) defined in Common.
+ */
+
+void *KLU_malloc        /* returns pointer to the newly malloc'd block */
+(
+    /* ---- input ---- */
+    size_t n,           /* number of items */
+    size_t size,        /* size of each item */
+    /* --------------- */
+    KLU_common *Common
+)
+{
+    void *p ;
+    size_t s ;
+    Int ok = TRUE ;
+
+    if (Common == NULL)
+    {
+        p = NULL ;
+    }
+    else if (size == 0)
+    {
+        /* size must be > 0 */
+        Common->status = KLU_INVALID ;
+        p = NULL ;
+    }
+    else if (n >= INT_MAX)
+    {
+        /* object is too big to allocate; p[i] where i is an Int will not
+         * be enough. */
+        Common->status = KLU_TOO_LARGE ;
+        p = NULL ;
+    }
+    else
+    {
+        /* call malloc, or its equivalent */
+        s = KLU_mult_size_t (MAX (1,n), size, &ok) ;
+        p = ok ? ((Common->malloc_memory) (s)) : NULL ;
+        if (p == NULL)
+        {
+            /* failure: out of memory */
+            Common->status = KLU_OUT_OF_MEMORY ;
+        }
+        else
+        {
+            Common->memusage += s ;
+            Common->mempeak = MAX (Common->mempeak, Common->memusage) ;
+        }
+    }
+    return (p) ;
+}
+
+
+/* ========================================================================== */
+/* === KLU_free ============================================================= */
+/* ========================================================================== */
+
+/* Wrapper around free routine (mxFree for a mexFunction).  Returns NULL,
+ * which can be assigned to the pointer being freed, as in:
+ *
+ *      p = KLU_free (p, n, sizeof (int), Common) ;
+ */
+
+void *KLU_free          /* always returns NULL */
+(
+    /* ---- in/out --- */
+    void *p,            /* block of memory to free */
+    /* ---- input --- */
+    size_t n,           /* size of block to free, in # of items */
+    size_t size,        /* size of each item */
+    /* --------------- */
+    KLU_common *Common
+)
+{
+    size_t s ;
+    Int ok = TRUE ;
+    if (p != NULL && Common != NULL)
+    {
+        /* only free the object if the pointer is not NULL */
+        /* call free, or its equivalent */
+        (Common->free_memory) (p) ;
+        s = KLU_mult_size_t (MAX (1,n), size, &ok) ;
+        Common->memusage -= s ;
+    }
+    /* return NULL, and the caller should assign this to p.  This avoids
+     * freeing the same pointer twice. */
+    return (NULL) ;
+}
+
+
+/* ========================================================================== */
+/* === KLU_realloc ========================================================== */
+/* ========================================================================== */
+
+/* Wrapper around realloc routine (mxRealloc for a mexFunction).  Given a
+ * pointer p to a block allocated by KLU_malloc, it changes the size of the
+ * block pointed to by p to be MAX(1,nnew)*size in size.  It may return a
+ * pointer different than p.  This should be used as (for a pointer to Int):
+ *
+ *      p = KLU_realloc (nnew, nold, sizeof (Int), p, Common) ;
+ *
+ * If p is NULL, this is the same as p = KLU_malloc (...).
+ * A size of nnew=0 is treated as nnew=1.
+ *
+ * If the realloc fails, p is returned unchanged and Common->status is set
+ * to KLU_OUT_OF_MEMORY.  If successful, Common->status is not modified,
+ * and p is returned (possibly changed) and pointing to a large block of memory.
+ *
+ * Uses a pointer to the realloc routine (or its equivalent) defined in Common.
+ */
+
+void *KLU_realloc       /* returns pointer to reallocated block */
+(
+    /* ---- input ---- */
+    size_t nnew,        /* requested # of items in reallocated block */
+    size_t nold,        /* old # of items */
+    size_t size,        /* size of each item */
+    /* ---- in/out --- */
+    void *p,            /* block of memory to realloc */
+    /* --------------- */
+    KLU_common *Common
+)
+{
+    void *pnew ;
+    size_t snew, sold ;
+    Int ok = TRUE ;
+
+    if (Common == NULL)
+    {
+        p = NULL ;
+    }
+    else if (size == 0)
+    {
+        /* size must be > 0 */
+        Common->status = KLU_INVALID ;
+        p = NULL ;
+    }
+    else if (p == NULL)
+    {
+        /* A fresh object is being allocated. */
+        p = KLU_malloc (nnew, size, Common) ;
+    }
+    else if (nnew >= INT_MAX)
+    {
+        /* failure: nnew is too big.  Do not change p */
+        Common->status = KLU_TOO_LARGE ;
+    }
+    else
+    {
+        /* The object exists, and is changing to some other nonzero size. */
+        /* call realloc, or its equivalent */
+        snew = KLU_mult_size_t (MAX (1,nnew), size, &ok) ;
+        sold = KLU_mult_size_t (MAX (1,nold), size, &ok) ;
+        pnew = ok ? ((Common->realloc_memory) (p, snew)) : NULL ;
+        if (pnew == NULL)
+        {
+            /* Do not change p, since it still points to allocated memory */
+            Common->status = KLU_OUT_OF_MEMORY ;
+        }
+        else
+        {
+            /* success: return the new p and change the size of the block */
+            Common->memusage += (snew - sold) ;
+            Common->mempeak = MAX (Common->mempeak, Common->memusage) ;
+            p = pnew ;
+        }
+    }
+    return (p) ;
+}
diff --git a/src/maths/KLU/klu_refactor.c b/src/maths/KLU/klu_refactor.c
new file mode 100644
index 000000000..7eb6fe8a9
--- /dev/null
+++ b/src/maths/KLU/klu_refactor.c
@@ -0,0 +1,478 @@
+/* ========================================================================== */
+/* === KLU_refactor ========================================================= */
+/* ========================================================================== */
+
+/* Factor the matrix, after ordering and analyzing it with KLU_analyze, and
+ * factoring it once with KLU_factor.  This routine cannot do any numerical
+ * pivoting.  The pattern of the input matrix (Ap, Ai) must be identical to
+ * the pattern given to KLU_factor.
+ */
+
+#include "klu_internal.h"
+
+
+/* ========================================================================== */
+/* === KLU_refactor ========================================================= */
+/* ========================================================================== */
+
+Int KLU_refactor        /* returns TRUE if successful, FALSE otherwise */
+(
+    /* inputs, not modified */
+    Int Ap [ ],         /* size n+1, column pointers */
+    Int Ai [ ],         /* size nz, row indices */
+    double Ax [ ],
+    KLU_symbolic *Symbolic,
+
+    /* input/output */
+    KLU_numeric *Numeric,
+    KLU_common  *Common
+)
+{
+    Entry ukk, ujk, s ;
+    Entry *Offx, *Lx, *Ux, *X, *Az, *Udiag ;
+    double *Rs ;
+    Int *P, *Q, *R, *Pnum, *Offp, *Offi, *Ui, *Li, *Pinv, *Lip, *Uip, *Llen,
+        *Ulen ;
+    Unit **LUbx ;
+    Unit *LU ;
+    Int k1, k2, nk, k, block, oldcol, pend, oldrow, n, p, newrow, scale,
+        nblocks, poff, i, j, up, ulen, llen, maxblock, nzoff ;
+
+    /* ---------------------------------------------------------------------- */
+    /* check inputs */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+    Common->status = KLU_OK ;
+
+    if (Numeric == NULL)
+    {
+        /* invalid Numeric object */
+        Common->status = KLU_INVALID ;
+        return (FALSE) ;
+    }
+
+    Common->numerical_rank = EMPTY ;
+    Common->singular_col = EMPTY ;
+
+    Az = (Entry *) Ax ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get the contents of the Symbolic object */
+    /* ---------------------------------------------------------------------- */
+
+    n = Symbolic->n ;
+    P = Symbolic->P ;
+    Q = Symbolic->Q ;
+    R = Symbolic->R ;
+    nblocks = Symbolic->nblocks ;
+    maxblock = Symbolic->maxblock ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get the contents of the Numeric object */
+    /* ---------------------------------------------------------------------- */
+
+    Pnum = Numeric->Pnum ;
+    Offp = Numeric->Offp ;
+    Offi = Numeric->Offi ;
+    Offx = (Entry *) Numeric->Offx ;
+
+    LUbx = (Unit **) Numeric->LUbx ;
+
+    scale = Common->scale ;
+    if (scale > 0)
+    {
+        /* factorization was not scaled, but refactorization is scaled */
+        if (Numeric->Rs == NULL)
+        {
+            Numeric->Rs = KLU_malloc (n, sizeof (double), Common) ;
+            if (Common->status < KLU_OK)
+            {
+                Common->status = KLU_OUT_OF_MEMORY ;
+                return (FALSE) ;
+            }
+        }
+    }
+    else
+    {
+        /* no scaling for refactorization; ensure Numeric->Rs is freed.  This
+         * does nothing if Numeric->Rs is already NULL. */
+        Numeric->Rs = KLU_free (Numeric->Rs, n, sizeof (double), Common) ;
+    }
+    Rs = Numeric->Rs ;
+
+    Pinv = Numeric->Pinv ;
+    X = (Entry *) Numeric->Xwork ;
+    Common->nrealloc = 0 ;
+    Udiag = Numeric->Udiag ;
+    nzoff = Symbolic->nzoff ;
+
+    /* ---------------------------------------------------------------------- */
+    /* check the input matrix compute the row scale factors, Rs */
+    /* ---------------------------------------------------------------------- */
+
+    /* do no scale, or check the input matrix, if scale < 0 */
+    if (scale >= 0)
+    {
+        /* check for out-of-range indices, but do not check for duplicates */
+        if (!KLU_scale (scale, n, Ap, Ai, Ax, Rs, NULL, Common))
+        {
+            return (FALSE) ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* clear workspace X */
+    /* ---------------------------------------------------------------------- */
+
+    for (k = 0 ; k < maxblock ; k++)
+    {
+        /* X [k] = 0 */
+        CLEAR (X [k]) ;
+    }
+
+    poff = 0 ;
+
+    /* ---------------------------------------------------------------------- */
+    /* factor each block */
+    /* ---------------------------------------------------------------------- */
+
+    if (scale <= 0)
+    {
+
+        /* ------------------------------------------------------------------ */
+        /* no scaling */
+        /* ------------------------------------------------------------------ */
+
+        for (block = 0 ; block < nblocks ; block++)
+        {
+
+            /* -------------------------------------------------------------- */
+            /* the block is from rows/columns k1 to k2-1 */
+            /* -------------------------------------------------------------- */
+
+            k1 = R [block] ;
+            k2 = R [block+1] ;
+            nk = k2 - k1 ;
+
+            if (nk == 1)
+            {
+
+                /* ---------------------------------------------------------- */
+                /* singleton case */
+                /* ---------------------------------------------------------- */
+
+                oldcol = Q [k1] ;
+                pend = Ap [oldcol+1] ;
+                CLEAR (s) ;
+                for (p = Ap [oldcol] ; p < pend ; p++)
+                {
+                    newrow = Pinv [Ai [p]] - k1 ;
+                    if (newrow < 0 && poff < nzoff)
+                    {
+                        /* entry in off-diagonal block */
+                        Offx [poff] = Az [p] ;
+                        poff++ ;
+                    }
+                    else
+                    {
+                        /* singleton */
+                        s = Az [p] ;
+                    }
+                }
+                Udiag [k1] = s ;
+
+            }
+            else
+            {
+
+                /* ---------------------------------------------------------- */
+                /* construct and factor the kth block */
+                /* ---------------------------------------------------------- */
+
+                Lip  = Numeric->Lip  + k1 ;
+                Llen = Numeric->Llen + k1 ;
+                Uip  = Numeric->Uip  + k1 ;
+                Ulen = Numeric->Ulen + k1 ;
+                LU = LUbx [block] ;
+
+                for (k = 0 ; k < nk ; k++)
+                {
+
+                    /* ------------------------------------------------------ */
+                    /* scatter kth column of the block into workspace X */
+                    /* ------------------------------------------------------ */
+
+                    oldcol = Q [k+k1] ;
+                    pend = Ap [oldcol+1] ;
+                    for (p = Ap [oldcol] ; p < pend ; p++)
+                    {
+                        newrow = Pinv [Ai [p]] - k1 ;
+                        if (newrow < 0 && poff < nzoff)
+                        {
+                            /* entry in off-diagonal block */
+                            Offx [poff] = Az [p] ;
+                            poff++ ;
+                        }
+                        else
+                        {
+                            /* (newrow,k) is an entry in the block */
+                            X [newrow] = Az [p] ;
+                        }
+                    }
+
+                    /* ------------------------------------------------------ */
+                    /* compute kth column of U, and update kth column of A */
+                    /* ------------------------------------------------------ */
+
+                    GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, ulen) ;
+                    for (up = 0 ; up < ulen ; up++)
+                    {
+                        j = Ui [up] ;
+                        ujk = X [j] ;
+                        /* X [j] = 0 */
+                        CLEAR (X [j]) ;
+                        Ux [up] = ujk ;
+                        GET_POINTER (LU, Lip, Llen, Li, Lx, j, llen) ;
+                        for (p = 0 ; p < llen ; p++)
+                        {
+                            /* X [Li [p]] -= Lx [p] * ujk */
+                            MULT_SUB (X [Li [p]], Lx [p], ujk) ;
+                        }
+                    }
+                    /* get the diagonal entry of U */
+                    ukk = X [k] ;
+                    /* X [k] = 0 */
+                    CLEAR (X [k]) ;
+                    /* singular case */
+                    if (IS_ZERO (ukk))
+                    {
+                        /* matrix is numerically singular */
+                        Common->status = KLU_SINGULAR ;
+                        if (Common->numerical_rank == EMPTY)
+                        {
+                            Common->numerical_rank = k+k1 ;
+                            Common->singular_col = Q [k+k1] ;
+                        }
+                        if (Common->halt_if_singular)
+                        {
+                            /* do not continue the factorization */
+                            return (FALSE) ;
+                        }
+                    }
+                    Udiag [k+k1] = ukk ;
+                    /* gather and divide by pivot to get kth column of L */
+                    GET_POINTER (LU, Lip, Llen, Li, Lx, k, llen) ;
+                    for (p = 0 ; p < llen ; p++)
+                    {
+                        i = Li [p] ;
+                        DIV (Lx [p], X [i], ukk) ;
+                        CLEAR (X [i]) ;
+                    }
+
+                }
+            }
+        }
+
+    }
+    else
+    {
+
+        /* ------------------------------------------------------------------ */
+        /* scaling */
+        /* ------------------------------------------------------------------ */
+
+        for (block = 0 ; block < nblocks ; block++)
+        {
+
+            /* -------------------------------------------------------------- */
+            /* the block is from rows/columns k1 to k2-1 */
+            /* -------------------------------------------------------------- */
+
+            k1 = R [block] ;
+            k2 = R [block+1] ;
+            nk = k2 - k1 ;
+
+            if (nk == 1)
+            {
+
+                /* ---------------------------------------------------------- */
+                /* singleton case */
+                /* ---------------------------------------------------------- */
+
+                oldcol = Q [k1] ;
+                pend = Ap [oldcol+1] ;
+                CLEAR (s) ;
+                for (p = Ap [oldcol] ; p < pend ; p++)
+                {
+                    oldrow = Ai [p] ;
+                    newrow = Pinv [oldrow] - k1 ;
+                    if (newrow < 0 && poff < nzoff)
+                    {
+                        /* entry in off-diagonal block */
+                        /* Offx [poff] = Az [p] / Rs [oldrow] */
+                        SCALE_DIV_ASSIGN (Offx [poff], Az [p], Rs [oldrow]) ;
+                        poff++ ;
+                    }
+                    else
+                    {
+                        /* singleton */
+                        /* s = Az [p] / Rs [oldrow] */
+                        SCALE_DIV_ASSIGN (s, Az [p], Rs [oldrow]) ;
+                    }
+                }
+                Udiag [k1] = s ;
+
+            }
+            else
+            {
+
+                /* ---------------------------------------------------------- */
+                /* construct and factor the kth block */
+                /* ---------------------------------------------------------- */
+
+                Lip  = Numeric->Lip  + k1 ;
+                Llen = Numeric->Llen + k1 ;
+                Uip  = Numeric->Uip  + k1 ;
+                Ulen = Numeric->Ulen + k1 ;
+                LU = LUbx [block] ;
+
+                for (k = 0 ; k < nk ; k++)
+                {
+
+                    /* ------------------------------------------------------ */
+                    /* scatter kth column of the block into workspace X */
+                    /* ------------------------------------------------------ */
+
+                    oldcol = Q [k+k1] ;
+                    pend = Ap [oldcol+1] ;
+                    for (p = Ap [oldcol] ; p < pend ; p++)
+                    {
+                        oldrow = Ai [p] ;
+                        newrow = Pinv [oldrow] - k1 ;
+                        if (newrow < 0 && poff < nzoff)
+                        {
+                            /* entry in off-diagonal part */
+                            /* Offx [poff] = Az [p] / Rs [oldrow] */
+                            SCALE_DIV_ASSIGN (Offx [poff], Az [p], Rs [oldrow]);
+                            poff++ ;
+                        }
+                        else
+                        {
+                            /* (newrow,k) is an entry in the block */
+                            /* X [newrow] = Az [p] / Rs [oldrow] */
+                            SCALE_DIV_ASSIGN (X [newrow], Az [p], Rs [oldrow]) ;
+                        }
+                    }
+
+                    /* ------------------------------------------------------ */
+                    /* compute kth column of U, and update kth column of A */
+                    /* ------------------------------------------------------ */
+
+                    GET_POINTER (LU, Uip, Ulen, Ui, Ux, k, ulen) ;
+                    for (up = 0 ; up < ulen ; up++)
+                    {
+                        j = Ui [up] ;
+                        ujk = X [j] ;
+                        /* X [j] = 0 */
+                        CLEAR (X [j]) ;
+                        Ux [up] = ujk ;
+                        GET_POINTER (LU, Lip, Llen, Li, Lx, j, llen) ;
+                        for (p = 0 ; p < llen ; p++)
+                        {
+                            /* X [Li [p]] -= Lx [p] * ujk */
+                            MULT_SUB (X [Li [p]], Lx [p], ujk) ;
+                        }
+                    }
+                    /* get the diagonal entry of U */
+                    ukk = X [k] ;
+                    /* X [k] = 0 */
+                    CLEAR (X [k]) ;
+                    /* singular case */
+                    if (IS_ZERO (ukk))
+                    {
+                        /* matrix is numerically singular */
+                        Common->status = KLU_SINGULAR ;
+                        if (Common->numerical_rank == EMPTY)
+                        {
+                            Common->numerical_rank = k+k1 ;
+                            Common->singular_col = Q [k+k1] ;
+                        }
+                        if (Common->halt_if_singular)
+                        {
+                            /* do not continue the factorization */
+                            return (FALSE) ;
+                        }
+                    }
+                    Udiag [k+k1] = ukk ;
+                    /* gather and divide by pivot to get kth column of L */
+                    GET_POINTER (LU, Lip, Llen, Li, Lx, k, llen) ;
+                    for (p = 0 ; p < llen ; p++)
+                    {
+                        i = Li [p] ;
+                        DIV (Lx [p], X [i], ukk) ;
+                        CLEAR (X [i]) ;
+                    }
+                }
+            }
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* permute scale factors Rs according to pivotal row order */
+    /* ---------------------------------------------------------------------- */
+
+    if (scale > 0)
+    {
+        for (k = 0 ; k < n ; k++)
+        {
+            REAL (X [k]) = Rs [Pnum [k]] ;
+        }
+        for (k = 0 ; k < n ; k++)
+        {
+            Rs [k] = REAL (X [k]) ;
+        }
+    }
+
+#ifndef NDEBUG
+    ASSERT (Offp [n] == poff) ;
+    ASSERT (Symbolic->nzoff == poff) ;
+    PRINTF (("\n------------------- Off diagonal entries, new:\n")) ;
+    ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
+    if (Common->status == KLU_OK)
+    {
+        PRINTF (("\n ########### KLU_BTF_REFACTOR done, nblocks %d\n",nblocks));
+        for (block = 0 ; block < nblocks ; block++)
+        {
+            k1 = R [block] ;
+            k2 = R [block+1] ;
+            nk = k2 - k1 ;
+            PRINTF ((
+                "\n================KLU_refactor output: k1 %d k2 %d nk %d\n",
+                k1, k2, nk)) ;
+            if (nk == 1)
+            {
+                PRINTF (("singleton  ")) ;
+                PRINT_ENTRY (Udiag [k1]) ;
+            }
+            else
+            {
+                Lip = Numeric->Lip + k1 ;
+                Llen = Numeric->Llen + k1 ;
+                LU = (Unit *) Numeric->LUbx [block] ;
+                PRINTF (("\n---- L block %d\n", block)) ;
+                ASSERT (KLU_valid_LU (nk, TRUE, Lip, Llen, LU)) ;
+                Uip = Numeric->Uip + k1 ;
+                Ulen = Numeric->Ulen + k1 ;
+                PRINTF (("\n---- U block %d\n", block)) ;
+                ASSERT (KLU_valid_LU (nk, FALSE, Uip, Ulen, LU)) ;
+            }
+        }
+    }
+#endif
+
+    return (TRUE) ;
+}
diff --git a/src/maths/KLU/klu_scale.c b/src/maths/KLU/klu_scale.c
new file mode 100644
index 000000000..479612006
--- /dev/null
+++ b/src/maths/KLU/klu_scale.c
@@ -0,0 +1,159 @@
+/* ========================================================================== */
+/* === KLU_scale ============================================================ */
+/* ========================================================================== */
+
+/* Scale a matrix and check to see if it is valid.  Can be called by the user.
+ * This is called by KLU_factor and KLU_refactor.  Returns TRUE if the input
+ * matrix is valid, FALSE otherwise.  If the W input argument is non-NULL,
+ * then the input matrix is checked for duplicate entries.
+ *
+ * scaling methods:
+ *      <0: no scaling, do not compute Rs, and do not check input matrix.
+ *      0: no scaling
+ *      1: the scale factor for row i is sum (abs (A (i,:)))
+ *      2 or more: the scale factor for row i is max (abs (A (i,:)))
+ */
+
+#include "klu_internal.h"
+
+Int KLU_scale           /* return TRUE if successful, FALSE otherwise */
+(
+    /* inputs, not modified */
+    Int scale,          /* 0: none, 1: sum, 2: max */
+    Int n,
+    Int Ap [ ],         /* size n+1, column pointers */
+    Int Ai [ ],         /* size nz, row indices */
+    double Ax [ ],
+    /* outputs, not defined on input */
+    double Rs [ ],      /* size n, can be NULL if scale <= 0 */
+    /* workspace, not defined on input or output */
+    Int W [ ],          /* size n, can be NULL */
+    /* --------------- */
+    KLU_common *Common
+)
+{
+    double a ;
+    Entry *Az ;
+    Int row, col, p, pend, check_duplicates ;
+
+    /* ---------------------------------------------------------------------- */
+    /* check inputs */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+    Common->status = KLU_OK ;
+
+    if (scale < 0)
+    {
+        /* return without checking anything and without computing the
+         * scale factors */
+        return (TRUE) ;
+    }
+
+    Az = (Entry *) Ax ;
+
+    if (n <= 0 || Ap == NULL || Ai == NULL || Az == NULL ||
+        (scale > 0 && Rs == NULL))
+    {
+        /* Ap, Ai, Ax and Rs must be present, and n must be > 0 */
+        Common->status = KLU_INVALID ;
+        return (FALSE) ;
+    }
+    if (Ap [0] != 0 || Ap [n] < 0)
+    {
+        /* nz = Ap [n] must be >= 0 and Ap [0] must equal zero */
+        Common->status = KLU_INVALID ;
+        return (FALSE) ;
+    }
+    for (col = 0 ; col < n ; col++)
+    {
+        if (Ap [col] > Ap [col+1])
+        {
+            /* column pointers must be non-decreasing */
+            Common->status = KLU_INVALID ;
+            return (FALSE) ;
+        }
+    }
+
+    /* ---------------------------------------------------------------------- */
+    /* scale */
+    /* ---------------------------------------------------------------------- */
+
+    if (scale > 0)
+    {
+        /* initialize row sum or row max */
+        for (row = 0 ; row < n ; row++)
+        {
+            Rs [row] = 0 ;
+        }
+    }
+
+    /* check for duplicates only if W is present */
+    check_duplicates = (W != (Int *) NULL) ;
+    if (check_duplicates)
+    {
+        for (row = 0 ; row < n ; row++)
+        {
+            W [row] = EMPTY ;
+        }
+    }
+
+    for (col = 0 ; col < n ; col++)
+    {
+        pend = Ap [col+1] ;
+        for (p = Ap [col] ; p < pend ; p++)
+        {
+            row = Ai [p] ;
+            if (row < 0 || row >= n)
+            {
+                /* row index out of range, or duplicate entry */
+                Common->status = KLU_INVALID ;
+                return (FALSE) ;
+            }
+            if (check_duplicates)
+            {
+                if (W [row] == col)
+                {
+                    /* duplicate entry */
+                    Common->status = KLU_INVALID ;
+                    return (FALSE) ;
+                }
+                /* flag row i as appearing in column col */
+                W [row] = col ;
+            }
+            /* a = ABS (Az [p]) ;*/
+            ABS (a, Az [p]) ;
+            if (scale == 1)
+            {
+                /* accumulate the abs. row sum */
+                Rs [row] += a ;
+            }
+            else if (scale > 1)
+            {
+                /* find the max abs. value in the row */
+                Rs [row] = MAX (Rs [row], a) ;
+            }
+        }
+    }
+
+    if (scale > 0)
+    {
+        /* do not scale empty rows */
+        for (row = 0 ; row < n ; row++)
+        {
+            /* matrix is singular */
+            PRINTF (("Rs [%d] = %g\n", row, Rs [row])) ;
+
+            if (Rs [row] == 0.0)
+            {
+                PRINTF (("Row %d of A is all zero\n", row)) ;
+                Rs [row] = 1.0 ;
+            }
+        }
+    }
+
+    return (TRUE) ;
+}
diff --git a/src/maths/KLU/klu_solve.c b/src/maths/KLU/klu_solve.c
new file mode 100644
index 000000000..d23a14095
--- /dev/null
+++ b/src/maths/KLU/klu_solve.c
@@ -0,0 +1,396 @@
+/* ========================================================================== */
+/* === KLU_solve ============================================================ */
+/* ========================================================================== */
+
+/* Solve Ax=b using the symbolic and numeric objects from KLU_analyze
+ * (or KLU_analyze_given) and KLU_factor.  Note that no iterative refinement is
+ * performed.  Uses Numeric->Xwork as workspace (undefined on input and output),
+ * of size 4n Entry's (note that columns 2 to 4 of Xwork overlap with
+ * Numeric->Iwork).
+ */
+
+#include "klu_internal.h"
+
+Int KLU_solve
+(
+    /* inputs, not modified */
+    KLU_symbolic *Symbolic,
+    KLU_numeric *Numeric,
+    Int d,                  /* leading dimension of B */
+    Int nrhs,               /* number of right-hand-sides */
+
+    /* right-hand-side on input, overwritten with solution to Ax=b on output */
+    double B [ ],           /* size n*nrhs, in column-oriented form, with
+                             * leading dimension d. */
+    /* --------------- */
+    KLU_common *Common
+)
+{
+    Entry x [4], offik, s ;
+    double rs, *Rs ;
+    Entry *Offx, *X, *Bz, *Udiag ;
+    Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
+    Unit **LUbx ;
+    Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
+
+    /* ---------------------------------------------------------------------- */
+    /* check inputs */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+    if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
+        B == NULL)
+    {
+        Common->status = KLU_INVALID ;
+        return (FALSE) ;
+    }
+    Common->status = KLU_OK ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get the contents of the Symbolic object */
+    /* ---------------------------------------------------------------------- */
+
+    Bz = (Entry *) B ;
+    n = Symbolic->n ;
+    nblocks = Symbolic->nblocks ;
+    Q = Symbolic->Q ;
+    R = Symbolic->R ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get the contents of the Numeric object */
+    /* ---------------------------------------------------------------------- */
+
+    ASSERT (nblocks == Numeric->nblocks) ;
+    Pnum = Numeric->Pnum ;
+    Offp = Numeric->Offp ;
+    Offi = Numeric->Offi ;
+    Offx = (Entry *) Numeric->Offx ;
+
+    Lip  = Numeric->Lip ;
+    Llen = Numeric->Llen ;
+    Uip  = Numeric->Uip ;
+    Ulen = Numeric->Ulen ;
+    LUbx = (Unit **) Numeric->LUbx ;
+    Udiag = Numeric->Udiag ;
+
+    Rs = Numeric->Rs ;
+    X = (Entry *) Numeric->Xwork ;
+
+    ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* solve in chunks of 4 columns at a time */
+    /* ---------------------------------------------------------------------- */
+
+    for (chunk = 0 ; chunk < nrhs ; chunk += 4)
+    {
+
+        /* ------------------------------------------------------------------ */
+        /* get the size of the current chunk */
+        /* ------------------------------------------------------------------ */
+
+        nr = MIN (nrhs - chunk, 4) ;
+
+        /* ------------------------------------------------------------------ */
+        /* scale and permute the right hand side, X = P*(R\B) */
+        /* ------------------------------------------------------------------ */
+
+        if (Rs == NULL)
+        {
+
+            /* no scaling */
+            switch (nr)
+            {
+
+                case 1:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        X [k] = Bz [Pnum [k]] ;
+                    }
+                    break ;
+
+                case 2:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        X [2*k    ] = Bz [i      ] ;
+                        X [2*k + 1] = Bz  [i + d  ] ;
+                    }
+                    break ;
+
+                case 3:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        X [3*k    ] = Bz [i      ] ;
+                        X [3*k + 1] = Bz [i + d  ] ;
+                        X [3*k + 2] = Bz [i + d*2] ;
+                    }
+                    break ;
+
+                case 4:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        X [4*k    ] = Bz [i      ] ;
+                        X [4*k + 1] = Bz [i + d  ] ;
+                        X [4*k + 2] = Bz [i + d*2] ;
+                        X [4*k + 3] = Bz [i + d*3] ;
+                    }
+                    break ;
+            }
+
+        }
+        else
+        {
+
+            switch (nr)
+            {
+
+                case 1:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        SCALE_DIV_ASSIGN (X [k], Bz  [Pnum [k]], Rs [k]) ;
+                    }
+                    break ;
+
+                case 2:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        rs = Rs [k] ;
+                        SCALE_DIV_ASSIGN (X [2*k], Bz [i], rs) ;
+                        SCALE_DIV_ASSIGN (X [2*k + 1], Bz [i + d], rs) ;
+                    }
+                    break ;
+
+                case 3:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        rs = Rs [k] ;
+                        SCALE_DIV_ASSIGN (X [3*k], Bz [i], rs) ;
+                        SCALE_DIV_ASSIGN (X [3*k + 1], Bz [i + d], rs) ;
+                        SCALE_DIV_ASSIGN (X [3*k + 2], Bz [i + d*2], rs) ;
+                    }
+                    break ;
+
+                case 4:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        rs = Rs [k] ;
+                        SCALE_DIV_ASSIGN (X [4*k], Bz [i], rs) ;
+                        SCALE_DIV_ASSIGN (X [4*k + 1], Bz [i + d], rs) ;
+                        SCALE_DIV_ASSIGN (X [4*k + 2], Bz [i + d*2], rs) ;
+                        SCALE_DIV_ASSIGN (X [4*k + 3], Bz [i + d*3], rs) ;
+                    }
+                    break ;
+            }
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* solve X = (L*U + Off)\X */
+        /* ------------------------------------------------------------------ */
+
+        for (block = nblocks-1 ; block >= 0 ; block--)
+        {
+
+            /* -------------------------------------------------------------- */
+            /* the block of size nk is from rows/columns k1 to k2-1 */
+            /* -------------------------------------------------------------- */
+
+            k1 = R [block] ;
+            k2 = R [block+1] ;
+            nk = k2 - k1 ;
+            PRINTF (("solve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
+
+            /* solve the block system */
+            if (nk == 1)
+            {
+                s = Udiag [k1] ;
+                switch (nr)
+                {
+
+                    case 1:
+                        DIV (X [k1], X [k1], s) ;
+                        break ;
+
+                    case 2:
+                        DIV (X [2*k1], X [2*k1], s) ;
+                        DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
+                        break ;
+
+                    case 3:
+                        DIV (X [3*k1], X [3*k1], s) ;
+                        DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
+                        DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
+                        break ;
+
+                    case 4:
+                        DIV (X [4*k1], X [4*k1], s) ;
+                        DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
+                        DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
+                        DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
+                        break ;
+
+                }
+            }
+            else
+            {
+                KLU_lsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
+                        X + nr*k1) ;
+                KLU_usolve (nk, Uip + k1, Ulen + k1, LUbx [block],
+                        Udiag + k1, nr, X + nr*k1) ;
+            }
+
+            /* -------------------------------------------------------------- */
+            /* block back-substitution for the off-diagonal-block entries */
+            /* -------------------------------------------------------------- */
+
+            if (block > 0)
+            {
+                switch (nr)
+                {
+
+                    case 1:
+
+                        for (k = k1 ; k < k2 ; k++)
+                        {
+                            pend = Offp [k+1] ;
+                            x [0] = X [k] ;
+                            for (p = Offp [k] ; p < pend ; p++)
+                            {
+                                MULT_SUB (X [Offi [p]], Offx [p], x [0]) ;
+                            }
+                        }
+                        break ;
+
+                    case 2:
+
+                        for (k = k1 ; k < k2 ; k++)
+                        {
+                            pend = Offp [k+1] ;
+                            x [0] = X [2*k    ] ;
+                            x [1] = X [2*k + 1] ;
+                            for (p = Offp [k] ; p < pend ; p++)
+                            {
+                                i = Offi [p] ;
+                                offik = Offx [p] ;
+                                MULT_SUB (X [2*i], offik, x [0]) ;
+                                MULT_SUB (X [2*i + 1], offik, x [1]) ;
+                            }
+                        }
+                        break ;
+
+                    case 3:
+
+                        for (k = k1 ; k < k2 ; k++)
+                        {
+                            pend = Offp [k+1] ;
+                            x [0] = X [3*k    ] ;
+                            x [1] = X [3*k + 1] ;
+                            x [2] = X [3*k + 2] ;
+                            for (p = Offp [k] ; p < pend ; p++)
+                            {
+                                i = Offi [p] ;
+                                offik = Offx [p] ;
+                                MULT_SUB (X [3*i], offik, x [0]) ;
+                                MULT_SUB (X [3*i + 1], offik, x [1]) ;
+                                MULT_SUB (X [3*i + 2], offik, x [2]) ;
+                            }
+                        }
+                        break ;
+
+                    case 4:
+
+                        for (k = k1 ; k < k2 ; k++)
+                        {
+                            pend = Offp [k+1] ;
+                            x [0] = X [4*k    ] ;
+                            x [1] = X [4*k + 1] ;
+                            x [2] = X [4*k + 2] ;
+                            x [3] = X [4*k + 3] ;
+                            for (p = Offp [k] ; p < pend ; p++)
+                            {
+                                i = Offi [p] ;
+                                offik = Offx [p] ;
+                                MULT_SUB (X [4*i], offik, x [0]) ;
+                                MULT_SUB (X [4*i + 1], offik, x [1]) ;
+                                MULT_SUB (X [4*i + 2], offik, x [2]) ;
+                                MULT_SUB (X [4*i + 3], offik, x [3]) ;
+                            }
+                        }
+                        break ;
+                }
+            }
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* permute the result, Bz  = Q*X */
+        /* ------------------------------------------------------------------ */
+
+        switch (nr)
+        {
+
+            case 1:
+
+                for (k = 0 ; k < n ; k++)
+                {
+                    Bz  [Q [k]] = X [k] ;
+                }
+                break ;
+
+            case 2:
+
+                for (k = 0 ; k < n ; k++)
+                {
+                    i = Q [k] ;
+                    Bz  [i      ] = X [2*k    ] ;
+                    Bz  [i + d  ] = X [2*k + 1] ;
+                }
+                break ;
+
+            case 3:
+
+                for (k = 0 ; k < n ; k++)
+                {
+                    i = Q [k] ;
+                    Bz  [i      ] = X [3*k    ] ;
+                    Bz  [i + d  ] = X [3*k + 1] ;
+                    Bz  [i + d*2] = X [3*k + 2] ;
+                }
+                break ;
+
+            case 4:
+
+                for (k = 0 ; k < n ; k++)
+                {
+                    i = Q [k] ;
+                    Bz  [i      ] = X [4*k    ] ;
+                    Bz  [i + d  ] = X [4*k + 1] ;
+                    Bz  [i + d*2] = X [4*k + 2] ;
+                    Bz  [i + d*3] = X [4*k + 3] ;
+                }
+                break ;
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* go to the next chunk of B */
+        /* ------------------------------------------------------------------ */
+
+        Bz  += d*4 ;
+    }
+    return (TRUE) ;
+}
diff --git a/src/maths/KLU/klu_sort.c b/src/maths/KLU/klu_sort.c
new file mode 100644
index 000000000..a3ce98f46
--- /dev/null
+++ b/src/maths/KLU/klu_sort.c
@@ -0,0 +1,156 @@
+/* ========================================================================== */
+/* === KLU_sort ============================================================= */
+/* ========================================================================== */
+
+/* sorts the columns of L and U so that the row indices appear in strictly
+ * increasing order.
+ */
+
+#include "klu_internal.h"
+
+/* ========================================================================== */
+/* === sort ================================================================= */
+/* ========================================================================== */
+
+/* Sort L or U using a double-transpose */
+
+static void sort (Int n, Int *Xip, Int *Xlen, Unit *LU, Int *Tp, Int *Tj,
+    Entry *Tx, Int *W)
+{
+    Int *Xi ;
+    Entry *Xx ;
+    Int p, i, j, len, nz, tp, xlen, pend ;
+
+    ASSERT (KLU_valid_LU (n, FALSE, Xip, Xlen, LU)) ;
+
+    /* count the number of entries in each row of L or U */ 
+    for (i = 0 ; i < n ; i++)
+    {
+        W [i] = 0 ;
+    }
+    for (j = 0 ; j < n ; j++)
+    {
+        GET_POINTER (LU, Xip, Xlen, Xi, Xx, j, len) ;
+        for (p = 0 ; p < len ; p++)
+        {
+            W [Xi [p]]++ ;
+        }
+    }
+
+    /* construct the row pointers for T */
+    nz = 0 ;
+    for (i = 0 ; i < n ; i++)
+    {
+        Tp [i] = nz ;
+        nz += W [i] ;
+    }
+    Tp [n] = nz ;
+    for (i = 0 ; i < n ; i++)
+    {
+        W [i] = Tp [i] ;
+    }
+
+    /* transpose the matrix into Tp, Ti, Tx */
+    for (j = 0 ; j < n ; j++)
+    {
+        GET_POINTER (LU, Xip, Xlen, Xi, Xx, j, len) ;
+        for (p = 0 ; p < len ; p++)
+        {
+            tp = W [Xi [p]]++ ;
+            Tj [tp] = j ;
+            Tx [tp] = Xx [p] ;
+        }
+    }
+
+    /* transpose the matrix back into Xip, Xlen, Xi, Xx */
+    for (j = 0 ; j < n ; j++)
+    {
+        W [j] = 0 ;
+    }
+    for (i = 0 ; i < n ; i++)
+    {
+        pend = Tp [i+1] ;
+        for (p = Tp [i] ; p < pend ; p++)
+        {
+            j = Tj [p] ;
+            GET_POINTER (LU, Xip, Xlen, Xi, Xx, j, len) ;
+            xlen = W [j]++ ;
+            Xi [xlen] = i ;
+            Xx [xlen] = Tx [p] ;
+        }
+    }
+
+    ASSERT (KLU_valid_LU (n, FALSE, Xip, Xlen, LU)) ;
+}
+
+
+/* ========================================================================== */
+/* === KLU_sort ============================================================= */
+/* ========================================================================== */
+
+Int KLU_sort
+(
+    KLU_symbolic *Symbolic,
+    KLU_numeric *Numeric,
+    KLU_common *Common
+)
+{
+    Int *R, *W, *Tp, *Ti, *Lip, *Uip, *Llen, *Ulen ;
+    Entry *Tx ;
+    Unit **LUbx ;
+    Int n, nk, nz, block, nblocks, maxblock, k1 ;
+    size_t m1 ;
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+    Common->status = KLU_OK ;
+
+    n = Symbolic->n ;
+    R = Symbolic->R ;
+    nblocks = Symbolic->nblocks ;
+    maxblock = Symbolic->maxblock ;
+
+    Lip  = Numeric->Lip ;
+    Llen = Numeric->Llen ;
+    Uip  = Numeric->Uip ;
+    Ulen = Numeric->Ulen ;
+    LUbx = (Unit **) Numeric->LUbx ;
+
+    m1 = ((size_t) maxblock) + 1 ;
+
+    /* allocate workspace */
+    nz = MAX (Numeric->max_lnz_block, Numeric->max_unz_block) ;
+    W  = KLU_malloc (maxblock, sizeof (Int), Common) ;
+    Tp = KLU_malloc (m1, sizeof (Int), Common) ;
+    Ti = KLU_malloc (nz, sizeof (Int), Common) ;
+    Tx = KLU_malloc (nz, sizeof (Entry), Common) ;
+
+    PRINTF (("\n======================= Start sort:\n")) ;
+
+    if (Common->status == KLU_OK)
+    {
+        /* sort each block of L and U */
+        for (block = 0 ; block < nblocks ; block++)
+        {
+            k1 = R [block] ;
+            nk = R [block+1] - k1 ;
+            if (nk > 1)
+            {
+                PRINTF (("\n-------------------block: %d nk %d\n", block, nk)) ;
+                sort (nk, Lip + k1, Llen + k1, LUbx [block], Tp, Ti, Tx, W) ;
+                sort (nk, Uip + k1, Ulen + k1, LUbx [block], Tp, Ti, Tx, W) ;
+            }
+        }
+    }
+
+    PRINTF (("\n======================= sort done.\n")) ;
+
+    /* free workspace */
+    KLU_free (W, maxblock, sizeof (Int), Common) ;
+    KLU_free (Tp, m1, sizeof (Int), Common) ;
+    KLU_free (Ti, nz, sizeof (Int), Common) ;
+    KLU_free (Tx, nz, sizeof (Entry), Common) ;
+    return (Common->status == KLU_OK) ;
+}
diff --git a/src/maths/KLU/klu_tsolve.c b/src/maths/KLU/klu_tsolve.c
new file mode 100644
index 000000000..c1f10f708
--- /dev/null
+++ b/src/maths/KLU/klu_tsolve.c
@@ -0,0 +1,465 @@
+/* ========================================================================== */
+/* === KLU_tsolve =========================================================== */
+/* ========================================================================== */
+
+/* Solve A'x=b using the symbolic and numeric objects from KLU_analyze
+ * (or KLU_analyze_given) and KLU_factor.  Note that no iterative refinement is
+ * performed.  Uses Numeric->Xwork as workspace (undefined on input and output),
+ * of size 4n Entry's (note that columns 2 to 4 of Xwork overlap with
+ * Numeric->Iwork).
+ */
+
+#include "klu_internal.h"
+
+Int KLU_tsolve
+(
+    /* inputs, not modified */
+    KLU_symbolic *Symbolic,
+    KLU_numeric *Numeric,
+    Int d,                  /* leading dimension of B */
+    Int nrhs,               /* number of right-hand-sides */
+
+    /* right-hand-side on input, overwritten with solution to Ax=b on output */
+    double B [ ],           /* size n*nrhs, in column-oriented form, with
+                             * leading dimension d. */
+#ifdef COMPLEX
+    Int conj_solve,         /* TRUE for conjugate transpose solve, FALSE for
+                             * array transpose solve.  Used for the complex
+                             * case only. */
+#endif
+    /* --------------- */
+    KLU_common *Common
+)
+{
+    Entry x [4], offik, s ;
+    double rs, *Rs ;
+    Entry *Offx, *X, *Bz, *Udiag ;
+    Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
+    Unit **LUbx ;
+    Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
+
+    /* ---------------------------------------------------------------------- */
+    /* check inputs */
+    /* ---------------------------------------------------------------------- */
+
+    if (Common == NULL)
+    {
+        return (FALSE) ;
+    }
+    if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
+        B == NULL)
+    {
+        Common->status = KLU_INVALID ;
+        return (FALSE) ;
+    }
+    Common->status = KLU_OK ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get the contents of the Symbolic object */
+    /* ---------------------------------------------------------------------- */
+
+    Bz = (Entry *) B ;
+    n = Symbolic->n ;
+    nblocks = Symbolic->nblocks ;
+    Q = Symbolic->Q ;
+    R = Symbolic->R ;
+
+    /* ---------------------------------------------------------------------- */
+    /* get the contents of the Numeric object */
+    /* ---------------------------------------------------------------------- */
+
+    ASSERT (nblocks == Numeric->nblocks) ;
+    Pnum = Numeric->Pnum ;
+    Offp = Numeric->Offp ;
+    Offi = Numeric->Offi ;
+    Offx = (Entry *) Numeric->Offx ;
+
+    Lip  = Numeric->Lip ;
+    Llen = Numeric->Llen ;
+    Uip  = Numeric->Uip ;
+    Ulen = Numeric->Ulen ;
+    LUbx = (Unit **) Numeric->LUbx ;
+    Udiag = Numeric->Udiag ;
+
+    Rs = Numeric->Rs ;
+    X = (Entry *) Numeric->Xwork ;
+    ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
+
+    /* ---------------------------------------------------------------------- */
+    /* solve in chunks of 4 columns at a time */
+    /* ---------------------------------------------------------------------- */
+
+    for (chunk = 0 ; chunk < nrhs ; chunk += 4)
+    {
+
+        /* ------------------------------------------------------------------ */
+        /* get the size of the current chunk */
+        /* ------------------------------------------------------------------ */
+
+        nr = MIN (nrhs - chunk, 4) ;
+
+        /* ------------------------------------------------------------------ */
+        /* permute the right hand side, X = Q'*B */
+        /* ------------------------------------------------------------------ */
+
+        switch (nr)
+        {
+
+            case 1:
+
+                for (k = 0 ; k < n ; k++)
+                {
+                    X [k] = Bz  [Q [k]] ;
+                }
+                break ;
+
+            case 2:
+
+                for (k = 0 ; k < n ; k++)
+                {
+                    i = Q [k] ;
+                    X [2*k    ] = Bz [i      ] ;
+                    X [2*k + 1] = Bz [i + d  ] ;
+                }
+                break ;
+
+            case 3:
+
+                for (k = 0 ; k < n ; k++)
+                {
+                    i = Q [k] ;
+                    X [3*k    ] = Bz [i      ] ;
+                    X [3*k + 1] = Bz [i + d  ] ;
+                    X [3*k + 2] = Bz [i + d*2] ;
+                }
+                break ;
+
+            case 4:
+
+                for (k = 0 ; k < n ; k++)
+                {
+                    i = Q [k] ;
+                    X [4*k    ] = Bz [i      ] ;
+                    X [4*k + 1] = Bz [i + d  ] ;
+                    X [4*k + 2] = Bz [i + d*2] ;
+                    X [4*k + 3] = Bz [i + d*3] ;
+                }
+                break ;
+
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* solve X = (L*U + Off)'\X */
+        /* ------------------------------------------------------------------ */
+
+        for (block = 0 ; block < nblocks ; block++)
+        {
+
+            /* -------------------------------------------------------------- */
+            /* the block of size nk is from rows/columns k1 to k2-1 */
+            /* -------------------------------------------------------------- */
+
+            k1 = R [block] ;
+            k2 = R [block+1] ;
+            nk = k2 - k1 ;
+            PRINTF (("tsolve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
+
+            /* -------------------------------------------------------------- */
+            /* block back-substitution for the off-diagonal-block entries */
+            /* -------------------------------------------------------------- */
+
+            if (block > 0)
+            {
+                switch (nr)
+                    {
+
+                    case 1:
+
+                        for (k = k1 ; k < k2 ; k++)
+                        {
+                            pend = Offp [k+1] ;
+                            for (p = Offp [k] ; p < pend ; p++)
+                            {
+#ifdef COMPLEX
+                                if (conj_solve)
+                                {
+                                    MULT_SUB_CONJ (X [k], X [Offi [p]],
+                                            Offx [p]) ;
+                                }
+                                else
+#endif
+                                {
+                                    MULT_SUB (X [k], Offx [p], X [Offi [p]]) ;
+                                }
+                            }
+                        }
+                        break ;
+
+                    case 2:
+
+                        for (k = k1 ; k < k2 ; k++)
+                        {
+                            pend = Offp [k+1] ;
+                            x [0] = X [2*k    ] ;
+                            x [1] = X [2*k + 1] ;
+                            for (p = Offp [k] ; p < pend ; p++)
+                            {
+                                i = Offi [p] ;
+#ifdef COMPLEX
+                                if (conj_solve)
+                                {
+                                    CONJ (offik, Offx [p]) ;
+                                }
+                                else
+#endif
+                                {
+                                    offik = Offx [p] ;
+                                }
+                                MULT_SUB (x [0], offik, X [2*i]) ;
+                                MULT_SUB (x [1], offik, X [2*i + 1]) ;
+                            }
+                            X [2*k    ] = x [0] ;
+                            X [2*k + 1] = x [1] ;
+                        }
+                        break ;
+
+                    case 3:
+
+                        for (k = k1 ; k < k2 ; k++)
+                        {
+                            pend = Offp [k+1] ;
+                            x [0] = X [3*k    ] ;
+                            x [1] = X [3*k + 1] ;
+                            x [2] = X [3*k + 2] ;
+                            for (p = Offp [k] ; p < pend ; p++)
+                            {
+                                i = Offi [p] ;
+#ifdef COMPLEX
+                                if (conj_solve)
+                                {
+                                    CONJ (offik, Offx [p]) ;
+                                }
+                                else
+#endif
+                                {
+                                    offik = Offx [p] ;
+                                }
+                                MULT_SUB (x [0], offik, X [3*i]) ;
+                                MULT_SUB (x [1], offik, X [3*i + 1]) ;
+                                MULT_SUB (x [2], offik, X [3*i + 2]) ;
+                            }
+                            X [3*k    ] = x [0] ;
+                            X [3*k + 1] = x [1] ;
+                            X [3*k + 2] = x [2] ;
+                        }
+                        break ;
+
+                    case 4:
+
+                        for (k = k1 ; k < k2 ; k++)
+                        {
+                            pend = Offp [k+1] ;
+                            x [0] = X [4*k    ] ;
+                            x [1] = X [4*k + 1] ;
+                            x [2] = X [4*k + 2] ;
+                            x [3] = X [4*k + 3] ;
+                            for (p = Offp [k] ; p < pend ; p++)
+                            {
+                                i = Offi [p] ;
+#ifdef COMPLEX
+                                if (conj_solve)
+                                {
+                                    CONJ(offik, Offx [p]) ;
+                                }
+                                else
+#endif
+                                {
+                                    offik = Offx [p] ;
+                                }
+                                MULT_SUB (x [0], offik, X [4*i]) ;
+                                MULT_SUB (x [1], offik, X [4*i + 1]) ;
+                                MULT_SUB (x [2], offik, X [4*i + 2]) ;
+                                MULT_SUB (x [3], offik, X [4*i + 3]) ;
+                            }
+                            X [4*k    ] = x [0] ;
+                            X [4*k + 1] = x [1] ;
+                            X [4*k + 2] = x [2] ;
+                            X [4*k + 3] = x [3] ;
+                        }
+                        break ;
+                    }
+            }
+
+            /* -------------------------------------------------------------- */
+            /* solve the block system */
+            /* -------------------------------------------------------------- */
+
+            if (nk == 1)
+            {
+#ifdef COMPLEX
+                if (conj_solve)
+                {
+                    CONJ (s, Udiag [k1]) ;
+                }
+                else
+#endif
+                {
+                    s = Udiag [k1] ;
+                }
+                switch (nr)
+                {
+
+                    case 1:
+                        DIV (X [k1], X [k1], s) ;
+                        break ;
+
+                    case 2:
+                        DIV (X [2*k1], X [2*k1], s) ;
+                        DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
+                        break ;
+
+                    case 3:
+                        DIV (X [3*k1], X [3*k1], s) ;
+                        DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
+                        DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
+                        break ;
+
+                    case 4:
+                        DIV (X [4*k1], X [4*k1], s) ;
+                        DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
+                        DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
+                        DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
+                        break ;
+
+                }
+            }
+            else
+            {
+                KLU_utsolve (nk, Uip + k1, Ulen + k1, LUbx [block],
+                        Udiag + k1, nr,
+#ifdef COMPLEX
+                        conj_solve,
+#endif
+                        X + nr*k1) ;
+                KLU_ltsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
+#ifdef COMPLEX
+                        conj_solve,
+#endif
+                        X + nr*k1) ;
+            }
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* scale and permute the result, Bz  = P'(R\X) */
+        /* ------------------------------------------------------------------ */
+
+        if (Rs == NULL)
+        {
+
+            /* no scaling */
+            switch (nr)
+            {
+
+                case 1:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        Bz  [Pnum [k]] = X [k] ;
+                    }
+                    break ;
+
+                case 2:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        Bz  [i      ] = X [2*k    ] ;
+                        Bz  [i + d  ] = X [2*k + 1] ;
+                    }
+                    break ;
+
+                case 3:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        Bz  [i      ] = X [3*k    ] ;
+                        Bz  [i + d  ] = X [3*k + 1] ;
+                        Bz  [i + d*2] = X [3*k + 2] ;
+                    }
+                    break ;
+
+                case 4:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        Bz  [i      ] = X [4*k    ] ;
+                        Bz  [i + d  ] = X [4*k + 1] ;
+                        Bz  [i + d*2] = X [4*k + 2] ;
+                        Bz  [i + d*3] = X [4*k + 3] ;
+                    }
+                    break ;
+            }
+
+        }
+        else
+        {
+
+            switch (nr)
+            {
+
+                case 1:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        SCALE_DIV_ASSIGN (Bz [Pnum [k]], X [k], Rs [k]) ;
+                    }
+                    break ;
+
+                case 2:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        rs = Rs [k] ;
+                        SCALE_DIV_ASSIGN (Bz [i], X [2*k], rs) ;
+                        SCALE_DIV_ASSIGN (Bz [i + d], X [2*k + 1], rs) ;
+                    }
+                    break ;
+
+                case 3:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        rs = Rs [k] ;
+                        SCALE_DIV_ASSIGN (Bz [i], X [3*k], rs) ;
+                        SCALE_DIV_ASSIGN (Bz [i + d], X [3*k + 1], rs) ;
+                        SCALE_DIV_ASSIGN (Bz [i + d*2], X [3*k + 2], rs) ;
+                    }
+                    break ;
+
+                case 4:
+
+                    for (k = 0 ; k < n ; k++)
+                    {
+                        i = Pnum [k] ;
+                        rs = Rs [k] ;
+                        SCALE_DIV_ASSIGN (Bz [i], X [4*k], rs) ;
+                        SCALE_DIV_ASSIGN (Bz [i + d], X [4*k + 1], rs) ;
+                        SCALE_DIV_ASSIGN (Bz [i + d*2], X [4*k + 2], rs) ;
+                        SCALE_DIV_ASSIGN (Bz [i + d*3], X [4*k + 3], rs) ;
+                    }
+                    break ;
+            }
+        }
+
+        /* ------------------------------------------------------------------ */
+        /* go to the next chunk of B */
+        /* ------------------------------------------------------------------ */
+
+        Bz  += d*4 ;
+    }
+    return (TRUE) ;
+}
diff --git a/src/maths/KLU/klu_version.h b/src/maths/KLU/klu_version.h
new file mode 100644
index 000000000..9e8812107
--- /dev/null
+++ b/src/maths/KLU/klu_version.h
@@ -0,0 +1,694 @@
+#ifndef _KLU_VERSION_H
+#define _KLU_VERSION_H
+
+#ifdef DLONG
+#define Int UF_long
+#define Int_id UF_long_id
+#define Int_MAX UF_long_max
+#else
+#define Int int
+#define Int_id "%d"
+#define Int_MAX INT_MAX
+#endif
+
+#define NPRINT  
+
+#define BYTES(type,n) (sizeof (type) * (n))
+#define CEILING(b,u)  (((b)+(u)-1) / (u))
+#define UNITS(type,n) (CEILING (BYTES (type,n), sizeof (Unit)))
+#define DUNITS(type,n) (ceil (BYTES (type, (double) n) / sizeof (Unit)))
+
+#define GET_I_POINTER(LU, Xip, Xi, k) \
+{ \
+    Xi = (Int *) (LU + Xip [k]) ; \
+}
+
+#define GET_X_POINTER(LU, Xip, Xlen, Xx, k) \
+{ \
+    Xx = (Entry *) (LU + Xip [k] + UNITS (Int, Xlen [k])) ; \
+}
+
+#define GET_POINTER(LU, Xip, Xlen, Xi, Xx, k, xlen) \
+{ \
+    Unit *xp = LU + Xip [k] ; \
+    xlen = Xlen [k] ; \
+    Xi = (Int *) xp ; \
+    Xx = (Entry *) (xp + UNITS (Int, xlen)) ; \
+}
+
+/* function names */
+#ifdef COMPLEX 
+
+#ifdef DLONG
+
+#define KLU_scale klu_zl_scale
+#define KLU_solve klu_zl_solve
+#define KLU_tsolve klu_zl_tsolve
+#define KLU_free_numeric klu_zl_free_numeric
+#define KLU_factor klu_zl_factor
+#define KLU_refactor klu_zl_refactor
+#define KLU_kernel_factor klu_zl_kernel_factor 
+#define KLU_lsolve klu_zl_lsolve
+#define KLU_ltsolve klu_zl_ltsolve
+#define KLU_usolve klu_zl_usolve
+#define KLU_utsolve klu_zl_utsolve
+#define KLU_kernel klu_zl_kernel
+#define KLU_valid klu_zl_valid
+#define KLU_valid_LU klu_zl_valid_LU
+#define KLU_sort klu_zl_sort
+#define KLU_rgrowth klu_zl_rgrowth
+#define KLU_rcond klu_zl_rcond
+#define KLU_extract klu_zl_extract
+#define KLU_condest klu_zl_condest
+#define KLU_flops klu_zl_flops
+
+#else
+
+#define KLU_scale klu_z_scale
+#define KLU_solve klu_z_solve
+#define KLU_tsolve klu_z_tsolve
+#define KLU_free_numeric klu_z_free_numeric
+#define KLU_factor klu_z_factor
+#define KLU_refactor klu_z_refactor
+#define KLU_kernel_factor klu_z_kernel_factor 
+#define KLU_lsolve klu_z_lsolve
+#define KLU_ltsolve klu_z_ltsolve
+#define KLU_usolve klu_z_usolve
+#define KLU_utsolve klu_z_utsolve
+#define KLU_kernel klu_z_kernel
+#define KLU_valid klu_z_valid
+#define KLU_valid_LU klu_z_valid_LU
+#define KLU_sort klu_z_sort
+#define KLU_rgrowth klu_z_rgrowth
+#define KLU_rcond klu_z_rcond
+#define KLU_extract klu_z_extract
+#define KLU_condest klu_z_condest
+#define KLU_flops klu_z_flops
+
+#endif
+
+#else
+
+#ifdef DLONG
+
+#define KLU_scale klu_l_scale
+#define KLU_solve klu_l_solve
+#define KLU_tsolve klu_l_tsolve
+#define KLU_free_numeric klu_l_free_numeric
+#define KLU_factor klu_l_factor
+#define KLU_refactor klu_l_refactor
+#define KLU_kernel_factor klu_l_kernel_factor 
+#define KLU_lsolve klu_l_lsolve
+#define KLU_ltsolve klu_l_ltsolve
+#define KLU_usolve klu_l_usolve
+#define KLU_utsolve klu_l_utsolve
+#define KLU_kernel klu_l_kernel
+#define KLU_valid klu_l_valid
+#define KLU_valid_LU klu_l_valid_LU
+#define KLU_sort klu_l_sort
+#define KLU_rgrowth klu_l_rgrowth
+#define KLU_rcond klu_l_rcond
+#define KLU_extract klu_l_extract
+#define KLU_condest klu_l_condest
+#define KLU_flops klu_l_flops
+
+#else
+
+#define KLU_scale klu_scale
+#define KLU_solve klu_solve
+#define KLU_tsolve klu_tsolve
+#define KLU_free_numeric klu_free_numeric
+#define KLU_factor klu_factor
+#define KLU_refactor klu_refactor
+#define KLU_kernel_factor klu_kernel_factor 
+#define KLU_lsolve klu_lsolve
+#define KLU_ltsolve klu_ltsolve
+#define KLU_usolve klu_usolve
+#define KLU_utsolve klu_utsolve
+#define KLU_kernel klu_kernel
+#define KLU_valid klu_valid
+#define KLU_valid_LU klu_valid_LU
+#define KLU_sort klu_sort
+#define KLU_rgrowth klu_rgrowth
+#define KLU_rcond klu_rcond
+#define KLU_extract klu_extract
+#define KLU_condest klu_condest
+#define KLU_flops klu_flops
+
+#endif
+
+#endif
+
+
+#ifdef DLONG
+
+#define KLU_analyze klu_l_analyze
+#define KLU_analyze_given klu_l_analyze_given
+#define KLU_alloc_symbolic klu_l_alloc_symbolic
+#define KLU_free_symbolic klu_l_free_symbolic
+#define KLU_defaults klu_l_defaults
+#define KLU_free klu_l_free
+#define KLU_malloc klu_l_malloc
+#define KLU_realloc klu_l_realloc
+#define KLU_add_size_t klu_l_add_size_t
+#define KLU_mult_size_t klu_l_mult_size_t
+
+#define KLU_symbolic klu_l_symbolic
+#define KLU_numeric klu_l_numeric
+#define KLU_common klu_l_common
+
+#define BTF_order btf_l_order
+#define BTF_strongcomp btf_l_strongcomp
+
+#define AMD_order amd_l_order
+#define COLAMD colamd_l
+#define COLAMD_recommended colamd_l_recommended
+
+#else
+
+#define KLU_analyze klu_analyze
+#define KLU_analyze_given klu_analyze_given
+#define KLU_alloc_symbolic klu_alloc_symbolic
+#define KLU_free_symbolic klu_free_symbolic
+#define KLU_defaults klu_defaults
+#define KLU_free klu_free
+#define KLU_malloc klu_malloc
+#define KLU_realloc klu_realloc
+#define KLU_add_size_t klu_add_size_t
+#define KLU_mult_size_t klu_mult_size_t
+
+#define KLU_symbolic klu_symbolic
+#define KLU_numeric klu_numeric
+#define KLU_common klu_common
+
+#define BTF_order btf_order
+#define BTF_strongcomp btf_strongcomp
+
+#define AMD_order amd_order
+#define COLAMD colamd
+#define COLAMD_recommended colamd_recommended
+
+#endif
+
+
+/* -------------------------------------------------------------------------- */
+/* Numerical relop macros for correctly handling the NaN case */
+/* -------------------------------------------------------------------------- */
+
+/*
+SCALAR_IS_NAN(x):
+    True if x is NaN.  False otherwise.  The commonly-existing isnan(x)
+    function could be used, but it's not in Kernighan & Ritchie 2nd edition
+    (ANSI C).  It may appear in <math.h>, but I'm not certain about
+    portability.  The expression x != x is true if and only if x is NaN,
+    according to the IEEE 754 floating-point standard.
+
+SCALAR_IS_ZERO(x):
+    True if x is zero.  False if x is nonzero, NaN, or +/- Inf.
+    This is (x == 0) if the compiler is IEEE 754 compliant.
+
+SCALAR_IS_NONZERO(x):
+    True if x is nonzero, NaN, or +/- Inf.  False if x zero.
+    This is (x != 0) if the compiler is IEEE 754 compliant.
+
+SCALAR_IS_LTZERO(x):
+    True if x is < zero or -Inf.  False if x is >= 0, NaN, or +Inf.
+    This is (x < 0) if the compiler is IEEE 754 compliant.
+*/
+
+/* These all work properly, according to the IEEE 754 standard ... except on */
+/* a PC with windows.  Works fine in Linux on the same PC... */
+#define SCALAR_IS_NAN(x)        ((x) != (x))
+#define SCALAR_IS_ZERO(x)       ((x) == 0.)
+#define SCALAR_IS_NONZERO(x)    ((x) != 0.)
+#define SCALAR_IS_LTZERO(x)     ((x) < 0.)
+
+
+/* scalar absolute value macro. If x is NaN, the result is NaN: */
+#define SCALAR_ABS(x) ((SCALAR_IS_LTZERO (x)) ? -(x) : (x))
+
+/* print a scalar (avoid printing "-0" for negative zero).  */
+#ifdef NPRINT
+#define PRINT_SCALAR(a)
+#else
+#define PRINT_SCALAR(a) \
+{ \
+    if (SCALAR_IS_NONZERO (a)) \
+    { \
+        PRINTF ((" (%g)", (a))) ; \
+    } \
+    else \
+    { \
+        PRINTF ((" (0)")) ; \
+    } \
+}
+#endif
+
+/* -------------------------------------------------------------------------- */
+/* Real floating-point arithmetic */
+/* -------------------------------------------------------------------------- */
+
+#ifndef COMPLEX
+
+typedef double Unit ;
+#define Entry double
+
+#define SPLIT(s)                    (1)
+#define REAL(c)                     (c)
+#define IMAG(c)                     (0.)
+#define ASSIGN(c,s1,s2,p,split)     { (c) = (s1)[p] ; }
+#define CLEAR(c)                    { (c) = 0. ; }
+#define CLEAR_AND_INCREMENT(p)      { *p++ = 0. ; }
+#define IS_NAN(a)                   SCALAR_IS_NAN (a)
+#define IS_ZERO(a)                  SCALAR_IS_ZERO (a)
+#define IS_NONZERO(a)               SCALAR_IS_NONZERO (a)
+#define SCALE_DIV(c,s)              { (c) /= (s) ; }
+#define SCALE_DIV_ASSIGN(a,c,s)     { a = c / s ; }
+#define SCALE(c,s)                  { (c) *= (s) ; }
+#define ASSEMBLE(c,a)               { (c) += (a) ; }
+#define ASSEMBLE_AND_INCREMENT(c,p) { (c) += *p++ ; }
+#define DECREMENT(c,a)              { (c) -= (a) ; }
+#define MULT(c,a,b)                 { (c) = (a) * (b) ; }
+#define MULT_CONJ(c,a,b)            { (c) = (a) * (b) ; }
+#define MULT_SUB(c,a,b)             { (c) -= (a) * (b) ; }
+#define MULT_SUB_CONJ(c,a,b)        { (c) -= (a) * (b) ; }
+#define DIV(c,a,b)                  { (c) = (a) / (b) ; }
+#define RECIPROCAL(c)               { (c) = 1.0 / (c) ; }
+#define DIV_CONJ(c,a,b)             { (c) = (a) / (b) ; }
+#define APPROX_ABS(s,a)             { (s) = SCALAR_ABS (a) ; }
+#define ABS(s,a)                    { (s) = SCALAR_ABS (a) ; }
+#define PRINT_ENTRY(a)              PRINT_SCALAR (a)
+#define CONJ(a,x)                   a = x
+
+/* for flop counts */
+#define MULTSUB_FLOPS   2.      /* c -= a*b */
+#define DIV_FLOPS       1.      /* c = a/b */
+#define ABS_FLOPS       0.      /* c = abs (a) */
+#define ASSEMBLE_FLOPS  1.      /* c += a */
+#define DECREMENT_FLOPS 1.      /* c -= a */
+#define MULT_FLOPS      1.      /* c = a*b */
+#define SCALE_FLOPS     1.      /* c = a/s */
+
+#else
+
+/* -------------------------------------------------------------------------- */
+/* Complex floating-point arithmetic */
+/* -------------------------------------------------------------------------- */
+
+/*
+    Note:  An alternative to this Double_Complex type would be to use a
+    struct { double r ; double i ; }.  The problem with that method
+    (used by the Sun Performance Library, for example) is that ANSI C provides
+    no guarantee about the layout of a struct.  It is possible that the sizeof
+    the struct above would be greater than 2 * sizeof (double).  This would
+    mean that the complex BLAS could not be used.  The method used here avoids
+    that possibility.  ANSI C *does* guarantee that an array of structs has
+    the same size as n times the size of one struct.
+
+    The ANSI C99 version of the C language includes a "double _Complex" type.
+    It should be possible in that case to do the following:
+
+    #define Entry double _Complex
+
+    and remove the Double_Complex struct.  The macros, below, could then be
+    replaced with instrinsic operators.  Note that the #define Real and
+    #define Imag should also be removed (they only appear in this file).
+
+    For the MULT, MULT_SUB, MULT_SUB_CONJ, and MULT_CONJ macros,
+    the output argument c cannot be the same as any input argument.
+
+*/
+
+typedef struct
+{
+    double component [2] ;      /* real and imaginary parts */
+
+} Double_Complex ;
+
+typedef Double_Complex Unit ;
+#define Entry Double_Complex
+#define Real component [0]
+#define Imag component [1]
+
+/* for flop counts */
+#define MULTSUB_FLOPS   8.      /* c -= a*b */
+#define DIV_FLOPS       9.      /* c = a/b */
+#define ABS_FLOPS       6.      /* c = abs (a), count sqrt as one flop */
+#define ASSEMBLE_FLOPS  2.      /* c += a */
+#define DECREMENT_FLOPS 2.      /* c -= a */
+#define MULT_FLOPS      6.      /* c = a*b */
+#define SCALE_FLOPS     2.      /* c = a/s or c = a*s */
+
+/* -------------------------------------------------------------------------- */
+
+/* real part of c */
+#define REAL(c) ((c).Real)
+
+/* -------------------------------------------------------------------------- */
+
+/* imag part of c */
+#define IMAG(c) ((c).Imag)
+
+/* -------------------------------------------------------------------------- */
+
+/* Return TRUE if a complex number is in split form, FALSE if in packed form */
+#define SPLIT(sz) ((sz) != (double *) NULL)
+
+/* c = (s1) + (s2)*i, if s2 is null, then X is in "packed" format (compatible
+ * with Entry and ANSI C99 double _Complex type).  */
+/*#define ASSIGN(c,s1,s2,p,split)       \
+{ \
+    if (split) \
+    { \
+        (c).Real = (s1)[p] ; \
+        (c).Imag = (s2)[p] ; \
+    }  \
+    else \
+    { \
+        (c) = ((Entry *)(s1))[p] ; \
+    }  \
+}*/
+
+/* -------------------------------------------------------------------------- */
+#define CONJ(a, x) \
+{ \
+    a.Real = x.Real ; \
+    a.Imag = -x.Imag ; \
+}
+
+/* c = 0 */
+#define CLEAR(c) \
+{ \
+    (c).Real = 0. ; \
+    (c).Imag = 0. ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* *p++ = 0 */
+#define CLEAR_AND_INCREMENT(p) \
+{ \
+    p->Real = 0. ; \
+    p->Imag = 0. ; \
+    p++ ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* True if a == 0 */
+#define IS_ZERO(a) \
+    (SCALAR_IS_ZERO ((a).Real) && SCALAR_IS_ZERO ((a).Imag))
+
+/* -------------------------------------------------------------------------- */
+
+/* True if a is NaN */
+#define IS_NAN(a) \
+    (SCALAR_IS_NAN ((a).Real) || SCALAR_IS_NAN ((a).Imag))
+
+/* -------------------------------------------------------------------------- */
+
+/* True if a != 0 */
+#define IS_NONZERO(a) \
+    (SCALAR_IS_NONZERO ((a).Real) || SCALAR_IS_NONZERO ((a).Imag))
+
+/* -------------------------------------------------------------------------- */
+
+/* a = c/s */
+#define SCALE_DIV_ASSIGN(a,c,s) \
+{ \
+    a.Real = c.Real / s ; \
+    a.Imag = c.Imag / s ; \
+}
+
+/* c /= s */
+#define SCALE_DIV(c,s) \
+{ \
+    (c).Real /= (s) ; \
+    (c).Imag /= (s) ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* c *= s */
+#define SCALE(c,s) \
+{ \
+    (c).Real *= (s) ; \
+    (c).Imag *= (s) ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* c += a */
+#define ASSEMBLE(c,a) \
+{ \
+    (c).Real += (a).Real ; \
+    (c).Imag += (a).Imag ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* c += *p++ */
+#define ASSEMBLE_AND_INCREMENT(c,p) \
+{ \
+    (c).Real += p->Real ; \
+    (c).Imag += p->Imag ; \
+    p++ ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* c -= a */
+#define DECREMENT(c,a) \
+{ \
+    (c).Real -= (a).Real ; \
+    (c).Imag -= (a).Imag ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* c = a*b, assert because c cannot be the same as a or b */
+#define MULT(c,a,b) \
+{ \
+    ASSERT (&(c) != &(a) && &(c) != &(b)) ; \
+    (c).Real = (a).Real * (b).Real - (a).Imag * (b).Imag ; \
+    (c).Imag = (a).Imag * (b).Real + (a).Real * (b).Imag ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* c = a*conjugate(b), assert because c cannot be the same as a or b */
+#define MULT_CONJ(c,a,b) \
+{ \
+    ASSERT (&(c) != &(a) && &(c) != &(b)) ; \
+    (c).Real = (a).Real * (b).Real + (a).Imag * (b).Imag ; \
+    (c).Imag = (a).Imag * (b).Real - (a).Real * (b).Imag ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* c -= a*b, assert because c cannot be the same as a or b */
+#define MULT_SUB(c,a,b) \
+{ \
+    ASSERT (&(c) != &(a) && &(c) != &(b)) ; \
+    (c).Real -= (a).Real * (b).Real - (a).Imag * (b).Imag ; \
+    (c).Imag -= (a).Imag * (b).Real + (a).Real * (b).Imag ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* c -= a*conjugate(b), assert because c cannot be the same as a or b */
+#define MULT_SUB_CONJ(c,a,b) \
+{ \
+    ASSERT (&(c) != &(a) && &(c) != &(b)) ; \
+    (c).Real -= (a).Real * (b).Real + (a).Imag * (b).Imag ; \
+    (c).Imag -= (a).Imag * (b).Real - (a).Real * (b).Imag ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* c = a/b, be careful to avoid underflow and overflow */
+#ifdef MATHWORKS
+#define DIV(c,a,b) \
+{ \
+    (void) utDivideComplex ((a).Real, (a).Imag, (b).Real, (b).Imag, \
+        &((c).Real), &((c).Imag)) ; \
+}
+#else
+/* This uses ACM Algo 116, by R. L. Smith, 1962. */
+/* c can be the same variable as a or b. */
+/* Ignore NaN case for double relop br>=bi. */
+#define DIV(c,a,b) \
+{ \
+    double r, den, ar, ai, br, bi ; \
+    br = (b).Real ; \
+    bi = (b).Imag ; \
+    ar = (a).Real ; \
+    ai = (a).Imag ; \
+    if (SCALAR_ABS (br) >= SCALAR_ABS (bi)) \
+    { \
+        r = bi / br ; \
+        den = br + r * bi ; \
+        (c).Real = (ar + ai * r) / den ; \
+        (c).Imag = (ai - ar * r) / den ; \
+    } \
+    else \
+    { \
+        r = br / bi ; \
+        den = r * br + bi ; \
+        (c).Real = (ar * r + ai) / den ; \
+        (c).Imag = (ai * r - ar) / den ; \
+    } \
+}
+#endif
+
+/* -------------------------------------------------------------------------- */
+
+/* c = 1/c, be careful to avoid underflow and overflow */
+/* Not used if MATHWORKS is defined. */
+/* This uses ACM Algo 116, by R. L. Smith, 1962. */
+/* Ignore NaN case for double relop cr>=ci. */
+#define RECIPROCAL(c) \
+{ \
+    double r, den, cr, ci ; \
+    cr = (c).Real ; \
+    ci = (c).Imag ; \
+    if (SCALAR_ABS (cr) >= SCALAR_ABS (ci)) \
+    { \
+        r = ci / cr ; \
+        den = cr + r * ci ; \
+        (c).Real = 1.0 / den ; \
+        (c).Imag = - r / den ; \
+    } \
+    else \
+    { \
+        r = cr / ci ; \
+        den = r * cr + ci ; \
+        (c).Real = r / den ; \
+        (c).Imag = - 1.0 / den ; \
+    } \
+}
+
+
+/* -------------------------------------------------------------------------- */
+
+/* c = a/conjugate(b), be careful to avoid underflow and overflow */
+#ifdef MATHWORKS
+#define DIV_CONJ(c,a,b) \
+{ \
+    (void) utDivideComplex ((a).Real, (a).Imag, (b).Real, (-(b).Imag), \
+        &((c).Real), &((c).Imag)) ; \
+}
+#else
+/* This uses ACM Algo 116, by R. L. Smith, 1962. */
+/* c can be the same variable as a or b. */
+/* Ignore NaN case for double relop br>=bi. */
+#define DIV_CONJ(c,a,b) \
+{ \
+    double r, den, ar, ai, br, bi ; \
+    br = (b).Real ; \
+    bi = (b).Imag ; \
+    ar = (a).Real ; \
+    ai = (a).Imag ; \
+    if (SCALAR_ABS (br) >= SCALAR_ABS (bi)) \
+    { \
+        r = (-bi) / br ; \
+        den = br - r * bi ; \
+        (c).Real = (ar + ai * r) / den ; \
+        (c).Imag = (ai - ar * r) / den ; \
+    } \
+    else \
+    { \
+        r = br / (-bi) ; \
+        den =  r * br - bi; \
+        (c).Real = (ar * r + ai) / den ; \
+        (c).Imag = (ai * r - ar) / den ; \
+    } \
+}
+#endif
+
+/* -------------------------------------------------------------------------- */
+
+/* approximate absolute value, s = |r|+|i| */
+#define APPROX_ABS(s,a) \
+{ \
+    (s) = SCALAR_ABS ((a).Real) + SCALAR_ABS ((a).Imag) ; \
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* exact absolute value, s = sqrt (a.real^2 + amag^2) */
+#ifdef MATHWORKS
+#define ABS(s,a) \
+{ \
+    (s) = utFdlibm_hypot ((a).Real, (a).Imag) ; \
+}
+#else
+/* Ignore NaN case for the double relops ar>=ai and ar+ai==ar. */
+#define ABS(s,a) \
+{ \
+    double r, ar, ai ; \
+    ar = SCALAR_ABS ((a).Real) ; \
+    ai = SCALAR_ABS ((a).Imag) ; \
+    if (ar >= ai) \
+    { \
+        if (ar + ai == ar) \
+        { \
+            (s) = ar ; \
+        } \
+        else \
+        { \
+            r = ai / ar ; \
+            (s) = ar * sqrt (1.0 + r*r) ; \
+        } \
+    } \
+    else \
+    { \
+        if (ai + ar == ai) \
+        { \
+            (s) = ai ; \
+        } \
+        else \
+        { \
+            r = ar / ai ; \
+            (s) = ai * sqrt (1.0 + r*r) ; \
+        } \
+    } \
+}
+#endif
+
+/* -------------------------------------------------------------------------- */
+
+/* print an entry (avoid printing "-0" for negative zero).  */
+#ifdef NPRINT
+#define PRINT_ENTRY(a)
+#else
+#define PRINT_ENTRY(a) \
+{ \
+    if (SCALAR_IS_NONZERO ((a).Real)) \
+    { \
+        PRINTF ((" (%g", (a).Real)) ; \
+    } \
+    else \
+    { \
+        PRINTF ((" (0")) ; \
+    } \
+    if (SCALAR_IS_LTZERO ((a).Imag)) \
+    { \
+        PRINTF ((" - %gi)", -(a).Imag)) ; \
+    } \
+    else if (SCALAR_IS_ZERO ((a).Imag)) \
+    { \
+        PRINTF ((" + 0i)")) ; \
+    } \
+    else \
+    { \
+        PRINTF ((" + %gi)", (a).Imag)) ; \
+    } \
+}
+#endif
+
+/* -------------------------------------------------------------------------- */
+
+#endif  /* #ifndef COMPLEX */
+
+#endif