diff --git a/src/include/ngspice/slu_Cnames.h b/src/include/ngspice/slu_Cnames.h new file mode 100644 index 000000000..7bcd1bca8 --- /dev/null +++ b/src/include/ngspice/slu_Cnames.h @@ -0,0 +1,458 @@ +/*! @file slu_Cnames.h + * \brief Macros defining how C routines will be called + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 1, 1997 + * + * These macros define how C routines will be called. ADD_ assumes that + * they will be called by fortran, which expects C routines to have an + * underscore postfixed to the name (Suns, and the Intel expect this). + * NOCHANGE indicates that fortran will be calling, and that it expects + * the name called by fortran to be identical to that compiled by the C + * (RS6K's do this). UPCASE says it expects C routines called by fortran + * to be in all upcase (CRAY wants this). + *+ */ +#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ +#define __SUPERLU_CNAMES + + +#define ADD_ 0 +#define ADD__ 1 +#define NOCHANGE 2 +#define UPCASE 3 +#define OLD_CRAY 4 +#define C_CALL 5 + +#ifdef UpCase +#define F77_CALL_C UPCASE +#endif + +#ifdef NoChange +#define F77_CALL_C NOCHANGE +#endif + +#ifdef Add_ +#define F77_CALL_C ADD_ +#endif + +#ifdef Add__ +#define F77_CALL_C ADD__ +#endif + +#ifdef _CRAY +#define F77_CALL_C OLD_CRAY +#endif + +/* Default */ +#ifndef F77_CALL_C +#define F77_CALL_C ADD_ +#endif + + +#if (F77_CALL_C == ADD_) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * No redefinition necessary to have following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void dgemm_(...) + * + * This is the default. + */ + +#endif + +#if (F77_CALL_C == ADD__) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * for following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void dgemm__(...) + */ +/* BLAS */ +#define sswap_ sswap__ +#define saxpy_ saxpy__ +#define sasum_ sasum__ +#define isamax_ isamax__ +#define scopy_ scopy__ +#define sscal_ sscal__ +#define sger_ sger__ +#define snrm2_ snrm2__ +#define ssymv_ ssymv__ +#define sdot_ sdot__ +#define saxpy_ saxpy__ +#define ssyr2_ ssyr2__ +#define srot_ srot__ +#define sgemv_ sgemv__ +#define strsv_ strsv__ +#define sgemm_ sgemm__ +#define strsm_ strsm__ + +#define dswap_ dswap__ +#define daxpy_ daxpy__ +#define dasum_ dasum__ +#define idamax_ idamax__ +#define dcopy_ dcopy__ +#define dscal_ dscal__ +#define dger_ dger__ +#define dnrm2_ dnrm2__ +#define dsymv_ dsymv__ +#define ddot_ ddot__ +#define dsyr2_ dsyr2__ +#define drot_ drot__ +#define dgemv_ dgemv__ +#define dtrsv_ dtrsv__ +#define dgemm_ dgemm__ +#define dtrsm_ dtrsm__ + +#define cswap_ cswap__ +#define caxpy_ caxpy__ +#define scasum_ scasum__ +#define icamax_ icamax__ +#define ccopy_ ccopy__ +#define cscal_ cscal__ +#define scnrm2_ scnrm2__ +#define caxpy_ caxpy__ +#define cgemv_ cgemv__ +#define ctrsv_ ctrsv__ +#define cgemm_ cgemm__ +#define ctrsm_ ctrsm__ +#define cgerc_ cgerc__ +#define chemv_ chemv__ +#define cher2_ cher2__ + +#define zswap_ zswap__ +#define zaxpy_ zaxpy__ +#define dzasum_ dzasum__ +#define izamax_ izamax__ +#define zcopy_ zcopy__ +#define zscal_ zscal__ +#define dznrm2_ dznrm2__ +#define zaxpy_ zaxpy__ +#define zgemv_ zgemv__ +#define ztrsv_ ztrsv__ +#define zgemm_ zgemm__ +#define ztrsm_ ztrsm__ +#define zgerc_ zgerc__ +#define zhemv_ zhemv__ +#define zher2_ zher2__ + +/* LAPACK */ +#define dlamch_ dlamch__ +#define slamch_ slamch__ +#define xerbla_ xerbla__ +#define lsame_ lsame__ +#define dlacon_ dlacon__ +#define slacon_ slacon__ +#define icmax1_ icmax1__ +#define scsum1_ scsum1__ +#define clacon_ clacon__ +#define dzsum1_ dzsum1__ +#define izmax1_ izmax1__ +#define zlacon_ zlacon__ + +/* Fortran interface */ +#define c_bridge_dgssv_ c_bridge_dgssv__ +#define c_fortran_sgssv_ c_fortran_sgssv__ +#define c_fortran_dgssv_ c_fortran_dgssv__ +#define c_fortran_cgssv_ c_fortran_cgssv__ +#define c_fortran_zgssv_ c_fortran_zgssv__ +#endif + +#if (F77_CALL_C == UPCASE) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void DGEMM(...) + */ +/* BLAS */ +#define sswap_ SSWAP +#define saxpy_ SAXPY +#define sasum_ SASUM +#define isamax_ ISAMAX +#define scopy_ SCOPY +#define sscal_ SSCAL +#define sger_ SGER +#define snrm2_ SNRM2 +#define ssymv_ SSYMV +#define sdot_ SDOT +#define saxpy_ SAXPY +#define ssyr2_ SSYR2 +#define srot_ SROT +#define sgemv_ SGEMV +#define strsv_ STRSV +#define sgemm_ SGEMM +#define strsm_ STRSM + +#define dswap_ DSWAP +#define daxpy_ DAXPY +#define dasum_ DASUM +#define idamax_ IDAMAX +#define dcopy_ DCOPY +#define dscal_ DSCAL +#define dger_ DGER +#define dnrm2_ DNRM2 +#define dsymv_ DSYMV +#define ddot_ DDOT +#define dsyr2_ DSYR2 +#define drot_ DROT +#define dgemv_ DGEMV +#define dtrsv_ DTRSV +#define dgemm_ DGEMM +#define dtrsm_ DTRSM + +#define cswap_ CSWAP +#define caxpy_ CAXPY +#define scasum_ SCASUM +#define icamax_ ICAMAX +#define ccopy_ CCOPY +#define cscal_ CSCAL +#define scnrm2_ SCNRM2 +#define cgemv_ CGEMV +#define ctrsv_ CTRSV +#define cgemm_ CGEMM +#define ctrsm_ CTRSM +#define cgerc_ CGERC +#define chemv_ CHEMV +#define cher2_ CHER2 + +#define zswap_ ZSWAP +#define zaxpy_ ZAXPY +#define dzasum_ DZASUM +#define izamax_ IZAMAX +#define zcopy_ ZCOPY +#define zscal_ ZSCAL +#define dznrm2_ DZNRM2 +#define zgemv_ ZGEMV +#define ztrsv_ ZTRSV +#define zgemm_ ZGEMM +#define ztrsm_ ZTRSM +#define zgerc_ ZGERC +#define zhemv_ ZHEMV +#define zher2_ ZHER2 + +/* LAPACK */ +#define dlamch_ DLAMCH +#define slamch_ SLAMCH +#define xerbla_ XERBLA +#define lsame_ LSAME +#define dlacon_ DLACON +#define slacon_ SLACON +#define icmax1_ ICMAX1 +#define scsum1_ SCSUM1 +#define clacon_ CLACON +#define dzsum1_ DZSUM1 +#define izmax1_ IZMAX1 +#define zlacon_ ZLACON + +/* Fortran interface */ +#define c_bridge_dgssv_ C_BRIDGE_DGSSV +#define c_fortran_sgssv_ C_FORTRAN_SGSSV +#define c_fortran_dgssv_ C_FORTRAN_DGSSV +#define c_fortran_cgssv_ C_FORTRAN_CGSSV +#define c_fortran_zgssv_ C_FORTRAN_ZGSSV +#endif + + +#if (F77_CALL_C == OLD_CRAY) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void SGEMM(...) + */ +/* BLAS */ +#define sswap_ SSWAP +#define saxpy_ SAXPY +#define sasum_ SASUM +#define isamax_ ISAMAX +#define scopy_ SCOPY +#define sscal_ SSCAL +#define sger_ SGER +#define snrm2_ SNRM2 +#define ssymv_ SSYMV +#define sdot_ SDOT +#define ssyr2_ SSYR2 +#define srot_ SROT +#define sgemv_ SGEMV +#define strsv_ STRSV +#define sgemm_ SGEMM +#define strsm_ STRSM + +#define dswap_ SSWAP +#define daxpy_ SAXPY +#define dasum_ SASUM +#define idamax_ ISAMAX +#define dcopy_ SCOPY +#define dscal_ SSCAL +#define dger_ SGER +#define dnrm2_ SNRM2 +#define dsymv_ SSYMV +#define ddot_ SDOT +#define dsyr2_ SSYR2 +#define drot_ SROT +#define dgemv_ SGEMV +#define dtrsv_ STRSV +#define dgemm_ SGEMM +#define dtrsm_ STRSM + +#define cswap_ CSWAP +#define caxpy_ CAXPY +#define scasum_ SCASUM +#define icamax_ ICAMAX +#define ccopy_ CCOPY +#define cscal_ CSCAL +#define scnrm2_ SCNRM2 +#define caxpy_ CAXPY +#define cgemv_ CGEMV +#define ctrsv_ CTRSV +#define cgemm_ CGEMM +#define ctrsm_ CTRSM +#define cgerc_ CGERC +#define chemv_ CHEMV +#define cher2_ CHER2 + +#define zswap_ ZSWAP +#define zaxpy_ ZAXPY +#define dzasum_ DZASUM +#define izamax_ IZAMAX +#define zcopy_ ZCOPY +#define zscal_ ZSCAL +#define dznrm2_ DZNRM2 +#define zgemv_ ZGEMV +#define ztrsv_ ZTRSV +#define zgemm_ ZGEMM +#define ztrsm_ ZTRSM +#define zgerc_ ZGERC +#define zhemv_ ZHEMV +#define zher2_ ZHER2 + +/* LAPACK */ +#define dlamch_ DLAMCH +#define slamch_ SLAMCH +#define xerbla_ XERBLA +#define lsame_ LSAME +#define dlacon_ DLACON +#define slacon_ SLACON +#define icmax1_ ICMAX1 +#define scsum1_ SCSUM1 +#define clacon_ CLACON +#define dzsum1_ DZSUM1 +#define izmax1_ IZMAX1 +#define zlacon_ ZLACON + +/* Fortran interface */ +#define c_bridge_dgssv_ C_BRIDGE_DGSSV +#define c_fortran_sgssv_ C_FORTRAN_SGSSV +#define c_fortran_dgssv_ C_FORTRAN_DGSSV +#define c_fortran_cgssv_ C_FORTRAN_CGSSV +#define c_fortran_zgssv_ C_FORTRAN_ZGSSV +#endif + + +#if (F77_CALL_C == NOCHANGE) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * for following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void dgemm(...) + */ +/* BLAS */ +#define sswap_ sswap +#define saxpy_ saxpy +#define sasum_ sasum +#define isamax_ isamax +#define scopy_ scopy +#define sscal_ sscal +#define sger_ sger +#define snrm2_ snrm2 +#define ssymv_ ssymv +#define sdot_ sdot +#define saxpy_ saxpy +#define ssyr2_ ssyr2 +#define srot_ srot +#define sgemv_ sgemv +#define strsv_ strsv +#define sgemm_ sgemm +#define strsm_ strsm + +#define dswap_ dswap +#define daxpy_ daxpy +#define dasum_ dasum +#define idamax_ idamax +#define dcopy_ dcopy +#define dscal_ dscal +#define dger_ dger +#define dnrm2_ dnrm2 +#define dsymv_ dsymv +#define ddot_ ddot +#define dsyr2_ dsyr2 +#define drot_ drot +#define dgemv_ dgemv +#define dtrsv_ dtrsv +#define dgemm_ dgemm +#define dtrsm_ dtrsm + +#define cswap_ cswap +#define caxpy_ caxpy +#define scasum_ scasum +#define icamax_ icamax +#define ccopy_ ccopy +#define cscal_ cscal +#define scnrm2_ scnrm2 +#define cgemv_ cgemv +#define ctrsv_ ctrsv +#define cgemm_ cgemm +#define ctrsm_ ctrsm +#define cgerc_ cgerc +#define chemv_ chemv +#define cher2_ cher2 + +#define zswap_ zswap +#define zaxpy_ zaxpy +#define dzasum_ dzasum +#define izamax_ izamax +#define zcopy_ zcopy +#define zscal_ zscal +#define dznrm2_ dznrm2 +#define zgemv_ zgemv +#define ztrsv_ ztrsv +#define zgemm_ zgemm +#define ztrsm_ ztrsm +#define zgerc_ zgerc +#define zhemv_ zhemv +#define zher2_ zher2 + +/* LAPACK */ +#define dlamch_ dlamch +#define slamch_ slamch +#define xerbla_ xerbla +#define lsame_ lsame +#define dlacon_ dlacon +#define slacon_ slacon +#define icmax1_ icmax1 +#define scsum1_ scsum1 +#define clacon_ clacon +#define dzsum1_ dzsum1 +#define izmax1_ izmax1 +#define zlacon_ zlacon + +/* Fortran interface */ +#define c_bridge_dgssv_ c_bridge_dgssv +#define c_fortran_sgssv_ c_fortran_sgssv +#define c_fortran_dgssv_ c_fortran_dgssv +#define c_fortran_cgssv_ c_fortran_cgssv +#define c_fortran_zgssv_ c_fortran_zgssv +#endif + + +#endif /* __SUPERLU_CNAMES */ diff --git a/src/include/ngspice/slu_dcomplex.h b/src/include/ngspice/slu_dcomplex.h new file mode 100644 index 000000000..386ad6893 --- /dev/null +++ b/src/include/ngspice/slu_dcomplex.h @@ -0,0 +1,78 @@ + +/*! @file slu_dcomplex.h + * \brief Header file for complex operations + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Contains definitions for various complex operations. + * This header file is to be included in source files z*.c + *+ */ +#ifndef __SUPERLU_DCOMPLEX /* allow multiple inclusions */ +#define __SUPERLU_DCOMPLEX + + +#ifndef DCOMPLEX_INCLUDE +#define DCOMPLEX_INCLUDE + +typedef struct { double r, i; } doublecomplex; + + +/* Macro definitions */ + +/*! \brief Complex Addition c = a + b */ +#define z_add(c, a, b) { (c)->r = (a)->r + (b)->r; \ + (c)->i = (a)->i + (b)->i; } + +/*! \brief Complex Subtraction c = a - b */ +#define z_sub(c, a, b) { (c)->r = (a)->r - (b)->r; \ + (c)->i = (a)->i - (b)->i; } + +/*! \brief Complex-Double Multiplication */ +#define zd_mult(c, a, b) { (c)->r = (a)->r * (b); \ + (c)->i = (a)->i * (b); } + +/*! \brief Complex-Complex Multiplication */ +#define zz_mult(c, a, b) { \ + double cr, ci; \ + cr = (a)->r * (b)->r - (a)->i * (b)->i; \ + ci = (a)->i * (b)->r + (a)->r * (b)->i; \ + (c)->r = cr; \ + (c)->i = ci; \ + } + +#define zz_conj(a, b) { \ + (a)->r = (b)->r; \ + (a)->i = -((b)->i); \ + } + +/*! \brief Complex equality testing */ +#define z_eq(a, b) ( (a)->r == (b)->r && (a)->i == (b)->i ) + + +#ifdef __cplusplus +extern "C" { +#endif + +/* Prototypes for functions in dcomplex.c */ +void z_div(doublecomplex *, doublecomplex *, doublecomplex *); +double z_abs(doublecomplex *); /* exact */ +double z_abs1(doublecomplex *); /* approximate */ +void z_exp(doublecomplex *, doublecomplex *); +void d_cnjg(doublecomplex *r, doublecomplex *z); +double d_imag(doublecomplex *); +doublecomplex z_sgn(doublecomplex *); +doublecomplex z_sqrt(doublecomplex *); + + + +#ifdef __cplusplus + } +#endif + +#endif + +#endif /* __SUPERLU_DCOMPLEX */ diff --git a/src/include/ngspice/slu_ddefs.h b/src/include/ngspice/slu_ddefs.h new file mode 100644 index 000000000..194c070a7 --- /dev/null +++ b/src/include/ngspice/slu_ddefs.h @@ -0,0 +1,281 @@ + +/*! @file slu_ddefs.h + * \brief Header file for real operations + * + *
+ * -- SuperLU routine (version 4.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November, 2010
+ *
+ * Global data structures used in LU factorization -
+ *
+ * nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
+ * (xsup,supno): supno[i] is the supernode no to which i belongs;
+ * xsup(s) points to the beginning of the s-th supernode.
+ * e.g. supno 0 1 2 2 3 3 3 4 4 4 4 4 (n=12)
+ * xsup 0 1 2 4 7 12
+ * Note: dfs will be performed on supernode rep. relative to the new
+ * row pivoting ordering
+ *
+ * (xlsub,lsub): lsub[*] contains the compressed subscript of
+ * rectangular supernodes; xlsub[j] points to the starting
+ * location of the j-th column in lsub[*]. Note that xlsub
+ * is indexed by column.
+ * Storage: original row subscripts
+ *
+ * During the course of sparse LU factorization, we also use
+ * (xlsub,lsub) for the purpose of symmetric pruning. For each
+ * supernode {s,s+1,...,t=s+r} with first column s and last
+ * column t, the subscript set
+ * lsub[j], j=xlsub[s], .., xlsub[s+1]-1
+ * is the structure of column s (i.e. structure of this supernode).
+ * It is used for the storage of numerical values.
+ * Furthermore,
+ * lsub[j], j=xlsub[t], .., xlsub[t+1]-1
+ * is the structure of the last column t of this supernode.
+ * It is for the purpose of symmetric pruning. Therefore, the
+ * structural subscripts can be rearranged without making physical
+ * interchanges among the numerical values.
+ *
+ * However, if the supernode has only one column, then we
+ * only keep one set of subscripts. For any subscript interchange
+ * performed, similar interchange must be done on the numerical
+ * values.
+ *
+ * The last column structures (for pruning) will be removed
+ * after the numercial LU factorization phase.
+ *
+ * (xlusup,lusup): lusup[*] contains the numerical values of the
+ * rectangular supernodes; xlusup[j] points to the starting
+ * location of the j-th column in storage vector lusup[*]
+ * Note: xlusup is indexed by column.
+ * Each rectangular supernode is stored by column-major
+ * scheme, consistent with Fortran 2-dim array storage.
+ *
+ * (xusub,ucol,usub): ucol[*] stores the numerical values of
+ * U-columns outside the rectangular supernodes. The row
+ * subscript of nonzero ucol[k] is stored in usub[k].
+ * xusub[i] points to the starting location of column i in ucol.
+ * Storage: new row subscripts; that is subscripts of PA.
+ *
+ */
+#ifndef __SUPERLU_dSP_DEFS /* allow multiple inclusions */
+#define __SUPERLU_dSP_DEFS
+
+/*
+ * File name: dsp_defs.h
+ * Purpose: Sparse matrix types and function prototypes
+ * History:
+ */
+
+#ifdef _CRAY
+#include
+ * -- SuperLU routine (version 4.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November, 2010
+ *
+ * Global data structures used in LU factorization -
+ *
+ * nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
+ * (xsup,supno): supno[i] is the supernode no to which i belongs;
+ * xsup(s) points to the beginning of the s-th supernode.
+ * e.g. supno 0 1 2 2 3 3 3 4 4 4 4 4 (n=12)
+ * xsup 0 1 2 4 7 12
+ * Note: dfs will be performed on supernode rep. relative to the new
+ * row pivoting ordering
+ *
+ * (xlsub,lsub): lsub[*] contains the compressed subscript of
+ * rectangular supernodes; xlsub[j] points to the starting
+ * location of the j-th column in lsub[*]. Note that xlsub
+ * is indexed by column.
+ * Storage: original row subscripts
+ *
+ * During the course of sparse LU factorization, we also use
+ * (xlsub,lsub) for the purpose of symmetric pruning. For each
+ * supernode {s,s+1,...,t=s+r} with first column s and last
+ * column t, the subscript set
+ * lsub[j], j=xlsub[s], .., xlsub[s+1]-1
+ * is the structure of column s (i.e. structure of this supernode).
+ * It is used for the storage of numerical values.
+ * Furthermore,
+ * lsub[j], j=xlsub[t], .., xlsub[t+1]-1
+ * is the structure of the last column t of this supernode.
+ * It is for the purpose of symmetric pruning. Therefore, the
+ * structural subscripts can be rearranged without making physical
+ * interchanges among the numerical values.
+ *
+ * However, if the supernode has only one column, then we
+ * only keep one set of subscripts. For any subscript interchange
+ * performed, similar interchange must be done on the numerical
+ * values.
+ *
+ * The last column structures (for pruning) will be removed
+ * after the numercial LU factorization phase.
+ *
+ * (xlusup,lusup): lusup[*] contains the numerical values of the
+ * rectangular supernodes; xlusup[j] points to the starting
+ * location of the j-th column in storage vector lusup[*]
+ * Note: xlusup is indexed by column.
+ * Each rectangular supernode is stored by column-major
+ * scheme, consistent with Fortran 2-dim array storage.
+ *
+ * (xusub,ucol,usub): ucol[*] stores the numerical values of
+ * U-columns outside the rectangular supernodes. The row
+ * subscript of nonzero ucol[k] is stored in usub[k].
+ * xusub[i] points to the starting location of column i in ucol.
+ * Storage: new row subscripts; that is subscripts of PA.
+ *
+ */
+#ifndef __SUPERLU_zSP_DEFS /* allow multiple inclusions */
+#define __SUPERLU_zSP_DEFS
+
+/*
+ * File name: zsp_defs.h
+ * Purpose: Sparse matrix types and function prototypes
+ * History:
+ */
+
+#ifdef _CRAY
+#include
+ ==========================================================================
+ === colamd/symamd - a sparse matrix column ordering algorithm ============
+ ==========================================================================
+
+
+ colamd: an approximate minimum degree column ordering algorithm,
+ for LU factorization of symmetric or unsymmetric matrices,
+ QR factorization, least squares, interior point methods for
+ linear programming problems, and other related problems.
+
+ symamd: an approximate minimum degree ordering algorithm for Cholesky
+ factorization of symmetric matrices.
+
+ Purpose:
+
+ Colamd computes a permutation Q such that the Cholesky factorization of
+ (AQ)'(AQ) has less fill-in and requires fewer floating point operations
+ than A'A. This also provides a good ordering for sparse partial
+ pivoting methods, P(AQ) = LU, where Q is computed prior to numerical
+ factorization, and P is computed during numerical factorization via
+ conventional partial pivoting with row interchanges. Colamd is the
+ column ordering method used in SuperLU, part of the ScaLAPACK library.
+ It is also available as built-in function in MATLAB Version 6,
+ available from MathWorks, Inc. (http://www.mathworks.com). This
+ routine can be used in place of colmmd in MATLAB.
+
+ Symamd computes a permutation P of a symmetric matrix A such that the
+ Cholesky factorization of PAP' has less fill-in and requires fewer
+ floating point operations than A. Symamd constructs a matrix M such
+ that M'M has the same nonzero pattern of A, and then orders the columns
+ of M using colmmd. The column ordering of M is then returned as the
+ row and column ordering P of A.
+
+ Authors:
+
+ The authors of the code itself are Stefan I. Larimore and Timothy A.
+ Davis (davis@cise.ufl.edu), University of Florida. The algorithm was
+ developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+ Ng, Oak Ridge National Laboratory.
+
+ Date:
+
+ September 8, 2003. Version 2.3.
+
+ Acknowledgements:
+
+ This work was supported by the National Science Foundation, under
+ grants DMS-9504974 and DMS-9803599.
+
+ Copyright and License:
+
+ Copyright (c) 1998-2003 by the University of Florida.
+ All Rights Reserved.
+
+ THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
+
+ Permission is hereby granted to use, copy, modify, and/or distribute
+ this program, provided that the Copyright, this License, and the
+ Availability of the original version is retained on all copies and made
+ accessible to the end-user of any code or package that includes COLAMD
+ or any modified version of COLAMD.
+
+ Availability:
+
+ The colamd/symamd library is available at
+
+ http://www.cise.ufl.edu/research/sparse/colamd/
+
+ This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.c
+ file. It requires the colamd.h file. It is required by the colamdmex.c
+ and symamdmex.c files, for the MATLAB interface to colamd and symamd.
+
+ See the ChangeLog file for changes since Version 1.0.
+
+ ==========================================================================
+ === Description of user-callable routines ================================
+ ==========================================================================
+
+
+ ----------------------------------------------------------------------------
+ colamd_recommended:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ int colamd_recommended (int nnz, int n_row, int n_col) ;
+
+ or as a C macro
+
+ #include "colamd.h"
+ Alen = COLAMD_RECOMMENDED (int nnz, int n_row, int n_col) ;
+
+ Purpose:
+
+ Returns recommended value of Alen for use by colamd. Returns -1
+ if any input argument is negative. The use of this routine
+ or macro is optional. Note that the macro uses its arguments
+ more than once, so be careful for side effects, if you pass
+ expressions as arguments to COLAMD_RECOMMENDED. Not needed for
+ symamd, which dynamically allocates its own memory.
+
+ Arguments (all input arguments):
+
+ int nnz ; Number of nonzeros in the matrix A. This must
+ be the same value as p [n_col] in the call to
+ colamd - otherwise you will get a wrong value
+ of the recommended memory to use.
+
+ int n_row ; Number of rows in the matrix A.
+
+ int n_col ; Number of columns in the matrix A.
+
+ ----------------------------------------------------------------------------
+ colamd_set_defaults:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ colamd_set_defaults (double knobs [COLAMD_KNOBS]) ;
+
+ Purpose:
+
+ Sets the default parameters. The use of this routine is optional.
+
+ Arguments:
+
+ double knobs [COLAMD_KNOBS] ; Output only.
+
+ Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col)
+ entries are removed prior to ordering. Columns with more than
+ (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to
+ ordering, and placed last in the output column ordering.
+
+ Symamd: uses only knobs [COLAMD_DENSE_ROW], which is knobs [0].
+ Rows and columns with more than (knobs [COLAMD_DENSE_ROW] * n)
+ entries are removed prior to ordering, and placed last in the
+ output ordering.
+
+ COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
+ respectively, in colamd.h. Default values of these two knobs
+ are both 0.5. Currently, only knobs [0] and knobs [1] are
+ used, but future versions may use more knobs. If so, they will
+ be properly set to their defaults by the future version of
+ colamd_set_defaults, so that the code that calls colamd will
+ not need to change, assuming that you either use
+ colamd_set_defaults, or pass a (double *) NULL pointer as the
+ knobs array to colamd or symamd.
+
+ ----------------------------------------------------------------------------
+ colamd:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ int colamd (int n_row, int n_col, int Alen, int *A, int *p,
+ double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS]) ;
+
+ Purpose:
+
+ Computes a column ordering (Q) of A such that P(AQ)=LU or
+ (AQ)'AQ=LL' have less fill-in and require fewer floating point
+ operations than factorizing the unpermuted matrix A or A'A,
+ respectively.
+
+ Returns:
+
+ TRUE (1) if successful, FALSE (0) otherwise.
+
+ Arguments:
+
+ int n_row ; Input argument.
+
+ Number of rows in the matrix A.
+ Restriction: n_row >= 0.
+ Colamd returns FALSE if n_row is negative.
+
+ int n_col ; Input argument.
+
+ Number of columns in the matrix A.
+ Restriction: n_col >= 0.
+ Colamd returns FALSE if n_col is negative.
+
+ int Alen ; Input argument.
+
+ Restriction (see note):
+ Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col
+ Colamd returns FALSE if these conditions are not met.
+
+ Note: this restriction makes an modest assumption regarding
+ the size of the two typedef's structures in colamd.h.
+ We do, however, guarantee that
+
+ Alen >= colamd_recommended (nnz, n_row, n_col)
+
+ or equivalently as a C preprocessor macro:
+
+ Alen >= COLAMD_RECOMMENDED (nnz, n_row, n_col)
+
+ will be sufficient.
+
+ int A [Alen] ; Input argument, undefined on output.
+
+ A is an integer array of size Alen. Alen must be at least as
+ large as the bare minimum value given above, but this is very
+ low, and can result in excessive run time. For best
+ performance, we recommend that Alen be greater than or equal to
+ colamd_recommended (nnz, n_row, n_col), which adds
+ nnz/5 to the bare minimum value given above.
+
+ On input, the row indices of the entries in column c of the
+ matrix are held in A [(p [c]) ... (p [c+1]-1)]. The row indices
+ in a given column c need not be in ascending order, and
+ duplicate row indices may be be present. However, colamd will
+ work a little faster if both of these conditions are met
+ (Colamd puts the matrix into this format, if it finds that the
+ the conditions are not met).
+
+ The matrix is 0-based. That is, rows are in the range 0 to
+ n_row-1, and columns are in the range 0 to n_col-1. Colamd
+ returns FALSE if any row index is out of range.
+
+ The contents of A are modified during ordering, and are
+ undefined on output.
+
+ int p [n_col+1] ; Both input and output argument.
+
+ p is an integer array of size n_col+1. On input, it holds the
+ "pointers" for the column form of the matrix A. Column c of
+ the matrix A is held in A [(p [c]) ... (p [c+1]-1)]. The first
+ entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+ for all c in the range 0 to n_col-1. The value p [n_col] is
+ thus the total number of entries in the pattern of the matrix A.
+ Colamd returns FALSE if these conditions are not met.
+
+ On output, if colamd returns TRUE, the array p holds the column
+ permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is
+ the first column index in the new ordering, and p [n_col-1] is
+ the last. That is, p [k] = j means that column j of A is the
+ kth pivot column, in AQ, where k is in the range 0 to n_col-1
+ (p [0] = j means that column j of A is the first column in AQ).
+
+ If colamd returns FALSE, then no permutation is returned, and
+ p is undefined on output.
+
+ double knobs [COLAMD_KNOBS] ; Input argument.
+
+ See colamd_set_defaults for a description.
+
+ int stats [COLAMD_STATS] ; Output argument.
+
+ Statistics on the ordering, and error status.
+ See colamd.h for related definitions.
+ Colamd returns FALSE if stats is not present.
+
+ stats [0]: number of dense or empty rows ignored.
+
+ stats [1]: number of dense or empty columns ignored (and
+ ordered last in the output permutation p)
+ Note that a row can become "empty" if it
+ contains only "dense" and/or "empty" columns,
+ and similarly a column can become "empty" if it
+ only contains "dense" and/or "empty" rows.
+
+ stats [2]: number of garbage collections performed.
+ This can be excessively high if Alen is close
+ to the minimum required value.
+
+ stats [3]: status code. < 0 is an error code.
+ > 1 is a warning or notice.
+
+ 0 OK. Each column of the input matrix contained
+ row indices in increasing order, with no
+ duplicates.
+
+ 1 OK, but columns of input matrix were jumbled
+ (unsorted columns or duplicate entries). Colamd
+ had to do some extra work to sort the matrix
+ first and remove duplicate entries, but it
+ still was able to return a valid permutation
+ (return value of colamd was TRUE).
+
+ stats [4]: highest numbered column that
+ is unsorted or has duplicate
+ entries.
+ stats [5]: last seen duplicate or
+ unsorted row index.
+ stats [6]: number of duplicate or
+ unsorted row indices.
+
+ -1 A is a null pointer
+
+ -2 p is a null pointer
+
+ -3 n_row is negative
+
+ stats [4]: n_row
+
+ -4 n_col is negative
+
+ stats [4]: n_col
+
+ -5 number of nonzeros in matrix is negative
+
+ stats [4]: number of nonzeros, p [n_col]
+
+ -6 p [0] is nonzero
+
+ stats [4]: p [0]
+
+ -7 A is too small
+
+ stats [4]: required size
+ stats [5]: actual size (Alen)
+
+ -8 a column has a negative number of entries
+
+ stats [4]: column with < 0 entries
+ stats [5]: number of entries in col
+
+ -9 a row index is out of bounds
+
+ stats [4]: column with bad row index
+ stats [5]: bad row index
+ stats [6]: n_row, # of rows of matrx
+
+ -10 (unused; see symamd.c)
+
+ -999 (unused; see symamd.c)
+
+ Future versions may return more statistics in the stats array.
+
+ Example:
+
+ See http://www.cise.ufl.edu/research/sparse/colamd/example.c
+ for a complete example.
+
+ To order the columns of a 5-by-4 matrix with 11 nonzero entries in
+ the following nonzero pattern
+
+ x 0 x 0
+ x 0 x x
+ 0 x x 0
+ 0 0 x x
+ x x 0 0
+
+ with default knobs and no output statistics, do the following:
+
+ #include "colamd.h"
+ #define ALEN COLAMD_RECOMMENDED (11, 5, 4)
+ int A [ALEN] = {1, 2, 5, 3, 5, 1, 2, 3, 4, 2, 4} ;
+ int p [ ] = {0, 3, 5, 9, 11} ;
+ int stats [COLAMD_STATS] ;
+ colamd (5, 4, ALEN, A, p, (double *) NULL, stats) ;
+
+ The permutation is returned in the array p, and A is destroyed.
+
+ ----------------------------------------------------------------------------
+ symamd:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ int symamd (int n, int *A, int *p, int *perm,
+ double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS],
+ void (*allocate) (size_t, size_t), void (*release) (void *)) ;
+
+ Purpose:
+
+ The symamd routine computes an ordering P of a symmetric sparse
+ matrix A such that the Cholesky factorization PAP' = LL' remains
+ sparse. It is based on a column ordering of a matrix M constructed
+ so that the nonzero pattern of M'M is the same as A. The matrix A
+ is assumed to be symmetric; only the strictly lower triangular part
+ is accessed. You must pass your selected memory allocator (usually
+ calloc/free or mxCalloc/mxFree) to symamd, for it to allocate
+ memory for the temporary matrix M.
+
+ Returns:
+
+ TRUE (1) if successful, FALSE (0) otherwise.
+
+ Arguments:
+
+ int n ; Input argument.
+
+ Number of rows and columns in the symmetrix matrix A.
+ Restriction: n >= 0.
+ Symamd returns FALSE if n is negative.
+
+ int A [nnz] ; Input argument.
+
+ A is an integer array of size nnz, where nnz = p [n].
+
+ The row indices of the entries in column c of the matrix are
+ held in A [(p [c]) ... (p [c+1]-1)]. The row indices in a
+ given column c need not be in ascending order, and duplicate
+ row indices may be present. However, symamd will run faster
+ if the columns are in sorted order with no duplicate entries.
+
+ The matrix is 0-based. That is, rows are in the range 0 to
+ n-1, and columns are in the range 0 to n-1. Symamd
+ returns FALSE if any row index is out of range.
+
+ The contents of A are not modified.
+
+ int p [n+1] ; Input argument.
+
+ p is an integer array of size n+1. On input, it holds the
+ "pointers" for the column form of the matrix A. Column c of
+ the matrix A is held in A [(p [c]) ... (p [c+1]-1)]. The first
+ entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+ for all c in the range 0 to n-1. The value p [n] is
+ thus the total number of entries in the pattern of the matrix A.
+ Symamd returns FALSE if these conditions are not met.
+
+ The contents of p are not modified.
+
+ int perm [n+1] ; Output argument.
+
+ On output, if symamd returns TRUE, the array perm holds the
+ permutation P, where perm [0] is the first index in the new
+ ordering, and perm [n-1] is the last. That is, perm [k] = j
+ means that row and column j of A is the kth column in PAP',
+ where k is in the range 0 to n-1 (perm [0] = j means
+ that row and column j of A are the first row and column in
+ PAP'). The array is used as a workspace during the ordering,
+ which is why it must be of length n+1, not just n.
+
+ double knobs [COLAMD_KNOBS] ; Input argument.
+
+ See colamd_set_defaults for a description.
+
+ int stats [COLAMD_STATS] ; Output argument.
+
+ Statistics on the ordering, and error status.
+ See colamd.h for related definitions.
+ Symamd returns FALSE if stats is not present.
+
+ stats [0]: number of dense or empty row and columns ignored
+ (and ordered last in the output permutation
+ perm). Note that a row/column can become
+ "empty" if it contains only "dense" and/or
+ "empty" columns/rows.
+
+ stats [1]: (same as stats [0])
+
+ stats [2]: number of garbage collections performed.
+
+ stats [3]: status code. < 0 is an error code.
+ > 1 is a warning or notice.
+
+ 0 OK. Each column of the input matrix contained
+ row indices in increasing order, with no
+ duplicates.
+
+ 1 OK, but columns of input matrix were jumbled
+ (unsorted columns or duplicate entries). Symamd
+ had to do some extra work to sort the matrix
+ first and remove duplicate entries, but it
+ still was able to return a valid permutation
+ (return value of symamd was TRUE).
+
+ stats [4]: highest numbered column that
+ is unsorted or has duplicate
+ entries.
+ stats [5]: last seen duplicate or
+ unsorted row index.
+ stats [6]: number of duplicate or
+ unsorted row indices.
+
+ -1 A is a null pointer
+
+ -2 p is a null pointer
+
+ -3 (unused, see colamd.c)
+
+ -4 n is negative
+
+ stats [4]: n
+
+ -5 number of nonzeros in matrix is negative
+
+ stats [4]: # of nonzeros (p [n]).
+
+ -6 p [0] is nonzero
+
+ stats [4]: p [0]
+
+ -7 (unused)
+
+ -8 a column has a negative number of entries
+
+ stats [4]: column with < 0 entries
+ stats [5]: number of entries in col
+
+ -9 a row index is out of bounds
+
+ stats [4]: column with bad row index
+ stats [5]: bad row index
+ stats [6]: n_row, # of rows of matrx
+
+ -10 out of memory (unable to allocate temporary
+ workspace for M or count arrays using the
+ "allocate" routine passed into symamd).
+
+ -999 internal error. colamd failed to order the
+ matrix M, when it should have succeeded. This
+ indicates a bug. If this (and *only* this)
+ error code occurs, please contact the authors.
+ Don't contact the authors if you get any other
+ error code.
+
+ Future versions may return more statistics in the stats array.
+
+ void * (*allocate) (size_t, size_t)
+
+ A pointer to a function providing memory allocation. The
+ allocated memory must be returned initialized to zero. For a
+ C application, this argument should normally be a pointer to
+ calloc. For a MATLAB mexFunction, the routine mxCalloc is
+ passed instead.
+
+ void (*release) (size_t, size_t)
+
+ A pointer to a function that frees memory allocated by the
+ memory allocation routine above. For a C application, this
+ argument should normally be a pointer to free. For a MATLAB
+ mexFunction, the routine mxFree is passed instead.
+
+
+ ----------------------------------------------------------------------------
+ colamd_report:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ colamd_report (int stats [COLAMD_STATS]) ;
+
+ Purpose:
+
+ Prints the error status and statistics recorded in the stats
+ array on the standard error output (for a standard C routine)
+ or on the MATLAB output (for a mexFunction).
+
+ Arguments:
+
+ int stats [COLAMD_STATS] ; Input only. Statistics from colamd.
+
+
+ ----------------------------------------------------------------------------
+ symamd_report:
+ ----------------------------------------------------------------------------
+
+ C syntax:
+
+ #include "colamd.h"
+ symamd_report (int stats [COLAMD_STATS]) ;
+
+ Purpose:
+
+ Prints the error status and statistics recorded in the stats
+ array on the standard error output (for a standard C routine)
+ or on the MATLAB output (for a mexFunction).
+
+ Arguments:
+
+ int stats [COLAMD_STATS] ; Input only. Statistics from symamd.
+
+
+*/
+
+/* ========================================================================== */
+/* === Scaffolding code definitions ======================================== */
+/* ========================================================================== */
+
+/* Ensure that debugging is turned off: */
+#ifndef NDEBUG
+#define NDEBUG
+#endif /* NDEBUG */
+
+/*
+ Our "scaffolding code" philosophy: In our opinion, well-written library
+ code should keep its "debugging" code, and just normally have it turned off
+ by the compiler so as not to interfere with performance. This serves
+ several purposes:
+
+ (1) assertions act as comments to the reader, telling you what the code
+ expects at that point. All assertions will always be true (unless
+ there really is a bug, of course).
+
+ (2) leaving in the scaffolding code assists anyone who would like to modify
+ the code, or understand the algorithm (by reading the debugging output,
+ one can get a glimpse into what the code is doing).
+
+ (3) (gasp!) for actually finding bugs. This code has been heavily tested
+ and "should" be fully functional and bug-free ... but you never know...
+
+ To enable debugging, comment out the "#define NDEBUG" above. For a MATLAB
+ mexFunction, you will also need to modify mexopts.sh to remove the -DNDEBUG
+ definition. The code will become outrageously slow when debugging is
+ enabled. To control the level of debugging output, set an environment
+ variable D to 0 (little), 1 (some), 2, 3, or 4 (lots). When debugging,
+ you should see the following message on the standard output:
+
+ colamd: debug version, D = 1 (THIS WILL BE SLOW!)
+
+ or a similar message for symamd. If you don't, then debugging has not
+ been enabled.
+
+*/
+
+/* ========================================================================== */
+/* === Include files ======================================================== */
+/* ========================================================================== */
+
+#include "colamd.h"
+#include + ========================================================================== + === colamd/symamd prototypes and definitions ============================= + ========================================================================== + + You must include this file (colamd.h) in any routine that uses colamd, + symamd, or the related macros and definitions. + + Authors: + + The authors of the code itself are Stefan I. Larimore and Timothy A. + Davis (davis@cise.ufl.edu), University of Florida. The algorithm was + developed in collaboration with John Gilbert, Xerox PARC, and Esmond + Ng, Oak Ridge National Laboratory. + + Date: + + September 8, 2003. Version 2.3. + + Acknowledgements: + + This work was supported by the National Science Foundation, under + grants DMS-9504974 and DMS-9803599. + + Notice: + + Copyright (c) 1998-2003 by the University of Florida. + All Rights Reserved. + + THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + + Permission is hereby granted to use, copy, modify, and/or distribute + this program, provided that the Copyright, this License, and the + Availability of the original version is retained on all copies and made + accessible to the end-user of any code or package that includes COLAMD + or any modified version of COLAMD. + + Availability: + + The colamd/symamd library is available at + + http://www.cise.ufl.edu/research/sparse/colamd/ + + This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h + file. It is required by the colamd.c, colamdmex.c, and symamdmex.c + files, and by any C code that calls the routines whose prototypes are + listed below, or that uses the colamd/symamd definitions listed below. ++*/ + +#ifndef COLAMD_H +#define COLAMD_H + +/* ========================================================================== */ +/* === Include files ======================================================== */ +/* ========================================================================== */ + +#include
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+*/ + +#include
+ * Purpose: + * ======== + * Performs numeric block updates (sup-col) in topological order. + * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. + * Special processing on the supernodal portion of L\U[*,j] + * Return value: 0 - successful return + * > 0 - number of bytes allocated when run out of space + *+ */ +int +dcolumn_bmod ( + const int jcol, /* in */ + const int nseg, /* in */ + double *dense, /* in */ + double *tempv, /* working array */ + int *segrep, /* in */ + int *repfnz, /* in */ + int fpanelc, /* in -- first column in the current panel */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ + +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + double alpha, beta; + + /* krep = representative of current k-th supernode + * fsupc = first supernodal column + * nsupc = no of columns in supernode + * nsupr = no of rows in supernode (used as leading dimension) + * luptr = location of supernodal LU-block in storage + * kfnz = first nonz in the k-th supernodal segment + * no_zeros = no of leading zeros in a supernodal U-segment + */ + double ukj, ukj1, ukj2; + int luptr, luptr1, luptr2; + int fsupc, nsupc, nsupr, segsze; + int nrow; /* No of rows in the matrix of matrix-vector */ + int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; + register int lptr, kfnz, isub, irow, i; + register int no_zeros, new_next; + int ufirst, nextlu; + int fst_col; /* First column within small LU update */ + int d_fsupc; /* Distance between the first column of the current + panel and the first column of the current snode. */ + int *xsup, *supno; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + int nzlumax; + double *tempv1; + double zero = 0.0; + double one = 1.0; + double none = -1.0; + int mem_error; + flops_t *ops = stat->ops; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + nzlumax = Glu->nzlumax; + jcolp1 = jcol + 1; + jsupno = supno[jcol]; + + /* + * For each nonz supernode segment of U[*,j] in topological order + */ + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + + krep = segrep[k]; + k--; + ksupno = supno[krep]; + if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ + + fsupc = xsup[ksupno]; + fst_col = SUPERLU_MAX ( fsupc, fpanelc ); + + /* Distance from the current supernode to the current panel; + d_fsupc=0 if fsupc > fpanelc. */ + d_fsupc = fst_col - fsupc; + + luptr = xlusup[fst_col] + d_fsupc; + lptr = xlsub[fsupc] + d_fsupc; + + kfnz = repfnz[krep]; + kfnz = SUPERLU_MAX ( kfnz, fpanelc ); + + segsze = krep - kfnz + 1; + nsupc = krep - fst_col + 1; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ + nrow = nsupr - d_fsupc - nsupc; + krep_ind = lptr + nsupc - 1; + + ops[TRSV] += segsze * (segsze - 1); + ops[GEMV] += 2 * nrow * segsze; + + + /* + * Case 1: Update U-segment of size 1 -- col-col update + */ + if ( segsze == 1 ) { + ukj = dense[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + dense[irow] -= ukj*lusup[luptr]; + luptr++; + } + + } else if ( segsze <= 3 ) { + ukj = dense[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + ukj1 = dense[lsub[krep_ind - 1]]; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { /* Case 2: 2cols-col update */ + ukj -= ukj1 * lusup[luptr1]; + dense[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; + luptr1++; + dense[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] ); + } + } else { /* Case 3: 3cols-col update */ + ukj2 = dense[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + ukj1 -= ukj2 * lusup[luptr2-1]; + ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; + dense[lsub[krep_ind]] = ukj; + dense[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; + luptr1++; + luptr2++; + dense[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); + } + } + + + + } else { + /* + * Case: sup-col update + * Perform a triangular solve and block update, + * then scatter the result of sup-col update to dense + */ + + no_zeros = kfnz - fst_col; + + /* Copy U[*,j] segment from dense[*] to tempv[*] */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + tempv[i] = dense[irow]; + ++isub; + } + + /* Dense triangular solve -- start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#else + dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#endif + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + alpha = one; + beta = zero; +#ifdef _CRAY + SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#else + dgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#endif +#else + dlsolve ( nsupr, segsze, &lusup[luptr], tempv ); + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + dmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); +#endif + + + /* Scatter tempv[] into SPA dense[] as a temporary storage */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense[irow] = tempv[i]; + tempv[i] = zero; + ++isub; + } + + /* Scatter tempv1[] into SPA dense[] */ + for (i = 0; i < nrow; i++) { + irow = lsub[isub]; + dense[irow] -= tempv1[i]; + tempv1[i] = zero; + ++isub; + } + } + + } /* if jsupno ... */ + + } /* for each segment... */ + + /* + * Process the supernodal portion of L\U[*,j] + */ + nextlu = xlusup[jcol]; + fsupc = xsup[jsupno]; + + /* Copy the SPA dense into L\U[*,j] */ + new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; + while ( new_next > nzlumax ) { + if (mem_error = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) + return (mem_error); + lusup = Glu->lusup; + lsub = Glu->lsub; + } + + for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { + irow = lsub[isub]; + lusup[nextlu] = dense[irow]; + dense[irow] = zero; + ++nextlu; + } + + xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ + + /* For more updates within the panel (also within the current supernode), + * should start from the first column of the panel, or the first column + * of the supernode, whichever is bigger. There are 2 cases: + * 1) fsupc < fpanelc, then fst_col := fpanelc + * 2) fsupc >= fpanelc, then fst_col := fsupc + */ + fst_col = SUPERLU_MAX ( fsupc, fpanelc ); + + if ( fst_col < jcol ) { + + /* Distance between the current supernode and the current panel. + d_fsupc=0 if fsupc >= fpanelc. */ + d_fsupc = fst_col - fsupc; + + lptr = xlsub[fsupc] + d_fsupc; + luptr = xlusup[fst_col] + d_fsupc; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ + nsupc = jcol - fst_col; /* Excluding jcol */ + nrow = nsupr - d_fsupc - nsupc; + + /* Points to the beginning of jcol in snode L\U(jsupno) */ + ufirst = xlusup[jcol] + d_fsupc; + + ops[TRSV] += nsupc * (nsupc - 1); + ops[GEMV] += 2 * nrow * nsupc; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], + &nsupr, &lusup[ufirst], &incx ); +#else + dtrsv_( "L", "N", "U", &nsupc, &lusup[luptr], + &nsupr, &lusup[ufirst], &incx ); +#endif + + alpha = none; beta = one; /* y := beta*y + alpha*A*x */ + +#ifdef _CRAY + SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#else + dgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#endif +#else + dlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); + + dmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], + &lusup[ufirst], tempv ); + + /* Copy updates from tempv[*] into lusup[*] */ + isub = ufirst + nsupc; + for (i = 0; i < nrow; i++) { + lusup[isub] -= tempv[i]; + tempv[i] = 0.0; + ++isub; + } + +#endif + + + } /* if fst_col < jcol ... */ + + return 0; +} diff --git a/src/maths/SuperLU/dcolumn_dfs.c b/src/maths/SuperLU/dcolumn_dfs.c new file mode 100644 index 000000000..258830f58 --- /dev/null +++ b/src/maths/SuperLU/dcolumn_dfs.c @@ -0,0 +1,275 @@ + +/*! @file dcolumn_dfs.c + * \brief Performs a symbolic factorization + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+*/ + +#include
+ * Purpose + * ======= + * DCOLUMN_DFS performs a symbolic factorization on column jcol, and + * decide the supernode boundary. + * + * This routine does not use numeric values, but only use the RHS + * row indices to start the dfs. + * + * A supernode representative is the last column of a supernode. + * The nonzeros in U[*,j] are segments that end at supernodal + * representatives. The routine returns a list of such supernodal + * representatives in topological order of the dfs that generates them. + * The location of the first nonzero in each such supernodal segment + * (supernodal entry location) is also returned. + * + * Local parameters + * ================ + * nseg: no of segments in current U[*,j] + * jsuper: jsuper=EMPTY if column j does not belong to the same + * supernode as j-1. Otherwise, jsuper=nsuper. + * + * marker2: A-row --> A-row/col (0/1) + * repfnz: SuperA-col --> PA-row + * parent: SuperA-col --> SuperA-col + * xplore: SuperA-col --> index to L-structure + * + * Return value + * ============ + * 0 success; + * > 0 number of bytes allocated when run out of space. + *+ */ +int +dcolumn_dfs( + const int m, /* in - number of rows in the matrix */ + const int jcol, /* in */ + int *perm_r, /* in */ + int *nseg, /* modified - with new segments appended */ + int *lsub_col, /* in - defines the RHS vector to start the dfs */ + int *segrep, /* modified - with new segments appended */ + int *repfnz, /* modified */ + int *xprune, /* modified */ + int *marker, /* modified */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + int jcolp1, jcolm1, jsuper, nsuper, nextl; + int k, krep, krow, kmark, kperm; + int *marker2; /* Used for small panel LU */ + int fsupc; /* First column of a snode */ + int myfnz; /* First nonz column of a U-segment */ + int chperm, chmark, chrep, kchild; + int xdfs, maxdfs, kpar, oldrep; + int jptr, jm1ptr; + int ito, ifrom, istop; /* Used to compress row subscripts */ + int mem_error; + int *xsup, *supno, *lsub, *xlsub; + int nzlmax; + static int first = 1, maxsuper; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + if ( first ) { + maxsuper = sp_ienv(3); + first = 0; + } + jcolp1 = jcol + 1; + jcolm1 = jcol - 1; + nsuper = supno[jcol]; + jsuper = nsuper; + nextl = xlsub[jcol]; + marker2 = &marker[2*m]; + + + /* For each nonzero in A[*,jcol] do dfs */ + for (k = 0; lsub_col[k] != EMPTY; k++) { + + krow = lsub_col[k]; + lsub_col[k] = EMPTY; + kmark = marker2[krow]; + + /* krow was visited before, go to the next nonz */ + if ( kmark == jcol ) continue; + + /* For each unmarked nbr krow of jcol + * krow is in L: place it in structure of L[*,jcol] + */ + marker2[krow] = jcol; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + lsub[nextl++] = krow; /* krow is indexed into A */ + if ( nextl >= nzlmax ) { + if ( mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ + } else { + /* krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz[krep]; + + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > kperm ) repfnz[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz[krep] = kperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker2[kchild]; + + if ( chmark != jcol ) { /* Not reached yet */ + marker2[kchild] = jcol; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,k] */ + if ( chperm == EMPTY ) { + lsub[nextl++] = kchild; + if ( nextl >= nzlmax ) { + if ( mem_error = + dLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( chmark != jcolm1 ) jsuper = EMPTY; + } else { + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz[chrep]; + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz[chrep] = chperm; + } else { + /* Continue dfs at super-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L^t) */ + parent[krep] = oldrep; + repfnz[krep] = chperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + } /* else */ + + } /* else */ + + } /* if */ + + } /* while */ + + /* krow has no more unexplored nbrs; + * place supernode-rep krep in postorder DFS. + * backtrack dfs to its parent + */ + segrep[*nseg] = krep; + ++(*nseg); + kpar = parent[krep]; /* Pop from stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xprune[krep]; + + } while ( kpar != EMPTY ); /* Until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonzero ... */ + + /* Check to see if j belongs in the same supernode as j-1 */ + if ( jcol == 0 ) { /* Do nothing for column 0 */ + nsuper = supno[0] = 0; + } else { + fsupc = xsup[nsuper]; + jptr = xlsub[jcol]; /* Not compressed yet */ + jm1ptr = xlsub[jcolm1]; + +#ifdef T2_SUPER + if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; +#endif + /* Make sure the number of columns in a supernode doesn't + exceed threshold. */ + if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; + + /* If jcol starts a new supernode, reclaim storage space in + * lsub from the previous supernode. Note we only store + * the subscript set of the first and last columns of + * a supernode. (first for num values, last for pruning) + */ + if ( jsuper == EMPTY ) { /* starts a new supernode */ + if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */ +#ifdef CHK_COMPRESS + printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); +#endif + ito = xlsub[fsupc+1]; + xlsub[jcolm1] = ito; + istop = ito + jptr - jm1ptr; + xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */ + xlsub[jcol] = istop; + for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) + lsub[ito] = lsub[ifrom]; + nextl = ito; /* = istop + length(jcol) */ + } + nsuper++; + supno[jcol] = nsuper; + } /* if a new supernode */ + + } /* else: jcol > 0 */ + + /* Tidy up the pointers before exit */ + xsup[nsuper+1] = jcolp1; + supno[jcolp1] = nsuper; + xprune[jcol] = nextl; /* Initialize upper bound for pruning */ + xlsub[jcolp1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/dcomplex.c b/src/maths/SuperLU/dcomplex.c new file mode 100644 index 000000000..706612498 --- /dev/null +++ b/src/maths/SuperLU/dcomplex.c @@ -0,0 +1,147 @@ + +/*! @file dcomplex.c + * \brief Common arithmetic for complex type + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * This file defines common arithmetic operations for complex type. + *+ */ + +#include
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + +#include
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 30, 2009 + *+ */ + +#include
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Modified from lapack routines DGECON. + *+ */ + +/* + * File name: dgscon.c + * History: Modified from lapack routines DGECON. + */ +#include
+ * Purpose + * ======= + * + * DGSCON estimates the reciprocal of the condition number of a general + * real matrix A, in either the 1-norm or the infinity-norm, using + * the LU factorization computed by DGETRF. * + * + * An estimate is obtained for norm(inv(A)), and the reciprocal of the + * condition number is computed as + * RCOND = 1 / ( norm(A) * norm(inv(A)) ). + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * NORM (input) char* + * Specifies whether the 1-norm condition number or the + * infinity-norm condition number is required: + * = '1' or 'O': 1-norm; + * = 'I': Infinity-norm. + * + * L (input) SuperMatrix* + * The factor L from the factorization Pr*A*Pc=L*U as computed by + * dgstrf(). Use compressed row subscripts storage for supernodes, + * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. + * + * U (input) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U as computed by + * dgstrf(). Use column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. + * + * ANORM (input) double + * If NORM = '1' or 'O', the 1-norm of the original matrix A. + * If NORM = 'I', the infinity-norm of the original matrix A. + * + * RCOND (output) double* + * The reciprocal of the condition number of the matrix A, + * computed as RCOND = 1/(norm(A) * norm(inv(A))). + * + * INFO (output) int* + * = 0: successful exit + * < 0: if INFO = -i, the i-th argument had an illegal value + * + * ===================================================================== + *+ */ + +void +dgscon(char *norm, SuperMatrix *L, SuperMatrix *U, + double anorm, double *rcond, SuperLUStat_t *stat, int *info) +{ + + + /* Local variables */ + int kase, kase1, onenrm, i; + double ainvnm; + double *work; + int *iwork; + extern int drscl_(int *, double *, double *, int *); + + extern int dlacon_(int *, double *, double *, int *, double *, int *); + + + /* Test the input parameters. */ + *info = 0; + onenrm = *(unsigned char *)norm == '1' || lsame_(norm, "O"); + if (! onenrm && ! lsame_(norm, "I")) *info = -1; + else if (L->nrow < 0 || L->nrow != L->ncol || + L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU) + *info = -2; + else if (U->nrow < 0 || U->nrow != U->ncol || + U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU) + *info = -3; + if (*info != 0) { + i = -(*info); + xerbla_("dgscon", &i); + return; + } + + /* Quick return if possible */ + *rcond = 0.; + if ( L->nrow == 0 || U->nrow == 0) { + *rcond = 1.; + return; + } + + work = doubleCalloc( 3*L->nrow ); + iwork = intMalloc( L->nrow ); + + + if ( !work || !iwork ) + ABORT_SuperLU("Malloc fails for work arrays in dgscon."); + + /* Estimate the norm of inv(A). */ + ainvnm = 0.; + if ( onenrm ) kase1 = 1; + else kase1 = 2; + kase = 0; + + do { + dlacon_(&L->nrow, &work[L->nrow], &work[0], &iwork[0], &ainvnm, &kase); + + if (kase == 0) break; + + if (kase == kase1) { + /* Multiply by inv(L). */ + sp_dtrsv("L", "No trans", "Unit", L, U, &work[0], stat, info); + + /* Multiply by inv(U). */ + sp_dtrsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info); + + } else { + + /* Multiply by inv(U'). */ + sp_dtrsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info); + + /* Multiply by inv(L'). */ + sp_dtrsv("L", "Transpose", "Unit", L, U, &work[0], stat, info); + + } + + } while ( kase != 0 ); + + /* Compute the estimate of the reciprocal condition number. */ + if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm; + + SUPERLU_FREE (work); + SUPERLU_FREE (iwork); + return; + +} /* dgscon */ + diff --git a/src/maths/SuperLU/dgsequ.c b/src/maths/SuperLU/dgsequ.c new file mode 100644 index 000000000..35754187a --- /dev/null +++ b/src/maths/SuperLU/dgsequ.c @@ -0,0 +1,195 @@ + +/*! @file dgsequ.c + * \brief Computes row and column scalings + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Modified from LAPACK routine DGEEQU + *+ */ +/* + * File name: dgsequ.c + * History: Modified from LAPACK routine DGEEQU + */ +#include
+ * Purpose + * ======= + * + * DGSEQU computes row and column scalings intended to equilibrate an + * M-by-N sparse matrix A and reduce its condition number. R returns the row + * scale factors and C the column scale factors, chosen to try to make + * the largest element in each row and column of the matrix B with + * elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. + * + * R(i) and C(j) are restricted to be between SMLNUM = smallest safe + * number and BIGNUM = largest safe number. Use of these scaling + * factors is not guaranteed to reduce the condition number of A but + * works well in practice. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * A (input) SuperMatrix* + * The matrix of dimension (A->nrow, A->ncol) whose equilibration + * factors are to be computed. The type of A can be: + * Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. + * + * R (output) double*, size A->nrow + * If INFO = 0 or INFO > M, R contains the row scale factors + * for A. + * + * C (output) double*, size A->ncol + * If INFO = 0, C contains the column scale factors for A. + * + * ROWCND (output) double* + * If INFO = 0 or INFO > M, ROWCND contains the ratio of the + * smallest R(i) to the largest R(i). If ROWCND >= 0.1 and + * AMAX is neither too large nor too small, it is not worth + * scaling by R. + * + * COLCND (output) double* + * If INFO = 0, COLCND contains the ratio of the smallest + * C(i) to the largest C(i). If COLCND >= 0.1, it is not + * worth scaling by C. + * + * AMAX (output) double* + * Absolute value of largest matrix element. If AMAX is very + * close to overflow or very close to underflow, the matrix + * should be scaled. + * + * INFO (output) int* + * = 0: successful exit + * < 0: if INFO = -i, the i-th argument had an illegal value + * > 0: if INFO = i, and i is + * <= A->nrow: the i-th row of A is exactly zero + * > A->ncol: the (i-M)-th column of A is exactly zero + * + * ===================================================================== + *+ */ +void +dgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, + double *colcnd, double *amax, int *info) +{ + + + /* Local variables */ + NCformat *Astore; + double *Aval; + int i, j, irow; + double rcmin, rcmax; + double bignum, smlnum; + extern double dlamch_(char *); + + /* Test the input parameters. */ + *info = 0; + if ( A->nrow < 0 || A->ncol < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) { + i = -(*info); + xerbla_("dgsequ", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || A->ncol == 0 ) { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + + /* Get machine constants. */ + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + + /* Compute row scale factors. */ + for (i = 0; i < A->nrow; ++i) r[i] = 0.; + + /* Find the maximum element in each row. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (i = 0; i < A->nrow; ++i) { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); + } + *amax = rcmax; + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) { + *info = i + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* Compute column scale factors */ + for (j = 0; j < A->ncol; ++j) c[j] = 0.; + + /* Find the maximum element in each column, assuming the row + scalings computed above. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->ncol; ++j) { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); + } + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) { + *info = A->nrow + j + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + return; + +} /* dgsequ */ + + diff --git a/src/maths/SuperLU/dgsisx.c b/src/maths/SuperLU/dgsisx.c new file mode 100644 index 000000000..01883fd13 --- /dev/null +++ b/src/maths/SuperLU/dgsisx.c @@ -0,0 +1,727 @@ + +/*! @file dgsisx.c + * \brief Computes an approximate solutions of linear equations A*X=B or A'*X=B + * + *
+ * -- SuperLU routine (version 4.2) -- + * Lawrence Berkeley National Laboratory. + * November, 2010 + * August, 2011 + *+ */ +#include
+ * Purpose
+ * =======
+ *
+ * DGSISX computes an approximate solutions of linear equations
+ * A*X=B or A'*X=B, using the ILU factorization from dgsitrf().
+ * An estimation of the condition number is provided.
+ * The routine performs the following steps:
+ *
+ * 1. If A is stored column-wise (A->Stype = SLU_NC):
+ *
+ * 1.1. If options->Equil = YES or options->RowPerm = LargeDiag, scaling
+ * factors are computed to equilibrate the system:
+ * options->Trans = NOTRANS:
+ * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
+ * options->Trans = TRANS:
+ * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+ * options->Trans = CONJ:
+ * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+ * Whether or not the system will be equilibrated depends on the
+ * scaling of the matrix A, but if equilibration is used, A is
+ * overwritten by diag(R)*A*diag(C) and B by diag(R)*B
+ * (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans
+ * = TRANS or CONJ).
+ *
+ * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
+ * matrix that usually preserves sparsity.
+ * For more details of this step, see sp_preorder.c.
+ *
+ * 1.3. If options->Fact != FACTORED, the LU decomposition is used to
+ * factor the matrix A (after equilibration if options->Equil = YES)
+ * as Pr*A*Pc = L*U, with Pr determined by partial pivoting.
+ *
+ * 1.4. Compute the reciprocal pivot growth factor.
+ *
+ * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the
+ * routine fills a small number on the diagonal entry, that is
+ * U(i,i) = ||A(:,i)||_oo * options->ILU_FillTol ** (1 - i / n),
+ * and info will be increased by 1. The factored form of A is used
+ * to estimate the condition number of the preconditioner. If the
+ * reciprocal of the condition number is less than machine precision,
+ * info = A->ncol+1 is returned as a warning, but the routine still
+ * goes on to solve for X.
+ *
+ * 1.6. The system of equations is solved for X using the factored form
+ * of A.
+ *
+ * 1.7. options->IterRefine is not used
+ *
+ * 1.8. If equilibration was used, the matrix X is premultiplied by
+ * diag(C) (if options->Trans = NOTRANS) or diag(R)
+ * (if options->Trans = TRANS or CONJ) so that it solves the
+ * original system before equilibration.
+ *
+ * 1.9. options for ILU only
+ * 1) If options->RowPerm = LargeDiag, MC64 is used to scale and
+ * permute the matrix to an I-matrix, that is Pr*Dr*A*Dc has
+ * entries of modulus 1 on the diagonal and off-diagonal entries
+ * of modulus at most 1. If MC64 fails, dgsequ() is used to
+ * equilibrate the system.
+ * ( Default: LargeDiag )
+ * 2) options->ILU_DropTol = tau is the threshold for dropping.
+ * For L, it is used directly (for the whole row in a supernode);
+ * For U, ||A(:,i)||_oo * tau is used as the threshold
+ * for the i-th column.
+ * If a secondary dropping rule is required, tau will
+ * also be used to compute the second threshold.
+ * ( Default: 1e-4 )
+ * 3) options->ILU_FillFactor = gamma, used as the initial guess
+ * of memory growth.
+ * If a secondary dropping rule is required, it will also
+ * be used as an upper bound of the memory.
+ * ( Default: 10 )
+ * 4) options->ILU_DropRule specifies the dropping rule.
+ * Option Meaning
+ * ====== ===========
+ * DROP_BASIC: Basic dropping rule, supernodal based ILUTP(tau).
+ * DROP_PROWS: Supernodal based ILUTP(p,tau), p = gamma*nnz(A)/n.
+ * DROP_COLUMN: Variant of ILUTP(p,tau), for j-th column,
+ * p = gamma * nnz(A(:,j)).
+ * DROP_AREA: Variation of ILUTP, for j-th column, use
+ * nnz(F(:,1:j)) / nnz(A(:,1:j)) to control memory.
+ * DROP_DYNAMIC: Modify the threshold tau during factorizaion:
+ * If nnz(L(:,1:j)) / nnz(A(:,1:j)) > gamma
+ * tau_L(j) := MIN(tau_0, tau_L(j-1) * 2);
+ * Otherwise
+ * tau_L(j) := MAX(tau_0, tau_L(j-1) / 2);
+ * tau_U(j) uses the similar rule.
+ * NOTE: the thresholds used by L and U are separate.
+ * DROP_INTERP: Compute the second dropping threshold by
+ * interpolation instead of sorting (default).
+ * In this case, the actual fill ratio is not
+ * guaranteed smaller than gamma.
+ * DROP_PROWS, DROP_COLUMN and DROP_AREA are mutually exclusive.
+ * ( Default: DROP_BASIC | DROP_AREA )
+ * 5) options->ILU_Norm is the criterion of measuring the magnitude
+ * of a row in a supernode of L. ( Default is INF_NORM )
+ * options->ILU_Norm RowSize(x[1:n])
+ * ================= ===============
+ * ONE_NORM ||x||_1 / n
+ * TWO_NORM ||x||_2 / sqrt(n)
+ * INF_NORM max{|x[i]|}
+ * 6) options->ILU_MILU specifies the type of MILU's variation.
+ * = SILU: do not perform Modified ILU;
+ * = SMILU_1 (not recommended):
+ * U(i,i) := U(i,i) + sum(dropped entries);
+ * = SMILU_2:
+ * U(i,i) := U(i,i) + SGN(U(i,i)) * sum(dropped entries);
+ * = SMILU_3:
+ * U(i,i) := U(i,i) + SGN(U(i,i)) * sum(|dropped entries|);
+ * NOTE: Even SMILU_1 does not preserve the column sum because of
+ * late dropping.
+ * ( Default: SILU )
+ * 7) options->ILU_FillTol is used as the perturbation when
+ * encountering zero pivots. If some U(i,i) = 0, so that U is
+ * exactly singular, then
+ * U(i,i) := ||A(:,i)|| * options->ILU_FillTol ** (1 - i / n).
+ * ( Default: 1e-2 )
+ *
+ * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
+ * to the transpose of A:
+ *
+ * 2.1. If options->Equil = YES or options->RowPerm = LargeDiag, scaling
+ * factors are computed to equilibrate the system:
+ * options->Trans = NOTRANS:
+ * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
+ * options->Trans = TRANS:
+ * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+ * options->Trans = CONJ:
+ * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+ * Whether or not the system will be equilibrated depends on the
+ * scaling of the matrix A, but if equilibration is used, A' is
+ * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B
+ * (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
+ *
+ * 2.2. Permute columns of transpose(A) (rows of A),
+ * forming transpose(A)*Pc, where Pc is a permutation matrix that
+ * usually preserves sparsity.
+ * For more details of this step, see sp_preorder.c.
+ *
+ * 2.3. If options->Fact != FACTORED, the LU decomposition is used to
+ * factor the transpose(A) (after equilibration if
+ * options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the
+ * permutation Pr determined by partial pivoting.
+ *
+ * 2.4. Compute the reciprocal pivot growth factor.
+ *
+ * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the
+ * routine fills a small number on the diagonal entry, that is
+ * U(i,i) = ||A(:,i)||_oo * options->ILU_FillTol ** (1 - i / n).
+ * And info will be increased by 1. The factored form of A is used
+ * to estimate the condition number of the preconditioner. If the
+ * reciprocal of the condition number is less than machine precision,
+ * info = A->ncol+1 is returned as a warning, but the routine still
+ * goes on to solve for X.
+ *
+ * 2.6. The system of equations is solved for X using the factored form
+ * of transpose(A).
+ *
+ * 2.7. If options->IterRefine is not used.
+ *
+ * 2.8. If equilibration was used, the matrix X is premultiplied by
+ * diag(C) (if options->Trans = NOTRANS) or diag(R)
+ * (if options->Trans = TRANS or CONJ) so that it solves the
+ * original system before equilibration.
+ *
+ * See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ * The structure defines the input parameters to control
+ * how the LU decomposition will be performed and how the
+ * system will be solved.
+ *
+ * A (input/output) SuperMatrix*
+ * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ * of the linear equations is A->nrow. Currently, the type of A can be:
+ * Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE.
+ * In the future, more general A may be handled.
+ *
+ * On entry, If options->Fact = FACTORED and equed is not 'N',
+ * then A must have been equilibrated by the scaling factors in
+ * R and/or C.
+ * On exit, A is not modified
+ * if options->Equil = NO, or
+ * if options->Equil = YES but equed = 'N' on exit, or
+ * if options->RowPerm = NO.
+ *
+ * Otherwise, if options->Equil = YES and equed is not 'N',
+ * A is scaled as follows:
+ * If A->Stype = SLU_NC:
+ * equed = 'R': A := diag(R) * A
+ * equed = 'C': A := A * diag(C)
+ * equed = 'B': A := diag(R) * A * diag(C).
+ * If A->Stype = SLU_NR:
+ * equed = 'R': transpose(A) := diag(R) * transpose(A)
+ * equed = 'C': transpose(A) := transpose(A) * diag(C)
+ * equed = 'B': transpose(A) := diag(R) * transpose(A) * diag(C).
+ *
+ * If options->RowPerm = LargeDiag, MC64 is used to scale and permute
+ * the matrix to an I-matrix, that is A is modified as follows:
+ * P*Dr*A*Dc has entries of modulus 1 on the diagonal and
+ * off-diagonal entries of modulus at most 1. P is a permutation
+ * obtained from MC64.
+ * If MC64 fails, dgsequ() is used to equilibrate the system,
+ * and A is scaled as above, but no permutation is involved.
+ * On exit, A is restored to the orginal row numbering, so
+ * Dr*A*Dc is returned.
+ *
+ * perm_c (input/output) int*
+ * If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
+ * which defines the permutation matrix Pc; perm_c[i] = j means
+ * column i of A is in position j in A*Pc.
+ * On exit, perm_c may be overwritten by the product of the input
+ * perm_c and a permutation that postorders the elimination tree
+ * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
+ * is already in postorder.
+ *
+ * If A->Stype = SLU_NR, column permutation vector of size A->nrow,
+ * which describes permutation of columns of transpose(A)
+ * (rows of A) as described above.
+ *
+ * perm_r (input/output) int*
+ * If A->Stype = SLU_NC, row permutation vector of size A->nrow,
+ * which defines the permutation matrix Pr, and is determined
+ * by MC64 first then followed by partial pivoting.
+ * perm_r[i] = j means row i of A is in position j in Pr*A.
+ *
+ * If A->Stype = SLU_NR, permutation vector of size A->ncol, which
+ * determines permutation of rows of transpose(A)
+ * (columns of A) as described above.
+ *
+ * If options->Fact = SamePattern_SameRowPerm, the pivoting routine
+ * will try to use the input perm_r, unless a certain threshold
+ * criterion is violated. In that case, perm_r is overwritten by a
+ * new permutation determined by partial pivoting or diagonal
+ * threshold pivoting.
+ * Otherwise, perm_r is output argument.
+ *
+ * etree (input/output) int*, dimension (A->ncol)
+ * Elimination tree of Pc'*A'*A*Pc.
+ * If options->Fact != FACTORED and options->Fact != DOFACT,
+ * etree is an input argument, otherwise it is an output argument.
+ * Note: etree is a vector of parent pointers for a forest whose
+ * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ * equed (input/output) char*
+ * Specifies the form of equilibration that was done.
+ * = 'N': No equilibration.
+ * = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
+ * = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
+ * = 'B': Both row and column equilibration, i.e., A was replaced
+ * by diag(R)*A*diag(C).
+ * If options->Fact = FACTORED, equed is an input argument,
+ * otherwise it is an output argument.
+ *
+ * R (input/output) double*, dimension (A->nrow)
+ * The row scale factors for A or transpose(A).
+ * If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
+ * (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
+ * If equed = 'N' or 'C', R is not accessed.
+ * If options->Fact = FACTORED, R is an input argument,
+ * otherwise, R is output.
+ * If options->Fact = FACTORED and equed = 'R' or 'B', each element
+ * of R must be positive.
+ *
+ * C (input/output) double*, dimension (A->ncol)
+ * The column scale factors for A or transpose(A).
+ * If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
+ * (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
+ * If equed = 'N' or 'R', C is not accessed.
+ * If options->Fact = FACTORED, C is an input argument,
+ * otherwise, C is output.
+ * If options->Fact = FACTORED and equed = 'C' or 'B', each element
+ * of C must be positive.
+ *
+ * L (output) SuperMatrix*
+ * The factor L from the factorization
+ * Pr*A*Pc=L*U (if A->Stype SLU_= NC) or
+ * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR).
+ * Uses compressed row subscripts storage for supernodes, i.e.,
+ * L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
+ *
+ * U (output) SuperMatrix*
+ * The factor U from the factorization
+ * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or
+ * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR).
+ * Uses column-wise storage scheme, i.e., U has types:
+ * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
+ *
+ * work (workspace/output) void*, size (lwork) (in bytes)
+ * User supplied workspace, should be large enough
+ * to hold data structures for factors L and U.
+ * On exit, if fact is not 'F', L and U point to this array.
+ *
+ * lwork (input) int
+ * Specifies the size of work array in bytes.
+ * = 0: allocate space internally by system malloc;
+ * > 0: use user-supplied work array of length lwork in bytes,
+ * returns error if space runs out.
+ * = -1: the routine guesses the amount of space needed without
+ * performing the factorization, and returns it in
+ * mem_usage->total_needed; no other side effects.
+ *
+ * See argument 'mem_usage' for memory usage statistics.
+ *
+ * B (input/output) SuperMatrix*
+ * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
+ * On entry, the right hand side matrix.
+ * If B->ncol = 0, only LU decomposition is performed, the triangular
+ * solve is skipped.
+ * On exit,
+ * if equed = 'N', B is not modified; otherwise
+ * if A->Stype = SLU_NC:
+ * if options->Trans = NOTRANS and equed = 'R' or 'B',
+ * B is overwritten by diag(R)*B;
+ * if options->Trans = TRANS or CONJ and equed = 'C' of 'B',
+ * B is overwritten by diag(C)*B;
+ * if A->Stype = SLU_NR:
+ * if options->Trans = NOTRANS and equed = 'C' or 'B',
+ * B is overwritten by diag(C)*B;
+ * if options->Trans = TRANS or CONJ and equed = 'R' of 'B',
+ * B is overwritten by diag(R)*B.
+ *
+ * X (output) SuperMatrix*
+ * X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
+ * If info = 0 or info = A->ncol+1, X contains the solution matrix
+ * to the original system of equations. Note that A and B are modified
+ * on exit if equed is not 'N', and the solution to the equilibrated
+ * system is inv(diag(C))*X if options->Trans = NOTRANS and
+ * equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C'
+ * and equed = 'R' or 'B'.
+ *
+ * recip_pivot_growth (output) double*
+ * The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
+ * The infinity norm is used. If recip_pivot_growth is much less
+ * than 1, the stability of the LU factorization could be poor.
+ *
+ * rcond (output) double*
+ * The estimate of the reciprocal condition number of the matrix A
+ * after equilibration (if done). If rcond is less than the machine
+ * precision (in particular, if rcond = 0), the matrix is singular
+ * to working precision. This condition is indicated by a return
+ * code of info > 0.
+ *
+ * mem_usage (output) mem_usage_t*
+ * Record the memory usage statistics, consisting of following fields:
+ * - for_lu (float)
+ * The amount of space used in bytes for L\U data structures.
+ * - total_needed (float)
+ * The amount of space needed in bytes to perform factorization.
+ * - expansions (int)
+ * The number of memory expansions during the LU factorization.
+ *
+ * stat (output) SuperLUStat_t*
+ * Record the statistics on runtime and floating-point operation count.
+ * See slu_util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info (output) int*
+ * = 0: successful exit
+ * < 0: if info = -i, the i-th argument had an illegal value
+ * > 0: if info = i, and i is
+ * <= A->ncol: number of zero pivots. They are replaced by small
+ * entries due to options->ILU_FillTol.
+ * = A->ncol+1: U is nonsingular, but RCOND is less than machine
+ * precision, meaning that the matrix is singular to
+ * working precision. Nevertheless, the solution and
+ * error bounds are computed because there are a number
+ * of situations where the computed solution can be more
+ * accurate than the value of RCOND would suggest.
+ * > A->ncol+1: number of bytes allocated when memory allocation
+ * failure occurred, plus A->ncol.
+ *
+ */
+
+void
+dgsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
+ int *etree, char *equed, double *R, double *C,
+ SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
+ SuperMatrix *B, SuperMatrix *X,
+ double *recip_pivot_growth, double *rcond,
+ mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info)
+{
+
+ DNformat *Bstore, *Xstore;
+ double *Bmat, *Xmat;
+ int ldb, ldx, nrhs, n;
+ SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
+ SuperMatrix AC; /* Matrix postmultiplied by Pc */
+ int colequ, equil, nofact, notran, rowequ, permc_spec, mc64;
+ trans_t trant;
+ char norm[1];
+ int i, j, info1;
+ double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
+ int relax, panel_size;
+ double diag_pivot_thresh;
+ double t0; /* temporary time */
+ double *utime;
+
+ int *perm = NULL; /* permutation returned from MC64 */
+
+ /* External functions */
+ extern double dlangs(char *, SuperMatrix *);
+
+ Bstore = B->Store;
+ Xstore = X->Store;
+ Bmat = Bstore->nzval;
+ Xmat = Xstore->nzval;
+ ldb = Bstore->lda;
+ ldx = Xstore->lda;
+ nrhs = B->ncol;
+ n = B->nrow;
+
+ *info = 0;
+ nofact = (options->Fact != FACTORED);
+ equil = (options->Equil == YES_SuperLU);
+ notran = (options->Trans == NOTRANS);
+ mc64 = (options->RowPerm == LargeDiag);
+ if ( nofact ) {
+ *(unsigned char *)equed = 'N';
+ rowequ = FALSE;
+ colequ = FALSE;
+ } else {
+ rowequ = lsame_(equed, "R") || lsame_(equed, "B");
+ colequ = lsame_(equed, "C") || lsame_(equed, "B");
+ smlnum = dlamch_("Safe minimum");
+ bignum = 1. / smlnum;
+ }
+
+ /* Test the input parameters */
+ if (options->Fact != DOFACT && options->Fact != SamePattern &&
+ options->Fact != SamePattern_SameRowPerm &&
+ options->Fact != FACTORED &&
+ options->Trans != NOTRANS && options->Trans != TRANS &&
+ options->Trans != CONJ &&
+ options->Equil != NO_SuperLU && options->Equil != YES_SuperLU)
+ *info = -1;
+ else if ( A->nrow != A->ncol || A->nrow < 0 ||
+ (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
+ A->Dtype != SLU_D || A->Mtype != SLU_GE )
+ *info = -2;
+ else if (options->Fact == FACTORED &&
+ !(rowequ || colequ || lsame_(equed, "N")))
+ *info = -6;
+ else {
+ if (rowequ) {
+ rcmin = bignum;
+ rcmax = 0.;
+ for (j = 0; j < A->nrow; ++j) {
+ rcmin = SUPERLU_MIN(rcmin, R[j]);
+ rcmax = SUPERLU_MAX(rcmax, R[j]);
+ }
+ if (rcmin <= 0.) *info = -7;
+ else if ( A->nrow > 0)
+ rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
+ else rowcnd = 1.;
+ }
+ if (colequ && *info == 0) {
+ rcmin = bignum;
+ rcmax = 0.;
+ for (j = 0; j < A->nrow; ++j) {
+ rcmin = SUPERLU_MIN(rcmin, C[j]);
+ rcmax = SUPERLU_MAX(rcmax, C[j]);
+ }
+ if (rcmin <= 0.) *info = -8;
+ else if (A->nrow > 0)
+ colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
+ else colcnd = 1.;
+ }
+ if (*info == 0) {
+ if ( lwork < -1 ) *info = -12;
+ else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
+ B->Stype != SLU_DN || B->Dtype != SLU_D ||
+ B->Mtype != SLU_GE )
+ *info = -13;
+ else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
+ (B->ncol != 0 && B->ncol != X->ncol) ||
+ X->Stype != SLU_DN ||
+ X->Dtype != SLU_D || X->Mtype != SLU_GE )
+ *info = -14;
+ }
+ }
+ if (*info != 0) {
+ i = -(*info);
+ xerbla_("dgsisx", &i);
+ return;
+ }
+
+ /* Initialization for factor parameters */
+ panel_size = sp_ienv(1);
+ relax = sp_ienv(2);
+ diag_pivot_thresh = options->DiagPivotThresh;
+
+ utime = stat->utime;
+
+ /* Convert A to SLU_NC format when necessary. */
+ if ( A->Stype == SLU_NR ) {
+ NRformat *Astore = A->Store;
+ AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
+ dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz,
+ Astore->nzval, Astore->colind, Astore->rowptr,
+ SLU_NC, A->Dtype, A->Mtype);
+ if ( notran ) { /* Reverse the transpose argument. */
+ trant = TRANS;
+ notran = 0;
+ } else {
+ trant = NOTRANS;
+ notran = 1;
+ }
+ } else { /* A->Stype == SLU_NC */
+ trant = options->Trans;
+ AA = A;
+ }
+
+ if ( nofact ) {
+ register int i, j;
+ NCformat *Astore = AA->Store;
+ int nnz = Astore->nnz;
+ int *colptr = Astore->colptr;
+ int *rowind = Astore->rowind;
+ double *nzval = (double *)Astore->nzval;
+
+ if ( mc64 ) {
+ t0 = SuperLU_timer_();
+ if ((perm = intMalloc(n)) == NULL)
+ ABORT_SuperLU("SUPERLU_MALLOC fails for perm[]");
+
+ info1 = dldperm(5, n, nnz, colptr, rowind, nzval, perm, R, C);
+
+ if (info1 != 0) { /* MC64 fails, call dgsequ() later */
+ mc64 = 0;
+ SUPERLU_FREE(perm);
+ perm = NULL;
+ } else {
+ if ( equil ) {
+ rowequ = colequ = 1;
+ for (i = 0; i < n; i++) {
+ R[i] = exp(R[i]);
+ C[i] = exp(C[i]);
+ }
+ /* scale the matrix */
+ for (j = 0; j < n; j++) {
+ for (i = colptr[j]; i < colptr[j + 1]; i++) {
+ nzval[i] *= R[rowind[i]] * C[j];
+ }
+ }
+ *equed = 'B';
+ }
+
+ /* permute the matrix */
+ for (j = 0; j < n; j++) {
+ for (i = colptr[j]; i < colptr[j + 1]; i++) {
+ /*nzval[i] *= R[rowind[i]] * C[j];*/
+ rowind[i] = perm[rowind[i]];
+ }
+ }
+ }
+ utime[EQUIL] = SuperLU_timer_() - t0;
+ }
+
+ if ( !mc64 & equil ) { /* Only perform equilibration, no row perm */
+ t0 = SuperLU_timer_();
+ /* Compute row and column scalings to equilibrate the matrix A. */
+ dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
+
+ if ( info1 == 0 ) {
+ /* Equilibrate matrix A. */
+ dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
+ rowequ = lsame_(equed, "R") || lsame_(equed, "B");
+ colequ = lsame_(equed, "C") || lsame_(equed, "B");
+ }
+ utime[EQUIL] = SuperLU_timer_() - t0;
+ }
+ }
+
+
+ if ( nofact ) {
+
+ t0 = SuperLU_timer_();
+ /*
+ * Gnet column permutation vector perm_c[], according to permc_spec:
+ * permc_spec = NATURAL: natural ordering
+ * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
+ * permc_spec = MMD_ATA: minimum degree on structure of A'*A
+ * permc_spec = COLAMD: approximate minimum degree column ordering
+ * permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
+ */
+ permc_spec = options->ColPerm;
+ if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
+ get_perm_c(permc_spec, AA, perm_c);
+ utime[COLPERM] = SuperLU_timer_() - t0;
+
+ t0 = SuperLU_timer_();
+ sp_preorder(options, AA, perm_c, etree, &AC);
+ utime[ETREE] = SuperLU_timer_() - t0;
+
+ /* Compute the LU factorization of A*Pc. */
+ t0 = SuperLU_timer_();
+ dgsitrf(options, &AC, relax, panel_size, etree, work, lwork,
+ perm_c, perm_r, L, U, stat, info);
+ utime[FACT] = SuperLU_timer_() - t0;
+
+ if ( lwork == -1 ) {
+ mem_usage->total_needed = *info - A->ncol;
+ return;
+ }
+
+ if ( mc64 ) { /* Fold MC64's perm[] into perm_r[]. */
+ NCformat *Astore = AA->Store;
+ int nnz = Astore->nnz, *rowind = Astore->rowind;
+ int *perm_tmp, *iperm;
+ if ((perm_tmp = intMalloc(2*n)) == NULL)
+ ABORT_SuperLU("SUPERLU_MALLOC fails for perm_tmp[]");
+ iperm = perm_tmp + n;
+ for (i = 0; i < n; ++i) perm_tmp[i] = perm_r[perm[i]];
+ for (i = 0; i < n; ++i) {
+ perm_r[i] = perm_tmp[i];
+ iperm[perm[i]] = i;
+ }
+
+ /* Restore A's original row indices. */
+ for (i = 0; i < nnz; ++i) rowind[i] = iperm[rowind[i]];
+
+ SUPERLU_FREE(perm); /* MC64 permutation */
+ SUPERLU_FREE(perm_tmp);
+ }
+ }
+
+ if ( options->PivotGrowth ) {
+ if ( *info > 0 ) return;
+
+ /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
+ *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U);
+ }
+
+ if ( options->ConditionNumber ) {
+ /* Estimate the reciprocal of the condition number of A. */
+ t0 = SuperLU_timer_();
+ if ( notran ) {
+ *(unsigned char *)norm = '1';
+ } else {
+ *(unsigned char *)norm = 'I';
+ }
+ anorm = dlangs(norm, AA);
+ dgscon(norm, L, U, anorm, rcond, stat, &info1);
+ utime[RCOND] = SuperLU_timer_() - t0;
+ }
+
+ if ( nrhs > 0 ) { /* Solve the system */
+ double *rhs_work;
+
+ /* Scale and permute the right-hand side if equilibration
+ and permutation from MC64 were performed. */
+ if ( notran ) {
+ if ( rowequ ) {
+ for (j = 0; j < nrhs; ++j)
+ for (i = 0; i < n; ++i)
+ Bmat[i + j*ldb] *= R[i];
+ }
+ } else if ( colequ ) {
+ for (j = 0; j < nrhs; ++j)
+ for (i = 0; i < n; ++i) {
+ Bmat[i + j*ldb] *= C[i];
+ }
+ }
+
+ /* Compute the solution matrix X. */
+ for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */
+ for (i = 0; i < B->nrow; i++)
+ Xmat[i + j*ldx] = Bmat[i + j*ldb];
+
+ t0 = SuperLU_timer_();
+ dgstrs (trant, L, U, perm_c, perm_r, X, stat, &info1);
+ utime[SOLVE] = SuperLU_timer_() - t0;
+
+ /* Transform the solution matrix X to a solution of the original
+ system. */
+ if ( notran ) {
+ if ( colequ ) {
+ for (j = 0; j < nrhs; ++j)
+ for (i = 0; i < n; ++i) {
+ Xmat[i + j*ldx] *= C[i];
+ }
+ }
+ } else { /* transposed system */
+ if ( rowequ ) {
+ for (j = 0; j < nrhs; ++j)
+ for (i = 0; i < A->nrow; ++i) {
+ Xmat[i + j*ldx] *= R[i];
+ }
+ }
+ }
+
+ } /* end if nrhs > 0 */
+
+ if ( options->ConditionNumber ) {
+ /* The matrix is singular to working precision. */
+ if ( *rcond < dlamch_("E") && *info == 0) *info = A->ncol + 1;
+ }
+
+ if ( nofact ) {
+ ilu_dQuerySpace(L, U, mem_usage);
+ Destroy_CompCol_Permuted(&AC);
+ }
+ if ( A->Stype == SLU_NR ) {
+ Destroy_SuperMatrix_Store(AA);
+ SUPERLU_FREE(AA);
+ }
+
+}
diff --git a/src/maths/SuperLU/dgsitrf.c b/src/maths/SuperLU/dgsitrf.c
new file mode 100644
index 000000000..88a184c71
--- /dev/null
+++ b/src/maths/SuperLU/dgsitrf.c
@@ -0,0 +1,639 @@
+
+/*! @file dgsitrf.c
+ * \brief Computes an ILU factorization of a general sparse matrix
+ *
+ * + * -- SuperLU routine (version 4.1) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + * + *+ */ + +#include
+ * Purpose + * ======= + * + * DGSITRF computes an ILU factorization of a general sparse m-by-n + * matrix A using partial pivoting with row interchanges. + * The factorization has the form + * Pr * A = L * U + * where Pr is a row permutation matrix, L is lower triangular with unit + * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper + * triangular (upper trapezoidal if A->nrow < A->ncol). + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the ILU decomposition will be performed. + * + * A (input) SuperMatrix* + * Original matrix A, permuted by columns, of dimension + * (A->nrow, A->ncol). The type of A can be: + * Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. + * + * relax (input) int + * To control degree of relaxing supernodes. If the number + * of nodes (columns) in a subtree of the elimination tree is less + * than relax, this subtree is considered as one supernode, + * regardless of the row structures of those columns. + * + * panel_size (input) int + * A panel consists of at most panel_size consecutive columns. + * + * etree (input) int*, dimension (A->ncol) + * Elimination tree of A'*A. + * Note: etree is a vector of parent pointers for a forest whose + * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. + * On input, the columns of A should be permuted so that the + * etree is in a certain postorder. + * + * work (input/output) void*, size (lwork) (in bytes) + * User-supplied work space and space for the output data structures. + * Not referenced if lwork = 0; + * + * lwork (input) int + * Specifies the size of work array in bytes. + * = 0: allocate space internally by system malloc; + * > 0: use user-supplied work array of length lwork in bytes, + * returns error if space runs out. + * = -1: the routine guesses the amount of space needed without + * performing the factorization, and returns it in + * *info; no other side effects. + * + * perm_c (input) int*, dimension (A->ncol) + * Column permutation vector, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * When searching for diagonal, perm_c[*] is applied to the + * row subscripts of A, so that diagonal threshold pivoting + * can find the diagonal of A, rather than that of A*Pc. + * + * perm_r (input/output) int*, dimension (A->nrow) + * Row permutation vector which defines the permutation matrix Pr, + * perm_r[i] = j means row i of A is in position j in Pr*A. + * If options->Fact = SamePattern_SameRowPerm, the pivoting routine + * will try to use the input perm_r, unless a certain threshold + * criterion is violated. In that case, perm_r is overwritten by + * a new permutation determined by partial pivoting or diagonal + * threshold pivoting. + * Otherwise, perm_r is output argument; + * + * L (output) SuperMatrix* + * The factor L from the factorization Pr*A=L*U; use compressed row + * subscripts storage for supernodes, i.e., L has type: + * Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise + * storage scheme, i.e., U has types: Stype = SLU_NC, + * Dtype = SLU_D, Mtype = SLU_TRU. + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See slu_util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + * > 0: if info = i, and i is + * <= A->ncol: number of zero pivots. They are replaced by small + * entries according to options->ILU_FillTol. + * > A->ncol: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. If lwork = -1, it is + * the estimated amount of space needed, plus A->ncol. + * + * ====================================================================== + * + * Local Working Arrays: + * ====================== + * m = number of rows in the matrix + * n = number of columns in the matrix + * + * marker[0:3*m-1]: marker[i] = j means that node i has been + * reached when working on column j. + * Storage: relative to original row subscripts + * NOTE: There are 4 of them: + * marker/marker1 are used for panel dfs, see (ilu_)dpanel_dfs.c; + * marker2 is used for inner-factorization, see (ilu)_dcolumn_dfs.c; + * marker_relax(has its own space) is used for relaxed supernodes. + * + * parent[0:m-1]: parent vector used during dfs + * Storage: relative to new row subscripts + * + * xplore[0:m-1]: xplore[i] gives the location of the next (dfs) + * unexplored neighbor of i in lsub[*] + * + * segrep[0:nseg-1]: contains the list of supernodal representatives + * in topological order of the dfs. A supernode representative is the + * last column of a supernode. + * The maximum size of segrep[] is n. + * + * repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a + * supernodal representative r, repfnz[r] is the location of the first + * nonzero in this segment. It is also used during the dfs: repfnz[r]>0 + * indicates the supernode r has been explored. + * NOTE: There are W of them, each used for one column of a panel. + * + * panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below + * the panel diagonal. These are filled in during dpanel_dfs(), and are + * used later in the inner LU factorization within the panel. + * panel_lsub[]/dense[] pair forms the SPA data structure. + * NOTE: There are W of them. + * + * dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values; + * NOTE: there are W of them. + * + * tempv[0:*]: real temporary used for dense numeric kernels; + * The size of this array is defined by NUM_TEMPV() in slu_util.h. + * It is also used by the dropping routine ilu_ddrop_row(). + *+ */ + +void +dgsitrf(superlu_options_t *options, SuperMatrix *A, int relax, int panel_size, + int *etree, void *work, int lwork, int *perm_c, int *perm_r, + SuperMatrix *L, SuperMatrix *U, SuperLUStat_t *stat, int *info) +{ + /* Local working arrays */ + NCPformat *Astore; + int *iperm_r = NULL; /* inverse of perm_r; used when + options->Fact == SamePattern_SameRowPerm */ + int *iperm_c; /* inverse of perm_c */ + int *swap, *iswap; /* swap is used to store the row permutation + during the factorization. Initially, it is set + to iperm_c (row indeces of Pc*A*Pc'). + iswap is the inverse of swap. After the + factorization, it is equal to perm_r. */ + int *iwork; + double *dwork; + int *segrep, *repfnz, *parent, *xplore; + int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ + int *marker, *marker_relax; + double *dense, *tempv; + int *relax_end, *relax_fsupc; + double *a; + int *asub; + int *xa_begin, *xa_end; + int *xsup, *supno; + int *xlsub, *xlusup, *xusub; + int nzlumax; + double *amax; + double drop_sum; + double alpha, omega; /* used in MILU, mimicing DRIC */ + static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ + double *dwork2; /* used by the second dropping rule */ + + /* Local scalars */ + fact_t fact = options->Fact; + double diag_pivot_thresh = options->DiagPivotThresh; + double drop_tol = options->ILU_DropTol; /* tau */ + double fill_ini = options->ILU_FillTol; /* tau^hat */ + double gamma = options->ILU_FillFactor; + int drop_rule = options->ILU_DropRule; + milu_t milu = options->ILU_MILU; + double fill_tol; + int pivrow; /* pivotal row number in the original matrix A */ + int nseg1; /* no of segments in U-column above panel row jcol */ + int nseg; /* no of segments in each U-column */ + register int jcol; + register int kcol; /* end column of a relaxed snode */ + register int icol; + register int i, k, jj, new_next, iinfo; + int m, n, min_mn, jsupno, fsupc, nextlu, nextu; + int w_def; /* upper bound on panel width */ + int usepr, iperm_r_allocated = 0; + int nnzL, nnzU; + int *panel_histo = stat->panel_histo; + flops_t *ops = stat->ops; + + int last_drop;/* the last column which the dropping rules applied */ + int quota; + int nnzAj; /* number of nonzeros in A(:,1:j) */ + int nnzLj, nnzUj; + double tol_L = drop_tol, tol_U = drop_tol; + double zero = 0.0; + double one = 1.0; + + /* Executable */ + iinfo = 0; + m = A->nrow; + n = A->ncol; + min_mn = SUPERLU_MIN(m, n); + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + + /* Allocate storage common to the factor routines */ + *info = dLUMemInit(fact, work, lwork, m, n, Astore->nnz, panel_size, + gamma, L, U, &Glu, &iwork, &dwork); + if ( *info ) return; + + xsup = Glu.xsup; + supno = Glu.supno; + xlsub = Glu.xlsub; + xlusup = Glu.xlusup; + xusub = Glu.xusub; + + SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, + &repfnz, &panel_lsub, &marker_relax, &marker); + dSetRWork(m, panel_size, dwork, &dense, &tempv); + + usepr = (fact == SamePattern_SameRowPerm); + if ( usepr ) { + /* Compute the inverse of perm_r */ + iperm_r = (int *) intMalloc(m); + for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; + iperm_r_allocated = 1; + } + + iperm_c = (int *) intMalloc(n); + for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; + swap = (int *)intMalloc(n); + for (k = 0; k < n; k++) swap[k] = iperm_c[k]; + iswap = (int *)intMalloc(n); + for (k = 0; k < n; k++) iswap[k] = perm_c[k]; + amax = (double *) doubleMalloc(panel_size); + if (drop_rule & DROP_SECONDARY) + dwork2 = (double *)doubleMalloc(n); + else + dwork2 = NULL; + + nnzAj = 0; + nnzLj = 0; + nnzUj = 0; + last_drop = SUPERLU_MAX(min_mn - 2 * sp_ienv(7), (int)(min_mn * 0.95)); + alpha = pow((double)n, -1.0 / options->ILU_MILU_Dim); + + /* Identify relaxed snodes */ + relax_end = (int *) intMalloc(n); + relax_fsupc = (int *) intMalloc(n); + if ( options->SymmetricMode == YES_SuperLU ) + ilu_heap_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); + else + ilu_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); + + ifill (perm_r, m, EMPTY); + ifill (marker, m * NO_MARKER, EMPTY); + supno[0] = -1; + xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; + w_def = panel_size; + + /* Mark the rows used by relaxed supernodes */ + ifill (marker_relax, m, EMPTY); + i = mark_relax(m, relax_end, relax_fsupc, xa_begin, xa_end, + asub, marker_relax); +#if ( PRNTlevel >= 1) + printf("%d relaxed supernodes.\n", i); +#endif + + /* + * Work on one "panel" at a time. A panel is one of the following: + * (a) a relaxed supernode at the bottom of the etree, or + * (b) panel_size contiguous columns, defined by the user + */ + for (jcol = 0; jcol < min_mn; ) { + + if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ + kcol = relax_end[jcol]; /* end of the relaxed snode */ + panel_histo[kcol-jcol+1]++; + + /* Drop small rows in the previous supernode. */ + if (jcol > 0 && jcol < last_drop) { + int first = xsup[supno[jcol - 1]]; + int last = jcol - 1; + int quota; + + /* Compute the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * (m - first) / m + * (last - first + 1); + else if (drop_rule & DROP_COLUMN) { + int i; + quota = 0; + for (i = first; i <= last; i++) + quota += xa_end[i] - xa_begin[i]; + quota = gamma * quota * (m - first) / m; + } else if (drop_rule & DROP_AREA) + quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + - nnzLj; + else + quota = m * n; + fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / min_mn); + + /* Drop small rows */ + i = ilu_ddrop_row(options, first, last, tol_L, quota, &nnzLj, + &fill_tol, &Glu, tempv, dwork2, 0); + /* Reset the parameters */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + < nnzLj) + tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); + else + tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); + } + if (fill_tol < 0) iinfo -= (int)fill_tol; +#ifdef DEBUG + num_drop_L += i * (last - first + 1); +#endif + } + + /* -------------------------------------- + * Factorize the relaxed supernode(jcol:kcol) + * -------------------------------------- */ + /* Determine the union of the row structure of the snode */ + if ( (*info = ilu_dsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, + marker, &Glu)) != 0 ) + return; + + nextu = xusub[jcol]; + nextlu = xlusup[jcol]; + jsupno = supno[jcol]; + fsupc = xsup[jsupno]; + new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); + nzlumax = Glu.nzlumax; + while ( new_next > nzlumax ) { + if ((*info = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu))) + return; + } + + for (icol = jcol; icol <= kcol; icol++) { + xusub[icol+1] = nextu; + + amax[0] = 0.0; + /* Scatter into SPA dense[*] */ + for (k = xa_begin[icol]; k < xa_end[icol]; k++) { + register double tmp = fabs(a[k]); + if (tmp > amax[0]) amax[0] = tmp; + dense[asub[k]] = a[k]; + } + nnzAj += xa_end[icol] - xa_begin[icol]; + if (amax[0] == 0.0) { + amax[0] = fill_ini; +#if ( PRNTlevel >= 1) + printf("Column %d is entirely zero!\n", icol); + fflush(stdout); +#endif + } + + /* Numeric update within the snode */ + dsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); + + if (usepr) pivrow = iperm_r[icol]; + fill_tol = pow(fill_ini, 1.0 - (double)icol / (double)min_mn); + if ( (*info = ilu_dpivotL(icol, diag_pivot_thresh, &usepr, + perm_r, iperm_c[icol], swap, iswap, + marker_relax, &pivrow, + amax[0] * fill_tol, milu, zero, + &Glu, stat)) ) { + iinfo++; + marker[pivrow] = kcol; + } + + } + + jcol = kcol + 1; + + } else { /* Work on one panel of panel_size columns */ + + /* Adjust panel_size so that a panel won't overlap with the next + * relaxed snode. + */ + panel_size = w_def; + for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) + if ( relax_end[k] != EMPTY ) { + panel_size = k - jcol; + break; + } + if ( k == min_mn ) panel_size = min_mn - jcol; + panel_histo[panel_size]++; + + /* symbolic factor on a panel of columns */ + ilu_dpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, + dense, amax, panel_lsub, segrep, repfnz, + marker, parent, xplore, &Glu); + + /* numeric sup-panel updates in topological order */ + dpanel_bmod(m, panel_size, jcol, nseg1, dense, + tempv, segrep, repfnz, &Glu, stat); + + /* Sparse LU within the panel, and below panel diagonal */ + for (jj = jcol; jj < jcol + panel_size; jj++) { + + k = (jj - jcol) * m; /* column index for w-wide arrays */ + + nseg = nseg1; /* Begin after all the panel segments */ + + nnzAj += xa_end[jj] - xa_begin[jj]; + + if ((*info = ilu_dcolumn_dfs(m, jj, perm_r, &nseg, + &panel_lsub[k], segrep, &repfnz[k], + marker, parent, xplore, &Glu))) + return; + + /* Numeric updates */ + if ((*info = dcolumn_bmod(jj, (nseg - nseg1), &dense[k], + tempv, &segrep[nseg1], &repfnz[k], + jcol, &Glu, stat)) != 0) return; + + /* Make a fill-in position if the column is entirely zero */ + if (xlsub[jj + 1] == xlsub[jj]) { + register int i, row; + int nextl; + int nzlmax = Glu.nzlmax; + int *lsub = Glu.lsub; + int *marker2 = marker + 2 * m; + + /* Allocate memory */ + nextl = xlsub[jj] + 1; + if (nextl >= nzlmax) { + int error = dLUMemXpand(jj, nextl, LSUB, &nzlmax, &Glu); + if (error) { *info = error; return; } + lsub = Glu.lsub; + } + xlsub[jj + 1]++; + assert(xlusup[jj]==xlusup[jj+1]); + xlusup[jj + 1]++; + Glu.lusup[xlusup[jj]] = zero; + + /* Choose a row index (pivrow) for fill-in */ + for (i = jj; i < n; i++) + if (marker_relax[swap[i]] <= jj) break; + row = swap[i]; + marker2[row] = jj; + lsub[xlsub[jj]] = row; +#ifdef DEBUG + printf("Fill col %d.\n", jj); + fflush(stdout); +#endif + } + + /* Computer the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * jj / m; + else if (drop_rule & DROP_COLUMN) + quota = gamma * (xa_end[jj] - xa_begin[jj]) * + (jj + 1) / m; + else if (drop_rule & DROP_AREA) + quota = gamma * 0.9 * nnzAj * 0.5 - nnzUj; + else + quota = m; + + /* Copy the U-segments to ucol[*] and drop small entries */ + if ((*info = ilu_dcopy_to_ucol(jj, nseg, segrep, &repfnz[k], + perm_r, &dense[k], drop_rule, + milu, amax[jj - jcol] * tol_U, + quota, &drop_sum, &nnzUj, &Glu, + dwork2)) != 0) + return; + + /* Reset the dropping threshold if required */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * 0.9 * nnzAj * 0.5 < nnzLj) + tol_U = SUPERLU_MIN(1.0, tol_U * 2.0); + else + tol_U = SUPERLU_MAX(drop_tol, tol_U * 0.5); + } + + if (drop_sum != zero) + { + if (drop_sum > zero) + omega = SUPERLU_MIN(2.0 * (1.0 - alpha) + * amax[jj - jcol] / drop_sum, one); + else + omega = SUPERLU_MAX(2.0 * (1.0 - alpha) + * amax[jj - jcol] / drop_sum, -one); + drop_sum *= omega; + } + if (usepr) pivrow = iperm_r[jj]; + fill_tol = pow(fill_ini, 1.0 - (double)jj / (double)min_mn); + if ( (*info = ilu_dpivotL(jj, diag_pivot_thresh, &usepr, perm_r, + iperm_c[jj], swap, iswap, + marker_relax, &pivrow, + amax[jj - jcol] * fill_tol, milu, + drop_sum, &Glu, stat)) ) { + iinfo++; + marker[m + pivrow] = jj; + marker[2 * m + pivrow] = jj; + } + + /* Reset repfnz[] for this column */ + resetrep_col (nseg, segrep, &repfnz[k]); + + /* Start a new supernode, drop the previous one */ + if (jj > 0 && supno[jj] > supno[jj - 1] && jj < last_drop) { + int first = xsup[supno[jj - 1]]; + int last = jj - 1; + int quota; + + /* Compute the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * (m - first) / m + * (last - first + 1); + else if (drop_rule & DROP_COLUMN) { + int i; + quota = 0; + for (i = first; i <= last; i++) + quota += xa_end[i] - xa_begin[i]; + quota = gamma * quota * (m - first) / m; + } else if (drop_rule & DROP_AREA) + quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) + / m) - nnzLj; + else + quota = m * n; + fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / + (double)min_mn); + + /* Drop small rows */ + i = ilu_ddrop_row(options, first, last, tol_L, quota, + &nnzLj, &fill_tol, &Glu, tempv, dwork2, + 1); + + /* Reset the parameters */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + < nnzLj) + tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); + else + tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); + } + if (fill_tol < 0) iinfo -= (int)fill_tol; +#ifdef DEBUG + num_drop_L += i * (last - first + 1); +#endif + } /* if start a new supernode */ + + } /* for */ + + jcol += panel_size; /* Move to the next panel */ + + } /* else */ + + } /* for */ + + *info = iinfo; + + if ( m > n ) { + k = 0; + for (i = 0; i < m; ++i) + if ( perm_r[i] == EMPTY ) { + perm_r[i] = n + k; + ++k; + } + } + + ilu_countnz(min_mn, &nnzL, &nnzU, &Glu); + fixupL(min_mn, perm_r, &Glu); + + dLUWorkFree(iwork, dwork, &Glu); /* Free work space and compress storage */ + + if ( fact == SamePattern_SameRowPerm ) { + /* L and U structures may have changed due to possibly different + pivoting, even though the storage is available. + There could also be memory expansions, so the array locations + may have changed, */ + ((SCformat *)L->Store)->nnz = nnzL; + ((SCformat *)L->Store)->nsuper = Glu.supno[n]; + ((SCformat *)L->Store)->nzval = Glu.lusup; + ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; + ((SCformat *)L->Store)->rowind = Glu.lsub; + ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; + ((NCformat *)U->Store)->nnz = nnzU; + ((NCformat *)U->Store)->nzval = Glu.ucol; + ((NCformat *)U->Store)->rowind = Glu.usub; + ((NCformat *)U->Store)->colptr = Glu.xusub; + } else { + dCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, + Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, + Glu.xsup, SLU_SC, SLU_D, SLU_TRLU); + dCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, + Glu.usub, Glu.xusub, SLU_NC, SLU_D, SLU_TRU); + } + + ops[FACT] += ops[TRSV] + ops[GEMV]; + stat->expansions = --(Glu.num_expansions); + + if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); + SUPERLU_FREE (iperm_c); + SUPERLU_FREE (relax_end); + SUPERLU_FREE (swap); + SUPERLU_FREE (iswap); + SUPERLU_FREE (relax_fsupc); + SUPERLU_FREE (amax); + if ( dwork2 ) SUPERLU_FREE (dwork2); + +} diff --git a/src/maths/SuperLU/dgsrfs.c b/src/maths/SuperLU/dgsrfs.c new file mode 100644 index 000000000..d24619f85 --- /dev/null +++ b/src/maths/SuperLU/dgsrfs.c @@ -0,0 +1,452 @@ + +/*! @file dgsrfs.c + * \brief Improves computed solution to a system of inear equations + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Modified from lapack routine DGERFS + *+ */ +/* + * File name: dgsrfs.c + * History: Modified from lapack routine DGERFS + */ +#include
+ * Purpose + * ======= + * + * DGSRFS improves the computed solution to a system of linear + * equations and provides error bounds and backward error estimates for + * the solution. + * + * If equilibration was performed, the system becomes: + * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * trans (input) trans_t + * Specifies the form of the system of equations: + * = NOTRANS: A * X = B (No transpose) + * = TRANS: A'* X = B (Transpose) + * = CONJ: A**H * X = B (Conjugate transpose) + * + * A (input) SuperMatrix* + * The original matrix A in the system, or the scaled A if + * equilibration was done. The type of A can be: + * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_GE. + * + * L (input) SuperMatrix* + * The factor L from the factorization Pr*A*Pc=L*U. Use + * compressed row subscripts storage for supernodes, + * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. + * + * U (input) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U as computed by + * dgstrf(). Use column-wise storage scheme, + * i.e., U has types: Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. + * + * perm_c (input) int*, dimension (A->ncol) + * Column permutation vector, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * + * perm_r (input) int*, dimension (A->nrow) + * Row permutation vector, which defines the permutation matrix Pr; + * perm_r[i] = j means row i of A is in position j in Pr*A. + * + * equed (input) Specifies the form of equilibration that was done. + * = 'N': No equilibration. + * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). + * = 'C': Column equilibration, i.e., A was postmultiplied by + * diag(C). + * = 'B': Both row and column equilibration, i.e., A was replaced + * by diag(R)*A*diag(C). + * + * R (input) double*, dimension (A->nrow) + * The row scale factors for A. + * If equed = 'R' or 'B', A is premultiplied by diag(R). + * If equed = 'N' or 'C', R is not accessed. + * + * C (input) double*, dimension (A->ncol) + * The column scale factors for A. + * If equed = 'C' or 'B', A is postmultiplied by diag(C). + * If equed = 'N' or 'R', C is not accessed. + * + * B (input) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. + * The right hand side matrix B. + * if equed = 'R' or 'B', B is premultiplied by diag(R). + * + * X (input/output) SuperMatrix* + * X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. + * On entry, the solution matrix X, as computed by dgstrs(). + * On exit, the improved solution matrix X. + * if *equed = 'C' or 'B', X should be premultiplied by diag(C) + * in order to obtain the solution to the original system. + * + * FERR (output) double*, dimension (B->ncol) + * The estimated forward error bound for each solution vector + * X(j) (the j-th column of the solution matrix X). + * If XTRUE is the true solution corresponding to X(j), FERR(j) + * is an estimated upper bound for the magnitude of the largest + * element in (X(j) - XTRUE) divided by the magnitude of the + * largest element in X(j). The estimate is as reliable as + * the estimate for RCOND, and is almost always a slight + * overestimate of the true error. + * + * BERR (output) double*, dimension (B->ncol) + * The componentwise relative backward error of each solution + * vector X(j) (i.e., the smallest relative change in + * any element of A or B that makes X(j) an exact solution). + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if INFO = -i, the i-th argument had an illegal value + * + * Internal Parameters + * =================== + * + * ITMAX is the maximum number of steps of iterative refinement. + * + *+ */ +void +dgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, + int *perm_c, int *perm_r, char *equed, double *R, double *C, + SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr, + SuperLUStat_t *stat, int *info) +{ + + +#define ITMAX 5 + + /* Table of constant values */ + int ione = 1; + double ndone = -1.; + double done = 1.; + + /* Local variables */ + NCformat *Astore; + double *Aval; + SuperMatrix Bjcol; + DNformat *Bstore, *Xstore, *Bjcol_store; + double *Bmat, *Xmat, *Bptr, *Xptr; + int kase; + double safe1, safe2; + int i, j, k, irow, nz, count, notran, rowequ, colequ; + int ldb, ldx, nrhs; + double s, xk, lstres, eps, safmin; + char transc[1]; + trans_t transt; + double *work; + double *rwork; + int *iwork; + + extern int dlacon_(int *, double *, double *, int *, double *, int *); +#ifdef _CRAY + extern int SCOPY(int *, double *, int *, double *, int *); + extern int SSAXPY(int *, double *, double *, int *, double *, int *); +#else + extern int dcopy_(int *, double *, int *, double *, int *); + extern int daxpy_(int *, double *, double *, int *, double *, int *); +#endif + + Astore = A->Store; + Aval = Astore->nzval; + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + + /* Test the input parameters */ + *info = 0; + notran = (trans == NOTRANS); + if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -2; + else if ( L->nrow != L->ncol || L->nrow < 0 || + L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) + *info = -3; + else if ( U->nrow != U->ncol || U->nrow < 0 || + U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU ) + *info = -4; + else if ( ldb < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) + *info = -10; + else if ( ldx < SUPERLU_MAX(0, A->nrow) || + X->Stype != SLU_DN || X->Dtype != SLU_D || X->Mtype != SLU_GE ) + *info = -11; + if (*info != 0) { + i = -(*info); + xerbla_("dgsrfs", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || nrhs == 0) { + for (j = 0; j < nrhs; ++j) { + ferr[j] = 0.; + berr[j] = 0.; + } + return; + } + + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + + /* Allocate working space */ + work = doubleMalloc(2*A->nrow); + rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) ); + iwork = intMalloc(2*A->nrow); + if ( !work || !rwork || !iwork ) + ABORT_SuperLU("Malloc fails for work/rwork/iwork."); + + if ( notran ) { + *(unsigned char *)transc = 'N'; + transt = TRANS; + } else { + *(unsigned char *)transc = 'T'; + transt = NOTRANS; + } + + /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ + nz = A->ncol + 1; + eps = dlamch_("Epsilon"); + safmin = dlamch_("Safe minimum"); + /* Set SAFE1 essentially to be the underflow threshold times the + number of additions in each row. */ + safe1 = nz * safmin; + safe2 = safe1 / eps; + + /* Compute the number of nonzeros in each row (or column) of A */ + for (i = 0; i < A->nrow; ++i) iwork[i] = 0; + if ( notran ) { + for (k = 0; k < A->ncol; ++k) + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + ++iwork[Astore->rowind[i]]; + } else { + for (k = 0; k < A->ncol; ++k) + iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; + } + + /* Copy one column of RHS B into Bjcol. */ + Bjcol.Stype = B->Stype; + Bjcol.Dtype = B->Dtype; + Bjcol.Mtype = B->Mtype; + Bjcol.nrow = B->nrow; + Bjcol.ncol = 1; + Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); + if ( !Bjcol.Store ) ABORT_SuperLU("SUPERLU_MALLOC fails for Bjcol.Store"); + Bjcol_store = Bjcol.Store; + Bjcol_store->lda = ldb; + Bjcol_store->nzval = work; /* address aliasing */ + + /* Do for each right hand side ... */ + for (j = 0; j < nrhs; ++j) { + count = 0; + lstres = 3.; + Bptr = &Bmat[j*ldb]; + Xptr = &Xmat[j*ldx]; + + while (1) { /* Loop until stopping criterion is satisfied. */ + + /* Compute residual R = B - op(A) * X, + where op(A) = A, A**T, or A**H, depending on TRANS. */ + +#ifdef _CRAY + SCOPY(&A->nrow, Bptr, &ione, work, &ione); +#else + dcopy_(&A->nrow, Bptr, &ione, work, &ione); +#endif + sp_dgemv(transc, ndone, A, Xptr, ione, done, work, ione); + + /* Compute componentwise relative backward error from formula + max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) + where abs(Z) is the componentwise absolute value of the matrix + or vector Z. If the i-th component of the denominator is less + than SAFE2, then SAFE1 is added to the i-th component of the + numerator before dividing. */ + + for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); + + /* Compute abs(op(A))*abs(X) + abs(B). */ + if (notran) { + for (k = 0; k < A->ncol; ++k) { + xk = fabs( Xptr[k] ); + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; + } + } else { + for (k = 0; k < A->ncol; ++k) { + s = 0.; + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { + irow = Astore->rowind[i]; + s += fabs(Aval[i]) * fabs(Xptr[irow]); + } + rwork[k] += s; + } + } + s = 0.; + for (i = 0; i < A->nrow; ++i) { + if (rwork[i] > safe2) { + s = SUPERLU_MAX( s, fabs(work[i]) / rwork[i] ); + } else if ( rwork[i] != 0.0 ) { + /* Adding SAFE1 to the numerator guards against + spuriously zero residuals (underflow). */ + s = SUPERLU_MAX( s, (safe1 + fabs(work[i])) / rwork[i] ); + } + /* If rwork[i] is exactly 0.0, then we know the true + residual also must be exactly 0.0. */ + } + berr[j] = s; + + /* Test stopping criterion. Continue iterating if + 1) The residual BERR(J) is larger than machine epsilon, and + 2) BERR(J) decreased by at least a factor of 2 during the + last iteration, and + 3) At most ITMAX iterations tried. */ + + if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { + /* Update solution and try again. */ + dgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); + +#ifdef _CRAY + SAXPY(&A->nrow, &done, work, &ione, + &Xmat[j*ldx], &ione); +#else + daxpy_(&A->nrow, &done, work, &ione, + &Xmat[j*ldx], &ione); +#endif + lstres = berr[j]; + ++count; + } else { + break; + } + + } /* end while */ + + stat->RefineSteps = count; + + /* Bound error from formula: + norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* + ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) + where + norm(Z) is the magnitude of the largest component of Z + inv(op(A)) is the inverse of op(A) + abs(Z) is the componentwise absolute value of the matrix or + vector Z + NZ is the maximum number of nonzeros in any row of A, plus 1 + EPS is machine epsilon + + The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) + is incremented by SAFE1 if the i-th component of + abs(op(A))*abs(X) + abs(B) is less than SAFE2. + + Use DLACON to estimate the infinity-norm of the matrix + inv(op(A)) * diag(W), + where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ + + for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); + + /* Compute abs(op(A))*abs(X) + abs(B). */ + if ( notran ) { + for (k = 0; k < A->ncol; ++k) { + xk = fabs( Xptr[k] ); + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; + } + } else { + for (k = 0; k < A->ncol; ++k) { + s = 0.; + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { + irow = Astore->rowind[i]; + xk = fabs( Xptr[irow] ); + s += fabs(Aval[i]) * xk; + } + rwork[k] += s; + } + } + + for (i = 0; i < A->nrow; ++i) + if (rwork[i] > safe2) + rwork[i] = fabs(work[i]) + (iwork[i]+1)*eps*rwork[i]; + else + rwork[i] = fabs(work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; + + kase = 0; + + do { + dlacon_(&A->nrow, &work[A->nrow], work, + &iwork[A->nrow], &ferr[j], &kase); + if (kase == 0) break; + + if (kase == 1) { + /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ + if ( notran && colequ ) + for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; + else if ( !notran && rowequ ) + for (i = 0; i < A->nrow; ++i) work[i] *= R[i]; + + dgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); + + for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; + } else { + /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ + for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; + + dgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); + + if ( notran && colequ ) + for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; + else if ( !notran && rowequ ) + for (i = 0; i < A->ncol; ++i) work[i] *= R[i]; + } + + } while ( kase != 0 ); + + + /* Normalize error. */ + lstres = 0.; + if ( notran && colequ ) { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, C[i] * fabs( Xptr[i]) ); + } else if ( !notran && rowequ ) { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, R[i] * fabs( Xptr[i]) ); + } else { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, fabs( Xptr[i]) ); + } + if ( lstres != 0. ) + ferr[j] /= lstres; + + } /* for each RHS j ... */ + + SUPERLU_FREE(work); + SUPERLU_FREE(rwork); + SUPERLU_FREE(iwork); + SUPERLU_FREE(Bjcol.Store); + + return; + +} /* dgsrfs */ diff --git a/src/maths/SuperLU/dgssv.c b/src/maths/SuperLU/dgssv.c new file mode 100644 index 000000000..d62202cd0 --- /dev/null +++ b/src/maths/SuperLU/dgssv.c @@ -0,0 +1,227 @@ + +/*! @file dgssv.c + * \brief Solves the system of linear equations A*X=B + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + *+ */ +#include
+ * Purpose + * ======= + * + * DGSSV solves the system of linear equations A*X=B, using the + * LU factorization from DGSTRF. It performs the following steps: + * + * 1. If A is stored column-wise (A->Stype = SLU_NC): + * + * 1.1. Permute the columns of A, forming A*Pc, where Pc + * is a permutation matrix. For more details of this step, + * see sp_preorder.c. + * + * 1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined + * by Gaussian elimination with partial pivoting. + * L is unit lower triangular with offdiagonal entries + * bounded by 1 in magnitude, and U is upper triangular. + * + * 1.3. Solve the system of equations A*X=B using the factored + * form of A. + * + * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the + * above algorithm to the transpose of A: + * + * 2.1. Permute columns of transpose(A) (rows of A), + * forming transpose(A)*Pc, where Pc is a permutation matrix. + * For more details of this step, see sp_preorder.c. + * + * 2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr + * determined by Gaussian elimination with partial pivoting. + * L is unit lower triangular with offdiagonal entries + * bounded by 1 in magnitude, and U is upper triangular. + * + * 2.3. Solve the system of equations A*X=B using the factored + * form of A. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the LU decomposition will be performed and how the + * system will be solved. + * + * A (input) SuperMatrix* + * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number + * of linear equations is A->nrow. Currently, the type of A can be: + * Stype = SLU_NC or SLU_NR; Dtype = SLU_D; Mtype = SLU_GE. + * In the future, more general A may be handled. + * + * perm_c (input/output) int* + * If A->Stype = SLU_NC, column permutation vector of size A->ncol + * which defines the permutation matrix Pc; perm_c[i] = j means + * column i of A is in position j in A*Pc. + * If A->Stype = SLU_NR, column permutation vector of size A->nrow + * which describes permutation of columns of transpose(A) + * (rows of A) as described above. + * + * If options->ColPerm = MY_PERMC or options->Fact = SamePattern or + * options->Fact = SamePattern_SameRowPerm, it is an input argument. + * On exit, perm_c may be overwritten by the product of the input + * perm_c and a permutation that postorders the elimination tree + * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree + * is already in postorder. + * Otherwise, it is an output argument. + * + * perm_r (input/output) int* + * If A->Stype = SLU_NC, row permutation vector of size A->nrow, + * which defines the permutation matrix Pr, and is determined + * by partial pivoting. perm_r[i] = j means row i of A is in + * position j in Pr*A. + * If A->Stype = SLU_NR, permutation vector of size A->ncol, which + * determines permutation of rows of transpose(A) + * (columns of A) as described above. + * + * If options->RowPerm = MY_PERMR or + * options->Fact = SamePattern_SameRowPerm, perm_r is an + * input argument. + * otherwise it is an output argument. + * + * L (output) SuperMatrix* + * The factor L from the factorization + * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses compressed row subscripts storage for supernodes, i.e., + * L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization + * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. + * + * B (input/output) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. + * On entry, the right hand side matrix. + * On exit, the solution matrix if info = 0; + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * > 0: if info = i, and i is + * <= A->ncol: U(i,i) is exactly zero. The factorization has + * been completed, but the factor U is exactly singular, + * so the solution could not be computed. + * > A->ncol: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. + *+ */ + +void +dgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, + SuperLUStat_t *stat, int *info ) +{ + + DNformat *Bstore; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int lwork = 0, *etree, i; + + /* Set default values for some parameters */ + int panel_size; /* panel size */ + int relax; /* no of columns in a relaxed snodes */ + int permc_spec; + trans_t trans = NOTRANS; + double *utime; + double t; /* Temporary time */ + + /* Test the input parameters ... */ + *info = 0; + Bstore = B->Store; + if ( options->Fact != DOFACT ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -2; + else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) + *info = -7; + if ( *info != 0 ) { + i = -(*info); + xerbla_("dgssv", &i); + return; + } + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + trans = TRANS; + } else { + if ( A->Stype == SLU_NC ) AA = A; + } + + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t; + + etree = intMalloc(A->ncol); + + t = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t; + + panel_size = sp_ienv(1); + relax = sp_ienv(2); + + /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", + relax, panel_size, sp_ienv(3), sp_ienv(4));*/ + t = SuperLU_timer_(); + /* Compute the LU factorization of A. */ + dgstrf(options, &AC, relax, panel_size, etree, + NULL, lwork, perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t; + + t = SuperLU_timer_(); + if ( *info == 0 ) { + /* Solve the system A*X=B, overwriting B with X. */ + dgstrs (trans, L, U, perm_c, perm_r, B, stat, info); + } + utime[SOLVE] = SuperLU_timer_() - t; + + SUPERLU_FREE (etree); + Destroy_CompCol_Permuted(&AC); + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/dgssvx.c b/src/maths/SuperLU/dgssvx.c new file mode 100644 index 000000000..5f06db4fe --- /dev/null +++ b/src/maths/SuperLU/dgssvx.c @@ -0,0 +1,622 @@ + +/*! @file dgssvx.c + * \brief Solves the system of linear equations A*X=B or A'*X=B + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + *+ */ +#include
+ * Purpose + * ======= + * + * DGSSVX solves the system of linear equations A*X=B or A'*X=B, using + * the LU factorization from dgstrf(). Error bounds on the solution and + * a condition estimate are also provided. It performs the following steps: + * + * 1. If A is stored column-wise (A->Stype = SLU_NC): + * + * 1.1. If options->Equil = YES, scaling factors are computed to + * equilibrate the system: + * options->Trans = NOTRANS: + * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B + * options->Trans = TRANS: + * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B + * options->Trans = CONJ: + * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B + * Whether or not the system will be equilibrated depends on the + * scaling of the matrix A, but if equilibration is used, A is + * overwritten by diag(R)*A*diag(C) and B by diag(R)*B + * (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans + * = TRANS or CONJ). + * + * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation + * matrix that usually preserves sparsity. + * For more details of this step, see sp_preorder.c. + * + * 1.3. If options->Fact != FACTORED, the LU decomposition is used to + * factor the matrix A (after equilibration if options->Equil = YES) + * as Pr*A*Pc = L*U, with Pr determined by partial pivoting. + * + * 1.4. Compute the reciprocal pivot growth factor. + * + * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the + * routine returns with info = i. Otherwise, the factored form of + * A is used to estimate the condition number of the matrix A. If + * the reciprocal of the condition number is less than machine + * precision, info = A->ncol+1 is returned as a warning, but the + * routine still goes on to solve for X and computes error bounds + * as described below. + * + * 1.6. The system of equations is solved for X using the factored form + * of A. + * + * 1.7. If options->IterRefine != NOREFINE, iterative refinement is + * applied to improve the computed solution matrix and calculate + * error bounds and backward error estimates for it. + * + * 1.8. If equilibration was used, the matrix X is premultiplied by + * diag(C) (if options->Trans = NOTRANS) or diag(R) + * (if options->Trans = TRANS or CONJ) so that it solves the + * original system before equilibration. + * + * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm + * to the transpose of A: + * + * 2.1. If options->Equil = YES, scaling factors are computed to + * equilibrate the system: + * options->Trans = NOTRANS: + * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B + * options->Trans = TRANS: + * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B + * options->Trans = CONJ: + * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B + * Whether or not the system will be equilibrated depends on the + * scaling of the matrix A, but if equilibration is used, A' is + * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B + * (if trans='N') or diag(C)*B (if trans = 'T' or 'C'). + * + * 2.2. Permute columns of transpose(A) (rows of A), + * forming transpose(A)*Pc, where Pc is a permutation matrix that + * usually preserves sparsity. + * For more details of this step, see sp_preorder.c. + * + * 2.3. If options->Fact != FACTORED, the LU decomposition is used to + * factor the transpose(A) (after equilibration if + * options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the + * permutation Pr determined by partial pivoting. + * + * 2.4. Compute the reciprocal pivot growth factor. + * + * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the + * routine returns with info = i. Otherwise, the factored form + * of transpose(A) is used to estimate the condition number of the + * matrix A. If the reciprocal of the condition number + * is less than machine precision, info = A->nrow+1 is returned as + * a warning, but the routine still goes on to solve for X and + * computes error bounds as described below. + * + * 2.6. The system of equations is solved for X using the factored form + * of transpose(A). + * + * 2.7. If options->IterRefine != NOREFINE, iterative refinement is + * applied to improve the computed solution matrix and calculate + * error bounds and backward error estimates for it. + * + * 2.8. If equilibration was used, the matrix X is premultiplied by + * diag(C) (if options->Trans = NOTRANS) or diag(R) + * (if options->Trans = TRANS or CONJ) so that it solves the + * original system before equilibration. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the LU decomposition will be performed and how the + * system will be solved. + * + * A (input/output) SuperMatrix* + * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number + * of the linear equations is A->nrow. Currently, the type of A can be: + * Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE. + * In the future, more general A may be handled. + * + * On entry, If options->Fact = FACTORED and equed is not 'N', + * then A must have been equilibrated by the scaling factors in + * R and/or C. + * On exit, A is not modified if options->Equil = NO, or if + * options->Equil = YES but equed = 'N' on exit. + * Otherwise, if options->Equil = YES and equed is not 'N', + * A is scaled as follows: + * If A->Stype = SLU_NC: + * equed = 'R': A := diag(R) * A + * equed = 'C': A := A * diag(C) + * equed = 'B': A := diag(R) * A * diag(C). + * If A->Stype = SLU_NR: + * equed = 'R': transpose(A) := diag(R) * transpose(A) + * equed = 'C': transpose(A) := transpose(A) * diag(C) + * equed = 'B': transpose(A) := diag(R) * transpose(A) * diag(C). + * + * perm_c (input/output) int* + * If A->Stype = SLU_NC, Column permutation vector of size A->ncol, + * which defines the permutation matrix Pc; perm_c[i] = j means + * column i of A is in position j in A*Pc. + * On exit, perm_c may be overwritten by the product of the input + * perm_c and a permutation that postorders the elimination tree + * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree + * is already in postorder. + * + * If A->Stype = SLU_NR, column permutation vector of size A->nrow, + * which describes permutation of columns of transpose(A) + * (rows of A) as described above. + * + * perm_r (input/output) int* + * If A->Stype = SLU_NC, row permutation vector of size A->nrow, + * which defines the permutation matrix Pr, and is determined + * by partial pivoting. perm_r[i] = j means row i of A is in + * position j in Pr*A. + * + * If A->Stype = SLU_NR, permutation vector of size A->ncol, which + * determines permutation of rows of transpose(A) + * (columns of A) as described above. + * + * If options->Fact = SamePattern_SameRowPerm, the pivoting routine + * will try to use the input perm_r, unless a certain threshold + * criterion is violated. In that case, perm_r is overwritten by a + * new permutation determined by partial pivoting or diagonal + * threshold pivoting. + * Otherwise, perm_r is output argument. + * + * etree (input/output) int*, dimension (A->ncol) + * Elimination tree of Pc'*A'*A*Pc. + * If options->Fact != FACTORED and options->Fact != DOFACT, + * etree is an input argument, otherwise it is an output argument. + * Note: etree is a vector of parent pointers for a forest whose + * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. + * + * equed (input/output) char* + * Specifies the form of equilibration that was done. + * = 'N': No equilibration. + * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). + * = 'C': Column equilibration, i.e., A was postmultiplied by diag(C). + * = 'B': Both row and column equilibration, i.e., A was replaced + * by diag(R)*A*diag(C). + * If options->Fact = FACTORED, equed is an input argument, + * otherwise it is an output argument. + * + * R (input/output) double*, dimension (A->nrow) + * The row scale factors for A or transpose(A). + * If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A) + * (if A->Stype = SLU_NR) is multiplied on the left by diag(R). + * If equed = 'N' or 'C', R is not accessed. + * If options->Fact = FACTORED, R is an input argument, + * otherwise, R is output. + * If options->zFact = FACTORED and equed = 'R' or 'B', each element + * of R must be positive. + * + * C (input/output) double*, dimension (A->ncol) + * The column scale factors for A or transpose(A). + * If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A) + * (if A->Stype = SLU_NR) is multiplied on the right by diag(C). + * If equed = 'N' or 'R', C is not accessed. + * If options->Fact = FACTORED, C is an input argument, + * otherwise, C is output. + * If options->Fact = FACTORED and equed = 'C' or 'B', each element + * of C must be positive. + * + * L (output) SuperMatrix* + * The factor L from the factorization + * Pr*A*Pc=L*U (if A->Stype SLU_= NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses compressed row subscripts storage for supernodes, i.e., + * L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization + * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. + * + * work (workspace/output) void*, size (lwork) (in bytes) + * User supplied workspace, should be large enough + * to hold data structures for factors L and U. + * On exit, if fact is not 'F', L and U point to this array. + * + * lwork (input) int + * Specifies the size of work array in bytes. + * = 0: allocate space internally by system malloc; + * > 0: use user-supplied work array of length lwork in bytes, + * returns error if space runs out. + * = -1: the routine guesses the amount of space needed without + * performing the factorization, and returns it in + * mem_usage->total_needed; no other side effects. + * + * See argument 'mem_usage' for memory usage statistics. + * + * B (input/output) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. + * On entry, the right hand side matrix. + * If B->ncol = 0, only LU decomposition is performed, the triangular + * solve is skipped. + * On exit, + * if equed = 'N', B is not modified; otherwise + * if A->Stype = SLU_NC: + * if options->Trans = NOTRANS and equed = 'R' or 'B', + * B is overwritten by diag(R)*B; + * if options->Trans = TRANS or CONJ and equed = 'C' of 'B', + * B is overwritten by diag(C)*B; + * if A->Stype = SLU_NR: + * if options->Trans = NOTRANS and equed = 'C' or 'B', + * B is overwritten by diag(C)*B; + * if options->Trans = TRANS or CONJ and equed = 'R' of 'B', + * B is overwritten by diag(R)*B. + * + * X (output) SuperMatrix* + * X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. + * If info = 0 or info = A->ncol+1, X contains the solution matrix + * to the original system of equations. Note that A and B are modified + * on exit if equed is not 'N', and the solution to the equilibrated + * system is inv(diag(C))*X if options->Trans = NOTRANS and + * equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C' + * and equed = 'R' or 'B'. + * + * recip_pivot_growth (output) double* + * The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ). + * The infinity norm is used. If recip_pivot_growth is much less + * than 1, the stability of the LU factorization could be poor. + * + * rcond (output) double* + * The estimate of the reciprocal condition number of the matrix A + * after equilibration (if done). If rcond is less than the machine + * precision (in particular, if rcond = 0), the matrix is singular + * to working precision. This condition is indicated by a return + * code of info > 0. + * + * FERR (output) double*, dimension (B->ncol) + * The estimated forward error bound for each solution vector + * X(j) (the j-th column of the solution matrix X). + * If XTRUE is the true solution corresponding to X(j), FERR(j) + * is an estimated upper bound for the magnitude of the largest + * element in (X(j) - XTRUE) divided by the magnitude of the + * largest element in X(j). The estimate is as reliable as + * the estimate for RCOND, and is almost always a slight + * overestimate of the true error. + * If options->IterRefine = NOREFINE, ferr = 1.0. + * + * BERR (output) double*, dimension (B->ncol) + * The componentwise relative backward error of each solution + * vector X(j) (i.e., the smallest relative change in + * any element of A or B that makes X(j) an exact solution). + * If options->IterRefine = NOREFINE, berr = 1.0. + * + * mem_usage (output) mem_usage_t* + * Record the memory usage statistics, consisting of following fields: + * - for_lu (float) + * The amount of space used in bytes for L\U data structures. + * - total_needed (float) + * The amount of space needed in bytes to perform factorization. + * - expansions (int) + * The number of memory expansions during the LU factorization. + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See slu_util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + * > 0: if info = i, and i is + * <= A->ncol: U(i,i) is exactly zero. The factorization has + * been completed, but the factor U is exactly + * singular, so the solution and error bounds + * could not be computed. + * = A->ncol+1: U is nonsingular, but RCOND is less than machine + * precision, meaning that the matrix is singular to + * working precision. Nevertheless, the solution and + * error bounds are computed because there are a number + * of situations where the computed solution can be more + * accurate than the value of RCOND would suggest. + * > A->ncol+1: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. + *+ */ + +void +dgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + int *etree, char *equed, double *R, double *C, + SuperMatrix *L, SuperMatrix *U, void *work, int lwork, + SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, + double *rcond, double *ferr, double *berr, + mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info ) +{ + + + DNformat *Bstore, *Xstore; + double *Bmat, *Xmat; + int ldb, ldx, nrhs; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int colequ, equil, nofact, notran, rowequ, permc_spec; + trans_t trant; + char norm[1]; + int i, j, info1; + double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; + int relax, panel_size; + double diag_pivot_thresh; + double t0; /* temporary time */ + double *utime; + + /* External functions */ + extern double dlangs(char *, SuperMatrix *); + + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + + *info = 0; + nofact = (options->Fact != FACTORED); + equil = (options->Equil == YES_SuperLU); + notran = (options->Trans == NOTRANS); + if ( nofact ) { + *(unsigned char *)equed = 'N'; + rowequ = FALSE; + colequ = FALSE; + } else { + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + smlnum = dlamch_("Safe minimum"); + bignum = 1. / smlnum; + } + +#if 0 +printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n", + options->Fact, options->Trans, *equed); +#endif + + /* Test the input parameters */ + if (options->Fact != DOFACT && options->Fact != SamePattern && + options->Fact != SamePattern_SameRowPerm && + options->Fact != FACTORED && + options->Trans != NOTRANS && options->Trans != TRANS && + options->Trans != CONJ && + options->Equil != NO_SuperLU && options->Equil != YES_SuperLU) + *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -2; + else if (options->Fact == FACTORED && + !(rowequ || colequ || lsame_(equed, "N"))) + *info = -6; + else { + if (rowequ) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, R[j]); + rcmax = SUPERLU_MAX(rcmax, R[j]); + } + if (rcmin <= 0.) *info = -7; + else if ( A->nrow > 0) + rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else rowcnd = 1.; + } + if (colequ && *info == 0) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, C[j]); + rcmax = SUPERLU_MAX(rcmax, C[j]); + } + if (rcmin <= 0.) *info = -8; + else if (A->nrow > 0) + colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else colcnd = 1.; + } + if (*info == 0) { + if ( lwork < -1 ) *info = -12; + else if ( B->ncol < 0 ) *info = -13; + else if ( B->ncol > 0 ) { /* no checking if B->ncol=0 */ + if ( Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_D || + B->Mtype != SLU_GE ) + *info = -13; + } + if ( X->ncol < 0 ) *info = -14; + else if ( X->ncol > 0 ) { /* no checking if X->ncol=0 */ + if ( Xstore->lda < SUPERLU_MAX(0, A->nrow) || + (B->ncol != 0 && B->ncol != X->ncol) || + X->Stype != SLU_DN || + X->Dtype != SLU_D || X->Mtype != SLU_GE ) + *info = -14; + } + } + } + if (*info != 0) { + i = -(*info); + xerbla_("dgssvx", &i); + return; + } + + /* Initialization for factor parameters */ + panel_size = sp_ienv(1); + relax = sp_ienv(2); + diag_pivot_thresh = options->DiagPivotThresh; + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + if ( notran ) { /* Reverse the transpose argument. */ + trant = TRANS; + notran = 0; + } else { + trant = NOTRANS; + notran = 1; + } + } else { /* A->Stype == SLU_NC */ + trant = options->Trans; + AA = A; + } + + if ( nofact && equil ) { + t0 = SuperLU_timer_(); + /* Compute row and column scalings to equilibrate the matrix A. */ + dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); + + if ( info1 == 0 ) { + /* Equilibrate matrix A. */ + dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + } + utime[EQUIL] = SuperLU_timer_() - t0; + } + + + if ( nofact ) { + + t0 = SuperLU_timer_(); + /* + * Gnet column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t0; + + t0 = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t0; + +/* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", + relax, panel_size, sp_ienv(3), sp_ienv(4)); + fflush(stdout); */ + + /* Compute the LU factorization of A*Pc. */ + t0 = SuperLU_timer_(); + dgstrf(options, &AC, relax, panel_size, etree, + work, lwork, perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t0; + + if ( lwork == -1 ) { + mem_usage->total_needed = *info - A->ncol; + return; + } + } + + if ( options->PivotGrowth ) { + if ( *info > 0 ) { + if ( *info <= A->ncol ) { + /* Compute the reciprocal pivot growth factor of the leading + rank-deficient *info columns of A. */ + *recip_pivot_growth = dPivotGrowth(*info, AA, perm_c, L, U); + } + return; + } + + /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ + *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U); + } + + if ( options->ConditionNumber ) { + /* Estimate the reciprocal of the condition number of A. */ + t0 = SuperLU_timer_(); + if ( notran ) { + *(unsigned char *)norm = '1'; + } else { + *(unsigned char *)norm = 'I'; + } + anorm = dlangs(norm, AA); + dgscon(norm, L, U, anorm, rcond, stat, info); + utime[RCOND] = SuperLU_timer_() - t0; + } + + if ( nrhs > 0 ) { + /* Scale the right hand side if equilibration was performed. */ + if ( notran ) { + if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + Bmat[i + j*ldb] *= R[i]; + } + } else if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + Bmat[i + j*ldb] *= C[i]; + } + + /* Compute the solution matrix X. */ + for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ + for (i = 0; i < B->nrow; i++) + Xmat[i + j*ldx] = Bmat[i + j*ldb]; + + t0 = SuperLU_timer_(); + dgstrs (trant, L, U, perm_c, perm_r, X, stat, info); + utime[SOLVE] = SuperLU_timer_() - t0; + + /* Use iterative refinement to improve the computed solution and compute + error bounds and backward error estimates for it. */ + t0 = SuperLU_timer_(); + if ( options->IterRefine != NOREFINE ) { + dgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B, + X, ferr, berr, stat, info); + } else { + for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0; + } + utime[REFINE] = SuperLU_timer_() - t0; + + /* Transform the solution matrix X to a solution of the original system. */ + if ( notran ) { + if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + Xmat[i + j*ldx] *= C[i]; + } + } else if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + Xmat[i + j*ldx] *= R[i]; + } + } /* end if nrhs > 0 */ + + if ( options->ConditionNumber ) { + /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ + if ( *rcond < dlamch_("E") ) *info = A->ncol + 1; + } + + if ( nofact ) { + dQuerySpace(L, U, mem_usage); + Destroy_CompCol_Permuted(&AC); + } + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/dgstrf.c b/src/maths/SuperLU/dgstrf.c new file mode 100644 index 000000000..d6e65d648 --- /dev/null +++ b/src/maths/SuperLU/dgstrf.c @@ -0,0 +1,436 @@ + +/*! @file dgstrf.c + * \brief Computes an LU factorization of a general sparse matrix + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * Purpose + * ======= + * + * DGSTRF computes an LU factorization of a general sparse m-by-n + * matrix A using partial pivoting with row interchanges. + * The factorization has the form + * Pr * A = L * U + * where Pr is a row permutation matrix, L is lower triangular with unit + * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper + * triangular (upper trapezoidal if A->nrow < A->ncol). + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the LU decomposition will be performed. + * + * A (input) SuperMatrix* + * Original matrix A, permuted by columns, of dimension + * (A->nrow, A->ncol). The type of A can be: + * Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. + * + * relax (input) int + * To control degree of relaxing supernodes. If the number + * of nodes (columns) in a subtree of the elimination tree is less + * than relax, this subtree is considered as one supernode, + * regardless of the row structures of those columns. + * + * panel_size (input) int + * A panel consists of at most panel_size consecutive columns. + * + * etree (input) int*, dimension (A->ncol) + * Elimination tree of A'*A. + * Note: etree is a vector of parent pointers for a forest whose + * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. + * On input, the columns of A should be permuted so that the + * etree is in a certain postorder. + * + * work (input/output) void*, size (lwork) (in bytes) + * User-supplied work space and space for the output data structures. + * Not referenced if lwork = 0; + * + * lwork (input) int + * Specifies the size of work array in bytes. + * = 0: allocate space internally by system malloc; + * > 0: use user-supplied work array of length lwork in bytes, + * returns error if space runs out. + * = -1: the routine guesses the amount of space needed without + * performing the factorization, and returns it in + * *info; no other side effects. + * + * perm_c (input) int*, dimension (A->ncol) + * Column permutation vector, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * When searching for diagonal, perm_c[*] is applied to the + * row subscripts of A, so that diagonal threshold pivoting + * can find the diagonal of A, rather than that of A*Pc. + * + * perm_r (input/output) int*, dimension (A->nrow) + * Row permutation vector which defines the permutation matrix Pr, + * perm_r[i] = j means row i of A is in position j in Pr*A. + * If options->Fact = SamePattern_SameRowPerm, the pivoting routine + * will try to use the input perm_r, unless a certain threshold + * criterion is violated. In that case, perm_r is overwritten by + * a new permutation determined by partial pivoting or diagonal + * threshold pivoting. + * Otherwise, perm_r is output argument; + * + * L (output) SuperMatrix* + * The factor L from the factorization Pr*A=L*U; use compressed row + * subscripts storage for supernodes, i.e., L has type: + * Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise + * storage scheme, i.e., U has types: Stype = SLU_NC, + * Dtype = SLU_D, Mtype = SLU_TRU. + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See slu_util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + * > 0: if info = i, and i is + * <= A->ncol: U(i,i) is exactly zero. The factorization has + * been completed, but the factor U is exactly singular, + * and division by zero will occur if it is used to solve a + * system of equations. + * > A->ncol: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. If lwork = -1, it is + * the estimated amount of space needed, plus A->ncol. + * + * ====================================================================== + * + * Local Working Arrays: + * ====================== + * m = number of rows in the matrix + * n = number of columns in the matrix + * + * xprune[0:n-1]: xprune[*] points to locations in subscript + * vector lsub[*]. For column i, xprune[i] denotes the point where + * structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need + * to be traversed for symbolic factorization. + * + * marker[0:3*m-1]: marker[i] = j means that node i has been + * reached when working on column j. + * Storage: relative to original row subscripts + * NOTE: There are 3 of them: marker/marker1 are used for panel dfs, + * see dpanel_dfs.c; marker2 is used for inner-factorization, + * see dcolumn_dfs.c. + * + * parent[0:m-1]: parent vector used during dfs + * Storage: relative to new row subscripts + * + * xplore[0:m-1]: xplore[i] gives the location of the next (dfs) + * unexplored neighbor of i in lsub[*] + * + * segrep[0:nseg-1]: contains the list of supernodal representatives + * in topological order of the dfs. A supernode representative is the + * last column of a supernode. + * The maximum size of segrep[] is n. + * + * repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a + * supernodal representative r, repfnz[r] is the location of the first + * nonzero in this segment. It is also used during the dfs: repfnz[r]>0 + * indicates the supernode r has been explored. + * NOTE: There are W of them, each used for one column of a panel. + * + * panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below + * the panel diagonal. These are filled in during dpanel_dfs(), and are + * used later in the inner LU factorization within the panel. + * panel_lsub[]/dense[] pair forms the SPA data structure. + * NOTE: There are W of them. + * + * dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values; + * NOTE: there are W of them. + * + * tempv[0:*]: real temporary used for dense numeric kernels; + * The size of this array is defined by NUM_TEMPV() in slu_ddefs.h. + *+ */ + +void +dgstrf (superlu_options_t *options, SuperMatrix *A, + int relax, int panel_size, int *etree, void *work, int lwork, + int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, + SuperLUStat_t *stat, int *info) +{ + /* Local working arrays */ + NCPformat *Astore; + int *iperm_r = NULL; /* inverse of perm_r; used when + options->Fact == SamePattern_SameRowPerm */ + int *iperm_c; /* inverse of perm_c */ + int *iwork; + double *dwork; + int *segrep, *repfnz, *parent, *xplore; + int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ + int *xprune; + int *marker; + double *dense, *tempv; + int *relax_end; + double *a; + int *asub; + int *xa_begin, *xa_end; + int *xsup, *supno; + int *xlsub, *xlusup, *xusub; + int nzlumax; + double fill_ratio = sp_ienv(6); /* estimated fill ratio */ + static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ + + /* Local scalars */ + fact_t fact = options->Fact; + double diag_pivot_thresh = options->DiagPivotThresh; + int pivrow; /* pivotal row number in the original matrix A */ + int nseg1; /* no of segments in U-column above panel row jcol */ + int nseg; /* no of segments in each U-column */ + register int jcol; + register int kcol; /* end column of a relaxed snode */ + register int icol; + register int i, k, jj, new_next, iinfo; + int m, n, min_mn, jsupno, fsupc, nextlu, nextu; + int w_def; /* upper bound on panel width */ + int usepr, iperm_r_allocated = 0; + int nnzL, nnzU; + int *panel_histo = stat->panel_histo; + flops_t *ops = stat->ops; + + iinfo = 0; + m = A->nrow; + n = A->ncol; + min_mn = SUPERLU_MIN(m, n); + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + + /* Allocate storage common to the factor routines */ + *info = dLUMemInit(fact, work, lwork, m, n, Astore->nnz, + panel_size, fill_ratio, L, U, &Glu, &iwork, &dwork); + if ( *info ) return; + + xsup = Glu.xsup; + supno = Glu.supno; + xlsub = Glu.xlsub; + xlusup = Glu.xlusup; + xusub = Glu.xusub; + + SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, + &repfnz, &panel_lsub, &xprune, &marker); + dSetRWork(m, panel_size, dwork, &dense, &tempv); + + usepr = (fact == SamePattern_SameRowPerm); + if ( usepr ) { + /* Compute the inverse of perm_r */ + iperm_r = (int *) intMalloc(m); + for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; + iperm_r_allocated = 1; + } + iperm_c = (int *) intMalloc(n); + for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; + + /* Identify relaxed snodes */ + relax_end = (int *) intMalloc(n); + if ( options->SymmetricMode == YES_SuperLU ) { + heap_relax_snode(n, etree, relax, marker, relax_end); + } else { + relax_snode(n, etree, relax, marker, relax_end); + } + + ifill (perm_r, m, EMPTY); + ifill (marker, m * NO_MARKER, EMPTY); + supno[0] = -1; + xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; + w_def = panel_size; + + /* + * Work on one "panel" at a time. A panel is one of the following: + * (a) a relaxed supernode at the bottom of the etree, or + * (b) panel_size contiguous columns, defined by the user + */ + for (jcol = 0; jcol < min_mn; ) { + + if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ + kcol = relax_end[jcol]; /* end of the relaxed snode */ + panel_histo[kcol-jcol+1]++; + + /* -------------------------------------- + * Factorize the relaxed supernode(jcol:kcol) + * -------------------------------------- */ + /* Determine the union of the row structure of the snode */ + if ( (*info = dsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, + xprune, marker, &Glu)) != 0 ) + return; + + nextu = xusub[jcol]; + nextlu = xlusup[jcol]; + jsupno = supno[jcol]; + fsupc = xsup[jsupno]; + new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); + nzlumax = Glu.nzlumax; + while ( new_next > nzlumax ) { + if ( (*info = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) ) + return; + } + + for (icol = jcol; icol<= kcol; icol++) { + xusub[icol+1] = nextu; + + /* Scatter into SPA dense[*] */ + for (k = xa_begin[icol]; k < xa_end[icol]; k++) + dense[asub[k]] = a[k]; + + /* Numeric update within the snode */ + dsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); + + if ( (*info = dpivotL(icol, diag_pivot_thresh, &usepr, perm_r, + iperm_r, iperm_c, &pivrow, &Glu, stat)) ) + if ( iinfo == 0 ) iinfo = *info; + +#ifdef DEBUG + dprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); +#endif + + } + + jcol = icol; + + } else { /* Work on one panel of panel_size columns */ + + /* Adjust panel_size so that a panel won't overlap with the next + * relaxed snode. + */ + panel_size = w_def; + for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) + if ( relax_end[k] != EMPTY ) { + panel_size = k - jcol; + break; + } + if ( k == min_mn ) panel_size = min_mn - jcol; + panel_histo[panel_size]++; + + /* symbolic factor on a panel of columns */ + dpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, + dense, panel_lsub, segrep, repfnz, xprune, + marker, parent, xplore, &Glu); + + /* numeric sup-panel updates in topological order */ + dpanel_bmod(m, panel_size, jcol, nseg1, dense, + tempv, segrep, repfnz, &Glu, stat); + + /* Sparse LU within the panel, and below panel diagonal */ + for ( jj = jcol; jj < jcol + panel_size; jj++) { + k = (jj - jcol) * m; /* column index for w-wide arrays */ + + nseg = nseg1; /* Begin after all the panel segments */ + + if ((*info = dcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], + segrep, &repfnz[k], xprune, marker, + parent, xplore, &Glu)) != 0) return; + + /* Numeric updates */ + if ((*info = dcolumn_bmod(jj, (nseg - nseg1), &dense[k], + tempv, &segrep[nseg1], &repfnz[k], + jcol, &Glu, stat)) != 0) return; + + /* Copy the U-segments to ucol[*] */ + if ((*info = dcopy_to_ucol(jj, nseg, segrep, &repfnz[k], + perm_r, &dense[k], &Glu)) != 0) + return; + + if ( (*info = dpivotL(jj, diag_pivot_thresh, &usepr, perm_r, + iperm_r, iperm_c, &pivrow, &Glu, stat)) ) + if ( iinfo == 0 ) iinfo = *info; + + /* Prune columns (0:jj-1) using column jj */ + dpruneL(jj, perm_r, pivrow, nseg, segrep, + &repfnz[k], xprune, &Glu); + + /* Reset repfnz[] for this column */ + resetrep_col (nseg, segrep, &repfnz[k]); + +#ifdef DEBUG + dprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); +#endif + + } + + jcol += panel_size; /* Move to the next panel */ + + } /* else */ + + } /* for */ + + *info = iinfo; + + if ( m > n ) { + k = 0; + for (i = 0; i < m; ++i) + if ( perm_r[i] == EMPTY ) { + perm_r[i] = n + k; + ++k; + } + } + + countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); + fixupL(min_mn, perm_r, &Glu); + + dLUWorkFree(iwork, dwork, &Glu); /* Free work space and compress storage */ + + if ( fact == SamePattern_SameRowPerm ) { + /* L and U structures may have changed due to possibly different + pivoting, even though the storage is available. + There could also be memory expansions, so the array locations + may have changed, */ + ((SCformat *)L->Store)->nnz = nnzL; + ((SCformat *)L->Store)->nsuper = Glu.supno[n]; + ((SCformat *)L->Store)->nzval = Glu.lusup; + ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; + ((SCformat *)L->Store)->rowind = Glu.lsub; + ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; + ((NCformat *)U->Store)->nnz = nnzU; + ((NCformat *)U->Store)->nzval = Glu.ucol; + ((NCformat *)U->Store)->rowind = Glu.usub; + ((NCformat *)U->Store)->colptr = Glu.xusub; + } else { + dCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, + Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, + Glu.xsup, SLU_SC, SLU_D, SLU_TRLU); + dCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, + Glu.usub, Glu.xusub, SLU_NC, SLU_D, SLU_TRU); + } + + ops[FACT] += ops[TRSV] + ops[GEMV]; + stat->expansions = --(Glu.num_expansions); + + if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); + SUPERLU_FREE (iperm_c); + SUPERLU_FREE (relax_end); + +} diff --git a/src/maths/SuperLU/dgstrs.c b/src/maths/SuperLU/dgstrs.c new file mode 100644 index 000000000..30a3975db --- /dev/null +++ b/src/maths/SuperLU/dgstrs.c @@ -0,0 +1,337 @@ + +/*! @file dgstrs.c + * \brief Solves a system using LU factorization + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + +#include
+ * Purpose + * ======= + * + * DGSTRS solves a system of linear equations A*X=B or A'*X=B + * with A sparse and B dense, using the LU factorization computed by + * DGSTRF. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * trans (input) trans_t + * Specifies the form of the system of equations: + * = NOTRANS: A * X = B (No transpose) + * = TRANS: A'* X = B (Transpose) + * = CONJ: A**H * X = B (Conjugate transpose) + * + * L (input) SuperMatrix* + * The factor L from the factorization Pr*A*Pc=L*U as computed by + * dgstrf(). Use compressed row subscripts storage for supernodes, + * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. + * + * U (input) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U as computed by + * dgstrf(). Use column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. + * + * perm_c (input) int*, dimension (L->ncol) + * Column permutation vector, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * + * perm_r (input) int*, dimension (L->nrow) + * Row permutation vector, which defines the permutation matrix Pr; + * perm_r[i] = j means row i of A is in position j in Pr*A. + * + * B (input/output) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. + * On entry, the right hand side matrix. + * On exit, the solution matrix if info = 0; + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + *+ */ + +void +dgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, + int *perm_c, int *perm_r, SuperMatrix *B, + SuperLUStat_t *stat, int *info) +{ + +#ifdef _CRAY + _fcd ftcs1, ftcs2, ftcs3, ftcs4; +#endif + int incx = 1, incy = 1; +#ifdef USE_VENDOR_BLAS + double alpha = 1.0, beta = 1.0; + double *work_col; +#endif + DNformat *Bstore; + double *Bmat; + SCformat *Lstore; + NCformat *Ustore; + double *Lval, *Uval; + int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; + int i, j, k, iptr, jcol, n, ldb, nrhs; + double *work, *rhs_work, *soln; + flops_t solve_ops; + void dprint_soln(); + + /* Test input parameters ... */ + *info = 0; + Bstore = B->Store; + ldb = Bstore->lda; + nrhs = B->ncol; + if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; + else if ( L->nrow != L->ncol || L->nrow < 0 || + L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) + *info = -2; + else if ( U->nrow != U->ncol || U->nrow < 0 || + U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU ) + *info = -3; + else if ( ldb < SUPERLU_MAX(0, L->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) + *info = -6; + if ( *info ) { + i = -(*info); + xerbla_("dgstrs", &i); + return; + } + + n = L->nrow; + work = doubleCalloc(n * nrhs); + if ( !work ) ABORT_SuperLU("Malloc fails for local work[]."); + soln = doubleMalloc(n); + if ( !soln ) ABORT_SuperLU("Malloc fails for local soln[]."); + + Bmat = Bstore->nzval; + Lstore = L->Store; + Lval = Lstore->nzval; + Ustore = U->Store; + Uval = Ustore->nzval; + solve_ops = 0; + + if ( trans == NOTRANS ) { + /* Permute right hand sides to form Pr*B */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + /* Forward solve PLy=Pb. */ + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + nrow = nsupr - nsupc; + + solve_ops += nsupc * (nsupc - 1) * nrhs; + solve_ops += 2 * nrow * nsupc * nrhs; + + if ( nsupc == 1 ) { + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + luptr = L_NZ_START(fsupc); + for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ + irow = L_SUB(iptr); + ++luptr; + rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; + } + } + } else { + luptr = L_NZ_START(fsupc); +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); + STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); + + SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, + &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, + &beta, &work[0], &n ); +#else + dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); + + dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, + &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, + &beta, &work[0], &n ); +#endif + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + work_col = &work[j*n]; + iptr = istart + nsupc; + for (i = 0; i < nrow; i++) { + irow = L_SUB(iptr); + rhs_work[irow] -= work_col[i]; /* Scatter */ + work_col[i] = 0.0; + iptr++; + } + } +#else + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); + dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], + &rhs_work[fsupc], &work[0] ); + + iptr = istart + nsupc; + for (i = 0; i < nrow; i++) { + irow = L_SUB(iptr); + rhs_work[irow] -= work[i]; + work[i] = 0.0; + iptr++; + } + } +#endif + } /* else ... */ + } /* for L-solve */ + +#ifdef DEBUG + printf("After L-solve: y=\n"); + dprint_soln(n, nrhs, Bmat); +#endif + + /* + * Back solve Ux=y. + */ + for (k = Lstore->nsuper; k >= 0; k--) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += nsupc * (nsupc + 1) * nrhs; + + if ( nsupc == 1 ) { + rhs_work = &Bmat[0]; + for (j = 0; j < nrhs; j++) { + rhs_work[fsupc] /= Lval[luptr]; + rhs_work += ldb; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("U", strlen("U")); + ftcs3 = _cptofcd("N", strlen("N")); + STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); +#else + dtrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); +#endif +#else + for (j = 0; j < nrhs; j++) + dusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); +#endif + } + + for (j = 0; j < nrhs; ++j) { + rhs_work = &Bmat[j*ldb]; + for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { + solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ + irow = U_SUB(i); + rhs_work[irow] -= rhs_work[jcol] * Uval[i]; + } + } + } + + } /* for U-solve */ + +#ifdef DEBUG + printf("After U-solve: x=\n"); + dprint_soln(n, nrhs, Bmat); +#endif + + /* Compute the final solution X := Pc*X. */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + stat->ops[SOLVE] = solve_ops; + + } else { /* Solve A'*X=B or CONJ(A)*X=B */ + /* Permute right hand sides to form Pc'*B. */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + stat->ops[SOLVE] = 0; + for (k = 0; k < nrhs; ++k) { + + /* Multiply by inv(U'). */ + sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); + + /* Multiply by inv(L'). */ + sp_dtrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); + + } + /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + } + + SUPERLU_FREE(work); + SUPERLU_FREE(soln); +} + +/* + * Diagnostic print of the solution vector + */ +void +dprint_soln(int n, int nrhs, double *soln) +{ + int i; + + for (i = 0; i < n; i++) + printf("\t%d: %.4f\n", i, soln[i]); +} diff --git a/src/maths/SuperLU/dlacon.c b/src/maths/SuperLU/dlacon.c new file mode 100644 index 000000000..bcbf32815 --- /dev/null +++ b/src/maths/SuperLU/dlacon.c @@ -0,0 +1,236 @@ + +/*! @file dlacon.c + * \brief Estimates the 1-norm + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + *+ */ +#include
+ * Purpose + * ======= + * + * DLACON estimates the 1-norm of a square matrix A. + * Reverse communication is used for evaluating matrix-vector products. + * + * + * Arguments + * ========= + * + * N (input) INT + * The order of the matrix. N >= 1. + * + * V (workspace) DOUBLE PRECISION array, dimension (N) + * On the final return, V = A*W, where EST = norm(V)/norm(W) + * (W is not returned). + * + * X (input/output) DOUBLE PRECISION array, dimension (N) + * On an intermediate return, X should be overwritten by + * A * X, if KASE=1, + * A' * X, if KASE=2, + * and DLACON must be re-called with all the other parameters + * unchanged. + * + * ISGN (workspace) INT array, dimension (N) + * + * EST (output) DOUBLE PRECISION + * An estimate (a lower bound) for norm(A). + * + * KASE (input/output) INT + * On the initial call to DLACON, KASE should be 0. + * On an intermediate return, KASE will be 1 or 2, indicating + * whether X should be overwritten by A * X or A' * X. + * On the final return from DLACON, KASE will again be 0. + * + * Further Details + * ======= ======= + * + * Contributed by Nick Higham, University of Manchester. + * Originally named CONEST, dated March 16, 1988. + * + * Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of + * a real or complex matrix, with applications to condition estimation", + * ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. + * ===================================================================== + *+ */ + +int +dlacon_(int *n, double *v, double *x, int *isgn, double *est, int *kase) + +{ + + + /* Table of constant values */ + int c__1 = 1; + double zero = 0.0; + double one = 1.0; + + /* Local variables */ + static int iter; + static int jump, jlast; + static double altsgn, estold; + static int i, j; + double temp; +#ifdef _CRAY + extern int ISAMAX(int *, double *, int *); + extern double SASUM(int *, double *, int *); + extern int SCOPY(int *, double *, int *, double *, int *); +#else + extern int idamax_(int *, double *, int *); + extern double dasum_(int *, double *, int *); + extern int dcopy_(int *, double *, int *, double *, int *); +#endif +#define d_sign(a, b) (b >= 0 ? fabs(a) : -fabs(a)) /* Copy sign */ +#define i_dnnt(a) \ + ( a>=0 ? floor(a+.5) : -floor(.5-a) ) /* Round to nearest integer */ + + if ( *kase == 0 ) { + for (i = 0; i < *n; ++i) { + x[i] = 1. / (double) (*n); + } + *kase = 1; + jump = 1; + return 0; + } + + switch (jump) { + case 1: goto L20; + case 2: goto L40; + case 3: goto L70; + case 4: goto L110; + case 5: goto L140; + } + + /* ................ ENTRY (JUMP = 1) + FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X. */ + L20: + if (*n == 1) { + v[0] = x[0]; + *est = fabs(v[0]); + /* ... QUIT */ + goto L150; + } +#ifdef _CRAY + *est = SASUM(n, x, &c__1); +#else + *est = dasum_(n, x, &c__1); +#endif + + for (i = 0; i < *n; ++i) { + x[i] = d_sign(one, x[i]); + isgn[i] = i_dnnt(x[i]); + } + *kase = 2; + jump = 2; + return 0; + + /* ................ ENTRY (JUMP = 2) + FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ +L40: +#ifdef _CRAY + j = ISAMAX(n, &x[0], &c__1); +#else + j = idamax_(n, &x[0], &c__1); +#endif + --j; + iter = 2; + + /* MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */ +L50: + for (i = 0; i < *n; ++i) x[i] = zero; + x[j] = one; + *kase = 1; + jump = 3; + return 0; + + /* ................ ENTRY (JUMP = 3) + X HAS BEEN OVERWRITTEN BY A*X. */ +L70: +#ifdef _CRAY + SCOPY(n, x, &c__1, v, &c__1); +#else + dcopy_(n, x, &c__1, v, &c__1); +#endif + estold = *est; +#ifdef _CRAY + *est = SASUM(n, v, &c__1); +#else + *est = dasum_(n, v, &c__1); +#endif + + for (i = 0; i < *n; ++i) + if (i_dnnt(d_sign(one, x[i])) != isgn[i]) + goto L90; + + /* REPEATED SIGN VECTOR DETECTED, HENCE ALGORITHM HAS CONVERGED. */ + goto L120; + +L90: + /* TEST FOR CYCLING. */ + if (*est <= estold) goto L120; + + for (i = 0; i < *n; ++i) { + x[i] = d_sign(one, x[i]); + isgn[i] = i_dnnt(x[i]); + } + *kase = 2; + jump = 4; + return 0; + + /* ................ ENTRY (JUMP = 4) + X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */ +L110: + jlast = j; +#ifdef _CRAY + j = ISAMAX(n, &x[0], &c__1); +#else + j = idamax_(n, &x[0], &c__1); +#endif + --j; + if (x[jlast] != fabs(x[j]) && iter < 5) { + ++iter; + goto L50; + } + + /* ITERATION COMPLETE. FINAL STAGE. */ +L120: + altsgn = 1.; + for (i = 1; i <= *n; ++i) { + x[i-1] = altsgn * ((double)(i - 1) / (double)(*n - 1) + 1.); + altsgn = -altsgn; + } + *kase = 1; + jump = 5; + return 0; + + /* ................ ENTRY (JUMP = 5) + X HAS BEEN OVERWRITTEN BY A*X. */ +L140: +#ifdef _CRAY + temp = SASUM(n, x, &c__1) / (double)(*n * 3) * 2.; +#else + temp = dasum_(n, x, &c__1) / (double)(*n * 3) * 2.; +#endif + if (temp > *est) { +#ifdef _CRAY + SCOPY(n, &x[0], &c__1, &v[0], &c__1); +#else + dcopy_(n, &x[0], &c__1, &v[0], &c__1); +#endif + *est = temp; + } + +L150: + *kase = 0; + return 0; + +} /* dlacon_ */ diff --git a/src/maths/SuperLU/dlamch.c b/src/maths/SuperLU/dlamch.c new file mode 100644 index 000000000..628cf5307 --- /dev/null +++ b/src/maths/SuperLU/dlamch.c @@ -0,0 +1,971 @@ +/*! @file dlamch.c + * \brief Determines double precision machine parameters + * + *
+ * -- LAPACK auxiliary routine (version 2.0) -- + * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., + * Courant Institute, Argonne National Lab, and Rice University + * October 31, 1992 + *+ */ +#include
+ Purpose + ======= + + DLAMCH determines double precision machine parameters. + + Arguments + ========= + + CMACH (input) CHARACTER*1 + Specifies the value to be returned by DLAMCH: + = 'E' or 'e', DLAMCH := eps + = 'S' or 's , DLAMCH := sfmin + = 'B' or 'b', DLAMCH := base + = 'P' or 'p', DLAMCH := eps*base + = 'N' or 'n', DLAMCH := t + = 'R' or 'r', DLAMCH := rnd + = 'M' or 'm', DLAMCH := emin + = 'U' or 'u', DLAMCH := rmin + = 'L' or 'l', DLAMCH := emax + = 'O' or 'o', DLAMCH := rmax + + where + + eps = relative machine precision + sfmin = safe minimum, such that 1/sfmin does not overflow + base = base of the machine + prec = eps*base + t = number of (base) digits in the mantissa + rnd = 1.0 when rounding occurs in addition, 0.0 otherwise + emin = minimum exponent before (gradual) underflow + rmin = underflow threshold - base**(emin-1) + emax = largest exponent before overflow + rmax = overflow threshold - (base**emax)*(1-eps) + + ===================================================================== ++*/ +double dlamch_(char *cmach) +{ + + + static int first = TRUE_; + + /* System generated locals */ + int i__1; + double ret_val; + /* Builtin functions */ + double pow_di(double *, int *); + /* Local variables */ + static double base; + static int beta; + static double emin, prec, emax; + static int imin, imax; + static int lrnd; + static double rmin, rmax, t, rmach; + extern int lsame_(char *, char *); + static double small, sfmin; + extern /* Subroutine */ int dlamc2_(int *, int *, int *, + double *, int *, double *, int *, double *); + static int it; + static double rnd, eps; + + if (first) { + first = FALSE_; + dlamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); + base = (double) beta; + t = (double) it; + if (lrnd) { + rnd = 1.; + i__1 = 1 - it; + eps = pow_di(&base, &i__1) / 2; + } else { + rnd = 0.; + i__1 = 1 - it; + eps = pow_di(&base, &i__1); + } + prec = eps * base; + emin = (double) imin; + emax = (double) imax; + sfmin = rmin; + small = 1. / rmax; + if (small >= sfmin) { + + /* Use SMALL plus a bit, to avoid the possibility of rounding + causing overflow when computing 1/sfmin. */ + sfmin = small * (eps + 1.); + } + } + + if (lsame_(cmach, "E")) { + rmach = eps; + } else if (lsame_(cmach, "S")) { + rmach = sfmin; + } else if (lsame_(cmach, "B")) { + rmach = base; + } else if (lsame_(cmach, "P")) { + rmach = prec; + } else if (lsame_(cmach, "N")) { + rmach = t; + } else if (lsame_(cmach, "R")) { + rmach = rnd; + } else if (lsame_(cmach, "M")) { + rmach = emin; + } else if (lsame_(cmach, "U")) { + rmach = rmin; + } else if (lsame_(cmach, "L")) { + rmach = emax; + } else if (lsame_(cmach, "O")) { + rmach = rmax; + } + + ret_val = rmach; + return ret_val; + +/* End of DLAMCH */ + +} /* dlamch_ */ +/* Subroutine */ +/*! \brief + +
+ Purpose + ======= + + DLAMC1 determines the machine parameters given by BETA, T, RND, and + IEEE1. + + Arguments + ========= + + BETA (output) INT + The base of the machine. + + T (output) INT + The number of ( BETA ) digits in the mantissa. + + RND (output) INT + Specifies whether proper rounding ( RND = .TRUE. ) or + chopping ( RND = .FALSE. ) occurs in addition. This may not + + be a reliable guide to the way in which the machine performs + + its arithmetic. + + IEEE1 (output) INT + Specifies whether rounding appears to be done in the IEEE + 'round to nearest' style. + + Further Details + =============== + + The routine is based on the routine ENVRON by Malcolm and + incorporates suggestions by Gentleman and Marovich. See + + Malcolm M. A. (1972) Algorithms to reveal properties of + floating-point arithmetic. Comms. of the ACM, 15, 949-951. + + Gentleman W. M. and Marovich S. B. (1974) More on algorithms + that reveal properties of floating point arithmetic units. + Comms. of the ACM, 17, 276-277. + + ===================================================================== ++*/ +int dlamc1_(int *beta, int *t, int *rnd, int + *ieee1) +{ + /* Initialized data */ + static int first = TRUE_; + /* System generated locals */ + double d__1, d__2; + /* Local variables */ + static int lrnd; + static double a, b, c, f; + static int lbeta; + static double savec; + extern double dlamc3_(double *, double *); + static int lieee1; + static double t1, t2; + static int lt; + static double one, qtr; + + if (first) { + first = FALSE_; + one = 1.; + +/* LBETA, LIEEE1, LT and LRND are the local values of BE +TA, + IEEE1, T and RND. + + Throughout this routine we use the function DLAMC3 to ens +ure + that relevant values are stored and not held in registers, + or + are not affected by optimizers. + + Compute a = 2.0**m with the smallest positive integer m s +uch + that + + fl( a + 1.0 ) = a. */ + + a = 1.; + c = 1.; + +/* + WHILE( C.EQ.ONE )LOOP */ +L10: + if (c == one) { + a *= 2; + c = dlamc3_(&a, &one); + d__1 = -a; + c = dlamc3_(&c, &d__1); + goto L10; + } +/* + END WHILE + + Now compute b = 2.0**m with the smallest positive integer +m + such that + + fl( a + b ) .gt. a. */ + + b = 1.; + c = dlamc3_(&a, &b); + +/* + WHILE( C.EQ.A )LOOP */ +L20: + if (c == a) { + b *= 2; + c = dlamc3_(&a, &b); + goto L20; + } +/* + END WHILE + + Now compute the base. a and c are neighbouring floating po +int + numbers in the interval ( beta**t, beta**( t + 1 ) ) and + so + their difference is beta. Adding 0.25 to c is to ensure that + it + is truncated to beta and not ( beta - 1 ). */ + + qtr = one / 4; + savec = c; + d__1 = -a; + c = dlamc3_(&c, &d__1); + lbeta = (int) (c + qtr); + +/* Now determine whether rounding or chopping occurs, by addin +g a + bit less than beta/2 and a bit more than beta/2 to + a. */ + + b = (double) lbeta; + d__1 = b / 2; + d__2 = -b / 100; + f = dlamc3_(&d__1, &d__2); + c = dlamc3_(&f, &a); + if (c == a) { + lrnd = TRUE_; + } else { + lrnd = FALSE_; + } + d__1 = b / 2; + d__2 = b / 100; + f = dlamc3_(&d__1, &d__2); + c = dlamc3_(&f, &a); + if (lrnd && c == a) { + lrnd = FALSE_; + } + +/* Try and decide whether rounding is done in the IEEE 'round + to + nearest' style. B/2 is half a unit in the last place of the +two + numbers A and SAVEC. Furthermore, A is even, i.e. has last +bit + zero, and SAVEC is odd. Thus adding B/2 to A should not cha +nge + A, but adding B/2 to SAVEC should change SAVEC. */ + + d__1 = b / 2; + t1 = dlamc3_(&d__1, &a); + d__1 = b / 2; + t2 = dlamc3_(&d__1, &savec); + lieee1 = t1 == a && t2 > savec && lrnd; + +/* Now find the mantissa, t. It should be the integer part + of + log to the base beta of a, however it is safer to determine + t + by powering. So we find t as the smallest positive integer +for + which + + fl( beta**t + 1.0 ) = 1.0. */ + + lt = 0; + a = 1.; + c = 1.; + +/* + WHILE( C.EQ.ONE )LOOP */ +L30: + if (c == one) { + ++lt; + a *= lbeta; + c = dlamc3_(&a, &one); + d__1 = -a; + c = dlamc3_(&c, &d__1); + goto L30; + } +/* + END WHILE */ + + } + + *beta = lbeta; + *t = lt; + *rnd = lrnd; + *ieee1 = lieee1; + return 0; + +/* End of DLAMC1 */ + +} /* dlamc1_ */ + + +/* Subroutine */ +/*! \brief + +
+ Purpose + ======= + + DLAMC2 determines the machine parameters specified in its argument + list. + + Arguments + ========= + + BETA (output) INT + The base of the machine. + + T (output) INT + The number of ( BETA ) digits in the mantissa. + + RND (output) INT + Specifies whether proper rounding ( RND = .TRUE. ) or + chopping ( RND = .FALSE. ) occurs in addition. This may not + + be a reliable guide to the way in which the machine performs + + its arithmetic. + + EPS (output) DOUBLE PRECISION + The smallest positive number such that + + fl( 1.0 - EPS ) .LT. 1.0, + + where fl denotes the computed value. + + EMIN (output) INT + The minimum exponent before (gradual) underflow occurs. + + RMIN (output) DOUBLE PRECISION + The smallest normalized number for the machine, given by + BASE**( EMIN - 1 ), where BASE is the floating point value + + of BETA. + + EMAX (output) INT + The maximum exponent before overflow occurs. + + RMAX (output) DOUBLE PRECISION + The largest positive number for the machine, given by + BASE**EMAX * ( 1 - EPS ), where BASE is the floating point + + value of BETA. + + Further Details + =============== + + The computation of EPS is based on a routine PARANOIA by + W. Kahan of the University of California at Berkeley. + + ===================================================================== ++*/ +int dlamc2_(int *beta, int *t, int *rnd, + double *eps, int *emin, double *rmin, int *emax, + double *rmax) +{ + + /* Table of constant values */ + static int c__1 = 1; + + /* Initialized data */ + static int first = TRUE_; + static int iwarn = FALSE_; + /* System generated locals */ + int i__1; + double d__1, d__2, d__3, d__4, d__5; + /* Builtin functions */ + double pow_di(double *, int *); + /* Local variables */ + static int ieee; + static double half; + static int lrnd; + static double leps, zero, a, b, c; + static int i, lbeta; + static double rbase; + static int lemin, lemax, gnmin; + static double small; + static int gpmin; + static double third, lrmin, lrmax, sixth; + extern /* Subroutine */ int dlamc1_(int *, int *, int *, + int *); + extern double dlamc3_(double *, double *); + static int lieee1; + extern /* Subroutine */ int dlamc4_(int *, double *, int *), + dlamc5_(int *, int *, int *, int *, int *, + double *); + static int lt, ngnmin, ngpmin; + static double one, two; + + if (first) { + first = FALSE_; + zero = 0.; + one = 1.; + two = 2.; + +/* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values + of + BETA, T, RND, EPS, EMIN and RMIN. + + Throughout this routine we use the function DLAMC3 to ens +ure + that relevant values are stored and not held in registers, + or + are not affected by optimizers. + + DLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. +*/ + + dlamc1_(&lbeta, <, &lrnd, &lieee1); + +/* Start to find EPS. */ + + b = (double) lbeta; + i__1 = -lt; + a = pow_di(&b, &i__1); + leps = a; + +/* Try some tricks to see whether or not this is the correct E +PS. */ + + b = two / 3; + half = one / 2; + d__1 = -half; + sixth = dlamc3_(&b, &d__1); + third = dlamc3_(&sixth, &sixth); + d__1 = -half; + b = dlamc3_(&third, &d__1); + b = dlamc3_(&b, &sixth); + b = abs(b); + if (b < leps) { + b = leps; + } + + leps = 1.; + +/* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */ +L10: + if (leps > b && b > zero) { + leps = b; + d__1 = half * leps; +/* Computing 5th power */ + d__3 = two, d__4 = d__3, d__3 *= d__3; +/* Computing 2nd power */ + d__5 = leps; + d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5); + c = dlamc3_(&d__1, &d__2); + d__1 = -c; + c = dlamc3_(&half, &d__1); + b = dlamc3_(&half, &c); + d__1 = -b; + c = dlamc3_(&half, &d__1); + b = dlamc3_(&half, &c); + goto L10; + } +/* + END WHILE */ + + if (a < leps) { + leps = a; + } + +/* Computation of EPS complete. + + Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3 +)). + Keep dividing A by BETA until (gradual) underflow occurs. T +his + is detected when we cannot recover the previous A. */ + + rbase = one / lbeta; + small = one; + for (i = 1; i <= 3; ++i) { + d__1 = small * rbase; + small = dlamc3_(&d__1, &zero); +/* L20: */ + } + a = dlamc3_(&one, &small); + dlamc4_(&ngpmin, &one, &lbeta); + d__1 = -one; + dlamc4_(&ngnmin, &d__1, &lbeta); + dlamc4_(&gpmin, &a, &lbeta); + d__1 = -a; + dlamc4_(&gnmin, &d__1, &lbeta); + ieee = FALSE_; + + if (ngpmin == ngnmin && gpmin == gnmin) { + if (ngpmin == gpmin) { + lemin = ngpmin; +/* ( Non twos-complement machines, no gradual under +flow; + e.g., VAX ) */ + } else if (gpmin - ngpmin == 3) { + lemin = ngpmin - 1 + lt; + ieee = TRUE_; +/* ( Non twos-complement machines, with gradual und +erflow; + e.g., IEEE standard followers ) */ + } else { + lemin = min(ngpmin,gpmin); +/* ( A guess; no known machine ) */ + iwarn = TRUE_; + } + + } else if (ngpmin == gpmin && ngnmin == gnmin) { + if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) { + lemin = max(ngpmin,ngnmin); +/* ( Twos-complement machines, no gradual underflow +; + e.g., CYBER 205 ) */ + } else { + lemin = min(ngpmin,ngnmin); +/* ( A guess; no known machine ) */ + iwarn = TRUE_; + } + + } else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin) + { + if (gpmin - min(ngpmin,ngnmin) == 3) { + lemin = max(ngpmin,ngnmin) - 1 + lt; +/* ( Twos-complement machines with gradual underflo +w; + no known machine ) */ + } else { + lemin = min(ngpmin,ngnmin); +/* ( A guess; no known machine ) */ + iwarn = TRUE_; + } + + } else { +/* Computing MIN */ + i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin); + lemin = min(i__1,gnmin); +/* ( A guess; no known machine ) */ + iwarn = TRUE_; + } +/* ** + Comment out this if block if EMIN is ok */ + if (iwarn) { + first = TRUE_; + printf("\n\n WARNING. The value EMIN may be incorrect:- "); + printf("EMIN = %8i\n",lemin); + printf("If, after inspection, the value EMIN looks acceptable"); + printf("please comment out \n the IF block as marked within the"); + printf("code of routine DLAMC2, \n otherwise supply EMIN"); + printf("explicitly.\n"); + } +/* ** + + Assume IEEE arithmetic if we found denormalised numbers abo +ve, + or if arithmetic seems to round in the IEEE style, determi +ned + in routine DLAMC1. A true IEEE machine should have both thi +ngs + true; however, faulty machines may have one or the other. */ + + ieee = ieee || lieee1; + +/* Compute RMIN by successive division by BETA. We could comp +ute + RMIN as BASE**( EMIN - 1 ), but some machines underflow dur +ing + this computation. */ + + lrmin = 1.; + i__1 = 1 - lemin; + for (i = 1; i <= 1-lemin; ++i) { + d__1 = lrmin * rbase; + lrmin = dlamc3_(&d__1, &zero); +/* L30: */ + } + +/* Finally, call DLAMC5 to compute EMAX and RMAX. */ + + dlamc5_(&lbeta, <, &lemin, &ieee, &lemax, &lrmax); + } + + *beta = lbeta; + *t = lt; + *rnd = lrnd; + *eps = leps; + *emin = lemin; + *rmin = lrmin; + *emax = lemax; + *rmax = lrmax; + + return 0; + + +/* End of DLAMC2 */ + +} /* dlamc2_ */ + +/*! \brief + +
+ Purpose + ======= + + DLAMC3 is intended to force A and B to be stored prior to doing + + the addition of A and B , for use in situations where optimizers + + might hold one of these in a register. + + Arguments + ========= + + A, B (input) DOUBLE PRECISION + The values A and B. + + ===================================================================== ++*/ +double dlamc3_(double *a, double *b) +{ +/* >>Start of File<< + System generated locals */ + double ret_val; + + ret_val = *a + *b; + + return ret_val; + +/* End of DLAMC3 */ + +} /* dlamc3_ */ + + +/* Subroutine */ +/*! \brief + +
+ Purpose + ======= + + DLAMC4 is a service routine for DLAMC2. + + Arguments + ========= + + EMIN (output) EMIN + The minimum exponent before (gradual) underflow, computed by + + setting A = START and dividing by BASE until the previous A + can not be recovered. + + START (input) DOUBLE PRECISION + The starting point for determining EMIN. + + BASE (input) INT + The base of the machine. + + ===================================================================== ++*/ + +int dlamc4_(int *emin, double *start, int *base) +{ + /* System generated locals */ + int i__1; + double d__1; + /* Local variables */ + static double zero, a; + static int i; + static double rbase, b1, b2, c1, c2, d1, d2; + extern double dlamc3_(double *, double *); + static double one; + + a = *start; + one = 1.; + rbase = one / *base; + zero = 0.; + *emin = 1; + d__1 = a * rbase; + b1 = dlamc3_(&d__1, &zero); + c1 = a; + c2 = a; + d1 = a; + d2 = a; +/* + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. + $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP */ +L10: + if (c1 == a && c2 == a && d1 == a && d2 == a) { + --(*emin); + a = b1; + d__1 = a / *base; + b1 = dlamc3_(&d__1, &zero); + d__1 = b1 * *base; + c1 = dlamc3_(&d__1, &zero); + d1 = zero; + i__1 = *base; + for (i = 1; i <= *base; ++i) { + d1 += b1; +/* L20: */ + } + d__1 = a * rbase; + b2 = dlamc3_(&d__1, &zero); + d__1 = b2 / rbase; + c2 = dlamc3_(&d__1, &zero); + d2 = zero; + i__1 = *base; + for (i = 1; i <= *base; ++i) { + d2 += b2; +/* L30: */ + } + goto L10; + } +/* + END WHILE */ + + return 0; + +/* End of DLAMC4 */ + +} /* dlamc4_ */ + + +/* Subroutine */ +/*! \brief + +
+ Purpose + ======= + + DLAMC5 attempts to compute RMAX, the largest machine floating-point + number, without overflow. It assumes that EMAX + abs(EMIN) sum + approximately to a power of 2. It will fail on machines where this + assumption does not hold, for example, the Cyber 205 (EMIN = -28625, + + EMAX = 28718). It will also fail if the value supplied for EMIN is + too large (i.e. too close to zero), probably with overflow. + + Arguments + ========= + + BETA (input) INT + The base of floating-point arithmetic. + + P (input) INT + The number of base BETA digits in the mantissa of a + floating-point value. + + EMIN (input) INT + The minimum exponent before (gradual) underflow. + + IEEE (input) INT + A int flag specifying whether or not the arithmetic + system is thought to comply with the IEEE standard. + + EMAX (output) INT + The largest exponent before overflow + + RMAX (output) DOUBLE PRECISION + The largest machine floating-point number. + + ===================================================================== + + + + First compute LEXP and UEXP, two powers of 2 that bound + abs(EMIN). We then assume that EMAX + abs(EMIN) will sum + approximately to the bound that is closest to abs(EMIN). + (EMAX is the exponent of the required number RMAX). ++*/ +int dlamc5_(int *beta, int *p, int *emin, + int *ieee, int *emax, double *rmax) +{ + + /* Table of constant values */ + static double c_b5 = 0.; + + /* System generated locals */ + int i__1; + double d__1; + /* Local variables */ + static int lexp; + static double oldy; + static int uexp, i; + static double y, z; + static int nbits; + extern double dlamc3_(double *, double *); + static double recbas; + static int exbits, expsum, try__; + + + + lexp = 1; + exbits = 1; +L10: + try__ = lexp << 1; + if (try__ <= -(*emin)) { + lexp = try__; + ++exbits; + goto L10; + } + if (lexp == -(*emin)) { + uexp = lexp; + } else { + uexp = try__; + ++exbits; + } + +/* Now -LEXP is less than or equal to EMIN, and -UEXP is greater + than or equal to EMIN. EXBITS is the number of bits needed to + store the exponent. */ + + if (uexp + *emin > -lexp - *emin) { + expsum = lexp << 1; + } else { + expsum = uexp << 1; + } + +/* EXPSUM is the exponent range, approximately equal to + EMAX - EMIN + 1 . */ + + *emax = expsum + *emin - 1; + nbits = exbits + 1 + *p; + +/* NBITS is the total number of bits needed to store a + floating-point number. */ + + if (nbits % 2 == 1 && *beta == 2) { + +/* Either there are an odd number of bits used to store a + floating-point number, which is unlikely, or some bits are + + not used in the representation of numbers, which is possible +, + (e.g. Cray machines) or the mantissa has an implicit bit, + (e.g. IEEE machines, Dec Vax machines), which is perhaps the + + most likely. We have to assume the last alternative. + If this is true, then we need to reduce EMAX by one because + + there must be some way of representing zero in an implicit-b +it + system. On machines like Cray, we are reducing EMAX by one + + unnecessarily. */ + + --(*emax); + } + + if (*ieee) { + +/* Assume we are on an IEEE machine which reserves one exponent + + for infinity and NaN. */ + + --(*emax); + } + +/* Now create RMAX, the largest machine number, which should + be equal to (1.0 - BETA**(-P)) * BETA**EMAX . + + First compute 1.0 - BETA**(-P), being careful that the + result is less than 1.0 . */ + + recbas = 1. / *beta; + z = *beta - 1.; + y = 0.; + i__1 = *p; + for (i = 1; i <= *p; ++i) { + z *= recbas; + if (y < 1.) { + oldy = y; + } + y = dlamc3_(&y, &z); +/* L20: */ + } + if (y >= 1.) { + y = oldy; + } + +/* Now multiply by BETA**EMAX to get RMAX. */ + + i__1 = *emax; + for (i = 1; i <= *emax; ++i) { + d__1 = y * *beta; + y = dlamc3_(&d__1, &c_b5); +/* L30: */ + } + + *rmax = y; + return 0; + +/* End of DLAMC5 */ + +} /* dlamc5_ */ + +double pow_di(double *ap, int *bp) +{ + double pow, x; + int n; + + pow = 1; + x = *ap; + n = *bp; + + if(n != 0){ + if(n < 0) { + n = -n; + x = 1/x; + } + for( ; ; ) { + if(n & 01) pow *= x; + if(n >>= 1) x *= x; + else break; + } + } + return(pow); +} + diff --git a/src/maths/SuperLU/dlangs.c b/src/maths/SuperLU/dlangs.c new file mode 100644 index 000000000..c0e9797cd --- /dev/null +++ b/src/maths/SuperLU/dlangs.c @@ -0,0 +1,119 @@ + +/*! @file dlangs.c + * \brief Returns the value of the one norm + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Modified from lapack routine DLANGE + *+ */ +/* + * File name: dlangs.c + * History: Modified from lapack routine DLANGE + */ +#include
+ * Purpose + * ======= + * + * DLANGS returns the value of the one norm, or the Frobenius norm, or + * the infinity norm, or the element of largest absolute value of a + * real matrix A. + * + * Description + * =========== + * + * DLANGE returns the value + * + * DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' + * ( + * ( norm1(A), NORM = '1', 'O' or 'o' + * ( + * ( normI(A), NORM = 'I' or 'i' + * ( + * ( normF(A), NORM = 'F', 'f', 'E' or 'e' + * + * where norm1 denotes the one norm of a matrix (maximum column sum), + * normI denotes the infinity norm of a matrix (maximum row sum) and + * normF denotes the Frobenius norm of a matrix (square root of sum of + * squares). Note that max(abs(A(i,j))) is not a matrix norm. + * + * Arguments + * ========= + * + * NORM (input) CHARACTER*1 + * Specifies the value to be returned in DLANGE as described above. + * A (input) SuperMatrix* + * The M by N sparse matrix A. + * + * ===================================================================== + *+ */ + +double dlangs(char *norm, SuperMatrix *A) +{ + + /* Local variables */ + NCformat *Astore; + double *Aval; + int i, j, irow; + double value, sum; + double *rwork; + + Astore = A->Store; + Aval = Astore->nzval; + + if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { + value = 0.; + + } else if (lsame_(norm, "M")) { + /* Find max(abs(A(i,j))). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + value = SUPERLU_MAX( value, fabs( Aval[i]) ); + + } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { + /* Find norm1(A). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) { + sum = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + sum += fabs(Aval[i]); + value = SUPERLU_MAX(value,sum); + } + + } else if (lsame_(norm, "I")) { + /* Find normI(A). */ + if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for rwork."); + for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { + irow = Astore->rowind[i]; + rwork[irow] += fabs(Aval[i]); + } + value = 0.; + for (i = 0; i < A->nrow; ++i) + value = SUPERLU_MAX(value, rwork[i]); + + SUPERLU_FREE (rwork); + + } else if (lsame_(norm, "F") || lsame_(norm, "E")) { + /* Find normF(A). */ + ABORT_SuperLU("Not implemented."); + } else + ABORT_SuperLU("Illegal norm specified."); + + return (value); + +} /* dlangs */ + diff --git a/src/maths/SuperLU/dlaqgs.c b/src/maths/SuperLU/dlaqgs.c new file mode 100644 index 000000000..aebbc0782 --- /dev/null +++ b/src/maths/SuperLU/dlaqgs.c @@ -0,0 +1,145 @@ + +/*! @file dlaqgs.c + * \brief Equlibrates a general sprase matrix + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Modified from LAPACK routine DLAQGE + *+ */ +/* + * File name: dlaqgs.c + * History: Modified from LAPACK routine DLAQGE + */ +#include
+ * Purpose + * ======= + * + * DLAQGS equilibrates a general sparse M by N matrix A using the row and + * scaling factors in the vectors R and C. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * A (input/output) SuperMatrix* + * On exit, the equilibrated matrix. See EQUED for the form of + * the equilibrated matrix. The type of A can be: + * Stype = NC; Dtype = SLU_D; Mtype = GE. + * + * R (input) double*, dimension (A->nrow) + * The row scale factors for A. + * + * C (input) double*, dimension (A->ncol) + * The column scale factors for A. + * + * ROWCND (input) double + * Ratio of the smallest R(i) to the largest R(i). + * + * COLCND (input) double + * Ratio of the smallest C(i) to the largest C(i). + * + * AMAX (input) double + * Absolute value of largest matrix entry. + * + * EQUED (output) char* + * Specifies the form of equilibration that was done. + * = 'N': No equilibration + * = 'R': Row equilibration, i.e., A has been premultiplied by + * diag(R). + * = 'C': Column equilibration, i.e., A has been postmultiplied + * by diag(C). + * = 'B': Both row and column equilibration, i.e., A has been + * replaced by diag(R) * A * diag(C). + * + * Internal Parameters + * =================== + * + * THRESH is a threshold value used to decide if row or column scaling + * should be done based on the ratio of the row or column scaling + * factors. If ROWCND < THRESH, row scaling is done, and if + * COLCND < THRESH, column scaling is done. + * + * LARGE and SMALL are threshold values used to decide if row scaling + * should be done based on the absolute size of the largest matrix + * element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. + * + * ===================================================================== + *+ */ + +void +dlaqgs(SuperMatrix *A, double *r, double *c, + double rowcnd, double colcnd, double amax, char *equed) +{ + + +#define THRESH (0.1) + + /* Local variables */ + NCformat *Astore; + double *Aval; + int i, j, irow; + double large, small, cj; + + + /* Quick return if possible */ + if (A->nrow <= 0 || A->ncol <= 0) { + *(unsigned char *)equed = 'N'; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + + /* Initialize LARGE and SMALL. */ + small = dlamch_("Safe minimum") / dlamch_("Precision"); + large = 1. / small; + + if (rowcnd >= THRESH && amax >= small && amax <= large) { + if (colcnd >= THRESH) + *(unsigned char *)equed = 'N'; + else { + /* Column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + Aval[i] *= cj; + } + } + *(unsigned char *)equed = 'C'; + } + } else if (colcnd >= THRESH) { + /* Row scaling, no column scaling */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + Aval[i] *= r[irow]; + } + *(unsigned char *)equed = 'R'; + } else { + /* Row and column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + Aval[i] *= cj * r[irow]; + } + } + *(unsigned char *)equed = 'B'; + } + + return; + +} /* dlaqgs */ + diff --git a/src/maths/SuperLU/dldperm.c b/src/maths/SuperLU/dldperm.c new file mode 100644 index 000000000..69c6bce50 --- /dev/null +++ b/src/maths/SuperLU/dldperm.c @@ -0,0 +1,165 @@ + +/*! @file + * \brief Finds a row permutation so that the matrix has large entries on the diagonal + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + *+ */ + +#include
+ * Purpose + * ======= + * + * DLDPERM finds a row permutation so that the matrix has large + * entries on the diagonal. + * + * Arguments + * ========= + * + * job (input) int + * Control the action. Possible values for JOB are: + * = 1 : Compute a row permutation of the matrix so that the + * permuted matrix has as many entries on its diagonal as + * possible. The values on the diagonal are of arbitrary size. + * HSL subroutine MC21A/AD is used for this. + * = 2 : Compute a row permutation of the matrix so that the smallest + * value on the diagonal of the permuted matrix is maximized. + * = 3 : Compute a row permutation of the matrix so that the smallest + * value on the diagonal of the permuted matrix is maximized. + * The algorithm differs from the one used for JOB = 2 and may + * have quite a different performance. + * = 4 : Compute a row permutation of the matrix so that the sum + * of the diagonal entries of the permuted matrix is maximized. + * = 5 : Compute a row permutation of the matrix so that the product + * of the diagonal entries of the permuted matrix is maximized + * and vectors to scale the matrix so that the nonzero diagonal + * entries of the permuted matrix are one in absolute value and + * all the off-diagonal entries are less than or equal to one in + * absolute value. + * Restriction: 1 <= JOB <= 5. + * + * n (input) int + * The order of the matrix. + * + * nnz (input) int + * The number of nonzeros in the matrix. + * + * adjncy (input) int*, of size nnz + * The adjacency structure of the matrix, which contains the row + * indices of the nonzeros. + * + * colptr (input) int*, of size n+1 + * The pointers to the beginning of each column in ADJNCY. + * + * nzval (input) double*, of size nnz + * The nonzero values of the matrix. nzval[k] is the value of + * the entry corresponding to adjncy[k]. + * It is not used if job = 1. + * + * perm (output) int*, of size n + * The permutation vector. perm[i] = j means row i in the + * original matrix is in row j of the permuted matrix. + * + * u (output) double*, of size n + * If job = 5, the natural logarithms of the row scaling factors. + * + * v (output) double*, of size n + * If job = 5, the natural logarithms of the column scaling factors. + * The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j). + *+ */ + +int +dldperm(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], + double nzval[], int_t *perm, double u[], double v[]) +{ + int_t i, liw, ldw, num; + int_t *iw, icntl[10], info[10]; + double *dw; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC("Enter dldperm()"); +#endif + liw = 5*n; + if ( job == 3 ) liw = 10*n + nnz; + if ( !(iw = intMalloc(liw)) ) ABORT_SuperLU("Malloc fails for iw[]"); + ldw = 3*n + nnz; + if ( !(dw = (double*) SUPERLU_MALLOC(ldw * sizeof(double))) ) + ABORT_SuperLU("Malloc fails for dw[]"); + + /* Increment one to get 1-based indexing. */ + for (i = 0; i <= n; ++i) ++colptr[i]; + for (i = 0; i < nnz; ++i) ++adjncy[i]; +#if ( DEBUGlevel>=2 ) + printf("LDPERM(): n %d, nnz %d\n", n, nnz); + slu_PrintInt10("colptr", n+1, colptr); + slu_PrintInt10("adjncy", nnz, adjncy); +#endif + + /* + * NOTE: + * ===== + * + * MC64AD assumes that column permutation vector is defined as: + * perm(i) = j means column i of permuted A is in column j of original A. + * + * Since a symmetric permutation preserves the diagonal entries. Then + * by the following relation: + * P'(A*P')P = P'A + * we can apply inverse(perm) to rows of A to get large diagonal entries. + * But, since 'perm' defined in MC64AD happens to be the reverse of + * SuperLU's definition of permutation vector, therefore, it is already + * an inverse for our purpose. We will thus use it directly. + * + */ + mc64id_(icntl); +#if 0 + /* Suppress error and warning messages. */ + icntl[0] = -1; + icntl[1] = -1; +#endif + + mc64ad_(&job, &n, &nnz, colptr, adjncy, nzval, &num, perm, + &liw, iw, &ldw, dw, icntl, info); + +#if ( DEBUGlevel>=2 ) + slu_PrintInt10("perm", n, perm); + printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); +#endif + if ( info[0] == 1 ) { /* Structurally singular */ + printf(".. The last %d permutations:\n", n-num); + slu_PrintInt10("perm", n-num, &perm[num]); + } + + /* Restore to 0-based indexing. */ + for (i = 0; i <= n; ++i) --colptr[i]; + for (i = 0; i < nnz; ++i) --adjncy[i]; + for (i = 0; i < n; ++i) --perm[i]; + + if ( job == 5 ) + for (i = 0; i < n; ++i) { + u[i] = dw[i]; + v[i] = dw[n+i]; + } + + SUPERLU_FREE(iw); + SUPERLU_FREE(dw); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC("Exit dldperm()"); +#endif + + return info[0]; +} diff --git a/src/maths/SuperLU/dmemory.c b/src/maths/SuperLU/dmemory.c new file mode 100644 index 000000000..de3f46349 --- /dev/null +++ b/src/maths/SuperLU/dmemory.c @@ -0,0 +1,701 @@ + +/*! @file dmemory.c + * \brief Memory details + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + *+ */ +#include
+ * mem_usage consists of the following fields: + * - for_lu (float) + * The amount of space used in bytes for the L\U data structures. + * - total_needed (float) + * The amount of space needed in bytes to perform factorization. + *+ */ +int dQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) +{ + SCformat *Lstore; + NCformat *Ustore; + register int n, iword, dword, panel_size = sp_ienv(1); + + Lstore = L->Store; + Ustore = U->Store; + n = L->ncol; + iword = sizeof(int); + dword = sizeof(double); + + /* For LU factors */ + mem_usage->for_lu = (float)( (4.0*n + 3.0) * iword + + Lstore->nzval_colptr[n] * dword + + Lstore->rowind_colptr[n] * iword ); + mem_usage->for_lu += (float)( (n + 1.0) * iword + + Ustore->colptr[n] * (dword + iword) ); + + /* Working storage to support factorization */ + mem_usage->total_needed = mem_usage->for_lu + + (float)( (2.0 * panel_size + 4.0 + NO_MARKER) * n * iword + + (panel_size + 1.0) * n * dword ); + + return 0; +} /* dQuerySpace */ + + +/*! \brief + * + *
+ * mem_usage consists of the following fields: + * - for_lu (float) + * The amount of space used in bytes for the L\U data structures. + * - total_needed (float) + * The amount of space needed in bytes to perform factorization. + *+ */ +int ilu_dQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) +{ + SCformat *Lstore; + NCformat *Ustore; + register int n, panel_size = sp_ienv(1); + register float iword, dword; + + Lstore = L->Store; + Ustore = U->Store; + n = L->ncol; + iword = sizeof(int); + dword = sizeof(double); + + /* For LU factors */ + mem_usage->for_lu = (float)( (4.0f * n + 3.0f) * iword + + Lstore->nzval_colptr[n] * dword + + Lstore->rowind_colptr[n] * iword ); + mem_usage->for_lu += (float)( (n + 1.0f) * iword + + Ustore->colptr[n] * (dword + iword) ); + + /* Working storage to support factorization. + ILU needs 5*n more integers than LU */ + mem_usage->total_needed = mem_usage->for_lu + + (float)( (2.0f * panel_size + 9.0f + NO_MARKER) * n * iword + + (panel_size + 1.0f) * n * dword ); + + return 0; +} /* ilu_dQuerySpace */ + + +/*! \brief Allocate storage for the data structures common to all factor routines. + * + *
+ * For those unpredictable size, estimate as fill_ratio * nnz(A). + * Return value: + * If lwork = -1, return the estimated amount of space required, plus n; + * otherwise, return the amount of space actually allocated when + * memory allocation failure occurred. + *+ */ +int +dLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz, + int panel_size, double fill_ratio, SuperMatrix *L, SuperMatrix *U, + GlobalLU_t *Glu, int **iwork, double **dwork) +{ + int info, iword, dword; + SCformat *Lstore; + NCformat *Ustore; + int *xsup, *supno; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + double *ucol; + int *usub, *xusub; + int nzlmax, nzumax, nzlumax; + + iword = sizeof(int); + dword = sizeof(double); + Glu->n = n; + Glu->num_expansions = 0; + + if ( !Glu->expanders ) + Glu->expanders = (ExpHeader*)SUPERLU_MALLOC( NO_MEMTYPE * + sizeof(ExpHeader) ); + if ( !Glu->expanders ) ABORT_SuperLU("SUPERLU_MALLOC fails for expanders"); + + if ( fact != SamePattern_SameRowPerm ) { + /* Guess for L\U factors */ + nzumax = nzlumax = fill_ratio * annz; + nzlmax = SUPERLU_MAX(1, fill_ratio/4.) * annz; + + if ( lwork == -1 ) { + return ( GluIntArray(n) * iword + TempSpace(m, panel_size) + + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); + } else { + dSetupSpace(work, lwork, Glu); + } + +#if ( PRNTlevel >= 1 ) + printf("dLUMemInit() called: fill_ratio %.0f, nzlmax %ld, nzumax %ld\n", + fill_ratio, nzlmax, nzumax); + fflush(stdout); +#endif + + /* Integer pointers for L\U factors */ + if ( Glu->MemModel == SYSTEM ) { + xsup = intMalloc(n+1); + supno = intMalloc(n+1); + xlsub = intMalloc(n+1); + xlusup = intMalloc(n+1); + xusub = intMalloc(n+1); + } else { + xsup = (int *)duser_malloc((n+1) * iword, HEAD, Glu); + supno = (int *)duser_malloc((n+1) * iword, HEAD, Glu); + xlsub = (int *)duser_malloc((n+1) * iword, HEAD, Glu); + xlusup = (int *)duser_malloc((n+1) * iword, HEAD, Glu); + xusub = (int *)duser_malloc((n+1) * iword, HEAD, Glu); + } + + lusup = (double *) dexpand( &nzlumax, LUSUP, 0, 0, Glu ); + ucol = (double *) dexpand( &nzumax, UCOL, 0, 0, Glu ); + lsub = (int *) dexpand( &nzlmax, LSUB, 0, 0, Glu ); + usub = (int *) dexpand( &nzumax, USUB, 0, 1, Glu ); + + while ( !lusup || !ucol || !lsub || !usub ) { + if ( Glu->MemModel == SYSTEM ) { + SUPERLU_FREE(lusup); + SUPERLU_FREE(ucol); + SUPERLU_FREE(lsub); + SUPERLU_FREE(usub); + } else { + duser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, + HEAD, Glu); + } + nzlumax /= 2; + nzumax /= 2; + nzlmax /= 2; + if ( nzlumax < annz ) { + printf("Not enough memory to perform factorization.\n"); + return (dmemory_usage(nzlmax, nzumax, nzlumax, n) + n); + } +#if ( PRNTlevel >= 1) + printf("dLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", + nzlmax, nzumax); + fflush(stdout); +#endif + lusup = (double *) dexpand( &nzlumax, LUSUP, 0, 0, Glu ); + ucol = (double *) dexpand( &nzumax, UCOL, 0, 0, Glu ); + lsub = (int *) dexpand( &nzlmax, LSUB, 0, 0, Glu ); + usub = (int *) dexpand( &nzumax, USUB, 0, 1, Glu ); + } + + } else { + /* fact == SamePattern_SameRowPerm */ + Lstore = L->Store; + Ustore = U->Store; + xsup = Lstore->sup_to_col; + supno = Lstore->col_to_sup; + xlsub = Lstore->rowind_colptr; + xlusup = Lstore->nzval_colptr; + xusub = Ustore->colptr; + nzlmax = Glu->nzlmax; /* max from previous factorization */ + nzumax = Glu->nzumax; + nzlumax = Glu->nzlumax; + + if ( lwork == -1 ) { + return ( GluIntArray(n) * iword + TempSpace(m, panel_size) + + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); + } else if ( lwork == 0 ) { + Glu->MemModel = SYSTEM; + } else { + Glu->MemModel = USER; + Glu->stack.top2 = (lwork/4)*4; /* must be word-addressable */ + Glu->stack.size = Glu->stack.top2; + } + + lsub = Glu->expanders[LSUB].mem = Lstore->rowind; + lusup = Glu->expanders[LUSUP].mem = Lstore->nzval; + usub = Glu->expanders[USUB].mem = Ustore->rowind; + ucol = Glu->expanders[UCOL].mem = Ustore->nzval;; + Glu->expanders[LSUB].size = nzlmax; + Glu->expanders[LUSUP].size = nzlumax; + Glu->expanders[USUB].size = nzumax; + Glu->expanders[UCOL].size = nzumax; + } + + Glu->xsup = xsup; + Glu->supno = supno; + Glu->lsub = lsub; + Glu->xlsub = xlsub; + Glu->lusup = lusup; + Glu->xlusup = xlusup; + Glu->ucol = ucol; + Glu->usub = usub; + Glu->xusub = xusub; + Glu->nzlmax = nzlmax; + Glu->nzumax = nzumax; + Glu->nzlumax = nzlumax; + + info = dLUWorkInit(m, n, panel_size, iwork, dwork, Glu); + if ( info ) + return ( info + dmemory_usage(nzlmax, nzumax, nzlumax, n) + n); + + ++Glu->num_expansions; + return 0; + +} /* dLUMemInit */ + +/*! \brief Allocate known working storage. Returns 0 if success, otherwise + returns the number of bytes allocated so far when failure occurred. */ +int +dLUWorkInit(int m, int n, int panel_size, int **iworkptr, + double **dworkptr, GlobalLU_t *Glu) +{ + int isize, dsize, extra; + double *old_ptr; + int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), + rowblk = sp_ienv(4); + + isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int); + dsize = (m * panel_size + + NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(double); + + if ( Glu->MemModel == SYSTEM ) + *iworkptr = (int *) intCalloc(isize/sizeof(int)); + else + *iworkptr = (int *) duser_malloc(isize, TAIL, Glu); + if ( ! *iworkptr ) { + fprintf(stderr, "dLUWorkInit: malloc fails for local iworkptr[]\n"); + return (isize + n); + } + + if ( Glu->MemModel == SYSTEM ) + *dworkptr = (double *) SUPERLU_MALLOC(dsize); + else { + *dworkptr = (double *) duser_malloc(dsize, TAIL, Glu); + if ( NotDoubleAlign(*dworkptr) ) { + old_ptr = *dworkptr; + *dworkptr = (double*) DoubleAlign(*dworkptr); + *dworkptr = (double*) ((double*)*dworkptr - 1); + extra = (char*)old_ptr - (char*)*dworkptr; +#ifdef DEBUG + printf("dLUWorkInit: not aligned, extra %d\n", extra); +#endif + Glu->stack.top2 -= extra; + Glu->stack.used += extra; + } + } + if ( ! *dworkptr ) { + fprintf(stderr, "malloc fails for local dworkptr[]."); + return (isize + dsize + n); + } + + return 0; +} + + +/*! \brief Set up pointers for real working arrays. + */ +void +dSetRWork(int m, int panel_size, double *dworkptr, + double **dense, double **tempv) +{ + double zero = 0.0; + + int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), + rowblk = sp_ienv(4); + *dense = dworkptr; + *tempv = *dense + panel_size*m; + dfill (*dense, m * panel_size, zero); + dfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); +} + +/*! \brief Free the working storage used by factor routines. + */ +void dLUWorkFree(int *iwork, double *dwork, GlobalLU_t *Glu) +{ + if ( Glu->MemModel == SYSTEM ) { + SUPERLU_FREE (iwork); + SUPERLU_FREE (dwork); + } else { + Glu->stack.used -= (Glu->stack.size - Glu->stack.top2); + Glu->stack.top2 = Glu->stack.size; +/* dStackCompress(Glu); */ + } + + SUPERLU_FREE (Glu->expanders); + Glu->expanders = NULL; +} + +/*! \brief Expand the data structures for L and U during the factorization. + * + *
+ * Return value: 0 - successful return + * > 0 - number of bytes allocated when run out of space + *+ */ +int +dLUMemXpand(int jcol, + int next, /* number of elements currently in the factors */ + MemType mem_type, /* which type of memory to expand */ + int *maxlen, /* modified - maximum length of a data structure */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + void *new_mem; + +#ifdef DEBUG + printf("dLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n", + jcol, next, *maxlen, mem_type); +#endif + + if (mem_type == USUB) + new_mem = dexpand(maxlen, mem_type, next, 1, Glu); + else + new_mem = dexpand(maxlen, mem_type, next, 0, Glu); + + if ( !new_mem ) { + int nzlmax = Glu->nzlmax; + int nzumax = Glu->nzumax; + int nzlumax = Glu->nzlumax; + fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); + return (dmemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n); + } + + switch ( mem_type ) { + case LUSUP: + Glu->lusup = (double *) new_mem; + Glu->nzlumax = *maxlen; + break; + case UCOL: + Glu->ucol = (double *) new_mem; + Glu->nzumax = *maxlen; + break; + case LSUB: + Glu->lsub = (int *) new_mem; + Glu->nzlmax = *maxlen; + break; + case USUB: + Glu->usub = (int *) new_mem; + Glu->nzumax = *maxlen; + break; + } + + return 0; + +} + + + +void +copy_mem_double(int howmany, void *old, void *new) +{ + register int i; + double *dold = old; + double *dnew = new; + for (i = 0; i < howmany; i++) dnew[i] = dold[i]; +} + +/*! \brief Expand the existing storage to accommodate more fill-ins. + */ +void +*dexpand ( + int *prev_len, /* length used from previous call */ + MemType type, /* which part of the memory to expand */ + int len_to_copy, /* size of the memory to be copied to new store */ + int keep_prev, /* = 1: use prev_len; + = 0: compute new_len to expand */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + float EXPAND = 1.5; + float alpha; + void *new_mem, *old_mem; + int new_len, tries, lword, extra, bytes_to_copy; + ExpHeader *expanders = Glu->expanders; /* Array of 4 types of memory */ + + alpha = EXPAND; + + if ( Glu->num_expansions == 0 || keep_prev ) { + /* First time allocate requested */ + new_len = *prev_len; + } else { + new_len = alpha * *prev_len; + } + + if ( type == LSUB || type == USUB ) lword = sizeof(int); + else lword = sizeof(double); + + if ( Glu->MemModel == SYSTEM ) { + new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); + if ( Glu->num_expansions != 0 ) { + tries = 0; + if ( keep_prev ) { + if ( !new_mem ) return (NULL); + } else { + while ( !new_mem ) { + if ( ++tries > 10 ) return (NULL); + alpha = Reduce(alpha); + new_len = alpha * *prev_len; + new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); + } + } + if ( type == LSUB || type == USUB ) { + copy_mem_int(len_to_copy, expanders[type].mem, new_mem); + } else { + copy_mem_double(len_to_copy, expanders[type].mem, new_mem); + } + SUPERLU_FREE (expanders[type].mem); + } + expanders[type].mem = (void *) new_mem; + + } else { /* MemModel == USER */ + if ( Glu->num_expansions == 0 ) { + new_mem = duser_malloc(new_len * lword, HEAD, Glu); + if ( NotDoubleAlign(new_mem) && + (type == LUSUP || type == UCOL) ) { + old_mem = new_mem; + new_mem = (void *)DoubleAlign(new_mem); + extra = (char*)new_mem - (char*)old_mem; +#ifdef DEBUG + printf("expand(): not aligned, extra %d\n", extra); +#endif + Glu->stack.top1 += extra; + Glu->stack.used += extra; + } + expanders[type].mem = (void *) new_mem; + } else { + tries = 0; + extra = (new_len - *prev_len) * lword; + if ( keep_prev ) { + if ( StackFull(extra) ) return (NULL); + } else { + while ( StackFull(extra) ) { + if ( ++tries > 10 ) return (NULL); + alpha = Reduce(alpha); + new_len = alpha * *prev_len; + extra = (new_len - *prev_len) * lword; + } + } + + if ( type != USUB ) { + new_mem = (void*)((char*)expanders[type + 1].mem + extra); + bytes_to_copy = (char*)Glu->stack.array + Glu->stack.top1 + - (char*)expanders[type + 1].mem; + user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); + + if ( type < USUB ) { + Glu->usub = expanders[USUB].mem = + (void*)((char*)expanders[USUB].mem + extra); + } + if ( type < LSUB ) { + Glu->lsub = expanders[LSUB].mem = + (void*)((char*)expanders[LSUB].mem + extra); + } + if ( type < UCOL ) { + Glu->ucol = expanders[UCOL].mem = + (void*)((char*)expanders[UCOL].mem + extra); + } + Glu->stack.top1 += extra; + Glu->stack.used += extra; + if ( type == UCOL ) { + Glu->stack.top1 += extra; /* Add same amount for USUB */ + Glu->stack.used += extra; + } + + } /* if ... */ + + } /* else ... */ + } + + expanders[type].size = new_len; + *prev_len = new_len; + if ( Glu->num_expansions ) ++Glu->num_expansions; + + return (void *) expanders[type].mem; + +} /* dexpand */ + + +/*! \brief Compress the work[] array to remove fragmentation. + */ +void +dStackCompress(GlobalLU_t *Glu) +{ + register int iword, dword, ndim; + char *last, *fragment; + int *ifrom, *ito; + double *dfrom, *dto; + int *xlsub, *lsub, *xusub, *usub, *xlusup; + double *ucol, *lusup; + + iword = sizeof(int); + dword = sizeof(double); + ndim = Glu->n; + + xlsub = Glu->xlsub; + lsub = Glu->lsub; + xusub = Glu->xusub; + usub = Glu->usub; + xlusup = Glu->xlusup; + ucol = Glu->ucol; + lusup = Glu->lusup; + + dfrom = ucol; + dto = (double *)((char*)lusup + xlusup[ndim] * dword); + copy_mem_double(xusub[ndim], dfrom, dto); + ucol = dto; + + ifrom = lsub; + ito = (int *) ((char*)ucol + xusub[ndim] * iword); + copy_mem_int(xlsub[ndim], ifrom, ito); + lsub = ito; + + ifrom = usub; + ito = (int *) ((char*)lsub + xlsub[ndim] * iword); + copy_mem_int(xusub[ndim], ifrom, ito); + usub = ito; + + last = (char*)usub + xusub[ndim] * iword; + fragment = (char*) (((char*)Glu->stack.array + Glu->stack.top1) - last); + Glu->stack.used -= (long int) fragment; + Glu->stack.top1 -= (long int) fragment; + + Glu->ucol = ucol; + Glu->lsub = lsub; + Glu->usub = usub; + +#ifdef DEBUG + printf("dStackCompress: fragment %d\n", fragment); + /* for (last = 0; last < ndim; ++last) + print_lu_col("After compress:", last, 0);*/ +#endif + +} + +/*! \brief Allocate storage for original matrix A + */ +void +dallocateA(int n, int nnz, double **a, int **asub, int **xa) +{ + *a = (double *) doubleMalloc(nnz); + *asub = (int *) intMalloc(nnz); + *xa = (int *) intMalloc(n+1); +} + + +double *doubleMalloc(int n) +{ + double *buf; + buf = (double *) SUPERLU_MALLOC((size_t)n * sizeof(double)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC failed for buf in doubleMalloc()\n"); + } + return (buf); +} + +double *doubleCalloc(int n) +{ + double *buf; + register int i; + double zero = 0.0; + buf = (double *) SUPERLU_MALLOC((size_t)n * sizeof(double)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC failed for buf in doubleCalloc()\n"); + } + for (i = 0; i < n; ++i) buf[i] = zero; + return (buf); +} + + +int dmemory_usage(const int nzlmax, const int nzumax, + const int nzlumax, const int n) +{ + register int iword, dword; + + iword = sizeof(int); + dword = sizeof(double); + + return (10 * n * iword + + nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword); + +} diff --git a/src/maths/SuperLU/dmyblas2.c b/src/maths/SuperLU/dmyblas2.c new file mode 100644 index 000000000..2a58f5bc3 --- /dev/null +++ b/src/maths/SuperLU/dmyblas2.c @@ -0,0 +1,230 @@ + +/*! @file dmyblas2.c + * \brief Level 2 Blas operations + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + *+ * Purpose: + * Level 2 BLAS operations: solves and matvec, written in C. + * Note: + * This is only used when the system lacks an efficient BLAS library. + * + */ +/* + * File name: dmyblas2.c + */ + +/*! \brief Solves a dense UNIT lower triangular system + * + * The unit lower + * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). + * The solution will be returned in the rhs vector. + */ +void dlsolve ( int ldm, int ncol, double *M, double *rhs ) +{ + int k; + double x0, x1, x2, x3, x4, x5, x6, x7; + double *M0; + register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + + M0 = &M[0]; + + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + Mki4 = Mki3 + ldm + 1; + Mki5 = Mki4 + ldm + 1; + Mki6 = Mki5 + ldm + 1; + Mki7 = Mki6 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++; + x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++; + x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; + x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + rhs[++firstcol] = x4; + rhs[++firstcol] = x5; + rhs[++firstcol] = x6; + rhs[++firstcol] = x7; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++ + - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++ - x7 * *Mki7++; + + M0 += 8 * ldm + 8; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++; + + M0 += 4 * ldm + 4; + } + + if ( firstcol < ncol - 1 ) { /* Do 2 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + + rhs[++firstcol] = x1; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; + + } + +} + +/*! \brief Solves a dense upper triangular system + * + * The upper triangular matrix is + * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned + * in the rhs vector. + */ +void +dusolve ( ldm, ncol, M, rhs ) +int ldm; /* in */ +int ncol; /* in */ +double *M; /* in */ +double *rhs; /* modified */ +{ + double xj; + int jcol, j, irow; + + jcol = ncol - 1; + + for (j = 0; j < ncol; j++) { + + xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */ + rhs[jcol] = xj; + + for (irow = 0; irow < jcol; irow++) + rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */ + + jcol--; + + } +} + + +/*! \brief Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. + * + * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. + */ +void dmatvec ( ldm, nrow, ncol, M, vec, Mxvec ) + +int ldm; /* in -- leading dimension of M */ +int nrow; /* in */ +int ncol; /* in */ +double *M; /* in */ +double *vec; /* in */ +double *Mxvec; /* in/out */ + +{ + double vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; + double *M0; + register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + int k; + + M0 = &M[0]; + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + Mki4 = Mki3 + ldm; + Mki5 = Mki4 + ldm; + Mki6 = Mki5 + ldm; + Mki7 = Mki6 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + vi4 = vec[firstcol++]; + vi5 = vec[firstcol++]; + vi6 = vec[firstcol++]; + vi7 = vec[firstcol++]; + + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ + + vi4 * *Mki4++ + vi5 * *Mki5++ + + vi6 * *Mki6++ + vi7 * *Mki7++; + + M0 += 8 * ldm; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ ; + + M0 += 4 * ldm; + } + + while ( firstcol < ncol ) { /* Do 1 column */ + + Mki0 = M0; + vi0 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++; + + M0 += ldm; + } + +} + diff --git a/src/maths/SuperLU/dpanel_bmod.c b/src/maths/SuperLU/dpanel_bmod.c new file mode 100644 index 000000000..b4255db6d --- /dev/null +++ b/src/maths/SuperLU/dpanel_bmod.c @@ -0,0 +1,459 @@ + +/*! @file dpanel_bmod.c + * \brief Performs numeric block updates + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ +/* + +*/ + +#include
+ * Purpose + * ======= + * + * Performs numeric block updates (sup-panel) in topological order. + * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. + * Special processing on the supernodal portion of L\U[*,j] + * + * Before entering this routine, the original nonzeros in the panel + * were already copied into the spa[m,w]. + * + * Updated/Output parameters- + * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned + * collectively in the m-by-w vector dense[*]. + *+ */ + +void +dpanel_bmod ( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + const int nseg, /* in */ + double *dense, /* out, of size n by w */ + double *tempv, /* working array */ + int *segrep, /* in */ + int *repfnz, /* in, of size n by w */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ + + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + double alpha, beta; +#endif + + register int k, ksub; + int fsupc, nsupc, nsupr, nrow; + int krep, krep_ind; + double ukj, ukj1, ukj2; + int luptr, luptr1, luptr2; + int segsze; + int block_nrow; /* no of rows in a block row */ + register int lptr; /* Points to the row subscripts of a supernode */ + int kfnz, irow, no_zeros; + register int isub, isub1, i; + register int jj; /* Index through each column in the panel */ + int *xsup, *supno; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + int *repfnz_col; /* repfnz[] for a column in the panel */ + double *dense_col; /* dense[] for a column in the panel */ + double *tempv1; /* Used in 1-D update */ + double *TriTmp, *MatvecTmp; /* used in 2-D update */ + double zero = 0.0; + double one = 1.0; + register int ldaTmp; + register int r_ind, r_hi; + static int first = 1, maxsuper, rowblk, colblk; + flops_t *ops = stat->ops; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + if ( first ) { + maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ); + rowblk = sp_ienv(4); + colblk = sp_ienv(5); + first = 0; + } + ldaTmp = maxsuper + rowblk; + + /* + * For each nonz supernode segment of U[*,j] in topological order + */ + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ + + /* krep = representative of current k-th supernode + * fsupc = first supernodal column + * nsupc = no of columns in a supernode + * nsupr = no of rows in a supernode + */ + krep = segrep[k--]; + fsupc = xsup[supno[krep]]; + nsupc = krep - fsupc + 1; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; + nrow = nsupr - nsupc; + lptr = xlsub[fsupc]; + krep_ind = lptr + nsupc - 1; + + repfnz_col = repfnz; + dense_col = dense; + + if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ + + TriTmp = tempv; + + /* Sequence through each column in panel -- triangular solves */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + luptr = xlusup[fsupc]; + + ops[TRSV] += segsze * (segsze - 1); + ops[GEMV] += 2 * nrow * segsze; + + /* Case 1: Update U-segment of size 1 -- col-col update */ + if ( segsze == 1 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { + irow = lsub[i]; + dense_col[irow] -= ukj * lusup[luptr]; + ++luptr; + } + + } else if ( segsze <= 3 ) { + ukj = dense_col[lsub[krep_ind]]; + ukj1 = dense_col[lsub[krep_ind - 1]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { + ukj -= ukj1 * lusup[luptr1]; + dense_col[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; luptr1++; + dense_col[irow] -= (ukj*lusup[luptr] + + ukj1*lusup[luptr1]); + } + } else { + ukj2 = dense_col[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + ukj1 -= ukj2 * lusup[luptr2-1]; + ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; + dense_col[lsub[krep_ind]] = ukj; + dense_col[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; luptr1++; luptr2++; + dense_col[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); + } + } + + } else { /* segsze >= 4 */ + + /* Copy U[*,j] segment from dense[*] to TriTmp[*], which + holds the result of triangular solves. */ + no_zeros = kfnz - fsupc; + isub = lptr + no_zeros; + for (i = 0; i < segsze; ++i) { + irow = lsub[isub]; + TriTmp[i] = dense_col[irow]; /* Gather */ + ++isub; + } + + /* start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, TriTmp, &incx ); +#else + dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, TriTmp, &incx ); +#endif +#else + dlsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); +#endif + + + } /* else ... */ + + } /* for jj ... end tri-solves */ + + /* Block row updates; push all the way into dense[*] block */ + for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { + + r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); + block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); + luptr = xlusup[fsupc] + nsupc + r_ind; + isub1 = lptr + nsupc + r_ind; + + repfnz_col = repfnz; + TriTmp = tempv; + dense_col = dense; + + /* Sequence through each column in panel -- matrix-vector */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + if ( segsze <= 3 ) continue; /* skip unrolled cases */ + + /* Perform a block update, and scatter the result of + matrix-vector to dense[]. */ + no_zeros = kfnz - fsupc; + luptr1 = luptr + nsupr * no_zeros; + MatvecTmp = &TriTmp[maxsuper]; + +#ifdef USE_VENDOR_BLAS + alpha = one; + beta = zero; +#ifdef _CRAY + SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], + &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); +#else + dgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], + &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); +#endif +#else + dmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], + TriTmp, MatvecTmp); +#endif + + /* Scatter MatvecTmp[*] into SPA dense[*] temporarily + * such that MatvecTmp[*] can be re-used for the + * the next blok row update. dense[] will be copied into + * global store after the whole panel has been finished. + */ + isub = isub1; + for (i = 0; i < block_nrow; i++) { + irow = lsub[isub]; + dense_col[irow] -= MatvecTmp[i]; + MatvecTmp[i] = zero; + ++isub; + } + + } /* for jj ... */ + + } /* for each block row ... */ + + /* Scatter the triangular solves into SPA dense[*] */ + repfnz_col = repfnz; + TriTmp = tempv; + dense_col = dense; + + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + if ( segsze <= 3 ) continue; /* skip unrolled cases */ + + no_zeros = kfnz - fsupc; + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense_col[irow] = TriTmp[i]; + TriTmp[i] = zero; + ++isub; + } + + } /* for jj ... */ + + } else { /* 1-D block modification */ + + + /* Sequence through each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + luptr = xlusup[fsupc]; + + ops[TRSV] += segsze * (segsze - 1); + ops[GEMV] += 2 * nrow * segsze; + + /* Case 1: Update U-segment of size 1 -- col-col update */ + if ( segsze == 1 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { + irow = lsub[i]; + dense_col[irow] -= ukj * lusup[luptr]; + ++luptr; + } + + } else if ( segsze <= 3 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + ukj1 = dense_col[lsub[krep_ind - 1]]; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { + ukj -= ukj1 * lusup[luptr1]; + dense_col[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + ++luptr; ++luptr1; + dense_col[irow] -= (ukj*lusup[luptr] + + ukj1*lusup[luptr1]); + } + } else { + ukj2 = dense_col[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + ukj1 -= ukj2 * lusup[luptr2-1]; + ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; + dense_col[lsub[krep_ind]] = ukj; + dense_col[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + ++luptr; ++luptr1; ++luptr2; + dense_col[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); + } + } + + } else { /* segsze >= 4 */ + /* + * Perform a triangular solve and block update, + * then scatter the result of sup-col update to dense[]. + */ + no_zeros = kfnz - fsupc; + + /* Copy U[*,j] segment from dense[*] to tempv[*]: + * The result of triangular solve is in tempv[*]; + * The result of matrix vector update is in dense_col[*] + */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; ++i) { + irow = lsub[isub]; + tempv[i] = dense_col[irow]; /* Gather */ + ++isub; + } + + /* start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#else + dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#endif + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + alpha = one; + beta = zero; +#ifdef _CRAY + SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#else + dgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#endif +#else + dlsolve ( nsupr, segsze, &lusup[luptr], tempv ); + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + dmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); +#endif + + /* Scatter tempv[*] into SPA dense[*] temporarily, such + * that tempv[*] can be used for the triangular solve of + * the next column of the panel. They will be copied into + * ucol[*] after the whole panel has been finished. + */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense_col[irow] = tempv[i]; + tempv[i] = zero; + isub++; + } + + /* Scatter the update from tempv1[*] into SPA dense[*] */ + /* Start dense rectangular L */ + for (i = 0; i < nrow; i++) { + irow = lsub[isub]; + dense_col[irow] -= tempv1[i]; + tempv1[i] = zero; + ++isub; + } + + } /* else segsze>=4 ... */ + + } /* for each column in the panel... */ + + } /* else 1-D update ... */ + + } /* for each updating supernode ... */ + +} + + + diff --git a/src/maths/SuperLU/dpanel_dfs.c b/src/maths/SuperLU/dpanel_dfs.c new file mode 100644 index 000000000..6f8bf414c --- /dev/null +++ b/src/maths/SuperLU/dpanel_dfs.c @@ -0,0 +1,254 @@ + +/*! @file dpanel_dfs.c + * \brief Peforms a symbolic factorization on a panel of symbols + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * Purpose + * ======= + * + * Performs a symbolic factorization on a panel of columns [jcol, jcol+w). + * + * A supernode representative is the last column of a supernode. + * The nonzeros in U[*,j] are segments that end at supernodal + * representatives. + * + * The routine returns one list of the supernodal representatives + * in topological order of the dfs that generates them. This list is + * a superset of the topological order of each individual column within + * the panel. + * The location of the first nonzero in each supernodal segment + * (supernodal entry location) is also returned. Each column has a + * separate list for this purpose. + * + * Two marker arrays are used for dfs: + * marker[i] == jj, if i was visited during dfs of current column jj; + * marker1[i] >= jcol, if i was visited by earlier columns in this panel; + * + * marker: A-row --> A-row/col (0/1) + * repfnz: SuperA-col --> PA-row + * parent: SuperA-col --> SuperA-col + * xplore: SuperA-col --> index to L-structure + *+ */ + +void +dpanel_dfs ( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + SuperMatrix *A, /* in - original matrix */ + int *perm_r, /* in */ + int *nseg, /* out */ + double *dense, /* out */ + int *panel_lsub, /* out */ + int *segrep, /* out */ + int *repfnz, /* out */ + int *xprune, /* out */ + int *marker, /* out */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + NCPformat *Astore; + double *a; + int *asub; + int *xa_begin, *xa_end; + int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; + int k, krow, kmark, kperm; + int xdfs, maxdfs, kpar; + int jj; /* index through each column in the panel */ + int *marker1; /* marker1[jj] >= jcol if vertex jj was visited + by a previous column within this panel. */ + int *repfnz_col; /* start of each column in the panel */ + double *dense_col; /* start of each column in the panel */ + int nextl_col; /* next available position in panel_lsub[*,jj] */ + int *xsup, *supno; + int *lsub, *xlsub; + + /* Initialize pointers */ + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + marker1 = marker + m; + repfnz_col = repfnz; + dense_col = dense; + *nseg = 0; + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + + /* For each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++) { + nextl_col = (jj - jcol) * m; + +#ifdef CHK_DFS + printf("\npanel col %d: ", jj); +#endif + + /* For each nonz in A[*,jj] do dfs */ + for (k = xa_begin[jj]; k < xa_end[jj]; k++) { + krow = asub[k]; + dense_col[krow] = a[k]; + kmark = marker[krow]; + if ( kmark == jj ) + continue; /* krow visited before, go to the next nonzero */ + + /* For each unmarked nbr krow of jj + * krow is in L: place it in structure of L[*,jj] + */ + marker[krow] = jj; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ + } + /* + * krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + else { + + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz_col[krep]; + +#ifdef CHK_DFS + printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); +#endif + if ( myfnz != EMPTY ) { /* Representative visited before */ + if ( myfnz > kperm ) repfnz_col[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz_col[krep] = kperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker[kchild]; + + if ( chmark != jj ) { /* Not reached yet */ + marker[kchild] = jj; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,j] */ + if ( chperm == EMPTY ) { + panel_lsub[nextl_col++] = kchild; + } + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + else { + + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz_col[chrep]; +#ifdef CHK_DFS + printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); +#endif + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz_col[chrep] = chperm; + } + else { + /* Cont. dfs at snode-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L) */ + parent[krep] = oldrep; + repfnz_col[krep] = chperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } /* else */ + + } /* else */ + + } /* if... */ + + } /* while xdfs < maxdfs */ + + /* krow has no more unexplored nbrs: + * Place snode-rep krep in postorder DFS, if this + * segment is seen for the first time. (Note that + * "repfnz[krep]" may change later.) + * Backtrack dfs to its parent. + */ + if ( marker1[krep] < jcol ) { + segrep[*nseg] = krep; + ++(*nseg); + marker1[krep] = jj; + } + + kpar = parent[krep]; /* Pop stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xprune[krep]; + +#ifdef CHK_DFS + printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } while ( kpar != EMPTY ); /* do-while - until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonz in A[*,jj] */ + + repfnz_col += m; /* Move to next column */ + dense_col += m; + + } /* for jj ... */ + +} diff --git a/src/maths/SuperLU/dpivotL.c b/src/maths/SuperLU/dpivotL.c new file mode 100644 index 000000000..4844bbe86 --- /dev/null +++ b/src/maths/SuperLU/dpivotL.c @@ -0,0 +1,184 @@ + +/*! @file dpivotL.c + * \brief Performs numerical pivoting + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * Purpose + * ======= + * Performs the numerical pivoting on the current column of L, + * and the CDIV operation. + * + * Pivot policy: + * (1) Compute thresh = u * max_(i>=j) abs(A_ij); + * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN + * pivot row = k; + * ELSE IF abs(A_jj) >= thresh THEN + * pivot row = j; + * ELSE + * pivot row = m; + * + * Note: If you absolutely want to use a given pivot order, then set u=0.0. + * + * Return value: 0 success; + * i > 0 U(i,i) is exactly zero. + *+ */ + +int +dpivotL( + const int jcol, /* in */ + const double u, /* in - diagonal pivoting threshold */ + int *usepr, /* re-use the pivot sequence given by perm_r/iperm_r */ + int *perm_r, /* may be modified */ + int *iperm_r, /* in - inverse of perm_r */ + int *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */ + int *pivrow, /* out */ + GlobalLU_t *Glu, /* modified - global LU data structures */ + SuperLUStat_t *stat /* output */ + ) +{ + + int fsupc; /* first column in the supernode */ + int nsupc; /* no of columns in the supernode */ + int nsupr; /* no of rows in the supernode */ + int lptr; /* points to the starting subscript of the supernode */ + int pivptr, old_pivptr, diag, diagind; + double pivmax, rtemp, thresh; + double temp; + double *lu_sup_ptr; + double *lu_col_ptr; + int *lsub_ptr; + int isub, icol, k, itemp; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + flops_t *ops = stat->ops; + + /* Initialize pointers */ + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; + nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ + lptr = xlsub[fsupc]; + nsupr = xlsub[fsupc+1] - lptr; + lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ + lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ + lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ + +#ifdef DEBUG +if ( jcol == MIN_COL ) { + printf("Before cdiv: col %d\n", jcol); + for (k = nsupc; k < nsupr; k++) + printf(" lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]); +} +#endif + + /* Determine the largest abs numerical value for partial pivoting; + Also search for user-specified pivot, and diagonal element. */ + if ( *usepr ) *pivrow = iperm_r[jcol]; + diagind = iperm_c[jcol]; + pivmax = 0.0; + pivptr = nsupc; + diag = EMPTY; + old_pivptr = nsupc; + for (isub = nsupc; isub < nsupr; ++isub) { + rtemp = fabs (lu_col_ptr[isub]); + if ( rtemp > pivmax ) { + pivmax = rtemp; + pivptr = isub; + } + if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub; + if ( lsub_ptr[isub] == diagind ) diag = isub; + } + + /* Test for singularity */ + if ( pivmax == 0.0 ) { +#if 1 + *pivrow = lsub_ptr[pivptr]; + perm_r[*pivrow] = jcol; +#else + perm_r[diagind] = jcol; +#endif + *usepr = 0; + return (jcol+1); + } + + thresh = u * pivmax; + + /* Choose appropriate pivotal element by our policy. */ + if ( *usepr ) { + rtemp = fabs (lu_col_ptr[old_pivptr]); + if ( rtemp != 0.0 && rtemp >= thresh ) + pivptr = old_pivptr; + else + *usepr = 0; + } + if ( *usepr == 0 ) { + /* Use diagonal pivot? */ + if ( diag >= 0 ) { /* diagonal exists */ + rtemp = fabs (lu_col_ptr[diag]); + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; + } + *pivrow = lsub_ptr[pivptr]; + } + + /* Record pivot row */ + perm_r[*pivrow] = jcol; + + /* Interchange row subscripts */ + if ( pivptr != nsupc ) { + itemp = lsub_ptr[pivptr]; + lsub_ptr[pivptr] = lsub_ptr[nsupc]; + lsub_ptr[nsupc] = itemp; + + /* Interchange numerical values as well, for the whole snode, such + * that L is indexed the same way as A. + */ + for (icol = 0; icol <= nsupc; icol++) { + itemp = pivptr + icol * nsupr; + temp = lu_sup_ptr[itemp]; + lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; + lu_sup_ptr[nsupc + icol*nsupr] = temp; + } + } /* if */ + + /* cdiv operation */ + ops[FACT] += nsupr - nsupc; + + temp = 1.0 / lu_col_ptr[nsupc]; + for (k = nsupc+1; k < nsupr; k++) + lu_col_ptr[k] *= temp; + + return 0; +} + diff --git a/src/maths/SuperLU/dpivotgrowth.c b/src/maths/SuperLU/dpivotgrowth.c new file mode 100644 index 000000000..a75061631 --- /dev/null +++ b/src/maths/SuperLU/dpivotgrowth.c @@ -0,0 +1,113 @@ + +/*! @file dpivotgrowth.c + * \brief Computes the reciprocal pivot growth factor + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + *+ */ +#include
+ * Purpose + * ======= + * + * Compute the reciprocal pivot growth factor of the leading ncols columns + * of the matrix, using the formula: + * min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ) + * + * Arguments + * ========= + * + * ncols (input) int + * The number of columns of matrices A, L and U. + * + * A (input) SuperMatrix* + * Original matrix A, permuted by columns, of dimension + * (A->nrow, A->ncol). The type of A can be: + * Stype = NC; Dtype = SLU_D; Mtype = GE. + * + * L (output) SuperMatrix* + * The factor L from the factorization Pr*A=L*U; use compressed row + * subscripts storage for supernodes, i.e., L has type: + * Stype = SC; Dtype = SLU_D; Mtype = TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise + * storage scheme, i.e., U has types: Stype = NC; + * Dtype = SLU_D; Mtype = TRU. + *+ */ + +double +dPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, + SuperMatrix *L, SuperMatrix *U) +{ + + NCformat *Astore; + SCformat *Lstore; + NCformat *Ustore; + double *Aval, *Lval, *Uval; + int fsupc, nsupr, luptr, nz_in_U; + int i, j, k, oldcol; + int *inv_perm_c; + double rpg, maxaj, maxuj; + double smlnum; + double *luval; + + /* Get machine constants. */ + smlnum = dlamch_("S"); + rpg = 1. / smlnum; + + Astore = A->Store; + Lstore = L->Store; + Ustore = U->Store; + Aval = Astore->nzval; + Lval = Lstore->nzval; + Uval = Ustore->nzval; + + inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int)); + for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j; + + for (k = 0; k <= Lstore->nsuper; ++k) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + luptr = L_NZ_START(fsupc); + luval = &Lval[luptr]; + nz_in_U = 1; + + for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) { + maxaj = 0.; + oldcol = inv_perm_c[j]; + for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i) + maxaj = SUPERLU_MAX( maxaj, fabs(Aval[i]) ); + + maxuj = 0.; + for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++) + maxuj = SUPERLU_MAX( maxuj, fabs(Uval[i]) ); + + /* Supernode */ + for (i = 0; i < nz_in_U; ++i) + maxuj = SUPERLU_MAX( maxuj, fabs(luval[i]) ); + + ++nz_in_U; + luval += nsupr; + + if ( maxuj == 0. ) + rpg = SUPERLU_MIN( rpg, 1.); + else + rpg = SUPERLU_MIN( rpg, maxaj / maxuj ); + } + + if ( j >= ncols ) break; + } + + SUPERLU_FREE(inv_perm_c); + return (rpg); +} diff --git a/src/maths/SuperLU/dpruneL.c b/src/maths/SuperLU/dpruneL.c new file mode 100644 index 000000000..024dc67db --- /dev/null +++ b/src/maths/SuperLU/dpruneL.c @@ -0,0 +1,154 @@ + +/*! @file dpruneL.c + * \brief Prunes the L-structure + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * Purpose + * ======= + * Prunes the L-structure of supernodes whose L-structure + * contains the current pivot row "pivrow" + *+ */ + +void +dpruneL( + const int jcol, /* in */ + const int *perm_r, /* in */ + const int pivrow, /* in */ + const int nseg, /* in */ + const int *segrep, /* in */ + const int *repfnz, /* in */ + int *xprune, /* out */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + + double utemp; + int jsupno, irep, irep1, kmin, kmax, krow, movnum; + int i, ktemp, minloc, maxloc; + int do_prune; /* logical variable */ + int *xsup, *supno; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + /* + * For each supernode-rep irep in U[*,j] + */ + jsupno = supno[jcol]; + for (i = 0; i < nseg; i++) { + + irep = segrep[i]; + irep1 = irep + 1; + do_prune = FALSE; + + /* Don't prune with a zero U-segment */ + if ( repfnz[irep] == EMPTY ) + continue; + + /* If a snode overlaps with the next panel, then the U-segment + * is fragmented into two parts -- irep and irep1. We should let + * pruning occur at the rep-column in irep1's snode. + */ + if ( supno[irep] == supno[irep1] ) /* Don't prune */ + continue; + + /* + * If it has not been pruned & it has a nonz in row L[pivrow,i] + */ + if ( supno[irep] != jsupno ) { + if ( xprune[irep] >= xlsub[irep1] ) { + kmin = xlsub[irep]; + kmax = xlsub[irep1] - 1; + for (krow = kmin; krow <= kmax; krow++) + if ( lsub[krow] == pivrow ) { + do_prune = TRUE; + break; + } + } + + if ( do_prune ) { + + /* Do a quicksort-type partition + * movnum=TRUE means that the num values have to be exchanged. + */ + movnum = FALSE; + if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */ + movnum = TRUE; + + while ( kmin <= kmax ) { + + if ( perm_r[lsub[kmax]] == EMPTY ) + kmax--; + else if ( perm_r[lsub[kmin]] != EMPTY ) + kmin++; + else { /* kmin below pivrow (not yet pivoted), and kmax + * above pivrow: interchange the two subscripts + */ + ktemp = lsub[kmin]; + lsub[kmin] = lsub[kmax]; + lsub[kmax] = ktemp; + + /* If the supernode has only one column, then we + * only keep one set of subscripts. For any subscript + * interchange performed, similar interchange must be + * done on the numerical values. + */ + if ( movnum ) { + minloc = xlusup[irep] + (kmin - xlsub[irep]); + maxloc = xlusup[irep] + (kmax - xlsub[irep]); + utemp = lusup[minloc]; + lusup[minloc] = lusup[maxloc]; + lusup[maxloc] = utemp; + } + + kmin++; + kmax--; + + } + + } /* while */ + + xprune[irep] = kmin; /* Pruning */ + +#ifdef CHK_PRUNE + printf(" After dpruneL(),using col %d: xprune[%d] = %d\n", + jcol, irep, kmin); +#endif + } /* if do_prune */ + + } /* if */ + + } /* for each U-segment... */ +} diff --git a/src/maths/SuperLU/dreadhb.c b/src/maths/SuperLU/dreadhb.c new file mode 100644 index 000000000..852e0b7af --- /dev/null +++ b/src/maths/SuperLU/dreadhb.c @@ -0,0 +1,257 @@ + +/*! @file dreadhb.c + * \brief Read a matrix stored in Harwell-Boeing format + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Purpose + * ======= + * + * Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format + * as described below. + * + * Line 1 (A72,A8) + * Col. 1 - 72 Title (TITLE) + * Col. 73 - 80 Key (KEY) + * + * Line 2 (5I14) + * Col. 1 - 14 Total number of lines excluding header (TOTCRD) + * Col. 15 - 28 Number of lines for pointers (PTRCRD) + * Col. 29 - 42 Number of lines for row (or variable) indices (INDCRD) + * Col. 43 - 56 Number of lines for numerical values (VALCRD) + * Col. 57 - 70 Number of lines for right-hand sides (RHSCRD) + * (including starting guesses and solution vectors + * if present) + * (zero indicates no right-hand side data is present) + * + * Line 3 (A3, 11X, 4I14) + * Col. 1 - 3 Matrix type (see below) (MXTYPE) + * Col. 15 - 28 Number of rows (or variables) (NROW) + * Col. 29 - 42 Number of columns (or elements) (NCOL) + * Col. 43 - 56 Number of row (or variable) indices (NNZERO) + * (equal to number of entries for assembled matrices) + * Col. 57 - 70 Number of elemental matrix entries (NELTVL) + * (zero in the case of assembled matrices) + * Line 4 (2A16, 2A20) + * Col. 1 - 16 Format for pointers (PTRFMT) + * Col. 17 - 32 Format for row (or variable) indices (INDFMT) + * Col. 33 - 52 Format for numerical values of coefficient matrix (VALFMT) + * Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) + * + * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present + * Col. 1 Right-hand side type: + * F for full storage or M for same format as matrix + * Col. 2 G if a starting vector(s) (Guess) is supplied. (RHSTYP) + * Col. 3 X if an exact solution vector(s) is supplied. + * Col. 15 - 28 Number of right-hand sides (NRHS) + * Col. 29 - 42 Number of row indices (NRHSIX) + * (ignored in case of unassembled matrices) + * + * The three character type field on line 3 describes the matrix type. + * The following table lists the permitted values for each of the three + * characters. As an example of the type field, RSA denotes that the matrix + * is real, symmetric, and assembled. + * + * First Character: + * R Real matrix + * C Complex matrix + * P Pattern only (no numerical values supplied) + * + * Second Character: + * S Symmetric + * U Unsymmetric + * H Hermitian + * Z Skew symmetric + * R Rectangular + * + * Third Character: + * A Assembled + * E Elemental matrices (unassembled) + * + *+ */ +#include
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * Purpose + * ======= + * dsnode_dfs() - Determine the union of the row structures of those + * columns within the relaxed snode. + * Note: The relaxed snodes are leaves of the supernodal etree, therefore, + * the portion outside the rectangular supernode must be zero. + * + * Return value + * ============ + * 0 success; + * >0 number of bytes allocated when run out of memory. + *+ */ + +int +dsnode_dfs ( + const int jcol, /* in - start of the supernode */ + const int kcol, /* in - end of the supernode */ + const int *asub, /* in */ + const int *xa_begin, /* in */ + const int *xa_end, /* in */ + int *xprune, /* out */ + int *marker, /* modified */ + GlobalLU_t *Glu /* modified */ + ) +{ + + register int i, k, ifrom, ito, nextl, new_next; + int nsuper, krow, kmark, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + int nzlmax; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + nsuper = ++supno[jcol]; /* Next available supernode number */ + nextl = xlsub[jcol]; + + for (i = jcol; i <= kcol; i++) { + /* For each nonzero in A[*,i] */ + for (k = xa_begin[i]; k < xa_end[i]; k++) { + krow = asub[k]; + kmark = marker[krow]; + if ( kmark != kcol ) { /* First time visit krow */ + marker[krow] = kcol; + lsub[nextl++] = krow; + if ( nextl >= nzlmax ) { + if ( mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + } + } + supno[i] = nsuper; + } + + /* Supernode > 1, then make a copy of the subscripts for pruning */ + if ( jcol < kcol ) { + new_next = nextl + (nextl - xlsub[jcol]); + while ( new_next > nzlmax ) { + if ( mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + ito = nextl; + for (ifrom = xlsub[jcol]; ifrom < nextl; ) + lsub[ito++] = lsub[ifrom++]; + for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; + nextl = ito; + } + + xsup[nsuper+1] = kcol + 1; + supno[kcol+1] = nsuper; + xprune[kcol] = nextl; + xlsub[kcol+1] = nextl; + + return 0; +} + diff --git a/src/maths/SuperLU/dsp_blas2.c b/src/maths/SuperLU/dsp_blas2.c new file mode 100644 index 000000000..49a625a7e --- /dev/null +++ b/src/maths/SuperLU/dsp_blas2.c @@ -0,0 +1,477 @@ + +/*! @file dsp_blas2.c + * \brief Sparse BLAS 2, using some dense BLAS 2 operations + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + *+ */ +/* + * File name: dsp_blas2.c + * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations. + */ + +#include
+ * Purpose + * ======= + * + * sp_dtrsv() solves one of the systems of equations + * A*x = b, or A'*x = b, + * where b and x are n element vectors and A is a sparse unit , or + * non-unit, upper or lower triangular matrix. + * No test for singularity or near-singularity is included in this + * routine. Such tests must be performed before calling this routine. + * + * Parameters + * ========== + * + * uplo - (input) char* + * On entry, uplo specifies whether the matrix is an upper or + * lower triangular matrix as follows: + * uplo = 'U' or 'u' A is an upper triangular matrix. + * uplo = 'L' or 'l' A is a lower triangular matrix. + * + * trans - (input) char* + * On entry, trans specifies the equations to be solved as + * follows: + * trans = 'N' or 'n' A*x = b. + * trans = 'T' or 't' A'*x = b. + * trans = 'C' or 'c' A'*x = b. + * + * diag - (input) char* + * On entry, diag specifies whether or not A is unit + * triangular as follows: + * diag = 'U' or 'u' A is assumed to be unit triangular. + * diag = 'N' or 'n' A is not assumed to be unit + * triangular. + * + * L - (input) SuperMatrix* + * The factor L from the factorization Pr*A*Pc=L*U. Use + * compressed row subscripts storage for supernodes, + * i.e., L has types: Stype = SC, Dtype = SLU_D, Mtype = TRLU. + * + * U - (input) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U. + * U has types: Stype = NC, Dtype = SLU_D, Mtype = TRU. + * + * x - (input/output) double* + * Before entry, the incremented array X must contain the n + * element right-hand side vector b. On exit, X is overwritten + * with the solution vector x. + * + * info - (output) int* + * If *info = -i, the i-th argument had an illegal value. + *+ */ +int +sp_dtrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, + SuperMatrix *U, double *x, SuperLUStat_t *stat, int *info) +{ +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + SCformat *Lstore; + NCformat *Ustore; + double *Lval, *Uval; + int incx = 1, incy = 1; + double alpha = 1.0, beta = 1.0; + int nrow; + int fsupc, nsupr, nsupc, luptr, istart, irow; + int i, k, iptr, jcol; + double *work; + flops_t solve_ops; + + /* Test the input parameters */ + *info = 0; + if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; + else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && + !lsame_(trans, "C")) *info = -2; + else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; + else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; + else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; + if ( *info ) { + i = -(*info); + xerbla_("sp_dtrsv", &i); + return 0; + } + + Lstore = L->Store; + Lval = Lstore->nzval; + Ustore = U->Store; + Uval = Ustore->nzval; + solve_ops = 0; + + if ( !(work = doubleCalloc(L->nrow)) ) + ABORT_SuperLU("Malloc fails for work in sp_dtrsv()."); + + if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ + + if ( lsame_(uplo, "L") ) { + /* Form x := inv(L)*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + nrow = nsupr - nsupc; + + solve_ops += nsupc * (nsupc - 1); + solve_ops += 2 * nrow * nsupc; + + if ( nsupc == 1 ) { + for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { + irow = L_SUB(iptr); + ++luptr; + x[irow] -= x[fsupc] * Lval[luptr]; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); + + SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], + &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); +#else + dtrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); + + dgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], + &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); +#endif +#else + dlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); + + dmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], + &x[fsupc], &work[0] ); +#endif + + iptr = istart + nsupc; + for (i = 0; i < nrow; ++i, ++iptr) { + irow = L_SUB(iptr); + x[irow] -= work[i]; /* Scatter */ + work[i] = 0.0; + + } + } + } /* for k ... */ + + } else { + /* Form x := inv(U)*x */ + + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; k--) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += nsupc * (nsupc + 1); + + if ( nsupc == 1 ) { + x[fsupc] /= Lval[luptr]; + for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { + irow = U_SUB(i); + x[irow] -= x[fsupc] * Uval[i]; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + dtrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif +#else + dusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); +#endif + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); + i++) { + irow = U_SUB(i); + x[irow] -= x[jcol] * Uval[i]; + } + } + } + } /* for k ... */ + + } + } else { /* Form x := inv(A')*x */ + + if ( lsame_(uplo, "L") ) { + /* Form x := inv(L')*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; --k) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += 2 * (nsupr - nsupc) * nsupc; + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + iptr = istart + nsupc; + for (i = L_NZ_START(jcol) + nsupc; + i < L_NZ_START(jcol+1); i++) { + irow = L_SUB(iptr); + x[jcol] -= x[irow] * Lval[i]; + iptr++; + } + } + + if ( nsupc > 1 ) { + solve_ops += nsupc * (nsupc - 1); +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("T", strlen("T")); + ftcs3 = _cptofcd("U", strlen("U")); + STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + dtrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } + } else { + /* Form x := inv(U')*x */ + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { + irow = U_SUB(i); + x[jcol] -= x[irow] * Uval[i]; + } + } + + solve_ops += nsupc * (nsupc + 1); + + if ( nsupc == 1 ) { + x[fsupc] /= Lval[luptr]; + } else { +#ifdef _CRAY + ftcs1 = _cptofcd("U", strlen("U")); + ftcs2 = _cptofcd("T", strlen("T")); + ftcs3 = _cptofcd("N", strlen("N")); + STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + dtrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } /* for k ... */ + } + } + + stat->ops[SOLVE] += solve_ops; + SUPERLU_FREE(work); + return 0; +} + + + +/*! \brief Performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, + * + *
+ * Purpose + * ======= + * + * sp_dgemv() performs one of the matrix-vector operations + * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, + * where alpha and beta are scalars, x and y are vectors and A is a + * sparse A->nrow by A->ncol matrix. + * + * Parameters + * ========== + * + * TRANS - (input) char* + * On entry, TRANS specifies the operation to be performed as + * follows: + * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. + * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. + * TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. + * + * ALPHA - (input) double + * On entry, ALPHA specifies the scalar alpha. + * + * A - (input) SuperMatrix* + * Matrix A with a sparse format, of dimension (A->nrow, A->ncol). + * Currently, the type of A can be: + * Stype = NC or NCP; Dtype = SLU_D; Mtype = GE. + * In the future, more general A can be handled. + * + * X - (input) double*, array of DIMENSION at least + * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' + * and at least + * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. + * Before entry, the incremented array X must contain the + * vector x. + * + * INCX - (input) int + * On entry, INCX specifies the increment for the elements of + * X. INCX must not be zero. + * + * BETA - (input) double + * On entry, BETA specifies the scalar beta. When BETA is + * supplied as zero then Y need not be set on input. + * + * Y - (output) double*, array of DIMENSION at least + * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' + * and at least + * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. + * Before entry with BETA non-zero, the incremented array Y + * must contain the vector y. On exit, Y is overwritten by the + * updated vector y. + * + * INCY - (input) int + * On entry, INCY specifies the increment for the elements of + * Y. INCY must not be zero. + * + * ==== Sparse Level 2 Blas routine. + *+ */ + +int +sp_dgemv(char *trans, double alpha, SuperMatrix *A, double *x, + int incx, double beta, double *y, int incy) +{ + /* Local variables */ + NCformat *Astore; + double *Aval; + int info; + double temp; + int lenx, leny, i, j, irow; + int iy, jx, jy, kx, ky; + int notran; + + notran = lsame_(trans, "N"); + Astore = A->Store; + Aval = Astore->nzval; + + /* Test the input parameters */ + info = 0; + if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; + else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; + else if (incx == 0) info = 5; + else if (incy == 0) info = 8; + if (info != 0) { + xerbla_("sp_dgemv ", &info); + return 0; + } + + /* Quick return if possible. */ + if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.)) + return 0; + + /* Set LENX and LENY, the lengths of the vectors x and y, and set + up the start points in X and Y. */ + if (lsame_(trans, "N")) { + lenx = A->ncol; + leny = A->nrow; + } else { + lenx = A->nrow; + leny = A->ncol; + } + if (incx > 0) kx = 0; + else kx = - (lenx - 1) * incx; + if (incy > 0) ky = 0; + else ky = - (leny - 1) * incy; + + /* Start the operations. In this version the elements of A are + accessed sequentially with one pass through A. */ + /* First form y := beta*y. */ + if (beta != 1.) { + if (incy == 1) { + if (beta == 0.) + for (i = 0; i < leny; ++i) y[i] = 0.; + else + for (i = 0; i < leny; ++i) y[i] = beta * y[i]; + } else { + iy = ky; + if (beta == 0.) + for (i = 0; i < leny; ++i) { + y[iy] = 0.; + iy += incy; + } + else + for (i = 0; i < leny; ++i) { + y[iy] = beta * y[iy]; + iy += incy; + } + } + } + + if (alpha == 0.) return 0; + + if ( notran ) { + /* Form y := alpha*A*x + y. */ + jx = kx; + if (incy == 1) { + for (j = 0; j < A->ncol; ++j) { + if (x[jx] != 0.) { + temp = alpha * x[jx]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + y[irow] += temp * Aval[i]; + } + } + jx += incx; + } + } else { + ABORT_SuperLU("Not implemented."); + } + } else { + /* Form y := alpha*A'*x + y. */ + jy = ky; + if (incx == 1) { + for (j = 0; j < A->ncol; ++j) { + temp = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + temp += Aval[i] * x[irow]; + } + y[jy] += alpha * temp; + jy += incy; + } + } else { + ABORT_SuperLU("Not implemented."); + } + } + return 0; +} /* sp_dgemv */ + + + diff --git a/src/maths/SuperLU/dsp_blas3.c b/src/maths/SuperLU/dsp_blas3.c new file mode 100644 index 000000000..56a94f901 --- /dev/null +++ b/src/maths/SuperLU/dsp_blas3.c @@ -0,0 +1,127 @@ + +/*! @file dsp_blas3.c + * \brief Sparse BLAS3, using some dense BLAS3 operations + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + *+ */ +/* + * File name: sp_blas3.c + * Purpose: Sparse BLAS3, using some dense BLAS3 operations. + */ + +#include
+ * Purpose + * ======= + * + * sp_d performs one of the matrix-matrix operations + * + * C := alpha*op( A )*op( B ) + beta*C, + * + * where op( X ) is one of + * + * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), + * + * alpha and beta are scalars, and A, B and C are matrices, with op( A ) + * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. + * + * + * Parameters + * ========== + * + * TRANSA - (input) char* + * On entry, TRANSA specifies the form of op( A ) to be used in + * the matrix multiplication as follows: + * TRANSA = 'N' or 'n', op( A ) = A. + * TRANSA = 'T' or 't', op( A ) = A'. + * TRANSA = 'C' or 'c', op( A ) = conjg( A' ). + * Unchanged on exit. + * + * TRANSB - (input) char* + * On entry, TRANSB specifies the form of op( B ) to be used in + * the matrix multiplication as follows: + * TRANSB = 'N' or 'n', op( B ) = B. + * TRANSB = 'T' or 't', op( B ) = B'. + * TRANSB = 'C' or 'c', op( B ) = conjg( B' ). + * Unchanged on exit. + * + * M - (input) int + * On entry, M specifies the number of rows of the matrix + * op( A ) and of the matrix C. M must be at least zero. + * Unchanged on exit. + * + * N - (input) int + * On entry, N specifies the number of columns of the matrix + * op( B ) and the number of columns of the matrix C. N must be + * at least zero. + * Unchanged on exit. + * + * K - (input) int + * On entry, K specifies the number of columns of the matrix + * op( A ) and the number of rows of the matrix op( B ). K must + * be at least zero. + * Unchanged on exit. + * + * ALPHA - (input) double + * On entry, ALPHA specifies the scalar alpha. + * + * A - (input) SuperMatrix* + * Matrix A with a sparse format, of dimension (A->nrow, A->ncol). + * Currently, the type of A can be: + * Stype = NC or NCP; Dtype = SLU_D; Mtype = GE. + * In the future, more general A can be handled. + * + * B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is + * n when TRANSB = 'N' or 'n', and is k otherwise. + * Before entry with TRANSB = 'N' or 'n', the leading k by n + * part of the array B must contain the matrix B, otherwise + * the leading n by k part of the array B must contain the + * matrix B. + * Unchanged on exit. + * + * LDB - (input) int + * On entry, LDB specifies the first dimension of B as declared + * in the calling (sub) program. LDB must be at least max( 1, n ). + * Unchanged on exit. + * + * BETA - (input) double + * On entry, BETA specifies the scalar beta. When BETA is + * supplied as zero then C need not be set on input. + * + * C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). + * Before entry, the leading m by n part of the array C must + * contain the matrix C, except when beta is zero, in which + * case C need not be set on entry. + * On exit, the array C is overwritten by the m by n matrix + * ( alpha*op( A )*B + beta*C ). + * + * LDC - (input) int + * On entry, LDC specifies the first dimension of C as declared + * in the calling (sub)program. LDC must be at least max(1,m). + * Unchanged on exit. + * + * ==== Sparse Level 3 Blas routine. + *+ */ + +int +sp_dgemm(char *transa, char *transb, int m, int n, int k, + double alpha, SuperMatrix *A, double *b, int ldb, + double beta, double *c, int ldc) +{ + int incx = 1, incy = 1; + int j; + + for (j = 0; j < n; ++j) { + sp_dgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); + } + return 0; +} diff --git a/src/maths/SuperLU/dutil.c b/src/maths/SuperLU/dutil.c new file mode 100644 index 000000000..ea850fff3 --- /dev/null +++ b/src/maths/SuperLU/dutil.c @@ -0,0 +1,471 @@ + +/*! @file dutil.c + * \brief Matrix utility functions + * + *
+ * -- SuperLU routine (version 3.1) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * August 1, 2008 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * -- LAPACK auxiliary routine (version 2.0) -- + * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., + * Courant Institute, Argonne National Lab, and Rice University + * October 31, 1992 + *+ */ + +#include
+ Purpose + ======= + + DZSUM1 takes the sum of the absolute values of a complex + vector and returns a double precision result. + + Based on DZASUM from the Level 1 BLAS. + The change is to use the 'genuine' absolute value. + + Contributed by Nick Higham for use with ZLACON. + + Arguments + ========= + + N (input) INT + The number of elements in the vector CX. + + CX (input) COMPLEX*16 array, dimension (N) + The vector whose elements will be summed. + + INCX (input) INT + The spacing between successive values of CX. INCX > 0. + + ===================================================================== ++*/ +double dzsum1_(int *n, doublecomplex *cx, int *incx) +{ + + /* Builtin functions */ + double z_abs(doublecomplex *); + + /* Local variables */ + int i, nincx; + double stemp; + + +#define CX(I) cx[(I)-1] + + stemp = 0.; + if (*n <= 0) { + return stemp; + } + if (*incx == 1) { + goto L20; + } + + /* CODE FOR INCREMENT NOT EQUAL TO 1 */ + + nincx = *n * *incx; + for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) { + + /* NEXT LINE MODIFIED. */ + + stemp += z_abs(&CX(i)); +/* L10: */ + } + + return stemp; + + /* CODE FOR INCREMENT EQUAL TO 1 */ + +L20: + for (i = 1; i <= *n; ++i) { + + /* NEXT LINE MODIFIED. */ + + stemp += z_abs(&CX(i)); +/* L30: */ + } + + return stemp; + + /* End of DZSUM1 */ + +} /* dzsum1_ */ + diff --git a/src/maths/SuperLU/get_perm_c.c b/src/maths/SuperLU/get_perm_c.c new file mode 100644 index 000000000..c0d5cbb6c --- /dev/null +++ b/src/maths/SuperLU/get_perm_c.c @@ -0,0 +1,457 @@ +/*! @file get_perm_c.c + * \brief Matrix permutation operations + * + *
+ * -- SuperLU routine (version 3.1) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * August 1, 2008 + *+ */ +#include
+ * Purpose
+ * =======
+ *
+ * Form the structure of A'*A. A is an m-by-n matrix in column oriented
+ * format represented by (colptr, rowind). The output A'*A is in column
+ * oriented format (symmetrically, also row oriented), represented by
+ * (ata_colptr, ata_rowind).
+ *
+ * This routine is modified from GETATA routine by Tim Davis.
+ * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2,
+ * i.e., the sum of the square of the row counts.
+ *
+ * Questions
+ * =========
+ * o Do I need to withhold the *dense* rows?
+ * o How do I know the number of nonzeros in A'*A?
+ *
+ */
+void
+getata(
+ const int m, /* number of rows in matrix A. */
+ const int n, /* number of columns in matrix A. */
+ const int nz, /* number of nonzeros in matrix A */
+ int *colptr, /* column pointer of size n+1 for matrix A. */
+ int *rowind, /* row indices of size nz for matrix A. */
+ int *atanz, /* out - on exit, returns the actual number of
+ nonzeros in matrix A'*A. */
+ int **ata_colptr, /* out - size n+1 */
+ int **ata_rowind /* out - size *atanz */
+ )
+{
+ register int i, j, k, col, num_nz, ti, trow;
+ int *marker, *b_colptr, *b_rowind;
+ int *t_colptr, *t_rowind; /* a column oriented form of T = A' */
+
+ if ( !(marker = (int*) SUPERLU_MALLOC((SUPERLU_MAX(m,n)+1)*sizeof(int))) )
+ ABORT_SuperLU("SUPERLU_MALLOC fails for marker[]");
+ if ( !(t_colptr = (int*) SUPERLU_MALLOC((m+1) * sizeof(int))) )
+ ABORT_SuperLU("SUPERLU_MALLOC t_colptr[]");
+ if ( !(t_rowind = (int*) SUPERLU_MALLOC(nz * sizeof(int))) )
+ ABORT_SuperLU("SUPERLU_MALLOC fails for t_rowind[]");
+
+
+ /* Get counts of each column of T, and set up column pointers */
+ for (i = 0; i < m; ++i) marker[i] = 0;
+ for (j = 0; j < n; ++j) {
+ for (i = colptr[j]; i < colptr[j+1]; ++i)
+ ++marker[rowind[i]];
+ }
+ t_colptr[0] = 0;
+ for (i = 0; i < m; ++i) {
+ t_colptr[i+1] = t_colptr[i] + marker[i];
+ marker[i] = t_colptr[i];
+ }
+
+ /* Transpose the matrix from A to T */
+ for (j = 0; j < n; ++j)
+ for (i = colptr[j]; i < colptr[j+1]; ++i) {
+ col = rowind[i];
+ t_rowind[marker[col]] = j;
+ ++marker[col];
+ }
+
+
+ /* ----------------------------------------------------------------
+ compute B = T * A, where column j of B is:
+
+ Struct (B_*j) = UNION ( Struct (T_*k) )
+ A_kj != 0
+
+ do not include the diagonal entry
+
+ ( Partition A as: A = (A_*1, ..., A_*n)
+ Then B = T * A = (T * A_*1, ..., T * A_*n), where
+ T * A_*j = (T_*1, ..., T_*m) * A_*j. )
+ ---------------------------------------------------------------- */
+
+ /* Zero the diagonal flag */
+ for (i = 0; i < n; ++i) marker[i] = -1;
+
+ /* First pass determines number of nonzeros in B */
+ num_nz = 0;
+ for (j = 0; j < n; ++j) {
+ /* Flag the diagonal so it's not included in the B matrix */
+ marker[j] = j;
+
+ for (i = colptr[j]; i < colptr[j+1]; ++i) {
+ /* A_kj is nonzero, add pattern of column T_*k to B_*j */
+ k = rowind[i];
+ for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
+ trow = t_rowind[ti];
+ if ( marker[trow] != j ) {
+ marker[trow] = j;
+ num_nz++;
+ }
+ }
+ }
+ }
+ *atanz = num_nz;
+
+ /* Allocate storage for A'*A */
+ if ( !(*ata_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) )
+ ABORT_SuperLU("SUPERLU_MALLOC fails for ata_colptr[]");
+ if ( *atanz ) {
+ if ( !(*ata_rowind = (int*) SUPERLU_MALLOC( *atanz * sizeof(int)) ) )
+ ABORT_SuperLU("SUPERLU_MALLOC fails for ata_rowind[]");
+ }
+ b_colptr = *ata_colptr; /* aliasing */
+ b_rowind = *ata_rowind;
+
+ /* Zero the diagonal flag */
+ for (i = 0; i < n; ++i) marker[i] = -1;
+
+ /* Compute each column of B, one at a time */
+ num_nz = 0;
+ for (j = 0; j < n; ++j) {
+ b_colptr[j] = num_nz;
+
+ /* Flag the diagonal so it's not included in the B matrix */
+ marker[j] = j;
+
+ for (i = colptr[j]; i < colptr[j+1]; ++i) {
+ /* A_kj is nonzero, add pattern of column T_*k to B_*j */
+ k = rowind[i];
+ for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
+ trow = t_rowind[ti];
+ if ( marker[trow] != j ) {
+ marker[trow] = j;
+ b_rowind[num_nz++] = trow;
+ }
+ }
+ }
+ }
+ b_colptr[n] = num_nz;
+
+ SUPERLU_FREE(marker);
+ SUPERLU_FREE(t_colptr);
+ SUPERLU_FREE(t_rowind);
+}
+
+
+/*! \brief
+ *
+ * + * Purpose + * ======= + * + * Form the structure of A'+A. A is an n-by-n matrix in column oriented + * format represented by (colptr, rowind). The output A'+A is in column + * oriented format (symmetrically, also row oriented), represented by + * (b_colptr, b_rowind). + *+ */ +void +at_plus_a( + const int n, /* number of columns in matrix A. */ + const int nz, /* number of nonzeros in matrix A */ + int *colptr, /* column pointer of size n+1 for matrix A. */ + int *rowind, /* row indices of size nz for matrix A. */ + int *bnz, /* out - on exit, returns the actual number of + nonzeros in matrix A'*A. */ + int **b_colptr, /* out - size n+1 */ + int **b_rowind /* out - size *bnz */ + ) +{ + register int i, j, k, col, num_nz; + int *t_colptr, *t_rowind; /* a column oriented form of T = A' */ + int *marker; + + if ( !(marker = (int*) SUPERLU_MALLOC( n * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for t_colptr[]"); + if ( !(t_rowind = (int*) SUPERLU_MALLOC( nz * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails t_rowind[]"); + + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < n; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) + ++marker[rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < n; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose the matrix from A to T */ + for (j = 0; j < n; ++j) + for (i = colptr[j]; i < colptr[j+1]; ++i) { + col = rowind[i]; + t_rowind[marker[col]] = j; + ++marker[col]; + } + + + /* ---------------------------------------------------------------- + compute B = A + T, where column j of B is: + + Struct (B_*j) = Struct (A_*k) UNION Struct (T_*k) + + do not include the diagonal entry + ---------------------------------------------------------------- */ + + /* Zero the diagonal flag */ + for (i = 0; i < n; ++i) marker[i] = -1; + + /* First pass determines number of nonzeros in B */ + num_nz = 0; + for (j = 0; j < n; ++j) { + /* Flag the diagonal so it's not included in the B matrix */ + marker[j] = j; + + /* Add pattern of column A_*k to B_*j */ + for (i = colptr[j]; i < colptr[j+1]; ++i) { + k = rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + ++num_nz; + } + } + + /* Add pattern of column T_*k to B_*j */ + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + k = t_rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + ++num_nz; + } + } + } + *bnz = num_nz; + + /* Allocate storage for A+A' */ + if ( !(*b_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for b_colptr[]"); + if ( *bnz) { + if ( !(*b_rowind = (int*) SUPERLU_MALLOC( *bnz * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for b_rowind[]"); + } + + /* Zero the diagonal flag */ + for (i = 0; i < n; ++i) marker[i] = -1; + + /* Compute each column of B, one at a time */ + num_nz = 0; + for (j = 0; j < n; ++j) { + (*b_colptr)[j] = num_nz; + + /* Flag the diagonal so it's not included in the B matrix */ + marker[j] = j; + + /* Add pattern of column A_*k to B_*j */ + for (i = colptr[j]; i < colptr[j+1]; ++i) { + k = rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + (*b_rowind)[num_nz++] = k; + } + } + + /* Add pattern of column T_*k to B_*j */ + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + k = t_rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + (*b_rowind)[num_nz++] = k; + } + } + } + (*b_colptr)[n] = num_nz; + + SUPERLU_FREE(marker); + SUPERLU_FREE(t_colptr); + SUPERLU_FREE(t_rowind); +} + +/*! \brief + * + *
+ * Purpose + * ======= + * + * GET_PERM_C obtains a permutation matrix Pc, by applying the multiple + * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A'. + * or using approximate minimum degree column ordering by Davis et. al. + * The LU factorization of A*Pc tends to have less fill than the LU + * factorization of A. + * + * Arguments + * ========= + * + * ispec (input) int + * Specifies the type of column ordering to reduce fill: + * = 1: minimum degree on the structure of A^T * A + * = 2: minimum degree on the structure of A^T + A + * = 3: approximate minimum degree for unsymmetric matrices + * If ispec == 0, the natural ordering (i.e., Pc = I) is returned. + * + * A (input) SuperMatrix* + * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number + * of the linear equations is A->nrow. Currently, the type of A + * can be: Stype = NC; Dtype = _D; Mtype = GE. In the future, + * more general A can be handled. + * + * perm_c (output) int* + * Column permutation vector of size A->ncol, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + *+ */ +void +get_perm_c(int ispec, SuperMatrix *A, int *perm_c) +{ + NCformat *Astore = A->Store; + int m, n, bnz = 0, *b_colptr, i; + int delta, maxint, nofsub, *invp; + int *b_rowind, *dhead, *qsize, *llist, *marker; + double t, SuperLU_timer_(); + + m = A->nrow; + n = A->ncol; + + t = SuperLU_timer_(); + switch ( ispec ) { + case (NATURAL): /* Natural ordering */ + for (i = 0; i < n; ++i) perm_c[i] = i; +#if ( PRNTlevel>=1 ) + printf("Use natural column ordering.\n"); +#endif + return; + case (MMD_ATA): /* Minimum degree ordering on A'*A */ + getata(m, n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); +#if ( PRNTlevel>=1 ) + printf("Use minimum degree ordering on A'*A.\n"); +#endif + t = SuperLU_timer_() - t; + /*printf("Form A'*A time = %8.3f\n", t);*/ + break; + case (MMD_AT_PLUS_A): /* Minimum degree ordering on A'+A */ + if ( m != n ) ABORT_SuperLU("Matrix is not square"); + at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); +#if ( PRNTlevel>=1 ) + printf("Use minimum degree ordering on A'+A.\n"); +#endif + t = SuperLU_timer_() - t; + /*printf("Form A'+A time = %8.3f\n", t);*/ + break; + case (COLAMD): /* Approximate minimum degree column ordering. */ + get_colamd(m, n, Astore->nnz, Astore->colptr, Astore->rowind, perm_c); +#if ( PRNTlevel>=1 ) + printf(".. Use approximate minimum degree column ordering.\n"); +#endif + return; + default: + ABORT_SuperLU("Invalid ISPEC"); + } + + if ( bnz != 0 ) { + t = SuperLU_timer_(); + + /* Initialize and allocate storage for GENMMD. */ + delta = 0; /* DELTA is a parameter to allow the choice of nodes + whose degree <= min-degree + DELTA. */ + maxint = 2147483647; /* 2**31 - 1 */ + invp = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); + if ( !invp ) ABORT_SuperLU("SUPERLU_MALLOC fails for invp."); + dhead = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); + if ( !dhead ) ABORT_SuperLU("SUPERLU_MALLOC fails for dhead."); + qsize = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); + if ( !qsize ) ABORT_SuperLU("SUPERLU_MALLOC fails for qsize."); + llist = (int *) SUPERLU_MALLOC(n*sizeof(int)); + if ( !llist ) ABORT_SuperLU("SUPERLU_MALLOC fails for llist."); + marker = (int *) SUPERLU_MALLOC(n*sizeof(int)); + if ( !marker ) ABORT_SuperLU("SUPERLU_MALLOC fails for marker."); + + /* Transform adjacency list into 1-based indexing required by GENMMD.*/ + for (i = 0; i <= n; ++i) ++b_colptr[i]; + for (i = 0; i < bnz; ++i) ++b_rowind[i]; + + genmmd_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, + qsize, llist, marker, &maxint, &nofsub); + + /* Transform perm_c into 0-based indexing. */ + for (i = 0; i < n; ++i) --perm_c[i]; + + SUPERLU_FREE(invp); + SUPERLU_FREE(dhead); + SUPERLU_FREE(qsize); + SUPERLU_FREE(llist); + SUPERLU_FREE(marker); + SUPERLU_FREE(b_rowind); + + t = SuperLU_timer_() - t; + /* printf("call GENMMD time = %8.3f\n", t);*/ + + } else { /* Empty adjacency structure */ + for (i = 0; i < n; ++i) perm_c[i] = i; + } + + SUPERLU_FREE(b_colptr); +} diff --git a/src/maths/SuperLU/heap_relax_snode.c b/src/maths/SuperLU/heap_relax_snode.c new file mode 100644 index 000000000..b3e3bd42d --- /dev/null +++ b/src/maths/SuperLU/heap_relax_snode.c @@ -0,0 +1,124 @@ +/*! @file heap_relax_snode.c + * \brief Identify the initial relaxed supernodes + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + +#include
+ * Purpose + * ======= + * relax_snode() - Identify the initial relaxed supernodes, assuming that + * the matrix has been reordered according to the postorder of the etree. + *+ */ + +void +heap_relax_snode ( + const int n, + int *et, /* column elimination tree */ + const int relax_columns, /* max no of columns allowed in a + relaxed snode */ + int *descendants, /* no of descendants of each node + in the etree */ + int *relax_end /* last column in a supernode */ + ) +{ + register int i, j, k, l, parent; + register int snode_start; /* beginning of a snode */ + int *et_save, *post, *inv_post, *iwork; + int nsuper_et = 0, nsuper_et_post = 0; + + /* The etree may not be postordered, but is heap ordered. */ + + iwork = (int*) intMalloc(3*n+2); + if ( !iwork ) ABORT_SuperLU("SUPERLU_MALLOC fails for iwork[]"); + inv_post = iwork + n+1; + et_save = inv_post + n+1; + + /* Post order etree */ + post = (int *) TreePostorder(n, et); + for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; + + /* Renumber etree in postorder */ + for (i = 0; i < n; ++i) { + iwork[post[i]] = post[et[i]]; + et_save[i] = et[i]; /* Save the original etree */ + } + for (i = 0; i < n; ++i) et[i] = iwork[i]; + + /* Compute the number of descendants of each node in the etree */ + ifill (relax_end, n, EMPTY); + for (j = 0; j < n; j++) descendants[j] = 0; + for (j = 0; j < n; j++) { + parent = et[j]; + if ( parent != n ) /* not the dummy root */ + descendants[parent] += descendants[j] + 1; + } + + /* Identify the relaxed supernodes by postorder traversal of the etree. */ + for (j = 0; j < n; ) { + parent = et[j]; + snode_start = j; + while ( parent != n && descendants[parent] < relax_columns ) { + j = parent; + parent = et[j]; + } + /* Found a supernode in postordered etree; j is the last column. */ + ++nsuper_et_post; + k = n; + for (i = snode_start; i <= j; ++i) + k = SUPERLU_MIN(k, inv_post[i]); + l = inv_post[j]; + if ( (l - k) == (j - snode_start) ) { + /* It's also a supernode in the original etree */ + relax_end[k] = l; /* Last column is recorded */ + ++nsuper_et; + } else { + for (i = snode_start; i <= j; ++i) { + l = inv_post[i]; + if ( descendants[i] == 0 ) { + relax_end[l] = l; + ++nsuper_et; + } + } + } + j++; + /* Search for a new leaf */ + while ( descendants[j] != 0 && j < n ) j++; + } + +#if ( PRNTlevel>=1 ) + printf(".. heap_snode_relax:\n" + "\tNo of relaxed snodes in postordered etree:\t%d\n" + "\tNo of relaxed snodes in original etree:\t%d\n", + nsuper_et_post, nsuper_et); +#endif + + /* Recover the original etree */ + for (i = 0; i < n; ++i) et[i] = et_save[i]; + + SUPERLU_FREE(post); + SUPERLU_FREE(iwork); +} + + diff --git a/src/maths/SuperLU/ilu_dcolumn_dfs.c b/src/maths/SuperLU/ilu_dcolumn_dfs.c new file mode 100644 index 000000000..3423bb9b1 --- /dev/null +++ b/src/maths/SuperLU/ilu_dcolumn_dfs.c @@ -0,0 +1,258 @@ + +/*! @file ilu_dcolumn_dfs.c + * \brief Performs a symbolic factorization + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 30, 2009 + *+*/ + +#include
+ * Purpose + * ======= + * ILU_DCOLUMN_DFS performs a symbolic factorization on column jcol, and + * decide the supernode boundary. + * + * This routine does not use numeric values, but only use the RHS + * row indices to start the dfs. + * + * A supernode representative is the last column of a supernode. + * The nonzeros in U[*,j] are segments that end at supernodal + * representatives. The routine returns a list of such supernodal + * representatives in topological order of the dfs that generates them. + * The location of the first nonzero in each such supernodal segment + * (supernodal entry location) is also returned. + * + * Local parameters + * ================ + * nseg: no of segments in current U[*,j] + * jsuper: jsuper=EMPTY if column j does not belong to the same + * supernode as j-1. Otherwise, jsuper=nsuper. + * + * marker2: A-row --> A-row/col (0/1) + * repfnz: SuperA-col --> PA-row + * parent: SuperA-col --> SuperA-col + * xplore: SuperA-col --> index to L-structure + * + * Return value + * ============ + * 0 success; + * > 0 number of bytes allocated when run out of space. + *+ */ +int +ilu_dcolumn_dfs( + const int m, /* in - number of rows in the matrix */ + const int jcol, /* in */ + int *perm_r, /* in */ + int *nseg, /* modified - with new segments appended */ + int *lsub_col, /* in - defines the RHS vector to start the + dfs */ + int *segrep, /* modified - with new segments appended */ + int *repfnz, /* modified */ + int *marker, /* modified */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + int jcolp1, jcolm1, jsuper, nsuper, nextl; + int k, krep, krow, kmark, kperm; + int *marker2; /* Used for small panel LU */ + int fsupc; /* First column of a snode */ + int myfnz; /* First nonz column of a U-segment */ + int chperm, chmark, chrep, kchild; + int xdfs, maxdfs, kpar, oldrep; + int jptr, jm1ptr; + int ito, ifrom; /* Used to compress row subscripts */ + int mem_error; + int *xsup, *supno, *lsub, *xlsub; + int nzlmax; + static int first = 1, maxsuper; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + if ( first ) { + maxsuper = sp_ienv(7); + first = 0; + } + jcolp1 = jcol + 1; + jcolm1 = jcol - 1; + nsuper = supno[jcol]; + jsuper = nsuper; + nextl = xlsub[jcol]; + marker2 = &marker[2*m]; + + + /* For each nonzero in A[*,jcol] do dfs */ + for (k = 0; lsub_col[k] != EMPTY; k++) { + + krow = lsub_col[k]; + lsub_col[k] = EMPTY; + kmark = marker2[krow]; + + /* krow was visited before, go to the next nonzero */ + if ( kmark == jcol ) continue; + + /* For each unmarked nbr krow of jcol + * krow is in L: place it in structure of L[*,jcol] + */ + marker2[krow] = jcol; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + lsub[nextl++] = krow; /* krow is indexed into A */ + if ( nextl >= nzlmax ) { + if ((mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu))) + return (mem_error); + lsub = Glu->lsub; + } + if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ + } else { + /* krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz[krep]; + + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > kperm ) repfnz[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz[krep] = kperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker2[kchild]; + + if ( chmark != jcol ) { /* Not reached yet */ + marker2[kchild] = jcol; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,k] */ + if ( chperm == EMPTY ) { + lsub[nextl++] = kchild; + if ( nextl >= nzlmax ) { + if ( (mem_error = dLUMemXpand(jcol,nextl, + LSUB,&nzlmax,Glu)) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( chmark != jcolm1 ) jsuper = EMPTY; + } else { + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz[chrep]; + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz[chrep] = chperm; + } else { + /* Continue dfs at super-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L^t) */ + parent[krep] = oldrep; + repfnz[krep] = chperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + } /* else */ + + } /* else */ + + } /* if */ + + } /* while */ + + /* krow has no more unexplored nbrs; + * place supernode-rep krep in postorder DFS. + * backtrack dfs to its parent + */ + segrep[*nseg] = krep; + ++(*nseg); + kpar = parent[krep]; /* Pop from stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xlsub[krep + 1]; + + } while ( kpar != EMPTY ); /* Until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonzero ... */ + + /* Check to see if j belongs in the same supernode as j-1 */ + if ( jcol == 0 ) { /* Do nothing for column 0 */ + nsuper = supno[0] = 0; + } else { + fsupc = xsup[nsuper]; + jptr = xlsub[jcol]; /* Not compressed yet */ + jm1ptr = xlsub[jcolm1]; + + if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; + + /* Always start a new supernode for a singular column */ + if ( nextl == jptr ) jsuper = EMPTY; + + /* Make sure the number of columns in a supernode doesn't + exceed threshold. */ + if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; + + /* If jcol starts a new supernode, reclaim storage space in + * lsub from the previous supernode. Note we only store + * the subscript set of the first columns of the supernode. + */ + if ( jsuper == EMPTY ) { /* starts a new supernode */ + if ( (fsupc < jcolm1) ) { /* >= 2 columns in nsuper */ +#ifdef CHK_COMPRESS + printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); +#endif + ito = xlsub[fsupc+1]; + xlsub[jcolm1] = ito; + xlsub[jcol] = ito; + for (ifrom = jptr; ifrom < nextl; ++ifrom, ++ito) + lsub[ito] = lsub[ifrom]; + nextl = ito; + } + nsuper++; + supno[jcol] = nsuper; + } /* if a new supernode */ + + } /* else: jcol > 0 */ + + /* Tidy up the pointers before exit */ + xsup[nsuper+1] = jcolp1; + supno[jcolp1] = nsuper; + xlsub[jcolp1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/ilu_dcopy_to_ucol.c b/src/maths/SuperLU/ilu_dcopy_to_ucol.c new file mode 100644 index 000000000..5dba8baee --- /dev/null +++ b/src/maths/SuperLU/ilu_dcopy_to_ucol.c @@ -0,0 +1,207 @@ + +/*! @file ilu_dcopy_to_ucol.c + * \brief Copy a computed column of U to the compressed data structure + * and drop some small entries + * + *
+ * -- SuperLU routine (version 4.1) -- + * Lawrence Berkeley National Laboratory + * November, 2010 + *+ */ + +#include
+ * -- SuperLU routine (version 4.1) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + *+ */ + +#include
+ * Purpose + * ======= + * ilu_ddrop_row() - Drop some small rows from the previous + * supernode (L-part only). + *+ */ +int ilu_ddrop_row( + superlu_options_t *options, /* options */ + int first, /* index of the first column in the supernode */ + int last, /* index of the last column in the supernode */ + double drop_tol, /* dropping parameter */ + int quota, /* maximum nonzero entries allowed */ + int *nnzLj, /* in/out number of nonzeros in L(:, 1:last) */ + double *fill_tol, /* in/out - on exit, fill_tol=-num_zero_pivots, + * does not change if options->ILU_MILU != SMILU1 */ + GlobalLU_t *Glu, /* modified */ + double dwork[], /* working space + * the length of dwork[] should be no less than + * the number of rows in the supernode */ + double dwork2[], /* working space with the same size as dwork[], + * used only by the second dropping rule */ + int lastc /* if lastc == 0, there is nothing after the + * working supernode [first:last]; + * if lastc == 1, there is one more column after + * the working supernode. */ ) +{ + register int i, j, k, m1; + register int nzlc; /* number of nonzeros in column last+1 */ + register int xlusup_first, xlsub_first; + int m, n; /* m x n is the size of the supernode */ + int r = 0; /* number of dropped rows */ + register double *temp; + register double *lusup = Glu->lusup; + register int *lsub = Glu->lsub; + register int *xlsub = Glu->xlsub; + register int *xlusup = Glu->xlusup; + register double d_max = 0.0, d_min = 1.0; + int drop_rule = options->ILU_DropRule; + milu_t milu = options->ILU_MILU; + norm_t nrm = options->ILU_Norm; + double zero = 0.0; + double one = 1.0; + double none = -1.0; + int i_1 = 1; + int inc_diag; /* inc_diag = m + 1 */ + int nzp = 0; /* number of zero pivots */ + double alpha = pow((double)(Glu->n), -1.0 / options->ILU_MILU_Dim); + + xlusup_first = xlusup[first]; + xlsub_first = xlsub[first]; + m = xlusup[first + 1] - xlusup_first; + n = last - first + 1; + m1 = m - 1; + inc_diag = m + 1; + nzlc = lastc ? (xlusup[last + 2] - xlusup[last + 1]) : 0; + temp = dwork - n; + + /* Quick return if nothing to do. */ + if (m == 0 || m == n || drop_rule == NODROP) + { + *nnzLj += m * n; + return 0; + } + + /* basic dropping: ILU(tau) */ + for (i = n; i <= m1; ) + { + /* the average abs value of ith row */ + switch (nrm) + { + case ONE_NORM: + temp[i] = dasum_(&n, &lusup[xlusup_first + i], &m) / (double)n; + break; + case TWO_NORM: + temp[i] = dnrm2_(&n, &lusup[xlusup_first + i], &m) + / sqrt((double)n); + break; + case INF_NORM: + default: + k = idamax_(&n, &lusup[xlusup_first + i], &m) - 1; + temp[i] = fabs(lusup[xlusup_first + i + m * k]); + break; + } + + /* drop small entries due to drop_tol */ + if (drop_rule & DROP_BASIC && temp[i] < drop_tol) + { + r++; + /* drop the current row and move the last undropped row here */ + if (r > 1) /* add to last row */ + { + /* accumulate the sum (for MILU) */ + switch (milu) + { + case SMILU_1: + case SMILU_2: + daxpy_(&n, &one, &lusup[xlusup_first + i], &m, + &lusup[xlusup_first + m - 1], &m); + break; + case SMILU_3: + for (j = 0; j < n; j++) + lusup[xlusup_first + (m - 1) + j * m] += + fabs(lusup[xlusup_first + i + j * m]); + break; + case SILU: + default: + break; + } + dcopy_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + } /* if (r > 1) */ + else /* move to last row */ + { + dswap_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + if (milu == SMILU_3) + for (j = 0; j < n; j++) { + lusup[xlusup_first + m1 + j * m] = + fabs(lusup[xlusup_first + m1 + j * m]); + } + } + lsub[xlsub_first + i] = lsub[xlsub_first + m1]; + m1--; + continue; + } /* if dropping */ + else + { + if (temp[i] > d_max) d_max = temp[i]; + if (temp[i] < d_min) d_min = temp[i]; + } + i++; + } /* for */ + + /* Secondary dropping: drop more rows according to the quota. */ + quota = ceil((double)quota / (double)n); + if (drop_rule & DROP_SECONDARY && m - r > quota) + { + register double tol = d_max; + + /* Calculate the second dropping tolerance */ + if (quota > n) + { + if (drop_rule & DROP_INTERP) /* by interpolation */ + { + d_max = 1.0 / d_max; d_min = 1.0 / d_min; + tol = 1.0 / (d_max + (d_min - d_max) * quota / (m - n - r)); + } + else /* by quick select */ + { + int len = m1 - n + 1; + dcopy_(&len, dwork, &i_1, dwork2, &i_1); + tol = dqselect(len, dwork2, quota - n); +#if 0 + register int *itemp = iwork - n; + A = temp; + for (i = n; i <= m1; i++) itemp[i] = i; + qsort(iwork, m1 - n + 1, sizeof(int), _compare_); + tol = temp[itemp[quota]]; +#endif + } + } + + for (i = n; i <= m1; ) + { + if (temp[i] <= tol) + { + register int j; + r++; + /* drop the current row and move the last undropped row here */ + if (r > 1) /* add to last row */ + { + /* accumulate the sum (for MILU) */ + switch (milu) + { + case SMILU_1: + case SMILU_2: + daxpy_(&n, &one, &lusup[xlusup_first + i], &m, + &lusup[xlusup_first + m - 1], &m); + break; + case SMILU_3: + for (j = 0; j < n; j++) + lusup[xlusup_first + (m - 1) + j * m] += + fabs(lusup[xlusup_first + i + j * m]); + break; + case SILU: + default: + break; + } + dcopy_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + } /* if (r > 1) */ + else /* move to last row */ + { + dswap_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + if (milu == SMILU_3) + for (j = 0; j < n; j++) { + lusup[xlusup_first + m1 + j * m] = + fabs(lusup[xlusup_first + m1 + j * m]); + } + } + lsub[xlsub_first + i] = lsub[xlsub_first + m1]; + m1--; + temp[i] = temp[m1]; + + continue; + } + i++; + + } /* for */ + + } /* if secondary dropping */ + + for (i = n; i < m; i++) temp[i] = 0.0; + + if (r == 0) + { + *nnzLj += m * n; + return 0; + } + + /* add dropped entries to the diagnal */ + if (milu != SILU) + { + register int j; + double t; + double omega; + for (j = 0; j < n; j++) + { + t = lusup[xlusup_first + (m - 1) + j * m]; + if (t == zero) continue; + if (t > zero) + omega = SUPERLU_MIN(2.0 * (1.0 - alpha) / t, 1.0); + else + omega = SUPERLU_MAX(2.0 * (1.0 - alpha) / t, -1.0); + t *= omega; + + switch (milu) + { + case SMILU_1: + if (t != none) { + lusup[xlusup_first + j * inc_diag] *= (one + t); + } + else + { + lusup[xlusup_first + j * inc_diag] *= *fill_tol; +#ifdef DEBUG + printf("[1] ZERO PIVOT: FILL col %d.\n", first + j); + fflush(stdout); +#endif + nzp++; + } + break; + case SMILU_2: + lusup[xlusup_first + j * inc_diag] *= (1.0 + fabs(t)); + break; + case SMILU_3: + lusup[xlusup_first + j * inc_diag] *= (one + t); + break; + case SILU: + default: + break; + } + } + if (nzp > 0) *fill_tol = -nzp; + } + + /* Remove dropped entries from the memory and fix the pointers. */ + m1 = m - r; + for (j = 1; j < n; j++) + { + register int tmp1, tmp2; + tmp1 = xlusup_first + j * m1; + tmp2 = xlusup_first + j * m; + for (i = 0; i < m1; i++) + lusup[i + tmp1] = lusup[i + tmp2]; + } + for (i = 0; i < nzlc; i++) + lusup[xlusup_first + i + n * m1] = lusup[xlusup_first + i + n * m]; + for (i = 0; i < nzlc; i++) + lsub[xlsub[last + 1] - r + i] = lsub[xlsub[last + 1] + i]; + for (i = first + 1; i <= last + 1; i++) + { + xlusup[i] -= r * (i - first); + xlsub[i] -= r; + } + if (lastc) + { + xlusup[last + 2] -= r * n; + xlsub[last + 2] -= r; + } + + *nnzLj += (m - r) * n; + return r; +} diff --git a/src/maths/SuperLU/ilu_dpanel_dfs.c b/src/maths/SuperLU/ilu_dpanel_dfs.c new file mode 100644 index 000000000..ac5686670 --- /dev/null +++ b/src/maths/SuperLU/ilu_dpanel_dfs.c @@ -0,0 +1,248 @@ + +/*! @file ilu_dpanel_dfs.c + * \brief Peforms a symbolic factorization on a panel of symbols and + * record the entries with maximum absolute value in each column + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 30, 2009 + *+ */ + +#include
+ * Purpose + * ======= + * + * Performs a symbolic factorization on a panel of columns [jcol, jcol+w). + * + * A supernode representative is the last column of a supernode. + * The nonzeros in U[*,j] are segments that end at supernodal + * representatives. + * + * The routine returns one list of the supernodal representatives + * in topological order of the dfs that generates them. This list is + * a superset of the topological order of each individual column within + * the panel. + * The location of the first nonzero in each supernodal segment + * (supernodal entry location) is also returned. Each column has a + * separate list for this purpose. + * + * Two marker arrays are used for dfs: + * marker[i] == jj, if i was visited during dfs of current column jj; + * marker1[i] >= jcol, if i was visited by earlier columns in this panel; + * + * marker: A-row --> A-row/col (0/1) + * repfnz: SuperA-col --> PA-row + * parent: SuperA-col --> SuperA-col + * xplore: SuperA-col --> index to L-structure + *+ */ +void +ilu_dpanel_dfs( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + SuperMatrix *A, /* in - original matrix */ + int *perm_r, /* in */ + int *nseg, /* out */ + double *dense, /* out */ + double *amax, /* out - max. abs. value of each column in panel */ + int *panel_lsub, /* out */ + int *segrep, /* out */ + int *repfnz, /* out */ + int *marker, /* out */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ +) +{ + + NCPformat *Astore; + double *a; + int *asub; + int *xa_begin, *xa_end; + int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; + int k, krow, kmark, kperm; + int xdfs, maxdfs, kpar; + int jj; /* index through each column in the panel */ + int *marker1; /* marker1[jj] >= jcol if vertex jj was visited + by a previous column within this panel. */ + int *repfnz_col; /* start of each column in the panel */ + double *dense_col; /* start of each column in the panel */ + int nextl_col; /* next available position in panel_lsub[*,jj] */ + int *xsup, *supno; + int *lsub, *xlsub; + double *amax_col; + register double tmp; + + /* Initialize pointers */ + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + marker1 = marker + m; + repfnz_col = repfnz; + dense_col = dense; + amax_col = amax; + *nseg = 0; + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + + /* For each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++) { + nextl_col = (jj - jcol) * m; + +#ifdef CHK_DFS + printf("\npanel col %d: ", jj); +#endif + + *amax_col = 0.0; + /* For each nonz in A[*,jj] do dfs */ + for (k = xa_begin[jj]; k < xa_end[jj]; k++) { + krow = asub[k]; + tmp = fabs(a[k]); + if (tmp > *amax_col) *amax_col = tmp; + dense_col[krow] = a[k]; + kmark = marker[krow]; + if ( kmark == jj ) + continue; /* krow visited before, go to the next nonzero */ + + /* For each unmarked nbr krow of jj + * krow is in L: place it in structure of L[*,jj] + */ + marker[krow] = jj; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ + } + /* + * krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + else { + + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz_col[krep]; + +#ifdef CHK_DFS + printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); +#endif + if ( myfnz != EMPTY ) { /* Representative visited before */ + if ( myfnz > kperm ) repfnz_col[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz_col[krep] = kperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker[kchild]; + + if ( chmark != jj ) { /* Not reached yet */ + marker[kchild] = jj; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,j] */ + if ( chperm == EMPTY ) { + panel_lsub[nextl_col++] = kchild; + } + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + else { + + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz_col[chrep]; +#ifdef CHK_DFS + printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); +#endif + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz_col[chrep] = chperm; + } + else { + /* Cont. dfs at snode-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L) */ + parent[krep] = oldrep; + repfnz_col[krep] = chperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } /* else */ + + } /* else */ + + } /* if... */ + + } /* while xdfs < maxdfs */ + + /* krow has no more unexplored nbrs: + * Place snode-rep krep in postorder DFS, if this + * segment is seen for the first time. (Note that + * "repfnz[krep]" may change later.) + * Backtrack dfs to its parent. + */ + if ( marker1[krep] < jcol ) { + segrep[*nseg] = krep; + ++(*nseg); + marker1[krep] = jj; + } + + kpar = parent[krep]; /* Pop stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xlsub[krep + 1]; + +#ifdef CHK_DFS + printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } while ( kpar != EMPTY ); /* do-while - until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonz in A[*,jj] */ + + repfnz_col += m; /* Move to next column */ + dense_col += m; + amax_col++; + + } /* for jj ... */ + +} diff --git a/src/maths/SuperLU/ilu_dpivotL.c b/src/maths/SuperLU/ilu_dpivotL.c new file mode 100644 index 000000000..bd929932a --- /dev/null +++ b/src/maths/SuperLU/ilu_dpivotL.c @@ -0,0 +1,266 @@ + +/*! @file ilu_dpivotL.c + * \brief Performs numerical pivoting + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 30, 2009 + *+ */ + + +#include
+ * Purpose + * ======= + * Performs the numerical pivoting on the current column of L, + * and the CDIV operation. + * + * Pivot policy: + * (1) Compute thresh = u * max_(i>=j) abs(A_ij); + * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN + * pivot row = k; + * ELSE IF abs(A_jj) >= thresh THEN + * pivot row = j; + * ELSE + * pivot row = m; + * + * Note: If you absolutely want to use a given pivot order, then set u=0.0. + * + * Return value: 0 success; + * i > 0 U(i,i) is exactly zero. + *+ */ + +int +ilu_dpivotL( + const int jcol, /* in */ + const double u, /* in - diagonal pivoting threshold */ + int *usepr, /* re-use the pivot sequence given by + * perm_r/iperm_r */ + int *perm_r, /* may be modified */ + int diagind, /* diagonal of Pc*A*Pc' */ + int *swap, /* in/out record the row permutation */ + int *iswap, /* in/out inverse of swap, it is the same as + perm_r after the factorization */ + int *marker, /* in */ + int *pivrow, /* in/out, as an input if *usepr!=0 */ + double fill_tol, /* in - fill tolerance of current column + * used for a singular column */ + milu_t milu, /* in */ + double drop_sum, /* in - computed in ilu_dcopy_to_ucol() + (MILU only) */ + GlobalLU_t *Glu, /* modified - global LU data structures */ + SuperLUStat_t *stat /* output */ + ) +{ + + int n; /* number of columns */ + int fsupc; /* first column in the supernode */ + int nsupc; /* no of columns in the supernode */ + int nsupr; /* no of rows in the supernode */ + int lptr; /* points to the starting subscript of the supernode */ + register int pivptr; + int old_pivptr, diag, ptr0; + register double pivmax, rtemp; + double thresh; + double temp; + double *lu_sup_ptr; + double *lu_col_ptr; + int *lsub_ptr; + register int isub, icol, k, itemp; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + flops_t *ops = stat->ops; + int info; + + /* Initialize pointers */ + n = Glu->n; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; + nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ + lptr = xlsub[fsupc]; + nsupr = xlsub[fsupc+1] - lptr; + lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ + lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ + lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ + + /* Determine the largest abs numerical value for partial pivoting; + Also search for user-specified pivot, and diagonal element. */ + pivmax = -1.0; + pivptr = nsupc; + diag = EMPTY; + old_pivptr = nsupc; + ptr0 = EMPTY; + for (isub = nsupc; isub < nsupr; ++isub) { + if (marker[lsub_ptr[isub]] > jcol) + continue; /* do not overlap with a later relaxed supernode */ + + switch (milu) { + case SMILU_1: + rtemp = fabs(lu_col_ptr[isub] + drop_sum); + break; + case SMILU_2: + case SMILU_3: + /* In this case, drop_sum contains the sum of the abs. value */ + rtemp = fabs(lu_col_ptr[isub]); + break; + case SILU: + default: + rtemp = fabs(lu_col_ptr[isub]); + break; + } + if (rtemp > pivmax) { pivmax = rtemp; pivptr = isub; } + if (*usepr && lsub_ptr[isub] == *pivrow) old_pivptr = isub; + if (lsub_ptr[isub] == diagind) diag = isub; + if (ptr0 == EMPTY) ptr0 = isub; + } + + if (milu == SMILU_2 || milu == SMILU_3) pivmax += drop_sum; + + /* Test for singularity */ + if (pivmax < 0.0) { + fprintf(stderr, "[0]: jcol=%d, SINGULAR!!!\n", jcol); + fflush(stderr); + exit(1); + } + if ( pivmax == 0.0 ) { + if (diag != EMPTY) + *pivrow = lsub_ptr[pivptr = diag]; + else if (ptr0 != EMPTY) + *pivrow = lsub_ptr[pivptr = ptr0]; + else { + /* look for the first row which does not + belong to any later supernodes */ + for (icol = jcol; icol < n; icol++) + if (marker[swap[icol]] <= jcol) break; + if (icol >= n) { + fprintf(stderr, "[1]: jcol=%d, SINGULAR!!!\n", jcol); + fflush(stderr); + exit(1); + } + + *pivrow = swap[icol]; + + /* pick up the pivot row */ + for (isub = nsupc; isub < nsupr; ++isub) + if ( lsub_ptr[isub] == *pivrow ) { pivptr = isub; break; } + } + pivmax = fill_tol; + lu_col_ptr[pivptr] = pivmax; + *usepr = 0; +#ifdef DEBUG + printf("[0] ZERO PIVOT: FILL (%d, %d).\n", *pivrow, jcol); + fflush(stdout); +#endif + info =jcol + 1; + } /* if (*pivrow == 0.0) */ + else { + thresh = u * pivmax; + + /* Choose appropriate pivotal element by our policy. */ + if ( *usepr ) { + switch (milu) { + case SMILU_1: + rtemp = fabs(lu_col_ptr[old_pivptr] + drop_sum); + break; + case SMILU_2: + case SMILU_3: + rtemp = fabs(lu_col_ptr[old_pivptr]) + drop_sum; + break; + case SILU: + default: + rtemp = fabs(lu_col_ptr[old_pivptr]); + break; + } + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = old_pivptr; + else *usepr = 0; + } + if ( *usepr == 0 ) { + /* Use diagonal pivot? */ + if ( diag >= 0 ) { /* diagonal exists */ + switch (milu) { + case SMILU_1: + rtemp = fabs(lu_col_ptr[diag] + drop_sum); + break; + case SMILU_2: + case SMILU_3: + rtemp = fabs(lu_col_ptr[diag]) + drop_sum; + break; + case SILU: + default: + rtemp = fabs(lu_col_ptr[diag]); + break; + } + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; + } + *pivrow = lsub_ptr[pivptr]; + } + info = 0; + + /* Reset the diagonal */ + switch (milu) { + case SMILU_1: + lu_col_ptr[pivptr] += drop_sum; + break; + case SMILU_2: + case SMILU_3: + lu_col_ptr[pivptr] += SGN(lu_col_ptr[pivptr]) * drop_sum; + break; + case SILU: + default: + break; + } + + } /* else */ + + /* Record pivot row */ + perm_r[*pivrow] = jcol; + if (jcol < n - 1) { + register int t1, t2, t; + t1 = iswap[*pivrow]; t2 = jcol; + if (t1 != t2) { + t = swap[t1]; swap[t1] = swap[t2]; swap[t2] = t; + t1 = swap[t1]; t2 = t; + t = iswap[t1]; iswap[t1] = iswap[t2]; iswap[t2] = t; + } + } /* if (jcol < n - 1) */ + + /* Interchange row subscripts */ + if ( pivptr != nsupc ) { + itemp = lsub_ptr[pivptr]; + lsub_ptr[pivptr] = lsub_ptr[nsupc]; + lsub_ptr[nsupc] = itemp; + + /* Interchange numerical values as well, for the whole snode, such + * that L is indexed the same way as A. + */ + for (icol = 0; icol <= nsupc; icol++) { + itemp = pivptr + icol * nsupr; + temp = lu_sup_ptr[itemp]; + lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; + lu_sup_ptr[nsupc + icol*nsupr] = temp; + } + } /* if */ + + /* cdiv operation */ + ops[FACT] += nsupr - nsupc; + temp = 1.0 / lu_col_ptr[nsupc]; + for (k = nsupc+1; k < nsupr; k++) lu_col_ptr[k] *= temp; + + return info; +} diff --git a/src/maths/SuperLU/ilu_dsnode_dfs.c b/src/maths/SuperLU/ilu_dsnode_dfs.c new file mode 100644 index 000000000..87a5cd4e4 --- /dev/null +++ b/src/maths/SuperLU/ilu_dsnode_dfs.c @@ -0,0 +1,90 @@ + +/*! @file ilu_dsnode_dfs.c + * \brief Determines the union of row structures of columns within the relaxed node + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 30, 2009 + *+ */ + +#include
+ * Purpose + * ======= + * ilu_dsnode_dfs() - Determine the union of the row structures of those + * columns within the relaxed snode. + * Note: The relaxed snodes are leaves of the supernodal etree, therefore, + * the portion outside the rectangular supernode must be zero. + * + * Return value + * ============ + * 0 success; + * >0 number of bytes allocated when run out of memory. + *+ */ + +int +ilu_dsnode_dfs( + const int jcol, /* in - start of the supernode */ + const int kcol, /* in - end of the supernode */ + const int *asub, /* in */ + const int *xa_begin, /* in */ + const int *xa_end, /* in */ + int *marker, /* modified */ + GlobalLU_t *Glu /* modified */ + ) +{ + + register int i, k, nextl; + int nsuper, krow, kmark, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + int nzlmax; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + nsuper = ++supno[jcol]; /* Next available supernode number */ + nextl = xlsub[jcol]; + + for (i = jcol; i <= kcol; i++) + { + /* For each nonzero in A[*,i] */ + for (k = xa_begin[i]; k < xa_end[i]; k++) + { + krow = asub[k]; + kmark = marker[krow]; + if ( kmark != kcol ) + { /* First time visit krow */ + marker[krow] = kcol; + lsub[nextl++] = krow; + if ( nextl >= nzlmax ) + { + if ( (mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, + Glu)) != 0) + return (mem_error); + lsub = Glu->lsub; + } + } + } + supno[i] = nsuper; + } + + /* Supernode > 1 */ + if ( jcol < kcol ) + for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; + + xsup[nsuper+1] = kcol + 1; + supno[kcol+1] = nsuper; + xlsub[kcol+1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/ilu_heap_relax_snode.c b/src/maths/SuperLU/ilu_heap_relax_snode.c new file mode 100644 index 000000000..b9940de6f --- /dev/null +++ b/src/maths/SuperLU/ilu_heap_relax_snode.c @@ -0,0 +1,120 @@ +/*! @file ilu_heap_relax_snode.c + * \brief Identify the initial relaxed supernodes + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 1, 2009 + *+ */ + +#include
+ * Purpose + * ======= + * ilu_heap_relax_snode() - Identify the initial relaxed supernodes, + * assuming that the matrix has been reordered according to the postorder + * of the etree. + *+ */ + +void +ilu_heap_relax_snode ( + const int n, + int *et, /* column elimination tree */ + const int relax_columns, /* max no of columns allowed in a + relaxed snode */ + int *descendants, /* no of descendants of each node + in the etree */ + int *relax_end, /* last column in a supernode + * if j-th column starts a relaxed + * supernode, relax_end[j] represents + * the last column of this supernode */ + int *relax_fsupc /* first column in a supernode + * relax_fsupc[j] represents the first + * column of j-th supernode */ + ) +{ + register int i, j, k, l, f, parent; + register int snode_start; /* beginning of a snode */ + int *et_save, *post, *inv_post, *iwork; + int nsuper_et = 0, nsuper_et_post = 0; + + /* The etree may not be postordered, but is heap ordered. */ + + iwork = (int*) intMalloc(3*n+2); + if ( !iwork ) ABORT_SuperLU("SUPERLU_MALLOC fails for iwork[]"); + inv_post = iwork + n+1; + et_save = inv_post + n+1; + + /* Post order etree */ + post = (int *) TreePostorder(n, et); + for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; + + /* Renumber etree in postorder */ + for (i = 0; i < n; ++i) { + iwork[post[i]] = post[et[i]]; + et_save[i] = et[i]; /* Save the original etree */ + } + for (i = 0; i < n; ++i) et[i] = iwork[i]; + + /* Compute the number of descendants of each node in the etree */ + ifill (relax_end, n, EMPTY); + ifill (relax_fsupc, n, EMPTY); + for (j = 0; j < n; j++) descendants[j] = 0; + for (j = 0; j < n; j++) { + parent = et[j]; + if ( parent != n ) /* not the dummy root */ + descendants[parent] += descendants[j] + 1; + } + + /* Identify the relaxed supernodes by postorder traversal of the etree. */ + for ( f = j = 0; j < n; ) { + parent = et[j]; + snode_start = j; + while ( parent != n && descendants[parent] < relax_columns ) { + j = parent; + parent = et[j]; + } + /* Found a supernode in postordered etree; j is the last column. */ + ++nsuper_et_post; + k = n; + for (i = snode_start; i <= j; ++i) + k = SUPERLU_MIN(k, inv_post[i]); + l = inv_post[j]; + if ( (l - k) == (j - snode_start) ) { + /* It's also a supernode in the original etree */ + relax_end[k] = l; /* Last column is recorded */ + relax_fsupc[f++] = k; + ++nsuper_et; + } else { + for (i = snode_start; i <= j; ++i) { + l = inv_post[i]; + if ( descendants[i] == 0 ) { + relax_end[l] = l; + relax_fsupc[f++] = l; + ++nsuper_et; + } + } + } + j++; + /* Search for a new leaf */ + while ( descendants[j] != 0 && j < n ) j++; + } + +#if ( PRNTlevel>=1 ) + printf(".. heap_snode_relax:\n" + "\tNo of relaxed snodes in postordered etree:\t%d\n" + "\tNo of relaxed snodes in original etree:\t%d\n", + nsuper_et_post, nsuper_et); +#endif + + /* Recover the original etree */ + for (i = 0; i < n; ++i) et[i] = et_save[i]; + + SUPERLU_FREE(post); + SUPERLU_FREE(iwork); +} diff --git a/src/maths/SuperLU/ilu_relax_snode.c b/src/maths/SuperLU/ilu_relax_snode.c new file mode 100644 index 000000000..5634098a5 --- /dev/null +++ b/src/maths/SuperLU/ilu_relax_snode.c @@ -0,0 +1,69 @@ +/*! @file ilu_relax_snode.c + * \brief Identify initial relaxed supernodes + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 1, 2009 + *+ */ + +#include
+ * Purpose + * ======= + * ilu_relax_snode() - Identify the initial relaxed supernodes, assuming + * that the matrix has been reordered according to the postorder of the + * etree. + *+ */ +void +ilu_relax_snode ( + const int n, + int *et, /* column elimination tree */ + const int relax_columns, /* max no of columns allowed in a + relaxed snode */ + int *descendants, /* no of descendants of each node + in the etree */ + int *relax_end, /* last column in a supernode + * if j-th column starts a relaxed + * supernode, relax_end[j] represents + * the last column of this supernode */ + int *relax_fsupc /* first column in a supernode + * relax_fsupc[j] represents the first + * column of j-th supernode */ + ) +{ + + register int j, f, parent; + register int snode_start; /* beginning of a snode */ + + ifill (relax_end, n, EMPTY); + ifill (relax_fsupc, n, EMPTY); + for (j = 0; j < n; j++) descendants[j] = 0; + + /* Compute the number of descendants of each node in the etree */ + for (j = 0; j < n; j++) { + parent = et[j]; + if ( parent != n ) /* not the dummy root */ + descendants[parent] += descendants[j] + 1; + } + + /* Identify the relaxed supernodes by postorder traversal of the etree. */ + for (j = f = 0; j < n; ) { + parent = et[j]; + snode_start = j; + while ( parent != n && descendants[parent] < relax_columns ) { + j = parent; + parent = et[j]; + } + /* Found a supernode with j being the last column. */ + relax_end[snode_start] = j; /* Last column is recorded */ + j++; + relax_fsupc[f++] = snode_start; + /* Search for a new leaf */ + while ( descendants[j] != 0 && j < n ) j++; + } +} diff --git a/src/maths/SuperLU/ilu_zcolumn_dfs.c b/src/maths/SuperLU/ilu_zcolumn_dfs.c new file mode 100644 index 000000000..a8e1e22d6 --- /dev/null +++ b/src/maths/SuperLU/ilu_zcolumn_dfs.c @@ -0,0 +1,258 @@ + +/*! @file ilu_zcolumn_dfs.c + * \brief Performs a symbolic factorization + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 30, 2009 + *+*/ + +#include
+ * Purpose + * ======= + * ILU_ZCOLUMN_DFS performs a symbolic factorization on column jcol, and + * decide the supernode boundary. + * + * This routine does not use numeric values, but only use the RHS + * row indices to start the dfs. + * + * A supernode representative is the last column of a supernode. + * The nonzeros in U[*,j] are segments that end at supernodal + * representatives. The routine returns a list of such supernodal + * representatives in topological order of the dfs that generates them. + * The location of the first nonzero in each such supernodal segment + * (supernodal entry location) is also returned. + * + * Local parameters + * ================ + * nseg: no of segments in current U[*,j] + * jsuper: jsuper=EMPTY if column j does not belong to the same + * supernode as j-1. Otherwise, jsuper=nsuper. + * + * marker2: A-row --> A-row/col (0/1) + * repfnz: SuperA-col --> PA-row + * parent: SuperA-col --> SuperA-col + * xplore: SuperA-col --> index to L-structure + * + * Return value + * ============ + * 0 success; + * > 0 number of bytes allocated when run out of space. + *+ */ +int +ilu_zcolumn_dfs( + const int m, /* in - number of rows in the matrix */ + const int jcol, /* in */ + int *perm_r, /* in */ + int *nseg, /* modified - with new segments appended */ + int *lsub_col, /* in - defines the RHS vector to start the + dfs */ + int *segrep, /* modified - with new segments appended */ + int *repfnz, /* modified */ + int *marker, /* modified */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + int jcolp1, jcolm1, jsuper, nsuper, nextl; + int k, krep, krow, kmark, kperm; + int *marker2; /* Used for small panel LU */ + int fsupc; /* First column of a snode */ + int myfnz; /* First nonz column of a U-segment */ + int chperm, chmark, chrep, kchild; + int xdfs, maxdfs, kpar, oldrep; + int jptr, jm1ptr; + int ito, ifrom; /* Used to compress row subscripts */ + int mem_error; + int *xsup, *supno, *lsub, *xlsub; + int nzlmax; + static int first = 1, maxsuper; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + if ( first ) { + maxsuper = sp_ienv(7); + first = 0; + } + jcolp1 = jcol + 1; + jcolm1 = jcol - 1; + nsuper = supno[jcol]; + jsuper = nsuper; + nextl = xlsub[jcol]; + marker2 = &marker[2*m]; + + + /* For each nonzero in A[*,jcol] do dfs */ + for (k = 0; lsub_col[k] != EMPTY; k++) { + + krow = lsub_col[k]; + lsub_col[k] = EMPTY; + kmark = marker2[krow]; + + /* krow was visited before, go to the next nonzero */ + if ( kmark == jcol ) continue; + + /* For each unmarked nbr krow of jcol + * krow is in L: place it in structure of L[*,jcol] + */ + marker2[krow] = jcol; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + lsub[nextl++] = krow; /* krow is indexed into A */ + if ( nextl >= nzlmax ) { + if ((mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu))) + return (mem_error); + lsub = Glu->lsub; + } + if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ + } else { + /* krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz[krep]; + + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > kperm ) repfnz[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz[krep] = kperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker2[kchild]; + + if ( chmark != jcol ) { /* Not reached yet */ + marker2[kchild] = jcol; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,k] */ + if ( chperm == EMPTY ) { + lsub[nextl++] = kchild; + if ( nextl >= nzlmax ) { + if ( (mem_error = zLUMemXpand(jcol,nextl, + LSUB,&nzlmax,Glu)) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( chmark != jcolm1 ) jsuper = EMPTY; + } else { + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz[chrep]; + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz[chrep] = chperm; + } else { + /* Continue dfs at super-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L^t) */ + parent[krep] = oldrep; + repfnz[krep] = chperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + } /* else */ + + } /* else */ + + } /* if */ + + } /* while */ + + /* krow has no more unexplored nbrs; + * place supernode-rep krep in postorder DFS. + * backtrack dfs to its parent + */ + segrep[*nseg] = krep; + ++(*nseg); + kpar = parent[krep]; /* Pop from stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xlsub[krep + 1]; + + } while ( kpar != EMPTY ); /* Until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonzero ... */ + + /* Check to see if j belongs in the same supernode as j-1 */ + if ( jcol == 0 ) { /* Do nothing for column 0 */ + nsuper = supno[0] = 0; + } else { + fsupc = xsup[nsuper]; + jptr = xlsub[jcol]; /* Not compressed yet */ + jm1ptr = xlsub[jcolm1]; + + if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; + + /* Always start a new supernode for a singular column */ + if ( nextl == jptr ) jsuper = EMPTY; + + /* Make sure the number of columns in a supernode doesn't + exceed threshold. */ + if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; + + /* If jcol starts a new supernode, reclaim storage space in + * lsub from the previous supernode. Note we only store + * the subscript set of the first columns of the supernode. + */ + if ( jsuper == EMPTY ) { /* starts a new supernode */ + if ( (fsupc < jcolm1) ) { /* >= 2 columns in nsuper */ +#ifdef CHK_COMPRESS + printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); +#endif + ito = xlsub[fsupc+1]; + xlsub[jcolm1] = ito; + xlsub[jcol] = ito; + for (ifrom = jptr; ifrom < nextl; ++ifrom, ++ito) + lsub[ito] = lsub[ifrom]; + nextl = ito; + } + nsuper++; + supno[jcol] = nsuper; + } /* if a new supernode */ + + } /* else: jcol > 0 */ + + /* Tidy up the pointers before exit */ + xsup[nsuper+1] = jcolp1; + supno[jcolp1] = nsuper; + xlsub[jcolp1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/ilu_zcopy_to_ucol.c b/src/maths/SuperLU/ilu_zcopy_to_ucol.c new file mode 100644 index 000000000..59f01ee9c --- /dev/null +++ b/src/maths/SuperLU/ilu_zcopy_to_ucol.c @@ -0,0 +1,211 @@ + +/*! @file ilu_zcopy_to_ucol.c + * \brief Copy a computed column of U to the compressed data structure + * and drop some small entries + * + *
+ * -- SuperLU routine (version 4.1) -- + * Lawrence Berkeley National Laboratory + * November, 2010 + *+ */ + +#include
+ * -- SuperLU routine (version 4.1) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + *+ */ + +#include
+ * Purpose + * ======= + * ilu_zdrop_row() - Drop some small rows from the previous + * supernode (L-part only). + *+ */ +int ilu_zdrop_row( + superlu_options_t *options, /* options */ + int first, /* index of the first column in the supernode */ + int last, /* index of the last column in the supernode */ + double drop_tol, /* dropping parameter */ + int quota, /* maximum nonzero entries allowed */ + int *nnzLj, /* in/out number of nonzeros in L(:, 1:last) */ + double *fill_tol, /* in/out - on exit, fill_tol=-num_zero_pivots, + * does not change if options->ILU_MILU != SMILU1 */ + GlobalLU_t *Glu, /* modified */ + double dwork[], /* working space + * the length of dwork[] should be no less than + * the number of rows in the supernode */ + double dwork2[], /* working space with the same size as dwork[], + * used only by the second dropping rule */ + int lastc /* if lastc == 0, there is nothing after the + * working supernode [first:last]; + * if lastc == 1, there is one more column after + * the working supernode. */ ) +{ + register int i, j, k, m1; + register int nzlc; /* number of nonzeros in column last+1 */ + register int xlusup_first, xlsub_first; + int m, n; /* m x n is the size of the supernode */ + int r = 0; /* number of dropped rows */ + register double *temp; + register doublecomplex *lusup = Glu->lusup; + register int *lsub = Glu->lsub; + register int *xlsub = Glu->xlsub; + register int *xlusup = Glu->xlusup; + register double d_max = 0.0, d_min = 1.0; + int drop_rule = options->ILU_DropRule; + milu_t milu = options->ILU_MILU; + norm_t nrm = options->ILU_Norm; + doublecomplex zero = {0.0, 0.0}; + doublecomplex one = {1.0, 0.0}; + doublecomplex none = {-1.0, 0.0}; + int i_1 = 1; + int inc_diag; /* inc_diag = m + 1 */ + int nzp = 0; /* number of zero pivots */ + double alpha = pow((double)(Glu->n), -1.0 / options->ILU_MILU_Dim); + + xlusup_first = xlusup[first]; + xlsub_first = xlsub[first]; + m = xlusup[first + 1] - xlusup_first; + n = last - first + 1; + m1 = m - 1; + inc_diag = m + 1; + nzlc = lastc ? (xlusup[last + 2] - xlusup[last + 1]) : 0; + temp = dwork - n; + + /* Quick return if nothing to do. */ + if (m == 0 || m == n || drop_rule == NODROP) + { + *nnzLj += m * n; + return 0; + } + + /* basic dropping: ILU(tau) */ + for (i = n; i <= m1; ) + { + /* the average abs value of ith row */ + switch (nrm) + { + case ONE_NORM: + temp[i] = dzasum_(&n, &lusup[xlusup_first + i], &m) / (double)n; + break; + case TWO_NORM: + temp[i] = dznrm2_(&n, &lusup[xlusup_first + i], &m) + / sqrt((double)n); + break; + case INF_NORM: + default: + k = izamax_(&n, &lusup[xlusup_first + i], &m) - 1; + temp[i] = z_abs1(&lusup[xlusup_first + i + m * k]); + break; + } + + /* drop small entries due to drop_tol */ + if (drop_rule & DROP_BASIC && temp[i] < drop_tol) + { + r++; + /* drop the current row and move the last undropped row here */ + if (r > 1) /* add to last row */ + { + /* accumulate the sum (for MILU) */ + switch (milu) + { + case SMILU_1: + case SMILU_2: + zaxpy_(&n, &one, &lusup[xlusup_first + i], &m, + &lusup[xlusup_first + m - 1], &m); + break; + case SMILU_3: + for (j = 0; j < n; j++) + lusup[xlusup_first + (m - 1) + j * m].r += + z_abs1(&lusup[xlusup_first + i + j * m]); + break; + case SILU: + default: + break; + } + zcopy_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + } /* if (r > 1) */ + else /* move to last row */ + { + zswap_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + if (milu == SMILU_3) + for (j = 0; j < n; j++) { + lusup[xlusup_first + m1 + j * m].r = + z_abs1(&lusup[xlusup_first + m1 + j * m]); + lusup[xlusup_first + m1 + j * m].i = 0.0; + } + } + lsub[xlsub_first + i] = lsub[xlsub_first + m1]; + m1--; + continue; + } /* if dropping */ + else + { + if (temp[i] > d_max) d_max = temp[i]; + if (temp[i] < d_min) d_min = temp[i]; + } + i++; + } /* for */ + + /* Secondary dropping: drop more rows according to the quota. */ + quota = ceil((double)quota / (double)n); + if (drop_rule & DROP_SECONDARY && m - r > quota) + { + register double tol = d_max; + + /* Calculate the second dropping tolerance */ + if (quota > n) + { + if (drop_rule & DROP_INTERP) /* by interpolation */ + { + d_max = 1.0 / d_max; d_min = 1.0 / d_min; + tol = 1.0 / (d_max + (d_min - d_max) * quota / (m - n - r)); + } + else /* by quick select */ + { + int len = m1 - n + 1; + dcopy_(&len, dwork, &i_1, dwork2, &i_1); + tol = dqselect(len, dwork2, quota - n); +#if 0 + register int *itemp = iwork - n; + A = temp; + for (i = n; i <= m1; i++) itemp[i] = i; + qsort(iwork, m1 - n + 1, sizeof(int), _compare_); + tol = temp[itemp[quota]]; +#endif + } + } + + for (i = n; i <= m1; ) + { + if (temp[i] <= tol) + { + register int j; + r++; + /* drop the current row and move the last undropped row here */ + if (r > 1) /* add to last row */ + { + /* accumulate the sum (for MILU) */ + switch (milu) + { + case SMILU_1: + case SMILU_2: + zaxpy_(&n, &one, &lusup[xlusup_first + i], &m, + &lusup[xlusup_first + m - 1], &m); + break; + case SMILU_3: + for (j = 0; j < n; j++) + lusup[xlusup_first + (m - 1) + j * m].r += + z_abs1(&lusup[xlusup_first + i + j * m]); + break; + case SILU: + default: + break; + } + zcopy_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + } /* if (r > 1) */ + else /* move to last row */ + { + zswap_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + if (milu == SMILU_3) + for (j = 0; j < n; j++) { + lusup[xlusup_first + m1 + j * m].r = + z_abs1(&lusup[xlusup_first + m1 + j * m]); + lusup[xlusup_first + m1 + j * m].i = 0.0; + } + } + lsub[xlsub_first + i] = lsub[xlsub_first + m1]; + m1--; + temp[i] = temp[m1]; + + continue; + } + i++; + + } /* for */ + + } /* if secondary dropping */ + + for (i = n; i < m; i++) temp[i] = 0.0; + + if (r == 0) + { + *nnzLj += m * n; + return 0; + } + + /* add dropped entries to the diagnal */ + if (milu != SILU) + { + register int j; + doublecomplex t; + double omega; + for (j = 0; j < n; j++) + { + t = lusup[xlusup_first + (m - 1) + j * m]; + if (t.r == 0.0 && t.i == 0.0) continue; + omega = SUPERLU_MIN(2.0 * (1.0 - alpha) / z_abs1(&t), 1.0); + zd_mult(&t, &t, omega); + + switch (milu) + { + case SMILU_1: + if ( !(z_eq(&t, &none)) ) { + z_add(&t, &t, &one); + zz_mult(&lusup[xlusup_first + j * inc_diag], + &lusup[xlusup_first + j * inc_diag], + &t); + } + else + { + zd_mult( + &lusup[xlusup_first + j * inc_diag], + &lusup[xlusup_first + j * inc_diag], + *fill_tol); +#ifdef DEBUG + printf("[1] ZERO PIVOT: FILL col %d.\n", first + j); + fflush(stdout); +#endif + nzp++; + } + break; + case SMILU_2: + zd_mult(&lusup[xlusup_first + j * inc_diag], + &lusup[xlusup_first + j * inc_diag], + 1.0 + z_abs1(&t)); + break; + case SMILU_3: + z_add(&t, &t, &one); + zz_mult(&lusup[xlusup_first + j * inc_diag], + &lusup[xlusup_first + j * inc_diag], + &t); + break; + case SILU: + default: + break; + } + } + if (nzp > 0) *fill_tol = -nzp; + } + + /* Remove dropped entries from the memory and fix the pointers. */ + m1 = m - r; + for (j = 1; j < n; j++) + { + register int tmp1, tmp2; + tmp1 = xlusup_first + j * m1; + tmp2 = xlusup_first + j * m; + for (i = 0; i < m1; i++) + lusup[i + tmp1] = lusup[i + tmp2]; + } + for (i = 0; i < nzlc; i++) + lusup[xlusup_first + i + n * m1] = lusup[xlusup_first + i + n * m]; + for (i = 0; i < nzlc; i++) + lsub[xlsub[last + 1] - r + i] = lsub[xlsub[last + 1] + i]; + for (i = first + 1; i <= last + 1; i++) + { + xlusup[i] -= r * (i - first); + xlsub[i] -= r; + } + if (lastc) + { + xlusup[last + 2] -= r * n; + xlsub[last + 2] -= r; + } + + *nnzLj += (m - r) * n; + return r; +} diff --git a/src/maths/SuperLU/ilu_zpanel_dfs.c b/src/maths/SuperLU/ilu_zpanel_dfs.c new file mode 100644 index 000000000..471c64751 --- /dev/null +++ b/src/maths/SuperLU/ilu_zpanel_dfs.c @@ -0,0 +1,248 @@ + +/*! @file ilu_zpanel_dfs.c + * \brief Peforms a symbolic factorization on a panel of symbols and + * record the entries with maximum absolute value in each column + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 30, 2009 + *+ */ + +#include
+ * Purpose + * ======= + * + * Performs a symbolic factorization on a panel of columns [jcol, jcol+w). + * + * A supernode representative is the last column of a supernode. + * The nonzeros in U[*,j] are segments that end at supernodal + * representatives. + * + * The routine returns one list of the supernodal representatives + * in topological order of the dfs that generates them. This list is + * a superset of the topological order of each individual column within + * the panel. + * The location of the first nonzero in each supernodal segment + * (supernodal entry location) is also returned. Each column has a + * separate list for this purpose. + * + * Two marker arrays are used for dfs: + * marker[i] == jj, if i was visited during dfs of current column jj; + * marker1[i] >= jcol, if i was visited by earlier columns in this panel; + * + * marker: A-row --> A-row/col (0/1) + * repfnz: SuperA-col --> PA-row + * parent: SuperA-col --> SuperA-col + * xplore: SuperA-col --> index to L-structure + *+ */ +void +ilu_zpanel_dfs( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + SuperMatrix *A, /* in - original matrix */ + int *perm_r, /* in */ + int *nseg, /* out */ + doublecomplex *dense, /* out */ + double *amax, /* out - max. abs. value of each column in panel */ + int *panel_lsub, /* out */ + int *segrep, /* out */ + int *repfnz, /* out */ + int *marker, /* out */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ +) +{ + + NCPformat *Astore; + doublecomplex *a; + int *asub; + int *xa_begin, *xa_end; + int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; + int k, krow, kmark, kperm; + int xdfs, maxdfs, kpar; + int jj; /* index through each column in the panel */ + int *marker1; /* marker1[jj] >= jcol if vertex jj was visited + by a previous column within this panel. */ + int *repfnz_col; /* start of each column in the panel */ + doublecomplex *dense_col; /* start of each column in the panel */ + int nextl_col; /* next available position in panel_lsub[*,jj] */ + int *xsup, *supno; + int *lsub, *xlsub; + double *amax_col; + register double tmp; + + /* Initialize pointers */ + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + marker1 = marker + m; + repfnz_col = repfnz; + dense_col = dense; + amax_col = amax; + *nseg = 0; + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + + /* For each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++) { + nextl_col = (jj - jcol) * m; + +#ifdef CHK_DFS + printf("\npanel col %d: ", jj); +#endif + + *amax_col = 0.0; + /* For each nonz in A[*,jj] do dfs */ + for (k = xa_begin[jj]; k < xa_end[jj]; k++) { + krow = asub[k]; + tmp = z_abs1(&a[k]); + if (tmp > *amax_col) *amax_col = tmp; + dense_col[krow] = a[k]; + kmark = marker[krow]; + if ( kmark == jj ) + continue; /* krow visited before, go to the next nonzero */ + + /* For each unmarked nbr krow of jj + * krow is in L: place it in structure of L[*,jj] + */ + marker[krow] = jj; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ + } + /* + * krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + else { + + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz_col[krep]; + +#ifdef CHK_DFS + printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); +#endif + if ( myfnz != EMPTY ) { /* Representative visited before */ + if ( myfnz > kperm ) repfnz_col[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz_col[krep] = kperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker[kchild]; + + if ( chmark != jj ) { /* Not reached yet */ + marker[kchild] = jj; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,j] */ + if ( chperm == EMPTY ) { + panel_lsub[nextl_col++] = kchild; + } + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + else { + + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz_col[chrep]; +#ifdef CHK_DFS + printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); +#endif + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz_col[chrep] = chperm; + } + else { + /* Cont. dfs at snode-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L) */ + parent[krep] = oldrep; + repfnz_col[krep] = chperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } /* else */ + + } /* else */ + + } /* if... */ + + } /* while xdfs < maxdfs */ + + /* krow has no more unexplored nbrs: + * Place snode-rep krep in postorder DFS, if this + * segment is seen for the first time. (Note that + * "repfnz[krep]" may change later.) + * Backtrack dfs to its parent. + */ + if ( marker1[krep] < jcol ) { + segrep[*nseg] = krep; + ++(*nseg); + marker1[krep] = jj; + } + + kpar = parent[krep]; /* Pop stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xlsub[krep + 1]; + +#ifdef CHK_DFS + printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } while ( kpar != EMPTY ); /* do-while - until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonz in A[*,jj] */ + + repfnz_col += m; /* Move to next column */ + dense_col += m; + amax_col++; + + } /* for jj ... */ + +} diff --git a/src/maths/SuperLU/ilu_zpivotL.c b/src/maths/SuperLU/ilu_zpivotL.c new file mode 100644 index 000000000..bf658b90a --- /dev/null +++ b/src/maths/SuperLU/ilu_zpivotL.c @@ -0,0 +1,274 @@ + +/*! @file ilu_zpivotL.c + * \brief Performs numerical pivoting + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 30, 2009 + *+ */ + + +#include
+ * Purpose + * ======= + * Performs the numerical pivoting on the current column of L, + * and the CDIV operation. + * + * Pivot policy: + * (1) Compute thresh = u * max_(i>=j) abs(A_ij); + * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN + * pivot row = k; + * ELSE IF abs(A_jj) >= thresh THEN + * pivot row = j; + * ELSE + * pivot row = m; + * + * Note: If you absolutely want to use a given pivot order, then set u=0.0. + * + * Return value: 0 success; + * i > 0 U(i,i) is exactly zero. + *+ */ + +int +ilu_zpivotL( + const int jcol, /* in */ + const double u, /* in - diagonal pivoting threshold */ + int *usepr, /* re-use the pivot sequence given by + * perm_r/iperm_r */ + int *perm_r, /* may be modified */ + int diagind, /* diagonal of Pc*A*Pc' */ + int *swap, /* in/out record the row permutation */ + int *iswap, /* in/out inverse of swap, it is the same as + perm_r after the factorization */ + int *marker, /* in */ + int *pivrow, /* in/out, as an input if *usepr!=0 */ + double fill_tol, /* in - fill tolerance of current column + * used for a singular column */ + milu_t milu, /* in */ + doublecomplex drop_sum, /* in - computed in ilu_zcopy_to_ucol() + (MILU only) */ + GlobalLU_t *Glu, /* modified - global LU data structures */ + SuperLUStat_t *stat /* output */ + ) +{ + + int n; /* number of columns */ + int fsupc; /* first column in the supernode */ + int nsupc; /* no of columns in the supernode */ + int nsupr; /* no of rows in the supernode */ + int lptr; /* points to the starting subscript of the supernode */ + register int pivptr; + int old_pivptr, diag, ptr0; + register double pivmax, rtemp; + double thresh; + doublecomplex temp; + doublecomplex *lu_sup_ptr; + doublecomplex *lu_col_ptr; + int *lsub_ptr; + register int isub, icol, k, itemp; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + flops_t *ops = stat->ops; + int info; + doublecomplex one = {1.0, 0.0}; + + /* Initialize pointers */ + n = Glu->n; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; + nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ + lptr = xlsub[fsupc]; + nsupr = xlsub[fsupc+1] - lptr; + lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ + lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ + lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ + + /* Determine the largest abs numerical value for partial pivoting; + Also search for user-specified pivot, and diagonal element. */ + pivmax = -1.0; + pivptr = nsupc; + diag = EMPTY; + old_pivptr = nsupc; + ptr0 = EMPTY; + for (isub = nsupc; isub < nsupr; ++isub) { + if (marker[lsub_ptr[isub]] > jcol) + continue; /* do not overlap with a later relaxed supernode */ + + switch (milu) { + case SMILU_1: + z_add(&temp, &lu_col_ptr[isub], &drop_sum); + rtemp = z_abs1(&temp); + break; + case SMILU_2: + case SMILU_3: + /* In this case, drop_sum contains the sum of the abs. value */ + rtemp = z_abs1(&lu_col_ptr[isub]); + break; + case SILU: + default: + rtemp = z_abs1(&lu_col_ptr[isub]); + break; + } + if (rtemp > pivmax) { pivmax = rtemp; pivptr = isub; } + if (*usepr && lsub_ptr[isub] == *pivrow) old_pivptr = isub; + if (lsub_ptr[isub] == diagind) diag = isub; + if (ptr0 == EMPTY) ptr0 = isub; + } + + if (milu == SMILU_2 || milu == SMILU_3) pivmax += drop_sum.r; + + /* Test for singularity */ + if (pivmax < 0.0) { + fprintf(stderr, "[0]: jcol=%d, SINGULAR!!!\n", jcol); + fflush(stderr); + exit(1); + } + if ( pivmax == 0.0 ) { + if (diag != EMPTY) + *pivrow = lsub_ptr[pivptr = diag]; + else if (ptr0 != EMPTY) + *pivrow = lsub_ptr[pivptr = ptr0]; + else { + /* look for the first row which does not + belong to any later supernodes */ + for (icol = jcol; icol < n; icol++) + if (marker[swap[icol]] <= jcol) break; + if (icol >= n) { + fprintf(stderr, "[1]: jcol=%d, SINGULAR!!!\n", jcol); + fflush(stderr); + exit(1); + } + + *pivrow = swap[icol]; + + /* pick up the pivot row */ + for (isub = nsupc; isub < nsupr; ++isub) + if ( lsub_ptr[isub] == *pivrow ) { pivptr = isub; break; } + } + pivmax = fill_tol; + lu_col_ptr[pivptr].r = pivmax; + lu_col_ptr[pivptr].i = 0.0; + *usepr = 0; +#ifdef DEBUG + printf("[0] ZERO PIVOT: FILL (%d, %d).\n", *pivrow, jcol); + fflush(stdout); +#endif + info =jcol + 1; + } /* if (*pivrow == 0.0) */ + else { + thresh = u * pivmax; + + /* Choose appropriate pivotal element by our policy. */ + if ( *usepr ) { + switch (milu) { + case SMILU_1: + z_add(&temp, &lu_col_ptr[old_pivptr], &drop_sum); + rtemp = z_abs1(&temp); + break; + case SMILU_2: + case SMILU_3: + rtemp = z_abs1(&lu_col_ptr[old_pivptr]) + drop_sum.r; + break; + case SILU: + default: + rtemp = z_abs1(&lu_col_ptr[old_pivptr]); + break; + } + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = old_pivptr; + else *usepr = 0; + } + if ( *usepr == 0 ) { + /* Use diagonal pivot? */ + if ( diag >= 0 ) { /* diagonal exists */ + switch (milu) { + case SMILU_1: + z_add(&temp, &lu_col_ptr[diag], &drop_sum); + rtemp = z_abs1(&temp); + break; + case SMILU_2: + case SMILU_3: + rtemp = z_abs1(&lu_col_ptr[diag]) + drop_sum.r; + break; + case SILU: + default: + rtemp = z_abs1(&lu_col_ptr[diag]); + break; + } + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; + } + *pivrow = lsub_ptr[pivptr]; + } + info = 0; + + /* Reset the diagonal */ + switch (milu) { + case SMILU_1: + z_add(&lu_col_ptr[pivptr], &lu_col_ptr[pivptr], &drop_sum); + break; + case SMILU_2: + case SMILU_3: + temp = z_sgn(&lu_col_ptr[pivptr]); + zz_mult(&temp, &temp, &drop_sum); + z_add(&lu_col_ptr[pivptr], &lu_col_ptr[pivptr], &drop_sum); + break; + case SILU: + default: + break; + } + + } /* else */ + + /* Record pivot row */ + perm_r[*pivrow] = jcol; + if (jcol < n - 1) { + register int t1, t2, t; + t1 = iswap[*pivrow]; t2 = jcol; + if (t1 != t2) { + t = swap[t1]; swap[t1] = swap[t2]; swap[t2] = t; + t1 = swap[t1]; t2 = t; + t = iswap[t1]; iswap[t1] = iswap[t2]; iswap[t2] = t; + } + } /* if (jcol < n - 1) */ + + /* Interchange row subscripts */ + if ( pivptr != nsupc ) { + itemp = lsub_ptr[pivptr]; + lsub_ptr[pivptr] = lsub_ptr[nsupc]; + lsub_ptr[nsupc] = itemp; + + /* Interchange numerical values as well, for the whole snode, such + * that L is indexed the same way as A. + */ + for (icol = 0; icol <= nsupc; icol++) { + itemp = pivptr + icol * nsupr; + temp = lu_sup_ptr[itemp]; + lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; + lu_sup_ptr[nsupc + icol*nsupr] = temp; + } + } /* if */ + + /* cdiv operation */ + ops[FACT] += 10 * (nsupr - nsupc); + z_div(&temp, &one, &lu_col_ptr[nsupc]); + for (k = nsupc+1; k < nsupr; k++) + zz_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp); + + return info; +} diff --git a/src/maths/SuperLU/ilu_zsnode_dfs.c b/src/maths/SuperLU/ilu_zsnode_dfs.c new file mode 100644 index 000000000..0a8975819 --- /dev/null +++ b/src/maths/SuperLU/ilu_zsnode_dfs.c @@ -0,0 +1,90 @@ + +/*! @file ilu_zsnode_dfs.c + * \brief Determines the union of row structures of columns within the relaxed node + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 30, 2009 + *+ */ + +#include
+ * Purpose + * ======= + * ilu_zsnode_dfs() - Determine the union of the row structures of those + * columns within the relaxed snode. + * Note: The relaxed snodes are leaves of the supernodal etree, therefore, + * the portion outside the rectangular supernode must be zero. + * + * Return value + * ============ + * 0 success; + * >0 number of bytes allocated when run out of memory. + *+ */ + +int +ilu_zsnode_dfs( + const int jcol, /* in - start of the supernode */ + const int kcol, /* in - end of the supernode */ + const int *asub, /* in */ + const int *xa_begin, /* in */ + const int *xa_end, /* in */ + int *marker, /* modified */ + GlobalLU_t *Glu /* modified */ + ) +{ + + register int i, k, nextl; + int nsuper, krow, kmark, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + int nzlmax; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + nsuper = ++supno[jcol]; /* Next available supernode number */ + nextl = xlsub[jcol]; + + for (i = jcol; i <= kcol; i++) + { + /* For each nonzero in A[*,i] */ + for (k = xa_begin[i]; k < xa_end[i]; k++) + { + krow = asub[k]; + kmark = marker[krow]; + if ( kmark != kcol ) + { /* First time visit krow */ + marker[krow] = kcol; + lsub[nextl++] = krow; + if ( nextl >= nzlmax ) + { + if ( (mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, + Glu)) != 0) + return (mem_error); + lsub = Glu->lsub; + } + } + } + supno[i] = nsuper; + } + + /* Supernode > 1 */ + if ( jcol < kcol ) + for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; + + xsup[nsuper+1] = kcol + 1; + supno[kcol+1] = nsuper; + xlsub[kcol+1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/izmax1.c b/src/maths/SuperLU/izmax1.c new file mode 100644 index 000000000..13272a9d6 --- /dev/null +++ b/src/maths/SuperLU/izmax1.c @@ -0,0 +1,113 @@ +/*! @file izmax1.c + * \brief Finds the index of the element whose real part has maximum absolute value + * + *
+ * -- LAPACK auxiliary routine (version 2.0) -- + * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., + * Courant Institute, Argonne National Lab, and Rice University + * October 31, 1992 + *+ */ +#include
+ Purpose + ======= + + IZMAX1 finds the index of the element whose real part has maximum + absolute value. + + Based on IZAMAX from Level 1 BLAS. + The change is to use the 'genuine' absolute value. + + Contributed by Nick Higham for use with ZLACON. + + Arguments + ========= + + N (input) INT + The number of elements in the vector CX. + + CX (input) COMPLEX*16 array, dimension (N) + The vector whose elements will be summed. + + INCX (input) INT + The spacing between successive values of CX. INCX >= 1. + + ===================================================================== ++*/ + +int +izmax1_(int *n, doublecomplex *cx, int *incx) +{ + + + /* System generated locals */ + int ret_val, i__1, i__2; + double d__1; + + /* Local variables */ + double smax; + int i, ix; + +#define CX(I) cx[(I)-1] + + ret_val = 0; + if (*n < 1) { + return ret_val; + } + ret_val = 1; + if (*n == 1) { + return ret_val; + } + if (*incx == 1) { + goto L30; + } + +/* CODE FOR INCREMENT NOT EQUAL TO 1 */ + + ix = 1; + smax = (d__1 = CX(1).r, fabs(d__1)); + ix += *incx; + i__1 = *n; + for (i = 2; i <= *n; ++i) { + i__2 = ix; + if ((d__1 = CX(ix).r, fabs(d__1)) <= smax) { + goto L10; + } + ret_val = i; + i__2 = ix; + smax = (d__1 = CX(ix).r, fabs(d__1)); +L10: + ix += *incx; +/* L20: */ + } + return ret_val; + +/* CODE FOR INCREMENT EQUAL TO 1 */ + +L30: + smax = (d__1 = CX(1).r, fabs(d__1)); + i__1 = *n; + for (i = 2; i <= *n; ++i) { + i__2 = i; + if ((d__1 = CX(i).r, fabs(d__1)) <= smax) { + goto L40; + } + ret_val = i; + i__2 = i; + smax = (d__1 = CX(i).r, fabs(d__1)); +L40: + ; + } + return ret_val; + +/* End of IZMAX1 */ + +} /* izmax1_ */ + diff --git a/src/maths/SuperLU/lsame.c b/src/maths/SuperLU/lsame.c new file mode 100644 index 000000000..cd7fa00d0 --- /dev/null +++ b/src/maths/SuperLU/lsame.c @@ -0,0 +1,83 @@ +/*! @file lsame.c + * \brief Check if CA is the same letter as CB regardless of case. + * + *
+ * -- LAPACK auxiliary routine (version 2.0) -- + * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., + * Courant Institute, Argonne National Lab, and Rice University + * September 30, 1994 + *+ */ +#include
+ Purpose + ======= + + LSAME returns .TRUE. if CA is the same letter as CB regardless of case. + + Arguments + ========= + + CA (input) CHARACTER*1 + CB (input) CHARACTER*1 + CA and CB specify the single characters to be compared. + + ===================================================================== ++*/ + +int lsame_(char *ca, char *cb) +{ + + + /* System generated locals */ + int ret_val; + + /* Local variables */ + int inta, intb, zcode; + + ret_val = *(unsigned char *)ca == *(unsigned char *)cb; + if (ret_val) { + return ret_val; + } + + /* Now test for equivalence if both characters are alphabetic. */ + + zcode = 'Z'; + + /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime + machines, on which ICHAR returns a value with bit 8 set. + ICHAR('A') on Prime machines returns 193 which is the same as + ICHAR('A') on an EBCDIC machine. */ + + inta = *(unsigned char *)ca; + intb = *(unsigned char *)cb; + + if (zcode == 90 || zcode == 122) { + /* ASCII is assumed - ZCODE is the ASCII code of either lower or + upper case 'Z'. */ + if (inta >= 97 && inta <= 122) inta += -32; + if (intb >= 97 && intb <= 122) intb += -32; + + } else if (zcode == 233 || zcode == 169) { + /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or + upper case 'Z'. */ + if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta + >= 162 && inta <= 169) + inta += 64; + if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb + >= 162 && intb <= 169) + intb += 64; + } else if (zcode == 218 || zcode == 250) { + /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code + plus 128 of either lower or upper case 'Z'. */ + if (inta >= 225 && inta <= 250) inta += -32; + if (intb >= 225 && intb <= 250) intb += -32; + } + ret_val = inta == intb; + return ret_val; + +} /* lsame_ */ diff --git a/src/maths/SuperLU/mark_relax.c b/src/maths/SuperLU/mark_relax.c new file mode 100644 index 000000000..16664dcd5 --- /dev/null +++ b/src/maths/SuperLU/mark_relax.c @@ -0,0 +1,47 @@ +/*! @file mark_relax.c + * \brief Record the rows pivoted by the relaxed supernodes. + * + *
+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 1, 2009 + * <\pre> + */ +#include+ */ +/* + * File name: zmyblas2.c + */ +#include+ +/*! \brief + * + * + * Purpose + * ======= + * mark_relax() - record the rows used by the relaxed supernodes. + *+ */ +int mark_relax( + int n, /* order of the matrix A */ + int *relax_end, /* last column in a relaxed supernode. + * if j-th column starts a relaxed supernode, + * relax_end[j] represents the last column of + * this supernode. */ + int *relax_fsupc, /* first column in a relaxed supernode. + * relax_fsupc[j] represents the first column of + * j-th supernode. */ + int *xa_begin, /* Astore->colbeg */ + int *xa_end, /* Astore->colend */ + int *asub, /* row index of A */ + int *marker /* marker[j] is the maximum column index if j-th + * row belongs to a relaxed supernode. */ ) +{ + register int jcol, kcol; + register int i, j, k; + + for (i = 0; i < n && relax_fsupc[i] != EMPTY; i++) + { + jcol = relax_fsupc[i]; /* first column */ + kcol = relax_end[jcol]; /* last column */ + for (j = jcol; j <= kcol; j++) + for (k = xa_begin[j]; k < xa_end[j]; k++) + marker[asub[k]] = jcol; + } + return i; +} diff --git a/src/maths/SuperLU/mc64ad.c b/src/maths/SuperLU/mc64ad.c new file mode 100644 index 000000000..160155529 --- /dev/null +++ b/src/maths/SuperLU/mc64ad.c @@ -0,0 +1,2641 @@ +/* mc64ad.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include+ +#define abs(a) ((a) >= 0) ? (a) : -(a) +#define min(a,b) ((a) < (b)) ? (a) : (b) + +/* Table of constant values */ + +static int_t c__1 = 1; +static int_t c__2 = 2; + +/* CCCC COPYRIGHT (c) 1999 Council for the Central Laboratory of the */ +/* CCCC Research Councils. All rights reserved. */ +/* CCCC PACKAGE MC64A/AD */ +/* CCCC AUTHORS Iain Duff (i.duff@rl.ac.uk) and Jacko Koster (jak@ii.uib.no) */ +/* CCCC LAST UPDATE 20/09/99 */ +/* CCCC */ +/* *** Conditions on external use *** */ + +/* The user shall acknowledge the contribution of this */ +/* package in any publication of material dependent upon the use of */ +/* the package. The user shall use reasonable endeavours to notify */ +/* the authors of the package of this publication. */ + +/* The user can modify this code but, at no time */ +/* shall the right or title to all or any part of this package pass */ +/* to the user. The user shall make available free of charge */ +/* to the authors for any purpose all information relating to any */ +/* alteration or addition made to this package for the purposes of */ +/* extending the capabilities or enhancing the performance of this */ +/* package. */ + +/* The user shall not pass this code directly to a third party without the */ +/* express prior consent of the authors. Users wanting to licence their */ +/* own copy of these routines should send email to hsl@aeat.co.uk */ + +/* None of the comments from the Copyright notice up to and including this */ +/* one shall be removed or altered in any way. */ +/* ********************************************************************** */ +/* Subroutine */ int_t mc64id_(int_t *icntl) +{ + int_t i__; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* Purpose */ +/* ======= */ + +/* The components of the array ICNTL control the action of MC64A/AD. */ +/* Default values for these are set in this subroutine. */ + +/* Parameters */ +/* ========== */ + + +/* Local variables */ + +/* ICNTL(1) has default value 6. */ +/* It is the output stream for error messages. If it */ +/* is negative, these messages will be suppressed. */ + +/* ICNTL(2) has default value 6. */ +/* It is the output stream for warning messages. */ +/* If it is negative, these messages are suppressed. */ + +/* ICNTL(3) has default value -1. */ +/* It is the output stream for monitoring printing. */ +/* If it is negative, these messages are suppressed. */ + +/* ICNTL(4) has default value 0. */ +/* If left at the defaut value, the incoming data is checked for */ +/* out-of-range indices and duplicates. Setting ICNTL(4) to any */ +/* other will avoid the checks but is likely to cause problems */ +/* later if out-of-range indices or duplicates are present. */ +/* The user should only set ICNTL(4) non-zero, if the data is */ +/* known to avoid these problems. */ + +/* ICNTL(5) to ICNTL(10) are not used by MC64A/AD but are set to */ +/* zero in this routine. */ +/* Initialization of the ICNTL array. */ + /* Parameter adjustments */ + --icntl; + + /* Function Body */ + icntl[1] = 6; + icntl[2] = 6; + icntl[3] = -1; + for (i__ = 4; i__ <= 10; ++i__) { + icntl[i__] = 0; +/* L10: */ + } + return 0; +} /* mc64id_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64ad_(int_t *job, int_t *n, int_t *ne, int_t * + ip, int_t *irn, double *a, int_t *num, int_t *cperm, + int_t *liw, int_t *iw, int_t *ldw, double *dw, int_t * + icntl, int_t *info) +{ + /* System generated locals */ + int_t i__1, i__2; + double d__1, d__2; + + /* Builtin functions */ + double log(double); + + /* Local variables */ + int_t i__, j, k; + double fact, rinf; + + extern /* Subroutine */ int_t mc21ad_(int_t *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *, int_t *), mc64bd_( + int_t *, int_t *, int_t *, int_t *, double *, int_t + *, int_t *, int_t *, int_t *, int_t *, int_t *, + double *), mc64rd_(int_t *, int_t *, int_t *, int_t *, + double *), mc64sd_(int_t *, int_t *, int_t *, int_t * + , double *, int_t *, int_t *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *, int_t *), mc64wd_( + int_t *, int_t *, int_t *, int_t *, double *, int_t + *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t + *, double *, double *); + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* Purpose */ +/* ======= */ + +/* This subroutine attempts to find a column permutation for an NxN */ +/* sparse matrix A = {a_ij} that makes the permuted matrix have N */ +/* entries on its diagonal. */ +/* If the matrix is structurally nonsingular, the subroutine optionally */ +/* returns a column permutation that maximizes the smallest element */ +/* on the diagonal, maximizes the sum of the diagonal entries, or */ +/* maximizes the product of the diagonal entries of the permuted matrix. */ +/* For the latter option, the subroutine also finds scaling factors */ +/* that may be used to scale the matrix so that the nonzero diagonal */ +/* entries of the permuted matrix are one in absolute value and all the */ +/* off-diagonal entries are less than or equal to one in absolute value. */ +/* The natural logarithms of the scaling factors u(i), i=1..N, for the */ +/* rows and v(j), j=1..N, for the columns are returned so that the */ +/* scaled matrix B = {b_ij} has entries b_ij = a_ij * EXP(u_i + v_j). */ + +/* Parameters */ +/* ========== */ + + +/* JOB is an INT_T variable which must be set by the user to */ +/* control the action. It is not altered by the subroutine. */ +/* Possible values for JOB are: */ +/* 1 Compute a column permutation of the matrix so that the */ +/* permuted matrix has as many entries on its diagonal as possible. */ +/* The values on the diagonal are of arbitrary size. HSL subroutine */ +/* MC21A/AD is used for this. See [1]. */ +/* 2 Compute a column permutation of the matrix so that the smallest */ +/* value on the diagonal of the permuted matrix is maximized. */ +/* See [3]. */ +/* 3 Compute a column permutation of the matrix so that the smallest */ +/* value on the diagonal of the permuted matrix is maximized. */ +/* The algorithm differs from the one used for JOB = 2 and may */ +/* have quite a different performance. See [2]. */ +/* 4 Compute a column permutation of the matrix so that the sum */ +/* of the diagonal entries of the permuted matrix is maximized. */ +/* See [3]. */ +/* 5 Compute a column permutation of the matrix so that the product */ +/* of the diagonal entries of the permuted matrix is maximized */ +/* and vectors to scale the matrix so that the nonzero diagonal */ +/* entries of the permuted matrix are one in absolute value and */ +/* all the off-diagonal entries are less than or equal to one in */ +/* absolute value. See [3]. */ +/* Restriction: 1 <= JOB <= 5. */ + +/* N is an INT_T variable which must be set by the user to the */ +/* order of the matrix A. It is not altered by the subroutine. */ +/* Restriction: N >= 1. */ + +/* NE is an INT_T variable which must be set by the user to the */ +/* number of entries in the matrix. It is not altered by the */ +/* subroutine. */ +/* Restriction: NE >= 1. */ + +/* IP is an INT_T array of length N+1. */ +/* IP(J), J=1..N, must be set by the user to the position in array IRN */ +/* of the first row index of an entry in column J. IP(N+1) must be set */ +/* to NE+1. It is not altered by the subroutine. */ + +/* IRN is an INT_T array of length NE. */ +/* IRN(K), K=1..NE, must be set by the user to hold the row indices of */ +/* the entries of the matrix. Those belonging to column J must be */ +/* stored contiguously in the positions IP(J)..IP(J+1)-1. The ordering */ +/* of the row indices within each column is unimportant. Repeated */ +/* entries are not allowed. The array IRN is not altered by the */ +/* subroutine. */ + +/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */ +/* The user must set A(K), K=1..NE, to the numerical value of the */ +/* entry that corresponds to IRN(K). */ +/* It is not used by the subroutine when JOB = 1. */ +/* It is not altered by the subroutine. */ + +/* NUM is an INT_T variable that need not be set by the user. */ +/* On successful exit, NUM will be the number of entries on the */ +/* diagonal of the permuted matrix. */ +/* If NUM < N, the matrix is structurally singular. */ + +/* CPERM is an INT_T array of length N that need not be set by the */ +/* user. On successful exit, CPERM contains the column permutation. */ +/* Column CPERM(J) of the original matrix is column J in the permuted */ +/* matrix, J=1..N. */ + +/* LIW is an INT_T variable that must be set by the user to */ +/* the dimension of array IW. It is not altered by the subroutine. */ +/* Restriction: */ +/* JOB = 1 : LIW >= 5N */ +/* JOB = 2 : LIW >= 4N */ +/* JOB = 3 : LIW >= 10N + NE */ +/* JOB = 4 : LIW >= 5N */ +/* JOB = 5 : LIW >= 5N */ + +/* IW is an INT_T array of length LIW that is used for workspace. */ + +/* LDW is an INT_T variable that must be set by the user to the */ +/* dimension of array DW. It is not altered by the subroutine. */ +/* Restriction: */ +/* JOB = 1 : LDW is not used */ +/* JOB = 2 : LDW >= N */ +/* JOB = 3 : LDW >= NE */ +/* JOB = 4 : LDW >= 2N + NE */ +/* JOB = 5 : LDW >= 3N + NE */ + +/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length LDW */ +/* that is used for workspace. If JOB = 5, on return, */ +/* DW(i) contains u_i, i=1..N, and DW(N+j) contains v_j, j=1..N. */ + +/* ICNTL is an INT_T array of length 10. Its components control the */ +/* output of MC64A/AD and must be set by the user before calling */ +/* MC64A/AD. They are not altered by the subroutine. */ + +/* ICNTL(1) must be set to specify the output stream for */ +/* error messages. If ICNTL(1) < 0, messages are suppressed. */ +/* The default value set by MC46I/ID is 6. */ + +/* ICNTL(2) must be set by the user to specify the output stream for */ +/* warning messages. If ICNTL(2) < 0, messages are suppressed. */ +/* The default value set by MC46I/ID is 6. */ + +/* ICNTL(3) must be set by the user to specify the output stream for */ +/* diagnostic messages. If ICNTL(3) < 0, messages are suppressed. */ +/* The default value set by MC46I/ID is -1. */ + +/* ICNTL(4) must be set by the user to a value other than 0 to avoid */ +/* checking of the input data. */ +/* The default value set by MC46I/ID is 0. */ + +/* INFO is an INT_T array of length 10 which need not be set by the */ +/* user. INFO(1) is set non-negative to indicate success. A negative */ +/* value is returned if an error occurred, a positive value if a */ +/* warning occurred. INFO(2) holds further information on the error. */ +/* On exit from the subroutine, INFO(1) will take one of the */ +/* following values: */ +/* 0 : successful entry (for structurally nonsingular matrix). */ +/* +1 : successful entry (for structurally singular matrix). */ +/* +2 : the returned scaling factors are large and may cause */ +/* overflow when used to scale the matrix. */ +/* (For JOB = 5 entry only.) */ +/* -1 : JOB < 1 or JOB > 5. Value of JOB held in INFO(2). */ +/* -2 : N < 1. Value of N held in INFO(2). */ +/* -3 : NE < 1. Value of NE held in INFO(2). */ +/* -4 : the defined length LIW violates the restriction on LIW. */ +/* Value of LIW required given by INFO(2). */ +/* -5 : the defined length LDW violates the restriction on LDW. */ +/* Value of LDW required given by INFO(2). */ +/* -6 : entries are found whose row indices are out of range. INFO(2) */ +/* contains the index of a column in which such an entry is found. */ +/* -7 : repeated entries are found. INFO(2) contains the index of a */ +/* column in which such entries are found. */ +/* INFO(3) to INFO(10) are not currently used and are set to zero by */ +/* the routine. */ + +/* References: */ +/* [1] I. S. Duff, (1981), */ +/* "Algorithm 575. Permutations for a zero-free diagonal", */ +/* ACM Trans. Math. Software 7(3), 387-390. */ +/* [2] I. S. Duff and J. Koster, (1998), */ +/* "The design and use of algorithms for permuting large */ +/* entries to the diagonal of sparse matrices", */ +/* SIAM J. Matrix Anal. Appl., vol. 20, no. 4, pp. 889-901. */ +/* [3] I. S. Duff and J. Koster, (1999), */ +/* "On algorithms for permuting large entries to the diagonal */ +/* of sparse matrices", */ +/* Technical Report RAL-TR-1999-030, RAL, Oxfordshire, England. */ +/* Local variables and parameters */ +/* External routines and functions */ +/* EXTERNAL FD05AD */ +/* DOUBLE PRECISION FD05AD */ +/* Intrinsic functions */ +/* Set RINF to largest positive real number (infinity) */ +/* XSL RINF = FD05AD(5) */ + /* Parameter adjustments */ + --cperm; + --ip; + --a; + --irn; + --iw; + --dw; + --icntl; + --info; + + /* Function Body */ + rinf = dlamch_("Overflow"); +/* Check value of JOB */ + if (*job < 1 || *job > 5) { + info[1] = -1; + info[2] = *job; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " because JOB = %d\n", *job); + } + goto L99; + } +/* Check value of N */ + if (*n < 1) { + info[1] = -2; + info[2] = *n; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " because N = %d\n", *job); + } + goto L99; + } +/* Check value of NE */ + if (*ne < 1) { + info[1] = -3; + info[2] = *ne; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " because NE = %d\n", *job); + } + goto L99; + } +/* Check LIW */ + if (*job == 1) { + k = *n * 5; + } + if (*job == 2) { + k = *n << 2; + } + if (*job == 3) { + k = *n * 10 + *ne; + } + if (*job == 4) { + k = *n * 5; + } + if (*job == 5) { + k = *n * 5; + } + if (*liw < k) { + info[1] = -4; + info[2] = k; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " LIW too small, must be at least %8d\n", k); + } + goto L99; + } +/* Check LDW */ +/* If JOB = 1, do not check */ + if (*job > 1) { + if (*job == 2) { + k = *n; + } + if (*job == 3) { + k = *ne; + } + if (*job == 4) { + k = (*n << 1) + *ne; + } + if (*job == 5) { + k = *n * 3 + *ne; + } + if (*ldw < k) { + info[1] = -5; + info[2] = k; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " LDW too small, must be at least %8d\n", k); + } + goto L99; + } + } + if (icntl[4] == 0) { +/* Check row indices. Use IW(1:N) as workspace */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + iw[i__] = 0; +/* L3: */ + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; +/* Check for row indices that are out of range */ + if (i__ < 1 || i__ > *n) { + info[1] = -6; + info[2] = j; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", + info[1], " Column %8d", j, + " contains an entry with invalid row index %8d\n", i__); + } + goto L99; + } +/* Check for repeated row indices within a column */ + if (iw[i__] == j) { + info[1] = -7; + info[2] = j; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " Column %8d", j, + " contains two or more entries with row index %8d\n", i__); + } + goto L99; + } else { + iw[i__] = j; + } +/* L4: */ + } +/* L6: */ + } + } +/* Print diagnostics on input */ + if (icntl[3] >= 0) { + printf(" ****** Input parameters for MC64A/AD: JOB = %8d," + " N = %d, NE = %8d\n", *job, *n, *ne); + printf(" IP(1:N+1) = "); + for (j=1; j<=(*n+1); ++j) { + printf("%8d", ip[j]); + if (j%8 == 0) printf("\n"); + } + printf("\n IRN(1:NE) = "); + for (j=1; j<=(*ne); ++j) { + printf("%8d", irn[j]); + if (j%8 == 0) printf("\n"); + } + printf("\n"); + + if (*job > 1) { + printf(" A(1:NE) = "); + for (j=1; j<=(*ne); ++j) { + printf("%f14.4", a[j]); + if (j%4 == 0) printf("\n"); + } + printf("\n"); + } + } +/* Set components of INFO to zero */ + for (i__ = 1; i__ <= 10; ++i__) { + info[i__] = 0; +/* L8: */ + } +/* Compute maximum matching with MC21A/AD */ + if (*job == 1) { +/* Put length of column J in IW(J) */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + iw[j] = ip[j + 1] - ip[j]; +/* L10: */ + } +/* IW(N+1:5N) is workspace */ +#if 0 + mc21ad_(n, &irn[1], ne, &ip[1], &iw[1], &cperm[1], num, &iw[*n+1]); +#else + printf(" ****** Warning from MC64A/AD. Need to link mc21ad.\n"); +#endif + goto L90; + } +/* Compute bottleneck matching */ + if (*job == 2) { +/* IW(1:5N), DW(1:N) are workspaces */ + mc64bd_(n, ne, &ip[1], &irn[1], &a[1], &cperm[1], num, &iw[1], &iw[*n + + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &dw[1]); + goto L90; + } +/* Compute bottleneck matching */ + if (*job == 3) { +/* Copy IRN(K) into IW(K), ABS(A(K)) into DW(K), K=1..NE */ + i__1 = *ne; + for (k = 1; k <= i__1; ++k) { + iw[k] = irn[k]; + dw[k] = (d__1 = a[k], abs(d__1)); +/* L20: */ + } +/* Sort entries in each column by decreasing value. */ + mc64rd_(n, ne, &ip[1], &iw[1], &dw[1]); +/* IW(NE+1:NE+10N) is workspace */ + mc64sd_(n, ne, &ip[1], &iw[1], &dw[1], &cperm[1], num, &iw[*ne + 1], & + iw[*ne + *n + 1], &iw[*ne + (*n << 1) + 1], &iw[*ne + *n * 3 + + 1], &iw[*ne + (*n << 2) + 1], &iw[*ne + *n * 5 + 1], &iw[* + ne + *n * 6 + 1]); + goto L90; + } + if (*job == 4) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + fact = 0.; + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + if ((d__1 = a[k], abs(d__1)) > fact) { + fact = (d__2 = a[k], abs(d__2)); + } +/* L30: */ + } + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + dw[(*n << 1) + k] = fact - (d__1 = a[k], abs(d__1)); +/* L40: */ + } +/* L50: */ + } +/* B = DW(2N+1:2N+NE); IW(1:5N) and DW(1:2N) are workspaces */ + mc64wd_(n, ne, &ip[1], &irn[1], &dw[(*n << 1) + 1], &cperm[1], num, & + iw[1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &iw[( + *n << 2) + 1], &dw[1], &dw[*n + 1]); + goto L90; + } + if (*job == 5) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + fact = 0.; + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + dw[*n * 3 + k] = (d__1 = a[k], abs(d__1)); + if (dw[*n * 3 + k] > fact) { + fact = dw[*n * 3 + k]; + } +/* L60: */ + } + dw[(*n << 1) + j] = fact; + if (fact != 0.) { + fact = log(fact); + } else { + fact = rinf / *n; + } + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + if (dw[*n * 3 + k] != 0.) { + dw[*n * 3 + k] = fact - log(dw[*n * 3 + k]); + } else { + dw[*n * 3 + k] = rinf / *n; + } +/* L70: */ + } +/* L75: */ + } +/* B = DW(3N+1:3N+NE); IW(1:5N) and DW(1:2N) are workspaces */ + mc64wd_(n, ne, &ip[1], &irn[1], &dw[*n * 3 + 1], &cperm[1], num, &iw[ + 1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &iw[(*n + << 2) + 1], &dw[1], &dw[*n + 1]); + if (*num == *n) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (dw[(*n << 1) + j] != 0.) { + dw[*n + j] -= log(dw[(*n << 1) + j]); + } else { + dw[*n + j] = 0.; + } +/* L80: */ + } + } +/* Check size of scaling factors */ + fact = log(rinf) * .5f; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (dw[j] < fact && dw[*n + j] < fact) { + goto L86; + } + info[1] = 2; + goto L90; +L86: + ; + } +/* GO TO 90 */ + } +L90: + if (info[1] == 0 && *num < *n) { +/* Matrix is structurally singular, return with warning */ + info[1] = 1; + if (icntl[2] >= 0) { + printf(" ****** Warning from MC64A/AD. INFO(1) = %2d", info[1], + " The matrix is structurally singular.\n"); + } + } + if (info[1] == 2) { +/* Scaling factors are large, return with warning */ + if (icntl[2] >= 0) { + printf(" ****** Warning from MC64A/AD. INFO(1) = %2d\n", info[1], + " Some scaling factors may be too large.\n"); + } + } +/* Print diagnostics on output */ + if (icntl[3] >= 0) { + printf(" ****** Output parameters for MC64A/AD: INFO(1:2) = %8d%8d\n", + info[1], info[2]); + printf(" NUM = ", *num); + printf(" CPERM(1:N) = "); + for (j=1; j<=*n; ++j) { + printf("%8d", cperm[j]); + if (j%8 == 0) printf("\n"); + } + if (*job == 5) { + printf("\n DW(1:N) = "); + for (j=1; j<=*n; ++j) { + printf("%11.3f", dw[j]); + if (j%5 == 0) printf("\n"); + } + printf("\n DW(N+1:2N) = "); + for (j=1; j<=*n; ++j) { + printf("%11.3f", dw[*n+j]); + if (j%5 == 0) printf("\n"); + } + printf("\n"); + } + } +/* Return from subroutine. */ +L99: + return 0; +} /* mc64ad_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64bd_(int_t *n, int_t *ne, int_t *ip, int_t * + irn, double *a, int_t *iperm, int_t *num, int_t *jperm, + int_t *pr, int_t *q, int_t *l, double *d__) +{ + /* System generated locals */ + int_t i__1, i__2, i__3; + double d__1, d__2, d__3; + + /* Local variables */ + int_t i__, j, k; + double a0; + int_t i0, q0; + double ai, di; + int_t ii, jj, kk; + double bv; + int_t up; + double dq0; + int_t kk1, kk2; + double csp; + int_t isp, jsp, low; + double dnew; + int_t jord, qlen, idum, jdum; + double rinf; + extern /* Subroutine */ int_t mc64dd_(int_t *, int_t *, int_t *, + double *, int_t *, int_t *), mc64ed_(int_t *, int_t *, + int_t *, double *, int_t *, int_t *), mc64fd_(int_t * + , int_t *, int_t *, int_t *, double *, int_t *, int_t *); + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* N, NE, IP, IRN are described in MC64A/AD. */ +/* A is a REAL (DOUBLE PRECISION in the D-version) array of length */ +/* NE. A(K), K=1..NE, must be set to the value of the entry */ +/* that corresponds to IRN(K). It is not altered. */ +/* IPERM is an INT_T array of length N. On exit, it contains the */ +/* matching: IPERM(I) = 0 or row I is matched to column IPERM(I). */ +/* NUM is INT_T variable. On exit, it contains the cardinality of the */ +/* matching stored in IPERM. */ +/* IW is an INT_T work array of length 4N. */ +/* DW is a REAL (DOUBLE PRECISION in D-version) work array of length N. */ +/* Local variables */ +/* Local parameters */ +/* Intrinsic functions */ +/* External subroutines and/or functions */ +/* EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD, DLAMCH */ +/* DOUBLE PRECISION FD05AD, DLAMCH */ +/* Set RINF to largest positive real number */ +/* XSL RINF = FD05AD(5) */ + /* Parameter adjustments */ + --d__; + --l; + --q; + --pr; + --jperm; + --iperm; + --ip; + --a; + --irn; + + /* Function Body */ + rinf = dlamch_("Overflow"); +/* Initialization */ + *num = 0; + bv = rinf; + i__1 = *n; + for (k = 1; k <= i__1; ++k) { + iperm[k] = 0; + jperm[k] = 0; + pr[k] = ip[k]; + d__[k] = 0.; +/* L10: */ + } +/* Scan columns of matrix; */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + a0 = -1.; + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; + ai = (d__1 = a[k], abs(d__1)); + if (ai > d__[i__]) { + d__[i__] = ai; + } + if (jperm[j] != 0) { + goto L30; + } + if (ai >= bv) { + a0 = bv; + if (iperm[i__] != 0) { + goto L30; + } + jperm[j] = i__; + iperm[i__] = j; + ++(*num); + } else { + if (ai <= a0) { + goto L30; + } + a0 = ai; + i0 = i__; + } +L30: + ; + } + if (a0 != -1. && a0 < bv) { + bv = a0; + if (iperm[i0] != 0) { + goto L20; + } + iperm[i0] = j; + jperm[j] = i0; + ++(*num); + } +L20: + ; + } +/* Update BV with smallest of all the largest maximum absolute values */ +/* of the rows. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Computing MIN */ + d__1 = bv, d__2 = d__[i__]; + bv = min(d__1,d__2); +/* L25: */ + } + if (*num == *n) { + goto L1000; + } +/* Rescan unassigned columns; improve initial assignment */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (jperm[j] != 0) { + goto L95; + } + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; + ai = (d__1 = a[k], abs(d__1)); + if (ai < bv) { + goto L50; + } + if (iperm[i__] == 0) { + goto L90; + } + jj = iperm[i__]; + kk1 = pr[jj]; + kk2 = ip[jj + 1] - 1; + if (kk1 > kk2) { + goto L50; + } + i__3 = kk2; + for (kk = kk1; kk <= i__3; ++kk) { + ii = irn[kk]; + if (iperm[ii] != 0) { + goto L70; + } + if ((d__1 = a[kk], abs(d__1)) >= bv) { + goto L80; + } +L70: + ; + } + pr[jj] = kk2 + 1; +L50: + ; + } + goto L95; +L80: + jperm[jj] = ii; + iperm[ii] = jj; + pr[jj] = kk + 1; +L90: + ++(*num); + jperm[j] = i__; + iperm[i__] = j; + pr[j] = k + 1; +L95: + ; + } + if (*num == *n) { + goto L1000; + } +/* Prepare for main loop */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + d__[i__] = -1.; + l[i__] = 0; +/* L99: */ + } +/* Main loop ... each pass round this loop is similar to Dijkstra's */ +/* algorithm for solving the single source shortest path problem */ + i__1 = *n; + for (jord = 1; jord <= i__1; ++jord) { + if (jperm[jord] != 0) { + goto L100; + } + qlen = 0; + low = *n + 1; + up = *n + 1; +/* CSP is cost of shortest path to any unassigned row */ +/* ISP is matrix position of unassigned row element in shortest path */ +/* JSP is column index of unassigned row element in shortest path */ + csp = -1.; +/* Build shortest path tree starting from unassigned column JORD */ + j = jord; + pr[j] = -1; +/* Scan column J */ + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; + dnew = (d__1 = a[k], abs(d__1)); + if (csp >= dnew) { + goto L115; + } + if (iperm[i__] == 0) { +/* Row I is unassigned; update shortest path info */ + csp = dnew; + isp = i__; + jsp = j; + if (csp >= bv) { + goto L160; + } + } else { + d__[i__] = dnew; + if (dnew >= bv) { +/* Add row I to Q2 */ + --low; + q[low] = i__; + } else { +/* Add row I to Q, and push it */ + ++qlen; + l[i__] = qlen; + mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__1); + } + jj = iperm[i__]; + pr[jj] = j; + } +L115: + ; + } + i__2 = *num; + for (jdum = 1; jdum <= i__2; ++jdum) { +/* If Q2 is empty, extract new rows from Q */ + if (low == up) { + if (qlen == 0) { + goto L160; + } + i__ = q[1]; + if (csp >= d__[i__]) { + goto L160; + } + bv = d__[i__]; + i__3 = *n; + for (idum = 1; idum <= i__3; ++idum) { + mc64ed_(&qlen, n, &q[1], &d__[1], &l[1], &c__1); + l[i__] = 0; + --low; + q[low] = i__; + if (qlen == 0) { + goto L153; + } + i__ = q[1]; + if (d__[i__] != bv) { + goto L153; + } +/* L152: */ + } +/* End of dummy loop; this point is never reached */ + } +/* Move row Q0 */ +L153: + --up; + q0 = q[up]; + dq0 = d__[q0]; + l[q0] = up; +/* Scan column that matches with row Q0 */ + j = iperm[q0]; + i__3 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__3; ++k) { + i__ = irn[k]; +/* Update D(I) */ + if (l[i__] >= up) { + goto L155; + } +/* Computing MIN */ + d__2 = dq0, d__3 = (d__1 = a[k], abs(d__1)); + dnew = min(d__2,d__3); + if (csp >= dnew) { + goto L155; + } + if (iperm[i__] == 0) { +/* Row I is unassigned; update shortest path info */ + csp = dnew; + isp = i__; + jsp = j; + if (csp >= bv) { + goto L160; + } + } else { + di = d__[i__]; + if (di >= bv || di >= dnew) { + goto L155; + } + d__[i__] = dnew; + if (dnew >= bv) { +/* Delete row I from Q (if necessary); add row I to Q2 */ + if (di != -1.) { + mc64fd_(&l[i__], &qlen, n, &q[1], &d__[1], &l[1], + &c__1); + } + l[i__] = 0; + --low; + q[low] = i__; + } else { +/* Add row I to Q (if necessary); push row I up Q */ + if (di == -1.) { + ++qlen; + l[i__] = qlen; + } + mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__1); + } +/* Update tree */ + jj = iperm[i__]; + pr[jj] = j; + } +L155: + ; + } +/* L150: */ + } +/* If CSP = MINONE, no augmenting path is found */ +L160: + if (csp == -1.) { + goto L190; + } +/* Update bottleneck value */ + bv = min(bv,csp); +/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */ + ++(*num); + i__ = isp; + j = jsp; + i__2 = *num + 1; + for (jdum = 1; jdum <= i__2; ++jdum) { + i0 = jperm[j]; + jperm[j] = i__; + iperm[i__] = j; + j = pr[j]; + if (j == -1) { + goto L190; + } + i__ = i0; +/* L170: */ + } +/* End of dummy loop; this point is never reached */ +L190: + i__2 = *n; + for (kk = up; kk <= i__2; ++kk) { + i__ = q[kk]; + d__[i__] = -1.; + l[i__] = 0; +/* L191: */ + } + i__2 = up - 1; + for (kk = low; kk <= i__2; ++kk) { + i__ = q[kk]; + d__[i__] = -1.; +/* L192: */ + } + i__2 = qlen; + for (kk = 1; kk <= i__2; ++kk) { + i__ = q[kk]; + d__[i__] = -1.; + l[i__] = 0; +/* L193: */ + } +L100: + ; + } +/* End of main loop */ +/* BV is bottleneck value of final matching */ + if (*num == *n) { + goto L1000; + } +/* Matrix is structurally singular, complete IPERM. */ +/* JPERM, PR are work arrays */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + jperm[j] = 0; +/* L300: */ + } + k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (iperm[i__] == 0) { + ++k; + pr[k] = i__; + } else { + j = iperm[i__]; + jperm[j] = i__; + } +/* L310: */ + } + k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (jperm[i__] != 0) { + goto L320; + } + ++k; + jdum = pr[k]; + iperm[jdum] = i__; +L320: + ; + } +L1000: + return 0; +} /* mc64bd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64dd_(int_t *i__, int_t *n, int_t *q, double + *d__, int_t *l, int_t *iway) +{ + /* System generated locals */ + int_t i__1; + + /* Local variables */ + double di; + int_t qk, pos, idum, posk; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* Variables N,Q,D,L are described in MC64B/BD */ +/* IF IWAY is equal to 1, then */ +/* node I is pushed from its current position upwards */ +/* IF IWAY is not equal to 1, then */ +/* node I is pushed from its current position downwards */ +/* Local variables and parameters */ + /* Parameter adjustments */ + --l; + --d__; + --q; + + /* Function Body */ + di = d__[*i__]; + pos = l[*i__]; +/* POS is index of current position of I in the tree */ + if (*iway == 1) { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + if (pos <= 1) { + goto L20; + } + posk = pos / 2; + qk = q[posk]; + if (di <= d__[qk]) { + goto L20; + } + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L10: */ + } +/* End of dummy loop; this point is never reached */ + } else { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + if (pos <= 1) { + goto L20; + } + posk = pos / 2; + qk = q[posk]; + if (di >= d__[qk]) { + goto L20; + } + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L15: */ + } +/* End of dummy loop; this point is never reached */ + } +/* End of dummy if; this point is never reached */ +L20: + q[pos] = *i__; + l[*i__] = pos; + return 0; +} /* mc64dd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64ed_(int_t *qlen, int_t *n, int_t *q, + double *d__, int_t *l, int_t *iway) +{ + /* System generated locals */ + int_t i__1; + + /* Local variables */ + int_t i__; + double di, dk, dr; + int_t pos, idum, posk; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */ +/* MC64W/WD (IWAY = 2) */ +/* The root node is deleted from the binary heap. */ +/* Local variables and parameters */ +/* Move last element to begin of Q */ + /* Parameter adjustments */ + --l; + --d__; + --q; + + /* Function Body */ + i__ = q[*qlen]; + di = d__[i__]; + --(*qlen); + pos = 1; + if (*iway == 1) { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + posk = pos << 1; + if (posk > *qlen) { + goto L20; + } + dk = d__[q[posk]]; + if (posk < *qlen) { + dr = d__[q[posk + 1]]; + if (dk < dr) { + ++posk; + dk = dr; + } + } + if (di >= dk) { + goto L20; + } +/* Exchange old last element with larger priority child */ + q[pos] = q[posk]; + l[q[pos]] = pos; + pos = posk; +/* L10: */ + } +/* End of dummy loop; this point is never reached */ + } else { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + posk = pos << 1; + if (posk > *qlen) { + goto L20; + } + dk = d__[q[posk]]; + if (posk < *qlen) { + dr = d__[q[posk + 1]]; + if (dk > dr) { + ++posk; + dk = dr; + } + } + if (di <= dk) { + goto L20; + } +/* Exchange old last element with smaller child */ + q[pos] = q[posk]; + l[q[pos]] = pos; + pos = posk; +/* L15: */ + } +/* End of dummy loop; this point is never reached */ + } +/* End of dummy if; this point is never reached */ +L20: + q[pos] = i__; + l[i__] = pos; + return 0; +} /* mc64ed_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64fd_(int_t *pos0, int_t *qlen, int_t *n, + int_t *q, double *d__, int_t *l, int_t *iway) +{ + /* System generated locals */ + int_t i__1; + + /* Local variables */ + int_t i__; + double di, dk, dr; + int_t qk, pos, idum, posk; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */ +/* MC64WD (IWAY = 2). */ +/* Move last element in the heap */ +/* Quick return, if possible */ + /* Parameter adjustments */ + --l; + --d__; + --q; + + /* Function Body */ + if (*qlen == *pos0) { + --(*qlen); + return 0; + } +/* Move last element from queue Q to position POS0 */ +/* POS is current position of node I in the tree */ + i__ = q[*qlen]; + di = d__[i__]; + --(*qlen); + pos = *pos0; + if (*iway == 1) { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + if (pos <= 1) { + goto L20; + } + posk = pos / 2; + qk = q[posk]; + if (di <= d__[qk]) { + goto L20; + } + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L10: */ + } +/* End of dummy loop; this point is never reached */ +L20: + q[pos] = i__; + l[i__] = pos; + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + posk = pos << 1; + if (posk > *qlen) { + goto L40; + } + dk = d__[q[posk]]; + if (posk < *qlen) { + dr = d__[q[posk + 1]]; + if (dk < dr) { + ++posk; + dk = dr; + } + } + if (di >= dk) { + goto L40; + } + qk = q[posk]; + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L30: */ + } +/* End of dummy loop; this point is never reached */ + } else { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + if (pos <= 1) { + goto L34; + } + posk = pos / 2; + qk = q[posk]; + if (di >= d__[qk]) { + goto L34; + } + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L32: */ + } +/* End of dummy loop; this point is never reached */ +L34: + q[pos] = i__; + l[i__] = pos; + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + posk = pos << 1; + if (posk > *qlen) { + goto L40; + } + dk = d__[q[posk]]; + if (posk < *qlen) { + dr = d__[q[posk + 1]]; + if (dk > dr) { + ++posk; + dk = dr; + } + } + if (di <= dk) { + goto L40; + } + qk = q[posk]; + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L36: */ + } +/* End of dummy loop; this point is never reached */ + } +/* End of dummy if; this point is never reached */ +L40: + q[pos] = i__; + l[i__] = pos; + return 0; +} /* mc64fd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64rd_(int_t *n, int_t *ne, int_t *ip, int_t * + irn, double *a) +{ + /* System generated locals */ + int_t i__1, i__2, i__3; + + /* Local variables */ + int_t j, k, r__, s; + double ha; + int_t hi, td, mid, len, ipj; + double key; + int_t last, todo[50], first; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* This subroutine sorts the entries in each column of the */ +/* sparse matrix (defined by N,NE,IP,IRN,A) by decreasing */ +/* numerical value. */ +/* Local constants */ +/* Local variables */ +/* Local arrays */ + /* Parameter adjustments */ + --ip; + --a; + --irn; + + /* Function Body */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + len = ip[j + 1] - ip[j]; + if (len <= 1) { + goto L100; + } + ipj = ip[j]; +/* Sort array roughly with partial quicksort */ + if (len < 15) { + goto L400; + } + todo[0] = ipj; + todo[1] = ipj + len; + td = 2; +L500: + first = todo[td - 2]; + last = todo[td - 1]; +/* KEY is the smallest of two values present in interval [FIRST,LAST) */ + key = a[(first + last) / 2]; + i__2 = last - 1; + for (k = first; k <= i__2; ++k) { + ha = a[k]; + if (ha == key) { + goto L475; + } + if (ha > key) { + goto L470; + } + key = ha; + goto L470; +L475: + ; + } +/* Only one value found in interval, so it is already sorted */ + td += -2; + goto L425; +/* Reorder interval [FIRST,LAST) such that entries before MID are gt KEY */ +L470: + mid = first; + i__2 = last - 1; + for (k = first; k <= i__2; ++k) { + if (a[k] <= key) { + goto L450; + } + ha = a[mid]; + a[mid] = a[k]; + a[k] = ha; + hi = irn[mid]; + irn[mid] = irn[k]; + irn[k] = hi; + ++mid; +L450: + ; + } +/* Both subintervals [FIRST,MID), [MID,LAST) are nonempty */ +/* Stack the longest of the two subintervals first */ + if (mid - first >= last - mid) { + todo[td + 1] = last; + todo[td] = mid; + todo[td - 1] = mid; +/* TODO(TD-1) = FIRST */ + } else { + todo[td + 1] = mid; + todo[td] = first; + todo[td - 1] = last; + todo[td - 2] = mid; + } + td += 2; +L425: + if (td == 0) { + goto L400; + } +/* There is still work to be done */ + if (todo[td - 1] - todo[td - 2] >= 15) { + goto L500; + } +/* Next interval is already short enough for straightforward insertion */ + td += -2; + goto L425; +/* Complete sorting with straightforward insertion */ +L400: + i__2 = ipj + len - 1; + for (r__ = ipj + 1; r__ <= i__2; ++r__) { + if (a[r__ - 1] < a[r__]) { + ha = a[r__]; + hi = irn[r__]; + a[r__] = a[r__ - 1]; + irn[r__] = irn[r__ - 1]; + i__3 = ipj + 1; + for (s = r__ - 1; s >= i__3; --s) { + if (a[s - 1] < ha) { + a[s] = a[s - 1]; + irn[s] = irn[s - 1]; + } else { + a[s] = ha; + irn[s] = hi; + goto L200; + } +/* L300: */ + } + a[ipj] = ha; + irn[ipj] = hi; + } +L200: + ; + } +L100: + ; + } + return 0; +} /* mc64rd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64sd_(int_t *n, int_t *ne, int_t *ip, int_t * + irn, double *a, int_t *iperm, int_t *numx, int_t *w, + int_t *len, int_t *lenl, int_t *lenh, int_t *fc, int_t *iw, + int_t *iw4) +{ + /* System generated locals */ + int_t i__1, i__2, i__3, i__4; + + /* Local variables */ + int_t i__, j, k, l, ii, mod, cnt, num; + double bval, bmin, bmax, rinf; + int_t nval, wlen, idum1, idum2, idum3; + extern /* Subroutine */ int_t mc64qd_(int_t *, int_t *, int_t *, + int_t *, int_t *, double *, int_t *, double *), + mc64ud_(int_t *, int_t *, int_t *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *); + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* N, NE, IP, IRN, are described in MC64A/AD. */ +/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */ +/* A(K), K=1..NE, must be set to the value of the entry that */ +/* corresponds to IRN(k). The entries in each column must be */ +/* non-negative and ordered by decreasing value. */ +/* IPERM is an INT_T array of length N. On exit, it contains the */ +/* bottleneck matching: IPERM(I) - 0 or row I is matched to column */ +/* IPERM(I). */ +/* NUMX is an INT_T variable. On exit, it contains the cardinality */ +/* of the matching stored in IPERM. */ +/* IW is an INT_T work array of length 10N. */ +/* FC is an int_t array of length N that contains the list of */ +/* unmatched columns. */ +/* LEN(J), LENL(J), LENH(J) are int_t arrays of length N that point */ +/* to entries in matrix column J. */ +/* In the matrix defined by the column parts IP(J)+LENL(J) we know */ +/* a matching does not exist; in the matrix defined by the column */ +/* parts IP(J)+LENH(J) we know one exists. */ +/* LEN(J) lies between LENL(J) and LENH(J) and determines the matrix */ +/* that is tested for a maximum matching. */ +/* W is an int_t array of length N and contains the indices of the */ +/* columns for which LENL ne LENH. */ +/* WLEN is number of indices stored in array W. */ +/* IW is int_t work array of length N. */ +/* IW4 is int_t work array of length 4N used by MC64U/UD. */ +/* EXTERNAL FD05AD,MC64QD,MC64UD */ +/* DOUBLE PRECISION FD05AD */ +/* BMIN and BMAX are such that a maximum matching exists for the input */ +/* matrix in which all entries smaller than BMIN are dropped. */ +/* For BMAX, a maximum matching does not exist. */ +/* BVAL is a value between BMIN and BMAX. */ +/* CNT is the number of calls made to MC64U/UD so far. */ +/* NUM is the cardinality of last matching found. */ +/* Set RINF to largest positive real number */ +/* XSL RINF = FD05AD(5) */ + /* Parameter adjustments */ + --iw4; + --iw; + --fc; + --lenh; + --lenl; + --len; + --w; + --iperm; + --ip; + --a; + --irn; + + /* Function Body */ + rinf = dlamch_("Overflow"); +/* Compute a first maximum matching from scratch on whole matrix. */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + fc[j] = j; + iw[j] = 0; + len[j] = ip[j + 1] - ip[j]; +/* L20: */ + } +/* The first call to MC64U/UD */ + cnt = 1; + mod = 1; + *numx = 0; + mc64ud_(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1], numx, + n, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1], &iw4[*n * 3 + 1]); +/* IW contains a maximum matching of length NUMX. */ + num = *numx; + if (num != *n) { +/* Matrix is structurally singular */ + bmax = rinf; + } else { +/* Matrix is structurally nonsingular, NUM=NUMX=N; */ +/* Set BMAX just above the smallest of all the maximum absolute */ +/* values of the columns */ + bmax = rinf; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + bval = 0.f; + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + if (a[k] > bval) { + bval = a[k]; + } +/* L25: */ + } + if (bval < bmax) { + bmax = bval; + } +/* L30: */ + } + bmax *= 1.001f; + } +/* Initialize BVAL,BMIN */ + bval = 0.f; + bmin = 0.f; +/* Initialize LENL,LEN,LENH,W,WLEN according to BMAX. */ +/* Set LEN(J), LENH(J) just after last entry in column J. */ +/* Set LENL(J) just after last entry in column J with value ge BMAX. */ + wlen = 0; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + l = ip[j + 1] - ip[j]; + lenh[j] = l; + len[j] = l; + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + if (a[k] < bmax) { + goto L46; + } +/* L45: */ + } +/* Column J is empty or all entries are ge BMAX */ + k = ip[j + 1]; +L46: + lenl[j] = k - ip[j]; +/* Add J to W if LENL(J) ne LENH(J) */ + if (lenl[j] == l) { + goto L48; + } + ++wlen; + w[wlen] = j; +L48: + ; + } +/* Main loop */ + i__1 = *ne; + for (idum1 = 1; idum1 <= i__1; ++idum1) { + if (num == *numx) { +/* We have a maximum matching in IW; store IW in IPERM */ + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + iperm[i__] = iw[i__]; +/* L50: */ + } +/* Keep going round this loop until matching IW is no longer maximum. */ + i__2 = *ne; + for (idum2 = 1; idum2 <= i__2; ++idum2) { + bmin = bval; + if (bmax == bmin) { + goto L99; + } +/* Find splitting value BVAL */ + mc64qd_(&ip[1], &lenl[1], &len[1], &w[1], &wlen, &a[1], &nval, + &bval); + if (nval <= 1) { + goto L99; + } +/* Set LEN such that all matrix entries with value lt BVAL are */ +/* discarded. Store old LEN in LENH. Do this for all columns W(K). */ +/* Each step, either K is incremented or WLEN is decremented. */ + k = 1; + i__3 = *n; + for (idum3 = 1; idum3 <= i__3; ++idum3) { + if (k > wlen) { + goto L71; + } + j = w[k]; + i__4 = ip[j] + lenl[j]; + for (ii = ip[j] + len[j] - 1; ii >= i__4; --ii) { + if (a[ii] >= bval) { + goto L60; + } + i__ = irn[ii]; + if (iw[i__] != j) { + goto L55; + } +/* Remove entry from matching */ + iw[i__] = 0; + --num; + fc[*n - num] = j; +L55: + ; + } +L60: + lenh[j] = len[j]; +/* IP(J)+LEN(J)-1 is last entry in column ge BVAL */ + len[j] = ii - ip[j] + 1; +/* If LENH(J) = LENL(J), remove J from W */ + if (lenl[j] == lenh[j]) { + w[k] = w[wlen]; + --wlen; + } else { + ++k; + } +/* L70: */ + } +L71: + if (num < *numx) { + goto L81; + } +/* L80: */ + } +/* End of dummy loop; this point is never reached */ +/* Set mode for next call to MC64U/UD */ +L81: + mod = 1; + } else { +/* We do not have a maximum matching in IW. */ + bmax = bval; +/* BMIN is the bottleneck value of a maximum matching; */ +/* for BMAX the matching is not maximum, so BMAX>BMIN */ +/* IF (BMAX .EQ. BMIN) GO TO 99 */ +/* Find splitting value BVAL */ + mc64qd_(&ip[1], &len[1], &lenh[1], &w[1], &wlen, &a[1], &nval, & + bval); + if (nval == 0 || bval == bmin) { + goto L99; + } +/* Set LEN such that all matrix entries with value ge BVAL are */ +/* inside matrix. Store old LEN in LENL. Do this for all columns W(K). */ +/* Each step, either K is incremented or WLEN is decremented. */ + k = 1; + i__2 = *n; + for (idum3 = 1; idum3 <= i__2; ++idum3) { + if (k > wlen) { + goto L88; + } + j = w[k]; + i__3 = ip[j] + lenh[j] - 1; + for (ii = ip[j] + len[j]; ii <= i__3; ++ii) { + if (a[ii] < bval) { + goto L86; + } +/* L85: */ + } +L86: + lenl[j] = len[j]; + len[j] = ii - ip[j]; + if (lenl[j] == lenh[j]) { + w[k] = w[wlen]; + --wlen; + } else { + ++k; + } +/* L87: */ + } +/* End of dummy loop; this point is never reached */ +/* Set mode for next call to MC64U/UD */ +L88: + mod = 0; + } + ++cnt; + mc64ud_(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1], & + num, numx, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1], &iw4[* + n * 3 + 1]); +/* IW contains maximum matching of length NUM */ +/* L90: */ + } +/* End of dummy loop; this point is never reached */ +/* BMIN is bottleneck value of final matching */ +L99: + if (*numx == *n) { + goto L1000; + } +/* The matrix is structurally singular, complete IPERM */ +/* W, IW are work arrays */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + w[j] = 0; +/* L300: */ + } + k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (iperm[i__] == 0) { + ++k; + iw[k] = i__; + } else { + j = iperm[i__]; + w[j] = i__; + } +/* L310: */ + } + k = 0; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (w[j] != 0) { + goto L320; + } + ++k; + idum1 = iw[k]; + iperm[idum1] = j; +L320: + ; + } +L1000: + return 0; +} /* mc64sd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64qd_(int_t *ip, int_t *lenl, int_t *lenh, + int_t *w, int_t *wlen, double *a, int_t *nval, double * + val) +{ + /* System generated locals */ + int_t i__1, i__2, i__3; + + /* Local variables */ + int_t j, k, s; + double ha; + int_t ii, pos; + double split[10]; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* This routine searches for at most XX different numerical values */ +/* in the columns W(1:WLEN). XX>=2. */ +/* Each column J is scanned between IP(J)+LENL(J) and IP(J)+LENH(J)-1 */ +/* until XX values are found or all columns have been considered. */ +/* On output, NVAL is the number of different values that is found */ +/* and SPLIT(1:NVAL) contains the values in decreasing order. */ +/* If NVAL > 0, the routine returns VAL = SPLIT((NVAL+1)/2). */ + +/* Scan columns in W(1:WLEN). For each encountered value, if value not */ +/* already present in SPLIT(1:NVAL), insert value such that SPLIT */ +/* remains sorted by decreasing value. */ +/* The sorting is done by straightforward insertion; therefore the use */ +/* of this routine should be avoided for large XX (XX < 20). */ + /* Parameter adjustments */ + --a; + --w; + --lenh; + --lenl; + --ip; + + /* Function Body */ + *nval = 0; + i__1 = *wlen; + for (k = 1; k <= i__1; ++k) { + j = w[k]; + i__2 = ip[j] + lenh[j] - 1; + for (ii = ip[j] + lenl[j]; ii <= i__2; ++ii) { + ha = a[ii]; + if (*nval == 0) { + split[0] = ha; + *nval = 1; + } else { +/* Check presence of HA in SPLIT */ + for (s = *nval; s >= 1; --s) { + if (split[s - 1] == ha) { + goto L15; + } + if (split[s - 1] > ha) { + pos = s + 1; + goto L21; + } +/* L20: */ + } + pos = 1; +/* The insertion */ +L21: + i__3 = pos; + for (s = *nval; s >= i__3; --s) { + split[s] = split[s - 1]; +/* L22: */ + } + split[pos - 1] = ha; + ++(*nval); + } +/* Exit loop if XX values are found */ + if (*nval == 10) { + goto L11; + } +L15: + ; + } +/* L10: */ + } +/* Determine VAL */ +L11: + if (*nval > 0) { + *val = split[(*nval + 1) / 2 - 1]; + } + return 0; +} /* mc64qd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64ud_(int_t *id, int_t *mod, int_t *n, int_t * + irn, int_t *lirn, int_t *ip, int_t *lenc, int_t *fc, int_t * + iperm, int_t *num, int_t *numx, int_t *pr, int_t *arp, + int_t *cv, int_t *out) +{ + /* System generated locals */ + int_t i__1, i__2, i__3, i__4; + + /* Local variables */ + int_t i__, j, k, j1, ii, kk, id0, id1, in1, in2, nfc, num0, num1, num2, + jord, last; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* PR(J) is the previous column to J in the depth first search. */ +/* Array PR is used as workspace in the sorting algorithm. */ +/* Elements (I,IPERM(I)) I=1,..,N are entries at the end of the */ +/* algorithm unless N assignments have not been made in which case */ +/* N-NUM pairs (I,IPERM(I)) will not be entries in the matrix. */ +/* CV(I) is the most recent loop number (ID+JORD) at which row I */ +/* was visited. */ +/* ARP(J) is the number of entries in column J which have been scanned */ +/* when looking for a cheap assignment. */ +/* OUT(J) is one less than the number of entries in column J which have */ +/* not been scanned during one pass through the main loop. */ +/* NUMX is maximum possible size of matching. */ + /* Parameter adjustments */ + --out; + --cv; + --arp; + --pr; + --iperm; + --fc; + --lenc; + --ip; + --irn; + + /* Function Body */ + if (*id == 1) { +/* The first call to MC64U/UD. */ +/* Initialize CV and ARP; parameters MOD, NUMX are not accessed */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + cv[i__] = 0; + arp[i__] = 0; +/* L5: */ + } + num1 = *n; + num2 = *n; + } else { +/* Not the first call to MC64U/UD. */ +/* Re-initialize ARP if entries were deleted since last call to MC64U/UD */ + if (*mod == 1) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + arp[i__] = 0; +/* L8: */ + } + } + num1 = *numx; + num2 = *n - *numx; + } + num0 = *num; +/* NUM0 is size of input matching */ +/* NUM1 is maximum possible size of matching */ +/* NUM2 is maximum allowed number of unassigned rows/columns */ +/* NUM is size of current matching */ +/* Quick return if possible */ +/* IF (NUM.EQ.N) GO TO 199 */ +/* NFC is number of rows/columns that could not be assigned */ + nfc = 0; +/* Integers ID0+1 to ID0+N are unique numbers for call ID to MC64U/UD, */ +/* so 1st call uses 1..N, 2nd call uses N+1..2N, etc */ + id0 = (*id - 1) * *n; +/* Main loop. Each pass round this loop either results in a new */ +/* assignment or gives a column with no assignment */ + i__1 = *n; + for (jord = num0 + 1; jord <= i__1; ++jord) { +/* Each pass uses unique number ID1 */ + id1 = id0 + jord; +/* J is unmatched column */ + j = fc[jord - num0]; + pr[j] = -1; + i__2 = jord; + for (k = 1; k <= i__2; ++k) { +/* Look for a cheap assignment */ + if (arp[j] >= lenc[j]) { + goto L30; + } + in1 = ip[j] + arp[j]; + in2 = ip[j] + lenc[j] - 1; + i__3 = in2; + for (ii = in1; ii <= i__3; ++ii) { + i__ = irn[ii]; + if (iperm[i__] == 0) { + goto L80; + } +/* L20: */ + } +/* No cheap assignment in row */ + arp[j] = lenc[j]; +/* Begin looking for assignment chain starting with row J */ +L30: + out[j] = lenc[j] - 1; +/* Inner loop. Extends chain by one or backtracks */ + i__3 = jord; + for (kk = 1; kk <= i__3; ++kk) { + in1 = out[j]; + if (in1 < 0) { + goto L50; + } + in2 = ip[j] + lenc[j] - 1; + in1 = in2 - in1; +/* Forward scan */ + i__4 = in2; + for (ii = in1; ii <= i__4; ++ii) { + i__ = irn[ii]; + if (cv[i__] == id1) { + goto L40; + } +/* Column J has not yet been accessed during this pass */ + j1 = j; + j = iperm[i__]; + cv[i__] = id1; + pr[j] = j1; + out[j1] = in2 - ii - 1; + goto L70; +L40: + ; + } +/* Backtracking step. */ +L50: + j1 = pr[j]; + if (j1 == -1) { +/* No augmenting path exists for column J. */ + ++nfc; + fc[nfc] = j; + if (nfc > num2) { +/* A matching of maximum size NUM1 is not possible */ + last = jord; + goto L101; + } + goto L100; + } + j = j1; +/* L60: */ + } +/* End of dummy loop; this point is never reached */ +L70: + ; + } +/* End of dummy loop; this point is never reached */ +/* New assignment is made. */ +L80: + iperm[i__] = j; + arp[j] = ii - ip[j] + 1; + ++(*num); + i__2 = jord; + for (k = 1; k <= i__2; ++k) { + j = pr[j]; + if (j == -1) { + goto L95; + } + ii = ip[j] + lenc[j] - out[j] - 2; + i__ = irn[ii]; + iperm[i__] = j; +/* L90: */ + } +/* End of dummy loop; this point is never reached */ +L95: + if (*num == num1) { +/* A matching of maximum size NUM1 is found */ + last = jord; + goto L101; + } + +L100: + ; + } +/* All unassigned columns have been considered */ + last = *n; +/* Now, a transversal is computed or is not possible. */ +/* Complete FC before returning. */ +L101: + i__1 = *n; + for (jord = last + 1; jord <= i__1; ++jord) { + ++nfc; + fc[nfc] = fc[jord - num0]; +/* L110: */ + } +/* 199 RETURN */ + return 0; +} /* mc64ud_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64wd_(int_t *n, int_t *ne, int_t *ip, int_t * + irn, double *a, int_t *iperm, int_t *num, int_t *jperm, + int_t *out, int_t *pr, int_t *q, int_t *l, double *u, + double *d__) +{ + /* System generated locals */ + int_t i__1, i__2, i__3; + + /* Local variables */ + int_t i__, j, k, i0, k0, k1, k2, q0; + double di; + int_t ii, jj, kk; + double vj; + int_t up; + double dq0; + int_t kk1, kk2; + double csp; + int_t isp, jsp, low; + double dmin__, dnew; + int_t jord, qlen, jdum; + double rinf; + extern /* Subroutine */ int_t mc64dd_(int_t *, int_t *, int_t *, + double *, int_t *, int_t *), mc64ed_(int_t *, int_t *, + int_t *, double *, int_t *, int_t *), mc64fd_(int_t * + , int_t *, int_t *, int_t *, double *, int_t *, + int_t *); + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* N, NE, IP, IRN are described in MC64A/AD. */ +/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */ +/* A(K), K=1..NE, must be set to the value of the entry that */ +/* corresponds to IRN(K). It is not altered. */ +/* All values A(K) must be non-negative. */ +/* IPERM is an INT_T array of length N. On exit, it contains the */ +/* weighted matching: IPERM(I) = 0 or row I is matched to column */ +/* IPERM(I). */ +/* NUM is an INT_T variable. On exit, it contains the cardinality of */ +/* the matching stored in IPERM. */ +/* IW is an INT_T work array of length 5N. */ +/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length 2N. */ +/* On exit, U = D(1:N) contains the dual row variable and */ +/* V = D(N+1:2N) contains the dual column variable. If the matrix */ +/* is structurally nonsingular (NUM = N), the following holds: */ +/* U(I)+V(J) <= A(I,J) if IPERM(I) |= J */ +/* U(I)+V(J) = A(I,J) if IPERM(I) = J */ +/* U(I) = 0 if IPERM(I) = 0 */ +/* V(J) = 0 if there is no I for which IPERM(I) = J */ +/* Local variables */ +/* Local parameters */ +/* External subroutines and/or functions */ +/* EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD */ +/* DOUBLE PRECISION FD05AD */ +/* Set RINF to largest positive real number */ +/* XSL RINF = FD05AD(5) */ + /* Parameter adjustments */ + --d__; + --u; + --l; + --q; + --pr; + --out; + --jperm; + --iperm; + --ip; + --a; + --irn; + + /* Function Body */ + rinf = dlamch_("Overflow"); +/* Initialization */ + *num = 0; + i__1 = *n; + for (k = 1; k <= i__1; ++k) { + u[k] = rinf; + d__[k] = 0.; + iperm[k] = 0; + jperm[k] = 0; + pr[k] = ip[k]; + l[k] = 0; +/* L10: */ + } +/* Initialize U(I) */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; + if (a[k] > u[i__]) { + goto L20; + } + u[i__] = a[k]; + iperm[i__] = j; + l[i__] = k; +L20: + ; + } +/* L30: */ + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + j = iperm[i__]; + if (j == 0) { + goto L40; + } +/* Row I is not empty */ + iperm[i__] = 0; + if (jperm[j] != 0) { + goto L40; + } +/* Assignment of column J to row I */ + ++(*num); + iperm[i__] = j; + jperm[j] = l[i__]; +L40: + ; + } + if (*num == *n) { + goto L1000; + } +/* Scan unassigned columns; improve assignment */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { +/* JPERM(J) ne 0 iff column J is already assigned */ + if (jperm[j] != 0) { + goto L95; + } + k1 = ip[j]; + k2 = ip[j + 1] - 1; +/* Continue only if column J is not empty */ + if (k1 > k2) { + goto L95; + } + vj = rinf; + i__2 = k2; + for (k = k1; k <= i__2; ++k) { + i__ = irn[k]; + di = a[k] - u[i__]; + if (di > vj) { + goto L50; + } + if (di < vj || di == rinf) { + goto L55; + } + if (iperm[i__] != 0 || iperm[i0] == 0) { + goto L50; + } +L55: + vj = di; + i0 = i__; + k0 = k; +L50: + ; + } + d__[j] = vj; + k = k0; + i__ = i0; + if (iperm[i__] == 0) { + goto L90; + } + i__2 = k2; + for (k = k0; k <= i__2; ++k) { + i__ = irn[k]; + if (a[k] - u[i__] > vj) { + goto L60; + } + jj = iperm[i__]; +/* Scan remaining part of assigned column JJ */ + kk1 = pr[jj]; + kk2 = ip[jj + 1] - 1; + if (kk1 > kk2) { + goto L60; + } + i__3 = kk2; + for (kk = kk1; kk <= i__3; ++kk) { + ii = irn[kk]; + if (iperm[ii] > 0) { + goto L70; + } + if (a[kk] - u[ii] <= d__[jj]) { + goto L80; + } +L70: + ; + } + pr[jj] = kk2 + 1; +L60: + ; + } + goto L95; +L80: + jperm[jj] = kk; + iperm[ii] = jj; + pr[jj] = kk + 1; +L90: + ++(*num); + jperm[j] = k; + iperm[i__] = j; + pr[j] = k + 1; +L95: + ; + } + if (*num == *n) { + goto L1000; + } +/* Prepare for main loop */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + d__[i__] = rinf; + l[i__] = 0; +/* L99: */ + } +/* Main loop ... each pass round this loop is similar to Dijkstra's */ +/* algorithm for solving the single source shortest path problem */ + i__1 = *n; + for (jord = 1; jord <= i__1; ++jord) { + if (jperm[jord] != 0) { + goto L100; + } +/* JORD is next unmatched column */ +/* DMIN is the length of shortest path in the tree */ + dmin__ = rinf; + qlen = 0; + low = *n + 1; + up = *n + 1; +/* CSP is the cost of the shortest augmenting path to unassigned row */ +/* IRN(ISP). The corresponding column index is JSP. */ + csp = rinf; +/* Build shortest path tree starting from unassigned column (root) JORD */ + j = jord; + pr[j] = -1; +/* Scan column J */ + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; + dnew = a[k] - u[i__]; + if (dnew >= csp) { + goto L115; + } + if (iperm[i__] == 0) { + csp = dnew; + isp = k; + jsp = j; + } else { + if (dnew < dmin__) { + dmin__ = dnew; + } + d__[i__] = dnew; + ++qlen; + q[qlen] = k; + } +L115: + ; + } +/* Initialize heap Q and Q2 with rows held in Q(1:QLEN) */ + q0 = qlen; + qlen = 0; + i__2 = q0; + for (kk = 1; kk <= i__2; ++kk) { + k = q[kk]; + i__ = irn[k]; + if (csp <= d__[i__]) { + d__[i__] = rinf; + goto L120; + } + if (d__[i__] <= dmin__) { + --low; + q[low] = i__; + l[i__] = low; + } else { + ++qlen; + l[i__] = qlen; + mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__2); + } +/* Update tree */ + jj = iperm[i__]; + out[jj] = k; + pr[jj] = j; +L120: + ; + } + i__2 = *num; + for (jdum = 1; jdum <= i__2; ++jdum) { +/* If Q2 is empty, extract rows from Q */ + if (low == up) { + if (qlen == 0) { + goto L160; + } + i__ = q[1]; + if (d__[i__] >= csp) { + goto L160; + } + dmin__ = d__[i__]; +L152: + mc64ed_(&qlen, n, &q[1], &d__[1], &l[1], &c__2); + --low; + q[low] = i__; + l[i__] = low; + if (qlen == 0) { + goto L153; + } + i__ = q[1]; + if (d__[i__] > dmin__) { + goto L153; + } + goto L152; + } +/* Q0 is row whose distance D(Q0) to the root is smallest */ +L153: + q0 = q[up - 1]; + dq0 = d__[q0]; +/* Exit loop if path to Q0 is longer than the shortest augmenting path */ + if (dq0 >= csp) { + goto L160; + } + --up; +/* Scan column that matches with row Q0 */ + j = iperm[q0]; + vj = dq0 - a[jperm[j]] + u[q0]; + i__3 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__3; ++k) { + i__ = irn[k]; + if (l[i__] >= up) { + goto L155; + } +/* DNEW is new cost */ + dnew = vj + a[k] - u[i__]; +/* Do not update D(I) if DNEW ge cost of shortest path */ + if (dnew >= csp) { + goto L155; + } + if (iperm[i__] == 0) { +/* Row I is unmatched; update shortest path info */ + csp = dnew; + isp = k; + jsp = j; + } else { +/* Row I is matched; do not update D(I) if DNEW is larger */ + di = d__[i__]; + if (di <= dnew) { + goto L155; + } + if (l[i__] >= low) { + goto L155; + } + d__[i__] = dnew; + if (dnew <= dmin__) { + if (l[i__] != 0) { + mc64fd_(&l[i__], &qlen, n, &q[1], &d__[1], &l[1], + &c__2); + } + --low; + q[low] = i__; + l[i__] = low; + } else { + if (l[i__] == 0) { + ++qlen; + l[i__] = qlen; + } + mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__2); + } +/* Update tree */ + jj = iperm[i__]; + out[jj] = k; + pr[jj] = j; + } +L155: + ; + } +/* L150: */ + } +/* If CSP = RINF, no augmenting path is found */ +L160: + if (csp == rinf) { + goto L190; + } +/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */ + ++(*num); + i__ = irn[isp]; + iperm[i__] = jsp; + jperm[jsp] = isp; + j = jsp; + i__2 = *num; + for (jdum = 1; jdum <= i__2; ++jdum) { + jj = pr[j]; + if (jj == -1) { + goto L180; + } + k = out[j]; + i__ = irn[k]; + iperm[i__] = jj; + jperm[jj] = k; + j = jj; +/* L170: */ + } +/* End of dummy loop; this point is never reached */ +/* Update U for rows in Q(UP:N) */ +L180: + i__2 = *n; + for (kk = up; kk <= i__2; ++kk) { + i__ = q[kk]; + u[i__] = u[i__] + d__[i__] - csp; +/* L185: */ + } +L190: + i__2 = *n; + for (kk = low; kk <= i__2; ++kk) { + i__ = q[kk]; + d__[i__] = rinf; + l[i__] = 0; +/* L191: */ + } + i__2 = qlen; + for (kk = 1; kk <= i__2; ++kk) { + i__ = q[kk]; + d__[i__] = rinf; + l[i__] = 0; +/* L193: */ + } +L100: + ; + } +/* End of main loop */ +/* Set dual column variable in D(1:N) */ +L1000: + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + k = jperm[j]; + if (k != 0) { + d__[j] = a[k] - u[irn[k]]; + } else { + d__[j] = 0.; + } + if (iperm[j] == 0) { + u[j] = 0.; + } +/* L200: */ + } + if (*num == *n) { + goto L1100; + } +/* The matrix is structurally singular, complete IPERM. */ +/* JPERM, OUT are work arrays */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + jperm[j] = 0; +/* L300: */ + } + k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (iperm[i__] == 0) { + ++k; + out[k] = i__; + } else { + j = iperm[i__]; + jperm[j] = i__; + } +/* L310: */ + } + k = 0; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (jperm[j] != 0) { + goto L320; + } + ++k; + jdum = out[k]; + iperm[jdum] = j; +L320: + ; + } +L1100: + return 0; +} /* mc64wd_ */ + + diff --git a/src/maths/SuperLU/memory.c b/src/maths/SuperLU/memory.c new file mode 100644 index 000000000..f7fdccf44 --- /dev/null +++ b/src/maths/SuperLU/memory.c @@ -0,0 +1,210 @@ +/*! @file memory.c + * \brief Precision-independent memory-related routines + * + * + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + *+ */ +/** Precision-independent memory-related routines. + (Shared by [sdcz]memory.c) **/ + +#include+ + +#if ( DEBUGlevel>=1 ) /* Debug malloc/free. */ +int superlu_malloc_total = 0; + +#define PAD_FACTOR 2 +#define DWORD (sizeof(double)) /* Be sure it's no smaller than double. */ +/* size_t is usually defined as 'unsigned long' */ + +void *superlu_malloc(size_t size) +{ + char *buf; + + buf = (char *) malloc(size + DWORD); + if ( !buf ) { + printf("superlu_malloc fails: malloc_total %.0f MB, size %ld\n", + superlu_malloc_total*1e-6, size); + ABORT_SuperLU("superlu_malloc: out of memory"); + } + + ((int_t *) buf)[0] = size; +#if 0 + superlu_malloc_total += size + DWORD; +#else + superlu_malloc_total += size; +#endif + return (void *) (buf + DWORD); +} + +void superlu_free(void *addr) +{ + char *p = ((char *) addr) - DWORD; + + if ( !addr ) + ABORT_SuperLU("superlu_free: tried to free NULL pointer"); + + if ( !p ) + ABORT_SuperLU("superlu_free: tried to free NULL+DWORD pointer"); + + { + int_t n = ((int_t *) p)[0]; + + if ( !n ) + ABORT_SuperLU("superlu_free: tried to free a freed pointer"); + *((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */ +#if 0 + superlu_malloc_total -= (n + DWORD); +#else + superlu_malloc_total -= n; +#endif + + if ( superlu_malloc_total < 0 ) + ABORT_SuperLU("superlu_malloc_total went negative!"); + + /*free (addr);*/ + free (p); + } + +} + +#else /* production mode */ + +void *superlu_malloc(size_t size) +{ + void *buf; + buf = (void *) malloc(size); + return (buf); +} + +void superlu_free(void *addr) +{ + free (addr); +} + +#endif + + +/*! \brief Set up pointers for integer working arrays. + */ +void +SetIWork(int m, int n, int panel_size, int *iworkptr, int **segrep, + int **parent, int **xplore, int **repfnz, int **panel_lsub, + int **xprune, int **marker) +{ + *segrep = iworkptr; + *parent = iworkptr + m; + *xplore = *parent + m; + *repfnz = *xplore + m; + *panel_lsub = *repfnz + panel_size * m; + *xprune = *panel_lsub + panel_size * m; + *marker = *xprune + n; + ifill (*repfnz, m * panel_size, EMPTY); + ifill (*panel_lsub, m * panel_size, EMPTY); +} + + +void +copy_mem_int(int howmany, void *old, void *new) +{ + register int i; + int *iold = old; + int *inew = new; + for (i = 0; i < howmany; i++) inew[i] = iold[i]; +} + + +void +user_bcopy(char *src, char *dest, int bytes) +{ + char *s_ptr, *d_ptr; + + s_ptr = src + bytes - 1; + d_ptr = dest + bytes - 1; + for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr; +} + + + +int *intMalloc(int n) +{ + int *buf; + buf = (int *) SUPERLU_MALLOC((size_t) n * sizeof(int)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC fails for buf in intMalloc()"); + } + return (buf); +} + +int *intCalloc(int n) +{ + int *buf; + register int i; + buf = (int *) SUPERLU_MALLOC(n * sizeof(int)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC fails for buf in intCalloc()"); + } + for (i = 0; i < n; ++i) buf[i] = 0; + return (buf); +} + + + +#if 0 +check_expanders() +{ + int p; + printf("Check expanders:\n"); + for (p = 0; p < NO_MEMTYPE; p++) { + printf("type %d, size %d, mem %d\n", + p, expanders[p].size, (int)expanders[p].mem); + } + + return 0; +} + + +StackInfo() +{ + printf("Stack: size %d, used %d, top1 %d, top2 %d\n", + stack.size, stack.used, stack.top1, stack.top2); + return 0; +} + + + +PrintStack(char *msg, GlobalLU_t *Glu) +{ + int i; + int *xlsub, *lsub, *xusub, *usub; + + xlsub = Glu->xlsub; + lsub = Glu->lsub; + xusub = Glu->xusub; + usub = Glu->usub; + + printf("%s\n", msg); + +/* printf("\nUCOL: "); + for (i = 0; i < xusub[ndim]; ++i) + printf("%f ", ucol[i]); + + printf("\nLSUB: "); + for (i = 0; i < xlsub[ndim]; ++i) + printf("%d ", lsub[i]); + + printf("\nUSUB: "); + for (i = 0; i < xusub[ndim]; ++i) + printf("%d ", usub[i]); + + printf("\n");*/ + return 0; +} +#endif + + + diff --git a/src/maths/SuperLU/mmd.c b/src/maths/SuperLU/mmd.c new file mode 100644 index 000000000..05f26ce09 --- /dev/null +++ b/src/maths/SuperLU/mmd.c @@ -0,0 +1,1012 @@ + +typedef int shortint; + +/* *************************************************************** */ +/* *************************************************************** */ +/* **** GENMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE **** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE IMPLEMENTS THE MINIMUM DEGREE */ +/* ALGORITHM. IT MAKES USE OF THE IMPLICIT REPRESENTATION */ +/* OF ELIMINATION GRAPHS BY QUOTIENT GRAPHS, AND THE */ +/* NOTION OF INDISTINGUISHABLE NODES. IT ALSO IMPLEMENTS */ +/* THE MODIFICATIONS BY MULTIPLE ELIMINATION AND MINIMUM */ +/* EXTERNAL DEGREE. */ +/* --------------------------------------------- */ +/* CAUTION - THE ADJACENCY VECTOR ADJNCY WILL BE */ +/* DESTROYED. */ +/* --------------------------------------------- */ + +/* INPUT PARAMETERS - */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. */ +/* DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */ +/* MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER */ +/* (ANY SMALLER ESTIMATE WILL DO) FOR MARKING */ +/* NODES. */ + +/* OUTPUT PARAMETERS - */ +/* PERM - THE MINIMUM DEGREE ORDERING. */ +/* INVP - THE INVERSE OF PERM. */ +/* NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO */ +/* SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. */ + +/* WORKING PARAMETERS - */ +/* DHEAD - VECTOR FOR HEAD OF DEGREE LISTS. */ +/* INVP - USED TEMPORARILY FOR DEGREE FORWARD LINK. */ +/* PERM - USED TEMPORARILY FOR DEGREE BACKWARD LINK. */ +/* QSIZE - VECTOR FOR SIZE OF SUPERNODES. */ +/* LLIST - VECTOR FOR TEMPORARY LINKED LISTS. */ +/* MARKER - A TEMPORARY MARKER VECTOR. */ + +/* PROGRAM SUBROUTINES - */ +/* MMDELM, MMDINT, MMDNUM, MMDUPD. */ + +/* *************************************************************** */ + +/* Subroutine */ int genmmd_(int *neqns, int *xadj, shortint *adjncy, + shortint *invp, shortint *perm, int *delta, shortint *dhead, + shortint *qsize, shortint *llist, shortint *marker, int *maxint, + int *nofsub) +{ + /* System generated locals */ + int i__1; + + /* Local variables */ + static int mdeg, ehead, i, mdlmt, mdnode; + extern /* Subroutine */ int mmdelm_(int *, int *, shortint *, + shortint *, shortint *, shortint *, shortint *, shortint *, + shortint *, int *, int *), mmdupd_(int *, int *, + int *, shortint *, int *, int *, shortint *, shortint + *, shortint *, shortint *, shortint *, shortint *, int *, + int *), mmdint_(int *, int *, shortint *, shortint *, + shortint *, shortint *, shortint *, shortint *, shortint *), + mmdnum_(int *, shortint *, shortint *, shortint *); + static int nextmd, tag, num; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dhead; + --perm; + --invp; + --adjncy; + --xadj; + + /* Function Body */ + if (*neqns <= 0) { + return 0; + } + +/* ------------------------------------------------ */ +/* INITIALIZATION FOR THE MINIMUM DEGREE ALGORITHM. */ +/* ------------------------------------------------ */ + *nofsub = 0; + mmdint_(neqns, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], & + qsize[1], &llist[1], &marker[1]); + +/* ---------------------------------------------- */ +/* NUM COUNTS THE NUMBER OF ORDERED NODES PLUS 1. */ +/* ---------------------------------------------- */ + num = 1; + +/* ----------------------------- */ +/* ELIMINATE ALL ISOLATED NODES. */ +/* ----------------------------- */ + nextmd = dhead[1]; +L100: + if (nextmd <= 0) { + goto L200; + } + mdnode = nextmd; + nextmd = invp[mdnode]; + marker[mdnode] = *maxint; + invp[mdnode] = -num; + ++num; + goto L100; + +L200: +/* ---------------------------------------- */ +/* SEARCH FOR NODE OF THE MINIMUM DEGREE. */ +/* MDEG IS THE CURRENT MINIMUM DEGREE; */ +/* TAG IS USED TO FACILITATE MARKING NODES. */ +/* ---------------------------------------- */ + if (num > *neqns) { + goto L1000; + } + tag = 1; + dhead[1] = 0; + mdeg = 2; +L300: + if (dhead[mdeg] > 0) { + goto L400; + } + ++mdeg; + goto L300; +L400: +/* ------------------------------------------------- */ +/* USE VALUE OF DELTA TO SET UP MDLMT, WHICH GOVERNS */ +/* WHEN A DEGREE UPDATE IS TO BE PERFORMED. */ +/* ------------------------------------------------- */ + mdlmt = mdeg + *delta; + ehead = 0; + +L500: + mdnode = dhead[mdeg]; + if (mdnode > 0) { + goto L600; + } + ++mdeg; + if (mdeg > mdlmt) { + goto L900; + } + goto L500; +L600: +/* ---------------------------------------- */ +/* REMOVE MDNODE FROM THE DEGREE STRUCTURE. */ +/* ---------------------------------------- */ + nextmd = invp[mdnode]; + dhead[mdeg] = nextmd; + if (nextmd > 0) { + perm[nextmd] = -mdeg; + } + invp[mdnode] = -num; + *nofsub = *nofsub + mdeg + qsize[mdnode] - 2; + if (num + qsize[mdnode] > *neqns) { + goto L1000; + } +/* ---------------------------------------------- */ +/* ELIMINATE MDNODE AND PERFORM QUOTIENT GRAPH */ +/* TRANSFORMATION. RESET TAG VALUE IF NECESSARY. */ +/* ---------------------------------------------- */ + ++tag; + if (tag < *maxint) { + goto L800; + } + tag = 1; + i__1 = *neqns; + for (i = 1; i <= i__1; ++i) { + if (marker[i] < *maxint) { + marker[i] = 0; + } +/* L700: */ + } +L800: + mmdelm_(&mdnode, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], & + qsize[1], &llist[1], &marker[1], maxint, &tag); + num += qsize[mdnode]; + llist[mdnode] = ehead; + ehead = mdnode; + if (*delta >= 0) { + goto L500; + } +L900: +/* ------------------------------------------- */ +/* UPDATE DEGREES OF THE NODES INVOLVED IN THE */ +/* MINIMUM DEGREE NODES ELIMINATION. */ +/* ------------------------------------------- */ + if (num > *neqns) { + goto L1000; + } + mmdupd_(&ehead, neqns, &xadj[1], &adjncy[1], delta, &mdeg, &dhead[1], & + invp[1], &perm[1], &qsize[1], &llist[1], &marker[1], maxint, &tag) + ; + goto L300; + +L1000: + mmdnum_(neqns, &perm[1], &invp[1], &qsize[1]); + return 0; + +} /* genmmd_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* *** MMDINT ..... MULT MINIMUM DEGREE INITIALIZATION *** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE PERFORMS INITIALIZATION FOR THE */ +/* MULTIPLE ELIMINATION VERSION OF THE MINIMUM DEGREE */ +/* ALGORITHM. */ + +/* INPUT PARAMETERS - */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */ + +/* OUTPUT PARAMETERS - */ +/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ +/* QSIZE - SIZE OF SUPERNODE (INITIALIZED TO ONE). */ +/* LLIST - LINKED LIST. */ +/* MARKER - MARKER VECTOR. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdint_(int *neqns, int *xadj, shortint *adjncy, + shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, + shortint *llist, shortint *marker) +{ + /* System generated locals */ + int i__1; + + /* Local variables */ + static int ndeg, node, fnode; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dbakw; + --dforw; + --dhead; + --adjncy; + --xadj; + + /* Function Body */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + dhead[node] = 0; + qsize[node] = 1; + marker[node] = 0; + llist[node] = 0; +/* L100: */ + } +/* ------------------------------------------ */ +/* INITIALIZE THE DEGREE DOUBLY LINKED LISTS. */ +/* ------------------------------------------ */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + ndeg = xadj[node + 1] - xadj[node] + 1; + fnode = dhead[ndeg]; + dforw[node] = fnode; + dhead[ndeg] = node; + if (fnode > 0) { + dbakw[fnode] = node; + } + dbakw[node] = -ndeg; +/* L200: */ + } + return 0; + +} /* mmdint_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* ** MMDELM ..... MULTIPLE MINIMUM DEGREE ELIMINATION *** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE ELIMINATES THE NODE MDNODE OF */ +/* MINIMUM DEGREE FROM THE ADJACENCY STRUCTURE, WHICH */ +/* IS STORED IN THE QUOTIENT GRAPH FORMAT. IT ALSO */ +/* TRANSFORMS THE QUOTIENT GRAPH REPRESENTATION OF THE */ +/* ELIMINATION GRAPH. */ + +/* INPUT PARAMETERS - */ +/* MDNODE - NODE OF MINIMUM DEGREE. */ +/* MAXINT - ESTIMATE OF MAXIMUM REPRESENTABLE (SHORT) */ +/* INT. */ +/* TAG - TAG VALUE. */ + +/* UPDATED PARAMETERS - */ +/* (XADJ,ADJNCY) - UPDATED ADJACENCY STRUCTURE. */ +/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ +/* QSIZE - SIZE OF SUPERNODE. */ +/* MARKER - MARKER VECTOR. */ +/* LLIST - TEMPORARY LINKED LIST OF ELIMINATED NABORS. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdelm_(int *mdnode, int *xadj, shortint *adjncy, + shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, + shortint *llist, shortint *marker, int *maxint, int *tag) +{ + /* System generated locals */ + int i__1, i__2; + + /* Local variables */ + static int node, link, rloc, rlmt, i, j, nabor, rnode, elmnt, xqnbr, + istop, jstop, istrt, jstrt, nxnode, pvnode, nqnbrs, npv; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + +/* ----------------------------------------------- */ +/* FIND REACHABLE SET AND PLACE IN DATA STRUCTURE. */ +/* ----------------------------------------------- */ + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dbakw; + --dforw; + --dhead; + --adjncy; + --xadj; + + /* Function Body */ + marker[*mdnode] = *tag; + istrt = xadj[*mdnode]; + istop = xadj[*mdnode + 1] - 1; +/* ------------------------------------------------------- */ +/* ELMNT POINTS TO THE BEGINNING OF THE LIST OF ELIMINATED */ +/* NABORS OF MDNODE, AND RLOC GIVES THE STORAGE LOCATION */ +/* FOR THE NEXT REACHABLE NODE. */ +/* ------------------------------------------------------- */ + elmnt = 0; + rloc = istrt; + rlmt = istop; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + nabor = adjncy[i]; + if (nabor == 0) { + goto L300; + } + if (marker[nabor] >= *tag) { + goto L200; + } + marker[nabor] = *tag; + if (dforw[nabor] < 0) { + goto L100; + } + adjncy[rloc] = nabor; + ++rloc; + goto L200; +L100: + llist[nabor] = elmnt; + elmnt = nabor; +L200: + ; + } +L300: +/* ----------------------------------------------------- */ +/* MERGE WITH REACHABLE NODES FROM GENERALIZED ELEMENTS. */ +/* ----------------------------------------------------- */ + if (elmnt <= 0) { + goto L1000; + } + adjncy[rlmt] = -elmnt; + link = elmnt; +L400: + jstrt = xadj[link]; + jstop = xadj[link + 1] - 1; + i__1 = jstop; + for (j = jstrt; j <= i__1; ++j) { + node = adjncy[j]; + link = -node; + if (node < 0) { + goto L400; + } else if (node == 0) { + goto L900; + } else { + goto L500; + } +L500: + if (marker[node] >= *tag || dforw[node] < 0) { + goto L800; + } + marker[node] = *tag; +/* --------------------------------- */ +/* USE STORAGE FROM ELIMINATED NODES */ +/* IF NECESSARY. */ +/* --------------------------------- */ +L600: + if (rloc < rlmt) { + goto L700; + } + link = -adjncy[rlmt]; + rloc = xadj[link]; + rlmt = xadj[link + 1] - 1; + goto L600; +L700: + adjncy[rloc] = node; + ++rloc; +L800: + ; + } +L900: + elmnt = llist[elmnt]; + goto L300; +L1000: + if (rloc <= rlmt) { + adjncy[rloc] = 0; + } +/* -------------------------------------------------------- */ +/* FOR EACH NODE IN THE REACHABLE SET, DO THE FOLLOWING ... */ +/* -------------------------------------------------------- */ + link = *mdnode; +L1100: + istrt = xadj[link]; + istop = xadj[link + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + rnode = adjncy[i]; + link = -rnode; + if (rnode < 0) { + goto L1100; + } else if (rnode == 0) { + goto L1800; + } else { + goto L1200; + } +L1200: +/* -------------------------------------------- */ +/* IF RNODE IS IN THE DEGREE LIST STRUCTURE ... */ +/* -------------------------------------------- */ + pvnode = dbakw[rnode]; + if (pvnode == 0 || pvnode == -(*maxint)) { + goto L1300; + } +/* ------------------------------------- */ +/* THEN REMOVE RNODE FROM THE STRUCTURE. */ +/* ------------------------------------- */ + nxnode = dforw[rnode]; + if (nxnode > 0) { + dbakw[nxnode] = pvnode; + } + if (pvnode > 0) { + dforw[pvnode] = nxnode; + } + npv = -pvnode; + if (pvnode < 0) { + dhead[npv] = nxnode; + } +L1300: +/* ---------------------------------------- */ +/* PURGE INACTIVE QUOTIENT NABORS OF RNODE. */ +/* ---------------------------------------- */ + jstrt = xadj[rnode]; + jstop = xadj[rnode + 1] - 1; + xqnbr = jstrt; + i__2 = jstop; + for (j = jstrt; j <= i__2; ++j) { + nabor = adjncy[j]; + if (nabor == 0) { + goto L1500; + } + if (marker[nabor] >= *tag) { + goto L1400; + } + adjncy[xqnbr] = nabor; + ++xqnbr; +L1400: + ; + } +L1500: +/* ---------------------------------------- */ +/* IF NO ACTIVE NABOR AFTER THE PURGING ... */ +/* ---------------------------------------- */ + nqnbrs = xqnbr - jstrt; + if (nqnbrs > 0) { + goto L1600; + } +/* ----------------------------- */ +/* THEN MERGE RNODE WITH MDNODE. */ +/* ----------------------------- */ + qsize[*mdnode] += qsize[rnode]; + qsize[rnode] = 0; + marker[rnode] = *maxint; + dforw[rnode] = -(*mdnode); + dbakw[rnode] = -(*maxint); + goto L1700; +L1600: +/* -------------------------------------- */ +/* ELSE FLAG RNODE FOR DEGREE UPDATE, AND */ +/* ADD MDNODE AS A NABOR OF RNODE. */ +/* -------------------------------------- */ + dforw[rnode] = nqnbrs + 1; + dbakw[rnode] = 0; + adjncy[xqnbr] = *mdnode; + ++xqnbr; + if (xqnbr <= jstop) { + adjncy[xqnbr] = 0; + } + +L1700: + ; + } +L1800: + return 0; + +} /* mmdelm_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* ***** MMDUPD ..... MULTIPLE MINIMUM DEGREE UPDATE ***** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE UPDATES THE DEGREES OF NODES */ +/* AFTER A MULTIPLE ELIMINATION STEP. */ + +/* INPUT PARAMETERS - */ +/* EHEAD - THE BEGINNING OF THE LIST OF ELIMINATED */ +/* NODES (I.E., NEWLY FORMED ELEMENTS). */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */ +/* DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */ +/* MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) */ +/* INTEGER. */ + +/* UPDATED PARAMETERS - */ +/* MDEG - NEW MINIMUM DEGREE AFTER DEGREE UPDATE. */ +/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ +/* QSIZE - SIZE OF SUPERNODE. */ +/* LLIST - WORKING LINKED LIST. */ +/* MARKER - MARKER VECTOR FOR DEGREE UPDATE. */ +/* TAG - TAG VALUE. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdupd_(int *ehead, int *neqns, int *xadj, + shortint *adjncy, int *delta, int *mdeg, shortint *dhead, + shortint *dforw, shortint *dbakw, shortint *qsize, shortint *llist, + shortint *marker, int *maxint, int *tag) +{ + /* System generated locals */ + int i__1, i__2; + + /* Local variables */ + static int node, mtag, link, mdeg0, i, j, enode, fnode, nabor, elmnt, + istop, jstop, q2head, istrt, jstrt, qxhead, iq2, deg, deg0; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dbakw; + --dforw; + --dhead; + --adjncy; + --xadj; + + /* Function Body */ + mdeg0 = *mdeg + *delta; + elmnt = *ehead; +L100: +/* ------------------------------------------------------- */ +/* FOR EACH OF THE NEWLY FORMED ELEMENT, DO THE FOLLOWING. */ +/* (RESET TAG VALUE IF NECESSARY.) */ +/* ------------------------------------------------------- */ + if (elmnt <= 0) { + return 0; + } + mtag = *tag + mdeg0; + if (mtag < *maxint) { + goto L300; + } + *tag = 1; + i__1 = *neqns; + for (i = 1; i <= i__1; ++i) { + if (marker[i] < *maxint) { + marker[i] = 0; + } +/* L200: */ + } + mtag = *tag + mdeg0; +L300: +/* --------------------------------------------- */ +/* CREATE TWO LINKED LISTS FROM NODES ASSOCIATED */ +/* WITH ELMNT: ONE WITH TWO NABORS (Q2HEAD) IN */ +/* ADJACENCY STRUCTURE, AND THE OTHER WITH MORE */ +/* THAN TWO NABORS (QXHEAD). ALSO COMPUTE DEG0, */ +/* NUMBER OF NODES IN THIS ELEMENT. */ +/* --------------------------------------------- */ + q2head = 0; + qxhead = 0; + deg0 = 0; + link = elmnt; +L400: + istrt = xadj[link]; + istop = xadj[link + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + enode = adjncy[i]; + link = -enode; + if (enode < 0) { + goto L400; + } else if (enode == 0) { + goto L800; + } else { + goto L500; + } + +L500: + if (qsize[enode] == 0) { + goto L700; + } + deg0 += qsize[enode]; + marker[enode] = mtag; +/* ---------------------------------- */ +/* IF ENODE REQUIRES A DEGREE UPDATE, */ +/* THEN DO THE FOLLOWING. */ +/* ---------------------------------- */ + if (dbakw[enode] != 0) { + goto L700; + } +/* --------------------------------------- +*/ +/* PLACE EITHER IN QXHEAD OR Q2HEAD LISTS. +*/ +/* --------------------------------------- +*/ + if (dforw[enode] == 2) { + goto L600; + } + llist[enode] = qxhead; + qxhead = enode; + goto L700; +L600: + llist[enode] = q2head; + q2head = enode; +L700: + ; + } +L800: +/* -------------------------------------------- */ +/* FOR EACH ENODE IN Q2 LIST, DO THE FOLLOWING. */ +/* -------------------------------------------- */ + enode = q2head; + iq2 = 1; +L900: + if (enode <= 0) { + goto L1500; + } + if (dbakw[enode] != 0) { + goto L2200; + } + ++(*tag); + deg = deg0; +/* ------------------------------------------ */ +/* IDENTIFY THE OTHER ADJACENT ELEMENT NABOR. */ +/* ------------------------------------------ */ + istrt = xadj[enode]; + nabor = adjncy[istrt]; + if (nabor == elmnt) { + nabor = adjncy[istrt + 1]; + } +/* ------------------------------------------------ */ +/* IF NABOR IS UNELIMINATED, INCREASE DEGREE COUNT. */ +/* ------------------------------------------------ */ + link = nabor; + if (dforw[nabor] < 0) { + goto L1000; + } + deg += qsize[nabor]; + goto L2100; +L1000: +/* -------------------------------------------- */ +/* OTHERWISE, FOR EACH NODE IN THE 2ND ELEMENT, */ +/* DO THE FOLLOWING. */ +/* -------------------------------------------- */ + istrt = xadj[link]; + istop = xadj[link + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + node = adjncy[i]; + link = -node; + if (node == enode) { + goto L1400; + } + if (node < 0) { + goto L1000; + } else if (node == 0) { + goto L2100; + } else { + goto L1100; + } + +L1100: + if (qsize[node] == 0) { + goto L1400; + } + if (marker[node] >= *tag) { + goto L1200; + } +/* ----------------------------------- +-- */ +/* CASE WHEN NODE IS NOT YET CONSIDERED +. */ +/* ----------------------------------- +-- */ + marker[node] = *tag; + deg += qsize[node]; + goto L1400; +L1200: +/* ---------------------------------------- + */ +/* CASE WHEN NODE IS INDISTINGUISHABLE FROM + */ +/* ENODE. MERGE THEM INTO A NEW SUPERNODE. + */ +/* ---------------------------------------- + */ + if (dbakw[node] != 0) { + goto L1400; + } + if (dforw[node] != 2) { + goto L1300; + } + qsize[enode] += qsize[node]; + qsize[node] = 0; + marker[node] = *maxint; + dforw[node] = -enode; + dbakw[node] = -(*maxint); + goto L1400; +L1300: +/* -------------------------------------- +*/ +/* CASE WHEN NODE IS OUTMATCHED BY ENODE. +*/ +/* -------------------------------------- +*/ + if (dbakw[node] == 0) { + dbakw[node] = -(*maxint); + } +L1400: + ; + } + goto L2100; +L1500: +/* ------------------------------------------------ */ +/* FOR EACH ENODE IN THE QX LIST, DO THE FOLLOWING. */ +/* ------------------------------------------------ */ + enode = qxhead; + iq2 = 0; +L1600: + if (enode <= 0) { + goto L2300; + } + if (dbakw[enode] != 0) { + goto L2200; + } + ++(*tag); + deg = deg0; +/* --------------------------------- */ +/* FOR EACH UNMARKED NABOR OF ENODE, */ +/* DO THE FOLLOWING. */ +/* --------------------------------- */ + istrt = xadj[enode]; + istop = xadj[enode + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + nabor = adjncy[i]; + if (nabor == 0) { + goto L2100; + } + if (marker[nabor] >= *tag) { + goto L2000; + } + marker[nabor] = *tag; + link = nabor; +/* ------------------------------ */ +/* IF UNELIMINATED, INCLUDE IT IN */ +/* DEG COUNT. */ +/* ------------------------------ */ + if (dforw[nabor] < 0) { + goto L1700; + } + deg += qsize[nabor]; + goto L2000; +L1700: +/* ------------------------------- +*/ +/* IF ELIMINATED, INCLUDE UNMARKED +*/ +/* NODES IN THIS ELEMENT INTO THE +*/ +/* DEGREE COUNT. */ +/* ------------------------------- +*/ + jstrt = xadj[link]; + jstop = xadj[link + 1] - 1; + i__2 = jstop; + for (j = jstrt; j <= i__2; ++j) { + node = adjncy[j]; + link = -node; + if (node < 0) { + goto L1700; + } else if (node == 0) { + goto L2000; + } else { + goto L1800; + } + +L1800: + if (marker[node] >= *tag) { + goto L1900; + } + marker[node] = *tag; + deg += qsize[node]; +L1900: + ; + } +L2000: + ; + } +L2100: +/* ------------------------------------------- */ +/* UPDATE EXTERNAL DEGREE OF ENODE IN DEGREE */ +/* STRUCTURE, AND MDEG (MIN DEG) IF NECESSARY. */ +/* ------------------------------------------- */ + deg = deg - qsize[enode] + 1; + fnode = dhead[deg]; + dforw[enode] = fnode; + dbakw[enode] = -deg; + if (fnode > 0) { + dbakw[fnode] = enode; + } + dhead[deg] = enode; + if (deg < *mdeg) { + *mdeg = deg; + } +L2200: +/* ---------------------------------- */ +/* GET NEXT ENODE IN CURRENT ELEMENT. */ +/* ---------------------------------- */ + enode = llist[enode]; + if (iq2 == 1) { + goto L900; + } + goto L1600; +L2300: +/* ----------------------------- */ +/* GET NEXT ELEMENT IN THE LIST. */ +/* ----------------------------- */ + *tag = mtag; + elmnt = llist[elmnt]; + goto L100; + +} /* mmdupd_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* ***** MMDNUM ..... MULTI MINIMUM DEGREE NUMBERING ***** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE PERFORMS THE FINAL STEP IN */ +/* PRODUCING THE PERMUTATION AND INVERSE PERMUTATION */ +/* VECTORS IN THE MULTIPLE ELIMINATION VERSION OF THE */ +/* MINIMUM DEGREE ORDERING ALGORITHM. */ + +/* INPUT PARAMETERS - */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* QSIZE - SIZE OF SUPERNODES AT ELIMINATION. */ + +/* UPDATED PARAMETERS - */ +/* INVP - INVERSE PERMUTATION VECTOR. ON INPUT, */ +/* IF QSIZE(NODE)=0, THEN NODE HAS BEEN MERGED */ +/* INTO THE NODE -INVP(NODE); OTHERWISE, */ +/* -INVP(NODE) IS ITS INVERSE LABELLING. */ + +/* OUTPUT PARAMETERS - */ +/* PERM - THE PERMUTATION VECTOR. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdnum_(int *neqns, shortint *perm, shortint *invp, + shortint *qsize) +{ + /* System generated locals */ + int i__1; + + /* Local variables */ + static int node, root, nextf, father, nqsize, num; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --qsize; + --invp; + --perm; + + /* Function Body */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + nqsize = qsize[node]; + if (nqsize <= 0) { + perm[node] = invp[node]; + } + if (nqsize > 0) { + perm[node] = -invp[node]; + } +/* L100: */ + } +/* ------------------------------------------------------ */ +/* FOR EACH NODE WHICH HAS BEEN MERGED, DO THE FOLLOWING. */ +/* ------------------------------------------------------ */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + if (perm[node] > 0) { + goto L500; + } +/* ----------------------------------------- */ +/* TRACE THE MERGED TREE UNTIL ONE WHICH HAS */ +/* NOT BEEN MERGED, CALL IT ROOT. */ +/* ----------------------------------------- */ + father = node; +L200: + if (perm[father] > 0) { + goto L300; + } + father = -perm[father]; + goto L200; +L300: +/* ----------------------- */ +/* NUMBER NODE AFTER ROOT. */ +/* ----------------------- */ + root = father; + num = perm[root] + 1; + invp[node] = -num; + perm[root] = num; +/* ------------------------ */ +/* SHORTEN THE MERGED TREE. */ +/* ------------------------ */ + father = node; +L400: + nextf = -perm[father]; + if (nextf <= 0) { + goto L500; + } + perm[father] = -root; + father = nextf; + goto L400; +L500: + ; + } +/* ---------------------- */ +/* READY TO COMPUTE PERM. */ +/* ---------------------- */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + num = -invp[node]; + invp[node] = num; + perm[num] = node; +/* L600: */ + } + return 0; + +} /* mmdnum_ */ + diff --git a/src/maths/SuperLU/qselect.c b/src/maths/SuperLU/qselect.c new file mode 100644 index 000000000..bb04dd44c --- /dev/null +++ b/src/maths/SuperLU/qselect.c @@ -0,0 +1,74 @@ + +/*! @file qselect.c + * \brief Quickselect: returns the k-th (zero-based) largest value in A[]. + * + * + * -- SuperLU routine (version 4.1) -- + * Lawrence Berkeley National Laboratory. + * November, 2010 + *+ */ + +#include+ +double dqselect(int n, double A[], int k) +{ + register int i, j, p; + register double val; + + k = SUPERLU_MAX(k, 0); + k = SUPERLU_MIN(k, n - 1); + while (n > 1) + { + i = 0; j = n-1; + p = j; val = A[p]; + while (i < j) + { + for (; A[i] >= val && i < p; i++); + if (A[i] < val) { A[p] = A[i]; p = i; } + for (; A[j] <= val && j > p; j--); + if (A[j] > val) { A[p] = A[j]; p = j; } + } + A[p] = val; + if (p == k) return val; + else if (p > k) n = p; + else + { + p++; + n -= p; A += p; k -= p; + } + } + + return A[0]; +} + +float sqselect(int n, float A[], int k) +{ + register int i, j, p; + register float val; + + k = SUPERLU_MAX(k, 0); + k = SUPERLU_MIN(k, n - 1); + while (n > 1) + { + i = 0; j = n-1; + p = j; val = A[p]; + while (i < j) + { + for (; A[i] >= val && i < p; i++); + if (A[i] < val) { A[p] = A[i]; p = i; } + for (; A[j] <= val && j > p; j--); + if (A[j] > val) { A[p] = A[j]; p = j; } + } + A[p] = val; + if (p == k) return val; + else if (p > k) n = p; + else + { + p++; + n -= p; A += p; k -= p; + } + } + + return A[0]; +} diff --git a/src/maths/SuperLU/relax_snode.c b/src/maths/SuperLU/relax_snode.c new file mode 100644 index 000000000..f9b06bcbd --- /dev/null +++ b/src/maths/SuperLU/relax_snode.c @@ -0,0 +1,75 @@ +/*! @file relax_snode.c + * \brief Identify initial relaxed supernodes + * + * + * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + +#include+/*! \brief + * + * + * Purpose + * ======= + * relax_snode() - Identify the initial relaxed supernodes, assuming that + * the matrix has been reordered according to the postorder of the etree. + *+ */ +void +relax_snode ( + const int n, + int *et, /* column elimination tree */ + const int relax_columns, /* max no of columns allowed in a + relaxed snode */ + int *descendants, /* no of descendants of each node + in the etree */ + int *relax_end /* last column in a supernode */ + ) +{ + + register int j, parent; + register int snode_start; /* beginning of a snode */ + + ifill (relax_end, n, EMPTY); + for (j = 0; j < n; j++) descendants[j] = 0; + + /* Compute the number of descendants of each node in the etree */ + for (j = 0; j < n; j++) { + parent = et[j]; + if ( parent != n ) /* not the dummy root */ + descendants[parent] += descendants[j] + 1; + } + + /* Identify the relaxed supernodes by postorder traversal of the etree. */ + for (j = 0; j < n; ) { + parent = et[j]; + snode_start = j; + while ( parent != n && descendants[parent] < relax_columns ) { + j = parent; + parent = et[j]; + } + /* Found a supernode with j being the last column. */ + relax_end[snode_start] = j; /* Last column is recorded */ + j++; + /* Search for a new leaf */ + while ( descendants[j] != 0 && j < n ) j++; + } + + /*printf("No of relaxed snodes: %d; relaxed columns: %d\n", + nsuper, no_relaxed_col); */ +} diff --git a/src/maths/SuperLU/sp_coletree.c b/src/maths/SuperLU/sp_coletree.c new file mode 100644 index 000000000..d02dfdd6d --- /dev/null +++ b/src/maths/SuperLU/sp_coletree.c @@ -0,0 +1,419 @@ +/*! @file sp_coletree.c + * \brief Tree layout and computation routines + * + *+ * -- SuperLU routine (version 3.1) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * August 1, 2008 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+*/ + +/* Elimination tree computation and layout routines */ + +#include+#include +#include + +/* + * Implementation of disjoint set union routines. + * Elements are integers in 0..n-1, and the + * names of the sets themselves are of type int. + * + * Calls are: + * initialize_disjoint_sets (n) initial call. + * s = make_set (i) returns a set containing only i. + * s = link (t, u) returns s = t union u, destroying t and u. + * s = find (i) return name of set containing i. + * finalize_disjoint_sets final call. + * + * This implementation uses path compression but not weighted union. + * See Tarjan's book for details. + * John Gilbert, CMI, 1987. + * + * Implemented path-halving by XSL 07/05/95. + */ + + +static +int *mxCallocInt(int n) +{ + register int i; + int *buf; + + buf = (int *) SUPERLU_MALLOC( n * sizeof(int) ); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC fails for buf in mxCallocInt()"); + } + for (i = 0; i < n; i++) buf[i] = 0; + return (buf); +} + +static +void initialize_disjoint_sets ( + int n, + int **pp + ) +{ + (*pp) = mxCallocInt(n); +} + + +static +int make_set ( + int i, + int *pp + ) +{ + pp[i] = i; + return i; +} + + +static +int link ( + int s, + int t, + int *pp + ) +{ + pp[s] = t; + return t; +} + + +/* PATH HALVING */ +static +int find ( + int i, + int *pp + ) +{ + register int p, gp; + + p = pp[i]; + gp = pp[p]; + while (gp != p) { + pp[i] = gp; + i = gp; + p = pp[i]; + gp = pp[p]; + } + return (p); +} + +#if 0 +/* PATH COMPRESSION */ +static +int find ( + int i + ) +{ + if (pp[i] != i) + pp[i] = find (pp[i]); + return pp[i]; +} +#endif + +static +void finalize_disjoint_sets ( + int *pp + ) +{ + SUPERLU_FREE(pp); +} + + +/* + * Find the elimination tree for A'*A. + * This uses something similar to Liu's algorithm. + * It runs in time O(nz(A)*log n) and does not form A'*A. + * + * Input: + * Sparse matrix A. Numeric values are ignored, so any + * explicit zeros are treated as nonzero. + * Output: + * Integer array of parents representing the elimination + * tree of the symbolic product A'*A. Each vertex is a + * column of A, and nc means a root of the elimination forest. + * + * John R. Gilbert, Xerox, 10 Dec 1990 + * Based on code by JRG dated 1987, 1988, and 1990. + */ + +/* + * Nonsymmetric elimination tree + */ +int +sp_coletree( + int *acolst, int *acolend, /* column start and end past 1 */ + int *arow, /* row indices of A */ + int nr, int nc, /* dimension of A */ + int *parent /* parent in elim tree */ + ) +{ + int *root; /* root of subtee of etree */ + int *firstcol; /* first nonzero col in each row*/ + int rset, cset; + int row, col; + int rroot; + int p; + int *pp; + + root = mxCallocInt (nc); + initialize_disjoint_sets (nc, &pp); + + /* Compute firstcol[row] = first nonzero column in row */ + + firstcol = mxCallocInt (nr); + for (row = 0; row < nr; firstcol[row++] = nc); + for (col = 0; col < nc; col++) + for (p = acolst[col]; p < acolend[col]; p++) { + row = arow[p]; + firstcol[row] = SUPERLU_MIN(firstcol[row], col); + } + + /* Compute etree by Liu's algorithm for symmetric matrices, + except use (firstcol[r],c) in place of an edge (r,c) of A. + Thus each row clique in A'*A is replaced by a star + centered at its first vertex, which has the same fill. */ + + for (col = 0; col < nc; col++) { + cset = make_set (col, pp); + root[cset] = col; + parent[col] = nc; /* Matlab */ + for (p = acolst[col]; p < acolend[col]; p++) { + row = firstcol[arow[p]]; + if (row >= col) continue; + rset = find (row, pp); + rroot = root[rset]; + if (rroot != col) { + parent[rroot] = col; + cset = link (cset, rset, pp); + root[cset] = col; + } + } + } + + SUPERLU_FREE (root); + SUPERLU_FREE (firstcol); + finalize_disjoint_sets (pp); + return 0; +} + +/* + * q = TreePostorder (n, p); + * + * Postorder a tree. + * Input: + * p is a vector of parent pointers for a forest whose + * vertices are the integers 0 to n-1; p[root]==n. + * Output: + * q is a vector indexed by 0..n-1 such that q[i] is the + * i-th vertex in a postorder numbering of the tree. + * + * ( 2/7/95 modified by X.Li: + * q is a vector indexed by 0:n-1 such that vertex i is the + * q[i]-th vertex in a postorder numbering of the tree. + * That is, this is the inverse of the previous q. ) + * + * In the child structure, lower-numbered children are represented + * first, so that a tree which is already numbered in postorder + * will not have its order changed. + * + * Written by John Gilbert, Xerox, 10 Dec 1990. + * Based on code written by John Gilbert at CMI in 1987. + */ + +static +/* + * Depth-first search from vertex v. + */ +void etdfs ( + int v, + int first_kid[], + int next_kid[], + int post[], + int *postnum + ) +{ + int w; + + for (w = first_kid[v]; w != -1; w = next_kid[w]) { + etdfs (w, first_kid, next_kid, post, postnum); + } + /* post[postnum++] = v; in Matlab */ + post[v] = (*postnum)++; /* Modified by X. Li on 08/10/07 */ +} + + +static +/* + * Depth-first search from vertex n. No recursion. + * This routine was contributed by Cédric Doucet, CEDRAT Group, Meylan, France. + */ +void nr_etdfs (int n, int *parent, + int *first_kid, int *next_kid, + int *post, int postnum) +{ + int current = n, first, next; + + while (postnum != n){ + + /* no kid for the current node */ + first = first_kid[current]; + + /* no first kid for the current node */ + if (first == -1){ + + /* numbering this node because it has no kid */ + post[current] = postnum++; + + /* looking for the next kid */ + next = next_kid[current]; + + while (next == -1){ + + /* no more kids : back to the parent node */ + current = parent[current]; + + /* numbering the parent node */ + post[current] = postnum++; + + /* get the next kid */ + next = next_kid[current]; + } + + /* stopping criterion */ + if (postnum==n+1) return; + + /* updating current node */ + current = next; + } + /* updating current node */ + else { + current = first; + } + } +} + +/* + * Post order a tree + */ +int *TreePostorder( + int n, + int *parent + ) +{ + int *first_kid, *next_kid; /* Linked list of children. */ + int *post, postnum; + int v, dad; + + /* Allocate storage for working arrays and results */ + first_kid = mxCallocInt (n+1); + next_kid = mxCallocInt (n+1); + post = mxCallocInt (n+1); + + /* Set up structure describing children */ + for (v = 0; v <= n; first_kid[v++] = -1); + for (v = n-1; v >= 0; v--) { + dad = parent[v]; + next_kid[v] = first_kid[dad]; + first_kid[dad] = v; + } + + /* Depth-first search from dummy root vertex #n */ + postnum = 0; +#if 0 + /* recursion */ + etdfs (n, first_kid, next_kid, post, &postnum); +#else + /* no recursion */ + nr_etdfs(n, parent, first_kid, next_kid, post, postnum); +#endif + + SUPERLU_FREE (first_kid); + SUPERLU_FREE (next_kid); + return post; +} + + +/* + * p = spsymetree (A); + * + * Find the elimination tree for symmetric matrix A. + * This uses Liu's algorithm, and runs in time O(nz*log n). + * + * Input: + * Square sparse matrix A. No check is made for symmetry; + * elements below and on the diagonal are ignored. + * Numeric values are ignored, so any explicit zeros are + * treated as nonzero. + * Output: + * Integer array of parents representing the etree, with n + * meaning a root of the elimination forest. + * Note: + * This routine uses only the upper triangle, while sparse + * Cholesky (as in spchol.c) uses only the lower. Matlab's + * dense Cholesky uses only the upper. This routine could + * be modified to use the lower triangle either by transposing + * the matrix or by traversing it by rows with auxiliary + * pointer and link arrays. + * + * John R. Gilbert, Xerox, 10 Dec 1990 + * Based on code by JRG dated 1987, 1988, and 1990. + * Modified by X.S. Li, November 1999. + */ + +/* + * Symmetric elimination tree + */ +int +sp_symetree( + int *acolst, int *acolend, /* column starts and ends past 1 */ + int *arow, /* row indices of A */ + int n, /* dimension of A */ + int *parent /* parent in elim tree */ + ) +{ + int *root; /* root of subtree of etree */ + int rset, cset; + int row, col; + int rroot; + int p; + int *pp; + + root = mxCallocInt (n); + initialize_disjoint_sets (n, &pp); + + for (col = 0; col < n; col++) { + cset = make_set (col, pp); + root[cset] = col; + parent[col] = n; /* Matlab */ + for (p = acolst[col]; p < acolend[col]; p++) { + row = arow[p]; + if (row >= col) continue; + rset = find (row, pp); + rroot = root[rset]; + if (rroot != col) { + parent[rroot] = col; + cset = link (cset, rset, pp); + root[cset] = col; + } + } + } + SUPERLU_FREE (root); + finalize_disjoint_sets (pp); + return 0; +} /* SP_SYMETREE */ diff --git a/src/maths/SuperLU/sp_ienv.c b/src/maths/SuperLU/sp_ienv.c new file mode 100644 index 000000000..883932fdc --- /dev/null +++ b/src/maths/SuperLU/sp_ienv.c @@ -0,0 +1,79 @@ +/*! @file sp_ienv.c + * \brief Chooses machine-dependent parameters for the local environment. + * + * + * -- SuperLU routine (version 4.1) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November, 2010 + *+*/ + +/* + * File name: sp_ienv.c + * History: Modified from lapack routine ILAENV + */ +#include+ +/*! \brief + + + Purpose + ======= + + sp_ienv() is inquired to choose machine-dependent parameters for the + local environment. See ISPEC for a description of the parameters. + + This version provides a set of parameters which should give good, + but not optimal, performance on many of the currently available + computers. Users are encouraged to modify this subroutine to set + the tuning parameters for their particular machine using the option + and problem size information in the arguments. + + Arguments + ========= + + ISPEC (input) int + Specifies the parameter to be returned as the value of SP_IENV. + = 1: the panel size w; a panel consists of w consecutive + columns of matrix A in the process of Gaussian elimination. + The best value depends on machine's cache characters. + = 2: the relaxation parameter relax; if the number of + nodes (columns) in a subtree of the elimination tree is less + than relax, this subtree is considered as one supernode, + regardless of their row structures. + = 3: the maximum size for a supernode in complete LU; + = 4: the minimum row dimension for 2-D blocking to be used; + = 5: the minimum column dimension for 2-D blocking to be used; + = 6: the estimated fills factor for L and U, compared with A; + = 7: the maximum size for a supernode in ILU. + + (SP_IENV) (output) int + >= 0: the value of the parameter specified by ISPEC + < 0: if SP_IENV = -k, the k-th argument had an illegal value. + + ===================================================================== ++*/ +int +sp_ienv(int ispec) +{ + int i; + + switch (ispec) { + case 1: return (12); + case 2: return (6); + case 3: return (100); + case 4: return (200); + case 5: return (60); + case 6: return (20); + case 7: return (10); + } + + /* Invalid value for ISPEC */ + i = 1; + xerbla_("sp_ienv", &i); + return 0; + +} /* sp_ienv_ */ + diff --git a/src/maths/SuperLU/sp_preorder.c b/src/maths/SuperLU/sp_preorder.c new file mode 100644 index 000000000..d147e5050 --- /dev/null +++ b/src/maths/SuperLU/sp_preorder.c @@ -0,0 +1,208 @@ +/*! @file sp_preorder.c + * \brief Permute and performs functions on columns of orginal matrix + */ +#include+ + +/*! \brief + * + * + * Purpose + * ======= + * + * sp_preorder() permutes the columns of the original matrix. It performs + * the following steps: + * + * 1. Apply column permutation perm_c[] to A's column pointers to form AC; + * + * 2. If options->Fact = DOFACT, then + * (1) Compute column elimination tree etree[] of AC'AC; + * (2) Post order etree[] to get a postordered elimination tree etree[], + * and a postorder permutation post[]; + * (3) Apply post[] permutation to columns of AC; + * (4) Overwrite perm_c[] with the product perm_c * post. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * Specifies whether or not the elimination tree will be re-used. + * If options->Fact == DOFACT, this means first time factor A, + * etree is computed, postered, and output. + * Otherwise, re-factor A, etree is input, unchanged on exit. + * + * A (input) SuperMatrix* + * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number + * of the linear equations is A->nrow. Currently, the type of A can be: + * Stype = NC or SLU_NCP; Mtype = SLU_GE. + * In the future, more general A may be handled. + * + * perm_c (input/output) int* + * Column permutation vector of size A->ncol, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * If options->Fact == DOFACT, perm_c is both input and output. + * On output, it is changed according to a postorder of etree. + * Otherwise, perm_c is input. + * + * etree (input/output) int* + * Elimination tree of Pc'*A'*A*Pc, dimension A->ncol. + * If options->Fact == DOFACT, etree is an output argument, + * otherwise it is an input argument. + * Note: etree is a vector of parent pointers for a forest whose + * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. + * + * AC (output) SuperMatrix* + * The resulting matrix after applied the column permutation + * perm_c[] to matrix A. The type of AC can be: + * Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE. + *+ */ +void +sp_preorder(superlu_options_t *options, SuperMatrix *A, int *perm_c, + int *etree, SuperMatrix *AC) +{ + NCformat *Astore; + NCPformat *ACstore; + int *iwork, *post; + register int n, i; + + n = A->ncol; + + /* Apply column permutation perm_c to A's column pointers so to + obtain NCP format in AC = A*Pc. */ + AC->Stype = SLU_NCP; + AC->Dtype = A->Dtype; + AC->Mtype = A->Mtype; + AC->nrow = A->nrow; + AC->ncol = A->ncol; + Astore = A->Store; + ACstore = AC->Store = (void *) SUPERLU_MALLOC( sizeof(NCPformat) ); + if ( !ACstore ) ABORT_SuperLU("SUPERLU_MALLOC fails for ACstore"); + ACstore->nnz = Astore->nnz; + ACstore->nzval = Astore->nzval; + ACstore->rowind = Astore->rowind; + ACstore->colbeg = (int*) SUPERLU_MALLOC(n*sizeof(int)); + if ( !(ACstore->colbeg) ) ABORT_SuperLU("SUPERLU_MALLOC fails for ACstore->colbeg"); + ACstore->colend = (int*) SUPERLU_MALLOC(n*sizeof(int)); + if ( !(ACstore->colend) ) ABORT_SuperLU("SUPERLU_MALLOC fails for ACstore->colend"); + +#ifdef DEBUG + print_int_vec("pre_order:", n, perm_c); + check_perm("Initial perm_c", n, perm_c); +#endif + + for (i = 0; i < n; i++) { + ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; + ACstore->colend[perm_c[i]] = Astore->colptr[i+1]; + } + + if ( options->Fact == DOFACT ) { +#undef ETREE_ATplusA +#ifdef ETREE_ATplusA + /*-------------------------------------------- + COMPUTE THE ETREE OF Pc*(A'+A)*Pc'. + --------------------------------------------*/ + int *b_colptr, *b_rowind, bnz, j; + int *c_colbeg, *c_colend; + + /*printf("Use etree(A'+A)\n");*/ + + /* Form B = A + A'. */ + at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); + + /* Form C = Pc*B*Pc'. */ + c_colbeg = (int*) SUPERLU_MALLOC(2*n*sizeof(int)); + c_colend = c_colbeg + n; + if (!c_colbeg ) ABORT_SuperLU("SUPERLU_MALLOC fails for c_colbeg/c_colend"); + for (i = 0; i < n; i++) { + c_colbeg[perm_c[i]] = b_colptr[i]; + c_colend[perm_c[i]] = b_colptr[i+1]; + } + for (j = 0; j < n; ++j) { + for (i = c_colbeg[j]; i < c_colend[j]; ++i) { + b_rowind[i] = perm_c[b_rowind[i]]; + } + } + + /* Compute etree of C. */ + sp_symetree(c_colbeg, c_colend, b_rowind, n, etree); + + SUPERLU_FREE(b_colptr); + if ( bnz ) SUPERLU_FREE(b_rowind); + SUPERLU_FREE(c_colbeg); + +#else + /*-------------------------------------------- + COMPUTE THE COLUMN ELIMINATION TREE. + --------------------------------------------*/ + sp_coletree(ACstore->colbeg, ACstore->colend, ACstore->rowind, + A->nrow, A->ncol, etree); +#endif +#ifdef DEBUG + print_int_vec("etree:", n, etree); +#endif + + /* In symmetric mode, do not do postorder here. */ + if ( options->SymmetricMode == NO_SuperLU ) { + /* Post order etree */ + post = (int *) TreePostorder(n, etree); + /* for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; + iwork = post; */ + +#ifdef DEBUG + print_int_vec("post:", n+1, post); + check_perm("post", n, post); +#endif + iwork = (int*) SUPERLU_MALLOC((n+1)*sizeof(int)); + if ( !iwork ) ABORT_SuperLU("SUPERLU_MALLOC fails for iwork[]"); + + /* Renumber etree in postorder */ + for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]]; + for (i = 0; i < n; ++i) etree[i] = iwork[i]; + +#ifdef DEBUG + print_int_vec("postorder etree:", n, etree); +#endif + + /* Postmultiply A*Pc by post[] */ + for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i]; + for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i]; + for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i]; + for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i]; + + for (i = 0; i < n; ++i) + iwork[i] = post[perm_c[i]]; /* product of perm_c and post */ + for (i = 0; i < n; ++i) perm_c[i] = iwork[i]; + +#ifdef DEBUG + print_int_vec("Pc*post:", n, perm_c); + check_perm("final perm_c", n, perm_c); +#endif + SUPERLU_FREE (post); + SUPERLU_FREE (iwork); + } /* end postordering */ + + } /* if options->Fact == DOFACT ... */ + +} + +int check_perm(char *what, int n, int *perm) +{ + register int i; + int *marker; + marker = (int *) calloc(n, sizeof(int)); + + for (i = 0; i < n; ++i) { + if ( marker[perm[i]] == 1 || perm[i] >= n ) { + printf("%s: Not a valid PERM[%d] = %d\n", what, i, perm[i]); + ABORT_SuperLU("check_perm"); + } else { + marker[perm[i]] = 1; + } + } + + SUPERLU_FREE(marker); + return 0; +} diff --git a/src/maths/SuperLU/superlu_timer.c b/src/maths/SuperLU/superlu_timer.c new file mode 100644 index 000000000..e888f0823 --- /dev/null +++ b/src/maths/SuperLU/superlu_timer.c @@ -0,0 +1,72 @@ +/*! @file superlu_timer.c + * \brief Returns the time used + * + *+ * Purpose + * ======= + * + * Returns the time in seconds used by the process. + * + * Note: the timer function call is machine dependent. Use conditional + * compilation to choose the appropriate function. + *+ */ + + +#ifdef SUN +/* + * It uses the system call gethrtime(3C), which is accurate to + * nanoseconds. +*/ +#include+ +double SuperLU_timer_() { + return ( (double)gethrtime() / 1e9 ); +} + +#elif _WIN32 + +#include + +double SuperLU_timer_() +{ + clock_t t; + t=clock(); + + return ((double)t)/CLOCKS_PER_SEC; +} + +#else + +#ifndef NO_TIMER +#include +#include +#include +#include +#endif + +/*! \brief Timer function + */ + +double SuperLU_timer_() +{ +#ifdef NO_TIMER + /* no sys/times.h on WIN32 */ + double tmp; + tmp = 0.0; + /* return (double)(tmp) / CLK_TCK;*/ + return 0.0; +#else + struct tms use; + double tmp; + int clocks_per_sec = sysconf(_SC_CLK_TCK); + + times ( &use ); + tmp = use.tms_utime; + tmp += use.tms_stime; + return (double)(tmp) / clocks_per_sec; +#endif +} + +#endif + diff --git a/src/maths/SuperLU/superlusmp.c b/src/maths/SuperLU/superlusmp.c new file mode 100644 index 000000000..31cceb9e1 --- /dev/null +++ b/src/maths/SuperLU/superlusmp.c @@ -0,0 +1,722 @@ +/* + * Spice3 COMPATIBILITY MODULE + * + * Author: Advising professor: + * Kenneth S. Kundert Alberto Sangiovanni-Vincentelli + * UC Berkeley + * + * This module contains routines that make Sparse1.3 a direct + * replacement for the SMP sparse matrix package in Spice3c1 or Spice3d1. + * Sparse1.3 is in general a faster and more robust package than SMP. + * These advantages become significant on large circuits. + * + * >>> User accessible functions contained in this file: + * SMPaddElt + * SMPmakeElt + * SMPcClear + * SMPclear + * SMPcLUfac + * SMPluFac + * SMPcReorder + * SMPreorder + * SMPcaSolve + * SMPcSolve + * SMPsolve + * SMPmatSize + * SMPnewMatrix + * SMPdestroy + * SMPpreOrder + * SMPprint + * SMPgetError + * SMPcProdDiag + * LoadGmin + * SMPfindElt + * SMPcombine + * SMPcCombine + */ + +/* + * To replace SMP with Sparse, rename the file spSpice3.h to + * spMatrix.h and place Sparse in a subdirectory of SPICE called + * `sparse'. Then on UNIX compile Sparse by executing `make spice'. + * If not on UNIX, after compiling Sparse and creating the sparse.a + * archive, compile this file (spSMP.c) and spSMP.o to the archive, + * then copy sparse.a into the SPICE main directory and rename it + * SMP.a. Finally link SPICE. + * + * To be compatible with SPICE, the following Sparse compiler options + * (in spConfig.h) should be set as shown below: + * + * EXPANDABLE YES + * TRANSLATE NO + * INITIALIZE NO or YES, YES for use with test prog. + * DIAGONAL_PIVOTING YES + * MODIFIED_MARKOWITZ NO + * DELETE NO + * STRIP NO + * MODIFIED_NODAL YES + * QUAD_ELEMENT NO + * TRANSPOSE YES + * SCALING NO + * DOCUMENTATION YES + * MULTIPLICATION NO + * DETERMINANT YES + * STABILITY NO + * CONDITION NO + * PSEUDOCONDITION NO + * DEBUG YES + * + * spREAL double + */ + +/* + * Revision and copyright information. + * + * Copyright (c) 1985,86,87,88,89,90 + * by Kenneth S. Kundert and the University of California. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation for any purpose and without fee is hereby granted, provided + * that the above copyright notice appear in all copies and supporting + * documentation and that the authors and the University of California + * are properly credited. The authors and the University of California + * make no representations as to the suitability of this software for + * any purpose. It is provided `as is', without express or implied warranty. + */ + +/* + * IMPORTS + * + * >>> Import descriptions: + * spMatrix.h + * Sparse macros and declarations. + * SMPdefs.h + * Spice3's matrix macro definitions. + */ + +#include "ngspice/config.h" +#include +#include +#include +#include "ngspice/spmatrix.h" +#include "../sparse/spdefs.h" +#include "ngspice/smpdefs.h" + +#if defined (_MSC_VER) +extern double scalbn(double, int); +#define logb _logb +extern double logb(double); +#endif + +static void LoadGmin_CSC (double **diag, int n, double Gmin) ; +static void LoadGmin(SMPmatrix *eMatrix, double Gmin); + +void +SMPmatrix_CSC (SMPmatrix *Matrix) +{ + spMatrix_CSC (Matrix->SPmatrix, Matrix->CKTsuperluAp, Matrix->CKTsuperluAi, Matrix->CKTsuperluAx, + Matrix->CKTsuperluN, Matrix->CKTbind_Sparse, Matrix->CKTbind_CSC, Matrix->CKTdiag_CSC) ; + return ; +} + +void +SMPnnz (SMPmatrix *Matrix) +{ + Matrix->CKTsuperluN = spGetSize (Matrix->SPmatrix, 1) ; + Matrix->CKTsuperlunz = Matrix->SPmatrix->Elements ; + return ; +} + +/* + * SMPaddElt() + */ +int +SMPaddElt (SMPmatrix *Matrix, int Row, int Col, double Value) +{ + *spGetElement (Matrix->SPmatrix, Row, Col) = Value; + return spError (Matrix->SPmatrix) ; +} + +/* + * SMPmakeElt() + */ +double * +SMPmakeElt (SMPmatrix *Matrix, int Row, int Col) +{ + return spGetElement (Matrix->SPmatrix, Row, Col) ; +} + + +/* + * SMPcClear() + */ + +void +SMPcClear (SMPmatrix *Matrix) +{ + spClear (Matrix->SPmatrix) ; +} + +/* + * SMPclear() + */ + +void +SMPclear (SMPmatrix *Matrix) +{ + int i ; + if (Matrix->CKTsuperluMODE) + { + spClear (Matrix->SPmatrix) ; + if (Matrix->CKTsuperluAx != NULL) + { + for (i = 0 ; i < Matrix->CKTsuperlunz ; i++) + Matrix->CKTsuperluAx [i] = 0 ; + } + } else { + spClear (Matrix->SPmatrix) ; + } +} + +#define NG_IGNORE(x) (void)x + +/* + * SMPcLUfac() + */ +/*ARGSUSED*/ + +int +SMPcLUfac (SMPmatrix *Matrix, double PivTol) +{ + NG_IGNORE(PivTol); + + spSetComplex (Matrix->SPmatrix) ; + return spFactor (Matrix->SPmatrix) ; +} + +/* + * SMPluFac() + */ +/*ARGSUSED*/ + +int +SMPluFac (SMPmatrix *Matrix, double PivTol, double Gmin) +{//printf("ReFactor\n"); + int relax, panel_size, lwork = 0 ; + + NG_IGNORE(PivTol) ; + + if (Matrix->CKTsuperluMODE) + { + spSetReal (Matrix->SPmatrix) ; + LoadGmin_CSC (Matrix->CKTdiag_CSC, Matrix->CKTsuperluN, Gmin) ; + Matrix->CKTsuperluOptions.Fact = SamePattern_SameRowPerm ; /* IMPORTANT */ + panel_size = sp_ienv (1) ; + relax = sp_ienv (2) ; +// t = SuperLU_timer_(); + dgstrf (&(Matrix->CKTsuperluOptions), &(Matrix->CKTsuperluAC), relax, panel_size, Matrix->CKTsuperluEtree, + NULL, lwork, Matrix->CKTsuperluPerm_c, Matrix->CKTsuperluPerm_r, &(Matrix->CKTsuperluL), + &(Matrix->CKTsuperluU), &(Matrix->CKTsuperluStat), &(Matrix->CKTsuperluInfo)) ; +// utime[FACT] = SuperLU_timer_() - t; + if (Matrix->CKTsuperluInfo == 0) return 0 ; /* ReFactorization DONE */ + else return 1 ; + } else { + spSetReal (Matrix->SPmatrix) ; + LoadGmin (Matrix, Gmin) ; + return spFactor (Matrix->SPmatrix) ; + } +} + +/* + * SMPcReorder() + */ + +int +SMPcReorder (SMPmatrix *Matrix, double PivTol, double PivRel, int *NumSwaps) +{ + *NumSwaps = 1; + spSetComplex (Matrix->SPmatrix) ; + return spOrderAndFactor (Matrix->SPmatrix, NULL, + (spREAL)PivRel, (spREAL)PivTol, YES) ; +} + +/* + * SMPreorder() + */ + +int +SMPreorder (SMPmatrix *Matrix, double PivTol, double PivRel, double Gmin) +{//printf("Factor\n"); +// int permc_spec ; + int relax, panel_size, lwork = 0 ; + + if (Matrix->CKTsuperluMODE) + { + spSetReal (Matrix->SPmatrix) ; + LoadGmin_CSC (Matrix->CKTdiag_CSC, Matrix->CKTsuperluN, Gmin) ; + Matrix->CKTsuperluOptions.Fact = SamePattern ; /* IMPORTANT */ +// permc_spec = options->ColPerm; +// if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) get_perm_c(permc_spec, A, perm_c); +// sp_preorder(options, A, perm_c, etree, AC); + panel_size = sp_ienv (1) ; + relax = sp_ienv (2) ; +// t = SuperLU_timer_(); + dgstrf (&(Matrix->CKTsuperluOptions), &(Matrix->CKTsuperluAC), relax, panel_size, Matrix->CKTsuperluEtree, + NULL, lwork, Matrix->CKTsuperluPerm_c, Matrix->CKTsuperluPerm_r, &(Matrix->CKTsuperluL), + &(Matrix->CKTsuperluU), &(Matrix->CKTsuperluStat), &(Matrix->CKTsuperluInfo)) ; +// utime[FACT] = SuperLU_timer_() - t; + return 0 ; + } + else { + spSetReal (Matrix->SPmatrix) ; + LoadGmin (Matrix, Gmin) ; + return spOrderAndFactor (Matrix->SPmatrix, NULL, + (spREAL)PivRel, (spREAL)PivTol, YES) ; + } +} + +/* + * SMPcaSolve() + */ +void +SMPcaSolve (SMPmatrix *Matrix, double RHS[], double iRHS[], + double Spare[], double iSpare[]) +{ + printf ("SMPcaSolve\n") ; + NG_IGNORE (iSpare) ; + NG_IGNORE (Spare) ; + + spSolveTransposed (Matrix->SPmatrix, RHS, RHS, iRHS, iRHS) ; +} + +/* + * SMPcSolve() + */ + +void +SMPcSolve (SMPmatrix *Matrix, double RHS[], double iRHS[], double Spare[], double iSpare[]) +{ + NG_IGNORE (iSpare) ; + NG_IGNORE (Spare) ; + + spSolve (Matrix->SPmatrix, RHS, RHS, iRHS, iRHS) ; +} + +/* + * SMPsolve() + */ + +void +SMPsolve(SMPmatrix *Matrix, double RHS[], double Spare[]) +{//printf("Solve\n"); + int i, *pExtOrder ; + + NG_IGNORE (Spare) ; + + if (Matrix->CKTsuperluMODE) + { + pExtOrder = &Matrix->SPmatrix->IntToExtRowMap[Matrix->CKTsuperluN] ; + for (i = Matrix->CKTsuperluN - 1 ; i >= 0 ; i--) + Matrix->CKTsuperluIntermediate [i] = RHS [*(pExtOrder--)] ; + + dgstrs (NOTRANS, &(Matrix->CKTsuperluL), &(Matrix->CKTsuperluU), Matrix->CKTsuperluPerm_c, + Matrix->CKTsuperluPerm_r, &(Matrix->CKTsuperluI), &(Matrix->CKTsuperluStat), + &(Matrix->CKTsuperluInfo)) ; + + pExtOrder = &Matrix->SPmatrix->IntToExtColMap[Matrix->CKTsuperluN] ; + for (i = Matrix->CKTsuperluN - 1 ; i >= 0 ; i--) + RHS [*(pExtOrder--)] = Matrix->CKTsuperluIntermediate [i] ; + } else { + spSolve (Matrix->SPmatrix, RHS, RHS, NULL, NULL) ; + } +} + +/* + * SMPmatSize() + */ +int +SMPmatSize (SMPmatrix *Matrix) +{ + return spGetSize (Matrix->SPmatrix, 1) ; +} + +/* + * SMPnewMatrix() + */ +int +SMPnewMatrix(SMPmatrix *Matrix) +{ + int Error; + Matrix->SPmatrix = spCreate (0, 1, &Error) ; + return Error ; +} + +/* + * SMPdestroy() + */ + +void +SMPdestroy (SMPmatrix *Matrix) +{ + spDestroy (Matrix->SPmatrix) ; +} + +/* + * SMPpreOrder() + */ + +int +SMPpreOrder (SMPmatrix *Matrix) +{//printf("PreOrder\n"); +// DNformat *Bstore ; + int permc_spec ; +// int n, i ; + if (Matrix->CKTsuperluMODE) + { + Matrix->CKTsuperluOptions.Fact = DOFACT ; + Matrix->CKTsuperluOptions.Equil = NO_SuperLU ; + Matrix->CKTsuperluOptions.ColPerm = MMD_AT_PLUS_A ; + Matrix->CKTsuperluOptions.Trans = NOTRANS ; + Matrix->CKTsuperluOptions.IterRefine = NOREFINE ; + Matrix->CKTsuperluOptions.DiagPivotThresh = 0.001 ; + Matrix->CKTsuperluOptions.SymmetricMode = YES_SuperLU ; + Matrix->CKTsuperluOptions.PivotGrowth = NO_SuperLU ; + Matrix->CKTsuperluOptions.ConditionNumber = NO_SuperLU ; + Matrix->CKTsuperluOptions.PrintStat = NO_SuperLU ; + + /* Test the input parameters ... */ +// *info = 0 ; +// Bstore = B->Store; +// if ( options->Fact != DOFACT ) *info = -1 ; +// else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2 ; +// else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) *info = -7 ; +// if ( *info != 0 ) { +// i = -(*info); +// xerbla_("dgssv", &i); +// return 0 ; /* Make sure that 0 is correct - Francesco Lannutti */ +// } + +// utime = stat->utime; + +// t = SuperLU_timer_(); + permc_spec = Matrix->CKTsuperluOptions.ColPerm; + if ( permc_spec != MY_PERMC && Matrix->CKTsuperluOptions.Fact == DOFACT ) + get_perm_c (permc_spec, &(Matrix->CKTsuperluA), Matrix->CKTsuperluPerm_c) ; +// utime[COLPERM] = SuperLU_timer_() - t; + +// t = SuperLU_timer_(); + sp_preorder (&(Matrix->CKTsuperluOptions), &(Matrix->CKTsuperluA), Matrix->CKTsuperluPerm_c, + Matrix->CKTsuperluEtree, &(Matrix->CKTsuperluAC)) ; +// utime[ETREE] = SuperLU_timer_() - t; + + return 0 ; + } else { + spMNA_Preorder (Matrix->SPmatrix) ; + return spError (Matrix->SPmatrix) ; + } +} + +/* + * SMPprint() + */ +/*ARGSUSED*/ +void +SMPprint (SMPmatrix *Matrix, FILE *File) +{ + NG_IGNORE (File) ; + + spPrint (Matrix->SPmatrix, 0, 1, 1) ; +} + +/* + * SMPgetError() + */ +void +SMPgetError (SMPmatrix *Matrix, int *Col, int *Row) +{ + spWhereSingular (Matrix->SPmatrix, Row, Col) ; +} + +/* + * SMPcProdDiag() + * note: obsolete for Spice3d2 and later + */ +int +SMPcProdDiag (SMPmatrix *Matrix, SPcomplex *pMantissa, int *pExponent) +{ + spDeterminant (Matrix->SPmatrix, pExponent, &(pMantissa->real), &(pMantissa->imag)) ; + return spError (Matrix->SPmatrix) ; +} + +/* + * SMPcDProd() + */ +int +SMPcDProd (SMPmatrix *Matrix, SPcomplex *pMantissa, int *pExponent) +{ + double re, im, x, y, z; + int p; + + spDeterminant (Matrix->SPmatrix, &p, &re, &im) ; + +#ifndef M_LN2 +#define M_LN2 0.69314718055994530942 +#endif +#ifndef M_LN10 +#define M_LN10 2.30258509299404568402 +#endif + +#ifdef debug_print + printf("Determinant 10: (%20g,%20g)^%d\n", re, im, p); +#endif + + /* Convert base 10 numbers to base 2 numbers, for comparison */ + y = p * M_LN10 / M_LN2; + x = (int) y; + y -= x; + + /* ASSERT + * x = integral part of exponent, y = fraction part of exponent + */ + + /* Fold in the fractional part */ +#ifdef debug_print + printf(" ** base10 -> base2 int = %g, frac = %20g\n", x, y); +#endif + z = pow(2.0, y); + re *= z; + im *= z; +#ifdef debug_print + printf(" ** multiplier = %20g\n", z); +#endif + + /* Re-normalize (re or im may be > 2.0 or both < 1.0 */ + if (re != 0.0) { + y = logb(re); + if (im != 0.0) + z = logb(im); + else + z = 0; + } else if (im != 0.0) { + z = logb(im); + y = 0; + } else { + /* Singular */ + /*printf("10 -> singular\n");*/ + y = 0; + z = 0; + } + +#ifdef debug_print + printf(" ** renormalize changes = %g,%g\n", y, z); +#endif + if (y < z) + y = z; + + *pExponent = (int)(x + y); + x = scalbn(re, (int) -y); + z = scalbn(im, (int) -y); +#ifdef debug_print + printf(" ** values are: re %g, im %g, y %g, re' %g, im' %g\n", + re, im, y, x, z); +#endif + pMantissa->real = scalbn(re, (int) -y); + pMantissa->imag = scalbn(im, (int) -y); + +#ifdef debug_print + printf("Determinant 10->2: (%20g,%20g)^%d\n", pMantissa->real, + pMantissa->imag, *pExponent); +#endif + return spError (Matrix->SPmatrix) ; +} + + + +/* + * The following routines need internal knowledge of the Sparse data + * structures. + */ + +/* + * LOAD GMIN + * + * This routine adds Gmin to each diagonal element. Because Gmin is + * added to the current diagonal, which may bear little relation to + * what the outside world thinks is a diagonal, and because the + * elements that are diagonals may change after calling spOrderAndFactor, + * use of this routine is not recommended. It is included here simply + * for compatibility with Spice3. + */ + + +static void +LoadGmin_CSC (double **diag, int n, double Gmin) +{ + + int i ; + if (Gmin != 0.0) { + for (i = 0 ; i < n ; i++) { + if (diag [i] != NULL) *(diag [i]) += Gmin ; + } + } + + return ; +} + +static void +LoadGmin(SMPmatrix *eMatrix, double Gmin) +{ + MatrixPtr Matrix = eMatrix->SPmatrix ; + int I; + ArrayOfElementPtrs Diag; + ElementPtr diag; + + /* Begin `LoadGmin'. */ + assert (IS_SPARSE (Matrix)) ; + + if (Gmin != 0.0) { + Diag = Matrix->Diag; + for (I = Matrix->Size; I > 0; I--) { + if ((diag = Diag[I]) != NULL) + diag->Real += Gmin; + } + } + return; +} + + +/* + * FIND ELEMENT + * + * This routine finds an element in the matrix by row and column number. + * If the element exists, a pointer to it is returned. If not, then NULL + * is returned unless the CreateIfMissing flag is TRUE, in which case a + * pointer to the new element is returned. + */ + +SMPelement * +SMPfindElt(SMPmatrix *eMatrix, int Row, int Col, int CreateIfMissing) +{ + MatrixPtr Matrix = eMatrix->SPmatrix ; + ElementPtr Element; + + /* Begin `SMPfindElt'. */ + assert( IS_SPARSE( Matrix ) ); + Row = Matrix->ExtToIntRowMap[Row]; + Col = Matrix->ExtToIntColMap[Col]; + Element = Matrix->FirstInCol[Col]; + Element = spcFindElementInCol(Matrix, &Element, Row, Col, CreateIfMissing); + return (SMPelement *)Element; +} + +/* XXX The following should probably be implemented in spUtils */ + +/* + * SMPcZeroCol() + */ +int +SMPcZeroCol(SMPmatrix *eMatrix, int Col) +{ + MatrixPtr Matrix = eMatrix->SPmatrix ; + ElementPtr Element; + + Col = Matrix->ExtToIntColMap[Col]; + + for (Element = Matrix->FirstInCol[Col]; + Element != NULL; + Element = Element->NextInCol) + { + Element->Real = 0.0; + Element->Imag = 0.0; + } + + return spError( Matrix ); +} + +/* + * SMPcAddCol() + */ +int +SMPcAddCol(SMPmatrix *eMatrix, int Accum_Col, int Addend_Col) +{ + MatrixPtr Matrix = eMatrix->SPmatrix ; + ElementPtr Accum, Addend, *Prev; + + Accum_Col = Matrix->ExtToIntColMap[Accum_Col]; + Addend_Col = Matrix->ExtToIntColMap[Addend_Col]; + + Addend = Matrix->FirstInCol[Addend_Col]; + Prev = &Matrix->FirstInCol[Accum_Col]; + Accum = *Prev;; + + while (Addend != NULL) { + while (Accum && Accum->Row < Addend->Row) { + Prev = &Accum->NextInCol; + Accum = *Prev; + } + if (!Accum || Accum->Row > Addend->Row) { + Accum = spcCreateElement(Matrix, Addend->Row, Accum_Col, Prev, 0); + } + Accum->Real += Addend->Real; + Accum->Imag += Addend->Imag; + Addend = Addend->NextInCol; + } + + return spError( Matrix ); +} + +/* + * SMPzeroRow() + */ +int +SMPzeroRow(SMPmatrix *eMatrix, int Row) +{ + MatrixPtr Matrix = eMatrix->SPmatrix ; + ElementPtr Element; + + Row = Matrix->ExtToIntColMap[Row]; + + if (Matrix->RowsLinked == NO) + spcLinkRows(Matrix); + + if (Matrix->PreviousMatrixWasComplex || Matrix->Complex) { + for (Element = Matrix->FirstInRow[Row]; + Element != NULL; + Element = Element->NextInRow) + { + Element->Real = 0.0; + Element->Imag = 0.0; + } + } else { + for (Element = Matrix->FirstInRow[Row]; + Element != NULL; + Element = Element->NextInRow) + { + Element->Real = 0.0; + } + } + + return spError( Matrix ); +} + +#ifdef PARALLEL_ARCH +/* + * SMPcombine() + */ +void +SMPcombine(SMPmatrix *Matrix, double RHS[], double Spare[]) +{ + spSetReal (Matrix->SPmatrix) ; + spCombine (Matrix->SPmatrix, RHS, Spare, NULL, NULL) ; +} + +/* + * SMPcCombine() + */ +void +SMPcCombine (SMPmatrix *Matrix, double RHS[], double Spare[], double iRHS[], double iSpare[]) +{ + spSetComplex (Matrix->SPmatrix) ; + spCombine (Matrix->SPmatrix, RHS, Spare, iRHS, iSpare) ; +} +#endif /* PARALLEL_ARCH */ diff --git a/src/maths/SuperLU/util.c b/src/maths/SuperLU/util.c new file mode 100644 index 000000000..9e82056f0 --- /dev/null +++ b/src/maths/SuperLU/util.c @@ -0,0 +1,495 @@ +/*! @file util.c + * \brief Utility functions + * + * + * -- SuperLU routine (version 4.1) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November, 2010 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include+#include + +/*! \brief Global statistics variale + */ + +void superlu_abort_and_exit(char* msg) +{ + fprintf(stderr, msg); + exit (-1); +} + +/*! \brief Set the default values for the options argument. + */ +void set_default_options(superlu_options_t *options) +{ + options->Fact = DOFACT; + options->Equil = YES_SuperLU; + options->ColPerm = COLAMD; + options->Trans = NOTRANS; + options->IterRefine = NOREFINE; + options->DiagPivotThresh = 1.0; + options->SymmetricMode = NO_SuperLU; + options->PivotGrowth = NO_SuperLU; + options->ConditionNumber = NO_SuperLU; + options->PrintStat = YES_SuperLU; +} + +/*! \brief Set the default values for the options argument for ILU. + */ +void ilu_set_default_options(superlu_options_t *options) +{ + set_default_options(options); + + /* further options for incomplete factorization */ + options->DiagPivotThresh = 0.1; + options->RowPerm = LargeDiag; + options->ILU_DropRule = DROP_BASIC | DROP_AREA; + options->ILU_DropTol = 1e-4; + options->ILU_FillFactor = 10.0; + options->ILU_Norm = INF_NORM; + options->ILU_MILU = SILU; + options->ILU_MILU_Dim = 3.0; /* -log(n)/log(h) is perfect */ + options->ILU_FillTol = 1e-2; +} + +/*! \brief Print the options setting. + */ +void print_options(superlu_options_t *options) +{ + printf(".. options:\n"); + printf("\tFact\t %8d\n", options->Fact); + printf("\tEquil\t %8d\n", options->Equil); + printf("\tColPerm\t %8d\n", options->ColPerm); + printf("\tDiagPivotThresh %8.4f\n", options->DiagPivotThresh); + printf("\tTrans\t %8d\n", options->Trans); + printf("\tIterRefine\t%4d\n", options->IterRefine); + printf("\tSymmetricMode\t%4d\n", options->SymmetricMode); + printf("\tPivotGrowth\t%4d\n", options->PivotGrowth); + printf("\tConditionNumber\t%4d\n", options->ConditionNumber); + printf("..\n"); +} + +/*! \brief Print the options setting. + */ +void print_ilu_options(superlu_options_t *options) +{ + printf(".. ILU options:\n"); + printf("\tDiagPivotThresh\t%6.2e\n", options->DiagPivotThresh); + printf("\ttau\t%6.2e\n", options->ILU_DropTol); + printf("\tgamma\t%6.2f\n", options->ILU_FillFactor); + printf("\tDropRule\t%0x\n", options->ILU_DropRule); + printf("\tMILU\t%d\n", options->ILU_MILU); + printf("\tMILU_ALPHA\t%6.2e\n", MILU_ALPHA); + printf("\tDiagFillTol\t%6.2e\n", options->ILU_FillTol); + printf("..\n"); +} + +/*! \brief Deallocate the structure pointing to the actual storage of the matrix. */ +void +Destroy_SuperMatrix_Store(SuperMatrix *A) +{ + SUPERLU_FREE ( A->Store ); +} + +void +Destroy_CompCol_Matrix(SuperMatrix *A) +{ + SUPERLU_FREE( ((NCformat *)A->Store)->rowind ); + SUPERLU_FREE( ((NCformat *)A->Store)->colptr ); + SUPERLU_FREE( ((NCformat *)A->Store)->nzval ); + SUPERLU_FREE( A->Store ); +} + +void +Destroy_CompRow_Matrix(SuperMatrix *A) +{ + SUPERLU_FREE( ((NRformat *)A->Store)->colind ); + SUPERLU_FREE( ((NRformat *)A->Store)->rowptr ); + SUPERLU_FREE( ((NRformat *)A->Store)->nzval ); + SUPERLU_FREE( A->Store ); +} + +void +Destroy_SuperNode_Matrix(SuperMatrix *A) +{ + SUPERLU_FREE ( ((SCformat *)A->Store)->rowind ); + SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr ); + SUPERLU_FREE ( ((SCformat *)A->Store)->nzval ); + SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr ); + SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup ); + SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col ); + SUPERLU_FREE ( A->Store ); +} + +/*! \brief A is of type Stype==NCP */ +void +Destroy_CompCol_Permuted(SuperMatrix *A) +{ + SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg ); + SUPERLU_FREE ( ((NCPformat *)A->Store)->colend ); + SUPERLU_FREE ( A->Store ); +} + +/*! \brief A is of type Stype==DN */ +void +Destroy_Dense_Matrix(SuperMatrix *A) +{ + DNformat* Astore = A->Store; + SUPERLU_FREE (Astore->nzval); + SUPERLU_FREE ( A->Store ); +} + +/*! \brief Reset repfnz[] for the current column + */ +void +resetrep_col (const int nseg, const int *segrep, int *repfnz) +{ + int i, irep; + + for (i = 0; i < nseg; i++) { + irep = segrep[i]; + repfnz[irep] = EMPTY; + } +} + + +/*! \brief Count the total number of nonzeros in factors L and U, and in the symmetrically reduced L. + */ +void +countnz(const int n, int *xprune, int *nnzL, int *nnzU, GlobalLU_t *Glu) +{ + int nsuper, fsupc, i, j; + int nnzL0, jlen, irep; + int *xsup, *xlsub; + + xsup = Glu->xsup; + xlsub = Glu->xlsub; + *nnzL = 0; + *nnzU = (Glu->xusub)[n]; + nnzL0 = 0; + nsuper = (Glu->supno)[n]; + + if ( n <= 0 ) return; + + /* + * For each supernode + */ + for (i = 0; i <= nsuper; i++) { + fsupc = xsup[i]; + jlen = xlsub[fsupc+1] - xlsub[fsupc]; + + for (j = fsupc; j < xsup[i+1]; j++) { + *nnzL += jlen; + *nnzU += j - fsupc + 1; + jlen--; + } + irep = xsup[i+1] - 1; + nnzL0 += xprune[irep] - xlsub[irep]; + } + + /* printf("\tNo of nonzeros in symm-reduced L = %d\n", nnzL0);*/ +} + +/*! \brief Count the total number of nonzeros in factors L and U. + */ +void +ilu_countnz(const int n, int *nnzL, int *nnzU, GlobalLU_t *Glu) +{ + int nsuper, fsupc, i, j; + int jlen, irep; + int *xsup, *xlsub; + + xsup = Glu->xsup; + xlsub = Glu->xlsub; + *nnzL = 0; + *nnzU = (Glu->xusub)[n]; + nsuper = (Glu->supno)[n]; + + if ( n <= 0 ) return; + + /* + * For each supernode + */ + for (i = 0; i <= nsuper; i++) { + fsupc = xsup[i]; + jlen = xlsub[fsupc+1] - xlsub[fsupc]; + + for (j = fsupc; j < xsup[i+1]; j++) { + *nnzL += jlen; + *nnzU += j - fsupc + 1; + jlen--; + } + irep = xsup[i+1] - 1; + } +} + + +/*! \brief Fix up the data storage lsub for L-subscripts. It removes the subscript sets for structural pruning, and applies permuation to the remaining subscripts. + */ +void +fixupL(const int n, const int *perm_r, GlobalLU_t *Glu) +{ + register int nsuper, fsupc, nextl, i, j, k, jstrt; + int *xsup, *lsub, *xlsub; + + if ( n <= 1 ) return; + + xsup = Glu->xsup; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nextl = 0; + nsuper = (Glu->supno)[n]; + + /* + * For each supernode ... + */ + for (i = 0; i <= nsuper; i++) { + fsupc = xsup[i]; + jstrt = xlsub[fsupc]; + xlsub[fsupc] = nextl; + for (j = jstrt; j < xlsub[fsupc+1]; j++) { + lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ + nextl++; + } + for (k = fsupc+1; k < xsup[i+1]; k++) + xlsub[k] = nextl; /* Other columns in supernode i */ + + } + + xlsub[n] = nextl; +} + + +/*! \brief Diagnostic print of segment info after panel_dfs(). + */ +void print_panel_seg(int n, int w, int jcol, int nseg, + int *segrep, int *repfnz) +{ + int j, k; + + for (j = jcol; j < jcol+w; j++) { + printf("\tcol %d:\n", j); + for (k = 0; k < nseg; k++) + printf("\t\tseg %d, segrep %d, repfnz %d\n", k, + segrep[k], repfnz[(j-jcol)*n + segrep[k]]); + } + +} + + +void +StatInit(SuperLUStat_t *stat) +{ + register int i, w, panel_size, relax; + + panel_size = sp_ienv(1); + relax = sp_ienv(2); + w = SUPERLU_MAX(panel_size, relax); + stat->panel_histo = intCalloc(w+1); + stat->utime = (double *) SUPERLU_MALLOC(NPHASES * sizeof(double)); + if (!stat->utime) ABORT_SuperLU("SUPERLU_MALLOC fails for stat->utime"); + stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t)); + if (!stat->ops) ABORT_SuperLU("SUPERLU_MALLOC fails for stat->ops"); + for (i = 0; i < NPHASES; ++i) { + stat->utime[i] = 0.; + stat->ops[i] = 0.; + } + stat->TinyPivots = 0; + stat->RefineSteps = 0; + stat->expansions = 0; +#if ( PRNTlevel >= 1 ) + printf(".. parameters in sp_ienv():\n"); + printf("\t 1: panel size \t %4d \n" + "\t 2: relax \t %4d \n" + "\t 3: max. super \t %4d \n" + "\t 4: row-dim 2D \t %4d \n" + "\t 5: col-dim 2D \t %4d \n" + "\t 6: fill ratio \t %4d \n", + sp_ienv(1), sp_ienv(2), sp_ienv(3), + sp_ienv(4), sp_ienv(5), sp_ienv(6)); +#endif +} + + +void +StatPrint(SuperLUStat_t *stat) +{ + double *utime; + flops_t *ops; + + utime = stat->utime; + ops = stat->ops; + printf("Factor time = %8.2f\n", utime[FACT]); + if ( utime[FACT] != 0.0 ) + printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], + ops[FACT]*1e-6/utime[FACT]); + + printf("Solve time = %8.2f\n", utime[SOLVE]); + if ( utime[SOLVE] != 0.0 ) + printf("Solve flops = %e\tMflops = %8.2f\n", ops[SOLVE], + ops[SOLVE]*1e-6/utime[SOLVE]); + + printf("Number of memory expansions: %d\n", stat->expansions); + +} + + +void +StatFree(SuperLUStat_t *stat) +{ + SUPERLU_FREE(stat->panel_histo); + SUPERLU_FREE(stat->utime); + SUPERLU_FREE(stat->ops); +} + + +flops_t +LUFactFlops(SuperLUStat_t *stat) +{ + return (stat->ops[FACT]); +} + +flops_t +LUSolveFlops(SuperLUStat_t *stat) +{ + return (stat->ops[SOLVE]); +} + + + + + +/*! \brief Fills an integer array with a given value. + */ +void ifill(int *a, int alen, int ival) +{ + register int i; + for (i = 0; i < alen; i++) a[i] = ival; +} + + + +/*! \brief Get the statistics of the supernodes + */ +#define NBUCKS 10 +static int max_sup_size; + +void super_stats(int nsuper, int *xsup) +{ + register int nsup1 = 0; + int i, isize, whichb, bl, bh; + int bucket[NBUCKS]; + + max_sup_size = 0; + + for (i = 0; i <= nsuper; i++) { + isize = xsup[i+1] - xsup[i]; + if ( isize == 1 ) nsup1++; + if ( max_sup_size < isize ) max_sup_size = isize; + } + + printf(" Supernode statistics:\n\tno of super = %d\n", nsuper+1); + printf("\tmax supernode size = %d\n", max_sup_size); + printf("\tno of size 1 supernodes = %d\n", nsup1); + + /* Histogram of the supernode sizes */ + ifill (bucket, NBUCKS, 0); + + for (i = 0; i <= nsuper; i++) { + isize = xsup[i+1] - xsup[i]; + whichb = (float) isize / max_sup_size * NBUCKS; + if (whichb >= NBUCKS) whichb = NBUCKS - 1; + bucket[whichb]++; + } + + printf("\tHistogram of supernode sizes:\n"); + for (i = 0; i < NBUCKS; i++) { + bl = (float) i * max_sup_size / NBUCKS; + bh = (float) (i+1) * max_sup_size / NBUCKS; + printf("\tsnode: %d-%d\t\t%d\n", bl+1, bh, bucket[i]); + } + +} + + +float SpaSize(int n, int np, float sum_npw) +{ + return (sum_npw*8 + np*8 + n*4)/1024.; +} + +float DenseSize(int n, float sum_nw) +{ + return (sum_nw*8 + n*8)/1024.;; +} + + + +/*! \brief Check whether repfnz[] == EMPTY after reset. + */ +void check_repfnz(int n, int w, int jcol, int *repfnz) +{ + int jj, k; + + for (jj = jcol; jj < jcol+w; jj++) + for (k = 0; k < n; k++) + if ( repfnz[(jj-jcol)*n + k] != EMPTY ) { + fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj, + k, repfnz[(jj-jcol)*n + k]); + ABORT_SuperLU("check_repfnz"); + } +} + + +/*! \brief Print a summary of the testing results. */ +void +PrintSumm(char *type, int nfail, int nrun, int nerrs) +{ + if ( nfail > 0 ) + printf("%3s driver: %d out of %d tests failed to pass the threshold\n", + type, nfail, nrun); + else + printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun); + + if ( nerrs > 0 ) + printf("%6d error messages recorded\n", nerrs); +} + + +int print_int_vec(char *what, int n, int *vec) +{ + int i; + printf("%s\n", what); + for (i = 0; i < n; ++i) printf("%d\t%d\n", i, vec[i]); + return 0; +} + +int slu_PrintInt10(char *name, int len, int *x) +{ + register int i; + + printf("%10s:", name); + for (i = 0; i < len; ++i) + { + if ( i % 10 == 0 ) printf("\n\t[%2d-%2d]", i, i + 9); + printf("%6d", x[i]); + } + printf("\n"); + return 0; +} + + diff --git a/src/maths/SuperLU/xerbla.c b/src/maths/SuperLU/xerbla.c new file mode 100644 index 000000000..b00a69cfe --- /dev/null +++ b/src/maths/SuperLU/xerbla.c @@ -0,0 +1,43 @@ +#include +#include + +/* Subroutine */ int xerbla_(char *srname, int *info) +{ +/* -- LAPACK auxiliary routine (version 2.0) -- + Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., + Courant Institute, Argonne National Lab, and Rice University + September 30, 1994 + + + Purpose + ======= + + XERBLA is an error handler for the LAPACK routines. + It is called by an LAPACK routine if an input parameter has an + invalid value. A message is printed and execution stops. + + Installers may consider modifying the STOP statement in order to + call system-specific exception-handling facilities. + + Arguments + ========= + + SRNAME (input) CHARACTER*6 + The name of the routine which called XERBLA. + + INFO (input) INT + The position of the invalid parameter in the parameter list + + of the calling routine. + + ===================================================================== +*/ + + printf("** On entry to %6s, parameter number %2d had an illegal value\n", + srname, *info); + +/* End of XERBLA */ + + return 0; +} /* xerbla_ */ + diff --git a/src/maths/SuperLU/zcolumn_bmod.c b/src/maths/SuperLU/zcolumn_bmod.c new file mode 100644 index 000000000..68c0b020b --- /dev/null +++ b/src/maths/SuperLU/zcolumn_bmod.c @@ -0,0 +1,367 @@ + +/*! @file zcolumn_bmod.c + * \brief performs numeric block updates + * + * + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+*/ + +#include+#include +#include + +/* + * Function prototypes + */ +void zusolve(int, int, doublecomplex*, doublecomplex*); +void zlsolve(int, int, doublecomplex*, doublecomplex*); +void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*); + + + +/*! \brief + * + * + * Purpose: + * ======== + * Performs numeric block updates (sup-col) in topological order. + * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. + * Special processing on the supernodal portion of L\U[*,j] + * Return value: 0 - successful return + * > 0 - number of bytes allocated when run out of space + *+ */ +int +zcolumn_bmod ( + const int jcol, /* in */ + const int nseg, /* in */ + doublecomplex *dense, /* in */ + doublecomplex *tempv, /* working array */ + int *segrep, /* in */ + int *repfnz, /* in */ + int fpanelc, /* in -- first column in the current panel */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ + +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + doublecomplex alpha, beta; + + /* krep = representative of current k-th supernode + * fsupc = first supernodal column + * nsupc = no of columns in supernode + * nsupr = no of rows in supernode (used as leading dimension) + * luptr = location of supernodal LU-block in storage + * kfnz = first nonz in the k-th supernodal segment + * no_zeros = no of leading zeros in a supernodal U-segment + */ + doublecomplex ukj, ukj1, ukj2; + int luptr, luptr1, luptr2; + int fsupc, nsupc, nsupr, segsze; + int nrow; /* No of rows in the matrix of matrix-vector */ + int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; + register int lptr, kfnz, isub, irow, i; + register int no_zeros, new_next; + int ufirst, nextlu; + int fst_col; /* First column within small LU update */ + int d_fsupc; /* Distance between the first column of the current + panel and the first column of the current snode. */ + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + int nzlumax; + doublecomplex *tempv1; + doublecomplex zero = {0.0, 0.0}; + doublecomplex one = {1.0, 0.0}; + doublecomplex none = {-1.0, 0.0}; + doublecomplex comp_temp, comp_temp1; + int mem_error; + flops_t *ops = stat->ops; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + nzlumax = Glu->nzlumax; + jcolp1 = jcol + 1; + jsupno = supno[jcol]; + + /* + * For each nonz supernode segment of U[*,j] in topological order + */ + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + + krep = segrep[k]; + k--; + ksupno = supno[krep]; + if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ + + fsupc = xsup[ksupno]; + fst_col = SUPERLU_MAX ( fsupc, fpanelc ); + + /* Distance from the current supernode to the current panel; + d_fsupc=0 if fsupc > fpanelc. */ + d_fsupc = fst_col - fsupc; + + luptr = xlusup[fst_col] + d_fsupc; + lptr = xlsub[fsupc] + d_fsupc; + + kfnz = repfnz[krep]; + kfnz = SUPERLU_MAX ( kfnz, fpanelc ); + + segsze = krep - kfnz + 1; + nsupc = krep - fst_col + 1; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ + nrow = nsupr - d_fsupc - nsupc; + krep_ind = lptr + nsupc - 1; + + ops[TRSV] += 4 * segsze * (segsze - 1); + ops[GEMV] += 8 * nrow * segsze; + + + + /* + * Case 1: Update U-segment of size 1 -- col-col update + */ + if ( segsze == 1 ) { + ukj = dense[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + z_sub(&dense[irow], &dense[irow], &comp_temp); + luptr++; + } + + } else if ( segsze <= 3 ) { + ukj = dense[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + ukj1 = dense[lsub[krep_ind - 1]]; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { /* Case 2: 2cols-col update */ + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + z_sub(&ukj, &ukj, &comp_temp); + dense[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; + luptr1++; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense[irow], &dense[irow], &comp_temp); + } + } else { /* Case 3: 3cols-col update */ + ukj2 = dense[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); + z_sub(&ukj1, &ukj1, &comp_temp); + + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&ukj, &ukj, &comp_temp); + + dense[lsub[krep_ind]] = ukj; + dense[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; + luptr1++; + luptr2++; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense[irow], &dense[irow], &comp_temp); + } + } + + + } else { + /* + * Case: sup-col update + * Perform a triangular solve and block update, + * then scatter the result of sup-col update to dense + */ + + no_zeros = kfnz - fst_col; + + /* Copy U[*,j] segment from dense[*] to tempv[*] */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + tempv[i] = dense[irow]; + ++isub; + } + + /* Dense triangular solve -- start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#else + ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#endif + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + alpha = one; + beta = zero; +#ifdef _CRAY + CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#else + zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#endif +#else + zlsolve ( nsupr, segsze, &lusup[luptr], tempv ); + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + zmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); +#endif + + + /* Scatter tempv[] into SPA dense[] as a temporary storage */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense[irow] = tempv[i]; + tempv[i] = zero; + ++isub; + } + + /* Scatter tempv1[] into SPA dense[] */ + for (i = 0; i < nrow; i++) { + irow = lsub[isub]; + z_sub(&dense[irow], &dense[irow], &tempv1[i]); + tempv1[i] = zero; + ++isub; + } + } + + } /* if jsupno ... */ + + } /* for each segment... */ + + /* + * Process the supernodal portion of L\U[*,j] + */ + nextlu = xlusup[jcol]; + fsupc = xsup[jsupno]; + + /* Copy the SPA dense into L\U[*,j] */ + new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; + while ( new_next > nzlumax ) { + if (mem_error = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) + return (mem_error); + lusup = Glu->lusup; + lsub = Glu->lsub; + } + + for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { + irow = lsub[isub]; + lusup[nextlu] = dense[irow]; + dense[irow] = zero; + ++nextlu; + } + + xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ + + /* For more updates within the panel (also within the current supernode), + * should start from the first column of the panel, or the first column + * of the supernode, whichever is bigger. There are 2 cases: + * 1) fsupc < fpanelc, then fst_col := fpanelc + * 2) fsupc >= fpanelc, then fst_col := fsupc + */ + fst_col = SUPERLU_MAX ( fsupc, fpanelc ); + + if ( fst_col < jcol ) { + + /* Distance between the current supernode and the current panel. + d_fsupc=0 if fsupc >= fpanelc. */ + d_fsupc = fst_col - fsupc; + + lptr = xlsub[fsupc] + d_fsupc; + luptr = xlusup[fst_col] + d_fsupc; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ + nsupc = jcol - fst_col; /* Excluding jcol */ + nrow = nsupr - d_fsupc - nsupc; + + /* Points to the beginning of jcol in snode L\U(jsupno) */ + ufirst = xlusup[jcol] + d_fsupc; + + ops[TRSV] += 4 * nsupc * (nsupc - 1); + ops[GEMV] += 8 * nrow * nsupc; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], + &nsupr, &lusup[ufirst], &incx ); +#else + ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], + &nsupr, &lusup[ufirst], &incx ); +#endif + + alpha = none; beta = one; /* y := beta*y + alpha*A*x */ + +#ifdef _CRAY + CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#else + zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#endif +#else + zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); + + zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], + &lusup[ufirst], tempv ); + + /* Copy updates from tempv[*] into lusup[*] */ + isub = ufirst + nsupc; + for (i = 0; i < nrow; i++) { + z_sub(&lusup[isub], &lusup[isub], &tempv[i]); + tempv[i] = zero; + ++isub; + } + +#endif + + + } /* if fst_col < jcol ... */ + + return 0; +} diff --git a/src/maths/SuperLU/zcolumn_dfs.c b/src/maths/SuperLU/zcolumn_dfs.c new file mode 100644 index 000000000..e7e5eaec0 --- /dev/null +++ b/src/maths/SuperLU/zcolumn_dfs.c @@ -0,0 +1,275 @@ + +/*! @file zcolumn_dfs.c + * \brief Performs a symbolic factorization + * + *+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+*/ + +#include+ +/*! \brief What type of supernodes we want */ +#define T2_SUPER + + +/*! \brief + * + * + * Purpose + * ======= + * ZCOLUMN_DFS performs a symbolic factorization on column jcol, and + * decide the supernode boundary. + * + * This routine does not use numeric values, but only use the RHS + * row indices to start the dfs. + * + * A supernode representative is the last column of a supernode. + * The nonzeros in U[*,j] are segments that end at supernodal + * representatives. The routine returns a list of such supernodal + * representatives in topological order of the dfs that generates them. + * The location of the first nonzero in each such supernodal segment + * (supernodal entry location) is also returned. + * + * Local parameters + * ================ + * nseg: no of segments in current U[*,j] + * jsuper: jsuper=EMPTY if column j does not belong to the same + * supernode as j-1. Otherwise, jsuper=nsuper. + * + * marker2: A-row --> A-row/col (0/1) + * repfnz: SuperA-col --> PA-row + * parent: SuperA-col --> SuperA-col + * xplore: SuperA-col --> index to L-structure + * + * Return value + * ============ + * 0 success; + * > 0 number of bytes allocated when run out of space. + *+ */ +int +zcolumn_dfs( + const int m, /* in - number of rows in the matrix */ + const int jcol, /* in */ + int *perm_r, /* in */ + int *nseg, /* modified - with new segments appended */ + int *lsub_col, /* in - defines the RHS vector to start the dfs */ + int *segrep, /* modified - with new segments appended */ + int *repfnz, /* modified */ + int *xprune, /* modified */ + int *marker, /* modified */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + int jcolp1, jcolm1, jsuper, nsuper, nextl; + int k, krep, krow, kmark, kperm; + int *marker2; /* Used for small panel LU */ + int fsupc; /* First column of a snode */ + int myfnz; /* First nonz column of a U-segment */ + int chperm, chmark, chrep, kchild; + int xdfs, maxdfs, kpar, oldrep; + int jptr, jm1ptr; + int ito, ifrom, istop; /* Used to compress row subscripts */ + int mem_error; + int *xsup, *supno, *lsub, *xlsub; + int nzlmax; + static int first = 1, maxsuper; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + if ( first ) { + maxsuper = sp_ienv(3); + first = 0; + } + jcolp1 = jcol + 1; + jcolm1 = jcol - 1; + nsuper = supno[jcol]; + jsuper = nsuper; + nextl = xlsub[jcol]; + marker2 = &marker[2*m]; + + + /* For each nonzero in A[*,jcol] do dfs */ + for (k = 0; lsub_col[k] != EMPTY; k++) { + + krow = lsub_col[k]; + lsub_col[k] = EMPTY; + kmark = marker2[krow]; + + /* krow was visited before, go to the next nonz */ + if ( kmark == jcol ) continue; + + /* For each unmarked nbr krow of jcol + * krow is in L: place it in structure of L[*,jcol] + */ + marker2[krow] = jcol; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + lsub[nextl++] = krow; /* krow is indexed into A */ + if ( nextl >= nzlmax ) { + if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ + } else { + /* krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz[krep]; + + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > kperm ) repfnz[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz[krep] = kperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker2[kchild]; + + if ( chmark != jcol ) { /* Not reached yet */ + marker2[kchild] = jcol; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,k] */ + if ( chperm == EMPTY ) { + lsub[nextl++] = kchild; + if ( nextl >= nzlmax ) { + if ( mem_error = + zLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( chmark != jcolm1 ) jsuper = EMPTY; + } else { + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz[chrep]; + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz[chrep] = chperm; + } else { + /* Continue dfs at super-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L^t) */ + parent[krep] = oldrep; + repfnz[krep] = chperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + } /* else */ + + } /* else */ + + } /* if */ + + } /* while */ + + /* krow has no more unexplored nbrs; + * place supernode-rep krep in postorder DFS. + * backtrack dfs to its parent + */ + segrep[*nseg] = krep; + ++(*nseg); + kpar = parent[krep]; /* Pop from stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xprune[krep]; + + } while ( kpar != EMPTY ); /* Until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonzero ... */ + + /* Check to see if j belongs in the same supernode as j-1 */ + if ( jcol == 0 ) { /* Do nothing for column 0 */ + nsuper = supno[0] = 0; + } else { + fsupc = xsup[nsuper]; + jptr = xlsub[jcol]; /* Not compressed yet */ + jm1ptr = xlsub[jcolm1]; + +#ifdef T2_SUPER + if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; +#endif + /* Make sure the number of columns in a supernode doesn't + exceed threshold. */ + if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; + + /* If jcol starts a new supernode, reclaim storage space in + * lsub from the previous supernode. Note we only store + * the subscript set of the first and last columns of + * a supernode. (first for num values, last for pruning) + */ + if ( jsuper == EMPTY ) { /* starts a new supernode */ + if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */ +#ifdef CHK_COMPRESS + printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); +#endif + ito = xlsub[fsupc+1]; + xlsub[jcolm1] = ito; + istop = ito + jptr - jm1ptr; + xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */ + xlsub[jcol] = istop; + for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) + lsub[ito] = lsub[ifrom]; + nextl = ito; /* = istop + length(jcol) */ + } + nsuper++; + supno[jcol] = nsuper; + } /* if a new supernode */ + + } /* else: jcol > 0 */ + + /* Tidy up the pointers before exit */ + xsup[nsuper+1] = jcolp1; + supno[jcolp1] = nsuper; + xprune[jcol] = nextl; /* Initialize upper bound for pruning */ + xlsub[jcolp1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/zcopy_to_ucol.c b/src/maths/SuperLU/zcopy_to_ucol.c new file mode 100644 index 000000000..33e8251e4 --- /dev/null +++ b/src/maths/SuperLU/zcopy_to_ucol.c @@ -0,0 +1,103 @@ + +/*! @file zcopy_to_ucol.c + * \brief Copy a computed column of U to the compressed data structure + * + *+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + +#include+ +int +zcopy_to_ucol( + int jcol, /* in */ + int nseg, /* in */ + int *segrep, /* in */ + int *repfnz, /* in */ + int *perm_r, /* in */ + doublecomplex *dense, /* modified - reset to zero on return */ + GlobalLU_t *Glu /* modified */ + ) +{ +/* + * Gather from SPA dense[*] to global ucol[*]. + */ + int ksub, krep, ksupno; + int i, k, kfnz, segsze; + int fsupc, isub, irow; + int jsupno, nextu; + int new_next, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *ucol; + int *usub, *xusub; + int nzumax; + doublecomplex zero = {0.0, 0.0}; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + ucol = Glu->ucol; + usub = Glu->usub; + xusub = Glu->xusub; + nzumax = Glu->nzumax; + + jsupno = supno[jcol]; + nextu = xusub[jcol]; + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + krep = segrep[k--]; + ksupno = supno[krep]; + + if ( ksupno != jsupno ) { /* Should go into ucol[] */ + kfnz = repfnz[krep]; + if ( kfnz != EMPTY ) { /* Nonzero U-segment */ + + fsupc = xsup[ksupno]; + isub = xlsub[fsupc] + kfnz - fsupc; + segsze = krep - kfnz + 1; + + new_next = nextu + segsze; + while ( new_next > nzumax ) { + if (mem_error = zLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu)) + return (mem_error); + ucol = Glu->ucol; + if (mem_error = zLUMemXpand(jcol, nextu, USUB, &nzumax, Glu)) + return (mem_error); + usub = Glu->usub; + lsub = Glu->lsub; + } + + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + usub[nextu] = perm_r[irow]; + ucol[nextu] = dense[irow]; + dense[irow] = zero; + nextu++; + isub++; + } + + } + + } + + } /* for each segment... */ + + xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ + return 0; +} diff --git a/src/maths/SuperLU/zdiagonal.c b/src/maths/SuperLU/zdiagonal.c new file mode 100644 index 000000000..05b3a6daf --- /dev/null +++ b/src/maths/SuperLU/zdiagonal.c @@ -0,0 +1,133 @@ + +/*! @file zdiagonal.c + * \brief Auxiliary routines to work with diagonal elements + * + * + * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory + * June 30, 2009 + *+ */ + +#include+ +int zfill_diag(int n, NCformat *Astore) +/* fill explicit zeros on the diagonal entries, so that the matrix is not + structurally singular. */ +{ + doublecomplex *nzval = (doublecomplex *)Astore->nzval; + int *rowind = Astore->rowind; + int *colptr = Astore->colptr; + int nnz = colptr[n]; + int fill = 0; + doublecomplex *nzval_new; + doublecomplex zero = {1.0, 0.0}; + int *rowind_new; + int i, j, diag; + + for (i = 0; i < n; i++) + { + diag = -1; + for (j = colptr[i]; j < colptr[i + 1]; j++) + if (rowind[j] == i) diag = j; + if (diag < 0) fill++; + } + if (fill) + { + nzval_new = doublecomplexMalloc(nnz + fill); + rowind_new = intMalloc(nnz + fill); + fill = 0; + for (i = 0; i < n; i++) + { + diag = -1; + for (j = colptr[i] - fill; j < colptr[i + 1]; j++) + { + if ((rowind_new[j + fill] = rowind[j]) == i) diag = j; + nzval_new[j + fill] = nzval[j]; + } + if (diag < 0) + { + rowind_new[colptr[i + 1] + fill] = i; + nzval_new[colptr[i + 1] + fill] = zero; + fill++; + } + colptr[i + 1] += fill; + } + Astore->nzval = nzval_new; + Astore->rowind = rowind_new; + SUPERLU_FREE(nzval); + SUPERLU_FREE(rowind); + } + Astore->nnz += fill; + return fill; +} + +int zdominate(int n, NCformat *Astore) +/* make the matrix diagonally dominant */ +{ + doublecomplex *nzval = (doublecomplex *)Astore->nzval; + int *rowind = Astore->rowind; + int *colptr = Astore->colptr; + int nnz = colptr[n]; + int fill = 0; + doublecomplex *nzval_new; + int *rowind_new; + int i, j, diag; + double s; + + for (i = 0; i < n; i++) + { + diag = -1; + for (j = colptr[i]; j < colptr[i + 1]; j++) + if (rowind[j] == i) diag = j; + if (diag < 0) fill++; + } + if (fill) + { + nzval_new = doublecomplexMalloc(nnz + fill); + rowind_new = intMalloc(nnz+ fill); + fill = 0; + for (i = 0; i < n; i++) + { + s = 1e-6; + diag = -1; + for (j = colptr[i] - fill; j < colptr[i + 1]; j++) + { + if ((rowind_new[j + fill] = rowind[j]) == i) diag = j; + nzval_new[j + fill] = nzval[j]; + s += z_abs1(&nzval_new[j + fill]); + } + if (diag >= 0) { + nzval_new[diag+fill].r = s * 3.0; + nzval_new[diag+fill].i = 0.0; + } else { + rowind_new[colptr[i + 1] + fill] = i; + nzval_new[colptr[i + 1] + fill].r = s * 3.0; + nzval_new[colptr[i + 1] + fill].i = 0.0; + fill++; + } + colptr[i + 1] += fill; + } + Astore->nzval = nzval_new; + Astore->rowind = rowind_new; + SUPERLU_FREE(nzval); + SUPERLU_FREE(rowind); + } + else + { + for (i = 0; i < n; i++) + { + s = 1e-6; + diag = -1; + for (j = colptr[i]; j < colptr[i + 1]; j++) + { + if (rowind[j] == i) diag = j; + s += z_abs1(&nzval[j]); + } + nzval[diag].r = s * 3.0; + nzval[diag].i = 0.0; + } + } + Astore->nnz += fill; + return fill; +} diff --git a/src/maths/SuperLU/zgscon.c b/src/maths/SuperLU/zgscon.c new file mode 100644 index 000000000..52a3939a6 --- /dev/null +++ b/src/maths/SuperLU/zgscon.c @@ -0,0 +1,154 @@ + +/*! @file zgscon.c + * \brief Estimates reciprocal of the condition number of a general matrix + * + * + * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Modified from lapack routines ZGECON. + *+ */ + +/* + * File name: zgscon.c + * History: Modified from lapack routines ZGECON. + */ +#include+#include + +/*! \brief + * + * + * Purpose + * ======= + * + * ZGSCON estimates the reciprocal of the condition number of a general + * real matrix A, in either the 1-norm or the infinity-norm, using + * the LU factorization computed by ZGETRF. * + * + * An estimate is obtained for norm(inv(A)), and the reciprocal of the + * condition number is computed as + * RCOND = 1 / ( norm(A) * norm(inv(A)) ). + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * NORM (input) char* + * Specifies whether the 1-norm condition number or the + * infinity-norm condition number is required: + * = '1' or 'O': 1-norm; + * = 'I': Infinity-norm. + * + * L (input) SuperMatrix* + * The factor L from the factorization Pr*A*Pc=L*U as computed by + * zgstrf(). Use compressed row subscripts storage for supernodes, + * i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. + * + * U (input) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U as computed by + * zgstrf(). Use column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. + * + * ANORM (input) double + * If NORM = '1' or 'O', the 1-norm of the original matrix A. + * If NORM = 'I', the infinity-norm of the original matrix A. + * + * RCOND (output) double* + * The reciprocal of the condition number of the matrix A, + * computed as RCOND = 1/(norm(A) * norm(inv(A))). + * + * INFO (output) int* + * = 0: successful exit + * < 0: if INFO = -i, the i-th argument had an illegal value + * + * ===================================================================== + *+ */ + +void +zgscon(char *norm, SuperMatrix *L, SuperMatrix *U, + double anorm, double *rcond, SuperLUStat_t *stat, int *info) +{ + + + /* Local variables */ + int kase, kase1, onenrm, i; + double ainvnm; + doublecomplex *work; + extern int zrscl_(int *, doublecomplex *, doublecomplex *, int *); + + extern int zlacon_(int *, doublecomplex *, doublecomplex *, double *, int *); + + + /* Test the input parameters. */ + *info = 0; + onenrm = *(unsigned char *)norm == '1' || lsame_(norm, "O"); + if (! onenrm && ! lsame_(norm, "I")) *info = -1; + else if (L->nrow < 0 || L->nrow != L->ncol || + L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU) + *info = -2; + else if (U->nrow < 0 || U->nrow != U->ncol || + U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU) + *info = -3; + if (*info != 0) { + i = -(*info); + xerbla_("zgscon", &i); + return; + } + + /* Quick return if possible */ + *rcond = 0.; + if ( L->nrow == 0 || U->nrow == 0) { + *rcond = 1.; + return; + } + + work = doublecomplexCalloc( 3*L->nrow ); + + + if ( !work ) + ABORT_SuperLU("Malloc fails for work arrays in zgscon."); + + /* Estimate the norm of inv(A). */ + ainvnm = 0.; + if ( onenrm ) kase1 = 1; + else kase1 = 2; + kase = 0; + + do { + zlacon_(&L->nrow, &work[L->nrow], &work[0], &ainvnm, &kase); + + if (kase == 0) break; + + if (kase == kase1) { + /* Multiply by inv(L). */ + sp_ztrsv("L", "No trans", "Unit", L, U, &work[0], stat, info); + + /* Multiply by inv(U). */ + sp_ztrsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info); + + } else { + + /* Multiply by inv(U'). */ + sp_ztrsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info); + + /* Multiply by inv(L'). */ + sp_ztrsv("L", "Transpose", "Unit", L, U, &work[0], stat, info); + + } + + } while ( kase != 0 ); + + /* Compute the estimate of the reciprocal condition number. */ + if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm; + + SUPERLU_FREE (work); + return; + +} /* zgscon */ + diff --git a/src/maths/SuperLU/zgsequ.c b/src/maths/SuperLU/zgsequ.c new file mode 100644 index 000000000..2ca1e4abe --- /dev/null +++ b/src/maths/SuperLU/zgsequ.c @@ -0,0 +1,195 @@ + +/*! @file zgsequ.c + * \brief Computes row and column scalings + * + *+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Modified from LAPACK routine ZGEEQU + *+ */ +/* + * File name: zgsequ.c + * History: Modified from LAPACK routine ZGEEQU + */ +#include+#include + + + +/*! \brief + * + * + * Purpose + * ======= + * + * ZGSEQU computes row and column scalings intended to equilibrate an + * M-by-N sparse matrix A and reduce its condition number. R returns the row + * scale factors and C the column scale factors, chosen to try to make + * the largest element in each row and column of the matrix B with + * elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. + * + * R(i) and C(j) are restricted to be between SMLNUM = smallest safe + * number and BIGNUM = largest safe number. Use of these scaling + * factors is not guaranteed to reduce the condition number of A but + * works well in practice. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * A (input) SuperMatrix* + * The matrix of dimension (A->nrow, A->ncol) whose equilibration + * factors are to be computed. The type of A can be: + * Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. + * + * R (output) double*, size A->nrow + * If INFO = 0 or INFO > M, R contains the row scale factors + * for A. + * + * C (output) double*, size A->ncol + * If INFO = 0, C contains the column scale factors for A. + * + * ROWCND (output) double* + * If INFO = 0 or INFO > M, ROWCND contains the ratio of the + * smallest R(i) to the largest R(i). If ROWCND >= 0.1 and + * AMAX is neither too large nor too small, it is not worth + * scaling by R. + * + * COLCND (output) double* + * If INFO = 0, COLCND contains the ratio of the smallest + * C(i) to the largest C(i). If COLCND >= 0.1, it is not + * worth scaling by C. + * + * AMAX (output) double* + * Absolute value of largest matrix element. If AMAX is very + * close to overflow or very close to underflow, the matrix + * should be scaled. + * + * INFO (output) int* + * = 0: successful exit + * < 0: if INFO = -i, the i-th argument had an illegal value + * > 0: if INFO = i, and i is + * <= A->nrow: the i-th row of A is exactly zero + * > A->ncol: the (i-M)-th column of A is exactly zero + * + * ===================================================================== + *+ */ +void +zgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, + double *colcnd, double *amax, int *info) +{ + + + /* Local variables */ + NCformat *Astore; + doublecomplex *Aval; + int i, j, irow; + double rcmin, rcmax; + double bignum, smlnum; + extern double dlamch_(char *); + + /* Test the input parameters. */ + *info = 0; + if ( A->nrow < 0 || A->ncol < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) { + i = -(*info); + xerbla_("zgsequ", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || A->ncol == 0 ) { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + + /* Get machine constants. */ + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + + /* Compute row scale factors. */ + for (i = 0; i < A->nrow; ++i) r[i] = 0.; + + /* Find the maximum element in each row. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + r[irow] = SUPERLU_MAX( r[irow], z_abs1(&Aval[i]) ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (i = 0; i < A->nrow; ++i) { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); + } + *amax = rcmax; + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) { + *info = i + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* Compute column scale factors */ + for (j = 0; j < A->ncol; ++j) c[j] = 0.; + + /* Find the maximum element in each column, assuming the row + scalings computed above. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + c[j] = SUPERLU_MAX( c[j], z_abs1(&Aval[i]) * r[irow] ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->ncol; ++j) { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); + } + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) { + *info = A->nrow + j + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + return; + +} /* zgsequ */ + + diff --git a/src/maths/SuperLU/zgsisx.c b/src/maths/SuperLU/zgsisx.c new file mode 100644 index 000000000..aaee79f66 --- /dev/null +++ b/src/maths/SuperLU/zgsisx.c @@ -0,0 +1,727 @@ + +/*! @file zgsisx.c + * \brief Computes an approximate solutions of linear equations A*X=B or A'*X=B + * + *+ * -- SuperLU routine (version 4.2) -- + * Lawrence Berkeley National Laboratory. + * November, 2010 + * August, 2011 + *+ */ +#include+ +/*! \brief + * + * + * Purpose + * ======= + * + * ZGSISX computes an approximate solutions of linear equations + * A*X=B or A'*X=B, using the ILU factorization from zgsitrf(). + * An estimation of the condition number is provided. + * The routine performs the following steps: + * + * 1. If A is stored column-wise (A->Stype = SLU_NC): + * + * 1.1. If options->Equil = YES or options->RowPerm = LargeDiag, scaling + * factors are computed to equilibrate the system: + * options->Trans = NOTRANS: + * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B + * options->Trans = TRANS: + * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B + * options->Trans = CONJ: + * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B + * Whether or not the system will be equilibrated depends on the + * scaling of the matrix A, but if equilibration is used, A is + * overwritten by diag(R)*A*diag(C) and B by diag(R)*B + * (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans + * = TRANS or CONJ). + * + * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation + * matrix that usually preserves sparsity. + * For more details of this step, see sp_preorder.c. + * + * 1.3. If options->Fact != FACTORED, the LU decomposition is used to + * factor the matrix A (after equilibration if options->Equil = YES) + * as Pr*A*Pc = L*U, with Pr determined by partial pivoting. + * + * 1.4. Compute the reciprocal pivot growth factor. + * + * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the + * routine fills a small number on the diagonal entry, that is + * U(i,i) = ||A(:,i)||_oo * options->ILU_FillTol ** (1 - i / n), + * and info will be increased by 1. The factored form of A is used + * to estimate the condition number of the preconditioner. If the + * reciprocal of the condition number is less than machine precision, + * info = A->ncol+1 is returned as a warning, but the routine still + * goes on to solve for X. + * + * 1.6. The system of equations is solved for X using the factored form + * of A. + * + * 1.7. options->IterRefine is not used + * + * 1.8. If equilibration was used, the matrix X is premultiplied by + * diag(C) (if options->Trans = NOTRANS) or diag(R) + * (if options->Trans = TRANS or CONJ) so that it solves the + * original system before equilibration. + * + * 1.9. options for ILU only + * 1) If options->RowPerm = LargeDiag, MC64 is used to scale and + * permute the matrix to an I-matrix, that is Pr*Dr*A*Dc has + * entries of modulus 1 on the diagonal and off-diagonal entries + * of modulus at most 1. If MC64 fails, dgsequ() is used to + * equilibrate the system. + * ( Default: LargeDiag ) + * 2) options->ILU_DropTol = tau is the threshold for dropping. + * For L, it is used directly (for the whole row in a supernode); + * For U, ||A(:,i)||_oo * tau is used as the threshold + * for the i-th column. + * If a secondary dropping rule is required, tau will + * also be used to compute the second threshold. + * ( Default: 1e-4 ) + * 3) options->ILU_FillFactor = gamma, used as the initial guess + * of memory growth. + * If a secondary dropping rule is required, it will also + * be used as an upper bound of the memory. + * ( Default: 10 ) + * 4) options->ILU_DropRule specifies the dropping rule. + * Option Meaning + * ====== =========== + * DROP_BASIC: Basic dropping rule, supernodal based ILUTP(tau). + * DROP_PROWS: Supernodal based ILUTP(p,tau), p = gamma*nnz(A)/n. + * DROP_COLUMN: Variant of ILUTP(p,tau), for j-th column, + * p = gamma * nnz(A(:,j)). + * DROP_AREA: Variation of ILUTP, for j-th column, use + * nnz(F(:,1:j)) / nnz(A(:,1:j)) to control memory. + * DROP_DYNAMIC: Modify the threshold tau during factorizaion: + * If nnz(L(:,1:j)) / nnz(A(:,1:j)) > gamma + * tau_L(j) := MIN(tau_0, tau_L(j-1) * 2); + * Otherwise + * tau_L(j) := MAX(tau_0, tau_L(j-1) / 2); + * tau_U(j) uses the similar rule. + * NOTE: the thresholds used by L and U are separate. + * DROP_INTERP: Compute the second dropping threshold by + * interpolation instead of sorting (default). + * In this case, the actual fill ratio is not + * guaranteed smaller than gamma. + * DROP_PROWS, DROP_COLUMN and DROP_AREA are mutually exclusive. + * ( Default: DROP_BASIC | DROP_AREA ) + * 5) options->ILU_Norm is the criterion of measuring the magnitude + * of a row in a supernode of L. ( Default is INF_NORM ) + * options->ILU_Norm RowSize(x[1:n]) + * ================= =============== + * ONE_NORM ||x||_1 / n + * TWO_NORM ||x||_2 / sqrt(n) + * INF_NORM max{|x[i]|} + * 6) options->ILU_MILU specifies the type of MILU's variation. + * = SILU: do not perform Modified ILU; + * = SMILU_1 (not recommended): + * U(i,i) := U(i,i) + sum(dropped entries); + * = SMILU_2: + * U(i,i) := U(i,i) + SGN(U(i,i)) * sum(dropped entries); + * = SMILU_3: + * U(i,i) := U(i,i) + SGN(U(i,i)) * sum(|dropped entries|); + * NOTE: Even SMILU_1 does not preserve the column sum because of + * late dropping. + * ( Default: SILU ) + * 7) options->ILU_FillTol is used as the perturbation when + * encountering zero pivots. If some U(i,i) = 0, so that U is + * exactly singular, then + * U(i,i) := ||A(:,i)|| * options->ILU_FillTol ** (1 - i / n). + * ( Default: 1e-2 ) + * + * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm + * to the transpose of A: + * + * 2.1. If options->Equil = YES or options->RowPerm = LargeDiag, scaling + * factors are computed to equilibrate the system: + * options->Trans = NOTRANS: + * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B + * options->Trans = TRANS: + * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B + * options->Trans = CONJ: + * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B + * Whether or not the system will be equilibrated depends on the + * scaling of the matrix A, but if equilibration is used, A' is + * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B + * (if trans='N') or diag(C)*B (if trans = 'T' or 'C'). + * + * 2.2. Permute columns of transpose(A) (rows of A), + * forming transpose(A)*Pc, where Pc is a permutation matrix that + * usually preserves sparsity. + * For more details of this step, see sp_preorder.c. + * + * 2.3. If options->Fact != FACTORED, the LU decomposition is used to + * factor the transpose(A) (after equilibration if + * options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the + * permutation Pr determined by partial pivoting. + * + * 2.4. Compute the reciprocal pivot growth factor. + * + * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the + * routine fills a small number on the diagonal entry, that is + * U(i,i) = ||A(:,i)||_oo * options->ILU_FillTol ** (1 - i / n). + * And info will be increased by 1. The factored form of A is used + * to estimate the condition number of the preconditioner. If the + * reciprocal of the condition number is less than machine precision, + * info = A->ncol+1 is returned as a warning, but the routine still + * goes on to solve for X. + * + * 2.6. The system of equations is solved for X using the factored form + * of transpose(A). + * + * 2.7. If options->IterRefine is not used. + * + * 2.8. If equilibration was used, the matrix X is premultiplied by + * diag(C) (if options->Trans = NOTRANS) or diag(R) + * (if options->Trans = TRANS or CONJ) so that it solves the + * original system before equilibration. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the LU decomposition will be performed and how the + * system will be solved. + * + * A (input/output) SuperMatrix* + * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number + * of the linear equations is A->nrow. Currently, the type of A can be: + * Stype = SLU_NC or SLU_NR, Dtype = SLU_Z, Mtype = SLU_GE. + * In the future, more general A may be handled. + * + * On entry, If options->Fact = FACTORED and equed is not 'N', + * then A must have been equilibrated by the scaling factors in + * R and/or C. + * On exit, A is not modified + * if options->Equil = NO, or + * if options->Equil = YES but equed = 'N' on exit, or + * if options->RowPerm = NO. + * + * Otherwise, if options->Equil = YES and equed is not 'N', + * A is scaled as follows: + * If A->Stype = SLU_NC: + * equed = 'R': A := diag(R) * A + * equed = 'C': A := A * diag(C) + * equed = 'B': A := diag(R) * A * diag(C). + * If A->Stype = SLU_NR: + * equed = 'R': transpose(A) := diag(R) * transpose(A) + * equed = 'C': transpose(A) := transpose(A) * diag(C) + * equed = 'B': transpose(A) := diag(R) * transpose(A) * diag(C). + * + * If options->RowPerm = LargeDiag, MC64 is used to scale and permute + * the matrix to an I-matrix, that is A is modified as follows: + * P*Dr*A*Dc has entries of modulus 1 on the diagonal and + * off-diagonal entries of modulus at most 1. P is a permutation + * obtained from MC64. + * If MC64 fails, zgsequ() is used to equilibrate the system, + * and A is scaled as above, but no permutation is involved. + * On exit, A is restored to the orginal row numbering, so + * Dr*A*Dc is returned. + * + * perm_c (input/output) int* + * If A->Stype = SLU_NC, Column permutation vector of size A->ncol, + * which defines the permutation matrix Pc; perm_c[i] = j means + * column i of A is in position j in A*Pc. + * On exit, perm_c may be overwritten by the product of the input + * perm_c and a permutation that postorders the elimination tree + * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree + * is already in postorder. + * + * If A->Stype = SLU_NR, column permutation vector of size A->nrow, + * which describes permutation of columns of transpose(A) + * (rows of A) as described above. + * + * perm_r (input/output) int* + * If A->Stype = SLU_NC, row permutation vector of size A->nrow, + * which defines the permutation matrix Pr, and is determined + * by MC64 first then followed by partial pivoting. + * perm_r[i] = j means row i of A is in position j in Pr*A. + * + * If A->Stype = SLU_NR, permutation vector of size A->ncol, which + * determines permutation of rows of transpose(A) + * (columns of A) as described above. + * + * If options->Fact = SamePattern_SameRowPerm, the pivoting routine + * will try to use the input perm_r, unless a certain threshold + * criterion is violated. In that case, perm_r is overwritten by a + * new permutation determined by partial pivoting or diagonal + * threshold pivoting. + * Otherwise, perm_r is output argument. + * + * etree (input/output) int*, dimension (A->ncol) + * Elimination tree of Pc'*A'*A*Pc. + * If options->Fact != FACTORED and options->Fact != DOFACT, + * etree is an input argument, otherwise it is an output argument. + * Note: etree is a vector of parent pointers for a forest whose + * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. + * + * equed (input/output) char* + * Specifies the form of equilibration that was done. + * = 'N': No equilibration. + * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). + * = 'C': Column equilibration, i.e., A was postmultiplied by diag(C). + * = 'B': Both row and column equilibration, i.e., A was replaced + * by diag(R)*A*diag(C). + * If options->Fact = FACTORED, equed is an input argument, + * otherwise it is an output argument. + * + * R (input/output) double*, dimension (A->nrow) + * The row scale factors for A or transpose(A). + * If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A) + * (if A->Stype = SLU_NR) is multiplied on the left by diag(R). + * If equed = 'N' or 'C', R is not accessed. + * If options->Fact = FACTORED, R is an input argument, + * otherwise, R is output. + * If options->Fact = FACTORED and equed = 'R' or 'B', each element + * of R must be positive. + * + * C (input/output) double*, dimension (A->ncol) + * The column scale factors for A or transpose(A). + * If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A) + * (if A->Stype = SLU_NR) is multiplied on the right by diag(C). + * If equed = 'N' or 'R', C is not accessed. + * If options->Fact = FACTORED, C is an input argument, + * otherwise, C is output. + * If options->Fact = FACTORED and equed = 'C' or 'B', each element + * of C must be positive. + * + * L (output) SuperMatrix* + * The factor L from the factorization + * Pr*A*Pc=L*U (if A->Stype SLU_= NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses compressed row subscripts storage for supernodes, i.e., + * L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization + * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. + * + * work (workspace/output) void*, size (lwork) (in bytes) + * User supplied workspace, should be large enough + * to hold data structures for factors L and U. + * On exit, if fact is not 'F', L and U point to this array. + * + * lwork (input) int + * Specifies the size of work array in bytes. + * = 0: allocate space internally by system malloc; + * > 0: use user-supplied work array of length lwork in bytes, + * returns error if space runs out. + * = -1: the routine guesses the amount of space needed without + * performing the factorization, and returns it in + * mem_usage->total_needed; no other side effects. + * + * See argument 'mem_usage' for memory usage statistics. + * + * B (input/output) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. + * On entry, the right hand side matrix. + * If B->ncol = 0, only LU decomposition is performed, the triangular + * solve is skipped. + * On exit, + * if equed = 'N', B is not modified; otherwise + * if A->Stype = SLU_NC: + * if options->Trans = NOTRANS and equed = 'R' or 'B', + * B is overwritten by diag(R)*B; + * if options->Trans = TRANS or CONJ and equed = 'C' of 'B', + * B is overwritten by diag(C)*B; + * if A->Stype = SLU_NR: + * if options->Trans = NOTRANS and equed = 'C' or 'B', + * B is overwritten by diag(C)*B; + * if options->Trans = TRANS or CONJ and equed = 'R' of 'B', + * B is overwritten by diag(R)*B. + * + * X (output) SuperMatrix* + * X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. + * If info = 0 or info = A->ncol+1, X contains the solution matrix + * to the original system of equations. Note that A and B are modified + * on exit if equed is not 'N', and the solution to the equilibrated + * system is inv(diag(C))*X if options->Trans = NOTRANS and + * equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C' + * and equed = 'R' or 'B'. + * + * recip_pivot_growth (output) double* + * The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ). + * The infinity norm is used. If recip_pivot_growth is much less + * than 1, the stability of the LU factorization could be poor. + * + * rcond (output) double* + * The estimate of the reciprocal condition number of the matrix A + * after equilibration (if done). If rcond is less than the machine + * precision (in particular, if rcond = 0), the matrix is singular + * to working precision. This condition is indicated by a return + * code of info > 0. + * + * mem_usage (output) mem_usage_t* + * Record the memory usage statistics, consisting of following fields: + * - for_lu (float) + * The amount of space used in bytes for L\U data structures. + * - total_needed (float) + * The amount of space needed in bytes to perform factorization. + * - expansions (int) + * The number of memory expansions during the LU factorization. + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See slu_util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + * > 0: if info = i, and i is + * <= A->ncol: number of zero pivots. They are replaced by small + * entries due to options->ILU_FillTol. + * = A->ncol+1: U is nonsingular, but RCOND is less than machine + * precision, meaning that the matrix is singular to + * working precision. Nevertheless, the solution and + * error bounds are computed because there are a number + * of situations where the computed solution can be more + * accurate than the value of RCOND would suggest. + * > A->ncol+1: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. + *+ */ + +void +zgsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + int *etree, char *equed, double *R, double *C, + SuperMatrix *L, SuperMatrix *U, void *work, int lwork, + SuperMatrix *B, SuperMatrix *X, + double *recip_pivot_growth, double *rcond, + mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info) +{ + + DNformat *Bstore, *Xstore; + doublecomplex *Bmat, *Xmat; + int ldb, ldx, nrhs, n; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int colequ, equil, nofact, notran, rowequ, permc_spec, mc64; + trans_t trant; + char norm[1]; + int i, j, info1; + double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; + int relax, panel_size; + double diag_pivot_thresh; + double t0; /* temporary time */ + double *utime; + + int *perm = NULL; /* permutation returned from MC64 */ + + /* External functions */ + extern double zlangs(char *, SuperMatrix *); + + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + n = B->nrow; + + *info = 0; + nofact = (options->Fact != FACTORED); + equil = (options->Equil == YES_SuperLU); + notran = (options->Trans == NOTRANS); + mc64 = (options->RowPerm == LargeDiag); + if ( nofact ) { + *(unsigned char *)equed = 'N'; + rowequ = FALSE; + colequ = FALSE; + } else { + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + smlnum = dlamch_("Safe minimum"); + bignum = 1. / smlnum; + } + + /* Test the input parameters */ + if (options->Fact != DOFACT && options->Fact != SamePattern && + options->Fact != SamePattern_SameRowPerm && + options->Fact != FACTORED && + options->Trans != NOTRANS && options->Trans != TRANS && + options->Trans != CONJ && + options->Equil != NO_SuperLU && options->Equil != YES_SuperLU) + *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_Z || A->Mtype != SLU_GE ) + *info = -2; + else if (options->Fact == FACTORED && + !(rowequ || colequ || lsame_(equed, "N"))) + *info = -6; + else { + if (rowequ) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, R[j]); + rcmax = SUPERLU_MAX(rcmax, R[j]); + } + if (rcmin <= 0.) *info = -7; + else if ( A->nrow > 0) + rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else rowcnd = 1.; + } + if (colequ && *info == 0) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, C[j]); + rcmax = SUPERLU_MAX(rcmax, C[j]); + } + if (rcmin <= 0.) *info = -8; + else if (A->nrow > 0) + colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else colcnd = 1.; + } + if (*info == 0) { + if ( lwork < -1 ) *info = -12; + else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_Z || + B->Mtype != SLU_GE ) + *info = -13; + else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || + (B->ncol != 0 && B->ncol != X->ncol) || + X->Stype != SLU_DN || + X->Dtype != SLU_Z || X->Mtype != SLU_GE ) + *info = -14; + } + } + if (*info != 0) { + i = -(*info); + xerbla_("zgsisx", &i); + return; + } + + /* Initialization for factor parameters */ + panel_size = sp_ienv(1); + relax = sp_ienv(2); + diag_pivot_thresh = options->DiagPivotThresh; + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + if ( notran ) { /* Reverse the transpose argument. */ + trant = TRANS; + notran = 0; + } else { + trant = NOTRANS; + notran = 1; + } + } else { /* A->Stype == SLU_NC */ + trant = options->Trans; + AA = A; + } + + if ( nofact ) { + register int i, j; + NCformat *Astore = AA->Store; + int nnz = Astore->nnz; + int *colptr = Astore->colptr; + int *rowind = Astore->rowind; + doublecomplex *nzval = (doublecomplex *)Astore->nzval; + + if ( mc64 ) { + t0 = SuperLU_timer_(); + if ((perm = intMalloc(n)) == NULL) + ABORT_SuperLU("SUPERLU_MALLOC fails for perm[]"); + + info1 = zldperm(5, n, nnz, colptr, rowind, nzval, perm, R, C); + + if (info1 != 0) { /* MC64 fails, call zgsequ() later */ + mc64 = 0; + SUPERLU_FREE(perm); + perm = NULL; + } else { + if ( equil ) { + rowequ = colequ = 1; + for (i = 0; i < n; i++) { + R[i] = exp(R[i]); + C[i] = exp(C[i]); + } + /* scale the matrix */ + for (j = 0; j < n; j++) { + for (i = colptr[j]; i < colptr[j + 1]; i++) { + zd_mult(&nzval[i], &nzval[i], R[rowind[i]] * C[j]); + } + } + *equed = 'B'; + } + + /* permute the matrix */ + for (j = 0; j < n; j++) { + for (i = colptr[j]; i < colptr[j + 1]; i++) { + /*nzval[i] *= R[rowind[i]] * C[j];*/ + rowind[i] = perm[rowind[i]]; + } + } + } + utime[EQUIL] = SuperLU_timer_() - t0; + } + + if ( !mc64 & equil ) { /* Only perform equilibration, no row perm */ + t0 = SuperLU_timer_(); + /* Compute row and column scalings to equilibrate the matrix A. */ + zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); + + if ( info1 == 0 ) { + /* Equilibrate matrix A. */ + zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + } + utime[EQUIL] = SuperLU_timer_() - t0; + } + } + + + if ( nofact ) { + + t0 = SuperLU_timer_(); + /* + * Gnet column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t0; + + t0 = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t0; + + /* Compute the LU factorization of A*Pc. */ + t0 = SuperLU_timer_(); + zgsitrf(options, &AC, relax, panel_size, etree, work, lwork, + perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t0; + + if ( lwork == -1 ) { + mem_usage->total_needed = *info - A->ncol; + return; + } + + if ( mc64 ) { /* Fold MC64's perm[] into perm_r[]. */ + NCformat *Astore = AA->Store; + int nnz = Astore->nnz, *rowind = Astore->rowind; + int *perm_tmp, *iperm; + if ((perm_tmp = intMalloc(2*n)) == NULL) + ABORT_SuperLU("SUPERLU_MALLOC fails for perm_tmp[]"); + iperm = perm_tmp + n; + for (i = 0; i < n; ++i) perm_tmp[i] = perm_r[perm[i]]; + for (i = 0; i < n; ++i) { + perm_r[i] = perm_tmp[i]; + iperm[perm[i]] = i; + } + + /* Restore A's original row indices. */ + for (i = 0; i < nnz; ++i) rowind[i] = iperm[rowind[i]]; + + SUPERLU_FREE(perm); /* MC64 permutation */ + SUPERLU_FREE(perm_tmp); + } + } + + if ( options->PivotGrowth ) { + if ( *info > 0 ) return; + + /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ + *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U); + } + + if ( options->ConditionNumber ) { + /* Estimate the reciprocal of the condition number of A. */ + t0 = SuperLU_timer_(); + if ( notran ) { + *(unsigned char *)norm = '1'; + } else { + *(unsigned char *)norm = 'I'; + } + anorm = zlangs(norm, AA); + zgscon(norm, L, U, anorm, rcond, stat, &info1); + utime[RCOND] = SuperLU_timer_() - t0; + } + + if ( nrhs > 0 ) { /* Solve the system */ + doublecomplex *rhs_work; + + /* Scale and permute the right-hand side if equilibration + and permutation from MC64 were performed. */ + if ( notran ) { + if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) + zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]); + } + } else if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) { + zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]); + } + } + + /* Compute the solution matrix X. */ + for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ + for (i = 0; i < B->nrow; i++) + Xmat[i + j*ldx] = Bmat[i + j*ldb]; + + t0 = SuperLU_timer_(); + zgstrs (trant, L, U, perm_c, perm_r, X, stat, &info1); + utime[SOLVE] = SuperLU_timer_() - t0; + + /* Transform the solution matrix X to a solution of the original + system. */ + if ( notran ) { + if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) { + zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]); + } + } + } else { /* transposed system */ + if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) { + zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]); + } + } + } + + } /* end if nrhs > 0 */ + + if ( options->ConditionNumber ) { + /* The matrix is singular to working precision. */ + if ( *rcond < dlamch_("E") && *info == 0) *info = A->ncol + 1; + } + + if ( nofact ) { + ilu_zQuerySpace(L, U, mem_usage); + Destroy_CompCol_Permuted(&AC); + } + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/zgsitrf.c b/src/maths/SuperLU/zgsitrf.c new file mode 100644 index 000000000..c8cc256ee --- /dev/null +++ b/src/maths/SuperLU/zgsitrf.c @@ -0,0 +1,637 @@ + +/*! @file zgsitrf.c + * \brief Computes an ILU factorization of a general sparse matrix + * + *+ * -- SuperLU routine (version 4.1) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + * + *+ */ + +#include+ +#ifdef DEBUG +int num_drop_L; +#endif + +/*! \brief + * + * + * Purpose + * ======= + * + * ZGSITRF computes an ILU factorization of a general sparse m-by-n + * matrix A using partial pivoting with row interchanges. + * The factorization has the form + * Pr * A = L * U + * where Pr is a row permutation matrix, L is lower triangular with unit + * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper + * triangular (upper trapezoidal if A->nrow < A->ncol). + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the ILU decomposition will be performed. + * + * A (input) SuperMatrix* + * Original matrix A, permuted by columns, of dimension + * (A->nrow, A->ncol). The type of A can be: + * Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. + * + * relax (input) int + * To control degree of relaxing supernodes. If the number + * of nodes (columns) in a subtree of the elimination tree is less + * than relax, this subtree is considered as one supernode, + * regardless of the row structures of those columns. + * + * panel_size (input) int + * A panel consists of at most panel_size consecutive columns. + * + * etree (input) int*, dimension (A->ncol) + * Elimination tree of A'*A. + * Note: etree is a vector of parent pointers for a forest whose + * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. + * On input, the columns of A should be permuted so that the + * etree is in a certain postorder. + * + * work (input/output) void*, size (lwork) (in bytes) + * User-supplied work space and space for the output data structures. + * Not referenced if lwork = 0; + * + * lwork (input) int + * Specifies the size of work array in bytes. + * = 0: allocate space internally by system malloc; + * > 0: use user-supplied work array of length lwork in bytes, + * returns error if space runs out. + * = -1: the routine guesses the amount of space needed without + * performing the factorization, and returns it in + * *info; no other side effects. + * + * perm_c (input) int*, dimension (A->ncol) + * Column permutation vector, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * When searching for diagonal, perm_c[*] is applied to the + * row subscripts of A, so that diagonal threshold pivoting + * can find the diagonal of A, rather than that of A*Pc. + * + * perm_r (input/output) int*, dimension (A->nrow) + * Row permutation vector which defines the permutation matrix Pr, + * perm_r[i] = j means row i of A is in position j in Pr*A. + * If options->Fact = SamePattern_SameRowPerm, the pivoting routine + * will try to use the input perm_r, unless a certain threshold + * criterion is violated. In that case, perm_r is overwritten by + * a new permutation determined by partial pivoting or diagonal + * threshold pivoting. + * Otherwise, perm_r is output argument; + * + * L (output) SuperMatrix* + * The factor L from the factorization Pr*A=L*U; use compressed row + * subscripts storage for supernodes, i.e., L has type: + * Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise + * storage scheme, i.e., U has types: Stype = SLU_NC, + * Dtype = SLU_Z, Mtype = SLU_TRU. + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See slu_util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + * > 0: if info = i, and i is + * <= A->ncol: number of zero pivots. They are replaced by small + * entries according to options->ILU_FillTol. + * > A->ncol: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. If lwork = -1, it is + * the estimated amount of space needed, plus A->ncol. + * + * ====================================================================== + * + * Local Working Arrays: + * ====================== + * m = number of rows in the matrix + * n = number of columns in the matrix + * + * marker[0:3*m-1]: marker[i] = j means that node i has been + * reached when working on column j. + * Storage: relative to original row subscripts + * NOTE: There are 4 of them: + * marker/marker1 are used for panel dfs, see (ilu_)dpanel_dfs.c; + * marker2 is used for inner-factorization, see (ilu)_dcolumn_dfs.c; + * marker_relax(has its own space) is used for relaxed supernodes. + * + * parent[0:m-1]: parent vector used during dfs + * Storage: relative to new row subscripts + * + * xplore[0:m-1]: xplore[i] gives the location of the next (dfs) + * unexplored neighbor of i in lsub[*] + * + * segrep[0:nseg-1]: contains the list of supernodal representatives + * in topological order of the dfs. A supernode representative is the + * last column of a supernode. + * The maximum size of segrep[] is n. + * + * repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a + * supernodal representative r, repfnz[r] is the location of the first + * nonzero in this segment. It is also used during the dfs: repfnz[r]>0 + * indicates the supernode r has been explored. + * NOTE: There are W of them, each used for one column of a panel. + * + * panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below + * the panel diagonal. These are filled in during dpanel_dfs(), and are + * used later in the inner LU factorization within the panel. + * panel_lsub[]/dense[] pair forms the SPA data structure. + * NOTE: There are W of them. + * + * dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values; + * NOTE: there are W of them. + * + * tempv[0:*]: real temporary used for dense numeric kernels; + * The size of this array is defined by NUM_TEMPV() in slu_util.h. + * It is also used by the dropping routine ilu_ddrop_row(). + *+ */ + +void +zgsitrf(superlu_options_t *options, SuperMatrix *A, int relax, int panel_size, + int *etree, void *work, int lwork, int *perm_c, int *perm_r, + SuperMatrix *L, SuperMatrix *U, SuperLUStat_t *stat, int *info) +{ + /* Local working arrays */ + NCPformat *Astore; + int *iperm_r = NULL; /* inverse of perm_r; used when + options->Fact == SamePattern_SameRowPerm */ + int *iperm_c; /* inverse of perm_c */ + int *swap, *iswap; /* swap is used to store the row permutation + during the factorization. Initially, it is set + to iperm_c (row indeces of Pc*A*Pc'). + iswap is the inverse of swap. After the + factorization, it is equal to perm_r. */ + int *iwork; + doublecomplex *zwork; + int *segrep, *repfnz, *parent, *xplore; + int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ + int *marker, *marker_relax; + doublecomplex *dense, *tempv; + double *dtempv; + int *relax_end, *relax_fsupc; + doublecomplex *a; + int *asub; + int *xa_begin, *xa_end; + int *xsup, *supno; + int *xlsub, *xlusup, *xusub; + int nzlumax; + double *amax; + doublecomplex drop_sum; + double alpha, omega; /* used in MILU, mimicing DRIC */ + static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ + double *dwork2; /* used by the second dropping rule */ + + /* Local scalars */ + fact_t fact = options->Fact; + double diag_pivot_thresh = options->DiagPivotThresh; + double drop_tol = options->ILU_DropTol; /* tau */ + double fill_ini = options->ILU_FillTol; /* tau^hat */ + double gamma = options->ILU_FillFactor; + int drop_rule = options->ILU_DropRule; + milu_t milu = options->ILU_MILU; + double fill_tol; + int pivrow; /* pivotal row number in the original matrix A */ + int nseg1; /* no of segments in U-column above panel row jcol */ + int nseg; /* no of segments in each U-column */ + register int jcol; + register int kcol; /* end column of a relaxed snode */ + register int icol; + register int i, k, jj, new_next, iinfo; + int m, n, min_mn, jsupno, fsupc, nextlu, nextu; + int w_def; /* upper bound on panel width */ + int usepr, iperm_r_allocated = 0; + int nnzL, nnzU; + int *panel_histo = stat->panel_histo; + flops_t *ops = stat->ops; + + int last_drop;/* the last column which the dropping rules applied */ + int quota; + int nnzAj; /* number of nonzeros in A(:,1:j) */ + int nnzLj, nnzUj; + double tol_L = drop_tol, tol_U = drop_tol; + doublecomplex zero = {0.0, 0.0}; + double one = 1.0; + + /* Executable */ + iinfo = 0; + m = A->nrow; + n = A->ncol; + min_mn = SUPERLU_MIN(m, n); + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + + /* Allocate storage common to the factor routines */ + *info = zLUMemInit(fact, work, lwork, m, n, Astore->nnz, panel_size, + gamma, L, U, &Glu, &iwork, &zwork); + if ( *info ) return; + + xsup = Glu.xsup; + supno = Glu.supno; + xlsub = Glu.xlsub; + xlusup = Glu.xlusup; + xusub = Glu.xusub; + + SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, + &repfnz, &panel_lsub, &marker_relax, &marker); + zSetRWork(m, panel_size, zwork, &dense, &tempv); + + usepr = (fact == SamePattern_SameRowPerm); + if ( usepr ) { + /* Compute the inverse of perm_r */ + iperm_r = (int *) intMalloc(m); + for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; + iperm_r_allocated = 1; + } + + iperm_c = (int *) intMalloc(n); + for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; + swap = (int *)intMalloc(n); + for (k = 0; k < n; k++) swap[k] = iperm_c[k]; + iswap = (int *)intMalloc(n); + for (k = 0; k < n; k++) iswap[k] = perm_c[k]; + amax = (double *) doubleMalloc(panel_size); + if (drop_rule & DROP_SECONDARY) + dwork2 = (double *)doubleMalloc(n); + else + dwork2 = NULL; + + nnzAj = 0; + nnzLj = 0; + nnzUj = 0; + last_drop = SUPERLU_MAX(min_mn - 2 * sp_ienv(7), (int)(min_mn * 0.95)); + alpha = pow((double)n, -1.0 / options->ILU_MILU_Dim); + + /* Identify relaxed snodes */ + relax_end = (int *) intMalloc(n); + relax_fsupc = (int *) intMalloc(n); + if ( options->SymmetricMode == YES_SuperLU ) + ilu_heap_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); + else + ilu_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); + + ifill (perm_r, m, EMPTY); + ifill (marker, m * NO_MARKER, EMPTY); + supno[0] = -1; + xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; + w_def = panel_size; + + /* Mark the rows used by relaxed supernodes */ + ifill (marker_relax, m, EMPTY); + i = mark_relax(m, relax_end, relax_fsupc, xa_begin, xa_end, + asub, marker_relax); +#if ( PRNTlevel >= 1) + printf("%d relaxed supernodes.\n", i); +#endif + + /* + * Work on one "panel" at a time. A panel is one of the following: + * (a) a relaxed supernode at the bottom of the etree, or + * (b) panel_size contiguous columns, defined by the user + */ + for (jcol = 0; jcol < min_mn; ) { + + if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ + kcol = relax_end[jcol]; /* end of the relaxed snode */ + panel_histo[kcol-jcol+1]++; + + /* Drop small rows in the previous supernode. */ + if (jcol > 0 && jcol < last_drop) { + int first = xsup[supno[jcol - 1]]; + int last = jcol - 1; + int quota; + + /* Compute the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * (m - first) / m + * (last - first + 1); + else if (drop_rule & DROP_COLUMN) { + int i; + quota = 0; + for (i = first; i <= last; i++) + quota += xa_end[i] - xa_begin[i]; + quota = gamma * quota * (m - first) / m; + } else if (drop_rule & DROP_AREA) + quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + - nnzLj; + else + quota = m * n; + fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / min_mn); + + /* Drop small rows */ + dtempv = (double *) tempv; + i = ilu_zdrop_row(options, first, last, tol_L, quota, &nnzLj, + &fill_tol, &Glu, dtempv, dwork2, 0); + /* Reset the parameters */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + < nnzLj) + tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); + else + tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); + } + if (fill_tol < 0) iinfo -= (int)fill_tol; +#ifdef DEBUG + num_drop_L += i * (last - first + 1); +#endif + } + + /* -------------------------------------- + * Factorize the relaxed supernode(jcol:kcol) + * -------------------------------------- */ + /* Determine the union of the row structure of the snode */ + if ( (*info = ilu_zsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, + marker, &Glu)) != 0 ) + return; + + nextu = xusub[jcol]; + nextlu = xlusup[jcol]; + jsupno = supno[jcol]; + fsupc = xsup[jsupno]; + new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); + nzlumax = Glu.nzlumax; + while ( new_next > nzlumax ) { + if ((*info = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu))) + return; + } + + for (icol = jcol; icol <= kcol; icol++) { + xusub[icol+1] = nextu; + + amax[0] = 0.0; + /* Scatter into SPA dense[*] */ + for (k = xa_begin[icol]; k < xa_end[icol]; k++) { + register double tmp = z_abs1 (&a[k]); + if (tmp > amax[0]) amax[0] = tmp; + dense[asub[k]] = a[k]; + } + nnzAj += xa_end[icol] - xa_begin[icol]; + if (amax[0] == 0.0) { + amax[0] = fill_ini; +#if ( PRNTlevel >= 1) + printf("Column %d is entirely zero!\n", icol); + fflush(stdout); +#endif + } + + /* Numeric update within the snode */ + zsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); + + if (usepr) pivrow = iperm_r[icol]; + fill_tol = pow(fill_ini, 1.0 - (double)icol / (double)min_mn); + if ( (*info = ilu_zpivotL(icol, diag_pivot_thresh, &usepr, + perm_r, iperm_c[icol], swap, iswap, + marker_relax, &pivrow, + amax[0] * fill_tol, milu, zero, + &Glu, stat)) ) { + iinfo++; + marker[pivrow] = kcol; + } + + } + + jcol = kcol + 1; + + } else { /* Work on one panel of panel_size columns */ + + /* Adjust panel_size so that a panel won't overlap with the next + * relaxed snode. + */ + panel_size = w_def; + for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) + if ( relax_end[k] != EMPTY ) { + panel_size = k - jcol; + break; + } + if ( k == min_mn ) panel_size = min_mn - jcol; + panel_histo[panel_size]++; + + /* symbolic factor on a panel of columns */ + ilu_zpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, + dense, amax, panel_lsub, segrep, repfnz, + marker, parent, xplore, &Glu); + + /* numeric sup-panel updates in topological order */ + zpanel_bmod(m, panel_size, jcol, nseg1, dense, + tempv, segrep, repfnz, &Glu, stat); + + /* Sparse LU within the panel, and below panel diagonal */ + for (jj = jcol; jj < jcol + panel_size; jj++) { + + k = (jj - jcol) * m; /* column index for w-wide arrays */ + + nseg = nseg1; /* Begin after all the panel segments */ + + nnzAj += xa_end[jj] - xa_begin[jj]; + + if ((*info = ilu_zcolumn_dfs(m, jj, perm_r, &nseg, + &panel_lsub[k], segrep, &repfnz[k], + marker, parent, xplore, &Glu))) + return; + + /* Numeric updates */ + if ((*info = zcolumn_bmod(jj, (nseg - nseg1), &dense[k], + tempv, &segrep[nseg1], &repfnz[k], + jcol, &Glu, stat)) != 0) return; + + /* Make a fill-in position if the column is entirely zero */ + if (xlsub[jj + 1] == xlsub[jj]) { + register int i, row; + int nextl; + int nzlmax = Glu.nzlmax; + int *lsub = Glu.lsub; + int *marker2 = marker + 2 * m; + + /* Allocate memory */ + nextl = xlsub[jj] + 1; + if (nextl >= nzlmax) { + int error = zLUMemXpand(jj, nextl, LSUB, &nzlmax, &Glu); + if (error) { *info = error; return; } + lsub = Glu.lsub; + } + xlsub[jj + 1]++; + assert(xlusup[jj]==xlusup[jj+1]); + xlusup[jj + 1]++; + Glu.lusup[xlusup[jj]] = zero; + + /* Choose a row index (pivrow) for fill-in */ + for (i = jj; i < n; i++) + if (marker_relax[swap[i]] <= jj) break; + row = swap[i]; + marker2[row] = jj; + lsub[xlsub[jj]] = row; +#ifdef DEBUG + printf("Fill col %d.\n", jj); + fflush(stdout); +#endif + } + + /* Computer the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * jj / m; + else if (drop_rule & DROP_COLUMN) + quota = gamma * (xa_end[jj] - xa_begin[jj]) * + (jj + 1) / m; + else if (drop_rule & DROP_AREA) + quota = gamma * 0.9 * nnzAj * 0.5 - nnzUj; + else + quota = m; + + /* Copy the U-segments to ucol[*] and drop small entries */ + if ((*info = ilu_zcopy_to_ucol(jj, nseg, segrep, &repfnz[k], + perm_r, &dense[k], drop_rule, + milu, amax[jj - jcol] * tol_U, + quota, &drop_sum, &nnzUj, &Glu, + dwork2)) != 0) + return; + + /* Reset the dropping threshold if required */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * 0.9 * nnzAj * 0.5 < nnzLj) + tol_U = SUPERLU_MIN(1.0, tol_U * 2.0); + else + tol_U = SUPERLU_MAX(drop_tol, tol_U * 0.5); + } + + if (drop_sum.r != 0.0 && drop_sum.i != 0.0) + { + omega = SUPERLU_MIN(2.0*(1.0-alpha)/z_abs1(&drop_sum), 1.0); + zd_mult(&drop_sum, &drop_sum, omega); + } + if (usepr) pivrow = iperm_r[jj]; + fill_tol = pow(fill_ini, 1.0 - (double)jj / (double)min_mn); + if ( (*info = ilu_zpivotL(jj, diag_pivot_thresh, &usepr, perm_r, + iperm_c[jj], swap, iswap, + marker_relax, &pivrow, + amax[jj - jcol] * fill_tol, milu, + drop_sum, &Glu, stat)) ) { + iinfo++; + marker[m + pivrow] = jj; + marker[2 * m + pivrow] = jj; + } + + /* Reset repfnz[] for this column */ + resetrep_col (nseg, segrep, &repfnz[k]); + + /* Start a new supernode, drop the previous one */ + if (jj > 0 && supno[jj] > supno[jj - 1] && jj < last_drop) { + int first = xsup[supno[jj - 1]]; + int last = jj - 1; + int quota; + + /* Compute the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * (m - first) / m + * (last - first + 1); + else if (drop_rule & DROP_COLUMN) { + int i; + quota = 0; + for (i = first; i <= last; i++) + quota += xa_end[i] - xa_begin[i]; + quota = gamma * quota * (m - first) / m; + } else if (drop_rule & DROP_AREA) + quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) + / m) - nnzLj; + else + quota = m * n; + fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / + (double)min_mn); + + /* Drop small rows */ + dtempv = (double *) tempv; + i = ilu_zdrop_row(options, first, last, tol_L, quota, + &nnzLj, &fill_tol, &Glu, dtempv, dwork2, + 1); + + /* Reset the parameters */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + < nnzLj) + tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); + else + tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); + } + if (fill_tol < 0) iinfo -= (int)fill_tol; +#ifdef DEBUG + num_drop_L += i * (last - first + 1); +#endif + } /* if start a new supernode */ + + } /* for */ + + jcol += panel_size; /* Move to the next panel */ + + } /* else */ + + } /* for */ + + *info = iinfo; + + if ( m > n ) { + k = 0; + for (i = 0; i < m; ++i) + if ( perm_r[i] == EMPTY ) { + perm_r[i] = n + k; + ++k; + } + } + + ilu_countnz(min_mn, &nnzL, &nnzU, &Glu); + fixupL(min_mn, perm_r, &Glu); + + zLUWorkFree(iwork, zwork, &Glu); /* Free work space and compress storage */ + + if ( fact == SamePattern_SameRowPerm ) { + /* L and U structures may have changed due to possibly different + pivoting, even though the storage is available. + There could also be memory expansions, so the array locations + may have changed, */ + ((SCformat *)L->Store)->nnz = nnzL; + ((SCformat *)L->Store)->nsuper = Glu.supno[n]; + ((SCformat *)L->Store)->nzval = Glu.lusup; + ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; + ((SCformat *)L->Store)->rowind = Glu.lsub; + ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; + ((NCformat *)U->Store)->nnz = nnzU; + ((NCformat *)U->Store)->nzval = Glu.ucol; + ((NCformat *)U->Store)->rowind = Glu.usub; + ((NCformat *)U->Store)->colptr = Glu.xusub; + } else { + zCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, + Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, + Glu.xsup, SLU_SC, SLU_Z, SLU_TRLU); + zCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, + Glu.usub, Glu.xusub, SLU_NC, SLU_Z, SLU_TRU); + } + + ops[FACT] += ops[TRSV] + ops[GEMV]; + stat->expansions = --(Glu.num_expansions); + + if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); + SUPERLU_FREE (iperm_c); + SUPERLU_FREE (relax_end); + SUPERLU_FREE (swap); + SUPERLU_FREE (iswap); + SUPERLU_FREE (relax_fsupc); + SUPERLU_FREE (amax); + if ( dwork2 ) SUPERLU_FREE (dwork2); + +} diff --git a/src/maths/SuperLU/zgsrfs.c b/src/maths/SuperLU/zgsrfs.c new file mode 100644 index 000000000..99b3af680 --- /dev/null +++ b/src/maths/SuperLU/zgsrfs.c @@ -0,0 +1,460 @@ + +/*! @file zgsrfs.c + * \brief Improves computed solution to a system of inear equations + * + *+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Modified from lapack routine ZGERFS + *+ */ +/* + * File name: zgsrfs.c + * History: Modified from lapack routine ZGERFS + */ +#include+#include + +/*! \brief + * + * + * Purpose + * ======= + * + * ZGSRFS improves the computed solution to a system of linear + * equations and provides error bounds and backward error estimates for + * the solution. + * + * If equilibration was performed, the system becomes: + * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * trans (input) trans_t + * Specifies the form of the system of equations: + * = NOTRANS: A * X = B (No transpose) + * = TRANS: A'* X = B (Transpose) + * = CONJ: A**H * X = B (Conjugate transpose) + * + * A (input) SuperMatrix* + * The original matrix A in the system, or the scaled A if + * equilibration was done. The type of A can be: + * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_GE. + * + * L (input) SuperMatrix* + * The factor L from the factorization Pr*A*Pc=L*U. Use + * compressed row subscripts storage for supernodes, + * i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. + * + * U (input) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U as computed by + * zgstrf(). Use column-wise storage scheme, + * i.e., U has types: Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. + * + * perm_c (input) int*, dimension (A->ncol) + * Column permutation vector, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * + * perm_r (input) int*, dimension (A->nrow) + * Row permutation vector, which defines the permutation matrix Pr; + * perm_r[i] = j means row i of A is in position j in Pr*A. + * + * equed (input) Specifies the form of equilibration that was done. + * = 'N': No equilibration. + * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). + * = 'C': Column equilibration, i.e., A was postmultiplied by + * diag(C). + * = 'B': Both row and column equilibration, i.e., A was replaced + * by diag(R)*A*diag(C). + * + * R (input) double*, dimension (A->nrow) + * The row scale factors for A. + * If equed = 'R' or 'B', A is premultiplied by diag(R). + * If equed = 'N' or 'C', R is not accessed. + * + * C (input) double*, dimension (A->ncol) + * The column scale factors for A. + * If equed = 'C' or 'B', A is postmultiplied by diag(C). + * If equed = 'N' or 'R', C is not accessed. + * + * B (input) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. + * The right hand side matrix B. + * if equed = 'R' or 'B', B is premultiplied by diag(R). + * + * X (input/output) SuperMatrix* + * X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. + * On entry, the solution matrix X, as computed by zgstrs(). + * On exit, the improved solution matrix X. + * if *equed = 'C' or 'B', X should be premultiplied by diag(C) + * in order to obtain the solution to the original system. + * + * FERR (output) double*, dimension (B->ncol) + * The estimated forward error bound for each solution vector + * X(j) (the j-th column of the solution matrix X). + * If XTRUE is the true solution corresponding to X(j), FERR(j) + * is an estimated upper bound for the magnitude of the largest + * element in (X(j) - XTRUE) divided by the magnitude of the + * largest element in X(j). The estimate is as reliable as + * the estimate for RCOND, and is almost always a slight + * overestimate of the true error. + * + * BERR (output) double*, dimension (B->ncol) + * The componentwise relative backward error of each solution + * vector X(j) (i.e., the smallest relative change in + * any element of A or B that makes X(j) an exact solution). + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if INFO = -i, the i-th argument had an illegal value + * + * Internal Parameters + * =================== + * + * ITMAX is the maximum number of steps of iterative refinement. + * + *+ */ +void +zgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, + int *perm_c, int *perm_r, char *equed, double *R, double *C, + SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr, + SuperLUStat_t *stat, int *info) +{ + + +#define ITMAX 5 + + /* Table of constant values */ + int ione = 1; + doublecomplex ndone = {-1., 0.}; + doublecomplex done = {1., 0.}; + + /* Local variables */ + NCformat *Astore; + doublecomplex *Aval; + SuperMatrix Bjcol; + DNformat *Bstore, *Xstore, *Bjcol_store; + doublecomplex *Bmat, *Xmat, *Bptr, *Xptr; + int kase; + double safe1, safe2; + int i, j, k, irow, nz, count, notran, rowequ, colequ; + int ldb, ldx, nrhs; + double s, xk, lstres, eps, safmin; + char transc[1]; + trans_t transt; + doublecomplex *work; + double *rwork; + int *iwork; + + extern int zlacon_(int *, doublecomplex *, doublecomplex *, double *, int *); +#ifdef _CRAY + extern int CCOPY(int *, doublecomplex *, int *, doublecomplex *, int *); + extern int CSAXPY(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *); +#else + extern int zcopy_(int *, doublecomplex *, int *, doublecomplex *, int *); + extern int zaxpy_(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *); +#endif + + Astore = A->Store; + Aval = Astore->nzval; + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + + /* Test the input parameters */ + *info = 0; + notran = (trans == NOTRANS); + if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) + *info = -2; + else if ( L->nrow != L->ncol || L->nrow < 0 || + L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU ) + *info = -3; + else if ( U->nrow != U->ncol || U->nrow < 0 || + U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU ) + *info = -4; + else if ( ldb < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) + *info = -10; + else if ( ldx < SUPERLU_MAX(0, A->nrow) || + X->Stype != SLU_DN || X->Dtype != SLU_Z || X->Mtype != SLU_GE ) + *info = -11; + if (*info != 0) { + i = -(*info); + xerbla_("zgsrfs", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || nrhs == 0) { + for (j = 0; j < nrhs; ++j) { + ferr[j] = 0.; + berr[j] = 0.; + } + return; + } + + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + + /* Allocate working space */ + work = doublecomplexMalloc(2*A->nrow); + rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) ); + iwork = intMalloc(A->nrow); + if ( !work || !rwork || !iwork ) + ABORT_SuperLU("Malloc fails for work/rwork/iwork."); + + if ( notran ) { + *(unsigned char *)transc = 'N'; + transt = TRANS; + } else { + *(unsigned char *)transc = 'T'; + transt = NOTRANS; + } + + /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ + nz = A->ncol + 1; + eps = dlamch_("Epsilon"); + safmin = dlamch_("Safe minimum"); + /* Set SAFE1 essentially to be the underflow threshold times the + number of additions in each row. */ + safe1 = nz * safmin; + safe2 = safe1 / eps; + + /* Compute the number of nonzeros in each row (or column) of A */ + for (i = 0; i < A->nrow; ++i) iwork[i] = 0; + if ( notran ) { + for (k = 0; k < A->ncol; ++k) + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + ++iwork[Astore->rowind[i]]; + } else { + for (k = 0; k < A->ncol; ++k) + iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; + } + + /* Copy one column of RHS B into Bjcol. */ + Bjcol.Stype = B->Stype; + Bjcol.Dtype = B->Dtype; + Bjcol.Mtype = B->Mtype; + Bjcol.nrow = B->nrow; + Bjcol.ncol = 1; + Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); + if ( !Bjcol.Store ) ABORT_SuperLU("SUPERLU_MALLOC fails for Bjcol.Store"); + Bjcol_store = Bjcol.Store; + Bjcol_store->lda = ldb; + Bjcol_store->nzval = work; /* address aliasing */ + + /* Do for each right hand side ... */ + for (j = 0; j < nrhs; ++j) { + count = 0; + lstres = 3.; + Bptr = &Bmat[j*ldb]; + Xptr = &Xmat[j*ldx]; + + while (1) { /* Loop until stopping criterion is satisfied. */ + + /* Compute residual R = B - op(A) * X, + where op(A) = A, A**T, or A**H, depending on TRANS. */ + +#ifdef _CRAY + CCOPY(&A->nrow, Bptr, &ione, work, &ione); +#else + zcopy_(&A->nrow, Bptr, &ione, work, &ione); +#endif + sp_zgemv(transc, ndone, A, Xptr, ione, done, work, ione); + + /* Compute componentwise relative backward error from formula + max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) + where abs(Z) is the componentwise absolute value of the matrix + or vector Z. If the i-th component of the denominator is less + than SAFE2, then SAFE1 is added to the i-th component of the + numerator before dividing. */ + + for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] ); + + /* Compute abs(op(A))*abs(X) + abs(B). */ + if (notran) { + for (k = 0; k < A->ncol; ++k) { + xk = z_abs1( &Xptr[k] ); + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk; + } + } else { + for (k = 0; k < A->ncol; ++k) { + s = 0.; + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { + irow = Astore->rowind[i]; + s += z_abs1(&Aval[i]) * z_abs1(&Xptr[irow]); + } + rwork[k] += s; + } + } + s = 0.; + for (i = 0; i < A->nrow; ++i) { + if (rwork[i] > safe2) { + s = SUPERLU_MAX( s, z_abs1(&work[i]) / rwork[i] ); + } else if ( rwork[i] != 0.0 ) { + s = SUPERLU_MAX( s, (z_abs1(&work[i]) + safe1) / rwork[i] ); + } + /* If rwork[i] is exactly 0.0, then we know the true + residual also must be exactly 0.0. */ + } + berr[j] = s; + + /* Test stopping criterion. Continue iterating if + 1) The residual BERR(J) is larger than machine epsilon, and + 2) BERR(J) decreased by at least a factor of 2 during the + last iteration, and + 3) At most ITMAX iterations tried. */ + + if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { + /* Update solution and try again. */ + zgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); + +#ifdef _CRAY + CAXPY(&A->nrow, &done, work, &ione, + &Xmat[j*ldx], &ione); +#else + zaxpy_(&A->nrow, &done, work, &ione, + &Xmat[j*ldx], &ione); +#endif + lstres = berr[j]; + ++count; + } else { + break; + } + + } /* end while */ + + stat->RefineSteps = count; + + /* Bound error from formula: + norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* + ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) + where + norm(Z) is the magnitude of the largest component of Z + inv(op(A)) is the inverse of op(A) + abs(Z) is the componentwise absolute value of the matrix or + vector Z + NZ is the maximum number of nonzeros in any row of A, plus 1 + EPS is machine epsilon + + The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) + is incremented by SAFE1 if the i-th component of + abs(op(A))*abs(X) + abs(B) is less than SAFE2. + + Use ZLACON to estimate the infinity-norm of the matrix + inv(op(A)) * diag(W), + where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ + + for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] ); + + /* Compute abs(op(A))*abs(X) + abs(B). */ + if ( notran ) { + for (k = 0; k < A->ncol; ++k) { + xk = z_abs1( &Xptr[k] ); + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk; + } + } else { + for (k = 0; k < A->ncol; ++k) { + s = 0.; + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { + irow = Astore->rowind[i]; + xk = z_abs1( &Xptr[irow] ); + s += z_abs1(&Aval[i]) * xk; + } + rwork[k] += s; + } + } + + for (i = 0; i < A->nrow; ++i) + if (rwork[i] > safe2) + rwork[i] = z_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i]; + else + rwork[i] = z_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; + kase = 0; + + do { + zlacon_(&A->nrow, &work[A->nrow], work, + &ferr[j], &kase); + if (kase == 0) break; + + if (kase == 1) { + /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ + if ( notran && colequ ) + for (i = 0; i < A->ncol; ++i) { + zd_mult(&work[i], &work[i], C[i]); + } + else if ( !notran && rowequ ) + for (i = 0; i < A->nrow; ++i) { + zd_mult(&work[i], &work[i], R[i]); + } + + zgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); + + for (i = 0; i < A->nrow; ++i) { + zd_mult(&work[i], &work[i], rwork[i]); + } + } else { + /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ + for (i = 0; i < A->nrow; ++i) { + zd_mult(&work[i], &work[i], rwork[i]); + } + + zgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); + + if ( notran && colequ ) + for (i = 0; i < A->ncol; ++i) { + zd_mult(&work[i], &work[i], C[i]); + } + else if ( !notran && rowequ ) + for (i = 0; i < A->ncol; ++i) { + zd_mult(&work[i], &work[i], R[i]); + } + } + + } while ( kase != 0 ); + + /* Normalize error. */ + lstres = 0.; + if ( notran && colequ ) { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, C[i] * z_abs1( &Xptr[i]) ); + } else if ( !notran && rowequ ) { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, R[i] * z_abs1( &Xptr[i]) ); + } else { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, z_abs1( &Xptr[i]) ); + } + if ( lstres != 0. ) + ferr[j] /= lstres; + + } /* for each RHS j ... */ + + SUPERLU_FREE(work); + SUPERLU_FREE(rwork); + SUPERLU_FREE(iwork); + SUPERLU_FREE(Bjcol.Store); + + return; + +} /* zgsrfs */ diff --git a/src/maths/SuperLU/zgssv.c b/src/maths/SuperLU/zgssv.c new file mode 100644 index 000000000..b99b3d3d9 --- /dev/null +++ b/src/maths/SuperLU/zgssv.c @@ -0,0 +1,227 @@ + +/*! @file zgssv.c + * \brief Solves the system of linear equations A*X=B + * + *+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + *+ */ +#include+ +/*! \brief + * + * + * Purpose + * ======= + * + * ZGSSV solves the system of linear equations A*X=B, using the + * LU factorization from ZGSTRF. It performs the following steps: + * + * 1. If A is stored column-wise (A->Stype = SLU_NC): + * + * 1.1. Permute the columns of A, forming A*Pc, where Pc + * is a permutation matrix. For more details of this step, + * see sp_preorder.c. + * + * 1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined + * by Gaussian elimination with partial pivoting. + * L is unit lower triangular with offdiagonal entries + * bounded by 1 in magnitude, and U is upper triangular. + * + * 1.3. Solve the system of equations A*X=B using the factored + * form of A. + * + * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the + * above algorithm to the transpose of A: + * + * 2.1. Permute columns of transpose(A) (rows of A), + * forming transpose(A)*Pc, where Pc is a permutation matrix. + * For more details of this step, see sp_preorder.c. + * + * 2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr + * determined by Gaussian elimination with partial pivoting. + * L is unit lower triangular with offdiagonal entries + * bounded by 1 in magnitude, and U is upper triangular. + * + * 2.3. Solve the system of equations A*X=B using the factored + * form of A. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the LU decomposition will be performed and how the + * system will be solved. + * + * A (input) SuperMatrix* + * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number + * of linear equations is A->nrow. Currently, the type of A can be: + * Stype = SLU_NC or SLU_NR; Dtype = SLU_Z; Mtype = SLU_GE. + * In the future, more general A may be handled. + * + * perm_c (input/output) int* + * If A->Stype = SLU_NC, column permutation vector of size A->ncol + * which defines the permutation matrix Pc; perm_c[i] = j means + * column i of A is in position j in A*Pc. + * If A->Stype = SLU_NR, column permutation vector of size A->nrow + * which describes permutation of columns of transpose(A) + * (rows of A) as described above. + * + * If options->ColPerm = MY_PERMC or options->Fact = SamePattern or + * options->Fact = SamePattern_SameRowPerm, it is an input argument. + * On exit, perm_c may be overwritten by the product of the input + * perm_c and a permutation that postorders the elimination tree + * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree + * is already in postorder. + * Otherwise, it is an output argument. + * + * perm_r (input/output) int* + * If A->Stype = SLU_NC, row permutation vector of size A->nrow, + * which defines the permutation matrix Pr, and is determined + * by partial pivoting. perm_r[i] = j means row i of A is in + * position j in Pr*A. + * If A->Stype = SLU_NR, permutation vector of size A->ncol, which + * determines permutation of rows of transpose(A) + * (columns of A) as described above. + * + * If options->RowPerm = MY_PERMR or + * options->Fact = SamePattern_SameRowPerm, perm_r is an + * input argument. + * otherwise it is an output argument. + * + * L (output) SuperMatrix* + * The factor L from the factorization + * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses compressed row subscripts storage for supernodes, i.e., + * L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization + * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. + * + * B (input/output) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. + * On entry, the right hand side matrix. + * On exit, the solution matrix if info = 0; + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * > 0: if info = i, and i is + * <= A->ncol: U(i,i) is exactly zero. The factorization has + * been completed, but the factor U is exactly singular, + * so the solution could not be computed. + * > A->ncol: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. + *+ */ + +void +zgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, + SuperLUStat_t *stat, int *info ) +{ + + DNformat *Bstore; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int lwork = 0, *etree, i; + + /* Set default values for some parameters */ + int panel_size; /* panel size */ + int relax; /* no of columns in a relaxed snodes */ + int permc_spec; + trans_t trans = NOTRANS; + double *utime; + double t; /* Temporary time */ + + /* Test the input parameters ... */ + *info = 0; + Bstore = B->Store; + if ( options->Fact != DOFACT ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_Z || A->Mtype != SLU_GE ) + *info = -2; + else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) + *info = -7; + if ( *info != 0 ) { + i = -(*info); + xerbla_("zgssv", &i); + return; + } + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + trans = TRANS; + } else { + if ( A->Stype == SLU_NC ) AA = A; + } + + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t; + + etree = intMalloc(A->ncol); + + t = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t; + + panel_size = sp_ienv(1); + relax = sp_ienv(2); + + /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", + relax, panel_size, sp_ienv(3), sp_ienv(4));*/ + t = SuperLU_timer_(); + /* Compute the LU factorization of A. */ + zgstrf(options, &AC, relax, panel_size, etree, + NULL, lwork, perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t; + + t = SuperLU_timer_(); + if ( *info == 0 ) { + /* Solve the system A*X=B, overwriting B with X. */ + zgstrs (trans, L, U, perm_c, perm_r, B, stat, info); + } + utime[SOLVE] = SuperLU_timer_() - t; + + SUPERLU_FREE (etree); + Destroy_CompCol_Permuted(&AC); + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/zgssvx.c b/src/maths/SuperLU/zgssvx.c new file mode 100644 index 000000000..ac2924fa3 --- /dev/null +++ b/src/maths/SuperLU/zgssvx.c @@ -0,0 +1,622 @@ + +/*! @file zgssvx.c + * \brief Solves the system of linear equations A*X=B or A'*X=B + * + *+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + *+ */ +#include+ +/*! \brief + * + * + * Purpose + * ======= + * + * ZGSSVX solves the system of linear equations A*X=B or A'*X=B, using + * the LU factorization from zgstrf(). Error bounds on the solution and + * a condition estimate are also provided. It performs the following steps: + * + * 1. If A is stored column-wise (A->Stype = SLU_NC): + * + * 1.1. If options->Equil = YES, scaling factors are computed to + * equilibrate the system: + * options->Trans = NOTRANS: + * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B + * options->Trans = TRANS: + * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B + * options->Trans = CONJ: + * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B + * Whether or not the system will be equilibrated depends on the + * scaling of the matrix A, but if equilibration is used, A is + * overwritten by diag(R)*A*diag(C) and B by diag(R)*B + * (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans + * = TRANS or CONJ). + * + * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation + * matrix that usually preserves sparsity. + * For more details of this step, see sp_preorder.c. + * + * 1.3. If options->Fact != FACTORED, the LU decomposition is used to + * factor the matrix A (after equilibration if options->Equil = YES) + * as Pr*A*Pc = L*U, with Pr determined by partial pivoting. + * + * 1.4. Compute the reciprocal pivot growth factor. + * + * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the + * routine returns with info = i. Otherwise, the factored form of + * A is used to estimate the condition number of the matrix A. If + * the reciprocal of the condition number is less than machine + * precision, info = A->ncol+1 is returned as a warning, but the + * routine still goes on to solve for X and computes error bounds + * as described below. + * + * 1.6. The system of equations is solved for X using the factored form + * of A. + * + * 1.7. If options->IterRefine != NOREFINE, iterative refinement is + * applied to improve the computed solution matrix and calculate + * error bounds and backward error estimates for it. + * + * 1.8. If equilibration was used, the matrix X is premultiplied by + * diag(C) (if options->Trans = NOTRANS) or diag(R) + * (if options->Trans = TRANS or CONJ) so that it solves the + * original system before equilibration. + * + * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm + * to the transpose of A: + * + * 2.1. If options->Equil = YES, scaling factors are computed to + * equilibrate the system: + * options->Trans = NOTRANS: + * diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B + * options->Trans = TRANS: + * (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B + * options->Trans = CONJ: + * (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B + * Whether or not the system will be equilibrated depends on the + * scaling of the matrix A, but if equilibration is used, A' is + * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B + * (if trans='N') or diag(C)*B (if trans = 'T' or 'C'). + * + * 2.2. Permute columns of transpose(A) (rows of A), + * forming transpose(A)*Pc, where Pc is a permutation matrix that + * usually preserves sparsity. + * For more details of this step, see sp_preorder.c. + * + * 2.3. If options->Fact != FACTORED, the LU decomposition is used to + * factor the transpose(A) (after equilibration if + * options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the + * permutation Pr determined by partial pivoting. + * + * 2.4. Compute the reciprocal pivot growth factor. + * + * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the + * routine returns with info = i. Otherwise, the factored form + * of transpose(A) is used to estimate the condition number of the + * matrix A. If the reciprocal of the condition number + * is less than machine precision, info = A->nrow+1 is returned as + * a warning, but the routine still goes on to solve for X and + * computes error bounds as described below. + * + * 2.6. The system of equations is solved for X using the factored form + * of transpose(A). + * + * 2.7. If options->IterRefine != NOREFINE, iterative refinement is + * applied to improve the computed solution matrix and calculate + * error bounds and backward error estimates for it. + * + * 2.8. If equilibration was used, the matrix X is premultiplied by + * diag(C) (if options->Trans = NOTRANS) or diag(R) + * (if options->Trans = TRANS or CONJ) so that it solves the + * original system before equilibration. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the LU decomposition will be performed and how the + * system will be solved. + * + * A (input/output) SuperMatrix* + * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number + * of the linear equations is A->nrow. Currently, the type of A can be: + * Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE. + * In the future, more general A may be handled. + * + * On entry, If options->Fact = FACTORED and equed is not 'N', + * then A must have been equilibrated by the scaling factors in + * R and/or C. + * On exit, A is not modified if options->Equil = NO, or if + * options->Equil = YES but equed = 'N' on exit. + * Otherwise, if options->Equil = YES and equed is not 'N', + * A is scaled as follows: + * If A->Stype = SLU_NC: + * equed = 'R': A := diag(R) * A + * equed = 'C': A := A * diag(C) + * equed = 'B': A := diag(R) * A * diag(C). + * If A->Stype = SLU_NR: + * equed = 'R': transpose(A) := diag(R) * transpose(A) + * equed = 'C': transpose(A) := transpose(A) * diag(C) + * equed = 'B': transpose(A) := diag(R) * transpose(A) * diag(C). + * + * perm_c (input/output) int* + * If A->Stype = SLU_NC, Column permutation vector of size A->ncol, + * which defines the permutation matrix Pc; perm_c[i] = j means + * column i of A is in position j in A*Pc. + * On exit, perm_c may be overwritten by the product of the input + * perm_c and a permutation that postorders the elimination tree + * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree + * is already in postorder. + * + * If A->Stype = SLU_NR, column permutation vector of size A->nrow, + * which describes permutation of columns of transpose(A) + * (rows of A) as described above. + * + * perm_r (input/output) int* + * If A->Stype = SLU_NC, row permutation vector of size A->nrow, + * which defines the permutation matrix Pr, and is determined + * by partial pivoting. perm_r[i] = j means row i of A is in + * position j in Pr*A. + * + * If A->Stype = SLU_NR, permutation vector of size A->ncol, which + * determines permutation of rows of transpose(A) + * (columns of A) as described above. + * + * If options->Fact = SamePattern_SameRowPerm, the pivoting routine + * will try to use the input perm_r, unless a certain threshold + * criterion is violated. In that case, perm_r is overwritten by a + * new permutation determined by partial pivoting or diagonal + * threshold pivoting. + * Otherwise, perm_r is output argument. + * + * etree (input/output) int*, dimension (A->ncol) + * Elimination tree of Pc'*A'*A*Pc. + * If options->Fact != FACTORED and options->Fact != DOFACT, + * etree is an input argument, otherwise it is an output argument. + * Note: etree is a vector of parent pointers for a forest whose + * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. + * + * equed (input/output) char* + * Specifies the form of equilibration that was done. + * = 'N': No equilibration. + * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). + * = 'C': Column equilibration, i.e., A was postmultiplied by diag(C). + * = 'B': Both row and column equilibration, i.e., A was replaced + * by diag(R)*A*diag(C). + * If options->Fact = FACTORED, equed is an input argument, + * otherwise it is an output argument. + * + * R (input/output) double*, dimension (A->nrow) + * The row scale factors for A or transpose(A). + * If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A) + * (if A->Stype = SLU_NR) is multiplied on the left by diag(R). + * If equed = 'N' or 'C', R is not accessed. + * If options->Fact = FACTORED, R is an input argument, + * otherwise, R is output. + * If options->zFact = FACTORED and equed = 'R' or 'B', each element + * of R must be positive. + * + * C (input/output) double*, dimension (A->ncol) + * The column scale factors for A or transpose(A). + * If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A) + * (if A->Stype = SLU_NR) is multiplied on the right by diag(C). + * If equed = 'N' or 'R', C is not accessed. + * If options->Fact = FACTORED, C is an input argument, + * otherwise, C is output. + * If options->Fact = FACTORED and equed = 'C' or 'B', each element + * of C must be positive. + * + * L (output) SuperMatrix* + * The factor L from the factorization + * Pr*A*Pc=L*U (if A->Stype SLU_= NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses compressed row subscripts storage for supernodes, i.e., + * L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization + * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or + * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). + * Uses column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. + * + * work (workspace/output) void*, size (lwork) (in bytes) + * User supplied workspace, should be large enough + * to hold data structures for factors L and U. + * On exit, if fact is not 'F', L and U point to this array. + * + * lwork (input) int + * Specifies the size of work array in bytes. + * = 0: allocate space internally by system malloc; + * > 0: use user-supplied work array of length lwork in bytes, + * returns error if space runs out. + * = -1: the routine guesses the amount of space needed without + * performing the factorization, and returns it in + * mem_usage->total_needed; no other side effects. + * + * See argument 'mem_usage' for memory usage statistics. + * + * B (input/output) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. + * On entry, the right hand side matrix. + * If B->ncol = 0, only LU decomposition is performed, the triangular + * solve is skipped. + * On exit, + * if equed = 'N', B is not modified; otherwise + * if A->Stype = SLU_NC: + * if options->Trans = NOTRANS and equed = 'R' or 'B', + * B is overwritten by diag(R)*B; + * if options->Trans = TRANS or CONJ and equed = 'C' of 'B', + * B is overwritten by diag(C)*B; + * if A->Stype = SLU_NR: + * if options->Trans = NOTRANS and equed = 'C' or 'B', + * B is overwritten by diag(C)*B; + * if options->Trans = TRANS or CONJ and equed = 'R' of 'B', + * B is overwritten by diag(R)*B. + * + * X (output) SuperMatrix* + * X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. + * If info = 0 or info = A->ncol+1, X contains the solution matrix + * to the original system of equations. Note that A and B are modified + * on exit if equed is not 'N', and the solution to the equilibrated + * system is inv(diag(C))*X if options->Trans = NOTRANS and + * equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C' + * and equed = 'R' or 'B'. + * + * recip_pivot_growth (output) double* + * The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ). + * The infinity norm is used. If recip_pivot_growth is much less + * than 1, the stability of the LU factorization could be poor. + * + * rcond (output) double* + * The estimate of the reciprocal condition number of the matrix A + * after equilibration (if done). If rcond is less than the machine + * precision (in particular, if rcond = 0), the matrix is singular + * to working precision. This condition is indicated by a return + * code of info > 0. + * + * FERR (output) double*, dimension (B->ncol) + * The estimated forward error bound for each solution vector + * X(j) (the j-th column of the solution matrix X). + * If XTRUE is the true solution corresponding to X(j), FERR(j) + * is an estimated upper bound for the magnitude of the largest + * element in (X(j) - XTRUE) divided by the magnitude of the + * largest element in X(j). The estimate is as reliable as + * the estimate for RCOND, and is almost always a slight + * overestimate of the true error. + * If options->IterRefine = NOREFINE, ferr = 1.0. + * + * BERR (output) double*, dimension (B->ncol) + * The componentwise relative backward error of each solution + * vector X(j) (i.e., the smallest relative change in + * any element of A or B that makes X(j) an exact solution). + * If options->IterRefine = NOREFINE, berr = 1.0. + * + * mem_usage (output) mem_usage_t* + * Record the memory usage statistics, consisting of following fields: + * - for_lu (float) + * The amount of space used in bytes for L\U data structures. + * - total_needed (float) + * The amount of space needed in bytes to perform factorization. + * - expansions (int) + * The number of memory expansions during the LU factorization. + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See slu_util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + * > 0: if info = i, and i is + * <= A->ncol: U(i,i) is exactly zero. The factorization has + * been completed, but the factor U is exactly + * singular, so the solution and error bounds + * could not be computed. + * = A->ncol+1: U is nonsingular, but RCOND is less than machine + * precision, meaning that the matrix is singular to + * working precision. Nevertheless, the solution and + * error bounds are computed because there are a number + * of situations where the computed solution can be more + * accurate than the value of RCOND would suggest. + * > A->ncol+1: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. + *+ */ + +void +zgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + int *etree, char *equed, double *R, double *C, + SuperMatrix *L, SuperMatrix *U, void *work, int lwork, + SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, + double *rcond, double *ferr, double *berr, + mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info ) +{ + + + DNformat *Bstore, *Xstore; + doublecomplex *Bmat, *Xmat; + int ldb, ldx, nrhs; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int colequ, equil, nofact, notran, rowequ, permc_spec; + trans_t trant; + char norm[1]; + int i, j, info1; + double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; + int relax, panel_size; + double diag_pivot_thresh; + double t0; /* temporary time */ + double *utime; + + /* External functions */ + extern double zlangs(char *, SuperMatrix *); + + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + + *info = 0; + nofact = (options->Fact != FACTORED); + equil = (options->Equil == YES_SuperLU); + notran = (options->Trans == NOTRANS); + if ( nofact ) { + *(unsigned char *)equed = 'N'; + rowequ = FALSE; + colequ = FALSE; + } else { + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + smlnum = dlamch_("Safe minimum"); + bignum = 1. / smlnum; + } + +#if 0 +printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n", + options->Fact, options->Trans, *equed); +#endif + + /* Test the input parameters */ + if (options->Fact != DOFACT && options->Fact != SamePattern && + options->Fact != SamePattern_SameRowPerm && + options->Fact != FACTORED && + options->Trans != NOTRANS && options->Trans != TRANS && + options->Trans != CONJ && + options->Equil != NO_SuperLU && options->Equil != YES_SuperLU) + *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_Z || A->Mtype != SLU_GE ) + *info = -2; + else if (options->Fact == FACTORED && + !(rowequ || colequ || lsame_(equed, "N"))) + *info = -6; + else { + if (rowequ) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, R[j]); + rcmax = SUPERLU_MAX(rcmax, R[j]); + } + if (rcmin <= 0.) *info = -7; + else if ( A->nrow > 0) + rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else rowcnd = 1.; + } + if (colequ && *info == 0) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, C[j]); + rcmax = SUPERLU_MAX(rcmax, C[j]); + } + if (rcmin <= 0.) *info = -8; + else if (A->nrow > 0) + colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else colcnd = 1.; + } + if (*info == 0) { + if ( lwork < -1 ) *info = -12; + else if ( B->ncol < 0 ) *info = -13; + else if ( B->ncol > 0 ) { /* no checking if B->ncol=0 */ + if ( Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_Z || + B->Mtype != SLU_GE ) + *info = -13; + } + if ( X->ncol < 0 ) *info = -14; + else if ( X->ncol > 0 ) { /* no checking if X->ncol=0 */ + if ( Xstore->lda < SUPERLU_MAX(0, A->nrow) || + (B->ncol != 0 && B->ncol != X->ncol) || + X->Stype != SLU_DN || + X->Dtype != SLU_Z || X->Mtype != SLU_GE ) + *info = -14; + } + } + } + if (*info != 0) { + i = -(*info); + xerbla_("zgssvx", &i); + return; + } + + /* Initialization for factor parameters */ + panel_size = sp_ienv(1); + relax = sp_ienv(2); + diag_pivot_thresh = options->DiagPivotThresh; + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + if ( notran ) { /* Reverse the transpose argument. */ + trant = TRANS; + notran = 0; + } else { + trant = NOTRANS; + notran = 1; + } + } else { /* A->Stype == SLU_NC */ + trant = options->Trans; + AA = A; + } + + if ( nofact && equil ) { + t0 = SuperLU_timer_(); + /* Compute row and column scalings to equilibrate the matrix A. */ + zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); + + if ( info1 == 0 ) { + /* Equilibrate matrix A. */ + zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + } + utime[EQUIL] = SuperLU_timer_() - t0; + } + + + if ( nofact ) { + + t0 = SuperLU_timer_(); + /* + * Gnet column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t0; + + t0 = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t0; + +/* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", + relax, panel_size, sp_ienv(3), sp_ienv(4)); + fflush(stdout); */ + + /* Compute the LU factorization of A*Pc. */ + t0 = SuperLU_timer_(); + zgstrf(options, &AC, relax, panel_size, etree, + work, lwork, perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t0; + + if ( lwork == -1 ) { + mem_usage->total_needed = *info - A->ncol; + return; + } + } + + if ( options->PivotGrowth ) { + if ( *info > 0 ) { + if ( *info <= A->ncol ) { + /* Compute the reciprocal pivot growth factor of the leading + rank-deficient *info columns of A. */ + *recip_pivot_growth = zPivotGrowth(*info, AA, perm_c, L, U); + } + return; + } + + /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ + *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U); + } + + if ( options->ConditionNumber ) { + /* Estimate the reciprocal of the condition number of A. */ + t0 = SuperLU_timer_(); + if ( notran ) { + *(unsigned char *)norm = '1'; + } else { + *(unsigned char *)norm = 'I'; + } + anorm = zlangs(norm, AA); + zgscon(norm, L, U, anorm, rcond, stat, info); + utime[RCOND] = SuperLU_timer_() - t0; + } + + if ( nrhs > 0 ) { + /* Scale the right hand side if equilibration was performed. */ + if ( notran ) { + if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]); + } + } else if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]); + } + + /* Compute the solution matrix X. */ + for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ + for (i = 0; i < B->nrow; i++) + Xmat[i + j*ldx] = Bmat[i + j*ldb]; + + t0 = SuperLU_timer_(); + zgstrs (trant, L, U, perm_c, perm_r, X, stat, info); + utime[SOLVE] = SuperLU_timer_() - t0; + + /* Use iterative refinement to improve the computed solution and compute + error bounds and backward error estimates for it. */ + t0 = SuperLU_timer_(); + if ( options->IterRefine != NOREFINE ) { + zgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B, + X, ferr, berr, stat, info); + } else { + for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0; + } + utime[REFINE] = SuperLU_timer_() - t0; + + /* Transform the solution matrix X to a solution of the original system. */ + if ( notran ) { + if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]); + } + } else if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]); + } + } /* end if nrhs > 0 */ + + if ( options->ConditionNumber ) { + /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ + if ( *rcond < dlamch_("E") ) *info = A->ncol + 1; + } + + if ( nofact ) { + zQuerySpace(L, U, mem_usage); + Destroy_CompCol_Permuted(&AC); + } + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/zgstrf.c b/src/maths/SuperLU/zgstrf.c new file mode 100644 index 000000000..ed79815e4 --- /dev/null +++ b/src/maths/SuperLU/zgstrf.c @@ -0,0 +1,436 @@ + +/*! @file zgstrf.c + * \brief Computes an LU factorization of a general sparse matrix + * + *+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include+ +/*! \brief + * + * + * Purpose + * ======= + * + * ZGSTRF computes an LU factorization of a general sparse m-by-n + * matrix A using partial pivoting with row interchanges. + * The factorization has the form + * Pr * A = L * U + * where Pr is a row permutation matrix, L is lower triangular with unit + * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper + * triangular (upper trapezoidal if A->nrow < A->ncol). + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * options (input) superlu_options_t* + * The structure defines the input parameters to control + * how the LU decomposition will be performed. + * + * A (input) SuperMatrix* + * Original matrix A, permuted by columns, of dimension + * (A->nrow, A->ncol). The type of A can be: + * Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. + * + * relax (input) int + * To control degree of relaxing supernodes. If the number + * of nodes (columns) in a subtree of the elimination tree is less + * than relax, this subtree is considered as one supernode, + * regardless of the row structures of those columns. + * + * panel_size (input) int + * A panel consists of at most panel_size consecutive columns. + * + * etree (input) int*, dimension (A->ncol) + * Elimination tree of A'*A. + * Note: etree is a vector of parent pointers for a forest whose + * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. + * On input, the columns of A should be permuted so that the + * etree is in a certain postorder. + * + * work (input/output) void*, size (lwork) (in bytes) + * User-supplied work space and space for the output data structures. + * Not referenced if lwork = 0; + * + * lwork (input) int + * Specifies the size of work array in bytes. + * = 0: allocate space internally by system malloc; + * > 0: use user-supplied work array of length lwork in bytes, + * returns error if space runs out. + * = -1: the routine guesses the amount of space needed without + * performing the factorization, and returns it in + * *info; no other side effects. + * + * perm_c (input) int*, dimension (A->ncol) + * Column permutation vector, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * When searching for diagonal, perm_c[*] is applied to the + * row subscripts of A, so that diagonal threshold pivoting + * can find the diagonal of A, rather than that of A*Pc. + * + * perm_r (input/output) int*, dimension (A->nrow) + * Row permutation vector which defines the permutation matrix Pr, + * perm_r[i] = j means row i of A is in position j in Pr*A. + * If options->Fact = SamePattern_SameRowPerm, the pivoting routine + * will try to use the input perm_r, unless a certain threshold + * criterion is violated. In that case, perm_r is overwritten by + * a new permutation determined by partial pivoting or diagonal + * threshold pivoting. + * Otherwise, perm_r is output argument; + * + * L (output) SuperMatrix* + * The factor L from the factorization Pr*A=L*U; use compressed row + * subscripts storage for supernodes, i.e., L has type: + * Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise + * storage scheme, i.e., U has types: Stype = SLU_NC, + * Dtype = SLU_Z, Mtype = SLU_TRU. + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See slu_util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + * > 0: if info = i, and i is + * <= A->ncol: U(i,i) is exactly zero. The factorization has + * been completed, but the factor U is exactly singular, + * and division by zero will occur if it is used to solve a + * system of equations. + * > A->ncol: number of bytes allocated when memory allocation + * failure occurred, plus A->ncol. If lwork = -1, it is + * the estimated amount of space needed, plus A->ncol. + * + * ====================================================================== + * + * Local Working Arrays: + * ====================== + * m = number of rows in the matrix + * n = number of columns in the matrix + * + * xprune[0:n-1]: xprune[*] points to locations in subscript + * vector lsub[*]. For column i, xprune[i] denotes the point where + * structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need + * to be traversed for symbolic factorization. + * + * marker[0:3*m-1]: marker[i] = j means that node i has been + * reached when working on column j. + * Storage: relative to original row subscripts + * NOTE: There are 3 of them: marker/marker1 are used for panel dfs, + * see zpanel_dfs.c; marker2 is used for inner-factorization, + * see zcolumn_dfs.c. + * + * parent[0:m-1]: parent vector used during dfs + * Storage: relative to new row subscripts + * + * xplore[0:m-1]: xplore[i] gives the location of the next (dfs) + * unexplored neighbor of i in lsub[*] + * + * segrep[0:nseg-1]: contains the list of supernodal representatives + * in topological order of the dfs. A supernode representative is the + * last column of a supernode. + * The maximum size of segrep[] is n. + * + * repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a + * supernodal representative r, repfnz[r] is the location of the first + * nonzero in this segment. It is also used during the dfs: repfnz[r]>0 + * indicates the supernode r has been explored. + * NOTE: There are W of them, each used for one column of a panel. + * + * panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below + * the panel diagonal. These are filled in during zpanel_dfs(), and are + * used later in the inner LU factorization within the panel. + * panel_lsub[]/dense[] pair forms the SPA data structure. + * NOTE: There are W of them. + * + * dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values; + * NOTE: there are W of them. + * + * tempv[0:*]: real temporary used for dense numeric kernels; + * The size of this array is defined by NUM_TEMPV() in slu_zdefs.h. + *+ */ + +void +zgstrf (superlu_options_t *options, SuperMatrix *A, + int relax, int panel_size, int *etree, void *work, int lwork, + int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, + SuperLUStat_t *stat, int *info) +{ + /* Local working arrays */ + NCPformat *Astore; + int *iperm_r = NULL; /* inverse of perm_r; used when + options->Fact == SamePattern_SameRowPerm */ + int *iperm_c; /* inverse of perm_c */ + int *iwork; + doublecomplex *zwork; + int *segrep, *repfnz, *parent, *xplore; + int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ + int *xprune; + int *marker; + doublecomplex *dense, *tempv; + int *relax_end; + doublecomplex *a; + int *asub; + int *xa_begin, *xa_end; + int *xsup, *supno; + int *xlsub, *xlusup, *xusub; + int nzlumax; + double fill_ratio = sp_ienv(6); /* estimated fill ratio */ + static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ + + /* Local scalars */ + fact_t fact = options->Fact; + double diag_pivot_thresh = options->DiagPivotThresh; + int pivrow; /* pivotal row number in the original matrix A */ + int nseg1; /* no of segments in U-column above panel row jcol */ + int nseg; /* no of segments in each U-column */ + register int jcol; + register int kcol; /* end column of a relaxed snode */ + register int icol; + register int i, k, jj, new_next, iinfo; + int m, n, min_mn, jsupno, fsupc, nextlu, nextu; + int w_def; /* upper bound on panel width */ + int usepr, iperm_r_allocated = 0; + int nnzL, nnzU; + int *panel_histo = stat->panel_histo; + flops_t *ops = stat->ops; + + iinfo = 0; + m = A->nrow; + n = A->ncol; + min_mn = SUPERLU_MIN(m, n); + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + + /* Allocate storage common to the factor routines */ + *info = zLUMemInit(fact, work, lwork, m, n, Astore->nnz, + panel_size, fill_ratio, L, U, &Glu, &iwork, &zwork); + if ( *info ) return; + + xsup = Glu.xsup; + supno = Glu.supno; + xlsub = Glu.xlsub; + xlusup = Glu.xlusup; + xusub = Glu.xusub; + + SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, + &repfnz, &panel_lsub, &xprune, &marker); + zSetRWork(m, panel_size, zwork, &dense, &tempv); + + usepr = (fact == SamePattern_SameRowPerm); + if ( usepr ) { + /* Compute the inverse of perm_r */ + iperm_r = (int *) intMalloc(m); + for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; + iperm_r_allocated = 1; + } + iperm_c = (int *) intMalloc(n); + for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; + + /* Identify relaxed snodes */ + relax_end = (int *) intMalloc(n); + if ( options->SymmetricMode == YES_SuperLU ) { + heap_relax_snode(n, etree, relax, marker, relax_end); + } else { + relax_snode(n, etree, relax, marker, relax_end); + } + + ifill (perm_r, m, EMPTY); + ifill (marker, m * NO_MARKER, EMPTY); + supno[0] = -1; + xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; + w_def = panel_size; + + /* + * Work on one "panel" at a time. A panel is one of the following: + * (a) a relaxed supernode at the bottom of the etree, or + * (b) panel_size contiguous columns, defined by the user + */ + for (jcol = 0; jcol < min_mn; ) { + + if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ + kcol = relax_end[jcol]; /* end of the relaxed snode */ + panel_histo[kcol-jcol+1]++; + + /* -------------------------------------- + * Factorize the relaxed supernode(jcol:kcol) + * -------------------------------------- */ + /* Determine the union of the row structure of the snode */ + if ( (*info = zsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, + xprune, marker, &Glu)) != 0 ) + return; + + nextu = xusub[jcol]; + nextlu = xlusup[jcol]; + jsupno = supno[jcol]; + fsupc = xsup[jsupno]; + new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); + nzlumax = Glu.nzlumax; + while ( new_next > nzlumax ) { + if ( (*info = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) ) + return; + } + + for (icol = jcol; icol<= kcol; icol++) { + xusub[icol+1] = nextu; + + /* Scatter into SPA dense[*] */ + for (k = xa_begin[icol]; k < xa_end[icol]; k++) + dense[asub[k]] = a[k]; + + /* Numeric update within the snode */ + zsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); + + if ( (*info = zpivotL(icol, diag_pivot_thresh, &usepr, perm_r, + iperm_r, iperm_c, &pivrow, &Glu, stat)) ) + if ( iinfo == 0 ) iinfo = *info; + +#ifdef DEBUG + zprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); +#endif + + } + + jcol = icol; + + } else { /* Work on one panel of panel_size columns */ + + /* Adjust panel_size so that a panel won't overlap with the next + * relaxed snode. + */ + panel_size = w_def; + for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) + if ( relax_end[k] != EMPTY ) { + panel_size = k - jcol; + break; + } + if ( k == min_mn ) panel_size = min_mn - jcol; + panel_histo[panel_size]++; + + /* symbolic factor on a panel of columns */ + zpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, + dense, panel_lsub, segrep, repfnz, xprune, + marker, parent, xplore, &Glu); + + /* numeric sup-panel updates in topological order */ + zpanel_bmod(m, panel_size, jcol, nseg1, dense, + tempv, segrep, repfnz, &Glu, stat); + + /* Sparse LU within the panel, and below panel diagonal */ + for ( jj = jcol; jj < jcol + panel_size; jj++) { + k = (jj - jcol) * m; /* column index for w-wide arrays */ + + nseg = nseg1; /* Begin after all the panel segments */ + + if ((*info = zcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], + segrep, &repfnz[k], xprune, marker, + parent, xplore, &Glu)) != 0) return; + + /* Numeric updates */ + if ((*info = zcolumn_bmod(jj, (nseg - nseg1), &dense[k], + tempv, &segrep[nseg1], &repfnz[k], + jcol, &Glu, stat)) != 0) return; + + /* Copy the U-segments to ucol[*] */ + if ((*info = zcopy_to_ucol(jj, nseg, segrep, &repfnz[k], + perm_r, &dense[k], &Glu)) != 0) + return; + + if ( (*info = zpivotL(jj, diag_pivot_thresh, &usepr, perm_r, + iperm_r, iperm_c, &pivrow, &Glu, stat)) ) + if ( iinfo == 0 ) iinfo = *info; + + /* Prune columns (0:jj-1) using column jj */ + zpruneL(jj, perm_r, pivrow, nseg, segrep, + &repfnz[k], xprune, &Glu); + + /* Reset repfnz[] for this column */ + resetrep_col (nseg, segrep, &repfnz[k]); + +#ifdef DEBUG + zprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); +#endif + + } + + jcol += panel_size; /* Move to the next panel */ + + } /* else */ + + } /* for */ + + *info = iinfo; + + if ( m > n ) { + k = 0; + for (i = 0; i < m; ++i) + if ( perm_r[i] == EMPTY ) { + perm_r[i] = n + k; + ++k; + } + } + + countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); + fixupL(min_mn, perm_r, &Glu); + + zLUWorkFree(iwork, zwork, &Glu); /* Free work space and compress storage */ + + if ( fact == SamePattern_SameRowPerm ) { + /* L and U structures may have changed due to possibly different + pivoting, even though the storage is available. + There could also be memory expansions, so the array locations + may have changed, */ + ((SCformat *)L->Store)->nnz = nnzL; + ((SCformat *)L->Store)->nsuper = Glu.supno[n]; + ((SCformat *)L->Store)->nzval = Glu.lusup; + ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; + ((SCformat *)L->Store)->rowind = Glu.lsub; + ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; + ((NCformat *)U->Store)->nnz = nnzU; + ((NCformat *)U->Store)->nzval = Glu.ucol; + ((NCformat *)U->Store)->rowind = Glu.usub; + ((NCformat *)U->Store)->colptr = Glu.xusub; + } else { + zCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, + Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, + Glu.xsup, SLU_SC, SLU_Z, SLU_TRLU); + zCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, + Glu.usub, Glu.xusub, SLU_NC, SLU_Z, SLU_TRU); + } + + ops[FACT] += ops[TRSV] + ops[GEMV]; + stat->expansions = --(Glu.num_expansions); + + if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); + SUPERLU_FREE (iperm_c); + SUPERLU_FREE (relax_end); + +} diff --git a/src/maths/SuperLU/zgstrs.c b/src/maths/SuperLU/zgstrs.c new file mode 100644 index 000000000..7b2a80437 --- /dev/null +++ b/src/maths/SuperLU/zgstrs.c @@ -0,0 +1,350 @@ + +/*! @file zgstrs.c + * \brief Solves a system using LU factorization + * + *+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + +#include+ + +/* + * Function prototypes + */ +void zusolve(int, int, doublecomplex*, doublecomplex*); +void zlsolve(int, int, doublecomplex*, doublecomplex*); +void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*); + +/*! \brief + * + * + * Purpose + * ======= + * + * ZGSTRS solves a system of linear equations A*X=B or A'*X=B + * with A sparse and B dense, using the LU factorization computed by + * ZGSTRF. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * trans (input) trans_t + * Specifies the form of the system of equations: + * = NOTRANS: A * X = B (No transpose) + * = TRANS: A'* X = B (Transpose) + * = CONJ: A**H * X = B (Conjugate transpose) + * + * L (input) SuperMatrix* + * The factor L from the factorization Pr*A*Pc=L*U as computed by + * zgstrf(). Use compressed row subscripts storage for supernodes, + * i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. + * + * U (input) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U as computed by + * zgstrf(). Use column-wise storage scheme, i.e., U has types: + * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. + * + * perm_c (input) int*, dimension (L->ncol) + * Column permutation vector, which defines the + * permutation matrix Pc; perm_c[i] = j means column i of A is + * in position j in A*Pc. + * + * perm_r (input) int*, dimension (L->nrow) + * Row permutation vector, which defines the permutation matrix Pr; + * perm_r[i] = j means row i of A is in position j in Pr*A. + * + * B (input/output) SuperMatrix* + * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. + * On entry, the right hand side matrix. + * On exit, the solution matrix if info = 0; + * + * stat (output) SuperLUStat_t* + * Record the statistics on runtime and floating-point operation count. + * See util.h for the definition of 'SuperLUStat_t'. + * + * info (output) int* + * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value + *+ */ + +void +zgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, + int *perm_c, int *perm_r, SuperMatrix *B, + SuperLUStat_t *stat, int *info) +{ + +#ifdef _CRAY + _fcd ftcs1, ftcs2, ftcs3, ftcs4; +#endif + int incx = 1, incy = 1; +#ifdef USE_VENDOR_BLAS + doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; + doublecomplex *work_col; +#endif + doublecomplex temp_comp; + DNformat *Bstore; + doublecomplex *Bmat; + SCformat *Lstore; + NCformat *Ustore; + doublecomplex *Lval, *Uval; + int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; + int i, j, k, iptr, jcol, n, ldb, nrhs; + doublecomplex *work, *rhs_work, *soln; + flops_t solve_ops; + void zprint_soln(); + + /* Test input parameters ... */ + *info = 0; + Bstore = B->Store; + ldb = Bstore->lda; + nrhs = B->ncol; + if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; + else if ( L->nrow != L->ncol || L->nrow < 0 || + L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU ) + *info = -2; + else if ( U->nrow != U->ncol || U->nrow < 0 || + U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU ) + *info = -3; + else if ( ldb < SUPERLU_MAX(0, L->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) + *info = -6; + if ( *info ) { + i = -(*info); + xerbla_("zgstrs", &i); + return; + } + + n = L->nrow; + work = doublecomplexCalloc(n * nrhs); + if ( !work ) ABORT_SuperLU("Malloc fails for local work[]."); + soln = doublecomplexMalloc(n); + if ( !soln ) ABORT_SuperLU("Malloc fails for local soln[]."); + + Bmat = Bstore->nzval; + Lstore = L->Store; + Lval = Lstore->nzval; + Ustore = U->Store; + Uval = Ustore->nzval; + solve_ops = 0; + + if ( trans == NOTRANS ) { + /* Permute right hand sides to form Pr*B */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + /* Forward solve PLy=Pb. */ + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + nrow = nsupr - nsupc; + + solve_ops += 4 * nsupc * (nsupc - 1) * nrhs; + solve_ops += 8 * nrow * nsupc * nrhs; + + if ( nsupc == 1 ) { + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + luptr = L_NZ_START(fsupc); + for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ + irow = L_SUB(iptr); + ++luptr; + zz_mult(&temp_comp, &rhs_work[fsupc], &Lval[luptr]); + z_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); + } + } + } else { + luptr = L_NZ_START(fsupc); +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); + CTRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); + + CGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, + &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, + &beta, &work[0], &n ); +#else + ztrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); + + zgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, + &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, + &beta, &work[0], &n ); +#endif + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + work_col = &work[j*n]; + iptr = istart + nsupc; + for (i = 0; i < nrow; i++) { + irow = L_SUB(iptr); + z_sub(&rhs_work[irow], &rhs_work[irow], &work_col[i]); + work_col[i].r = 0.0; + work_col[i].i = 0.0; + iptr++; + } + } +#else + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + zlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); + zmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], + &rhs_work[fsupc], &work[0] ); + + iptr = istart + nsupc; + for (i = 0; i < nrow; i++) { + irow = L_SUB(iptr); + z_sub(&rhs_work[irow], &rhs_work[irow], &work[i]); + work[i].r = 0.; + work[i].i = 0.; + iptr++; + } + } +#endif + } /* else ... */ + } /* for L-solve */ + +#ifdef DEBUG + printf("After L-solve: y=\n"); + zprint_soln(n, nrhs, Bmat); +#endif + + /* + * Back solve Ux=y. + */ + for (k = Lstore->nsuper; k >= 0; k--) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += 4 * nsupc * (nsupc + 1) * nrhs; + + if ( nsupc == 1 ) { + rhs_work = &Bmat[0]; + for (j = 0; j < nrhs; j++) { + z_div(&rhs_work[fsupc], &rhs_work[fsupc], &Lval[luptr]); + rhs_work += ldb; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("U", strlen("U")); + ftcs3 = _cptofcd("N", strlen("N")); + CTRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); +#else + ztrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); +#endif +#else + for (j = 0; j < nrhs; j++) + zusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); +#endif + } + + for (j = 0; j < nrhs; ++j) { + rhs_work = &Bmat[j*ldb]; + for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { + solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ + irow = U_SUB(i); + zz_mult(&temp_comp, &rhs_work[jcol], &Uval[i]); + z_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); + } + } + } + + } /* for U-solve */ + +#ifdef DEBUG + printf("After U-solve: x=\n"); + zprint_soln(n, nrhs, Bmat); +#endif + + /* Compute the final solution X := Pc*X. */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + stat->ops[SOLVE] = solve_ops; + + } else { /* Solve A'*X=B or CONJ(A)*X=B */ + /* Permute right hand sides to form Pc'*B. */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + stat->ops[SOLVE] = 0; + if (trans == TRANS) { + for (k = 0; k < nrhs; ++k) { + /* Multiply by inv(U'). */ + sp_ztrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); + + /* Multiply by inv(L'). */ + sp_ztrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); + } + } else { /* trans == CONJ */ + for (k = 0; k < nrhs; ++k) { + /* Multiply by conj(inv(U')). */ + sp_ztrsv("U", "C", "N", L, U, &Bmat[k*ldb], stat, info); + + /* Multiply by conj(inv(L')). */ + sp_ztrsv("L", "C", "U", L, U, &Bmat[k*ldb], stat, info); + } + } + /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + } + + SUPERLU_FREE(work); + SUPERLU_FREE(soln); +} + +/* + * Diagnostic print of the solution vector + */ +void +zprint_soln(int n, int nrhs, doublecomplex *soln) +{ + int i; + + for (i = 0; i < n; i++) + printf("\t%d: %.4f\n", i, soln[i]); +} diff --git a/src/maths/SuperLU/zlacon.c b/src/maths/SuperLU/zlacon.c new file mode 100644 index 000000000..40240d5d3 --- /dev/null +++ b/src/maths/SuperLU/zlacon.c @@ -0,0 +1,221 @@ + +/*! @file zlacon.c + * \brief Estimates the 1-norm + * + *+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + *+ */ +#include+#include +#include + +/*! \brief + * + * + * Purpose + * ======= + * + * ZLACON estimates the 1-norm of a square matrix A. + * Reverse communication is used for evaluating matrix-vector products. + * + * + * Arguments + * ========= + * + * N (input) INT + * The order of the matrix. N >= 1. + * + * V (workspace) DOUBLE COMPLEX PRECISION array, dimension (N) + * On the final return, V = A*W, where EST = norm(V)/norm(W) + * (W is not returned). + * + * X (input/output) DOUBLE COMPLEX PRECISION array, dimension (N) + * On an intermediate return, X should be overwritten by + * A * X, if KASE=1, + * A' * X, if KASE=2, + * where A' is the conjugate transpose of A, + * and ZLACON must be re-called with all the other parameters + * unchanged. + * + * + * EST (output) DOUBLE PRECISION + * An estimate (a lower bound) for norm(A). + * + * KASE (input/output) INT + * On the initial call to ZLACON, KASE should be 0. + * On an intermediate return, KASE will be 1 or 2, indicating + * whether X should be overwritten by A * X or A' * X. + * On the final return from ZLACON, KASE will again be 0. + * + * Further Details + * ======= ======= + * + * Contributed by Nick Higham, University of Manchester. + * Originally named CONEST, dated March 16, 1988. + * + * Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of + * a real or complex matrix, with applications to condition estimation", + * ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. + * ===================================================================== + *+ */ + +int +zlacon_(int *n, doublecomplex *v, doublecomplex *x, double *est, int *kase) + +{ + + + /* Table of constant values */ + int c__1 = 1; + doublecomplex zero = {0.0, 0.0}; + doublecomplex one = {1.0, 0.0}; + + /* System generated locals */ + double d__1; + + /* Local variables */ + static int iter; + static int jump, jlast; + static double altsgn, estold; + static int i, j; + double temp; + double safmin; + extern double dlamch_(char *); + extern int izmax1_(int *, doublecomplex *, int *); + extern double dzsum1_(int *, doublecomplex *, int *); + + safmin = dlamch_("Safe minimum"); + if ( *kase == 0 ) { + for (i = 0; i < *n; ++i) { + x[i].r = 1. / (double) (*n); + x[i].i = 0.; + } + *kase = 1; + jump = 1; + return 0; + } + + switch (jump) { + case 1: goto L20; + case 2: goto L40; + case 3: goto L70; + case 4: goto L110; + case 5: goto L140; + } + + /* ................ ENTRY (JUMP = 1) + FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X. */ + L20: + if (*n == 1) { + v[0] = x[0]; + *est = z_abs(&v[0]); + /* ... QUIT */ + goto L150; + } + *est = dzsum1_(n, x, &c__1); + + for (i = 0; i < *n; ++i) { + d__1 = z_abs(&x[i]); + if (d__1 > safmin) { + d__1 = 1 / d__1; + x[i].r *= d__1; + x[i].i *= d__1; + } else { + x[i] = one; + } + } + *kase = 2; + jump = 2; + return 0; + + /* ................ ENTRY (JUMP = 2) + FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ +L40: + j = izmax1_(n, &x[0], &c__1); + --j; + iter = 2; + + /* MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */ +L50: + for (i = 0; i < *n; ++i) x[i] = zero; + x[j] = one; + *kase = 1; + jump = 3; + return 0; + + /* ................ ENTRY (JUMP = 3) + X HAS BEEN OVERWRITTEN BY A*X. */ +L70: +#ifdef _CRAY + CCOPY(n, x, &c__1, v, &c__1); +#else + zcopy_(n, x, &c__1, v, &c__1); +#endif + estold = *est; + *est = dzsum1_(n, v, &c__1); + + +L90: + /* TEST FOR CYCLING. */ + if (*est <= estold) goto L120; + + for (i = 0; i < *n; ++i) { + d__1 = z_abs(&x[i]); + if (d__1 > safmin) { + d__1 = 1 / d__1; + x[i].r *= d__1; + x[i].i *= d__1; + } else { + x[i] = one; + } + } + *kase = 2; + jump = 4; + return 0; + + /* ................ ENTRY (JUMP = 4) + X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */ +L110: + jlast = j; + j = izmax1_(n, &x[0], &c__1); + --j; + if (x[jlast].r != (d__1 = x[j].r, fabs(d__1)) && iter < 5) { + ++iter; + goto L50; + } + + /* ITERATION COMPLETE. FINAL STAGE. */ +L120: + altsgn = 1.; + for (i = 1; i <= *n; ++i) { + x[i-1].r = altsgn * ((double)(i - 1) / (double)(*n - 1) + 1.); + x[i-1].i = 0.; + altsgn = -altsgn; + } + *kase = 1; + jump = 5; + return 0; + + /* ................ ENTRY (JUMP = 5) + X HAS BEEN OVERWRITTEN BY A*X. */ +L140: + temp = dzsum1_(n, x, &c__1) / (double)(*n * 3) * 2.; + if (temp > *est) { +#ifdef _CRAY + CCOPY(n, &x[0], &c__1, &v[0], &c__1); +#else + zcopy_(n, &x[0], &c__1, &v[0], &c__1); +#endif + *est = temp; + } + +L150: + *kase = 0; + return 0; + +} /* zlacon_ */ diff --git a/src/maths/SuperLU/zlangs.c b/src/maths/SuperLU/zlangs.c new file mode 100644 index 000000000..f3afd65d7 --- /dev/null +++ b/src/maths/SuperLU/zlangs.c @@ -0,0 +1,119 @@ + +/*! @file zlangs.c + * \brief Returns the value of the one norm + * + *+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Modified from lapack routine ZLANGE + *+ */ +/* + * File name: zlangs.c + * History: Modified from lapack routine ZLANGE + */ +#include+#include + +/*! \brief + * + * + * Purpose + * ======= + * + * ZLANGS returns the value of the one norm, or the Frobenius norm, or + * the infinity norm, or the element of largest absolute value of a + * real matrix A. + * + * Description + * =========== + * + * ZLANGE returns the value + * + * ZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' + * ( + * ( norm1(A), NORM = '1', 'O' or 'o' + * ( + * ( normI(A), NORM = 'I' or 'i' + * ( + * ( normF(A), NORM = 'F', 'f', 'E' or 'e' + * + * where norm1 denotes the one norm of a matrix (maximum column sum), + * normI denotes the infinity norm of a matrix (maximum row sum) and + * normF denotes the Frobenius norm of a matrix (square root of sum of + * squares). Note that max(abs(A(i,j))) is not a matrix norm. + * + * Arguments + * ========= + * + * NORM (input) CHARACTER*1 + * Specifies the value to be returned in ZLANGE as described above. + * A (input) SuperMatrix* + * The M by N sparse matrix A. + * + * ===================================================================== + *+ */ + +double zlangs(char *norm, SuperMatrix *A) +{ + + /* Local variables */ + NCformat *Astore; + doublecomplex *Aval; + int i, j, irow; + double value, sum; + double *rwork; + + Astore = A->Store; + Aval = Astore->nzval; + + if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { + value = 0.; + + } else if (lsame_(norm, "M")) { + /* Find max(abs(A(i,j))). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + value = SUPERLU_MAX( value, z_abs( &Aval[i]) ); + + } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { + /* Find norm1(A). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) { + sum = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + sum += z_abs( &Aval[i] ); + value = SUPERLU_MAX(value,sum); + } + + } else if (lsame_(norm, "I")) { + /* Find normI(A). */ + if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for rwork."); + for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { + irow = Astore->rowind[i]; + rwork[irow] += z_abs( &Aval[i] ); + } + value = 0.; + for (i = 0; i < A->nrow; ++i) + value = SUPERLU_MAX(value, rwork[i]); + + SUPERLU_FREE (rwork); + + } else if (lsame_(norm, "F") || lsame_(norm, "E")) { + /* Find normF(A). */ + ABORT_SuperLU("Not implemented."); + } else + ABORT_SuperLU("Illegal norm specified."); + + return (value); + +} /* zlangs */ + diff --git a/src/maths/SuperLU/zlaqgs.c b/src/maths/SuperLU/zlaqgs.c new file mode 100644 index 000000000..c381cb43a --- /dev/null +++ b/src/maths/SuperLU/zlaqgs.c @@ -0,0 +1,147 @@ + +/*! @file zlaqgs.c + * \brief Equlibrates a general sprase matrix + * + *+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Modified from LAPACK routine ZLAQGE + *+ */ +/* + * File name: zlaqgs.c + * History: Modified from LAPACK routine ZLAQGE + */ +#include+#include + +/*! \brief + * + * + * Purpose + * ======= + * + * ZLAQGS equilibrates a general sparse M by N matrix A using the row and + * scaling factors in the vectors R and C. + * + * See supermatrix.h for the definition of 'SuperMatrix' structure. + * + * Arguments + * ========= + * + * A (input/output) SuperMatrix* + * On exit, the equilibrated matrix. See EQUED for the form of + * the equilibrated matrix. The type of A can be: + * Stype = NC; Dtype = SLU_Z; Mtype = GE. + * + * R (input) double*, dimension (A->nrow) + * The row scale factors for A. + * + * C (input) double*, dimension (A->ncol) + * The column scale factors for A. + * + * ROWCND (input) double + * Ratio of the smallest R(i) to the largest R(i). + * + * COLCND (input) double + * Ratio of the smallest C(i) to the largest C(i). + * + * AMAX (input) double + * Absolute value of largest matrix entry. + * + * EQUED (output) char* + * Specifies the form of equilibration that was done. + * = 'N': No equilibration + * = 'R': Row equilibration, i.e., A has been premultiplied by + * diag(R). + * = 'C': Column equilibration, i.e., A has been postmultiplied + * by diag(C). + * = 'B': Both row and column equilibration, i.e., A has been + * replaced by diag(R) * A * diag(C). + * + * Internal Parameters + * =================== + * + * THRESH is a threshold value used to decide if row or column scaling + * should be done based on the ratio of the row or column scaling + * factors. If ROWCND < THRESH, row scaling is done, and if + * COLCND < THRESH, column scaling is done. + * + * LARGE and SMALL are threshold values used to decide if row scaling + * should be done based on the absolute size of the largest matrix + * element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. + * + * ===================================================================== + *+ */ + +void +zlaqgs(SuperMatrix *A, double *r, double *c, + double rowcnd, double colcnd, double amax, char *equed) +{ + + +#define THRESH (0.1) + + /* Local variables */ + NCformat *Astore; + doublecomplex *Aval; + int i, j, irow; + double large, small, cj; + double temp; + + + /* Quick return if possible */ + if (A->nrow <= 0 || A->ncol <= 0) { + *(unsigned char *)equed = 'N'; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + + /* Initialize LARGE and SMALL. */ + small = dlamch_("Safe minimum") / dlamch_("Precision"); + large = 1. / small; + + if (rowcnd >= THRESH && amax >= small && amax <= large) { + if (colcnd >= THRESH) + *(unsigned char *)equed = 'N'; + else { + /* Column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + zd_mult(&Aval[i], &Aval[i], cj); + } + } + *(unsigned char *)equed = 'C'; + } + } else if (colcnd >= THRESH) { + /* Row scaling, no column scaling */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + zd_mult(&Aval[i], &Aval[i], r[irow]); + } + *(unsigned char *)equed = 'R'; + } else { + /* Row and column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + temp = cj * r[irow]; + zd_mult(&Aval[i], &Aval[i], temp); + } + } + *(unsigned char *)equed = 'B'; + } + + return; + +} /* zlaqgs */ + diff --git a/src/maths/SuperLU/zldperm.c b/src/maths/SuperLU/zldperm.c new file mode 100644 index 000000000..7600ae2dc --- /dev/null +++ b/src/maths/SuperLU/zldperm.c @@ -0,0 +1,168 @@ + +/*! @file + * \brief Finds a row permutation so that the matrix has large entries on the diagonal + * + *+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + *+ */ + +#include+ +extern int_t mc64id_(int_t*); +extern int_t mc64ad_(int_t*, int_t*, int_t*, int_t [], int_t [], double [], + int_t*, int_t [], int_t*, int_t[], int_t*, double [], + int_t [], int_t []); + +/*! \brief + * + * + * Purpose + * ======= + * + * ZLDPERM finds a row permutation so that the matrix has large + * entries on the diagonal. + * + * Arguments + * ========= + * + * job (input) int + * Control the action. Possible values for JOB are: + * = 1 : Compute a row permutation of the matrix so that the + * permuted matrix has as many entries on its diagonal as + * possible. The values on the diagonal are of arbitrary size. + * HSL subroutine MC21A/AD is used for this. + * = 2 : Compute a row permutation of the matrix so that the smallest + * value on the diagonal of the permuted matrix is maximized. + * = 3 : Compute a row permutation of the matrix so that the smallest + * value on the diagonal of the permuted matrix is maximized. + * The algorithm differs from the one used for JOB = 2 and may + * have quite a different performance. + * = 4 : Compute a row permutation of the matrix so that the sum + * of the diagonal entries of the permuted matrix is maximized. + * = 5 : Compute a row permutation of the matrix so that the product + * of the diagonal entries of the permuted matrix is maximized + * and vectors to scale the matrix so that the nonzero diagonal + * entries of the permuted matrix are one in absolute value and + * all the off-diagonal entries are less than or equal to one in + * absolute value. + * Restriction: 1 <= JOB <= 5. + * + * n (input) int + * The order of the matrix. + * + * nnz (input) int + * The number of nonzeros in the matrix. + * + * adjncy (input) int*, of size nnz + * The adjacency structure of the matrix, which contains the row + * indices of the nonzeros. + * + * colptr (input) int*, of size n+1 + * The pointers to the beginning of each column in ADJNCY. + * + * nzval (input) doublecomplex*, of size nnz + * The nonzero values of the matrix. nzval[k] is the value of + * the entry corresponding to adjncy[k]. + * It is not used if job = 1. + * + * perm (output) int*, of size n + * The permutation vector. perm[i] = j means row i in the + * original matrix is in row j of the permuted matrix. + * + * u (output) double*, of size n + * If job = 5, the natural logarithms of the row scaling factors. + * + * v (output) double*, of size n + * If job = 5, the natural logarithms of the column scaling factors. + * The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j). + *+ */ + +int +zldperm(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], + doublecomplex nzval[], int_t *perm, double u[], double v[]) +{ + int_t i, liw, ldw, num; + int_t *iw, icntl[10], info[10]; + double *dw; + double *nzval_d = (double *) SUPERLU_MALLOC(nnz * sizeof(double)); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC("Enter zldperm()"); +#endif + liw = 5*n; + if ( job == 3 ) liw = 10*n + nnz; + if ( !(iw = intMalloc(liw)) ) ABORT_SuperLU("Malloc fails for iw[]"); + ldw = 3*n + nnz; + if ( !(dw = (double*) SUPERLU_MALLOC(ldw * sizeof(double))) ) + ABORT_SuperLU("Malloc fails for dw[]"); + + /* Increment one to get 1-based indexing. */ + for (i = 0; i <= n; ++i) ++colptr[i]; + for (i = 0; i < nnz; ++i) ++adjncy[i]; +#if ( DEBUGlevel>=2 ) + printf("LDPERM(): n %d, nnz %d\n", n, nnz); + slu_PrintInt10("colptr", n+1, colptr); + slu_PrintInt10("adjncy", nnz, adjncy); +#endif + + /* + * NOTE: + * ===== + * + * MC64AD assumes that column permutation vector is defined as: + * perm(i) = j means column i of permuted A is in column j of original A. + * + * Since a symmetric permutation preserves the diagonal entries. Then + * by the following relation: + * P'(A*P')P = P'A + * we can apply inverse(perm) to rows of A to get large diagonal entries. + * But, since 'perm' defined in MC64AD happens to be the reverse of + * SuperLU's definition of permutation vector, therefore, it is already + * an inverse for our purpose. We will thus use it directly. + * + */ + mc64id_(icntl); +#if 0 + /* Suppress error and warning messages. */ + icntl[0] = -1; + icntl[1] = -1; +#endif + + for (i = 0; i < nnz; ++i) nzval_d[i] = z_abs1(&nzval[i]); + mc64ad_(&job, &n, &nnz, colptr, adjncy, nzval_d, &num, perm, + &liw, iw, &ldw, dw, icntl, info); + +#if ( DEBUGlevel>=2 ) + slu_PrintInt10("perm", n, perm); + printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); +#endif + if ( info[0] == 1 ) { /* Structurally singular */ + printf(".. The last %d permutations:\n", n-num); + slu_PrintInt10("perm", n-num, &perm[num]); + } + + /* Restore to 0-based indexing. */ + for (i = 0; i <= n; ++i) --colptr[i]; + for (i = 0; i < nnz; ++i) --adjncy[i]; + for (i = 0; i < n; ++i) --perm[i]; + + if ( job == 5 ) + for (i = 0; i < n; ++i) { + u[i] = dw[i]; + v[i] = dw[n+i]; + } + + SUPERLU_FREE(iw); + SUPERLU_FREE(dw); + SUPERLU_FREE(nzval_d); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC("Exit zldperm()"); +#endif + + return info[0]; +} diff --git a/src/maths/SuperLU/zmemory.c b/src/maths/SuperLU/zmemory.c new file mode 100644 index 000000000..c76ba2b14 --- /dev/null +++ b/src/maths/SuperLU/zmemory.c @@ -0,0 +1,701 @@ + +/*! @file zmemory.c + * \brief Memory details + * + *+ * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + *+ */ +#include+ + +/* Internal prototypes */ +void *zexpand (int *, MemType,int, int, GlobalLU_t *); +int zLUWorkInit (int, int, int, int **, doublecomplex **, GlobalLU_t *); +void copy_mem_doublecomplex (int, void *, void *); +void zStackCompress (GlobalLU_t *); +void zSetupSpace (void *, int, GlobalLU_t *); +void *zuser_malloc (int, int, GlobalLU_t *); +void zuser_free (int, int, GlobalLU_t *); + +/* External prototypes (in memory.c - prec-independent) */ +extern void copy_mem_int (int, void *, void *); +extern void user_bcopy (char *, char *, int); + + +/* Macros to manipulate stack */ +#define StackFull(x) ( x + Glu->stack.used >= Glu->stack.size ) +#define NotDoubleAlign(addr) ( (long int)addr & 7 ) +#define DoubleAlign(addr) ( ((long int)addr + 7) & ~7L ) +#define TempSpace(m, w) ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \ + (w + 1) * m * sizeof(doublecomplex) ) +#define Reduce(alpha) ((alpha + 1) / 2) /* i.e. (alpha-1)/2 + 1 */ + + + + +/*! \brief Setup the memory model to be used for factorization. + * + * lwork = 0: use system malloc; + * lwork > 0: use user-supplied work[] space. + */ +void zSetupSpace(void *work, int lwork, GlobalLU_t *Glu) +{ + if ( lwork == 0 ) { + Glu->MemModel = SYSTEM; /* malloc/free */ + } else if ( lwork > 0 ) { + Glu->MemModel = USER; /* user provided space */ + Glu->stack.used = 0; + Glu->stack.top1 = 0; + Glu->stack.top2 = (lwork/4)*4; /* must be word addressable */ + Glu->stack.size = Glu->stack.top2; + Glu->stack.array = (void *) work; + } +} + + + +void *zuser_malloc(int bytes, int which_end, GlobalLU_t *Glu) +{ + void *buf; + + if ( StackFull(bytes) ) return (NULL); + + if ( which_end == HEAD ) { + buf = (char*) Glu->stack.array + Glu->stack.top1; + Glu->stack.top1 += bytes; + } else { + Glu->stack.top2 -= bytes; + buf = (char*) Glu->stack.array + Glu->stack.top2; + } + + Glu->stack.used += bytes; + return buf; +} + + +void zuser_free(int bytes, int which_end, GlobalLU_t *Glu) +{ + if ( which_end == HEAD ) { + Glu->stack.top1 -= bytes; + } else { + Glu->stack.top2 += bytes; + } + Glu->stack.used -= bytes; +} + + + +/*! \brief + * + * + * mem_usage consists of the following fields: + * - for_lu (float) + * The amount of space used in bytes for the L\U data structures. + * - total_needed (float) + * The amount of space needed in bytes to perform factorization. + *+ */ +int zQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) +{ + SCformat *Lstore; + NCformat *Ustore; + register int n, iword, dword, panel_size = sp_ienv(1); + + Lstore = L->Store; + Ustore = U->Store; + n = L->ncol; + iword = sizeof(int); + dword = sizeof(doublecomplex); + + /* For LU factors */ + mem_usage->for_lu = (float)( (4.0*n + 3.0) * iword + + Lstore->nzval_colptr[n] * dword + + Lstore->rowind_colptr[n] * iword ); + mem_usage->for_lu += (float)( (n + 1.0) * iword + + Ustore->colptr[n] * (dword + iword) ); + + /* Working storage to support factorization */ + mem_usage->total_needed = mem_usage->for_lu + + (float)( (2.0 * panel_size + 4.0 + NO_MARKER) * n * iword + + (panel_size + 1.0) * n * dword ); + + return 0; +} /* zQuerySpace */ + + +/*! \brief + * + *+ * mem_usage consists of the following fields: + * - for_lu (float) + * The amount of space used in bytes for the L\U data structures. + * - total_needed (float) + * The amount of space needed in bytes to perform factorization. + *+ */ +int ilu_zQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) +{ + SCformat *Lstore; + NCformat *Ustore; + register int n, panel_size = sp_ienv(1); + register float iword, dword; + + Lstore = L->Store; + Ustore = U->Store; + n = L->ncol; + iword = sizeof(int); + dword = sizeof(double); + + /* For LU factors */ + mem_usage->for_lu = (float)( (4.0f * n + 3.0f) * iword + + Lstore->nzval_colptr[n] * dword + + Lstore->rowind_colptr[n] * iword ); + mem_usage->for_lu += (float)( (n + 1.0f) * iword + + Ustore->colptr[n] * (dword + iword) ); + + /* Working storage to support factorization. + ILU needs 5*n more integers than LU */ + mem_usage->total_needed = mem_usage->for_lu + + (float)( (2.0f * panel_size + 9.0f + NO_MARKER) * n * iword + + (panel_size + 1.0f) * n * dword ); + + return 0; +} /* ilu_zQuerySpace */ + + +/*! \brief Allocate storage for the data structures common to all factor routines. + * + *+ * For those unpredictable size, estimate as fill_ratio * nnz(A). + * Return value: + * If lwork = -1, return the estimated amount of space required, plus n; + * otherwise, return the amount of space actually allocated when + * memory allocation failure occurred. + *+ */ +int +zLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz, + int panel_size, double fill_ratio, SuperMatrix *L, SuperMatrix *U, + GlobalLU_t *Glu, int **iwork, doublecomplex **dwork) +{ + int info, iword, dword; + SCformat *Lstore; + NCformat *Ustore; + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + doublecomplex *ucol; + int *usub, *xusub; + int nzlmax, nzumax, nzlumax; + + iword = sizeof(int); + dword = sizeof(doublecomplex); + Glu->n = n; + Glu->num_expansions = 0; + + if ( !Glu->expanders ) + Glu->expanders = (ExpHeader*)SUPERLU_MALLOC( NO_MEMTYPE * + sizeof(ExpHeader) ); + if ( !Glu->expanders ) ABORT_SuperLU("SUPERLU_MALLOC fails for expanders"); + + if ( fact != SamePattern_SameRowPerm ) { + /* Guess for L\U factors */ + nzumax = nzlumax = fill_ratio * annz; + nzlmax = SUPERLU_MAX(1, fill_ratio/4.) * annz; + + if ( lwork == -1 ) { + return ( GluIntArray(n) * iword + TempSpace(m, panel_size) + + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); + } else { + zSetupSpace(work, lwork, Glu); + } + +#if ( PRNTlevel >= 1 ) + printf("zLUMemInit() called: fill_ratio %.0f, nzlmax %ld, nzumax %ld\n", + fill_ratio, nzlmax, nzumax); + fflush(stdout); +#endif + + /* Integer pointers for L\U factors */ + if ( Glu->MemModel == SYSTEM ) { + xsup = intMalloc(n+1); + supno = intMalloc(n+1); + xlsub = intMalloc(n+1); + xlusup = intMalloc(n+1); + xusub = intMalloc(n+1); + } else { + xsup = (int *)zuser_malloc((n+1) * iword, HEAD, Glu); + supno = (int *)zuser_malloc((n+1) * iword, HEAD, Glu); + xlsub = (int *)zuser_malloc((n+1) * iword, HEAD, Glu); + xlusup = (int *)zuser_malloc((n+1) * iword, HEAD, Glu); + xusub = (int *)zuser_malloc((n+1) * iword, HEAD, Glu); + } + + lusup = (doublecomplex *) zexpand( &nzlumax, LUSUP, 0, 0, Glu ); + ucol = (doublecomplex *) zexpand( &nzumax, UCOL, 0, 0, Glu ); + lsub = (int *) zexpand( &nzlmax, LSUB, 0, 0, Glu ); + usub = (int *) zexpand( &nzumax, USUB, 0, 1, Glu ); + + while ( !lusup || !ucol || !lsub || !usub ) { + if ( Glu->MemModel == SYSTEM ) { + SUPERLU_FREE(lusup); + SUPERLU_FREE(ucol); + SUPERLU_FREE(lsub); + SUPERLU_FREE(usub); + } else { + zuser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, + HEAD, Glu); + } + nzlumax /= 2; + nzumax /= 2; + nzlmax /= 2; + if ( nzlumax < annz ) { + printf("Not enough memory to perform factorization.\n"); + return (zmemory_usage(nzlmax, nzumax, nzlumax, n) + n); + } +#if ( PRNTlevel >= 1) + printf("zLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", + nzlmax, nzumax); + fflush(stdout); +#endif + lusup = (doublecomplex *) zexpand( &nzlumax, LUSUP, 0, 0, Glu ); + ucol = (doublecomplex *) zexpand( &nzumax, UCOL, 0, 0, Glu ); + lsub = (int *) zexpand( &nzlmax, LSUB, 0, 0, Glu ); + usub = (int *) zexpand( &nzumax, USUB, 0, 1, Glu ); + } + + } else { + /* fact == SamePattern_SameRowPerm */ + Lstore = L->Store; + Ustore = U->Store; + xsup = Lstore->sup_to_col; + supno = Lstore->col_to_sup; + xlsub = Lstore->rowind_colptr; + xlusup = Lstore->nzval_colptr; + xusub = Ustore->colptr; + nzlmax = Glu->nzlmax; /* max from previous factorization */ + nzumax = Glu->nzumax; + nzlumax = Glu->nzlumax; + + if ( lwork == -1 ) { + return ( GluIntArray(n) * iword + TempSpace(m, panel_size) + + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); + } else if ( lwork == 0 ) { + Glu->MemModel = SYSTEM; + } else { + Glu->MemModel = USER; + Glu->stack.top2 = (lwork/4)*4; /* must be word-addressable */ + Glu->stack.size = Glu->stack.top2; + } + + lsub = Glu->expanders[LSUB].mem = Lstore->rowind; + lusup = Glu->expanders[LUSUP].mem = Lstore->nzval; + usub = Glu->expanders[USUB].mem = Ustore->rowind; + ucol = Glu->expanders[UCOL].mem = Ustore->nzval;; + Glu->expanders[LSUB].size = nzlmax; + Glu->expanders[LUSUP].size = nzlumax; + Glu->expanders[USUB].size = nzumax; + Glu->expanders[UCOL].size = nzumax; + } + + Glu->xsup = xsup; + Glu->supno = supno; + Glu->lsub = lsub; + Glu->xlsub = xlsub; + Glu->lusup = lusup; + Glu->xlusup = xlusup; + Glu->ucol = ucol; + Glu->usub = usub; + Glu->xusub = xusub; + Glu->nzlmax = nzlmax; + Glu->nzumax = nzumax; + Glu->nzlumax = nzlumax; + + info = zLUWorkInit(m, n, panel_size, iwork, dwork, Glu); + if ( info ) + return ( info + zmemory_usage(nzlmax, nzumax, nzlumax, n) + n); + + ++Glu->num_expansions; + return 0; + +} /* zLUMemInit */ + +/*! \brief Allocate known working storage. Returns 0 if success, otherwise + returns the number of bytes allocated so far when failure occurred. */ +int +zLUWorkInit(int m, int n, int panel_size, int **iworkptr, + doublecomplex **dworkptr, GlobalLU_t *Glu) +{ + int isize, dsize, extra; + doublecomplex *old_ptr; + int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), + rowblk = sp_ienv(4); + + isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int); + dsize = (m * panel_size + + NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(doublecomplex); + + if ( Glu->MemModel == SYSTEM ) + *iworkptr = (int *) intCalloc(isize/sizeof(int)); + else + *iworkptr = (int *) zuser_malloc(isize, TAIL, Glu); + if ( ! *iworkptr ) { + fprintf(stderr, "zLUWorkInit: malloc fails for local iworkptr[]\n"); + return (isize + n); + } + + if ( Glu->MemModel == SYSTEM ) + *dworkptr = (doublecomplex *) SUPERLU_MALLOC(dsize); + else { + *dworkptr = (doublecomplex *) zuser_malloc(dsize, TAIL, Glu); + if ( NotDoubleAlign(*dworkptr) ) { + old_ptr = *dworkptr; + *dworkptr = (doublecomplex*) DoubleAlign(*dworkptr); + *dworkptr = (doublecomplex*) ((double*)*dworkptr - 1); + extra = (char*)old_ptr - (char*)*dworkptr; +#ifdef DEBUG + printf("zLUWorkInit: not aligned, extra %d\n", extra); +#endif + Glu->stack.top2 -= extra; + Glu->stack.used += extra; + } + } + if ( ! *dworkptr ) { + fprintf(stderr, "malloc fails for local dworkptr[]."); + return (isize + dsize + n); + } + + return 0; +} + + +/*! \brief Set up pointers for real working arrays. + */ +void +zSetRWork(int m, int panel_size, doublecomplex *dworkptr, + doublecomplex **dense, doublecomplex **tempv) +{ + doublecomplex zero = {0.0, 0.0}; + + int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), + rowblk = sp_ienv(4); + *dense = dworkptr; + *tempv = *dense + panel_size*m; + zfill (*dense, m * panel_size, zero); + zfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); +} + +/*! \brief Free the working storage used by factor routines. + */ +void zLUWorkFree(int *iwork, doublecomplex *dwork, GlobalLU_t *Glu) +{ + if ( Glu->MemModel == SYSTEM ) { + SUPERLU_FREE (iwork); + SUPERLU_FREE (dwork); + } else { + Glu->stack.used -= (Glu->stack.size - Glu->stack.top2); + Glu->stack.top2 = Glu->stack.size; +/* zStackCompress(Glu); */ + } + + SUPERLU_FREE (Glu->expanders); + Glu->expanders = NULL; +} + +/*! \brief Expand the data structures for L and U during the factorization. + * + *+ * Return value: 0 - successful return + * > 0 - number of bytes allocated when run out of space + *+ */ +int +zLUMemXpand(int jcol, + int next, /* number of elements currently in the factors */ + MemType mem_type, /* which type of memory to expand */ + int *maxlen, /* modified - maximum length of a data structure */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + void *new_mem; + +#ifdef DEBUG + printf("zLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n", + jcol, next, *maxlen, mem_type); +#endif + + if (mem_type == USUB) + new_mem = zexpand(maxlen, mem_type, next, 1, Glu); + else + new_mem = zexpand(maxlen, mem_type, next, 0, Glu); + + if ( !new_mem ) { + int nzlmax = Glu->nzlmax; + int nzumax = Glu->nzumax; + int nzlumax = Glu->nzlumax; + fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); + return (zmemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n); + } + + switch ( mem_type ) { + case LUSUP: + Glu->lusup = (doublecomplex *) new_mem; + Glu->nzlumax = *maxlen; + break; + case UCOL: + Glu->ucol = (doublecomplex *) new_mem; + Glu->nzumax = *maxlen; + break; + case LSUB: + Glu->lsub = (int *) new_mem; + Glu->nzlmax = *maxlen; + break; + case USUB: + Glu->usub = (int *) new_mem; + Glu->nzumax = *maxlen; + break; + } + + return 0; + +} + + + +void +copy_mem_doublecomplex(int howmany, void *old, void *new) +{ + register int i; + doublecomplex *dold = old; + doublecomplex *dnew = new; + for (i = 0; i < howmany; i++) dnew[i] = dold[i]; +} + +/*! \brief Expand the existing storage to accommodate more fill-ins. + */ +void +*zexpand ( + int *prev_len, /* length used from previous call */ + MemType type, /* which part of the memory to expand */ + int len_to_copy, /* size of the memory to be copied to new store */ + int keep_prev, /* = 1: use prev_len; + = 0: compute new_len to expand */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + float EXPAND = 1.5; + float alpha; + void *new_mem, *old_mem; + int new_len, tries, lword, extra, bytes_to_copy; + ExpHeader *expanders = Glu->expanders; /* Array of 4 types of memory */ + + alpha = EXPAND; + + if ( Glu->num_expansions == 0 || keep_prev ) { + /* First time allocate requested */ + new_len = *prev_len; + } else { + new_len = alpha * *prev_len; + } + + if ( type == LSUB || type == USUB ) lword = sizeof(int); + else lword = sizeof(doublecomplex); + + if ( Glu->MemModel == SYSTEM ) { + new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); + if ( Glu->num_expansions != 0 ) { + tries = 0; + if ( keep_prev ) { + if ( !new_mem ) return (NULL); + } else { + while ( !new_mem ) { + if ( ++tries > 10 ) return (NULL); + alpha = Reduce(alpha); + new_len = alpha * *prev_len; + new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); + } + } + if ( type == LSUB || type == USUB ) { + copy_mem_int(len_to_copy, expanders[type].mem, new_mem); + } else { + copy_mem_doublecomplex(len_to_copy, expanders[type].mem, new_mem); + } + SUPERLU_FREE (expanders[type].mem); + } + expanders[type].mem = (void *) new_mem; + + } else { /* MemModel == USER */ + if ( Glu->num_expansions == 0 ) { + new_mem = zuser_malloc(new_len * lword, HEAD, Glu); + if ( NotDoubleAlign(new_mem) && + (type == LUSUP || type == UCOL) ) { + old_mem = new_mem; + new_mem = (void *)DoubleAlign(new_mem); + extra = (char*)new_mem - (char*)old_mem; +#ifdef DEBUG + printf("expand(): not aligned, extra %d\n", extra); +#endif + Glu->stack.top1 += extra; + Glu->stack.used += extra; + } + expanders[type].mem = (void *) new_mem; + } else { + tries = 0; + extra = (new_len - *prev_len) * lword; + if ( keep_prev ) { + if ( StackFull(extra) ) return (NULL); + } else { + while ( StackFull(extra) ) { + if ( ++tries > 10 ) return (NULL); + alpha = Reduce(alpha); + new_len = alpha * *prev_len; + extra = (new_len - *prev_len) * lword; + } + } + + if ( type != USUB ) { + new_mem = (void*)((char*)expanders[type + 1].mem + extra); + bytes_to_copy = (char*)Glu->stack.array + Glu->stack.top1 + - (char*)expanders[type + 1].mem; + user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); + + if ( type < USUB ) { + Glu->usub = expanders[USUB].mem = + (void*)((char*)expanders[USUB].mem + extra); + } + if ( type < LSUB ) { + Glu->lsub = expanders[LSUB].mem = + (void*)((char*)expanders[LSUB].mem + extra); + } + if ( type < UCOL ) { + Glu->ucol = expanders[UCOL].mem = + (void*)((char*)expanders[UCOL].mem + extra); + } + Glu->stack.top1 += extra; + Glu->stack.used += extra; + if ( type == UCOL ) { + Glu->stack.top1 += extra; /* Add same amount for USUB */ + Glu->stack.used += extra; + } + + } /* if ... */ + + } /* else ... */ + } + + expanders[type].size = new_len; + *prev_len = new_len; + if ( Glu->num_expansions ) ++Glu->num_expansions; + + return (void *) expanders[type].mem; + +} /* zexpand */ + + +/*! \brief Compress the work[] array to remove fragmentation. + */ +void +zStackCompress(GlobalLU_t *Glu) +{ + register int iword, dword, ndim; + char *last, *fragment; + int *ifrom, *ito; + doublecomplex *dfrom, *dto; + int *xlsub, *lsub, *xusub, *usub, *xlusup; + doublecomplex *ucol, *lusup; + + iword = sizeof(int); + dword = sizeof(doublecomplex); + ndim = Glu->n; + + xlsub = Glu->xlsub; + lsub = Glu->lsub; + xusub = Glu->xusub; + usub = Glu->usub; + xlusup = Glu->xlusup; + ucol = Glu->ucol; + lusup = Glu->lusup; + + dfrom = ucol; + dto = (doublecomplex *)((char*)lusup + xlusup[ndim] * dword); + copy_mem_doublecomplex(xusub[ndim], dfrom, dto); + ucol = dto; + + ifrom = lsub; + ito = (int *) ((char*)ucol + xusub[ndim] * iword); + copy_mem_int(xlsub[ndim], ifrom, ito); + lsub = ito; + + ifrom = usub; + ito = (int *) ((char*)lsub + xlsub[ndim] * iword); + copy_mem_int(xusub[ndim], ifrom, ito); + usub = ito; + + last = (char*)usub + xusub[ndim] * iword; + fragment = (char*) (((char*)Glu->stack.array + Glu->stack.top1) - last); + Glu->stack.used -= (long int) fragment; + Glu->stack.top1 -= (long int) fragment; + + Glu->ucol = ucol; + Glu->lsub = lsub; + Glu->usub = usub; + +#ifdef DEBUG + printf("zStackCompress: fragment %d\n", fragment); + /* for (last = 0; last < ndim; ++last) + print_lu_col("After compress:", last, 0);*/ +#endif + +} + +/*! \brief Allocate storage for original matrix A + */ +void +zallocateA(int n, int nnz, doublecomplex **a, int **asub, int **xa) +{ + *a = (doublecomplex *) doublecomplexMalloc(nnz); + *asub = (int *) intMalloc(nnz); + *xa = (int *) intMalloc(n+1); +} + + +doublecomplex *doublecomplexMalloc(int n) +{ + doublecomplex *buf; + buf = (doublecomplex *) SUPERLU_MALLOC((size_t)n * sizeof(doublecomplex)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC failed for buf in doublecomplexMalloc()\n"); + } + return (buf); +} + +doublecomplex *doublecomplexCalloc(int n) +{ + doublecomplex *buf; + register int i; + doublecomplex zero = {0.0, 0.0}; + buf = (doublecomplex *) SUPERLU_MALLOC((size_t)n * sizeof(doublecomplex)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC failed for buf in doublecomplexCalloc()\n"); + } + for (i = 0; i < n; ++i) buf[i] = zero; + return (buf); +} + + +int zmemory_usage(const int nzlmax, const int nzumax, + const int nzlumax, const int n) +{ + register int iword, dword; + + iword = sizeof(int); + dword = sizeof(doublecomplex); + + return (10 * n * iword + + nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword); + +} diff --git a/src/maths/SuperLU/zmyblas2.c b/src/maths/SuperLU/zmyblas2.c new file mode 100644 index 000000000..e50505433 --- /dev/null +++ b/src/maths/SuperLU/zmyblas2.c @@ -0,0 +1,188 @@ + +/*! @file zmyblas2.c + * \brief Level 2 Blas operations + * + *+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + *+ * Purpose: + * Level 2 BLAS operations: solves and matvec, written in C. + * Note: + * This is only used when the system lacks an efficient BLAS library. + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ +/* + +*/ + +#include
+ * Purpose + * ======= + * + * Performs numeric block updates (sup-panel) in topological order. + * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. + * Special processing on the supernodal portion of L\U[*,j] + * + * Before entering this routine, the original nonzeros in the panel + * were already copied into the spa[m,w]. + * + * Updated/Output parameters- + * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned + * collectively in the m-by-w vector dense[*]. + *+ */ + +void +zpanel_bmod ( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + const int nseg, /* in */ + doublecomplex *dense, /* out, of size n by w */ + doublecomplex *tempv, /* working array */ + int *segrep, /* in */ + int *repfnz, /* in, of size n by w */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ + + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + doublecomplex alpha, beta; +#endif + + register int k, ksub; + int fsupc, nsupc, nsupr, nrow; + int krep, krep_ind; + doublecomplex ukj, ukj1, ukj2; + int luptr, luptr1, luptr2; + int segsze; + int block_nrow; /* no of rows in a block row */ + register int lptr; /* Points to the row subscripts of a supernode */ + int kfnz, irow, no_zeros; + register int isub, isub1, i; + register int jj; /* Index through each column in the panel */ + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + int *repfnz_col; /* repfnz[] for a column in the panel */ + doublecomplex *dense_col; /* dense[] for a column in the panel */ + doublecomplex *tempv1; /* Used in 1-D update */ + doublecomplex *TriTmp, *MatvecTmp; /* used in 2-D update */ + doublecomplex zero = {0.0, 0.0}; + doublecomplex one = {1.0, 0.0}; + doublecomplex comp_temp, comp_temp1; + register int ldaTmp; + register int r_ind, r_hi; + static int first = 1, maxsuper, rowblk, colblk; + flops_t *ops = stat->ops; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + if ( first ) { + maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ); + rowblk = sp_ienv(4); + colblk = sp_ienv(5); + first = 0; + } + ldaTmp = maxsuper + rowblk; + + /* + * For each nonz supernode segment of U[*,j] in topological order + */ + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ + + /* krep = representative of current k-th supernode + * fsupc = first supernodal column + * nsupc = no of columns in a supernode + * nsupr = no of rows in a supernode + */ + krep = segrep[k--]; + fsupc = xsup[supno[krep]]; + nsupc = krep - fsupc + 1; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; + nrow = nsupr - nsupc; + lptr = xlsub[fsupc]; + krep_ind = lptr + nsupc - 1; + + repfnz_col = repfnz; + dense_col = dense; + + if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ + + TriTmp = tempv; + + /* Sequence through each column in panel -- triangular solves */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + luptr = xlusup[fsupc]; + + ops[TRSV] += 4 * segsze * (segsze - 1); + ops[GEMV] += 8 * nrow * segsze; + + /* Case 1: Update U-segment of size 1 -- col-col update */ + if ( segsze == 1 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { + irow = lsub[i]; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + ++luptr; + } + + } else if ( segsze <= 3 ) { + ukj = dense_col[lsub[krep_ind]]; + ukj1 = dense_col[lsub[krep_ind - 1]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + z_sub(&ukj, &ukj, &comp_temp); + dense_col[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; luptr1++; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + } + } else { + ukj2 = dense_col[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); + z_sub(&ukj1, &ukj1, &comp_temp); + + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&ukj, &ukj, &comp_temp); + dense_col[lsub[krep_ind]] = ukj; + dense_col[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; luptr1++; luptr2++; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + } + } + + } else { /* segsze >= 4 */ + + /* Copy U[*,j] segment from dense[*] to TriTmp[*], which + holds the result of triangular solves. */ + no_zeros = kfnz - fsupc; + isub = lptr + no_zeros; + for (i = 0; i < segsze; ++i) { + irow = lsub[isub]; + TriTmp[i] = dense_col[irow]; /* Gather */ + ++isub; + } + + /* start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, TriTmp, &incx ); +#else + ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, TriTmp, &incx ); +#endif +#else + zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); +#endif + + + } /* else ... */ + + } /* for jj ... end tri-solves */ + + /* Block row updates; push all the way into dense[*] block */ + for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { + + r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); + block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); + luptr = xlusup[fsupc] + nsupc + r_ind; + isub1 = lptr + nsupc + r_ind; + + repfnz_col = repfnz; + TriTmp = tempv; + dense_col = dense; + + /* Sequence through each column in panel -- matrix-vector */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + if ( segsze <= 3 ) continue; /* skip unrolled cases */ + + /* Perform a block update, and scatter the result of + matrix-vector to dense[]. */ + no_zeros = kfnz - fsupc; + luptr1 = luptr + nsupr * no_zeros; + MatvecTmp = &TriTmp[maxsuper]; + +#ifdef USE_VENDOR_BLAS + alpha = one; + beta = zero; +#ifdef _CRAY + CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], + &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); +#else + zgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], + &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); +#endif +#else + zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], + TriTmp, MatvecTmp); +#endif + + /* Scatter MatvecTmp[*] into SPA dense[*] temporarily + * such that MatvecTmp[*] can be re-used for the + * the next blok row update. dense[] will be copied into + * global store after the whole panel has been finished. + */ + isub = isub1; + for (i = 0; i < block_nrow; i++) { + irow = lsub[isub]; + z_sub(&dense_col[irow], &dense_col[irow], + &MatvecTmp[i]); + MatvecTmp[i] = zero; + ++isub; + } + + } /* for jj ... */ + + } /* for each block row ... */ + + /* Scatter the triangular solves into SPA dense[*] */ + repfnz_col = repfnz; + TriTmp = tempv; + dense_col = dense; + + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + if ( segsze <= 3 ) continue; /* skip unrolled cases */ + + no_zeros = kfnz - fsupc; + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense_col[irow] = TriTmp[i]; + TriTmp[i] = zero; + ++isub; + } + + } /* for jj ... */ + + } else { /* 1-D block modification */ + + + /* Sequence through each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + luptr = xlusup[fsupc]; + + ops[TRSV] += 4 * segsze * (segsze - 1); + ops[GEMV] += 8 * nrow * segsze; + + /* Case 1: Update U-segment of size 1 -- col-col update */ + if ( segsze == 1 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { + irow = lsub[i]; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + ++luptr; + } + + } else if ( segsze <= 3 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + ukj1 = dense_col[lsub[krep_ind - 1]]; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + z_sub(&ukj, &ukj, &comp_temp); + dense_col[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + ++luptr; ++luptr1; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + } + } else { + ukj2 = dense_col[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); + z_sub(&ukj1, &ukj1, &comp_temp); + + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&ukj, &ukj, &comp_temp); + dense_col[lsub[krep_ind]] = ukj; + dense_col[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + ++luptr; ++luptr1; ++luptr2; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + } + } + + } else { /* segsze >= 4 */ + /* + * Perform a triangular solve and block update, + * then scatter the result of sup-col update to dense[]. + */ + no_zeros = kfnz - fsupc; + + /* Copy U[*,j] segment from dense[*] to tempv[*]: + * The result of triangular solve is in tempv[*]; + * The result of matrix vector update is in dense_col[*] + */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; ++i) { + irow = lsub[isub]; + tempv[i] = dense_col[irow]; /* Gather */ + ++isub; + } + + /* start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#else + ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#endif + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + alpha = one; + beta = zero; +#ifdef _CRAY + CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#else + zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#endif +#else + zlsolve ( nsupr, segsze, &lusup[luptr], tempv ); + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + zmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); +#endif + + /* Scatter tempv[*] into SPA dense[*] temporarily, such + * that tempv[*] can be used for the triangular solve of + * the next column of the panel. They will be copied into + * ucol[*] after the whole panel has been finished. + */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense_col[irow] = tempv[i]; + tempv[i] = zero; + isub++; + } + + /* Scatter the update from tempv1[*] into SPA dense[*] */ + /* Start dense rectangular L */ + for (i = 0; i < nrow; i++) { + irow = lsub[isub]; + z_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]); + tempv1[i] = zero; + ++isub; + } + + } /* else segsze>=4 ... */ + + } /* for each column in the panel... */ + + } /* else 1-D update ... */ + + } /* for each updating supernode ... */ + +} + + + diff --git a/src/maths/SuperLU/zpanel_dfs.c b/src/maths/SuperLU/zpanel_dfs.c new file mode 100644 index 000000000..e160a2f2a --- /dev/null +++ b/src/maths/SuperLU/zpanel_dfs.c @@ -0,0 +1,254 @@ + +/*! @file zpanel_dfs.c + * \brief Peforms a symbolic factorization on a panel of symbols + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * Purpose + * ======= + * + * Performs a symbolic factorization on a panel of columns [jcol, jcol+w). + * + * A supernode representative is the last column of a supernode. + * The nonzeros in U[*,j] are segments that end at supernodal + * representatives. + * + * The routine returns one list of the supernodal representatives + * in topological order of the dfs that generates them. This list is + * a superset of the topological order of each individual column within + * the panel. + * The location of the first nonzero in each supernodal segment + * (supernodal entry location) is also returned. Each column has a + * separate list for this purpose. + * + * Two marker arrays are used for dfs: + * marker[i] == jj, if i was visited during dfs of current column jj; + * marker1[i] >= jcol, if i was visited by earlier columns in this panel; + * + * marker: A-row --> A-row/col (0/1) + * repfnz: SuperA-col --> PA-row + * parent: SuperA-col --> SuperA-col + * xplore: SuperA-col --> index to L-structure + *+ */ + +void +zpanel_dfs ( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + SuperMatrix *A, /* in - original matrix */ + int *perm_r, /* in */ + int *nseg, /* out */ + doublecomplex *dense, /* out */ + int *panel_lsub, /* out */ + int *segrep, /* out */ + int *repfnz, /* out */ + int *xprune, /* out */ + int *marker, /* out */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + NCPformat *Astore; + doublecomplex *a; + int *asub; + int *xa_begin, *xa_end; + int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; + int k, krow, kmark, kperm; + int xdfs, maxdfs, kpar; + int jj; /* index through each column in the panel */ + int *marker1; /* marker1[jj] >= jcol if vertex jj was visited + by a previous column within this panel. */ + int *repfnz_col; /* start of each column in the panel */ + doublecomplex *dense_col; /* start of each column in the panel */ + int nextl_col; /* next available position in panel_lsub[*,jj] */ + int *xsup, *supno; + int *lsub, *xlsub; + + /* Initialize pointers */ + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + marker1 = marker + m; + repfnz_col = repfnz; + dense_col = dense; + *nseg = 0; + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + + /* For each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++) { + nextl_col = (jj - jcol) * m; + +#ifdef CHK_DFS + printf("\npanel col %d: ", jj); +#endif + + /* For each nonz in A[*,jj] do dfs */ + for (k = xa_begin[jj]; k < xa_end[jj]; k++) { + krow = asub[k]; + dense_col[krow] = a[k]; + kmark = marker[krow]; + if ( kmark == jj ) + continue; /* krow visited before, go to the next nonzero */ + + /* For each unmarked nbr krow of jj + * krow is in L: place it in structure of L[*,jj] + */ + marker[krow] = jj; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ + } + /* + * krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + else { + + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz_col[krep]; + +#ifdef CHK_DFS + printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); +#endif + if ( myfnz != EMPTY ) { /* Representative visited before */ + if ( myfnz > kperm ) repfnz_col[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz_col[krep] = kperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker[kchild]; + + if ( chmark != jj ) { /* Not reached yet */ + marker[kchild] = jj; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,j] */ + if ( chperm == EMPTY ) { + panel_lsub[nextl_col++] = kchild; + } + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + else { + + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz_col[chrep]; +#ifdef CHK_DFS + printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); +#endif + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz_col[chrep] = chperm; + } + else { + /* Cont. dfs at snode-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L) */ + parent[krep] = oldrep; + repfnz_col[krep] = chperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } /* else */ + + } /* else */ + + } /* if... */ + + } /* while xdfs < maxdfs */ + + /* krow has no more unexplored nbrs: + * Place snode-rep krep in postorder DFS, if this + * segment is seen for the first time. (Note that + * "repfnz[krep]" may change later.) + * Backtrack dfs to its parent. + */ + if ( marker1[krep] < jcol ) { + segrep[*nseg] = krep; + ++(*nseg); + marker1[krep] = jj; + } + + kpar = parent[krep]; /* Pop stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xprune[krep]; + +#ifdef CHK_DFS + printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } while ( kpar != EMPTY ); /* do-while - until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonz in A[*,jj] */ + + repfnz_col += m; /* Move to next column */ + dense_col += m; + + } /* for jj ... */ + +} diff --git a/src/maths/SuperLU/zpivotL.c b/src/maths/SuperLU/zpivotL.c new file mode 100644 index 000000000..59af91c72 --- /dev/null +++ b/src/maths/SuperLU/zpivotL.c @@ -0,0 +1,185 @@ + +/*! @file zpivotL.c + * \brief Performs numerical pivoting + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * Purpose + * ======= + * Performs the numerical pivoting on the current column of L, + * and the CDIV operation. + * + * Pivot policy: + * (1) Compute thresh = u * max_(i>=j) abs(A_ij); + * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN + * pivot row = k; + * ELSE IF abs(A_jj) >= thresh THEN + * pivot row = j; + * ELSE + * pivot row = m; + * + * Note: If you absolutely want to use a given pivot order, then set u=0.0. + * + * Return value: 0 success; + * i > 0 U(i,i) is exactly zero. + *+ */ + +int +zpivotL( + const int jcol, /* in */ + const double u, /* in - diagonal pivoting threshold */ + int *usepr, /* re-use the pivot sequence given by perm_r/iperm_r */ + int *perm_r, /* may be modified */ + int *iperm_r, /* in - inverse of perm_r */ + int *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */ + int *pivrow, /* out */ + GlobalLU_t *Glu, /* modified - global LU data structures */ + SuperLUStat_t *stat /* output */ + ) +{ + + doublecomplex one = {1.0, 0.0}; + int fsupc; /* first column in the supernode */ + int nsupc; /* no of columns in the supernode */ + int nsupr; /* no of rows in the supernode */ + int lptr; /* points to the starting subscript of the supernode */ + int pivptr, old_pivptr, diag, diagind; + double pivmax, rtemp, thresh; + doublecomplex temp; + doublecomplex *lu_sup_ptr; + doublecomplex *lu_col_ptr; + int *lsub_ptr; + int isub, icol, k, itemp; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + flops_t *ops = stat->ops; + + /* Initialize pointers */ + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; + nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ + lptr = xlsub[fsupc]; + nsupr = xlsub[fsupc+1] - lptr; + lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ + lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ + lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ + +#ifdef DEBUG +if ( jcol == MIN_COL ) { + printf("Before cdiv: col %d\n", jcol); + for (k = nsupc; k < nsupr; k++) + printf(" lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]); +} +#endif + + /* Determine the largest abs numerical value for partial pivoting; + Also search for user-specified pivot, and diagonal element. */ + if ( *usepr ) *pivrow = iperm_r[jcol]; + diagind = iperm_c[jcol]; + pivmax = 0.0; + pivptr = nsupc; + diag = EMPTY; + old_pivptr = nsupc; + for (isub = nsupc; isub < nsupr; ++isub) { + rtemp = z_abs1 (&lu_col_ptr[isub]); + if ( rtemp > pivmax ) { + pivmax = rtemp; + pivptr = isub; + } + if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub; + if ( lsub_ptr[isub] == diagind ) diag = isub; + } + + /* Test for singularity */ + if ( pivmax == 0.0 ) { +#if 1 + *pivrow = lsub_ptr[pivptr]; + perm_r[*pivrow] = jcol; +#else + perm_r[diagind] = jcol; +#endif + *usepr = 0; + return (jcol+1); + } + + thresh = u * pivmax; + + /* Choose appropriate pivotal element by our policy. */ + if ( *usepr ) { + rtemp = z_abs1 (&lu_col_ptr[old_pivptr]); + if ( rtemp != 0.0 && rtemp >= thresh ) + pivptr = old_pivptr; + else + *usepr = 0; + } + if ( *usepr == 0 ) { + /* Use diagonal pivot? */ + if ( diag >= 0 ) { /* diagonal exists */ + rtemp = z_abs1 (&lu_col_ptr[diag]); + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; + } + *pivrow = lsub_ptr[pivptr]; + } + + /* Record pivot row */ + perm_r[*pivrow] = jcol; + + /* Interchange row subscripts */ + if ( pivptr != nsupc ) { + itemp = lsub_ptr[pivptr]; + lsub_ptr[pivptr] = lsub_ptr[nsupc]; + lsub_ptr[nsupc] = itemp; + + /* Interchange numerical values as well, for the whole snode, such + * that L is indexed the same way as A. + */ + for (icol = 0; icol <= nsupc; icol++) { + itemp = pivptr + icol * nsupr; + temp = lu_sup_ptr[itemp]; + lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; + lu_sup_ptr[nsupc + icol*nsupr] = temp; + } + } /* if */ + + /* cdiv operation */ + ops[FACT] += 10 * (nsupr - nsupc); + + z_div(&temp, &one, &lu_col_ptr[nsupc]); + for (k = nsupc+1; k < nsupr; k++) + zz_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp); + + return 0; +} + diff --git a/src/maths/SuperLU/zpivotgrowth.c b/src/maths/SuperLU/zpivotgrowth.c new file mode 100644 index 000000000..ecf9993b7 --- /dev/null +++ b/src/maths/SuperLU/zpivotgrowth.c @@ -0,0 +1,114 @@ + +/*! @file zpivotgrowth.c + * \brief Computes the reciprocal pivot growth factor + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + *+ */ +#include
+ * Purpose + * ======= + * + * Compute the reciprocal pivot growth factor of the leading ncols columns + * of the matrix, using the formula: + * min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ) + * + * Arguments + * ========= + * + * ncols (input) int + * The number of columns of matrices A, L and U. + * + * A (input) SuperMatrix* + * Original matrix A, permuted by columns, of dimension + * (A->nrow, A->ncol). The type of A can be: + * Stype = NC; Dtype = SLU_Z; Mtype = GE. + * + * L (output) SuperMatrix* + * The factor L from the factorization Pr*A=L*U; use compressed row + * subscripts storage for supernodes, i.e., L has type: + * Stype = SC; Dtype = SLU_Z; Mtype = TRLU. + * + * U (output) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise + * storage scheme, i.e., U has types: Stype = NC; + * Dtype = SLU_Z; Mtype = TRU. + *+ */ + +double +zPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, + SuperMatrix *L, SuperMatrix *U) +{ + + NCformat *Astore; + SCformat *Lstore; + NCformat *Ustore; + doublecomplex *Aval, *Lval, *Uval; + int fsupc, nsupr, luptr, nz_in_U; + int i, j, k, oldcol; + int *inv_perm_c; + double rpg, maxaj, maxuj; + double smlnum; + doublecomplex *luval; + doublecomplex temp_comp; + + /* Get machine constants. */ + smlnum = dlamch_("S"); + rpg = 1. / smlnum; + + Astore = A->Store; + Lstore = L->Store; + Ustore = U->Store; + Aval = Astore->nzval; + Lval = Lstore->nzval; + Uval = Ustore->nzval; + + inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int)); + for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j; + + for (k = 0; k <= Lstore->nsuper; ++k) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + luptr = L_NZ_START(fsupc); + luval = &Lval[luptr]; + nz_in_U = 1; + + for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) { + maxaj = 0.; + oldcol = inv_perm_c[j]; + for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i) + maxaj = SUPERLU_MAX( maxaj, z_abs1( &Aval[i]) ); + + maxuj = 0.; + for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++) + maxuj = SUPERLU_MAX( maxuj, z_abs1( &Uval[i]) ); + + /* Supernode */ + for (i = 0; i < nz_in_U; ++i) + maxuj = SUPERLU_MAX( maxuj, z_abs1( &luval[i]) ); + + ++nz_in_U; + luval += nsupr; + + if ( maxuj == 0. ) + rpg = SUPERLU_MIN( rpg, 1.); + else + rpg = SUPERLU_MIN( rpg, maxaj / maxuj ); + } + + if ( j >= ncols ) break; + } + + SUPERLU_FREE(inv_perm_c); + return (rpg); +} diff --git a/src/maths/SuperLU/zpruneL.c b/src/maths/SuperLU/zpruneL.c new file mode 100644 index 000000000..e16099b89 --- /dev/null +++ b/src/maths/SuperLU/zpruneL.c @@ -0,0 +1,154 @@ + +/*! @file zpruneL.c + * \brief Prunes the L-structure + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * Purpose + * ======= + * Prunes the L-structure of supernodes whose L-structure + * contains the current pivot row "pivrow" + *+ */ + +void +zpruneL( + const int jcol, /* in */ + const int *perm_r, /* in */ + const int pivrow, /* in */ + const int nseg, /* in */ + const int *segrep, /* in */ + const int *repfnz, /* in */ + int *xprune, /* out */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + + doublecomplex utemp; + int jsupno, irep, irep1, kmin, kmax, krow, movnum; + int i, ktemp, minloc, maxloc; + int do_prune; /* logical variable */ + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + /* + * For each supernode-rep irep in U[*,j] + */ + jsupno = supno[jcol]; + for (i = 0; i < nseg; i++) { + + irep = segrep[i]; + irep1 = irep + 1; + do_prune = FALSE; + + /* Don't prune with a zero U-segment */ + if ( repfnz[irep] == EMPTY ) + continue; + + /* If a snode overlaps with the next panel, then the U-segment + * is fragmented into two parts -- irep and irep1. We should let + * pruning occur at the rep-column in irep1's snode. + */ + if ( supno[irep] == supno[irep1] ) /* Don't prune */ + continue; + + /* + * If it has not been pruned & it has a nonz in row L[pivrow,i] + */ + if ( supno[irep] != jsupno ) { + if ( xprune[irep] >= xlsub[irep1] ) { + kmin = xlsub[irep]; + kmax = xlsub[irep1] - 1; + for (krow = kmin; krow <= kmax; krow++) + if ( lsub[krow] == pivrow ) { + do_prune = TRUE; + break; + } + } + + if ( do_prune ) { + + /* Do a quicksort-type partition + * movnum=TRUE means that the num values have to be exchanged. + */ + movnum = FALSE; + if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */ + movnum = TRUE; + + while ( kmin <= kmax ) { + + if ( perm_r[lsub[kmax]] == EMPTY ) + kmax--; + else if ( perm_r[lsub[kmin]] != EMPTY ) + kmin++; + else { /* kmin below pivrow (not yet pivoted), and kmax + * above pivrow: interchange the two subscripts + */ + ktemp = lsub[kmin]; + lsub[kmin] = lsub[kmax]; + lsub[kmax] = ktemp; + + /* If the supernode has only one column, then we + * only keep one set of subscripts. For any subscript + * interchange performed, similar interchange must be + * done on the numerical values. + */ + if ( movnum ) { + minloc = xlusup[irep] + (kmin - xlsub[irep]); + maxloc = xlusup[irep] + (kmax - xlsub[irep]); + utemp = lusup[minloc]; + lusup[minloc] = lusup[maxloc]; + lusup[maxloc] = utemp; + } + + kmin++; + kmax--; + + } + + } /* while */ + + xprune[irep] = kmin; /* Pruning */ + +#ifdef CHK_PRUNE + printf(" After zpruneL(),using col %d: xprune[%d] = %d\n", + jcol, irep, kmin); +#endif + } /* if do_prune */ + + } /* if */ + + } /* for each U-segment... */ +} diff --git a/src/maths/SuperLU/zreadhb.c b/src/maths/SuperLU/zreadhb.c new file mode 100644 index 000000000..90debee41 --- /dev/null +++ b/src/maths/SuperLU/zreadhb.c @@ -0,0 +1,267 @@ + +/*! @file zreadhb.c + * \brief Read a matrix stored in Harwell-Boeing format + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Purpose + * ======= + * + * Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format + * as described below. + * + * Line 1 (A72,A8) + * Col. 1 - 72 Title (TITLE) + * Col. 73 - 80 Key (KEY) + * + * Line 2 (5I14) + * Col. 1 - 14 Total number of lines excluding header (TOTCRD) + * Col. 15 - 28 Number of lines for pointers (PTRCRD) + * Col. 29 - 42 Number of lines for row (or variable) indices (INDCRD) + * Col. 43 - 56 Number of lines for numerical values (VALCRD) + * Col. 57 - 70 Number of lines for right-hand sides (RHSCRD) + * (including starting guesses and solution vectors + * if present) + * (zero indicates no right-hand side data is present) + * + * Line 3 (A3, 11X, 4I14) + * Col. 1 - 3 Matrix type (see below) (MXTYPE) + * Col. 15 - 28 Number of rows (or variables) (NROW) + * Col. 29 - 42 Number of columns (or elements) (NCOL) + * Col. 43 - 56 Number of row (or variable) indices (NNZERO) + * (equal to number of entries for assembled matrices) + * Col. 57 - 70 Number of elemental matrix entries (NELTVL) + * (zero in the case of assembled matrices) + * Line 4 (2A16, 2A20) + * Col. 1 - 16 Format for pointers (PTRFMT) + * Col. 17 - 32 Format for row (or variable) indices (INDFMT) + * Col. 33 - 52 Format for numerical values of coefficient matrix (VALFMT) + * Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) + * + * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present + * Col. 1 Right-hand side type: + * F for full storage or M for same format as matrix + * Col. 2 G if a starting vector(s) (Guess) is supplied. (RHSTYP) + * Col. 3 X if an exact solution vector(s) is supplied. + * Col. 15 - 28 Number of right-hand sides (NRHS) + * Col. 29 - 42 Number of row indices (NRHSIX) + * (ignored in case of unassembled matrices) + * + * The three character type field on line 3 describes the matrix type. + * The following table lists the permitted values for each of the three + * characters. As an example of the type field, RSA denotes that the matrix + * is real, symmetric, and assembled. + * + * First Character: + * R Real matrix + * C Complex matrix + * P Pattern only (no numerical values supplied) + * + * Second Character: + * S Symmetric + * U Unsymmetric + * H Hermitian + * Z Skew symmetric + * R Rectangular + * + * Third Character: + * A Assembled + * E Elemental matrices (unassembled) + * + *+ */ +#include
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include
+ * Purpose + * ======= + * zsnode_dfs() - Determine the union of the row structures of those + * columns within the relaxed snode. + * Note: The relaxed snodes are leaves of the supernodal etree, therefore, + * the portion outside the rectangular supernode must be zero. + * + * Return value + * ============ + * 0 success; + * >0 number of bytes allocated when run out of memory. + *+ */ + +int +zsnode_dfs ( + const int jcol, /* in - start of the supernode */ + const int kcol, /* in - end of the supernode */ + const int *asub, /* in */ + const int *xa_begin, /* in */ + const int *xa_end, /* in */ + int *xprune, /* out */ + int *marker, /* modified */ + GlobalLU_t *Glu /* modified */ + ) +{ + + register int i, k, ifrom, ito, nextl, new_next; + int nsuper, krow, kmark, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + int nzlmax; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + nsuper = ++supno[jcol]; /* Next available supernode number */ + nextl = xlsub[jcol]; + + for (i = jcol; i <= kcol; i++) { + /* For each nonzero in A[*,i] */ + for (k = xa_begin[i]; k < xa_end[i]; k++) { + krow = asub[k]; + kmark = marker[krow]; + if ( kmark != kcol ) { /* First time visit krow */ + marker[krow] = kcol; + lsub[nextl++] = krow; + if ( nextl >= nzlmax ) { + if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + } + } + supno[i] = nsuper; + } + + /* Supernode > 1, then make a copy of the subscripts for pruning */ + if ( jcol < kcol ) { + new_next = nextl + (nextl - xlsub[jcol]); + while ( new_next > nzlmax ) { + if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + ito = nextl; + for (ifrom = xlsub[jcol]; ifrom < nextl; ) + lsub[ito++] = lsub[ifrom++]; + for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; + nextl = ito; + } + + xsup[nsuper+1] = kcol + 1; + supno[kcol+1] = nsuper; + xprune[kcol] = nextl; + xlsub[kcol+1] = nextl; + + return 0; +} + diff --git a/src/maths/SuperLU/zsp_blas2.c b/src/maths/SuperLU/zsp_blas2.c new file mode 100644 index 000000000..c094f2d69 --- /dev/null +++ b/src/maths/SuperLU/zsp_blas2.c @@ -0,0 +1,573 @@ + +/*! @file zsp_blas2.c + * \brief Sparse BLAS 2, using some dense BLAS 2 operations + * + *
+ * -- SuperLU routine (version 3.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * October 15, 2003 + *+ */ +/* + * File name: zsp_blas2.c + * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations. + */ + +#include
+ * Purpose + * ======= + * + * sp_ztrsv() solves one of the systems of equations + * A*x = b, or A'*x = b, + * where b and x are n element vectors and A is a sparse unit , or + * non-unit, upper or lower triangular matrix. + * No test for singularity or near-singularity is included in this + * routine. Such tests must be performed before calling this routine. + * + * Parameters + * ========== + * + * uplo - (input) char* + * On entry, uplo specifies whether the matrix is an upper or + * lower triangular matrix as follows: + * uplo = 'U' or 'u' A is an upper triangular matrix. + * uplo = 'L' or 'l' A is a lower triangular matrix. + * + * trans - (input) char* + * On entry, trans specifies the equations to be solved as + * follows: + * trans = 'N' or 'n' A*x = b. + * trans = 'T' or 't' A'*x = b. + * trans = 'C' or 'c' A^H*x = b. + * + * diag - (input) char* + * On entry, diag specifies whether or not A is unit + * triangular as follows: + * diag = 'U' or 'u' A is assumed to be unit triangular. + * diag = 'N' or 'n' A is not assumed to be unit + * triangular. + * + * L - (input) SuperMatrix* + * The factor L from the factorization Pr*A*Pc=L*U. Use + * compressed row subscripts storage for supernodes, + * i.e., L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU. + * + * U - (input) SuperMatrix* + * The factor U from the factorization Pr*A*Pc=L*U. + * U has types: Stype = NC, Dtype = SLU_Z, Mtype = TRU. + * + * x - (input/output) doublecomplex* + * Before entry, the incremented array X must contain the n + * element right-hand side vector b. On exit, X is overwritten + * with the solution vector x. + * + * info - (output) int* + * If *info = -i, the i-th argument had an illegal value. + *+ */ +int +sp_ztrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, + SuperMatrix *U, doublecomplex *x, SuperLUStat_t *stat, int *info) +{ +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + SCformat *Lstore; + NCformat *Ustore; + doublecomplex *Lval, *Uval; + int incx = 1, incy = 1; + doublecomplex temp; + doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; + doublecomplex comp_zero = {0.0, 0.0}; + int nrow; + int fsupc, nsupr, nsupc, luptr, istart, irow; + int i, k, iptr, jcol; + doublecomplex *work; + flops_t solve_ops; + + /* Test the input parameters */ + *info = 0; + if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; + else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && + !lsame_(trans, "C")) *info = -2; + else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; + else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; + else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; + if ( *info ) { + i = -(*info); + xerbla_("sp_ztrsv", &i); + return 0; + } + + Lstore = L->Store; + Lval = Lstore->nzval; + Ustore = U->Store; + Uval = Ustore->nzval; + solve_ops = 0; + + if ( !(work = doublecomplexCalloc(L->nrow)) ) + ABORT_SuperLU("Malloc fails for work in sp_ztrsv()."); + + if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ + + if ( lsame_(uplo, "L") ) { + /* Form x := inv(L)*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + nrow = nsupr - nsupc; + + /* 1 z_div costs 10 flops */ + solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc; + solve_ops += 8 * nrow * nsupc; + + if ( nsupc == 1 ) { + for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { + irow = L_SUB(iptr); + ++luptr; + zz_mult(&comp_zero, &x[fsupc], &Lval[luptr]); + z_sub(&x[irow], &x[irow], &comp_zero); + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); + + CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], + &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); +#else + ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); + + zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], + &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); +#endif +#else + zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); + + zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], + &x[fsupc], &work[0] ); +#endif + + iptr = istart + nsupc; + for (i = 0; i < nrow; ++i, ++iptr) { + irow = L_SUB(iptr); + z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */ + work[i] = comp_zero; + + } + } + } /* for k ... */ + + } else { + /* Form x := inv(U)*x */ + + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; k--) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + /* 1 z_div costs 10 flops */ + solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; + + if ( nsupc == 1 ) { + z_div(&x[fsupc], &x[fsupc], &Lval[luptr]); + for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { + irow = U_SUB(i); + zz_mult(&comp_zero, &x[fsupc], &Uval[i]); + z_sub(&x[irow], &x[irow], &comp_zero); + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif +#else + zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); +#endif + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); + i++) { + irow = U_SUB(i); + zz_mult(&comp_zero, &x[jcol], &Uval[i]); + z_sub(&x[irow], &x[irow], &comp_zero); + } + } + } + } /* for k ... */ + + } + } else if ( lsame_(trans, "T") ) { /* Form x := inv(A')*x */ + + if ( lsame_(uplo, "L") ) { + /* Form x := inv(L')*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; --k) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += 8 * (nsupr - nsupc) * nsupc; + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + iptr = istart + nsupc; + for (i = L_NZ_START(jcol) + nsupc; + i < L_NZ_START(jcol+1); i++) { + irow = L_SUB(iptr); + zz_mult(&comp_zero, &x[irow], &Lval[i]); + z_sub(&x[jcol], &x[jcol], &comp_zero); + iptr++; + } + } + + if ( nsupc > 1 ) { + solve_ops += 4 * nsupc * (nsupc - 1); +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("T", strlen("T")); + ftcs3 = _cptofcd("U", strlen("U")); + CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } + } else { + /* Form x := inv(U')*x */ + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { + irow = U_SUB(i); + zz_mult(&comp_zero, &x[irow], &Uval[i]); + z_sub(&x[jcol], &x[jcol], &comp_zero); + } + } + + /* 1 z_div costs 10 flops */ + solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; + + if ( nsupc == 1 ) { + z_div(&x[fsupc], &x[fsupc], &Lval[luptr]); + } else { +#ifdef _CRAY + ftcs1 = _cptofcd("U", strlen("U")); + ftcs2 = _cptofcd("T", strlen("T")); + ftcs3 = _cptofcd("N", strlen("N")); + CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } /* for k ... */ + } + } else { /* Form x := conj(inv(A'))*x */ + + if ( lsame_(uplo, "L") ) { + /* Form x := conj(inv(L'))*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; --k) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += 8 * (nsupr - nsupc) * nsupc; + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + iptr = istart + nsupc; + for (i = L_NZ_START(jcol) + nsupc; + i < L_NZ_START(jcol+1); i++) { + irow = L_SUB(iptr); + zz_conj(&temp, &Lval[i]); + zz_mult(&comp_zero, &x[irow], &temp); + z_sub(&x[jcol], &x[jcol], &comp_zero); + iptr++; + } + } + + if ( nsupc > 1 ) { + solve_ops += 4 * nsupc * (nsupc - 1); +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd(trans, strlen("T")); + ftcs3 = _cptofcd("U", strlen("U")); + ZTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + ztrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } + } else { + /* Form x := conj(inv(U'))*x */ + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { + irow = U_SUB(i); + zz_conj(&temp, &Uval[i]); + zz_mult(&comp_zero, &x[irow], &temp); + z_sub(&x[jcol], &x[jcol], &comp_zero); + } + } + + /* 1 z_div costs 10 flops */ + solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; + + if ( nsupc == 1 ) { + zz_conj(&temp, &Lval[luptr]); + z_div(&x[fsupc], &x[fsupc], &temp); + } else { +#ifdef _CRAY + ftcs1 = _cptofcd("U", strlen("U")); + ftcs2 = _cptofcd(trans, strlen("T")); + ftcs3 = _cptofcd("N", strlen("N")); + ZTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + ztrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } /* for k ... */ + } + } + + stat->ops[SOLVE] += solve_ops; + SUPERLU_FREE(work); + return 0; +} + + + +/*! \brief Performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y + * + *
+ * Purpose + * ======= + * + * sp_zgemv() performs one of the matrix-vector operations + * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, + * where alpha and beta are scalars, x and y are vectors and A is a + * sparse A->nrow by A->ncol matrix. + * + * Parameters + * ========== + * + * TRANS - (input) char* + * On entry, TRANS specifies the operation to be performed as + * follows: + * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. + * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. + * TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. + * + * ALPHA - (input) doublecomplex + * On entry, ALPHA specifies the scalar alpha. + * + * A - (input) SuperMatrix* + * Before entry, the leading m by n part of the array A must + * contain the matrix of coefficients. + * + * X - (input) doublecomplex*, array of DIMENSION at least + * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' + * and at least + * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. + * Before entry, the incremented array X must contain the + * vector x. + * + * INCX - (input) int + * On entry, INCX specifies the increment for the elements of + * X. INCX must not be zero. + * + * BETA - (input) doublecomplex + * On entry, BETA specifies the scalar beta. When BETA is + * supplied as zero then Y need not be set on input. + * + * Y - (output) doublecomplex*, array of DIMENSION at least + * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' + * and at least + * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. + * Before entry with BETA non-zero, the incremented array Y + * must contain the vector y. On exit, Y is overwritten by the + * updated vector y. + * + * INCY - (input) int + * On entry, INCY specifies the increment for the elements of + * Y. INCY must not be zero. + * + * ==== Sparse Level 2 Blas routine. + *+*/ +int +sp_zgemv(char *trans, doublecomplex alpha, SuperMatrix *A, doublecomplex *x, + int incx, doublecomplex beta, doublecomplex *y, int incy) +{ + + /* Local variables */ + NCformat *Astore; + doublecomplex *Aval; + int info; + doublecomplex temp, temp1; + int lenx, leny, i, j, irow; + int iy, jx, jy, kx, ky; + int notran; + doublecomplex comp_zero = {0.0, 0.0}; + doublecomplex comp_one = {1.0, 0.0}; + + notran = lsame_(trans, "N"); + Astore = A->Store; + Aval = Astore->nzval; + + /* Test the input parameters */ + info = 0; + if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; + else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; + else if (incx == 0) info = 5; + else if (incy == 0) info = 8; + if (info != 0) { + xerbla_("sp_zgemv ", &info); + return 0; + } + + /* Quick return if possible. */ + if (A->nrow == 0 || A->ncol == 0 || + z_eq(&alpha, &comp_zero) && + z_eq(&beta, &comp_one)) + return 0; + + + /* Set LENX and LENY, the lengths of the vectors x and y, and set + up the start points in X and Y. */ + if (lsame_(trans, "N")) { + lenx = A->ncol; + leny = A->nrow; + } else { + lenx = A->nrow; + leny = A->ncol; + } + if (incx > 0) kx = 0; + else kx = - (lenx - 1) * incx; + if (incy > 0) ky = 0; + else ky = - (leny - 1) * incy; + + /* Start the operations. In this version the elements of A are + accessed sequentially with one pass through A. */ + /* First form y := beta*y. */ + if ( !z_eq(&beta, &comp_one) ) { + if (incy == 1) { + if ( z_eq(&beta, &comp_zero) ) + for (i = 0; i < leny; ++i) y[i] = comp_zero; + else + for (i = 0; i < leny; ++i) + zz_mult(&y[i], &beta, &y[i]); + } else { + iy = ky; + if ( z_eq(&beta, &comp_zero) ) + for (i = 0; i < leny; ++i) { + y[iy] = comp_zero; + iy += incy; + } + else + for (i = 0; i < leny; ++i) { + zz_mult(&y[iy], &beta, &y[iy]); + iy += incy; + } + } + } + + if ( z_eq(&alpha, &comp_zero) ) return 0; + + if ( notran ) { + /* Form y := alpha*A*x + y. */ + jx = kx; + if (incy == 1) { + for (j = 0; j < A->ncol; ++j) { + if ( !z_eq(&x[jx], &comp_zero) ) { + zz_mult(&temp, &alpha, &x[jx]); + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + zz_mult(&temp1, &temp, &Aval[i]); + z_add(&y[irow], &y[irow], &temp1); + } + } + jx += incx; + } + } else { + ABORT_SuperLU("Not implemented."); + } + } else { + /* Form y := alpha*A'*x + y. */ + jy = ky; + if (incx == 1) { + for (j = 0; j < A->ncol; ++j) { + temp = comp_zero; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + zz_mult(&temp1, &Aval[i], &x[irow]); + z_add(&temp, &temp, &temp1); + } + zz_mult(&temp1, &alpha, &temp); + z_add(&y[jy], &y[jy], &temp1); + jy += incy; + } + } else { + ABORT_SuperLU("Not implemented."); + } + } + return 0; +} /* sp_zgemv */ + diff --git a/src/maths/SuperLU/zsp_blas3.c b/src/maths/SuperLU/zsp_blas3.c new file mode 100644 index 000000000..f208433bf --- /dev/null +++ b/src/maths/SuperLU/zsp_blas3.c @@ -0,0 +1,127 @@ + +/*! @file zsp_blas3.c + * \brief Sparse BLAS3, using some dense BLAS3 operations + * + *
+ * -- SuperLU routine (version 2.0) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November 15, 1997 + *+ */ +/* + * File name: sp_blas3.c + * Purpose: Sparse BLAS3, using some dense BLAS3 operations. + */ + +#include
+ * Purpose + * ======= + * + * sp_z performs one of the matrix-matrix operations + * + * C := alpha*op( A )*op( B ) + beta*C, + * + * where op( X ) is one of + * + * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), + * + * alpha and beta are scalars, and A, B and C are matrices, with op( A ) + * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. + * + * + * Parameters + * ========== + * + * TRANSA - (input) char* + * On entry, TRANSA specifies the form of op( A ) to be used in + * the matrix multiplication as follows: + * TRANSA = 'N' or 'n', op( A ) = A. + * TRANSA = 'T' or 't', op( A ) = A'. + * TRANSA = 'C' or 'c', op( A ) = conjg( A' ). + * Unchanged on exit. + * + * TRANSB - (input) char* + * On entry, TRANSB specifies the form of op( B ) to be used in + * the matrix multiplication as follows: + * TRANSB = 'N' or 'n', op( B ) = B. + * TRANSB = 'T' or 't', op( B ) = B'. + * TRANSB = 'C' or 'c', op( B ) = conjg( B' ). + * Unchanged on exit. + * + * M - (input) int + * On entry, M specifies the number of rows of the matrix + * op( A ) and of the matrix C. M must be at least zero. + * Unchanged on exit. + * + * N - (input) int + * On entry, N specifies the number of columns of the matrix + * op( B ) and the number of columns of the matrix C. N must be + * at least zero. + * Unchanged on exit. + * + * K - (input) int + * On entry, K specifies the number of columns of the matrix + * op( A ) and the number of rows of the matrix op( B ). K must + * be at least zero. + * Unchanged on exit. + * + * ALPHA - (input) doublecomplex + * On entry, ALPHA specifies the scalar alpha. + * + * A - (input) SuperMatrix* + * Matrix A with a sparse format, of dimension (A->nrow, A->ncol). + * Currently, the type of A can be: + * Stype = NC or NCP; Dtype = SLU_Z; Mtype = GE. + * In the future, more general A can be handled. + * + * B - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDB, kb ), where kb is + * n when TRANSB = 'N' or 'n', and is k otherwise. + * Before entry with TRANSB = 'N' or 'n', the leading k by n + * part of the array B must contain the matrix B, otherwise + * the leading n by k part of the array B must contain the + * matrix B. + * Unchanged on exit. + * + * LDB - (input) int + * On entry, LDB specifies the first dimension of B as declared + * in the calling (sub) program. LDB must be at least max( 1, n ). + * Unchanged on exit. + * + * BETA - (input) doublecomplex + * On entry, BETA specifies the scalar beta. When BETA is + * supplied as zero then C need not be set on input. + * + * C - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDC, n ). + * Before entry, the leading m by n part of the array C must + * contain the matrix C, except when beta is zero, in which + * case C need not be set on entry. + * On exit, the array C is overwritten by the m by n matrix + * ( alpha*op( A )*B + beta*C ). + * + * LDC - (input) int + * On entry, LDC specifies the first dimension of C as declared + * in the calling (sub)program. LDC must be at least max(1,m). + * Unchanged on exit. + * + * ==== Sparse Level 3 Blas routine. + *+ */ + +int +sp_zgemm(char *transa, char *transb, int m, int n, int k, + doublecomplex alpha, SuperMatrix *A, doublecomplex *b, int ldb, + doublecomplex beta, doublecomplex *c, int ldc) +{ + int incx = 1, incy = 1; + int j; + + for (j = 0; j < n; ++j) { + sp_zgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); + } + return 0; +} diff --git a/src/maths/SuperLU/zutil.c b/src/maths/SuperLU/zutil.c new file mode 100644 index 000000000..20478c743 --- /dev/null +++ b/src/maths/SuperLU/zutil.c @@ -0,0 +1,475 @@ + +/*! @file zutil.c + * \brief Matrix utility functions + * + *
+ * -- SuperLU routine (version 3.1) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * August 1, 2008 + * + * Copyright (c) 1994 by Xerox Corporation. All rights reserved. + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY + * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + *+ */ + + +#include