From 7c868663bd1558a9a6090a8cc2546226c0c38c5d Mon Sep 17 00:00:00 2001 From: Francesco Lannutti Date: Thu, 23 Feb 2012 17:39:09 +0100 Subject: [PATCH] Added SuperLU as new experimental linear solver, (#1 new files) --- src/include/ngspice/slu_Cnames.h | 458 +++ src/include/ngspice/slu_dcomplex.h | 78 + src/include/ngspice/slu_ddefs.h | 281 ++ src/include/ngspice/slu_util.h | 367 +++ src/include/ngspice/slu_zdefs.h | 284 ++ src/include/ngspice/superlu_enum_consts.h | 71 + src/include/ngspice/supermatrix.h | 179 ++ src/maths/SuperLU/Makefile.am | 146 + src/maths/SuperLU/colamd.c | 3414 +++++++++++++++++++++ src/maths/SuperLU/colamd.h | 249 ++ src/maths/SuperLU/dcolumn_bmod.c | 352 +++ src/maths/SuperLU/dcolumn_dfs.c | 275 ++ src/maths/SuperLU/dcomplex.c | 147 + src/maths/SuperLU/dcopy_to_ucol.c | 103 + src/maths/SuperLU/ddiagonal.c | 129 + src/maths/SuperLU/dgscon.c | 157 + src/maths/SuperLU/dgsequ.c | 195 ++ src/maths/SuperLU/dgsisx.c | 727 +++++ src/maths/SuperLU/dgsitrf.c | 639 ++++ src/maths/SuperLU/dgsrfs.c | 452 +++ src/maths/SuperLU/dgssv.c | 227 ++ src/maths/SuperLU/dgssvx.c | 622 ++++ src/maths/SuperLU/dgstrf.c | 436 +++ src/maths/SuperLU/dgstrs.c | 337 ++ src/maths/SuperLU/dlacon.c | 236 ++ src/maths/SuperLU/dlamch.c | 971 ++++++ src/maths/SuperLU/dlangs.c | 119 + src/maths/SuperLU/dlaqgs.c | 145 + src/maths/SuperLU/dldperm.c | 165 + src/maths/SuperLU/dmemory.c | 701 +++++ src/maths/SuperLU/dmyblas2.c | 230 ++ src/maths/SuperLU/dpanel_bmod.c | 459 +++ src/maths/SuperLU/dpanel_dfs.c | 254 ++ src/maths/SuperLU/dpivotL.c | 184 ++ src/maths/SuperLU/dpivotgrowth.c | 113 + src/maths/SuperLU/dpruneL.c | 154 + src/maths/SuperLU/dreadhb.c | 257 ++ src/maths/SuperLU/dreadrb.c | 237 ++ src/maths/SuperLU/dreadtriple.c | 140 + src/maths/SuperLU/dsnode_bmod.c | 118 + src/maths/SuperLU/dsnode_dfs.c | 112 + src/maths/SuperLU/dsp_blas2.c | 477 +++ src/maths/SuperLU/dsp_blas3.c | 127 + src/maths/SuperLU/dutil.c | 471 +++ src/maths/SuperLU/dzsum1.c | 94 + src/maths/SuperLU/get_perm_c.c | 457 +++ src/maths/SuperLU/heap_relax_snode.c | 124 + src/maths/SuperLU/ilu_dcolumn_dfs.c | 258 ++ src/maths/SuperLU/ilu_dcopy_to_ucol.c | 207 ++ src/maths/SuperLU/ilu_ddrop_row.c | 329 ++ src/maths/SuperLU/ilu_dpanel_dfs.c | 248 ++ src/maths/SuperLU/ilu_dpivotL.c | 266 ++ src/maths/SuperLU/ilu_dsnode_dfs.c | 90 + src/maths/SuperLU/ilu_heap_relax_snode.c | 120 + src/maths/SuperLU/ilu_relax_snode.c | 69 + src/maths/SuperLU/ilu_zcolumn_dfs.c | 258 ++ src/maths/SuperLU/ilu_zcopy_to_ucol.c | 211 ++ src/maths/SuperLU/ilu_zdrop_row.c | 339 ++ src/maths/SuperLU/ilu_zpanel_dfs.c | 248 ++ src/maths/SuperLU/ilu_zpivotL.c | 274 ++ src/maths/SuperLU/ilu_zsnode_dfs.c | 90 + src/maths/SuperLU/izmax1.c | 113 + src/maths/SuperLU/lsame.c | 83 + src/maths/SuperLU/mark_relax.c | 47 + src/maths/SuperLU/mc64ad.c | 2641 ++++++++++++++++ src/maths/SuperLU/memory.c | 210 ++ src/maths/SuperLU/mmd.c | 1012 ++++++ src/maths/SuperLU/qselect.c | 74 + src/maths/SuperLU/relax_snode.c | 75 + src/maths/SuperLU/sp_coletree.c | 419 +++ src/maths/SuperLU/sp_ienv.c | 79 + src/maths/SuperLU/sp_preorder.c | 208 ++ src/maths/SuperLU/superlu_timer.c | 72 + src/maths/SuperLU/superlusmp.c | 722 +++++ src/maths/SuperLU/util.c | 495 +++ src/maths/SuperLU/xerbla.c | 43 + src/maths/SuperLU/zcolumn_bmod.c | 367 +++ src/maths/SuperLU/zcolumn_dfs.c | 275 ++ src/maths/SuperLU/zcopy_to_ucol.c | 103 + src/maths/SuperLU/zdiagonal.c | 133 + src/maths/SuperLU/zgscon.c | 154 + src/maths/SuperLU/zgsequ.c | 195 ++ src/maths/SuperLU/zgsisx.c | 727 +++++ src/maths/SuperLU/zgsitrf.c | 637 ++++ src/maths/SuperLU/zgsrfs.c | 460 +++ src/maths/SuperLU/zgssv.c | 227 ++ src/maths/SuperLU/zgssvx.c | 622 ++++ src/maths/SuperLU/zgstrf.c | 436 +++ src/maths/SuperLU/zgstrs.c | 350 +++ src/maths/SuperLU/zlacon.c | 221 ++ src/maths/SuperLU/zlangs.c | 119 + src/maths/SuperLU/zlaqgs.c | 147 + src/maths/SuperLU/zldperm.c | 168 + src/maths/SuperLU/zmemory.c | 701 +++++ src/maths/SuperLU/zmyblas2.c | 188 ++ src/maths/SuperLU/zpanel_bmod.c | 487 +++ src/maths/SuperLU/zpanel_dfs.c | 254 ++ src/maths/SuperLU/zpivotL.c | 185 ++ src/maths/SuperLU/zpivotgrowth.c | 114 + src/maths/SuperLU/zpruneL.c | 154 + src/maths/SuperLU/zreadhb.c | 267 ++ src/maths/SuperLU/zreadrb.c | 246 ++ src/maths/SuperLU/zreadtriple.c | 140 + src/maths/SuperLU/zsnode_bmod.c | 120 + src/maths/SuperLU/zsnode_dfs.c | 112 + src/maths/SuperLU/zsp_blas2.c | 573 ++++ src/maths/SuperLU/zsp_blas3.c | 127 + src/maths/SuperLU/zutil.c | 475 +++ 108 files changed, 35454 insertions(+) create mode 100644 src/include/ngspice/slu_Cnames.h create mode 100644 src/include/ngspice/slu_dcomplex.h create mode 100644 src/include/ngspice/slu_ddefs.h create mode 100644 src/include/ngspice/slu_util.h create mode 100644 src/include/ngspice/slu_zdefs.h create mode 100644 src/include/ngspice/superlu_enum_consts.h create mode 100644 src/include/ngspice/supermatrix.h create mode 100644 src/maths/SuperLU/Makefile.am create mode 100644 src/maths/SuperLU/colamd.c create mode 100644 src/maths/SuperLU/colamd.h create mode 100644 src/maths/SuperLU/dcolumn_bmod.c create mode 100644 src/maths/SuperLU/dcolumn_dfs.c create mode 100644 src/maths/SuperLU/dcomplex.c create mode 100644 src/maths/SuperLU/dcopy_to_ucol.c create mode 100644 src/maths/SuperLU/ddiagonal.c create mode 100644 src/maths/SuperLU/dgscon.c create mode 100644 src/maths/SuperLU/dgsequ.c create mode 100644 src/maths/SuperLU/dgsisx.c create mode 100644 src/maths/SuperLU/dgsitrf.c create mode 100644 src/maths/SuperLU/dgsrfs.c create mode 100644 src/maths/SuperLU/dgssv.c create mode 100644 src/maths/SuperLU/dgssvx.c create mode 100644 src/maths/SuperLU/dgstrf.c create mode 100644 src/maths/SuperLU/dgstrs.c create mode 100644 src/maths/SuperLU/dlacon.c create mode 100644 src/maths/SuperLU/dlamch.c create mode 100644 src/maths/SuperLU/dlangs.c create mode 100644 src/maths/SuperLU/dlaqgs.c create mode 100644 src/maths/SuperLU/dldperm.c create mode 100644 src/maths/SuperLU/dmemory.c create mode 100644 src/maths/SuperLU/dmyblas2.c create mode 100644 src/maths/SuperLU/dpanel_bmod.c create mode 100644 src/maths/SuperLU/dpanel_dfs.c create mode 100644 src/maths/SuperLU/dpivotL.c create mode 100644 src/maths/SuperLU/dpivotgrowth.c create mode 100644 src/maths/SuperLU/dpruneL.c create mode 100644 src/maths/SuperLU/dreadhb.c create mode 100644 src/maths/SuperLU/dreadrb.c create mode 100644 src/maths/SuperLU/dreadtriple.c create mode 100644 src/maths/SuperLU/dsnode_bmod.c create mode 100644 src/maths/SuperLU/dsnode_dfs.c create mode 100644 src/maths/SuperLU/dsp_blas2.c create mode 100644 src/maths/SuperLU/dsp_blas3.c create mode 100644 src/maths/SuperLU/dutil.c create mode 100644 src/maths/SuperLU/dzsum1.c create mode 100644 src/maths/SuperLU/get_perm_c.c create mode 100644 src/maths/SuperLU/heap_relax_snode.c create mode 100644 src/maths/SuperLU/ilu_dcolumn_dfs.c create mode 100644 src/maths/SuperLU/ilu_dcopy_to_ucol.c create mode 100644 src/maths/SuperLU/ilu_ddrop_row.c create mode 100644 src/maths/SuperLU/ilu_dpanel_dfs.c create mode 100644 src/maths/SuperLU/ilu_dpivotL.c create mode 100644 src/maths/SuperLU/ilu_dsnode_dfs.c create mode 100644 src/maths/SuperLU/ilu_heap_relax_snode.c create mode 100644 src/maths/SuperLU/ilu_relax_snode.c create mode 100644 src/maths/SuperLU/ilu_zcolumn_dfs.c create mode 100644 src/maths/SuperLU/ilu_zcopy_to_ucol.c create mode 100644 src/maths/SuperLU/ilu_zdrop_row.c create mode 100644 src/maths/SuperLU/ilu_zpanel_dfs.c create mode 100644 src/maths/SuperLU/ilu_zpivotL.c create mode 100644 src/maths/SuperLU/ilu_zsnode_dfs.c create mode 100644 src/maths/SuperLU/izmax1.c create mode 100644 src/maths/SuperLU/lsame.c create mode 100644 src/maths/SuperLU/mark_relax.c create mode 100644 src/maths/SuperLU/mc64ad.c create mode 100644 src/maths/SuperLU/memory.c create mode 100644 src/maths/SuperLU/mmd.c create mode 100644 src/maths/SuperLU/qselect.c create mode 100644 src/maths/SuperLU/relax_snode.c create mode 100644 src/maths/SuperLU/sp_coletree.c create mode 100644 src/maths/SuperLU/sp_ienv.c create mode 100644 src/maths/SuperLU/sp_preorder.c create mode 100644 src/maths/SuperLU/superlu_timer.c create mode 100644 src/maths/SuperLU/superlusmp.c create mode 100644 src/maths/SuperLU/util.c create mode 100644 src/maths/SuperLU/xerbla.c create mode 100644 src/maths/SuperLU/zcolumn_bmod.c create mode 100644 src/maths/SuperLU/zcolumn_dfs.c create mode 100644 src/maths/SuperLU/zcopy_to_ucol.c create mode 100644 src/maths/SuperLU/zdiagonal.c create mode 100644 src/maths/SuperLU/zgscon.c create mode 100644 src/maths/SuperLU/zgsequ.c create mode 100644 src/maths/SuperLU/zgsisx.c create mode 100644 src/maths/SuperLU/zgsitrf.c create mode 100644 src/maths/SuperLU/zgsrfs.c create mode 100644 src/maths/SuperLU/zgssv.c create mode 100644 src/maths/SuperLU/zgssvx.c create mode 100644 src/maths/SuperLU/zgstrf.c create mode 100644 src/maths/SuperLU/zgstrs.c create mode 100644 src/maths/SuperLU/zlacon.c create mode 100644 src/maths/SuperLU/zlangs.c create mode 100644 src/maths/SuperLU/zlaqgs.c create mode 100644 src/maths/SuperLU/zldperm.c create mode 100644 src/maths/SuperLU/zmemory.c create mode 100644 src/maths/SuperLU/zmyblas2.c create mode 100644 src/maths/SuperLU/zpanel_bmod.c create mode 100644 src/maths/SuperLU/zpanel_dfs.c create mode 100644 src/maths/SuperLU/zpivotL.c create mode 100644 src/maths/SuperLU/zpivotgrowth.c create mode 100644 src/maths/SuperLU/zpruneL.c create mode 100644 src/maths/SuperLU/zreadhb.c create mode 100644 src/maths/SuperLU/zreadrb.c create mode 100644 src/maths/SuperLU/zreadtriple.c create mode 100644 src/maths/SuperLU/zsnode_bmod.c create mode 100644 src/maths/SuperLU/zsnode_dfs.c create mode 100644 src/maths/SuperLU/zsp_blas2.c create mode 100644 src/maths/SuperLU/zsp_blas3.c create mode 100644 src/maths/SuperLU/zutil.c diff --git a/src/include/ngspice/slu_Cnames.h b/src/include/ngspice/slu_Cnames.h new file mode 100644 index 000000000..7bcd1bca8 --- /dev/null +++ b/src/include/ngspice/slu_Cnames.h @@ -0,0 +1,458 @@ +/*! @file slu_Cnames.h + * \brief Macros defining how C routines will be called + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 1, 1997
+ *
+ * These macros define how C routines will be called.  ADD_ assumes that
+ * they will be called by fortran, which expects C routines to have an
+ * underscore postfixed to the name (Suns, and the Intel expect this).
+ * NOCHANGE indicates that fortran will be calling, and that it expects
+ * the name called by fortran to be identical to that compiled by the C
+ * (RS6K's do this).  UPCASE says it expects C routines called by fortran
+ * to be in all upcase (CRAY wants this). 
+ * 
+ */ +#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ +#define __SUPERLU_CNAMES + + +#define ADD_ 0 +#define ADD__ 1 +#define NOCHANGE 2 +#define UPCASE 3 +#define OLD_CRAY 4 +#define C_CALL 5 + +#ifdef UpCase +#define F77_CALL_C UPCASE +#endif + +#ifdef NoChange +#define F77_CALL_C NOCHANGE +#endif + +#ifdef Add_ +#define F77_CALL_C ADD_ +#endif + +#ifdef Add__ +#define F77_CALL_C ADD__ +#endif + +#ifdef _CRAY +#define F77_CALL_C OLD_CRAY +#endif + +/* Default */ +#ifndef F77_CALL_C +#define F77_CALL_C ADD_ +#endif + + +#if (F77_CALL_C == ADD_) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * No redefinition necessary to have following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void dgemm_(...) + * + * This is the default. + */ + +#endif + +#if (F77_CALL_C == ADD__) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * for following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void dgemm__(...) + */ +/* BLAS */ +#define sswap_ sswap__ +#define saxpy_ saxpy__ +#define sasum_ sasum__ +#define isamax_ isamax__ +#define scopy_ scopy__ +#define sscal_ sscal__ +#define sger_ sger__ +#define snrm2_ snrm2__ +#define ssymv_ ssymv__ +#define sdot_ sdot__ +#define saxpy_ saxpy__ +#define ssyr2_ ssyr2__ +#define srot_ srot__ +#define sgemv_ sgemv__ +#define strsv_ strsv__ +#define sgemm_ sgemm__ +#define strsm_ strsm__ + +#define dswap_ dswap__ +#define daxpy_ daxpy__ +#define dasum_ dasum__ +#define idamax_ idamax__ +#define dcopy_ dcopy__ +#define dscal_ dscal__ +#define dger_ dger__ +#define dnrm2_ dnrm2__ +#define dsymv_ dsymv__ +#define ddot_ ddot__ +#define dsyr2_ dsyr2__ +#define drot_ drot__ +#define dgemv_ dgemv__ +#define dtrsv_ dtrsv__ +#define dgemm_ dgemm__ +#define dtrsm_ dtrsm__ + +#define cswap_ cswap__ +#define caxpy_ caxpy__ +#define scasum_ scasum__ +#define icamax_ icamax__ +#define ccopy_ ccopy__ +#define cscal_ cscal__ +#define scnrm2_ scnrm2__ +#define caxpy_ caxpy__ +#define cgemv_ cgemv__ +#define ctrsv_ ctrsv__ +#define cgemm_ cgemm__ +#define ctrsm_ ctrsm__ +#define cgerc_ cgerc__ +#define chemv_ chemv__ +#define cher2_ cher2__ + +#define zswap_ zswap__ +#define zaxpy_ zaxpy__ +#define dzasum_ dzasum__ +#define izamax_ izamax__ +#define zcopy_ zcopy__ +#define zscal_ zscal__ +#define dznrm2_ dznrm2__ +#define zaxpy_ zaxpy__ +#define zgemv_ zgemv__ +#define ztrsv_ ztrsv__ +#define zgemm_ zgemm__ +#define ztrsm_ ztrsm__ +#define zgerc_ zgerc__ +#define zhemv_ zhemv__ +#define zher2_ zher2__ + +/* LAPACK */ +#define dlamch_ dlamch__ +#define slamch_ slamch__ +#define xerbla_ xerbla__ +#define lsame_ lsame__ +#define dlacon_ dlacon__ +#define slacon_ slacon__ +#define icmax1_ icmax1__ +#define scsum1_ scsum1__ +#define clacon_ clacon__ +#define dzsum1_ dzsum1__ +#define izmax1_ izmax1__ +#define zlacon_ zlacon__ + +/* Fortran interface */ +#define c_bridge_dgssv_ c_bridge_dgssv__ +#define c_fortran_sgssv_ c_fortran_sgssv__ +#define c_fortran_dgssv_ c_fortran_dgssv__ +#define c_fortran_cgssv_ c_fortran_cgssv__ +#define c_fortran_zgssv_ c_fortran_zgssv__ +#endif + +#if (F77_CALL_C == UPCASE) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void DGEMM(...) + */ +/* BLAS */ +#define sswap_ SSWAP +#define saxpy_ SAXPY +#define sasum_ SASUM +#define isamax_ ISAMAX +#define scopy_ SCOPY +#define sscal_ SSCAL +#define sger_ SGER +#define snrm2_ SNRM2 +#define ssymv_ SSYMV +#define sdot_ SDOT +#define saxpy_ SAXPY +#define ssyr2_ SSYR2 +#define srot_ SROT +#define sgemv_ SGEMV +#define strsv_ STRSV +#define sgemm_ SGEMM +#define strsm_ STRSM + +#define dswap_ DSWAP +#define daxpy_ DAXPY +#define dasum_ DASUM +#define idamax_ IDAMAX +#define dcopy_ DCOPY +#define dscal_ DSCAL +#define dger_ DGER +#define dnrm2_ DNRM2 +#define dsymv_ DSYMV +#define ddot_ DDOT +#define dsyr2_ DSYR2 +#define drot_ DROT +#define dgemv_ DGEMV +#define dtrsv_ DTRSV +#define dgemm_ DGEMM +#define dtrsm_ DTRSM + +#define cswap_ CSWAP +#define caxpy_ CAXPY +#define scasum_ SCASUM +#define icamax_ ICAMAX +#define ccopy_ CCOPY +#define cscal_ CSCAL +#define scnrm2_ SCNRM2 +#define cgemv_ CGEMV +#define ctrsv_ CTRSV +#define cgemm_ CGEMM +#define ctrsm_ CTRSM +#define cgerc_ CGERC +#define chemv_ CHEMV +#define cher2_ CHER2 + +#define zswap_ ZSWAP +#define zaxpy_ ZAXPY +#define dzasum_ DZASUM +#define izamax_ IZAMAX +#define zcopy_ ZCOPY +#define zscal_ ZSCAL +#define dznrm2_ DZNRM2 +#define zgemv_ ZGEMV +#define ztrsv_ ZTRSV +#define zgemm_ ZGEMM +#define ztrsm_ ZTRSM +#define zgerc_ ZGERC +#define zhemv_ ZHEMV +#define zher2_ ZHER2 + +/* LAPACK */ +#define dlamch_ DLAMCH +#define slamch_ SLAMCH +#define xerbla_ XERBLA +#define lsame_ LSAME +#define dlacon_ DLACON +#define slacon_ SLACON +#define icmax1_ ICMAX1 +#define scsum1_ SCSUM1 +#define clacon_ CLACON +#define dzsum1_ DZSUM1 +#define izmax1_ IZMAX1 +#define zlacon_ ZLACON + +/* Fortran interface */ +#define c_bridge_dgssv_ C_BRIDGE_DGSSV +#define c_fortran_sgssv_ C_FORTRAN_SGSSV +#define c_fortran_dgssv_ C_FORTRAN_DGSSV +#define c_fortran_cgssv_ C_FORTRAN_CGSSV +#define c_fortran_zgssv_ C_FORTRAN_ZGSSV +#endif + + +#if (F77_CALL_C == OLD_CRAY) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void SGEMM(...) + */ +/* BLAS */ +#define sswap_ SSWAP +#define saxpy_ SAXPY +#define sasum_ SASUM +#define isamax_ ISAMAX +#define scopy_ SCOPY +#define sscal_ SSCAL +#define sger_ SGER +#define snrm2_ SNRM2 +#define ssymv_ SSYMV +#define sdot_ SDOT +#define ssyr2_ SSYR2 +#define srot_ SROT +#define sgemv_ SGEMV +#define strsv_ STRSV +#define sgemm_ SGEMM +#define strsm_ STRSM + +#define dswap_ SSWAP +#define daxpy_ SAXPY +#define dasum_ SASUM +#define idamax_ ISAMAX +#define dcopy_ SCOPY +#define dscal_ SSCAL +#define dger_ SGER +#define dnrm2_ SNRM2 +#define dsymv_ SSYMV +#define ddot_ SDOT +#define dsyr2_ SSYR2 +#define drot_ SROT +#define dgemv_ SGEMV +#define dtrsv_ STRSV +#define dgemm_ SGEMM +#define dtrsm_ STRSM + +#define cswap_ CSWAP +#define caxpy_ CAXPY +#define scasum_ SCASUM +#define icamax_ ICAMAX +#define ccopy_ CCOPY +#define cscal_ CSCAL +#define scnrm2_ SCNRM2 +#define caxpy_ CAXPY +#define cgemv_ CGEMV +#define ctrsv_ CTRSV +#define cgemm_ CGEMM +#define ctrsm_ CTRSM +#define cgerc_ CGERC +#define chemv_ CHEMV +#define cher2_ CHER2 + +#define zswap_ ZSWAP +#define zaxpy_ ZAXPY +#define dzasum_ DZASUM +#define izamax_ IZAMAX +#define zcopy_ ZCOPY +#define zscal_ ZSCAL +#define dznrm2_ DZNRM2 +#define zgemv_ ZGEMV +#define ztrsv_ ZTRSV +#define zgemm_ ZGEMM +#define ztrsm_ ZTRSM +#define zgerc_ ZGERC +#define zhemv_ ZHEMV +#define zher2_ ZHER2 + +/* LAPACK */ +#define dlamch_ DLAMCH +#define slamch_ SLAMCH +#define xerbla_ XERBLA +#define lsame_ LSAME +#define dlacon_ DLACON +#define slacon_ SLACON +#define icmax1_ ICMAX1 +#define scsum1_ SCSUM1 +#define clacon_ CLACON +#define dzsum1_ DZSUM1 +#define izmax1_ IZMAX1 +#define zlacon_ ZLACON + +/* Fortran interface */ +#define c_bridge_dgssv_ C_BRIDGE_DGSSV +#define c_fortran_sgssv_ C_FORTRAN_SGSSV +#define c_fortran_dgssv_ C_FORTRAN_DGSSV +#define c_fortran_cgssv_ C_FORTRAN_CGSSV +#define c_fortran_zgssv_ C_FORTRAN_ZGSSV +#endif + + +#if (F77_CALL_C == NOCHANGE) +/* + * These defines set up the naming scheme required to have a fortran 77 + * routine call a C routine + * for following Fortran to C interface: + * FORTRAN CALL C DECLARATION + * call dgemm(...) void dgemm(...) + */ +/* BLAS */ +#define sswap_ sswap +#define saxpy_ saxpy +#define sasum_ sasum +#define isamax_ isamax +#define scopy_ scopy +#define sscal_ sscal +#define sger_ sger +#define snrm2_ snrm2 +#define ssymv_ ssymv +#define sdot_ sdot +#define saxpy_ saxpy +#define ssyr2_ ssyr2 +#define srot_ srot +#define sgemv_ sgemv +#define strsv_ strsv +#define sgemm_ sgemm +#define strsm_ strsm + +#define dswap_ dswap +#define daxpy_ daxpy +#define dasum_ dasum +#define idamax_ idamax +#define dcopy_ dcopy +#define dscal_ dscal +#define dger_ dger +#define dnrm2_ dnrm2 +#define dsymv_ dsymv +#define ddot_ ddot +#define dsyr2_ dsyr2 +#define drot_ drot +#define dgemv_ dgemv +#define dtrsv_ dtrsv +#define dgemm_ dgemm +#define dtrsm_ dtrsm + +#define cswap_ cswap +#define caxpy_ caxpy +#define scasum_ scasum +#define icamax_ icamax +#define ccopy_ ccopy +#define cscal_ cscal +#define scnrm2_ scnrm2 +#define cgemv_ cgemv +#define ctrsv_ ctrsv +#define cgemm_ cgemm +#define ctrsm_ ctrsm +#define cgerc_ cgerc +#define chemv_ chemv +#define cher2_ cher2 + +#define zswap_ zswap +#define zaxpy_ zaxpy +#define dzasum_ dzasum +#define izamax_ izamax +#define zcopy_ zcopy +#define zscal_ zscal +#define dznrm2_ dznrm2 +#define zgemv_ zgemv +#define ztrsv_ ztrsv +#define zgemm_ zgemm +#define ztrsm_ ztrsm +#define zgerc_ zgerc +#define zhemv_ zhemv +#define zher2_ zher2 + +/* LAPACK */ +#define dlamch_ dlamch +#define slamch_ slamch +#define xerbla_ xerbla +#define lsame_ lsame +#define dlacon_ dlacon +#define slacon_ slacon +#define icmax1_ icmax1 +#define scsum1_ scsum1 +#define clacon_ clacon +#define dzsum1_ dzsum1 +#define izmax1_ izmax1 +#define zlacon_ zlacon + +/* Fortran interface */ +#define c_bridge_dgssv_ c_bridge_dgssv +#define c_fortran_sgssv_ c_fortran_sgssv +#define c_fortran_dgssv_ c_fortran_dgssv +#define c_fortran_cgssv_ c_fortran_cgssv +#define c_fortran_zgssv_ c_fortran_zgssv +#endif + + +#endif /* __SUPERLU_CNAMES */ diff --git a/src/include/ngspice/slu_dcomplex.h b/src/include/ngspice/slu_dcomplex.h new file mode 100644 index 000000000..386ad6893 --- /dev/null +++ b/src/include/ngspice/slu_dcomplex.h @@ -0,0 +1,78 @@ + +/*! @file slu_dcomplex.h + * \brief Header file for complex operations + *
 
+ *  -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Contains definitions for various complex operations.
+ * This header file is to be included in source files z*.c
+ * 
+ */ +#ifndef __SUPERLU_DCOMPLEX /* allow multiple inclusions */ +#define __SUPERLU_DCOMPLEX + + +#ifndef DCOMPLEX_INCLUDE +#define DCOMPLEX_INCLUDE + +typedef struct { double r, i; } doublecomplex; + + +/* Macro definitions */ + +/*! \brief Complex Addition c = a + b */ +#define z_add(c, a, b) { (c)->r = (a)->r + (b)->r; \ + (c)->i = (a)->i + (b)->i; } + +/*! \brief Complex Subtraction c = a - b */ +#define z_sub(c, a, b) { (c)->r = (a)->r - (b)->r; \ + (c)->i = (a)->i - (b)->i; } + +/*! \brief Complex-Double Multiplication */ +#define zd_mult(c, a, b) { (c)->r = (a)->r * (b); \ + (c)->i = (a)->i * (b); } + +/*! \brief Complex-Complex Multiplication */ +#define zz_mult(c, a, b) { \ + double cr, ci; \ + cr = (a)->r * (b)->r - (a)->i * (b)->i; \ + ci = (a)->i * (b)->r + (a)->r * (b)->i; \ + (c)->r = cr; \ + (c)->i = ci; \ + } + +#define zz_conj(a, b) { \ + (a)->r = (b)->r; \ + (a)->i = -((b)->i); \ + } + +/*! \brief Complex equality testing */ +#define z_eq(a, b) ( (a)->r == (b)->r && (a)->i == (b)->i ) + + +#ifdef __cplusplus +extern "C" { +#endif + +/* Prototypes for functions in dcomplex.c */ +void z_div(doublecomplex *, doublecomplex *, doublecomplex *); +double z_abs(doublecomplex *); /* exact */ +double z_abs1(doublecomplex *); /* approximate */ +void z_exp(doublecomplex *, doublecomplex *); +void d_cnjg(doublecomplex *r, doublecomplex *z); +double d_imag(doublecomplex *); +doublecomplex z_sgn(doublecomplex *); +doublecomplex z_sqrt(doublecomplex *); + + + +#ifdef __cplusplus + } +#endif + +#endif + +#endif /* __SUPERLU_DCOMPLEX */ diff --git a/src/include/ngspice/slu_ddefs.h b/src/include/ngspice/slu_ddefs.h new file mode 100644 index 000000000..194c070a7 --- /dev/null +++ b/src/include/ngspice/slu_ddefs.h @@ -0,0 +1,281 @@ + +/*! @file slu_ddefs.h + * \brief Header file for real operations + * + *
 
+ * -- SuperLU routine (version 4.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November, 2010
+ * 
+ * Global data structures used in LU factorization -
+ * 
+ *   nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
+ *   (xsup,supno): supno[i] is the supernode no to which i belongs;
+ *	xsup(s) points to the beginning of the s-th supernode.
+ *	e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
+ *	        xsup 0 1 2 4 7 12
+ *	Note: dfs will be performed on supernode rep. relative to the new 
+ *	      row pivoting ordering
+ *
+ *   (xlsub,lsub): lsub[*] contains the compressed subscript of
+ *	rectangular supernodes; xlsub[j] points to the starting
+ *	location of the j-th column in lsub[*]. Note that xlsub 
+ *	is indexed by column.
+ *	Storage: original row subscripts
+ *
+ *      During the course of sparse LU factorization, we also use
+ *	(xlsub,lsub) for the purpose of symmetric pruning. For each
+ *	supernode {s,s+1,...,t=s+r} with first column s and last
+ *	column t, the subscript set
+ *		lsub[j], j=xlsub[s], .., xlsub[s+1]-1
+ *	is the structure of column s (i.e. structure of this supernode).
+ *	It is used for the storage of numerical values.
+ *	Furthermore,
+ *		lsub[j], j=xlsub[t], .., xlsub[t+1]-1
+ *	is the structure of the last column t of this supernode.
+ *	It is for the purpose of symmetric pruning. Therefore, the
+ *	structural subscripts can be rearranged without making physical
+ *	interchanges among the numerical values.
+ *
+ *	However, if the supernode has only one column, then we
+ *	only keep one set of subscripts. For any subscript interchange
+ *	performed, similar interchange must be done on the numerical
+ *	values.
+ *
+ *	The last column structures (for pruning) will be removed
+ *	after the numercial LU factorization phase.
+ *
+ *   (xlusup,lusup): lusup[*] contains the numerical values of the
+ *	rectangular supernodes; xlusup[j] points to the starting
+ *	location of the j-th column in storage vector lusup[*]
+ *	Note: xlusup is indexed by column.
+ *	Each rectangular supernode is stored by column-major
+ *	scheme, consistent with Fortran 2-dim array storage.
+ *
+ *   (xusub,ucol,usub): ucol[*] stores the numerical values of
+ *	U-columns outside the rectangular supernodes. The row
+ *	subscript of nonzero ucol[k] is stored in usub[k].
+ *	xusub[i] points to the starting location of column i in ucol.
+ *	Storage: new row subscripts; that is subscripts of PA.
+ * 
+ */ +#ifndef __SUPERLU_dSP_DEFS /* allow multiple inclusions */ +#define __SUPERLU_dSP_DEFS + +/* + * File name: dsp_defs.h + * Purpose: Sparse matrix types and function prototypes + * History: + */ + +#ifdef _CRAY +#include +#include +#endif + +/* Define my integer type int_t */ +typedef int int_t; /* default */ + +#include +#include +#include +#include +#include + + + +typedef struct { + int *xsup; /* supernode and column mapping */ + int *supno; + int *lsub; /* compressed L subscripts */ + int *xlsub; + double *lusup; /* L supernodes */ + int *xlusup; + double *ucol; /* U columns */ + int *usub; + int *xusub; + int nzlmax; /* current max size of lsub */ + int nzumax; /* " " " ucol */ + int nzlumax; /* " " " lusup */ + int n; /* number of columns in the matrix */ + LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ + int num_expansions; + ExpHeader *expanders; /* Array of pointers to 4 types of memory */ + LU_stack_t stack; /* use user supplied memory */ +} GlobalLU_t; + + +/* -------- Prototypes -------- */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief Driver routines */ +extern void +dgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *, + SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *); +extern void +dgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *, + char *, double *, double *, SuperMatrix *, SuperMatrix *, + void *, int, SuperMatrix *, SuperMatrix *, + double *, double *, double *, double *, + mem_usage_t *, SuperLUStat_t *, int *); + /* ILU */ +extern void +dgsisv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *, + SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *); +extern void +dgsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, + char *, double *, double *, SuperMatrix *, SuperMatrix *, + void *, int, SuperMatrix *, SuperMatrix *, double *, double *, + mem_usage_t *, SuperLUStat_t *, int *); + + +/*! \brief Supernodal LU factor related */ +extern void +dCreate_CompCol_Matrix(SuperMatrix *, int, int, int, double *, + int *, int *, Stype_t, Dtype_t, Mtype_t); +extern void +dCreate_CompRow_Matrix(SuperMatrix *, int, int, int, double *, + int *, int *, Stype_t, Dtype_t, Mtype_t); +extern void +dCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *); +extern void +dCreate_Dense_Matrix(SuperMatrix *, int, int, double *, int, + Stype_t, Dtype_t, Mtype_t); +extern void +dCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, double *, + int *, int *, int *, int *, int *, + Stype_t, Dtype_t, Mtype_t); +extern void +dCopy_Dense_Matrix(int, int, double *, int, double *, int); + +extern void countnz (const int, int *, int *, int *, GlobalLU_t *); +extern void ilu_countnz (const int, int *, int *, GlobalLU_t *); +extern void fixupL (const int, const int *, GlobalLU_t *); + +extern void dallocateA (int, int, double **, int **, int **); +extern void dgstrf (superlu_options_t*, SuperMatrix*, + int, int, int*, void *, int, int *, int *, + SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *); +extern int dsnode_dfs (const int, const int, const int *, const int *, + const int *, int *, int *, GlobalLU_t *); +extern int dsnode_bmod (const int, const int, const int, double *, + double *, GlobalLU_t *, SuperLUStat_t*); +extern void dpanel_dfs (const int, const int, const int, SuperMatrix *, + int *, int *, double *, int *, int *, int *, + int *, int *, int *, int *, GlobalLU_t *); +extern void dpanel_bmod (const int, const int, const int, const int, + double *, double *, int *, int *, + GlobalLU_t *, SuperLUStat_t*); +extern int dcolumn_dfs (const int, const int, int *, int *, int *, int *, + int *, int *, int *, int *, int *, GlobalLU_t *); +extern int dcolumn_bmod (const int, const int, double *, + double *, int *, int *, int, + GlobalLU_t *, SuperLUStat_t*); +extern int dcopy_to_ucol (int, int, int *, int *, int *, + double *, GlobalLU_t *); +extern int dpivotL (const int, const double, int *, int *, + int *, int *, int *, GlobalLU_t *, SuperLUStat_t*); +extern void dpruneL (const int, const int *, const int, const int, + const int *, const int *, int *, GlobalLU_t *); +extern void dreadmt (int *, int *, int *, double **, int **, int **); +extern void dGenXtrue (int, int, double *, int); +extern void dFillRHS (trans_t, int, double *, int, SuperMatrix *, + SuperMatrix *); +extern void dgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *, + SuperMatrix *, SuperLUStat_t*, int *); +/* ILU */ +extern void dgsitrf (superlu_options_t*, SuperMatrix*, int, int, int*, + void *, int, int *, int *, SuperMatrix *, SuperMatrix *, + SuperLUStat_t*, int *); +extern int dldperm(int, int, int, int [], int [], double [], + int [], double [], double []); +extern int ilu_dsnode_dfs (const int, const int, const int *, const int *, + const int *, int *, GlobalLU_t *); +extern void ilu_dpanel_dfs (const int, const int, const int, SuperMatrix *, + int *, int *, double *, double *, int *, int *, + int *, int *, int *, int *, GlobalLU_t *); +extern int ilu_dcolumn_dfs (const int, const int, int *, int *, int *, + int *, int *, int *, int *, int *, + GlobalLU_t *); +extern int ilu_dcopy_to_ucol (int, int, int *, int *, int *, + double *, int, milu_t, double, int, + double *, int *, GlobalLU_t *, double *); +extern int ilu_dpivotL (const int, const double, int *, int *, int, int *, + int *, int *, int *, double, milu_t, + double, GlobalLU_t *, SuperLUStat_t*); +extern int ilu_ddrop_row (superlu_options_t *, int, int, double, + int, int *, double *, GlobalLU_t *, + double *, double *, int); + + +/*! \brief Driver related */ + +extern void dgsequ (SuperMatrix *, double *, double *, double *, + double *, double *, int *); +extern void dlaqgs (SuperMatrix *, double *, double *, double, + double, double, char *); +extern void dgscon (char *, SuperMatrix *, SuperMatrix *, + double, double *, SuperLUStat_t*, int *); +extern double dPivotGrowth(int, SuperMatrix *, int *, + SuperMatrix *, SuperMatrix *); +extern void dgsrfs (trans_t, SuperMatrix *, SuperMatrix *, + SuperMatrix *, int *, int *, char *, double *, + double *, SuperMatrix *, SuperMatrix *, + double *, double *, SuperLUStat_t*, int *); + +extern int sp_dtrsv (char *, char *, char *, SuperMatrix *, + SuperMatrix *, double *, SuperLUStat_t*, int *); +extern int sp_dgemv (char *, double, SuperMatrix *, double *, + int, double, double *, int); + +extern int sp_dgemm (char *, char *, int, int, int, double, + SuperMatrix *, double *, int, double, + double *, int); +extern double dlamch_(char *); + + +/*! \brief Memory-related */ +extern int dLUMemInit (fact_t, void *, int, int, int, int, int, + double, SuperMatrix *, SuperMatrix *, + GlobalLU_t *, int **, double **); +extern void dSetRWork (int, int, double *, double **, double **); +extern void dLUWorkFree (int *, double *, GlobalLU_t *); +extern int dLUMemXpand (int, int, MemType, int *, GlobalLU_t *); + +extern double *doubleMalloc(int); +extern double *doubleCalloc(int); +extern int dmemory_usage(const int, const int, const int, const int); +extern int dQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *); +extern int ilu_dQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *); + +/*! \brief Auxiliary routines */ +extern void dreadhb(int *, int *, int *, double **, int **, int **); +extern void dreadrb(int *, int *, int *, double **, int **, int **); +extern void dreadtriple(int *, int *, int *, double **, int **, int **); +extern void dCompRow_to_CompCol(int, int, int, double*, int*, int*, + double **, int **, int **); +extern void dfill (double *, int, double); +extern void dinf_norm_error (int, SuperMatrix *, double *); +extern void PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *, + double, double, double *, double *, char *); +extern double dqselect(int, double *, int); + + +/*! \brief Routines for debugging */ +extern void dPrint_CompCol_Matrix(char *, SuperMatrix *); +extern void dPrint_SuperNode_Matrix(char *, SuperMatrix *); +extern void dPrint_Dense_Matrix(char *, SuperMatrix *); +extern void dprint_lu_col(char *, int, int, int *, GlobalLU_t *); +extern int print_double_vec(char *, int, double *); +extern void check_tempv(int, double *); + +#ifdef __cplusplus + } +#endif + +#endif /* __SUPERLU_dSP_DEFS */ + diff --git a/src/include/ngspice/slu_util.h b/src/include/ngspice/slu_util.h new file mode 100644 index 000000000..a199fa4e9 --- /dev/null +++ b/src/include/ngspice/slu_util.h @@ -0,0 +1,367 @@ +/** @file slu_util.h + * \brief Utility header file + * + * -- SuperLU routine (version 4.1) -- + * Univ. of California Berkeley, Xerox Palo Alto Research Center, + * and Lawrence Berkeley National Lab. + * November, 2010 + * + */ + +#ifndef __SUPERLU_UTIL /* allow multiple inclusions */ +#define __SUPERLU_UTIL + +#include +#include +#include +/* +#ifndef __STDC__ +#include +#endif +*/ +#include +#include + +/*********************************************************************** + * Macros + ***********************************************************************/ +#define FIRSTCOL_OF_SNODE(i) (xsup[i]) +/* No of marker arrays used in the symbolic factorization, + each of size n */ +#define NO_MARKER 3 +#define NUM_TEMPV(m,w,t,b) ( SUPERLU_MAX(m, (t + b)*w) ) + +#ifndef USER_ABORT +#define USER_ABORT(msg) superlu_abort_and_exit(msg) +#endif + +#define ABORT_SuperLU(err_msg) \ + { char msg[256];\ + sprintf(msg,"%s at line %d in file %s\n",err_msg,__LINE__, __FILE__);\ + USER_ABORT(msg); } + + +#ifndef USER_MALLOC +#if 1 +#define USER_MALLOC(size) superlu_malloc(size) +#else +/* The following may check out some uninitialized data */ +#define USER_MALLOC(size) memset (superlu_malloc(size), '\x0F', size) +#endif +#endif + +#define SUPERLU_MALLOC(size) USER_MALLOC(size) + +#ifndef USER_FREE +#define USER_FREE(addr) superlu_free(addr) +#endif + +#define SUPERLU_FREE(addr) USER_FREE(addr) + +#define CHECK_MALLOC(where) { \ + extern int superlu_malloc_total; \ + printf("%s: malloc_total %d Bytes\n", \ + where, superlu_malloc_total); \ +} + +#define SUPERLU_MAX(x, y) ( (x) > (y) ? (x) : (y) ) +#define SUPERLU_MIN(x, y) ( (x) < (y) ? (x) : (y) ) + +/********************************************************* + * Macros used for easy access of sparse matrix entries. * + *********************************************************/ +#define L_SUB_START(col) ( Lstore->rowind_colptr[col] ) +#define L_SUB(ptr) ( Lstore->rowind[ptr] ) +#define L_NZ_START(col) ( Lstore->nzval_colptr[col] ) +#define L_FST_SUPC(superno) ( Lstore->sup_to_col[superno] ) +#define U_NZ_START(col) ( Ustore->colptr[col] ) +#define U_SUB(ptr) ( Ustore->rowind[ptr] ) + + +/*********************************************************************** + * Constants + ***********************************************************************/ +#define EMPTY (-1) +/*#define NO (-1)*/ +#define FALSE 0 +#define TRUE 1 + +#define NO_MEMTYPE 4 /* 0: lusup; + 1: ucol; + 2: lsub; + 3: usub */ + +#define GluIntArray(n) (5 * (n) + 5) + +/* Dropping rules */ +#define NODROP ( 0x0000 ) +#define DROP_BASIC ( 0x0001 ) /* ILU(tau) */ +#define DROP_PROWS ( 0x0002 ) /* ILUTP: keep p maximum rows */ +#define DROP_COLUMN ( 0x0004 ) /* ILUTP: for j-th column, + p = gamma * nnz(A(:,j)) */ +#define DROP_AREA ( 0x0008 ) /* ILUTP: for j-th column, use + nnz(F(:,1:j)) / nnz(A(:,1:j)) + to limit memory growth */ +#define DROP_SECONDARY ( 0x000E ) /* PROWS | COLUMN | AREA */ +#define DROP_DYNAMIC ( 0x0010 ) /* adaptive tau */ +#define DROP_INTERP ( 0x0100 ) /* use interpolation */ + + +#if 1 +#define MILU_ALPHA (1.0e-2) /* multiple of drop_sum to be added to diagonal */ +#else +#define MILU_ALPHA 1.0 /* multiple of drop_sum to be added to diagonal */ +#endif + + +/*********************************************************************** + * Type definitions + ***********************************************************************/ +typedef float flops_t; +typedef unsigned char Logical; + +/* + *-- This contains the options used to control the solution process. + * + * Fact (fact_t) + * Specifies whether or not the factored form of the matrix + * A is supplied on entry, and if not, how the matrix A should + * be factorizaed. + * = DOFACT: The matrix A will be factorized from scratch, and the + * factors will be stored in L and U. + * = SamePattern: The matrix A will be factorized assuming + * that a factorization of a matrix with the same sparsity + * pattern was performed prior to this one. Therefore, this + * factorization will reuse column permutation vector + * ScalePermstruct->perm_c and the column elimination tree + * LUstruct->etree. + * = SamePattern_SameRowPerm: The matrix A will be factorized + * assuming that a factorization of a matrix with the same + * sparsity pattern and similar numerical values was performed + * prior to this one. Therefore, this factorization will reuse + * both row and column scaling factors R and C, both row and + * column permutation vectors perm_r and perm_c, and the + * data structure set up from the previous symbolic factorization. + * = FACTORED: On entry, L, U, perm_r and perm_c contain the + * factored form of A. If DiagScale is not NOEQUIL, the matrix + * A has been equilibrated with scaling factors R and C. + * + * Equil (yes_no_t) + * Specifies whether to equilibrate the system (scale A's row and + * columns to have unit norm). + * + * ColPerm (colperm_t) + * Specifies what type of column permutation to use to reduce fill. + * = NATURAL: use the natural ordering + * = MMD_ATA: use minimum degree ordering on structure of A'*A + * = MMD_AT_PLUS_A: use minimum degree ordering on structure of A'+A + * = COLAMD: use approximate minimum degree column ordering + * = MY_PERMC: use the ordering specified by the user + * + * Trans (trans_t) + * Specifies the form of the system of equations: + * = NOTRANS: A * X = B (No transpose) + * = TRANS: A**T * X = B (Transpose) + * = CONJ: A**H * X = B (Transpose) + * + * IterRefine (IterRefine_t) + * Specifies whether to perform iterative refinement. + * = NO: no iterative refinement + * = SLU_SINGLE: perform iterative refinement in single precision + * = SLU_DOUBLE: perform iterative refinement in double precision + * = SLU_EXTRA: perform iterative refinement in extra precision + * + * DiagPivotThresh (double, in [0.0, 1.0]) (only for sequential SuperLU) + * Specifies the threshold used for a diagonal entry to be an + * acceptable pivot. + * + * SymmetricMode (yest_no_t) + * Specifies whether to use symmetric mode. Symmetric mode gives + * preference to diagonal pivots, and uses an (A'+A)-based column + * permutation algorithm. + * + * PivotGrowth (yes_no_t) + * Specifies whether to compute the reciprocal pivot growth. + * + * ConditionNumber (ues_no_t) + * Specifies whether to compute the reciprocal condition number. + * + * RowPerm (rowperm_t) (only for SuperLU_DIST or ILU) + * Specifies whether to permute rows of the original matrix. + * = NO: not to permute the rows + * = LargeDiag: make the diagonal large relative to the off-diagonal + * = MY_PERMR: use the permutation given by the user + * + * ILU_DropRule (int) + * Specifies the dropping rule: + * = DROP_BASIC: Basic dropping rule, supernodal based ILUTP(tau). + * = DROP_PROWS: Supernodal based ILUTP(p,tau), p = gamma * nnz(A)/n. + * = DROP_COLUMN: Variant of ILUTP(p,tau), for j-th column, + * p = gamma * nnz(A(:,j)). + * = DROP_AREA: Variation of ILUTP, for j-th column, use + * nnz(F(:,1:j)) / nnz(A(:,1:j)) to control memory. + * = DROP_DYNAMIC: Modify the threshold tau during factorizaion: + * If nnz(L(:,1:j)) / nnz(A(:,1:j)) > gamma + * tau_L(j) := MIN(tau_0, tau_L(j-1) * 2); + * Otherwise + * tau_L(j) := MAX(tau_0, tau_L(j-1) / 2); + * tau_U(j) uses the similar rule. + * NOTE: the thresholds used by L and U are separate. + * = DROP_INTERP: Compute the second dropping threshold by + * interpolation instead of sorting (default). + * In this case, the actual fill ratio is not + * guaranteed to be smaller than gamma. + * Note: DROP_PROWS, DROP_COLUMN and DROP_AREA are mutually exclusive. + * ( Default: DROP_BASIC | DROP_AREA ) + * + * ILU_DropTol (double) + * numerical threshold for dropping. + * + * ILU_FillFactor (double) + * Gamma in the secondary dropping. + * + * ILU_Norm (norm_t) + * Specify which norm to use to measure the row size in a + * supernode: infinity-norm, 1-norm, or 2-norm. + * + * ILU_FillTol (double) + * numerical threshold for zero pivot perturbation. + * + * ILU_MILU (milu_t) + * Specifies which version of MILU to use. + * + * ILU_MILU_Dim (double) + * Dimension of the PDE if available. + * + * ReplaceTinyPivot (yes_no_t) (only for SuperLU_DIST) + * Specifies whether to replace the tiny diagonals by + * sqrt(epsilon)*||A|| during LU factorization. + * + * SolveInitialized (yes_no_t) (only for SuperLU_DIST) + * Specifies whether the initialization has been performed to the + * triangular solve. + * + * RefineInitialized (yes_no_t) (only for SuperLU_DIST) + * Specifies whether the initialization has been performed to the + * sparse matrix-vector multiplication routine needed in iterative + * refinement. + * + * PrintStat (yes_no_t) + * Specifies whether to print the solver's statistics. + */ +typedef struct { + fact_t Fact; + yes_no_t Equil; + colperm_t ColPerm; + trans_t Trans; + IterRefine_t IterRefine; + double DiagPivotThresh; + yes_no_t SymmetricMode; + yes_no_t PivotGrowth; + yes_no_t ConditionNumber; + rowperm_t RowPerm; + int ILU_DropRule; + double ILU_DropTol; /* threshold for dropping */ + double ILU_FillFactor; /* gamma in the secondary dropping */ + norm_t ILU_Norm; /* infinity-norm, 1-norm, or 2-norm */ + double ILU_FillTol; /* threshold for zero pivot perturbation */ + milu_t ILU_MILU; + double ILU_MILU_Dim; /* Dimension of PDE (if available) */ + yes_no_t ParSymbFact; + yes_no_t ReplaceTinyPivot; /* used in SuperLU_DIST */ + yes_no_t SolveInitialized; + yes_no_t RefineInitialized; + yes_no_t PrintStat; + int nnzL, nnzU; /* used to store nnzs for now */ + int num_lookaheads; /* num of levels in look-ahead */ + yes_no_t lookahead_etree; /* use etree computed from the + serial symbolic factorization */ + yes_no_t SymPattern; /* symmetric factorization */ +} superlu_options_t; + +/*! \brief Headers for 4 types of dynamatically managed memory */ +typedef struct e_node { + int size; /* length of the memory that has been used */ + void *mem; /* pointer to the new malloc'd store */ +} ExpHeader; + +typedef struct { + int size; + int used; + int top1; /* grow upward, relative to &array[0] */ + int top2; /* grow downward */ + void *array; +} LU_stack_t; + +typedef struct { + int *panel_histo; /* histogram of panel size distribution */ + double *utime; /* running time at various phases */ + flops_t *ops; /* operation count at various phases */ + int TinyPivots; /* number of tiny pivots */ + int RefineSteps; /* number of iterative refinement steps */ + int expansions; /* number of memory expansions */ +} SuperLUStat_t; + +typedef struct { + float for_lu; + float total_needed; +} mem_usage_t; + + +/*********************************************************************** + * Prototypes + ***********************************************************************/ +#ifdef __cplusplus +extern "C" { +#endif + +extern void Destroy_SuperMatrix_Store(SuperMatrix *); +extern void Destroy_CompCol_Matrix(SuperMatrix *); +extern void Destroy_CompRow_Matrix(SuperMatrix *); +extern void Destroy_SuperNode_Matrix(SuperMatrix *); +extern void Destroy_CompCol_Permuted(SuperMatrix *); +extern void Destroy_Dense_Matrix(SuperMatrix *); +extern void get_perm_c(int, SuperMatrix *, int *); +extern void set_default_options(superlu_options_t *options); +extern void ilu_set_default_options(superlu_options_t *options); +extern void sp_preorder (superlu_options_t *, SuperMatrix*, int*, int*, + SuperMatrix*); +extern void superlu_abort_and_exit(char*); +extern void *superlu_malloc (size_t); +extern int *intMalloc (int); +extern int *intCalloc (int); +extern void superlu_free (void*); +extern void SetIWork (int, int, int, int *, int **, int **, int **, + int **, int **, int **, int **); +extern int sp_coletree (int *, int *, int *, int, int, int *); +extern void relax_snode (const int, int *, const int, int *, int *); +extern void heap_relax_snode (const int, int *, const int, int *, int *); +extern int mark_relax(int, int *, int *, int *, int *, int *, int *); +extern void ilu_relax_snode (const int, int *, const int, int *, + int *, int *); +extern void ilu_heap_relax_snode (const int, int *, const int, int *, + int *, int*); +extern void resetrep_col (const int, const int *, int *); +extern int spcoletree (int *, int *, int *, int, int, int *); +extern int *TreePostorder (int, int *); +extern double SuperLU_timer_ (void); +extern int sp_ienv (int); +extern int lsame_ (char *, char *); +extern int xerbla_ (char *, int *); +extern void ifill (int *, int, int); +extern void snode_profile (int, int *); +extern void super_stats (int, int *); +extern void check_repfnz(int, int, int, int *); +extern void PrintSumm (char *, int, int, int); +extern void StatInit(SuperLUStat_t *); +extern void StatPrint (SuperLUStat_t *); +extern void StatFree(SuperLUStat_t *); +extern void print_panel_seg(int, int, int, int, int *, int *); +extern int print_int_vec(char *,int, int *); +extern int slu_PrintInt10(char *, int, int *); + +#ifdef __cplusplus + } +#endif + +#endif /* __SUPERLU_UTIL */ diff --git a/src/include/ngspice/slu_zdefs.h b/src/include/ngspice/slu_zdefs.h new file mode 100644 index 000000000..3ce638f51 --- /dev/null +++ b/src/include/ngspice/slu_zdefs.h @@ -0,0 +1,284 @@ + +/*! @file slu_zdefs.h + * \brief Header file for real operations + * + *
 
+ * -- SuperLU routine (version 4.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November, 2010
+ * 
+ * Global data structures used in LU factorization -
+ * 
+ *   nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
+ *   (xsup,supno): supno[i] is the supernode no to which i belongs;
+ *	xsup(s) points to the beginning of the s-th supernode.
+ *	e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
+ *	        xsup 0 1 2 4 7 12
+ *	Note: dfs will be performed on supernode rep. relative to the new 
+ *	      row pivoting ordering
+ *
+ *   (xlsub,lsub): lsub[*] contains the compressed subscript of
+ *	rectangular supernodes; xlsub[j] points to the starting
+ *	location of the j-th column in lsub[*]. Note that xlsub 
+ *	is indexed by column.
+ *	Storage: original row subscripts
+ *
+ *      During the course of sparse LU factorization, we also use
+ *	(xlsub,lsub) for the purpose of symmetric pruning. For each
+ *	supernode {s,s+1,...,t=s+r} with first column s and last
+ *	column t, the subscript set
+ *		lsub[j], j=xlsub[s], .., xlsub[s+1]-1
+ *	is the structure of column s (i.e. structure of this supernode).
+ *	It is used for the storage of numerical values.
+ *	Furthermore,
+ *		lsub[j], j=xlsub[t], .., xlsub[t+1]-1
+ *	is the structure of the last column t of this supernode.
+ *	It is for the purpose of symmetric pruning. Therefore, the
+ *	structural subscripts can be rearranged without making physical
+ *	interchanges among the numerical values.
+ *
+ *	However, if the supernode has only one column, then we
+ *	only keep one set of subscripts. For any subscript interchange
+ *	performed, similar interchange must be done on the numerical
+ *	values.
+ *
+ *	The last column structures (for pruning) will be removed
+ *	after the numercial LU factorization phase.
+ *
+ *   (xlusup,lusup): lusup[*] contains the numerical values of the
+ *	rectangular supernodes; xlusup[j] points to the starting
+ *	location of the j-th column in storage vector lusup[*]
+ *	Note: xlusup is indexed by column.
+ *	Each rectangular supernode is stored by column-major
+ *	scheme, consistent with Fortran 2-dim array storage.
+ *
+ *   (xusub,ucol,usub): ucol[*] stores the numerical values of
+ *	U-columns outside the rectangular supernodes. The row
+ *	subscript of nonzero ucol[k] is stored in usub[k].
+ *	xusub[i] points to the starting location of column i in ucol.
+ *	Storage: new row subscripts; that is subscripts of PA.
+ * 
+ */ +#ifndef __SUPERLU_zSP_DEFS /* allow multiple inclusions */ +#define __SUPERLU_zSP_DEFS + +/* + * File name: zsp_defs.h + * Purpose: Sparse matrix types and function prototypes + * History: + */ + +#ifdef _CRAY +#include +#include +#endif + +/* Define my integer type int_t */ +typedef int int_t; /* default */ + +#include +#include +#include +#include +#include +#include + + + +typedef struct { + int *xsup; /* supernode and column mapping */ + int *supno; + int *lsub; /* compressed L subscripts */ + int *xlsub; + doublecomplex *lusup; /* L supernodes */ + int *xlusup; + doublecomplex *ucol; /* U columns */ + int *usub; + int *xusub; + int nzlmax; /* current max size of lsub */ + int nzumax; /* " " " ucol */ + int nzlumax; /* " " " lusup */ + int n; /* number of columns in the matrix */ + LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ + int num_expansions; + ExpHeader *expanders; /* Array of pointers to 4 types of memory */ + LU_stack_t stack; /* use user supplied memory */ +} GlobalLU_t; + + +/* -------- Prototypes -------- */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief Driver routines */ +extern void +zgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *, + SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *); +extern void +zgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *, + char *, double *, double *, SuperMatrix *, SuperMatrix *, + void *, int, SuperMatrix *, SuperMatrix *, + double *, double *, double *, double *, + mem_usage_t *, SuperLUStat_t *, int *); + /* ILU */ +extern void +zgsisv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *, + SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *); +extern void +zgsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, + char *, double *, double *, SuperMatrix *, SuperMatrix *, + void *, int, SuperMatrix *, SuperMatrix *, double *, double *, + mem_usage_t *, SuperLUStat_t *, int *); + + +/*! \brief Supernodal LU factor related */ +extern void +zCreate_CompCol_Matrix(SuperMatrix *, int, int, int, doublecomplex *, + int *, int *, Stype_t, Dtype_t, Mtype_t); +extern void +zCreate_CompRow_Matrix(SuperMatrix *, int, int, int, doublecomplex *, + int *, int *, Stype_t, Dtype_t, Mtype_t); +extern void +zCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *); +extern void +zCreate_Dense_Matrix(SuperMatrix *, int, int, doublecomplex *, int, + Stype_t, Dtype_t, Mtype_t); +extern void +zCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, doublecomplex *, + int *, int *, int *, int *, int *, + Stype_t, Dtype_t, Mtype_t); +extern void +zCopy_Dense_Matrix(int, int, doublecomplex *, int, doublecomplex *, int); + +extern void countnz (const int, int *, int *, int *, GlobalLU_t *); +extern void ilu_countnz (const int, int *, int *, GlobalLU_t *); +extern void fixupL (const int, const int *, GlobalLU_t *); + +extern void zallocateA (int, int, doublecomplex **, int **, int **); +extern void zgstrf (superlu_options_t*, SuperMatrix*, + int, int, int*, void *, int, int *, int *, + SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *); +extern int zsnode_dfs (const int, const int, const int *, const int *, + const int *, int *, int *, GlobalLU_t *); +extern int zsnode_bmod (const int, const int, const int, doublecomplex *, + doublecomplex *, GlobalLU_t *, SuperLUStat_t*); +extern void zpanel_dfs (const int, const int, const int, SuperMatrix *, + int *, int *, doublecomplex *, int *, int *, int *, + int *, int *, int *, int *, GlobalLU_t *); +extern void zpanel_bmod (const int, const int, const int, const int, + doublecomplex *, doublecomplex *, int *, int *, + GlobalLU_t *, SuperLUStat_t*); +extern int zcolumn_dfs (const int, const int, int *, int *, int *, int *, + int *, int *, int *, int *, int *, GlobalLU_t *); +extern int zcolumn_bmod (const int, const int, doublecomplex *, + doublecomplex *, int *, int *, int, + GlobalLU_t *, SuperLUStat_t*); +extern int zcopy_to_ucol (int, int, int *, int *, int *, + doublecomplex *, GlobalLU_t *); +extern int zpivotL (const int, const double, int *, int *, + int *, int *, int *, GlobalLU_t *, SuperLUStat_t*); +extern void zpruneL (const int, const int *, const int, const int, + const int *, const int *, int *, GlobalLU_t *); +extern void zreadmt (int *, int *, int *, doublecomplex **, int **, int **); +extern void zGenXtrue (int, int, doublecomplex *, int); +extern void zFillRHS (trans_t, int, doublecomplex *, int, SuperMatrix *, + SuperMatrix *); +extern void zgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *, + SuperMatrix *, SuperLUStat_t*, int *); +/* ILU */ +extern void zgsitrf (superlu_options_t*, SuperMatrix*, int, int, int*, + void *, int, int *, int *, SuperMatrix *, SuperMatrix *, + SuperLUStat_t*, int *); +extern int zldperm(int, int, int, int [], int [], doublecomplex [], + int [], double [], double []); +extern int ilu_zsnode_dfs (const int, const int, const int *, const int *, + const int *, int *, GlobalLU_t *); +extern void ilu_zpanel_dfs (const int, const int, const int, SuperMatrix *, + int *, int *, doublecomplex *, double *, int *, int *, + int *, int *, int *, int *, GlobalLU_t *); +extern int ilu_zcolumn_dfs (const int, const int, int *, int *, int *, + int *, int *, int *, int *, int *, + GlobalLU_t *); +extern int ilu_zcopy_to_ucol (int, int, int *, int *, int *, + doublecomplex *, int, milu_t, double, int, + doublecomplex *, int *, GlobalLU_t *, double *); +extern int ilu_zpivotL (const int, const double, int *, int *, int, int *, + int *, int *, int *, double, milu_t, + doublecomplex, GlobalLU_t *, SuperLUStat_t*); +extern int ilu_zdrop_row (superlu_options_t *, int, int, double, + int, int *, double *, GlobalLU_t *, + double *, double *, int); + + +/*! \brief Driver related */ + +extern void zgsequ (SuperMatrix *, double *, double *, double *, + double *, double *, int *); +extern void zlaqgs (SuperMatrix *, double *, double *, double, + double, double, char *); +extern void zgscon (char *, SuperMatrix *, SuperMatrix *, + double, double *, SuperLUStat_t*, int *); +extern double zPivotGrowth(int, SuperMatrix *, int *, + SuperMatrix *, SuperMatrix *); +extern void zgsrfs (trans_t, SuperMatrix *, SuperMatrix *, + SuperMatrix *, int *, int *, char *, double *, + double *, SuperMatrix *, SuperMatrix *, + double *, double *, SuperLUStat_t*, int *); + +extern int sp_ztrsv (char *, char *, char *, SuperMatrix *, + SuperMatrix *, doublecomplex *, SuperLUStat_t*, int *); +extern int sp_zgemv (char *, doublecomplex, SuperMatrix *, doublecomplex *, + int, doublecomplex, doublecomplex *, int); + +extern int sp_zgemm (char *, char *, int, int, int, doublecomplex, + SuperMatrix *, doublecomplex *, int, doublecomplex, + doublecomplex *, int); +extern double dlamch_(char *); + + +/*! \brief Memory-related */ +extern int zLUMemInit (fact_t, void *, int, int, int, int, int, + double, SuperMatrix *, SuperMatrix *, + GlobalLU_t *, int **, doublecomplex **); +extern void zSetRWork (int, int, doublecomplex *, doublecomplex **, doublecomplex **); +extern void zLUWorkFree (int *, doublecomplex *, GlobalLU_t *); +extern int zLUMemXpand (int, int, MemType, int *, GlobalLU_t *); + +extern doublecomplex *doublecomplexMalloc(int); +extern doublecomplex *doublecomplexCalloc(int); +extern double *doubleMalloc(int); +extern double *doubleCalloc(int); +extern int zmemory_usage(const int, const int, const int, const int); +extern int zQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *); +extern int ilu_zQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *); + +/*! \brief Auxiliary routines */ +extern void zreadhb(int *, int *, int *, doublecomplex **, int **, int **); +extern void zreadrb(int *, int *, int *, doublecomplex **, int **, int **); +extern void zreadtriple(int *, int *, int *, doublecomplex **, int **, int **); +extern void zCompRow_to_CompCol(int, int, int, doublecomplex*, int*, int*, + doublecomplex **, int **, int **); +extern void zfill (doublecomplex *, int, doublecomplex); +extern void zinf_norm_error (int, SuperMatrix *, doublecomplex *); +extern void PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *, + doublecomplex, doublecomplex, doublecomplex *, doublecomplex *, char *); +extern double dqselect(int, double *, int); + + +/*! \brief Routines for debugging */ +extern void zPrint_CompCol_Matrix(char *, SuperMatrix *); +extern void zPrint_SuperNode_Matrix(char *, SuperMatrix *); +extern void zPrint_Dense_Matrix(char *, SuperMatrix *); +extern void zprint_lu_col(char *, int, int, int *, GlobalLU_t *); +extern int print_double_vec(char *, int, double *); +extern void check_tempv(int, doublecomplex *); + +#ifdef __cplusplus + } +#endif + +#endif /* __SUPERLU_zSP_DEFS */ + diff --git a/src/include/ngspice/superlu_enum_consts.h b/src/include/ngspice/superlu_enum_consts.h new file mode 100644 index 000000000..6324ae819 --- /dev/null +++ b/src/include/ngspice/superlu_enum_consts.h @@ -0,0 +1,71 @@ +/** @file superlu_enum_consts.h + * \brief enum constants header file + * + * -- SuperLU routine (version 4.1) -- + * Lawrence Berkeley National Lab, Univ. of California Berkeley, + * October 1, 2010 + * + */ + +#ifndef __SUPERLU_ENUM_CONSTS /* allow multiple inclusions */ +#define __SUPERLU_ENUM_CONSTS + +/*********************************************************************** + * Enumerate types + ***********************************************************************/ +typedef enum {NO_SuperLU, YES_SuperLU} yes_no_t; +typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t; +typedef enum {NOROWPERM, LargeDiag, MY_PERMR} rowperm_t; +typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD, + METIS_AT_PLUS_A, PARMETIS, ZOLTAN, MY_PERMC} colperm_t; +typedef enum {NOTRANS, TRANS, CONJ} trans_t; +typedef enum {NOEQUIL, ROW, COL, BOTH} DiagScale_t; +typedef enum {NOREFINE, SLU_SINGLE=1, SLU_DOUBLE, SLU_EXTRA} IterRefine_t; +typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; +typedef enum {HEAD, TAIL} stack_end_t; +typedef enum {SYSTEM, USER} LU_space_t; +typedef enum {ONE_NORM, TWO_NORM, INF_NORM} norm_t; +typedef enum {SILU, SMILU_1, SMILU_2, SMILU_3} milu_t; +#if 0 +typedef enum {NODROP = 0x0000, + DROP_BASIC = 0x0001, /* ILU(tau) */ + DROP_PROWS = 0x0002, /* ILUTP: keep p maximum rows */ + DROP_COLUMN = 0x0004, /* ILUTP: for j-th column, + p = gamma * nnz(A(:,j)) */ + DROP_AREA = 0x0008, /* ILUTP: for j-th column, use + nnz(F(:,1:j)) / nnz(A(:,1:j)) + to limit memory growth */ + DROP_SECONDARY = 0x000E, /* PROWS | COLUMN | AREA */ + DROP_DYNAMIC = 0x0010, + DROP_INTERP = 0x0100} rule_t; +#endif + + +/* + * The following enumerate type is used by the statistics variable + * to keep track of flop count and time spent at various stages. + * + * Note that not all of the fields are disjoint. + */ +typedef enum { + COLPERM, /* find a column ordering that minimizes fills */ + ROWPERM, /* find a row ordering maximizes diagonal. */ + RELAX, /* find artificial supernodes */ + ETREE, /* compute column etree */ + EQUIL, /* equilibrate the original matrix */ + SYMBFAC, /* symbolic factorization. */ + DIST, /* distribute matrix. */ + FACT, /* perform LU factorization */ + COMM, /* communication for factorization */ + SOL_COMM,/* communication for solve */ + RCOND, /* estimate reciprocal condition number */ + SOLVE, /* forward and back solves */ + REFINE, /* perform iterative refinement */ + TRSV, /* fraction of FACT spent in xTRSV */ + GEMV, /* fraction of FACT spent in xGEMV */ + FERR, /* estimate error bounds after iterative refinement */ + NPHASES /* total number of phases */ +} PhaseType; + + +#endif /* __SUPERLU_ENUM_CONSTS */ diff --git a/src/include/ngspice/supermatrix.h b/src/include/ngspice/supermatrix.h new file mode 100644 index 000000000..825c6e18e --- /dev/null +++ b/src/include/ngspice/supermatrix.h @@ -0,0 +1,179 @@ +/*! @file supermatrix.h + * \brief Defines matrix types + */ +#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */ +#define __SUPERLU_SUPERMATRIX + +/******************************************** + * The matrix types are defined as follows. * + ********************************************/ +typedef enum { + SLU_NC, /* column-wise, no supernode */ + SLU_NCP, /* column-wise, column-permuted, no supernode + (The consecutive columns of nonzeros, after permutation, + may not be stored contiguously.) */ + SLU_NR, /* row-wize, no supernode */ + SLU_SC, /* column-wise, supernode */ + SLU_SCP, /* supernode, column-wise, permuted */ + SLU_SR, /* row-wise, supernode */ + SLU_DN, /* Fortran style column-wise storage for dense matrix */ + SLU_NR_loc /* distributed compressed row format */ +} Stype_t; + +typedef enum { + SLU_S, /* single */ + SLU_D, /* double */ + SLU_C, /* single complex */ + SLU_Z /* double complex */ +} Dtype_t; + +typedef enum { + SLU_GE, /* general */ + SLU_TRLU, /* lower triangular, unit diagonal */ + SLU_TRUU, /* upper triangular, unit diagonal */ + SLU_TRL, /* lower triangular */ + SLU_TRU, /* upper triangular */ + SLU_SYL, /* symmetric, store lower half */ + SLU_SYU, /* symmetric, store upper half */ + SLU_HEL, /* Hermitian, store lower half */ + SLU_HEU /* Hermitian, store upper half */ +} Mtype_t; + +typedef struct { + Stype_t Stype; /* Storage type: interprets the storage structure + pointed to by *Store. */ + Dtype_t Dtype; /* Data type. */ + Mtype_t Mtype; /* Matrix type: describes the mathematical property of + the matrix. */ + int_t nrow; /* number of rows */ + int_t ncol; /* number of columns */ + void *Store; /* pointer to the actual storage of the matrix */ +} SuperMatrix; + +/*********************************************** + * The storage schemes are defined as follows. * + ***********************************************/ + +/* Stype == SLU_NC (Also known as Harwell-Boeing sparse matrix format) */ +typedef struct { + int_t nnz; /* number of nonzeros in the matrix */ + void *nzval; /* pointer to array of nonzero values, packed by column */ + int_t *rowind; /* pointer to array of row indices of the nonzeros */ + int_t *colptr; /* pointer to array of beginning of columns in nzval[] + and rowind[] */ + /* Note: + Zero-based indexing is used; + colptr[] has ncol+1 entries, the last one pointing + beyond the last column, so that colptr[ncol] = nnz. */ +} NCformat; + +/* Stype == SLU_NR */ +typedef struct { + int_t nnz; /* number of nonzeros in the matrix */ + void *nzval; /* pointer to array of nonzero values, packed by raw */ + int_t *colind; /* pointer to array of columns indices of the nonzeros */ + int_t *rowptr; /* pointer to array of beginning of rows in nzval[] + and colind[] */ + /* Note: + Zero-based indexing is used; + rowptr[] has nrow+1 entries, the last one pointing + beyond the last row, so that rowptr[nrow] = nnz. */ +} NRformat; + +/* Stype == SLU_SC */ +typedef struct { + int_t nnz; /* number of nonzeros in the matrix */ + int_t nsuper; /* number of supernodes, minus 1 */ + void *nzval; /* pointer to array of nonzero values, packed by column */ + int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */ + int_t *rowind; /* pointer to array of compressed row indices of + rectangular supernodes */ + int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */ + int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column + j belongs; mapping from column to supernode number. */ + int_t *sup_to_col; /* sup_to_col[s] points to the start of the s-th + supernode; mapping from supernode number to column. + e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12) + sup_to_col: 0 1 2 4 7 12 (nsuper=4) */ + /* Note: + Zero-based indexing is used; + nzval_colptr[], rowind_colptr[], col_to_sup and + sup_to_col[] have ncol+1 entries, the last one + pointing beyond the last column. + For col_to_sup[], only the first ncol entries are + defined. For sup_to_col[], only the first nsuper+2 + entries are defined. */ +} SCformat; + +/* Stype == SLU_SCP */ +typedef struct { + int_t nnz; /* number of nonzeros in the matrix */ + int_t nsuper; /* number of supernodes */ + void *nzval; /* pointer to array of nonzero values, packed by column */ + int_t *nzval_colbeg;/* nzval_colbeg[j] points to beginning of column j + in nzval[] */ + int_t *nzval_colend;/* nzval_colend[j] points to one past the last element + of column j in nzval[] */ + int_t *rowind; /* pointer to array of compressed row indices of + rectangular supernodes */ + int_t *rowind_colbeg;/* rowind_colbeg[j] points to beginning of column j + in rowind[] */ + int_t *rowind_colend;/* rowind_colend[j] points to one past the last element + of column j in rowind[] */ + int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column + j belongs; mapping from column to supernode. */ + int_t *sup_to_colbeg; /* sup_to_colbeg[s] points to the start of the s-th + supernode; mapping from supernode to column.*/ + int_t *sup_to_colend; /* sup_to_colend[s] points to one past the end of the + s-th supernode; mapping from supernode number to + column. + e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12) + sup_to_colbeg: 0 1 2 4 7 (nsuper=4) + sup_to_colend: 1 2 4 7 12 */ + /* Note: + Zero-based indexing is used; + nzval_colptr[], rowind_colptr[], col_to_sup and + sup_to_col[] have ncol+1 entries, the last one + pointing beyond the last column. */ +} SCPformat; + +/* Stype == SLU_NCP */ +typedef struct { + int_t nnz; /* number of nonzeros in the matrix */ + void *nzval; /* pointer to array of nonzero values, packed by column */ + int_t *rowind;/* pointer to array of row indices of the nonzeros */ + /* Note: nzval[]/rowind[] always have the same length */ + int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] + and rowind[] */ + int_t *colend;/* colend[j] points to one past the last element of column + j in nzval[] and rowind[] */ + /* Note: + Zero-based indexing is used; + The consecutive columns of the nonzeros may not be + contiguous in storage, because the matrix has been + postmultiplied by a column permutation matrix. */ +} NCPformat; + +/* Stype == SLU_DN */ +typedef struct { + int_t lda; /* leading dimension */ + void *nzval; /* array of size lda*ncol to represent a dense matrix */ +} DNformat; + +/* Stype == SLU_NR_loc (Distributed Compressed Row Format) */ +typedef struct { + int_t nnz_loc; /* number of nonzeros in the local submatrix */ + int_t m_loc; /* number of rows local to this processor */ + int_t fst_row; /* global index of the first row */ + void *nzval; /* pointer to array of nonzero values, packed by row */ + int_t *rowptr; /* pointer to array of beginning of rows in nzval[] + and colind[] */ + int_t *colind; /* pointer to array of column indices of the nonzeros */ + /* Note: + Zero-based indexing is used; + rowptr[] has n_loc + 1 entries, the last one pointing + beyond the last row, so that rowptr[n_loc] = nnz_loc.*/ +} NRformat_loc; + + +#endif /* __SUPERLU_SUPERMATRIX */ diff --git a/src/maths/SuperLU/Makefile.am b/src/maths/SuperLU/Makefile.am new file mode 100644 index 000000000..bc034d3e9 --- /dev/null +++ b/src/maths/SuperLU/Makefile.am @@ -0,0 +1,146 @@ +## Process this file with automake to produce Makefile.in + +#TRY TO FIND A WAY NOT TO OPTIMIZE superlu_timer AND dlamch COMPILATION + +noinst_LTLIBRARIES = libSuperLU.la libSuperLU_real.la libSuperLU_complex.la + +libSuperLU_real_la_SOURCES = \ + colamd.c \ + dcolumn_bmod.c \ + dcolumn_dfs.c \ + dcopy_to_ucol.c \ + ddiagonal.c \ + dgscon.c \ + dgsequ.c \ + dgsisx.c \ + dgsitrf.c \ + dgsrfs.c \ + dgssv.c \ + dgssvx.c \ + dgstrf.c \ + dgstrs.c \ + dlacon.c \ + dlamch.c \ + dlangs.c \ + dlaqgs.c \ + dldperm.c \ + dmemory.c \ + dmyblas2.c \ + dpanel_bmod.c \ + dpanel_dfs.c \ + dpivotgrowth.c \ + dpivotL.c \ + dpruneL.c \ + dreadhb.c \ + dreadrb.c \ + dreadtriple.c \ + dsnode_bmod.c \ + dsnode_dfs.c \ + dsp_blas2.c \ + dsp_blas3.c \ + dutil.c \ + get_perm_c.c \ + heap_relax_snode.c \ + ilu_dcolumn_dfs.c \ + ilu_dcopy_to_ucol.c \ + ilu_ddrop_row.c \ + ilu_dpanel_dfs.c \ + ilu_dpivotL.c \ + ilu_dsnode_dfs.c \ + ilu_heap_relax_snode.c \ + ilu_relax_snode.c \ + lsame.c \ + mark_relax.c \ + mc64ad.c \ + memory.c \ + mmd.c \ + qselect.c \ + relax_snode.c \ + sp_coletree.c \ + sp_ienv.c \ + sp_preorder.c \ + superlu_timer.c \ + util.c \ + xerbla.c + +libSuperLU_real_la_CPPFLAGS = -DAdd_ -DUSE_VENDOR_BLAS -I$(top_srcdir)/src/include + +libSuperLU_real_la_LDFLAGS = -L/usr/lib -lblas + + +libSuperLU_complex_la_SOURCES = \ + colamd.c \ + dcomplex.c \ + dlamch.c \ + dzsum1.c \ + get_perm_c.c \ + heap_relax_snode.c \ + ilu_heap_relax_snode.c \ + ilu_relax_snode.c \ + ilu_zcolumn_dfs.c \ + ilu_zcopy_to_ucol.c \ + ilu_zdrop_row.c \ + ilu_zpanel_dfs.c \ + ilu_zpivotL.c \ + ilu_zsnode_dfs.c \ + izmax1.c \ + lsame.c \ + mark_relax.c \ + mc64ad.c \ + memory.c \ + mmd.c \ + qselect.c \ + relax_snode.c \ + sp_coletree.c \ + sp_ienv.c \ + sp_preorder.c \ + superlu_timer.c \ + util.c \ + xerbla.c \ + zgssv.c \ + zgssvx.c \ + zsp_blas2.c \ + zsp_blas3.c \ + zgscon.c \ + zlangs.c \ + zgsequ.c \ + zlaqgs.c \ + zpivotgrowth.c \ + zgsrfs.c \ + zgstrf.c \ + zgstrs.c \ + zcopy_to_ucol.c \ + zsnode_dfs.c \ + zsnode_bmod.c \ + zpanel_dfs.c \ + zpanel_bmod.c \ + zreadhb.c \ + zreadrb.c \ + zreadtriple.c \ + zcolumn_dfs.c \ + zcolumn_bmod.c \ + zpivotL.c \ + zpruneL.c \ + zmemory.c \ + zutil.c \ + zmyblas2.c \ + zgsisx.c \ + zgsitrf.c \ + zldperm.c \ + zdiagonal.c \ + zlacon.c + +libSuperLU_complex_la_CPPFLAGS = -DAdd_ -DUSE_VENDOR_BLAS -I$(top_srcdir)/src/include + +libSuperLU_complex_la_LDFLAGS = -L/usr/lib -lblas + + +libSuperLU_la_SOURCES = \ + superlusmp.c + +libSuperLU_la_LIBADD = \ + libSuperLU_real.la \ + libSuperLU_complex.la + + +MAINTAINERCLEANFILES = Makefile.in diff --git a/src/maths/SuperLU/colamd.c b/src/maths/SuperLU/colamd.c new file mode 100644 index 000000000..72c4390dc --- /dev/null +++ b/src/maths/SuperLU/colamd.c @@ -0,0 +1,3414 @@ +/*! @file colamd.c + *\brief A sparse matrix column ordering algorithm + +
+    ========================================================================== 
+    === colamd/symamd - a sparse matrix column ordering algorithm ============ 
+    ========================================================================== 
+
+
+    colamd:  an approximate minimum degree column ordering algorithm,
+    	for LU factorization of symmetric or unsymmetric matrices,
+	QR factorization, least squares, interior point methods for
+	linear programming problems, and other related problems.
+
+    symamd:  an approximate minimum degree ordering algorithm for Cholesky
+    	factorization of symmetric matrices.
+
+    Purpose:
+
+	Colamd computes a permutation Q such that the Cholesky factorization of
+	(AQ)'(AQ) has less fill-in and requires fewer floating point operations
+	than A'A.  This also provides a good ordering for sparse partial
+	pivoting methods, P(AQ) = LU, where Q is computed prior to numerical
+	factorization, and P is computed during numerical factorization via
+	conventional partial pivoting with row interchanges.  Colamd is the
+	column ordering method used in SuperLU, part of the ScaLAPACK library.
+	It is also available as built-in function in MATLAB Version 6,
+	available from MathWorks, Inc. (http://www.mathworks.com).  This
+	routine can be used in place of colmmd in MATLAB.
+
+    	Symamd computes a permutation P of a symmetric matrix A such that the
+	Cholesky factorization of PAP' has less fill-in and requires fewer
+	floating point operations than A.  Symamd constructs a matrix M such
+	that M'M has the same nonzero pattern of A, and then orders the columns
+	of M using colmmd.  The column ordering of M is then returned as the
+	row and column ordering P of A. 
+
+    Authors:
+
+	The authors of the code itself are Stefan I. Larimore and Timothy A.
+	Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
+	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+	Ng, Oak Ridge National Laboratory.
+
+    Date:
+
+	September 8, 2003.  Version 2.3.
+
+    Acknowledgements:
+
+	This work was supported by the National Science Foundation, under
+	grants DMS-9504974 and DMS-9803599.
+
+    Copyright and License:
+
+	Copyright (c) 1998-2003 by the University of Florida.
+	All Rights Reserved.
+
+	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+
+	Permission is hereby granted to use, copy, modify, and/or distribute
+	this program, provided that the Copyright, this License, and the
+	Availability of the original version is retained on all copies and made
+	accessible to the end-user of any code or package that includes COLAMD
+	or any modified version of COLAMD. 
+
+    Availability:
+
+	The colamd/symamd library is available at
+
+	    http://www.cise.ufl.edu/research/sparse/colamd/
+
+	This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.c
+	file.  It requires the colamd.h file.  It is required by the colamdmex.c
+	and symamdmex.c files, for the MATLAB interface to colamd and symamd.
+
+    See the ChangeLog file for changes since Version 1.0.
+
+    ========================================================================== 
+    === Description of user-callable routines ================================ 
+    ========================================================================== 
+
+
+    ----------------------------------------------------------------------------
+    colamd_recommended:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    int colamd_recommended (int nnz, int n_row, int n_col) ;
+
+	    or as a C macro
+
+	    #include "colamd.h"
+	    Alen = COLAMD_RECOMMENDED (int nnz, int n_row, int n_col) ;
+
+	Purpose:
+
+	    Returns recommended value of Alen for use by colamd.  Returns -1
+	    if any input argument is negative.  The use of this routine
+	    or macro is optional.  Note that the macro uses its arguments
+	    more than once, so be careful for side effects, if you pass
+	    expressions as arguments to COLAMD_RECOMMENDED.  Not needed for
+	    symamd, which dynamically allocates its own memory.
+
+	Arguments (all input arguments):
+
+	    int nnz ;		Number of nonzeros in the matrix A.  This must
+				be the same value as p [n_col] in the call to
+				colamd - otherwise you will get a wrong value
+				of the recommended memory to use.
+
+	    int n_row ;		Number of rows in the matrix A.
+
+	    int n_col ;		Number of columns in the matrix A.
+
+    ----------------------------------------------------------------------------
+    colamd_set_defaults:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    colamd_set_defaults (double knobs [COLAMD_KNOBS]) ;
+
+	Purpose:
+
+	    Sets the default parameters.  The use of this routine is optional.
+
+	Arguments:
+
+	    double knobs [COLAMD_KNOBS] ;	Output only.
+
+		Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col)
+		entries are removed prior to ordering.  Columns with more than
+		(knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to
+		ordering, and placed last in the output column ordering. 
+
+		Symamd: uses only knobs [COLAMD_DENSE_ROW], which is knobs [0].
+		Rows and columns with more than (knobs [COLAMD_DENSE_ROW] * n)
+		entries are removed prior to ordering, and placed last in the
+		output ordering.
+
+		COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
+		respectively, in colamd.h.  Default values of these two knobs
+		are both 0.5.  Currently, only knobs [0] and knobs [1] are
+		used, but future versions may use more knobs.  If so, they will
+		be properly set to their defaults by the future version of
+		colamd_set_defaults, so that the code that calls colamd will
+		not need to change, assuming that you either use
+		colamd_set_defaults, or pass a (double *) NULL pointer as the
+		knobs array to colamd or symamd.
+
+    ----------------------------------------------------------------------------
+    colamd:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    int colamd (int n_row, int n_col, int Alen, int *A, int *p,
+	    	double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS]) ;
+
+	Purpose:
+
+	    Computes a column ordering (Q) of A such that P(AQ)=LU or
+	    (AQ)'AQ=LL' have less fill-in and require fewer floating point
+	    operations than factorizing the unpermuted matrix A or A'A,
+	    respectively.
+	    
+	Returns:
+
+	    TRUE (1) if successful, FALSE (0) otherwise.
+
+	Arguments:
+
+	    int n_row ;		Input argument.
+
+		Number of rows in the matrix A.
+		Restriction:  n_row >= 0.
+		Colamd returns FALSE if n_row is negative.
+
+	    int n_col ;		Input argument.
+
+		Number of columns in the matrix A.
+		Restriction:  n_col >= 0.
+		Colamd returns FALSE if n_col is negative.
+
+	    int Alen ;		Input argument.
+
+		Restriction (see note):
+		Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col
+		Colamd returns FALSE if these conditions are not met.
+
+		Note:  this restriction makes an modest assumption regarding
+		the size of the two typedef's structures in colamd.h.
+		We do, however, guarantee that
+
+			Alen >= colamd_recommended (nnz, n_row, n_col)
+		
+		or equivalently as a C preprocessor macro: 
+
+			Alen >= COLAMD_RECOMMENDED (nnz, n_row, n_col)
+
+		will be sufficient.
+
+	    int A [Alen] ;	Input argument, undefined on output.
+
+		A is an integer array of size Alen.  Alen must be at least as
+		large as the bare minimum value given above, but this is very
+		low, and can result in excessive run time.  For best
+		performance, we recommend that Alen be greater than or equal to
+		colamd_recommended (nnz, n_row, n_col), which adds
+		nnz/5 to the bare minimum value given above.
+
+		On input, the row indices of the entries in column c of the
+		matrix are held in A [(p [c]) ... (p [c+1]-1)].  The row indices
+		in a given column c need not be in ascending order, and
+		duplicate row indices may be be present.  However, colamd will
+		work a little faster if both of these conditions are met
+		(Colamd puts the matrix into this format, if it finds that the
+		the conditions are not met).
+
+		The matrix is 0-based.  That is, rows are in the range 0 to
+		n_row-1, and columns are in the range 0 to n_col-1.  Colamd
+		returns FALSE if any row index is out of range.
+
+		The contents of A are modified during ordering, and are
+		undefined on output.
+
+	    int p [n_col+1] ;	Both input and output argument.
+
+		p is an integer array of size n_col+1.  On input, it holds the
+		"pointers" for the column form of the matrix A.  Column c of
+		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
+		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+		for all c in the range 0 to n_col-1.  The value p [n_col] is
+		thus the total number of entries in the pattern of the matrix A.
+		Colamd returns FALSE if these conditions are not met.
+
+		On output, if colamd returns TRUE, the array p holds the column
+		permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is
+		the first column index in the new ordering, and p [n_col-1] is
+		the last.  That is, p [k] = j means that column j of A is the
+		kth pivot column, in AQ, where k is in the range 0 to n_col-1
+		(p [0] = j means that column j of A is the first column in AQ).
+
+		If colamd returns FALSE, then no permutation is returned, and
+		p is undefined on output.
+
+	    double knobs [COLAMD_KNOBS] ;	Input argument.
+
+		See colamd_set_defaults for a description.
+
+	    int stats [COLAMD_STATS] ;		Output argument.
+
+		Statistics on the ordering, and error status.
+		See colamd.h for related definitions.
+		Colamd returns FALSE if stats is not present.
+
+		stats [0]:  number of dense or empty rows ignored.
+
+		stats [1]:  number of dense or empty columns ignored (and
+				ordered last in the output permutation p)
+				Note that a row can become "empty" if it
+				contains only "dense" and/or "empty" columns,
+				and similarly a column can become "empty" if it
+				only contains "dense" and/or "empty" rows.
+
+		stats [2]:  number of garbage collections performed.
+				This can be excessively high if Alen is close
+				to the minimum required value.
+
+		stats [3]:  status code.  < 0 is an error code.
+			    > 1 is a warning or notice.
+
+			0	OK.  Each column of the input matrix contained
+				row indices in increasing order, with no
+				duplicates.
+
+			1	OK, but columns of input matrix were jumbled
+				(unsorted columns or duplicate entries).  Colamd
+				had to do some extra work to sort the matrix
+				first and remove duplicate entries, but it
+				still was able to return a valid permutation
+				(return value of colamd was TRUE).
+
+					stats [4]: highest numbered column that
+						is unsorted or has duplicate
+						entries.
+					stats [5]: last seen duplicate or
+						unsorted row index.
+					stats [6]: number of duplicate or
+						unsorted row indices.
+
+			-1	A is a null pointer
+
+			-2	p is a null pointer
+
+			-3 	n_row is negative
+
+					stats [4]: n_row
+
+			-4	n_col is negative
+
+					stats [4]: n_col
+
+			-5	number of nonzeros in matrix is negative
+
+					stats [4]: number of nonzeros, p [n_col]
+
+			-6	p [0] is nonzero
+
+					stats [4]: p [0]
+
+			-7	A is too small
+
+					stats [4]: required size
+					stats [5]: actual size (Alen)
+
+			-8	a column has a negative number of entries
+
+					stats [4]: column with < 0 entries
+					stats [5]: number of entries in col
+
+			-9	a row index is out of bounds
+
+					stats [4]: column with bad row index
+					stats [5]: bad row index
+					stats [6]: n_row, # of rows of matrx
+
+			-10	(unused; see symamd.c)
+
+			-999	(unused; see symamd.c)
+
+		Future versions may return more statistics in the stats array.
+
+	Example:
+	
+	    See http://www.cise.ufl.edu/research/sparse/colamd/example.c
+	    for a complete example.
+
+	    To order the columns of a 5-by-4 matrix with 11 nonzero entries in
+	    the following nonzero pattern
+
+	    	x 0 x 0
+		x 0 x x
+		0 x x 0
+		0 0 x x
+		x x 0 0
+
+	    with default knobs and no output statistics, do the following:
+
+		#include "colamd.h"
+		#define ALEN COLAMD_RECOMMENDED (11, 5, 4)
+		int A [ALEN] = {1, 2, 5, 3, 5, 1, 2, 3, 4, 2, 4} ;
+		int p [ ] = {0, 3, 5, 9, 11} ;
+		int stats [COLAMD_STATS] ;
+		colamd (5, 4, ALEN, A, p, (double *) NULL, stats) ;
+
+	    The permutation is returned in the array p, and A is destroyed.
+
+    ----------------------------------------------------------------------------
+    symamd:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    int symamd (int n, int *A, int *p, int *perm,
+	    	double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS],
+		void (*allocate) (size_t, size_t), void (*release) (void *)) ;
+
+	Purpose:
+
+    	    The symamd routine computes an ordering P of a symmetric sparse
+	    matrix A such that the Cholesky factorization PAP' = LL' remains
+	    sparse.  It is based on a column ordering of a matrix M constructed
+	    so that the nonzero pattern of M'M is the same as A.  The matrix A
+	    is assumed to be symmetric; only the strictly lower triangular part
+	    is accessed.  You must pass your selected memory allocator (usually
+	    calloc/free or mxCalloc/mxFree) to symamd, for it to allocate
+	    memory for the temporary matrix M.
+
+	Returns:
+
+	    TRUE (1) if successful, FALSE (0) otherwise.
+
+	Arguments:
+
+	    int n ;		Input argument.
+
+	    	Number of rows and columns in the symmetrix matrix A.
+		Restriction:  n >= 0.
+		Symamd returns FALSE if n is negative.
+
+	    int A [nnz] ;	Input argument.
+
+	    	A is an integer array of size nnz, where nnz = p [n].
+		
+		The row indices of the entries in column c of the matrix are
+		held in A [(p [c]) ... (p [c+1]-1)].  The row indices in a
+		given column c need not be in ascending order, and duplicate
+		row indices may be present.  However, symamd will run faster
+		if the columns are in sorted order with no duplicate entries. 
+
+		The matrix is 0-based.  That is, rows are in the range 0 to
+		n-1, and columns are in the range 0 to n-1.  Symamd
+		returns FALSE if any row index is out of range.
+
+		The contents of A are not modified.
+
+	    int p [n+1] ;   	Input argument.
+
+		p is an integer array of size n+1.  On input, it holds the
+		"pointers" for the column form of the matrix A.  Column c of
+		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
+		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+		for all c in the range 0 to n-1.  The value p [n] is
+		thus the total number of entries in the pattern of the matrix A.
+		Symamd returns FALSE if these conditions are not met.
+
+		The contents of p are not modified.
+
+	    int perm [n+1] ;   	Output argument.
+
+		On output, if symamd returns TRUE, the array perm holds the
+		permutation P, where perm [0] is the first index in the new
+		ordering, and perm [n-1] is the last.  That is, perm [k] = j
+		means that row and column j of A is the kth column in PAP',
+		where k is in the range 0 to n-1 (perm [0] = j means
+		that row and column j of A are the first row and column in
+		PAP').  The array is used as a workspace during the ordering,
+		which is why it must be of length n+1, not just n.
+
+	    double knobs [COLAMD_KNOBS] ;	Input argument.
+
+		See colamd_set_defaults for a description.
+
+	    int stats [COLAMD_STATS] ;		Output argument.
+
+		Statistics on the ordering, and error status.
+		See colamd.h for related definitions.
+		Symamd returns FALSE if stats is not present.
+
+		stats [0]:  number of dense or empty row and columns ignored
+				(and ordered last in the output permutation 
+				perm).  Note that a row/column can become
+				"empty" if it contains only "dense" and/or
+				"empty" columns/rows.
+
+		stats [1]:  (same as stats [0])
+
+		stats [2]:  number of garbage collections performed.
+
+		stats [3]:  status code.  < 0 is an error code.
+			    > 1 is a warning or notice.
+
+			0	OK.  Each column of the input matrix contained
+				row indices in increasing order, with no
+				duplicates.
+
+			1	OK, but columns of input matrix were jumbled
+				(unsorted columns or duplicate entries).  Symamd
+				had to do some extra work to sort the matrix
+				first and remove duplicate entries, but it
+				still was able to return a valid permutation
+				(return value of symamd was TRUE).
+
+					stats [4]: highest numbered column that
+						is unsorted or has duplicate
+						entries.
+					stats [5]: last seen duplicate or
+						unsorted row index.
+					stats [6]: number of duplicate or
+						unsorted row indices.
+
+			-1	A is a null pointer
+
+			-2	p is a null pointer
+
+			-3	(unused, see colamd.c)
+
+			-4 	n is negative
+
+					stats [4]: n
+
+			-5	number of nonzeros in matrix is negative
+
+					stats [4]: # of nonzeros (p [n]).
+
+			-6	p [0] is nonzero
+
+					stats [4]: p [0]
+
+			-7	(unused)
+
+			-8	a column has a negative number of entries
+
+					stats [4]: column with < 0 entries
+					stats [5]: number of entries in col
+
+			-9	a row index is out of bounds
+
+					stats [4]: column with bad row index
+					stats [5]: bad row index
+					stats [6]: n_row, # of rows of matrx
+
+			-10	out of memory (unable to allocate temporary
+				workspace for M or count arrays using the
+				"allocate" routine passed into symamd).
+
+			-999	internal error.  colamd failed to order the
+				matrix M, when it should have succeeded.  This
+				indicates a bug.  If this (and *only* this)
+				error code occurs, please contact the authors.
+				Don't contact the authors if you get any other
+				error code.
+
+		Future versions may return more statistics in the stats array.
+
+	    void * (*allocate) (size_t, size_t)
+
+	    	A pointer to a function providing memory allocation.  The
+		allocated memory must be returned initialized to zero.  For a
+		C application, this argument should normally be a pointer to
+		calloc.  For a MATLAB mexFunction, the routine mxCalloc is
+		passed instead.
+
+	    void (*release) (size_t, size_t)
+
+	    	A pointer to a function that frees memory allocated by the
+		memory allocation routine above.  For a C application, this
+		argument should normally be a pointer to free.  For a MATLAB
+		mexFunction, the routine mxFree is passed instead.
+
+
+    ----------------------------------------------------------------------------
+    colamd_report:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    colamd_report (int stats [COLAMD_STATS]) ;
+
+	Purpose:
+
+	    Prints the error status and statistics recorded in the stats
+	    array on the standard error output (for a standard C routine)
+	    or on the MATLAB output (for a mexFunction).
+
+	Arguments:
+
+	    int stats [COLAMD_STATS] ;	Input only.  Statistics from colamd.
+
+
+    ----------------------------------------------------------------------------
+    symamd_report:
+    ----------------------------------------------------------------------------
+
+	C syntax:
+
+	    #include "colamd.h"
+	    symamd_report (int stats [COLAMD_STATS]) ;
+
+	Purpose:
+
+	    Prints the error status and statistics recorded in the stats
+	    array on the standard error output (for a standard C routine)
+	    or on the MATLAB output (for a mexFunction).
+
+	Arguments:
+
+	    int stats [COLAMD_STATS] ;	Input only.  Statistics from symamd.
+
+ 
+*/ + +/* ========================================================================== */ +/* === Scaffolding code definitions ======================================== */ +/* ========================================================================== */ + +/* Ensure that debugging is turned off: */ +#ifndef NDEBUG +#define NDEBUG +#endif /* NDEBUG */ + +/* + Our "scaffolding code" philosophy: In our opinion, well-written library + code should keep its "debugging" code, and just normally have it turned off + by the compiler so as not to interfere with performance. This serves + several purposes: + + (1) assertions act as comments to the reader, telling you what the code + expects at that point. All assertions will always be true (unless + there really is a bug, of course). + + (2) leaving in the scaffolding code assists anyone who would like to modify + the code, or understand the algorithm (by reading the debugging output, + one can get a glimpse into what the code is doing). + + (3) (gasp!) for actually finding bugs. This code has been heavily tested + and "should" be fully functional and bug-free ... but you never know... + + To enable debugging, comment out the "#define NDEBUG" above. For a MATLAB + mexFunction, you will also need to modify mexopts.sh to remove the -DNDEBUG + definition. The code will become outrageously slow when debugging is + enabled. To control the level of debugging output, set an environment + variable D to 0 (little), 1 (some), 2, 3, or 4 (lots). When debugging, + you should see the following message on the standard output: + + colamd: debug version, D = 1 (THIS WILL BE SLOW!) + + or a similar message for symamd. If you don't, then debugging has not + been enabled. + +*/ + +/* ========================================================================== */ +/* === Include files ======================================================== */ +/* ========================================================================== */ + +#include "colamd.h" +#include + +#ifdef MATLAB_MEX_FILE +#include "mex.h" +#include "matrix.h" +#else +#include +#include +#endif /* MATLAB_MEX_FILE */ + +/* ========================================================================== */ +/* === Definitions ========================================================== */ +/* ========================================================================== */ + +/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */ +#define PUBLIC +#define PRIVATE static + +#define MAX(a,b) (((a) > (b)) ? (a) : (b)) +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) + +#define ONES_COMPLEMENT(r) (-(r)-1) + +/* -------------------------------------------------------------------------- */ +/* Change for version 2.1: define TRUE and FALSE only if not yet defined */ +/* -------------------------------------------------------------------------- */ + +#ifndef TRUE +#define TRUE (1) +#endif + +#ifndef FALSE +#define FALSE (0) +#endif + +/* -------------------------------------------------------------------------- */ + +#define EMPTY (-1) + +/* Row and column status */ +#define ALIVE (0) +#define DEAD (-1) + +/* Column status */ +#define DEAD_PRINCIPAL (-1) +#define DEAD_NON_PRINCIPAL (-2) + +/* Macros for row and column status update and checking. */ +#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) +#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE) +#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE) +#define COL_IS_DEAD(c) (Col [c].start < ALIVE) +#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE) +#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) +#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; } +#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } +#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } + +/* ========================================================================== */ +/* === Colamd reporting mechanism =========================================== */ +/* ========================================================================== */ + +#ifdef MATLAB_MEX_FILE + +/* use mexPrintf in a MATLAB mexFunction, for debugging and statistics output */ +#define PRINTF mexPrintf + +/* In MATLAB, matrices are 1-based to the user, but 0-based internally */ +#define INDEX(i) ((i)+1) + +#else + +/* Use printf in standard C environment, for debugging and statistics output. */ +/* Output is generated only if debugging is enabled at compile time, or if */ +/* the caller explicitly calls colamd_report or symamd_report. */ +#define PRINTF printf + +/* In C, matrices are 0-based and indices are reported as such in *_report */ +#define INDEX(i) (i) + +#endif /* MATLAB_MEX_FILE */ + +/* ========================================================================== */ +/* === Prototypes of PRIVATE routines ======================================= */ +/* ========================================================================== */ + +PRIVATE int init_rows_cols +( + int n_row, + int n_col, + Colamd_Row Row [], + Colamd_Col Col [], + int A [], + int p [], + int stats [COLAMD_STATS] +) ; + +PRIVATE void init_scoring +( + int n_row, + int n_col, + Colamd_Row Row [], + Colamd_Col Col [], + int A [], + int head [], + double knobs [COLAMD_KNOBS], + int *p_n_row2, + int *p_n_col2, + int *p_max_deg +) ; + +PRIVATE int find_ordering +( + int n_row, + int n_col, + int Alen, + Colamd_Row Row [], + Colamd_Col Col [], + int A [], + int head [], + int n_col2, + int max_deg, + int pfree +) ; + +PRIVATE void order_children +( + int n_col, + Colamd_Col Col [], + int p [] +) ; + +PRIVATE void detect_super_cols +( + +#ifndef NDEBUG + int n_col, + Colamd_Row Row [], +#endif /* NDEBUG */ + + Colamd_Col Col [], + int A [], + int head [], + int row_start, + int row_length +) ; + +PRIVATE int garbage_collection +( + int n_row, + int n_col, + Colamd_Row Row [], + Colamd_Col Col [], + int A [], + int *pfree +) ; + +PRIVATE int clear_mark +( + int n_row, + Colamd_Row Row [] +) ; + +PRIVATE void print_report +( + char *method, + int stats [COLAMD_STATS] +) ; + +/* ========================================================================== */ +/* === Debugging prototypes and definitions ================================= */ +/* ========================================================================== */ + +#ifndef NDEBUG + +/* colamd_debug is the *ONLY* global variable, and is only */ +/* present when debugging */ + +PRIVATE int colamd_debug ; /* debug print level */ + +#define DEBUG0(params) { (void) PRINTF params ; } +#define DEBUG1(params) { if (colamd_debug >= 1) (void) PRINTF params ; } +#define DEBUG2(params) { if (colamd_debug >= 2) (void) PRINTF params ; } +#define DEBUG3(params) { if (colamd_debug >= 3) (void) PRINTF params ; } +#define DEBUG4(params) { if (colamd_debug >= 4) (void) PRINTF params ; } + +#ifdef MATLAB_MEX_FILE +#define ASSERT(expression) (mxAssert ((expression), "")) +#else +#define ASSERT(expression) (assert (expression)) +#endif /* MATLAB_MEX_FILE */ + +PRIVATE void colamd_get_debug /* gets the debug print level from getenv */ +( + char *method +) ; + +PRIVATE void debug_deg_lists +( + int n_row, + int n_col, + Colamd_Row Row [], + Colamd_Col Col [], + int head [], + int min_score, + int should, + int max_deg +) ; + +PRIVATE void debug_mark +( + int n_row, + Colamd_Row Row [], + int tag_mark, + int max_mark +) ; + +PRIVATE void debug_matrix +( + int n_row, + int n_col, + Colamd_Row Row [], + Colamd_Col Col [], + int A [] +) ; + +PRIVATE void debug_structures +( + int n_row, + int n_col, + Colamd_Row Row [], + Colamd_Col Col [], + int A [], + int n_col2 +) ; + +#else /* NDEBUG */ + +/* === No debugging ========================================================= */ + +#define DEBUG0(params) ; +#define DEBUG1(params) ; +#define DEBUG2(params) ; +#define DEBUG3(params) ; +#define DEBUG4(params) ; + +#define ASSERT(expression) ((void) 0) + +#endif /* NDEBUG */ + +/* ========================================================================== */ + + + +/* ========================================================================== */ +/* === USER-CALLABLE ROUTINES: ============================================== */ +/* ========================================================================== */ + + +/* ========================================================================== */ +/* === colamd_recommended =================================================== */ +/* ========================================================================== */ + +/* + The colamd_recommended routine returns the suggested size for Alen. This + value has been determined to provide good balance between the number of + garbage collections and the memory requirements for colamd. If any + argument is negative, a -1 is returned as an error condition. This + function is also available as a macro defined in colamd.h, so that you + can use it for a statically-allocated array size. +*/ + +PUBLIC int colamd_recommended /* returns recommended value of Alen. */ +( + /* === Parameters ======================================================= */ + + int nnz, /* number of nonzeros in A */ + int n_row, /* number of rows in A */ + int n_col /* number of columns in A */ +) +{ + return (COLAMD_RECOMMENDED (nnz, n_row, n_col)) ; +} + + +/* ========================================================================== */ +/* === colamd_set_defaults ================================================== */ +/* ========================================================================== */ + +/* + The colamd_set_defaults routine sets the default values of the user- + controllable parameters for colamd: + + knobs [0] rows with knobs[0]*n_col entries or more are removed + prior to ordering in colamd. Rows and columns with + knobs[0]*n_col entries or more are removed prior to + ordering in symamd and placed last in the output + ordering. + + knobs [1] columns with knobs[1]*n_row entries or more are removed + prior to ordering in colamd, and placed last in the + column permutation. Symamd ignores this knob. + + knobs [2..19] unused, but future versions might use this +*/ + +PUBLIC void colamd_set_defaults +( + /* === Parameters ======================================================= */ + + double knobs [COLAMD_KNOBS] /* knob array */ +) +{ + /* === Local variables ================================================== */ + + int i ; + + if (!knobs) + { + return ; /* no knobs to initialize */ + } + for (i = 0 ; i < COLAMD_KNOBS ; i++) + { + knobs [i] = 0 ; + } + knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */ + knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */ +} + + +/* ========================================================================== */ +/* === symamd =============================================================== */ +/* ========================================================================== */ + +PUBLIC int symamd /* return TRUE if OK, FALSE otherwise */ +( + /* === Parameters ======================================================= */ + + int n, /* number of rows and columns of A */ + int A [], /* row indices of A */ + int p [], /* column pointers of A */ + int perm [], /* output permutation, size n+1 */ + double knobs [COLAMD_KNOBS], /* parameters (uses defaults if NULL) */ + int stats [COLAMD_STATS], /* output statistics and error codes */ + void * (*allocate) (size_t, size_t), + /* pointer to calloc (ANSI C) or */ + /* mxCalloc (for MATLAB mexFunction) */ + void (*release) (void *) + /* pointer to free (ANSI C) or */ + /* mxFree (for MATLAB mexFunction) */ +) +{ + /* === Local variables ================================================== */ + + int *count ; /* length of each column of M, and col pointer*/ + int *mark ; /* mark array for finding duplicate entries */ + int *M ; /* row indices of matrix M */ + int Mlen ; /* length of M */ + int n_row ; /* number of rows in M */ + int nnz ; /* number of entries in A */ + int i ; /* row index of A */ + int j ; /* column index of A */ + int k ; /* row index of M */ + int mnz ; /* number of nonzeros in M */ + int pp ; /* index into a column of A */ + int last_row ; /* last row seen in the current column */ + int length ; /* number of nonzeros in a column */ + + double cknobs [COLAMD_KNOBS] ; /* knobs for colamd */ + double default_knobs [COLAMD_KNOBS] ; /* default knobs for colamd */ + int cstats [COLAMD_STATS] ; /* colamd stats */ + +#ifndef NDEBUG + colamd_get_debug ("symamd") ; +#endif /* NDEBUG */ + + /* === Check the input arguments ======================================== */ + + if (!stats) + { + DEBUG0 (("symamd: stats not present\n")) ; + return (FALSE) ; + } + for (i = 0 ; i < COLAMD_STATS ; i++) + { + stats [i] = 0 ; + } + stats [COLAMD_STATUS] = COLAMD_OK ; + stats [COLAMD_INFO1] = -1 ; + stats [COLAMD_INFO2] = -1 ; + + if (!A) + { + stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ; + DEBUG0 (("symamd: A not present\n")) ; + return (FALSE) ; + } + + if (!p) /* p is not present */ + { + stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ; + DEBUG0 (("symamd: p not present\n")) ; + return (FALSE) ; + } + + if (n < 0) /* n must be >= 0 */ + { + stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ; + stats [COLAMD_INFO1] = n ; + DEBUG0 (("symamd: n negative %d\n", n)) ; + return (FALSE) ; + } + + nnz = p [n] ; + if (nnz < 0) /* nnz must be >= 0 */ + { + stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ; + stats [COLAMD_INFO1] = nnz ; + DEBUG0 (("symamd: number of entries negative %d\n", nnz)) ; + return (FALSE) ; + } + + if (p [0] != 0) + { + stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ; + stats [COLAMD_INFO1] = p [0] ; + DEBUG0 (("symamd: p[0] not zero %d\n", p [0])) ; + return (FALSE) ; + } + + /* === If no knobs, set default knobs =================================== */ + + if (!knobs) + { + colamd_set_defaults (default_knobs) ; + knobs = default_knobs ; + } + + /* === Allocate count and mark ========================================== */ + + count = (int *) ((*allocate) (n+1, sizeof (int))) ; + if (!count) + { + stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ; + DEBUG0 (("symamd: allocate count (size %d) failed\n", n+1)) ; + return (FALSE) ; + } + + mark = (int *) ((*allocate) (n+1, sizeof (int))) ; + if (!mark) + { + stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ; + (*release) ((void *) count) ; + DEBUG0 (("symamd: allocate mark (size %d) failed\n", n+1)) ; + return (FALSE) ; + } + + /* === Compute column counts of M, check if A is valid ================== */ + + stats [COLAMD_INFO3] = 0 ; /* number of duplicate or unsorted row indices*/ + + for (i = 0 ; i < n ; i++) + { + mark [i] = -1 ; + } + + for (j = 0 ; j < n ; j++) + { + last_row = -1 ; + + length = p [j+1] - p [j] ; + if (length < 0) + { + /* column pointers must be non-decreasing */ + stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; + stats [COLAMD_INFO1] = j ; + stats [COLAMD_INFO2] = length ; + (*release) ((void *) count) ; + (*release) ((void *) mark) ; + DEBUG0 (("symamd: col %d negative length %d\n", j, length)) ; + return (FALSE) ; + } + + for (pp = p [j] ; pp < p [j+1] ; pp++) + { + i = A [pp] ; + if (i < 0 || i >= n) + { + /* row index i, in column j, is out of bounds */ + stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ; + stats [COLAMD_INFO1] = j ; + stats [COLAMD_INFO2] = i ; + stats [COLAMD_INFO3] = n ; + (*release) ((void *) count) ; + (*release) ((void *) mark) ; + DEBUG0 (("symamd: row %d col %d out of bounds\n", i, j)) ; + return (FALSE) ; + } + + if (i <= last_row || mark [i] == j) + { + /* row index is unsorted or repeated (or both), thus col */ + /* is jumbled. This is a notice, not an error condition. */ + stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ; + stats [COLAMD_INFO1] = j ; + stats [COLAMD_INFO2] = i ; + (stats [COLAMD_INFO3]) ++ ; + DEBUG1 (("symamd: row %d col %d unsorted/duplicate\n", i, j)) ; + } + + if (i > j && mark [i] != j) + { + /* row k of M will contain column indices i and j */ + count [i]++ ; + count [j]++ ; + } + + /* mark the row as having been seen in this column */ + mark [i] = j ; + + last_row = i ; + } + } + + if (stats [COLAMD_STATUS] == COLAMD_OK) + { + /* if there are no duplicate entries, then mark is no longer needed */ + (*release) ((void *) mark) ; + } + + /* === Compute column pointers of M ===================================== */ + + /* use output permutation, perm, for column pointers of M */ + perm [0] = 0 ; + for (j = 1 ; j <= n ; j++) + { + perm [j] = perm [j-1] + count [j-1] ; + } + for (j = 0 ; j < n ; j++) + { + count [j] = perm [j] ; + } + + /* === Construct M ====================================================== */ + + mnz = perm [n] ; + n_row = mnz / 2 ; + Mlen = colamd_recommended (mnz, n_row, n) ; + M = (int *) ((*allocate) (Mlen, sizeof (int))) ; + DEBUG0 (("symamd: M is %d-by-%d with %d entries, Mlen = %d\n", + n_row, n, mnz, Mlen)) ; + + if (!M) + { + stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ; + (*release) ((void *) count) ; + (*release) ((void *) mark) ; + DEBUG0 (("symamd: allocate M (size %d) failed\n", Mlen)) ; + return (FALSE) ; + } + + k = 0 ; + + if (stats [COLAMD_STATUS] == COLAMD_OK) + { + /* Matrix is OK */ + for (j = 0 ; j < n ; j++) + { + ASSERT (p [j+1] - p [j] >= 0) ; + for (pp = p [j] ; pp < p [j+1] ; pp++) + { + i = A [pp] ; + ASSERT (i >= 0 && i < n) ; + if (i > j) + { + /* row k of M contains column indices i and j */ + M [count [i]++] = k ; + M [count [j]++] = k ; + k++ ; + } + } + } + } + else + { + /* Matrix is jumbled. Do not add duplicates to M. Unsorted cols OK. */ + DEBUG0 (("symamd: Duplicates in A.\n")) ; + for (i = 0 ; i < n ; i++) + { + mark [i] = -1 ; + } + for (j = 0 ; j < n ; j++) + { + ASSERT (p [j+1] - p [j] >= 0) ; + for (pp = p [j] ; pp < p [j+1] ; pp++) + { + i = A [pp] ; + ASSERT (i >= 0 && i < n) ; + if (i > j && mark [i] != j) + { + /* row k of M contains column indices i and j */ + M [count [i]++] = k ; + M [count [j]++] = k ; + k++ ; + mark [i] = j ; + } + } + } + (*release) ((void *) mark) ; + } + + /* count and mark no longer needed */ + (*release) ((void *) count) ; + ASSERT (k == n_row) ; + + /* === Adjust the knobs for M =========================================== */ + + for (i = 0 ; i < COLAMD_KNOBS ; i++) + { + cknobs [i] = knobs [i] ; + } + + /* there are no dense rows in M */ + cknobs [COLAMD_DENSE_ROW] = 1.0 ; + + if (n_row != 0 && n < n_row) + { + /* On input, the knob is a fraction of 1..n, the number of rows of A. */ + /* Convert it to a fraction of 1..n_row, of the number of rows of M. */ + cknobs [COLAMD_DENSE_COL] = (knobs [COLAMD_DENSE_ROW] * n) / n_row ; + } + else + { + /* no dense columns in M */ + cknobs [COLAMD_DENSE_COL] = 1.0 ; + } + + DEBUG0 (("symamd: dense col knob for M: %g\n", cknobs [COLAMD_DENSE_COL])) ; + + /* === Order the columns of M =========================================== */ + + if (!colamd (n_row, n, Mlen, M, perm, cknobs, cstats)) + { + /* This "cannot" happen, unless there is a bug in the code. */ + stats [COLAMD_STATUS] = COLAMD_ERROR_internal_error ; + (*release) ((void *) M) ; + DEBUG0 (("symamd: internal error!\n")) ; + return (FALSE) ; + } + + /* Note that the output permutation is now in perm */ + + /* === get the statistics for symamd from colamd ======================== */ + + /* note that a dense column in colamd means a dense row and col in symamd */ + stats [COLAMD_DENSE_ROW] = cstats [COLAMD_DENSE_COL] ; + stats [COLAMD_DENSE_COL] = cstats [COLAMD_DENSE_COL] ; + stats [COLAMD_DEFRAG_COUNT] = cstats [COLAMD_DEFRAG_COUNT] ; + + /* === Free M =========================================================== */ + + (*release) ((void *) M) ; + DEBUG0 (("symamd: done.\n")) ; + return (TRUE) ; + +} + +/* ========================================================================== */ +/* === colamd =============================================================== */ +/* ========================================================================== */ + +/* + The colamd routine computes a column ordering Q of a sparse matrix + A such that the LU factorization P(AQ) = LU remains sparse, where P is + selected via partial pivoting. The routine can also be viewed as + providing a permutation Q such that the Cholesky factorization + (AQ)'(AQ) = LL' remains sparse. +*/ + +PUBLIC int colamd /* returns TRUE if successful, FALSE otherwise*/ +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows in A */ + int n_col, /* number of columns in A */ + int Alen, /* length of A */ + int A [], /* row indices of A */ + int p [], /* pointers to columns in A */ + double knobs [COLAMD_KNOBS],/* parameters (uses defaults if NULL) */ + int stats [COLAMD_STATS] /* output statistics and error codes */ +) +{ + /* === Local variables ================================================== */ + + int i ; /* loop index */ + int nnz ; /* nonzeros in A */ + int Row_size ; /* size of Row [], in integers */ + int Col_size ; /* size of Col [], in integers */ + int need ; /* minimum required length of A */ + Colamd_Row *Row ; /* pointer into A of Row [0..n_row] array */ + Colamd_Col *Col ; /* pointer into A of Col [0..n_col] array */ + int n_col2 ; /* number of non-dense, non-empty columns */ + int n_row2 ; /* number of non-dense, non-empty rows */ + int ngarbage ; /* number of garbage collections performed */ + int max_deg ; /* maximum row degree */ + double default_knobs [COLAMD_KNOBS] ; /* default knobs array */ + +#ifndef NDEBUG + colamd_get_debug ("colamd") ; +#endif /* NDEBUG */ + + /* === Check the input arguments ======================================== */ + + if (!stats) + { + DEBUG0 (("colamd: stats not present\n")) ; + return (FALSE) ; + } + for (i = 0 ; i < COLAMD_STATS ; i++) + { + stats [i] = 0 ; + } + stats [COLAMD_STATUS] = COLAMD_OK ; + stats [COLAMD_INFO1] = -1 ; + stats [COLAMD_INFO2] = -1 ; + + if (!A) /* A is not present */ + { + stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ; + DEBUG0 (("colamd: A not present\n")) ; + return (FALSE) ; + } + + if (!p) /* p is not present */ + { + stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ; + DEBUG0 (("colamd: p not present\n")) ; + return (FALSE) ; + } + + if (n_row < 0) /* n_row must be >= 0 */ + { + stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ; + stats [COLAMD_INFO1] = n_row ; + DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; + return (FALSE) ; + } + + if (n_col < 0) /* n_col must be >= 0 */ + { + stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ; + stats [COLAMD_INFO1] = n_col ; + DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; + return (FALSE) ; + } + + nnz = p [n_col] ; + if (nnz < 0) /* nnz must be >= 0 */ + { + stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ; + stats [COLAMD_INFO1] = nnz ; + DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; + return (FALSE) ; + } + + if (p [0] != 0) + { + stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ; + stats [COLAMD_INFO1] = p [0] ; + DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; + return (FALSE) ; + } + + /* === If no knobs, set default knobs =================================== */ + + if (!knobs) + { + colamd_set_defaults (default_knobs) ; + knobs = default_knobs ; + } + + /* === Allocate the Row and Col arrays from array A ===================== */ + + Col_size = COLAMD_C (n_col) ; + Row_size = COLAMD_R (n_row) ; + need = 2*nnz + n_col + Col_size + Row_size ; + + if (need > Alen) + { + /* not enough space in array A to perform the ordering */ + stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ; + stats [COLAMD_INFO1] = need ; + stats [COLAMD_INFO2] = Alen ; + DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); + return (FALSE) ; + } + + Alen -= Col_size + Row_size ; + Col = (Colamd_Col *) &A [Alen] ; + Row = (Colamd_Row *) &A [Alen + Col_size] ; + + /* === Construct the row and column data structures ===================== */ + + if (!init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) + { + /* input matrix is invalid */ + DEBUG0 (("colamd: Matrix invalid\n")) ; + return (FALSE) ; + } + + /* === Initialize scores, kill dense rows/columns ======================= */ + + init_scoring (n_row, n_col, Row, Col, A, p, knobs, + &n_row2, &n_col2, &max_deg) ; + + /* === Order the supercolumns =========================================== */ + + ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p, + n_col2, max_deg, 2*nnz) ; + + /* === Order the non-principal columns ================================== */ + + order_children (n_col, Col, p) ; + + /* === Return statistics in stats ======================================= */ + + stats [COLAMD_DENSE_ROW] = n_row - n_row2 ; + stats [COLAMD_DENSE_COL] = n_col - n_col2 ; + stats [COLAMD_DEFRAG_COUNT] = ngarbage ; + DEBUG0 (("colamd: done.\n")) ; + return (TRUE) ; +} + + +/* ========================================================================== */ +/* === colamd_report ======================================================== */ +/* ========================================================================== */ + +PUBLIC void colamd_report +( + int stats [COLAMD_STATS] +) +{ + print_report ("colamd", stats) ; +} + + +/* ========================================================================== */ +/* === symamd_report ======================================================== */ +/* ========================================================================== */ + +PUBLIC void symamd_report +( + int stats [COLAMD_STATS] +) +{ + print_report ("symamd", stats) ; +} + + + +/* ========================================================================== */ +/* === NON-USER-CALLABLE ROUTINES: ========================================== */ +/* ========================================================================== */ + +/* There are no user-callable routines beyond this point in the file */ + + +/* ========================================================================== */ +/* === init_rows_cols ======================================================= */ +/* ========================================================================== */ + +/* + Takes the column form of the matrix in A and creates the row form of the + matrix. Also, row and column attributes are stored in the Col and Row + structs. If the columns are un-sorted or contain duplicate row indices, + this routine will also sort and remove duplicate row indices from the + column form of the matrix. Returns FALSE if the matrix is invalid, + TRUE otherwise. Not user-callable. +*/ + +PRIVATE int init_rows_cols /* returns TRUE if OK, or FALSE otherwise */ +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows of A */ + int n_col, /* number of columns of A */ + Colamd_Row Row [], /* of size n_row+1 */ + Colamd_Col Col [], /* of size n_col+1 */ + int A [], /* row indices of A, of size Alen */ + int p [], /* pointers to columns in A, of size n_col+1 */ + int stats [COLAMD_STATS] /* colamd statistics */ +) +{ + /* === Local variables ================================================== */ + + int col ; /* a column index */ + int row ; /* a row index */ + int *cp ; /* a column pointer */ + int *cp_end ; /* a pointer to the end of a column */ + int *rp ; /* a row pointer */ + int *rp_end ; /* a pointer to the end of a row */ + int last_row ; /* previous row */ + + /* === Initialize columns, and check column pointers ==================== */ + + for (col = 0 ; col < n_col ; col++) + { + Col [col].start = p [col] ; + Col [col].length = p [col+1] - p [col] ; + + if (Col [col].length < 0) + { + /* column pointers must be non-decreasing */ + stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; + stats [COLAMD_INFO1] = col ; + stats [COLAMD_INFO2] = Col [col].length ; + DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; + return (FALSE) ; + } + + Col [col].shared1.thickness = 1 ; + Col [col].shared2.score = 0 ; + Col [col].shared3.prev = EMPTY ; + Col [col].shared4.degree_next = EMPTY ; + } + + /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ + + /* === Scan columns, compute row degrees, and check row indices ========= */ + + stats [COLAMD_INFO3] = 0 ; /* number of duplicate or unsorted row indices*/ + + for (row = 0 ; row < n_row ; row++) + { + Row [row].length = 0 ; + Row [row].shared2.mark = -1 ; + } + + for (col = 0 ; col < n_col ; col++) + { + last_row = -1 ; + + cp = &A [p [col]] ; + cp_end = &A [p [col+1]] ; + + while (cp < cp_end) + { + row = *cp++ ; + + /* make sure row indices within range */ + if (row < 0 || row >= n_row) + { + stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ; + stats [COLAMD_INFO1] = col ; + stats [COLAMD_INFO2] = row ; + stats [COLAMD_INFO3] = n_row ; + DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; + return (FALSE) ; + } + + if (row <= last_row || Row [row].shared2.mark == col) + { + /* row index are unsorted or repeated (or both), thus col */ + /* is jumbled. This is a notice, not an error condition. */ + stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ; + stats [COLAMD_INFO1] = col ; + stats [COLAMD_INFO2] = row ; + (stats [COLAMD_INFO3]) ++ ; + DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); + } + + if (Row [row].shared2.mark != col) + { + Row [row].length++ ; + } + else + { + /* this is a repeated entry in the column, */ + /* it will be removed */ + Col [col].length-- ; + } + + /* mark the row as having been seen in this column */ + Row [row].shared2.mark = col ; + + last_row = row ; + } + } + + /* === Compute row pointers ============================================= */ + + /* row form of the matrix starts directly after the column */ + /* form of matrix in A */ + Row [0].start = p [n_col] ; + Row [0].shared1.p = Row [0].start ; + Row [0].shared2.mark = -1 ; + for (row = 1 ; row < n_row ; row++) + { + Row [row].start = Row [row-1].start + Row [row-1].length ; + Row [row].shared1.p = Row [row].start ; + Row [row].shared2.mark = -1 ; + } + + /* === Create row form ================================================== */ + + if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) + { + /* if cols jumbled, watch for repeated row indices */ + for (col = 0 ; col < n_col ; col++) + { + cp = &A [p [col]] ; + cp_end = &A [p [col+1]] ; + while (cp < cp_end) + { + row = *cp++ ; + if (Row [row].shared2.mark != col) + { + A [(Row [row].shared1.p)++] = col ; + Row [row].shared2.mark = col ; + } + } + } + } + else + { + /* if cols not jumbled, we don't need the mark (this is faster) */ + for (col = 0 ; col < n_col ; col++) + { + cp = &A [p [col]] ; + cp_end = &A [p [col+1]] ; + while (cp < cp_end) + { + A [(Row [*cp++].shared1.p)++] = col ; + } + } + } + + /* === Clear the row marks and set row degrees ========================== */ + + for (row = 0 ; row < n_row ; row++) + { + Row [row].shared2.mark = 0 ; + Row [row].shared1.degree = Row [row].length ; + } + + /* === See if we need to re-create columns ============================== */ + + if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) + { + DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ; + +#ifndef NDEBUG + /* make sure column lengths are correct */ + for (col = 0 ; col < n_col ; col++) + { + p [col] = Col [col].length ; + } + for (row = 0 ; row < n_row ; row++) + { + rp = &A [Row [row].start] ; + rp_end = rp + Row [row].length ; + while (rp < rp_end) + { + p [*rp++]-- ; + } + } + for (col = 0 ; col < n_col ; col++) + { + ASSERT (p [col] == 0) ; + } + /* now p is all zero (different than when debugging is turned off) */ +#endif /* NDEBUG */ + + /* === Compute col pointers ========================================= */ + + /* col form of the matrix starts at A [0]. */ + /* Note, we may have a gap between the col form and the row */ + /* form if there were duplicate entries, if so, it will be */ + /* removed upon the first garbage collection */ + Col [0].start = 0 ; + p [0] = Col [0].start ; + for (col = 1 ; col < n_col ; col++) + { + /* note that the lengths here are for pruned columns, i.e. */ + /* no duplicate row indices will exist for these columns */ + Col [col].start = Col [col-1].start + Col [col-1].length ; + p [col] = Col [col].start ; + } + + /* === Re-create col form =========================================== */ + + for (row = 0 ; row < n_row ; row++) + { + rp = &A [Row [row].start] ; + rp_end = rp + Row [row].length ; + while (rp < rp_end) + { + A [(p [*rp++])++] = row ; + } + } + } + + /* === Done. Matrix is not (or no longer) jumbled ====================== */ + + return (TRUE) ; +} + + +/* ========================================================================== */ +/* === init_scoring ========================================================= */ +/* ========================================================================== */ + +/* + Kills dense or empty columns and rows, calculates an initial score for + each column, and places all columns in the degree lists. Not user-callable. +*/ + +PRIVATE void init_scoring +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows of A */ + int n_col, /* number of columns of A */ + Colamd_Row Row [], /* of size n_row+1 */ + Colamd_Col Col [], /* of size n_col+1 */ + int A [], /* column form and row form of A */ + int head [], /* of size n_col+1 */ + double knobs [COLAMD_KNOBS],/* parameters */ + int *p_n_row2, /* number of non-dense, non-empty rows */ + int *p_n_col2, /* number of non-dense, non-empty columns */ + int *p_max_deg /* maximum row degree */ +) +{ + /* === Local variables ================================================== */ + + int c ; /* a column index */ + int r, row ; /* a row index */ + int *cp ; /* a column pointer */ + int deg ; /* degree of a row or column */ + int *cp_end ; /* a pointer to the end of a column */ + int *new_cp ; /* new column pointer */ + int col_length ; /* length of pruned column */ + int score ; /* current column score */ + int n_col2 ; /* number of non-dense, non-empty columns */ + int n_row2 ; /* number of non-dense, non-empty rows */ + int dense_row_count ; /* remove rows with more entries than this */ + int dense_col_count ; /* remove cols with more entries than this */ + int min_score ; /* smallest column score */ + int max_deg ; /* maximum row degree */ + int next_col ; /* Used to add to degree list.*/ + +#ifndef NDEBUG + int debug_count ; /* debug only. */ +#endif /* NDEBUG */ + + /* === Extract knobs ==================================================== */ + + dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ; + dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ; + DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; + max_deg = 0 ; + n_col2 = n_col ; + n_row2 = n_row ; + + /* === Kill empty columns =============================================== */ + + /* Put the empty columns at the end in their natural order, so that LU */ + /* factorization can proceed as far as possible. */ + for (c = n_col-1 ; c >= 0 ; c--) + { + deg = Col [c].length ; + if (deg == 0) + { + /* this is a empty column, kill and order it last */ + Col [c].shared2.order = --n_col2 ; + KILL_PRINCIPAL_COL (c) ; + } + } + DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ; + + /* === Kill dense columns =============================================== */ + + /* Put the dense columns at the end, in their natural order */ + for (c = n_col-1 ; c >= 0 ; c--) + { + /* skip any dead columns */ + if (COL_IS_DEAD (c)) + { + continue ; + } + deg = Col [c].length ; + if (deg > dense_col_count) + { + /* this is a dense column, kill and order it last */ + Col [c].shared2.order = --n_col2 ; + /* decrement the row degrees */ + cp = &A [Col [c].start] ; + cp_end = cp + Col [c].length ; + while (cp < cp_end) + { + Row [*cp++].shared1.degree-- ; + } + KILL_PRINCIPAL_COL (c) ; + } + } + DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ; + + /* === Kill dense and empty rows ======================================== */ + + for (r = 0 ; r < n_row ; r++) + { + deg = Row [r].shared1.degree ; + ASSERT (deg >= 0 && deg <= n_col) ; + if (deg > dense_row_count || deg == 0) + { + /* kill a dense or empty row */ + KILL_ROW (r) ; + --n_row2 ; + } + else + { + /* keep track of max degree of remaining rows */ + max_deg = MAX (max_deg, deg) ; + } + } + DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ; + + /* === Compute initial column scores ==================================== */ + + /* At this point the row degrees are accurate. They reflect the number */ + /* of "live" (non-dense) columns in each row. No empty rows exist. */ + /* Some "live" columns may contain only dead rows, however. These are */ + /* pruned in the code below. */ + + /* now find the initial matlab score for each column */ + for (c = n_col-1 ; c >= 0 ; c--) + { + /* skip dead column */ + if (COL_IS_DEAD (c)) + { + continue ; + } + score = 0 ; + cp = &A [Col [c].start] ; + new_cp = cp ; + cp_end = cp + Col [c].length ; + while (cp < cp_end) + { + /* get a row */ + row = *cp++ ; + /* skip if dead */ + if (ROW_IS_DEAD (row)) + { + continue ; + } + /* compact the column */ + *new_cp++ = row ; + /* add row's external degree */ + score += Row [row].shared1.degree - 1 ; + /* guard against integer overflow */ + score = MIN (score, n_col) ; + } + /* determine pruned column length */ + col_length = (int) (new_cp - &A [Col [c].start]) ; + if (col_length == 0) + { + /* a newly-made null column (all rows in this col are "dense" */ + /* and have already been killed) */ + DEBUG2 (("Newly null killed: %d\n", c)) ; + Col [c].shared2.order = --n_col2 ; + KILL_PRINCIPAL_COL (c) ; + } + else + { + /* set column length and set score */ + ASSERT (score >= 0) ; + ASSERT (score <= n_col) ; + Col [c].length = col_length ; + Col [c].shared2.score = score ; + } + } + DEBUG1 (("colamd: Dense, null, and newly-null columns killed: %d\n", + n_col-n_col2)) ; + + /* At this point, all empty rows and columns are dead. All live columns */ + /* are "clean" (containing no dead rows) and simplicial (no supercolumns */ + /* yet). Rows may contain dead columns, but all live rows contain at */ + /* least one live column. */ + +#ifndef NDEBUG + debug_structures (n_row, n_col, Row, Col, A, n_col2) ; +#endif /* NDEBUG */ + + /* === Initialize degree lists ========================================== */ + +#ifndef NDEBUG + debug_count = 0 ; +#endif /* NDEBUG */ + + /* clear the hash buckets */ + for (c = 0 ; c <= n_col ; c++) + { + head [c] = EMPTY ; + } + min_score = n_col ; + /* place in reverse order, so low column indices are at the front */ + /* of the lists. This is to encourage natural tie-breaking */ + for (c = n_col-1 ; c >= 0 ; c--) + { + /* only add principal columns to degree lists */ + if (COL_IS_ALIVE (c)) + { + DEBUG4 (("place %d score %d minscore %d ncol %d\n", + c, Col [c].shared2.score, min_score, n_col)) ; + + /* === Add columns score to DList =============================== */ + + score = Col [c].shared2.score ; + + ASSERT (min_score >= 0) ; + ASSERT (min_score <= n_col) ; + ASSERT (score >= 0) ; + ASSERT (score <= n_col) ; + ASSERT (head [score] >= EMPTY) ; + + /* now add this column to dList at proper score location */ + next_col = head [score] ; + Col [c].shared3.prev = EMPTY ; + Col [c].shared4.degree_next = next_col ; + + /* if there already was a column with the same score, set its */ + /* previous pointer to this new column */ + if (next_col != EMPTY) + { + Col [next_col].shared3.prev = c ; + } + head [score] = c ; + + /* see if this score is less than current min */ + min_score = MIN (min_score, score) ; + +#ifndef NDEBUG + debug_count++ ; +#endif /* NDEBUG */ + + } + } + +#ifndef NDEBUG + DEBUG1 (("colamd: Live cols %d out of %d, non-princ: %d\n", + debug_count, n_col, n_col-debug_count)) ; + ASSERT (debug_count == n_col2) ; + debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ; +#endif /* NDEBUG */ + + /* === Return number of remaining columns, and max row degree =========== */ + + *p_n_col2 = n_col2 ; + *p_n_row2 = n_row2 ; + *p_max_deg = max_deg ; +} + + +/* ========================================================================== */ +/* === find_ordering ======================================================== */ +/* ========================================================================== */ + +/* + Order the principal columns of the supercolumn form of the matrix + (no supercolumns on input). Uses a minimum approximate column minimum + degree ordering method. Not user-callable. +*/ + +PRIVATE int find_ordering /* return the number of garbage collections */ +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows of A */ + int n_col, /* number of columns of A */ + int Alen, /* size of A, 2*nnz + n_col or larger */ + Colamd_Row Row [], /* of size n_row+1 */ + Colamd_Col Col [], /* of size n_col+1 */ + int A [], /* column form and row form of A */ + int head [], /* of size n_col+1 */ + int n_col2, /* Remaining columns to order */ + int max_deg, /* Maximum row degree */ + int pfree /* index of first free slot (2*nnz on entry) */ +) +{ + /* === Local variables ================================================== */ + + int k ; /* current pivot ordering step */ + int pivot_col ; /* current pivot column */ + int *cp ; /* a column pointer */ + int *rp ; /* a row pointer */ + int pivot_row ; /* current pivot row */ + int *new_cp ; /* modified column pointer */ + int *new_rp ; /* modified row pointer */ + int pivot_row_start ; /* pointer to start of pivot row */ + int pivot_row_degree ; /* number of columns in pivot row */ + int pivot_row_length ; /* number of supercolumns in pivot row */ + int pivot_col_score ; /* score of pivot column */ + int needed_memory ; /* free space needed for pivot row */ + int *cp_end ; /* pointer to the end of a column */ + int *rp_end ; /* pointer to the end of a row */ + int row ; /* a row index */ + int col ; /* a column index */ + int max_score ; /* maximum possible score */ + int cur_score ; /* score of current column */ + unsigned int hash ; /* hash value for supernode detection */ + int head_column ; /* head of hash bucket */ + int first_col ; /* first column in hash bucket */ + int tag_mark ; /* marker value for mark array */ + int row_mark ; /* Row [row].shared2.mark */ + int set_difference ; /* set difference size of row with pivot row */ + int min_score ; /* smallest column score */ + int col_thickness ; /* "thickness" (no. of columns in a supercol) */ + int max_mark ; /* maximum value of tag_mark */ + int pivot_col_thickness ; /* number of columns represented by pivot col */ + int prev_col ; /* Used by Dlist operations. */ + int next_col ; /* Used by Dlist operations. */ + int ngarbage ; /* number of garbage collections performed */ + +#ifndef NDEBUG + int debug_d ; /* debug loop counter */ + int debug_step = 0 ; /* debug loop counter */ +#endif /* NDEBUG */ + + /* === Initialization and clear mark ==================================== */ + + max_mark = INT_MAX - n_col ; /* INT_MAX defined in */ + tag_mark = clear_mark (n_row, Row) ; + min_score = 0 ; + ngarbage = 0 ; + DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ; + + /* === Order the columns ================================================ */ + + for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */) + { + +#ifndef NDEBUG + if (debug_step % 100 == 0) + { + DEBUG2 (("\n... Step k: %d out of n_col2: %d\n", k, n_col2)) ; + } + else + { + DEBUG3 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ; + } + debug_step++ ; + debug_deg_lists (n_row, n_col, Row, Col, head, + min_score, n_col2-k, max_deg) ; + debug_matrix (n_row, n_col, Row, Col, A) ; +#endif /* NDEBUG */ + + /* === Select pivot column, and order it ============================ */ + + /* make sure degree list isn't empty */ + ASSERT (min_score >= 0) ; + ASSERT (min_score <= n_col) ; + ASSERT (head [min_score] >= EMPTY) ; + +#ifndef NDEBUG + for (debug_d = 0 ; debug_d < min_score ; debug_d++) + { + ASSERT (head [debug_d] == EMPTY) ; + } +#endif /* NDEBUG */ + + /* get pivot column from head of minimum degree list */ + while (head [min_score] == EMPTY && min_score < n_col) + { + min_score++ ; + } + pivot_col = head [min_score] ; + ASSERT (pivot_col >= 0 && pivot_col <= n_col) ; + next_col = Col [pivot_col].shared4.degree_next ; + head [min_score] = next_col ; + if (next_col != EMPTY) + { + Col [next_col].shared3.prev = EMPTY ; + } + + ASSERT (COL_IS_ALIVE (pivot_col)) ; + DEBUG3 (("Pivot col: %d\n", pivot_col)) ; + + /* remember score for defrag check */ + pivot_col_score = Col [pivot_col].shared2.score ; + + /* the pivot column is the kth column in the pivot order */ + Col [pivot_col].shared2.order = k ; + + /* increment order count by column thickness */ + pivot_col_thickness = Col [pivot_col].shared1.thickness ; + k += pivot_col_thickness ; + ASSERT (pivot_col_thickness > 0) ; + + /* === Garbage_collection, if necessary ============================= */ + + needed_memory = MIN (pivot_col_score, n_col - k) ; + if (pfree + needed_memory >= Alen) + { + pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; + ngarbage++ ; + /* after garbage collection we will have enough */ + ASSERT (pfree + needed_memory < Alen) ; + /* garbage collection has wiped out the Row[].shared2.mark array */ + tag_mark = clear_mark (n_row, Row) ; + +#ifndef NDEBUG + debug_matrix (n_row, n_col, Row, Col, A) ; +#endif /* NDEBUG */ + } + + /* === Compute pivot row pattern ==================================== */ + + /* get starting location for this new merged row */ + pivot_row_start = pfree ; + + /* initialize new row counts to zero */ + pivot_row_degree = 0 ; + + /* tag pivot column as having been visited so it isn't included */ + /* in merged pivot row */ + Col [pivot_col].shared1.thickness = -pivot_col_thickness ; + + /* pivot row is the union of all rows in the pivot column pattern */ + cp = &A [Col [pivot_col].start] ; + cp_end = cp + Col [pivot_col].length ; + while (cp < cp_end) + { + /* get a row */ + row = *cp++ ; + DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ; + /* skip if row is dead */ + if (ROW_IS_DEAD (row)) + { + continue ; + } + rp = &A [Row [row].start] ; + rp_end = rp + Row [row].length ; + while (rp < rp_end) + { + /* get a column */ + col = *rp++ ; + /* add the column, if alive and untagged */ + col_thickness = Col [col].shared1.thickness ; + if (col_thickness > 0 && COL_IS_ALIVE (col)) + { + /* tag column in pivot row */ + Col [col].shared1.thickness = -col_thickness ; + ASSERT (pfree < Alen) ; + /* place column in pivot row */ + A [pfree++] = col ; + pivot_row_degree += col_thickness ; + } + } + } + + /* clear tag on pivot column */ + Col [pivot_col].shared1.thickness = pivot_col_thickness ; + max_deg = MAX (max_deg, pivot_row_degree) ; + +#ifndef NDEBUG + DEBUG3 (("check2\n")) ; + debug_mark (n_row, Row, tag_mark, max_mark) ; +#endif /* NDEBUG */ + + /* === Kill all rows used to construct pivot row ==================== */ + + /* also kill pivot row, temporarily */ + cp = &A [Col [pivot_col].start] ; + cp_end = cp + Col [pivot_col].length ; + while (cp < cp_end) + { + /* may be killing an already dead row */ + row = *cp++ ; + DEBUG3 (("Kill row in pivot col: %d\n", row)) ; + KILL_ROW (row) ; + } + + /* === Select a row index to use as the new pivot row =============== */ + + pivot_row_length = pfree - pivot_row_start ; + if (pivot_row_length > 0) + { + /* pick the "pivot" row arbitrarily (first row in col) */ + pivot_row = A [Col [pivot_col].start] ; + DEBUG3 (("Pivotal row is %d\n", pivot_row)) ; + } + else + { + /* there is no pivot row, since it is of zero length */ + pivot_row = EMPTY ; + ASSERT (pivot_row_length == 0) ; + } + ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ; + + /* === Approximate degree computation =============================== */ + + /* Here begins the computation of the approximate degree. The column */ + /* score is the sum of the pivot row "length", plus the size of the */ + /* set differences of each row in the column minus the pattern of the */ + /* pivot row itself. The column ("thickness") itself is also */ + /* excluded from the column score (we thus use an approximate */ + /* external degree). */ + + /* The time taken by the following code (compute set differences, and */ + /* add them up) is proportional to the size of the data structure */ + /* being scanned - that is, the sum of the sizes of each column in */ + /* the pivot row. Thus, the amortized time to compute a column score */ + /* is proportional to the size of that column (where size, in this */ + /* context, is the column "length", or the number of row indices */ + /* in that column). The number of row indices in a column is */ + /* monotonically non-decreasing, from the length of the original */ + /* column on input to colamd. */ + + /* === Compute set differences ====================================== */ + + DEBUG3 (("** Computing set differences phase. **\n")) ; + + /* pivot row is currently dead - it will be revived later. */ + + DEBUG3 (("Pivot row: ")) ; + /* for each column in pivot row */ + rp = &A [pivot_row_start] ; + rp_end = rp + pivot_row_length ; + while (rp < rp_end) + { + col = *rp++ ; + ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; + DEBUG3 (("Col: %d\n", col)) ; + + /* clear tags used to construct pivot row pattern */ + col_thickness = -Col [col].shared1.thickness ; + ASSERT (col_thickness > 0) ; + Col [col].shared1.thickness = col_thickness ; + + /* === Remove column from degree list =========================== */ + + cur_score = Col [col].shared2.score ; + prev_col = Col [col].shared3.prev ; + next_col = Col [col].shared4.degree_next ; + ASSERT (cur_score >= 0) ; + ASSERT (cur_score <= n_col) ; + ASSERT (cur_score >= EMPTY) ; + if (prev_col == EMPTY) + { + head [cur_score] = next_col ; + } + else + { + Col [prev_col].shared4.degree_next = next_col ; + } + if (next_col != EMPTY) + { + Col [next_col].shared3.prev = prev_col ; + } + + /* === Scan the column ========================================== */ + + cp = &A [Col [col].start] ; + cp_end = cp + Col [col].length ; + while (cp < cp_end) + { + /* get a row */ + row = *cp++ ; + row_mark = Row [row].shared2.mark ; + /* skip if dead */ + if (ROW_IS_MARKED_DEAD (row_mark)) + { + continue ; + } + ASSERT (row != pivot_row) ; + set_difference = row_mark - tag_mark ; + /* check if the row has been seen yet */ + if (set_difference < 0) + { + ASSERT (Row [row].shared1.degree <= max_deg) ; + set_difference = Row [row].shared1.degree ; + } + /* subtract column thickness from this row's set difference */ + set_difference -= col_thickness ; + ASSERT (set_difference >= 0) ; + /* absorb this row if the set difference becomes zero */ + if (set_difference == 0) + { + DEBUG3 (("aggressive absorption. Row: %d\n", row)) ; + KILL_ROW (row) ; + } + else + { + /* save the new mark */ + Row [row].shared2.mark = set_difference + tag_mark ; + } + } + } + +#ifndef NDEBUG + debug_deg_lists (n_row, n_col, Row, Col, head, + min_score, n_col2-k-pivot_row_degree, max_deg) ; +#endif /* NDEBUG */ + + /* === Add up set differences for each column ======================= */ + + DEBUG3 (("** Adding set differences phase. **\n")) ; + + /* for each column in pivot row */ + rp = &A [pivot_row_start] ; + rp_end = rp + pivot_row_length ; + while (rp < rp_end) + { + /* get a column */ + col = *rp++ ; + ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; + hash = 0 ; + cur_score = 0 ; + cp = &A [Col [col].start] ; + /* compact the column */ + new_cp = cp ; + cp_end = cp + Col [col].length ; + + DEBUG4 (("Adding set diffs for Col: %d.\n", col)) ; + + while (cp < cp_end) + { + /* get a row */ + row = *cp++ ; + ASSERT(row >= 0 && row < n_row) ; + row_mark = Row [row].shared2.mark ; + /* skip if dead */ + if (ROW_IS_MARKED_DEAD (row_mark)) + { + continue ; + } + ASSERT (row_mark > tag_mark) ; + /* compact the column */ + *new_cp++ = row ; + /* compute hash function */ + hash += row ; + /* add set difference */ + cur_score += row_mark - tag_mark ; + /* integer overflow... */ + cur_score = MIN (cur_score, n_col) ; + } + + /* recompute the column's length */ + Col [col].length = (int) (new_cp - &A [Col [col].start]) ; + + /* === Further mass elimination ================================= */ + + if (Col [col].length == 0) + { + DEBUG4 (("further mass elimination. Col: %d\n", col)) ; + /* nothing left but the pivot row in this column */ + KILL_PRINCIPAL_COL (col) ; + pivot_row_degree -= Col [col].shared1.thickness ; + ASSERT (pivot_row_degree >= 0) ; + /* order it */ + Col [col].shared2.order = k ; + /* increment order count by column thickness */ + k += Col [col].shared1.thickness ; + } + else + { + /* === Prepare for supercolumn detection ==================== */ + + DEBUG4 (("Preparing supercol detection for Col: %d.\n", col)) ; + + /* save score so far */ + Col [col].shared2.score = cur_score ; + + /* add column to hash table, for supercolumn detection */ + hash %= n_col + 1 ; + + DEBUG4 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ; + ASSERT (hash <= n_col) ; + + head_column = head [hash] ; + if (head_column > EMPTY) + { + /* degree list "hash" is non-empty, use prev (shared3) of */ + /* first column in degree list as head of hash bucket */ + first_col = Col [head_column].shared3.headhash ; + Col [head_column].shared3.headhash = col ; + } + else + { + /* degree list "hash" is empty, use head as hash bucket */ + first_col = - (head_column + 2) ; + head [hash] = - (col + 2) ; + } + Col [col].shared4.hash_next = first_col ; + + /* save hash function in Col [col].shared3.hash */ + Col [col].shared3.hash = (int) hash ; + ASSERT (COL_IS_ALIVE (col)) ; + } + } + + /* The approximate external column degree is now computed. */ + + /* === Supercolumn detection ======================================== */ + + DEBUG3 (("** Supercolumn detection phase. **\n")) ; + + detect_super_cols ( + +#ifndef NDEBUG + n_col, Row, +#endif /* NDEBUG */ + + Col, A, head, pivot_row_start, pivot_row_length) ; + + /* === Kill the pivotal column ====================================== */ + + KILL_PRINCIPAL_COL (pivot_col) ; + + /* === Clear mark =================================================== */ + + tag_mark += (max_deg + 1) ; + if (tag_mark >= max_mark) + { + DEBUG2 (("clearing tag_mark\n")) ; + tag_mark = clear_mark (n_row, Row) ; + } + +#ifndef NDEBUG + DEBUG3 (("check3\n")) ; + debug_mark (n_row, Row, tag_mark, max_mark) ; +#endif /* NDEBUG */ + + /* === Finalize the new pivot row, and column scores ================ */ + + DEBUG3 (("** Finalize scores phase. **\n")) ; + + /* for each column in pivot row */ + rp = &A [pivot_row_start] ; + /* compact the pivot row */ + new_rp = rp ; + rp_end = rp + pivot_row_length ; + while (rp < rp_end) + { + col = *rp++ ; + /* skip dead columns */ + if (COL_IS_DEAD (col)) + { + continue ; + } + *new_rp++ = col ; + /* add new pivot row to column */ + A [Col [col].start + (Col [col].length++)] = pivot_row ; + + /* retrieve score so far and add on pivot row's degree. */ + /* (we wait until here for this in case the pivot */ + /* row's degree was reduced due to mass elimination). */ + cur_score = Col [col].shared2.score + pivot_row_degree ; + + /* calculate the max possible score as the number of */ + /* external columns minus the 'k' value minus the */ + /* columns thickness */ + max_score = n_col - k - Col [col].shared1.thickness ; + + /* make the score the external degree of the union-of-rows */ + cur_score -= Col [col].shared1.thickness ; + + /* make sure score is less or equal than the max score */ + cur_score = MIN (cur_score, max_score) ; + ASSERT (cur_score >= 0) ; + + /* store updated score */ + Col [col].shared2.score = cur_score ; + + /* === Place column back in degree list ========================= */ + + ASSERT (min_score >= 0) ; + ASSERT (min_score <= n_col) ; + ASSERT (cur_score >= 0) ; + ASSERT (cur_score <= n_col) ; + ASSERT (head [cur_score] >= EMPTY) ; + next_col = head [cur_score] ; + Col [col].shared4.degree_next = next_col ; + Col [col].shared3.prev = EMPTY ; + if (next_col != EMPTY) + { + Col [next_col].shared3.prev = col ; + } + head [cur_score] = col ; + + /* see if this score is less than current min */ + min_score = MIN (min_score, cur_score) ; + + } + +#ifndef NDEBUG + debug_deg_lists (n_row, n_col, Row, Col, head, + min_score, n_col2-k, max_deg) ; +#endif /* NDEBUG */ + + /* === Resurrect the new pivot row ================================== */ + + if (pivot_row_degree > 0) + { + /* update pivot row length to reflect any cols that were killed */ + /* during super-col detection and mass elimination */ + Row [pivot_row].start = pivot_row_start ; + Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ; + Row [pivot_row].shared1.degree = pivot_row_degree ; + Row [pivot_row].shared2.mark = 0 ; + /* pivot row is no longer dead */ + } + } + + /* === All principal columns have now been ordered ====================== */ + + return (ngarbage) ; +} + + +/* ========================================================================== */ +/* === order_children ======================================================= */ +/* ========================================================================== */ + +/* + The find_ordering routine has ordered all of the principal columns (the + representatives of the supercolumns). The non-principal columns have not + yet been ordered. This routine orders those columns by walking up the + parent tree (a column is a child of the column which absorbed it). The + final permutation vector is then placed in p [0 ... n_col-1], with p [0] + being the first column, and p [n_col-1] being the last. It doesn't look + like it at first glance, but be assured that this routine takes time linear + in the number of columns. Although not immediately obvious, the time + taken by this routine is O (n_col), that is, linear in the number of + columns. Not user-callable. +*/ + +PRIVATE void order_children +( + /* === Parameters ======================================================= */ + + int n_col, /* number of columns of A */ + Colamd_Col Col [], /* of size n_col+1 */ + int p [] /* p [0 ... n_col-1] is the column permutation*/ +) +{ + /* === Local variables ================================================== */ + + int i ; /* loop counter for all columns */ + int c ; /* column index */ + int parent ; /* index of column's parent */ + int order ; /* column's order */ + + /* === Order each non-principal column ================================== */ + + for (i = 0 ; i < n_col ; i++) + { + /* find an un-ordered non-principal column */ + ASSERT (COL_IS_DEAD (i)) ; + if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY) + { + parent = i ; + /* once found, find its principal parent */ + do + { + parent = Col [parent].shared1.parent ; + } while (!COL_IS_DEAD_PRINCIPAL (parent)) ; + + /* now, order all un-ordered non-principal columns along path */ + /* to this parent. collapse tree at the same time */ + c = i ; + /* get order of parent */ + order = Col [parent].shared2.order ; + + do + { + ASSERT (Col [c].shared2.order == EMPTY) ; + + /* order this column */ + Col [c].shared2.order = order++ ; + /* collaps tree */ + Col [c].shared1.parent = parent ; + + /* get immediate parent of this column */ + c = Col [c].shared1.parent ; + + /* continue until we hit an ordered column. There are */ + /* guarranteed not to be anymore unordered columns */ + /* above an ordered column */ + } while (Col [c].shared2.order == EMPTY) ; + + /* re-order the super_col parent to largest order for this group */ + Col [parent].shared2.order = order ; + } + } + + /* === Generate the permutation ========================================= */ + + for (c = 0 ; c < n_col ; c++) + { + p [Col [c].shared2.order] = c ; + } +} + + +/* ========================================================================== */ +/* === detect_super_cols ==================================================== */ +/* ========================================================================== */ + +/* + Detects supercolumns by finding matches between columns in the hash buckets. + Check amongst columns in the set A [row_start ... row_start + row_length-1]. + The columns under consideration are currently *not* in the degree lists, + and have already been placed in the hash buckets. + + The hash bucket for columns whose hash function is equal to h is stored + as follows: + + if head [h] is >= 0, then head [h] contains a degree list, so: + + head [h] is the first column in degree bucket h. + Col [head [h]].headhash gives the first column in hash bucket h. + + otherwise, the degree list is empty, and: + + -(head [h] + 2) is the first column in hash bucket h. + + For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous + column" pointer. Col [c].shared3.hash is used instead as the hash number + for that column. The value of Col [c].shared4.hash_next is the next column + in the same hash bucket. + + Assuming no, or "few" hash collisions, the time taken by this routine is + linear in the sum of the sizes (lengths) of each column whose score has + just been computed in the approximate degree computation. + Not user-callable. +*/ + +PRIVATE void detect_super_cols +( + /* === Parameters ======================================================= */ + +#ifndef NDEBUG + /* these two parameters are only needed when debugging is enabled: */ + int n_col, /* number of columns of A */ + Colamd_Row Row [], /* of size n_row+1 */ +#endif /* NDEBUG */ + + Colamd_Col Col [], /* of size n_col+1 */ + int A [], /* row indices of A */ + int head [], /* head of degree lists and hash buckets */ + int row_start, /* pointer to set of columns to check */ + int row_length /* number of columns to check */ +) +{ + /* === Local variables ================================================== */ + + int hash ; /* hash value for a column */ + int *rp ; /* pointer to a row */ + int c ; /* a column index */ + int super_c ; /* column index of the column to absorb into */ + int *cp1 ; /* column pointer for column super_c */ + int *cp2 ; /* column pointer for column c */ + int length ; /* length of column super_c */ + int prev_c ; /* column preceding c in hash bucket */ + int i ; /* loop counter */ + int *rp_end ; /* pointer to the end of the row */ + int col ; /* a column index in the row to check */ + int head_column ; /* first column in hash bucket or degree list */ + int first_col ; /* first column in hash bucket */ + + /* === Consider each column in the row ================================== */ + + rp = &A [row_start] ; + rp_end = rp + row_length ; + while (rp < rp_end) + { + col = *rp++ ; + if (COL_IS_DEAD (col)) + { + continue ; + } + + /* get hash number for this column */ + hash = Col [col].shared3.hash ; + ASSERT (hash <= n_col) ; + + /* === Get the first column in this hash bucket ===================== */ + + head_column = head [hash] ; + if (head_column > EMPTY) + { + first_col = Col [head_column].shared3.headhash ; + } + else + { + first_col = - (head_column + 2) ; + } + + /* === Consider each column in the hash bucket ====================== */ + + for (super_c = first_col ; super_c != EMPTY ; + super_c = Col [super_c].shared4.hash_next) + { + ASSERT (COL_IS_ALIVE (super_c)) ; + ASSERT (Col [super_c].shared3.hash == hash) ; + length = Col [super_c].length ; + + /* prev_c is the column preceding column c in the hash bucket */ + prev_c = super_c ; + + /* === Compare super_c with all columns after it ================ */ + + for (c = Col [super_c].shared4.hash_next ; + c != EMPTY ; c = Col [c].shared4.hash_next) + { + ASSERT (c != super_c) ; + ASSERT (COL_IS_ALIVE (c)) ; + ASSERT (Col [c].shared3.hash == hash) ; + + /* not identical if lengths or scores are different */ + if (Col [c].length != length || + Col [c].shared2.score != Col [super_c].shared2.score) + { + prev_c = c ; + continue ; + } + + /* compare the two columns */ + cp1 = &A [Col [super_c].start] ; + cp2 = &A [Col [c].start] ; + + for (i = 0 ; i < length ; i++) + { + /* the columns are "clean" (no dead rows) */ + ASSERT (ROW_IS_ALIVE (*cp1)) ; + ASSERT (ROW_IS_ALIVE (*cp2)) ; + /* row indices will same order for both supercols, */ + /* no gather scatter nessasary */ + if (*cp1++ != *cp2++) + { + break ; + } + } + + /* the two columns are different if the for-loop "broke" */ + if (i != length) + { + prev_c = c ; + continue ; + } + + /* === Got it! two columns are identical =================== */ + + ASSERT (Col [c].shared2.score == Col [super_c].shared2.score) ; + + Col [super_c].shared1.thickness += Col [c].shared1.thickness ; + Col [c].shared1.parent = super_c ; + KILL_NON_PRINCIPAL_COL (c) ; + /* order c later, in order_children() */ + Col [c].shared2.order = EMPTY ; + /* remove c from hash bucket */ + Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; + } + } + + /* === Empty this hash bucket ======================================= */ + + if (head_column > EMPTY) + { + /* corresponding degree list "hash" is not empty */ + Col [head_column].shared3.headhash = EMPTY ; + } + else + { + /* corresponding degree list "hash" is empty */ + head [hash] = EMPTY ; + } + } +} + + +/* ========================================================================== */ +/* === garbage_collection =================================================== */ +/* ========================================================================== */ + +/* + Defragments and compacts columns and rows in the workspace A. Used when + all avaliable memory has been used while performing row merging. Returns + the index of the first free position in A, after garbage collection. The + time taken by this routine is linear is the size of the array A, which is + itself linear in the number of nonzeros in the input matrix. + Not user-callable. +*/ + +PRIVATE int garbage_collection /* returns the new value of pfree */ +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows */ + int n_col, /* number of columns */ + Colamd_Row Row [], /* row info */ + Colamd_Col Col [], /* column info */ + int A [], /* A [0 ... Alen-1] holds the matrix */ + int *pfree /* &A [0] ... pfree is in use */ +) +{ + /* === Local variables ================================================== */ + + int *psrc ; /* source pointer */ + int *pdest ; /* destination pointer */ + int j ; /* counter */ + int r ; /* a row index */ + int c ; /* a column index */ + int length ; /* length of a row or column */ + +#ifndef NDEBUG + int debug_rows ; + DEBUG2 (("Defrag..\n")) ; + for (psrc = &A[0] ; psrc < pfree ; psrc++) ASSERT (*psrc >= 0) ; + debug_rows = 0 ; +#endif /* NDEBUG */ + + /* === Defragment the columns =========================================== */ + + pdest = &A[0] ; + for (c = 0 ; c < n_col ; c++) + { + if (COL_IS_ALIVE (c)) + { + psrc = &A [Col [c].start] ; + + /* move and compact the column */ + ASSERT (pdest <= psrc) ; + Col [c].start = (int) (pdest - &A [0]) ; + length = Col [c].length ; + for (j = 0 ; j < length ; j++) + { + r = *psrc++ ; + if (ROW_IS_ALIVE (r)) + { + *pdest++ = r ; + } + } + Col [c].length = (int) (pdest - &A [Col [c].start]) ; + } + } + + /* === Prepare to defragment the rows =================================== */ + + for (r = 0 ; r < n_row ; r++) + { + if (ROW_IS_ALIVE (r)) + { + if (Row [r].length == 0) + { + /* this row is of zero length. cannot compact it, so kill it */ + DEBUG3 (("Defrag row kill\n")) ; + KILL_ROW (r) ; + } + else + { + /* save first column index in Row [r].shared2.first_column */ + psrc = &A [Row [r].start] ; + Row [r].shared2.first_column = *psrc ; + ASSERT (ROW_IS_ALIVE (r)) ; + /* flag the start of the row with the one's complement of row */ + *psrc = ONES_COMPLEMENT (r) ; + +#ifndef NDEBUG + debug_rows++ ; +#endif /* NDEBUG */ + + } + } + } + + /* === Defragment the rows ============================================== */ + + psrc = pdest ; + while (psrc < pfree) + { + /* find a negative number ... the start of a row */ + if (*psrc++ < 0) + { + psrc-- ; + /* get the row index */ + r = ONES_COMPLEMENT (*psrc) ; + ASSERT (r >= 0 && r < n_row) ; + /* restore first column index */ + *psrc = Row [r].shared2.first_column ; + ASSERT (ROW_IS_ALIVE (r)) ; + + /* move and compact the row */ + ASSERT (pdest <= psrc) ; + Row [r].start = (int) (pdest - &A [0]) ; + length = Row [r].length ; + for (j = 0 ; j < length ; j++) + { + c = *psrc++ ; + if (COL_IS_ALIVE (c)) + { + *pdest++ = c ; + } + } + Row [r].length = (int) (pdest - &A [Row [r].start]) ; + +#ifndef NDEBUG + debug_rows-- ; +#endif /* NDEBUG */ + + } + } + /* ensure we found all the rows */ + ASSERT (debug_rows == 0) ; + + /* === Return the new value of pfree ==================================== */ + + return ((int) (pdest - &A [0])) ; +} + + +/* ========================================================================== */ +/* === clear_mark =========================================================== */ +/* ========================================================================== */ + +/* + Clears the Row [].shared2.mark array, and returns the new tag_mark. + Return value is the new tag_mark. Not user-callable. +*/ + +PRIVATE int clear_mark /* return the new value for tag_mark */ +( + /* === Parameters ======================================================= */ + + int n_row, /* number of rows in A */ + Colamd_Row Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ +) +{ + /* === Local variables ================================================== */ + + int r ; + + for (r = 0 ; r < n_row ; r++) + { + if (ROW_IS_ALIVE (r)) + { + Row [r].shared2.mark = 0 ; + } + } + return (1) ; +} + + +/* ========================================================================== */ +/* === print_report ========================================================= */ +/* ========================================================================== */ + +PRIVATE void print_report +( + char *method, + int stats [COLAMD_STATS] +) +{ + + int i1, i2, i3 ; + + if (!stats) + { + PRINTF ("%s: No statistics available.\n", method) ; + return ; + } + + i1 = stats [COLAMD_INFO1] ; + i2 = stats [COLAMD_INFO2] ; + i3 = stats [COLAMD_INFO3] ; + + if (stats [COLAMD_STATUS] >= 0) + { + PRINTF ("%s: OK. ", method) ; + } + else + { + PRINTF ("%s: ERROR. ", method) ; + } + + switch (stats [COLAMD_STATUS]) + { + + case COLAMD_OK_BUT_JUMBLED: + + PRINTF ("Matrix has unsorted or duplicate row indices.\n") ; + + PRINTF ("%s: number of duplicate or out-of-order row indices: %d\n", + method, i3) ; + + PRINTF ("%s: last seen duplicate or out-of-order row index: %d\n", + method, INDEX (i2)) ; + + PRINTF ("%s: last seen in column: %d", + method, INDEX (i1)) ; + + /* no break - fall through to next case instead */ + + case COLAMD_OK: + + PRINTF ("\n") ; + + PRINTF ("%s: number of dense or empty rows ignored: %d\n", + method, stats [COLAMD_DENSE_ROW]) ; + + PRINTF ("%s: number of dense or empty columns ignored: %d\n", + method, stats [COLAMD_DENSE_COL]) ; + + PRINTF ("%s: number of garbage collections performed: %d\n", + method, stats [COLAMD_DEFRAG_COUNT]) ; + break ; + + case COLAMD_ERROR_A_not_present: + + PRINTF ("Array A (row indices of matrix) not present.\n") ; + break ; + + case COLAMD_ERROR_p_not_present: + + PRINTF ("Array p (column pointers for matrix) not present.\n") ; + break ; + + case COLAMD_ERROR_nrow_negative: + + PRINTF ("Invalid number of rows (%d).\n", i1) ; + break ; + + case COLAMD_ERROR_ncol_negative: + + PRINTF ("Invalid number of columns (%d).\n", i1) ; + break ; + + case COLAMD_ERROR_nnz_negative: + + PRINTF ("Invalid number of nonzero entries (%d).\n", i1) ; + break ; + + case COLAMD_ERROR_p0_nonzero: + + PRINTF ("Invalid column pointer, p [0] = %d, must be zero.\n", i1) ; + break ; + + case COLAMD_ERROR_A_too_small: + + PRINTF ("Array A too small.\n") ; + PRINTF (" Need Alen >= %d, but given only Alen = %d.\n", + i1, i2) ; + break ; + + case COLAMD_ERROR_col_length_negative: + + PRINTF + ("Column %d has a negative number of nonzero entries (%d).\n", + INDEX (i1), i2) ; + break ; + + case COLAMD_ERROR_row_index_out_of_bounds: + + PRINTF + ("Row index (row %d) out of bounds (%d to %d) in column %d.\n", + INDEX (i2), INDEX (0), INDEX (i3-1), INDEX (i1)) ; + break ; + + case COLAMD_ERROR_out_of_memory: + + PRINTF ("Out of memory.\n") ; + break ; + + case COLAMD_ERROR_internal_error: + + /* if this happens, there is a bug in the code */ + PRINTF + ("Internal error! Please contact authors (davis@cise.ufl.edu).\n") ; + break ; + } +} + + + + +/* ========================================================================== */ +/* === colamd debugging routines ============================================ */ +/* ========================================================================== */ + +/* When debugging is disabled, the remainder of this file is ignored. */ + +#ifndef NDEBUG + + +/* ========================================================================== */ +/* === debug_structures ===================================================== */ +/* ========================================================================== */ + +/* + At this point, all empty rows and columns are dead. All live columns + are "clean" (containing no dead rows) and simplicial (no supercolumns + yet). Rows may contain dead columns, but all live rows contain at + least one live column. +*/ + +PRIVATE void debug_structures +( + /* === Parameters ======================================================= */ + + int n_row, + int n_col, + Colamd_Row Row [], + Colamd_Col Col [], + int A [], + int n_col2 +) +{ + /* === Local variables ================================================== */ + + int i ; + int c ; + int *cp ; + int *cp_end ; + int len ; + int score ; + int r ; + int *rp ; + int *rp_end ; + int deg ; + + /* === Check A, Row, and Col ============================================ */ + + for (c = 0 ; c < n_col ; c++) + { + if (COL_IS_ALIVE (c)) + { + len = Col [c].length ; + score = Col [c].shared2.score ; + DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ; + ASSERT (len > 0) ; + ASSERT (score >= 0) ; + ASSERT (Col [c].shared1.thickness == 1) ; + cp = &A [Col [c].start] ; + cp_end = cp + len ; + while (cp < cp_end) + { + r = *cp++ ; + ASSERT (ROW_IS_ALIVE (r)) ; + } + } + else + { + i = Col [c].shared2.order ; + ASSERT (i >= n_col2 && i < n_col) ; + } + } + + for (r = 0 ; r < n_row ; r++) + { + if (ROW_IS_ALIVE (r)) + { + i = 0 ; + len = Row [r].length ; + deg = Row [r].shared1.degree ; + ASSERT (len > 0) ; + ASSERT (deg > 0) ; + rp = &A [Row [r].start] ; + rp_end = rp + len ; + while (rp < rp_end) + { + c = *rp++ ; + if (COL_IS_ALIVE (c)) + { + i++ ; + } + } + ASSERT (i > 0) ; + } + } +} + + +/* ========================================================================== */ +/* === debug_deg_lists ====================================================== */ +/* ========================================================================== */ + +/* + Prints the contents of the degree lists. Counts the number of columns + in the degree list and compares it to the total it should have. Also + checks the row degrees. +*/ + +PRIVATE void debug_deg_lists +( + /* === Parameters ======================================================= */ + + int n_row, + int n_col, + Colamd_Row Row [], + Colamd_Col Col [], + int head [], + int min_score, + int should, + int max_deg +) +{ + /* === Local variables ================================================== */ + + int deg ; + int col ; + int have ; + int row ; + + /* === Check the degree lists =========================================== */ + + if (n_col > 10000 && colamd_debug <= 0) + { + return ; + } + have = 0 ; + DEBUG4 (("Degree lists: %d\n", min_score)) ; + for (deg = 0 ; deg <= n_col ; deg++) + { + col = head [deg] ; + if (col == EMPTY) + { + continue ; + } + DEBUG4 (("%d:", deg)) ; + while (col != EMPTY) + { + DEBUG4 ((" %d", col)) ; + have += Col [col].shared1.thickness ; + ASSERT (COL_IS_ALIVE (col)) ; + col = Col [col].shared4.degree_next ; + } + DEBUG4 (("\n")) ; + } + DEBUG4 (("should %d have %d\n", should, have)) ; + ASSERT (should == have) ; + + /* === Check the row degrees ============================================ */ + + if (n_row > 10000 && colamd_debug <= 0) + { + return ; + } + for (row = 0 ; row < n_row ; row++) + { + if (ROW_IS_ALIVE (row)) + { + ASSERT (Row [row].shared1.degree <= max_deg) ; + } + } +} + + +/* ========================================================================== */ +/* === debug_mark =========================================================== */ +/* ========================================================================== */ + +/* + Ensures that the tag_mark is less that the maximum and also ensures that + each entry in the mark array is less than the tag mark. +*/ + +PRIVATE void debug_mark +( + /* === Parameters ======================================================= */ + + int n_row, + Colamd_Row Row [], + int tag_mark, + int max_mark +) +{ + /* === Local variables ================================================== */ + + int r ; + + /* === Check the Row marks ============================================== */ + + ASSERT (tag_mark > 0 && tag_mark <= max_mark) ; + if (n_row > 10000 && colamd_debug <= 0) + { + return ; + } + for (r = 0 ; r < n_row ; r++) + { + ASSERT (Row [r].shared2.mark < tag_mark) ; + } +} + + +/* ========================================================================== */ +/* === debug_matrix ========================================================= */ +/* ========================================================================== */ + +/* + Prints out the contents of the columns and the rows. +*/ + +PRIVATE void debug_matrix +( + /* === Parameters ======================================================= */ + + int n_row, + int n_col, + Colamd_Row Row [], + Colamd_Col Col [], + int A [] +) +{ + /* === Local variables ================================================== */ + + int r ; + int c ; + int *rp ; + int *rp_end ; + int *cp ; + int *cp_end ; + + /* === Dump the rows and columns of the matrix ========================== */ + + if (colamd_debug < 3) + { + return ; + } + DEBUG3 (("DUMP MATRIX:\n")) ; + for (r = 0 ; r < n_row ; r++) + { + DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ; + if (ROW_IS_DEAD (r)) + { + continue ; + } + DEBUG3 (("start %d length %d degree %d\n", + Row [r].start, Row [r].length, Row [r].shared1.degree)) ; + rp = &A [Row [r].start] ; + rp_end = rp + Row [r].length ; + while (rp < rp_end) + { + c = *rp++ ; + DEBUG4 ((" %d col %d\n", COL_IS_ALIVE (c), c)) ; + } + } + + for (c = 0 ; c < n_col ; c++) + { + DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ; + if (COL_IS_DEAD (c)) + { + continue ; + } + DEBUG3 (("start %d length %d shared1 %d shared2 %d\n", + Col [c].start, Col [c].length, + Col [c].shared1.thickness, Col [c].shared2.score)) ; + cp = &A [Col [c].start] ; + cp_end = cp + Col [c].length ; + while (cp < cp_end) + { + r = *cp++ ; + DEBUG4 ((" %d row %d\n", ROW_IS_ALIVE (r), r)) ; + } + } +} + +PRIVATE void colamd_get_debug +( + char *method +) +{ + colamd_debug = 0 ; /* no debug printing */ + + /* get "D" environment variable, which gives the debug printing level */ + if (getenv ("D")) + { + colamd_debug = atoi (getenv ("D")) ; + } + + DEBUG0 (("%s: debug version, D = %d (THIS WILL BE SLOW!)\n", + method, colamd_debug)) ; +} + +#endif /* NDEBUG */ + diff --git a/src/maths/SuperLU/colamd.h b/src/maths/SuperLU/colamd.h new file mode 100644 index 000000000..aacbd3f52 --- /dev/null +++ b/src/maths/SuperLU/colamd.h @@ -0,0 +1,249 @@ +/*! @file colamd.h + \brief Colamd prototypes and definitions + +
 
+    ==========================================================================
+    === colamd/symamd prototypes and definitions =============================
+    ==========================================================================
+
+    You must include this file (colamd.h) in any routine that uses colamd,
+    symamd, or the related macros and definitions.
+
+    Authors:
+
+	The authors of the code itself are Stefan I. Larimore and Timothy A.
+	Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
+	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+	Ng, Oak Ridge National Laboratory.
+
+    Date:
+
+	September 8, 2003.  Version 2.3.
+
+    Acknowledgements:
+
+	This work was supported by the National Science Foundation, under
+	grants DMS-9504974 and DMS-9803599.
+
+    Notice:
+
+	Copyright (c) 1998-2003 by the University of Florida.
+	All Rights Reserved.
+
+	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+
+	Permission is hereby granted to use, copy, modify, and/or distribute
+	this program, provided that the Copyright, this License, and the
+	Availability of the original version is retained on all copies and made
+	accessible to the end-user of any code or package that includes COLAMD
+	or any modified version of COLAMD. 
+
+    Availability:
+
+	The colamd/symamd library is available at
+
+	    http://www.cise.ufl.edu/research/sparse/colamd/
+
+	This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
+	file.  It is required by the colamd.c, colamdmex.c, and symamdmex.c
+	files, and by any C code that calls the routines whose prototypes are
+	listed below, or that uses the colamd/symamd definitions listed below.
+ 
+*/ + +#ifndef COLAMD_H +#define COLAMD_H + +/* ========================================================================== */ +/* === Include files ======================================================== */ +/* ========================================================================== */ + +#include + +/* ========================================================================== */ +/* === Knob and statistics definitions ====================================== */ +/* ========================================================================== */ + +/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ +#define COLAMD_KNOBS 20 + +/* number of output statistics. Only stats [0..6] are currently used. */ +#define COLAMD_STATS 20 + +/* knobs [0] and stats [0]: dense row knob and output statistic. */ +#define COLAMD_DENSE_ROW 0 + +/* knobs [1] and stats [1]: dense column knob and output statistic. */ +#define COLAMD_DENSE_COL 1 + +/* stats [2]: memory defragmentation count output statistic */ +#define COLAMD_DEFRAG_COUNT 2 + +/* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ +#define COLAMD_STATUS 3 + +/* stats [4..6]: error info, or info on jumbled columns */ +#define COLAMD_INFO1 4 +#define COLAMD_INFO2 5 +#define COLAMD_INFO3 6 + +/* error codes returned in stats [3]: */ +#define COLAMD_OK (0) +#define COLAMD_OK_BUT_JUMBLED (1) +#define COLAMD_ERROR_A_not_present (-1) +#define COLAMD_ERROR_p_not_present (-2) +#define COLAMD_ERROR_nrow_negative (-3) +#define COLAMD_ERROR_ncol_negative (-4) +#define COLAMD_ERROR_nnz_negative (-5) +#define COLAMD_ERROR_p0_nonzero (-6) +#define COLAMD_ERROR_A_too_small (-7) +#define COLAMD_ERROR_col_length_negative (-8) +#define COLAMD_ERROR_row_index_out_of_bounds (-9) +#define COLAMD_ERROR_out_of_memory (-10) +#define COLAMD_ERROR_internal_error (-999) + +/* ========================================================================== */ +/* === Row and Column structures ============================================ */ +/* ========================================================================== */ + +/* User code that makes use of the colamd/symamd routines need not directly */ +/* reference these structures. They are used only for the COLAMD_RECOMMENDED */ +/* macro. */ + +typedef struct Colamd_Col_struct +{ + int start ; /* index for A of first row in this column, or DEAD */ + /* if column is dead */ + int length ; /* number of rows in this column */ + union + { + int thickness ; /* number of original columns represented by this */ + /* col, if the column is alive */ + int parent ; /* parent in parent tree super-column structure, if */ + /* the column is dead */ + } shared1 ; + union + { + int score ; /* the score used to maintain heap, if col is alive */ + int order ; /* pivot ordering of this column, if col is dead */ + } shared2 ; + union + { + int headhash ; /* head of a hash bucket, if col is at the head of */ + /* a degree list */ + int hash ; /* hash value, if col is not in a degree list */ + int prev ; /* previous column in degree list, if col is in a */ + /* degree list (but not at the head of a degree list) */ + } shared3 ; + union + { + int degree_next ; /* next column, if col is in a degree list */ + int hash_next ; /* next column, if col is in a hash list */ + } shared4 ; + +} Colamd_Col ; + +typedef struct Colamd_Row_struct +{ + int start ; /* index for A of first col in this row */ + int length ; /* number of principal columns in this row */ + union + { + int degree ; /* number of principal & non-principal columns in row */ + int p ; /* used as a row pointer in init_rows_cols () */ + } shared1 ; + union + { + int mark ; /* for computing set differences and marking dead rows*/ + int first_column ;/* first column in row (used in garbage collection) */ + } shared2 ; + +} Colamd_Row ; + +/* ========================================================================== */ +/* === Colamd recommended memory size ======================================= */ +/* ========================================================================== */ + +/* + The recommended length Alen of the array A passed to colamd is given by + the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro. It returns -1 if any + argument is negative. 2*nnz space is required for the row and column + indices of the matrix. COLAMD_C (n_col) + COLAMD_R (n_row) space is + required for the Col and Row arrays, respectively, which are internal to + colamd. An additional n_col space is the minimal amount of "elbow room", + and nnz/5 more space is recommended for run time efficiency. + + This macro is not needed when using symamd. + + Explicit typecast to int added Sept. 23, 2002, COLAMD version 2.2, to avoid + gcc -pedantic warning messages. +*/ + +#define COLAMD_C(n_col) ((int) (((n_col) + 1) * sizeof (Colamd_Col) / sizeof (int))) +#define COLAMD_R(n_row) ((int) (((n_row) + 1) * sizeof (Colamd_Row) / sizeof (int))) + +#define COLAMD_RECOMMENDED(nnz, n_row, n_col) \ +( \ +((nnz) < 0 || (n_row) < 0 || (n_col) < 0) \ +? \ + (-1) \ +: \ + (2 * (nnz) + COLAMD_C (n_col) + COLAMD_R (n_row) + (n_col) + ((nnz) / 5)) \ +) + +/* ========================================================================== */ +/* === Prototypes of user-callable routines ================================= */ +/* ========================================================================== */ + +int colamd_recommended /* returns recommended value of Alen, */ + /* or (-1) if input arguments are erroneous */ +( + int nnz, /* nonzeros in A */ + int n_row, /* number of rows in A */ + int n_col /* number of columns in A */ +) ; + +void colamd_set_defaults /* sets default parameters */ +( /* knobs argument is modified on output */ + double knobs [COLAMD_KNOBS] /* parameter settings for colamd */ +) ; + +int colamd /* returns (1) if successful, (0) otherwise*/ +( /* A and p arguments are modified on output */ + int n_row, /* number of rows in A */ + int n_col, /* number of columns in A */ + int Alen, /* size of the array A */ + int A [], /* row indices of A, of size Alen */ + int p [], /* column pointers of A, of size n_col+1 */ + double knobs [COLAMD_KNOBS],/* parameter settings for colamd */ + int stats [COLAMD_STATS] /* colamd output statistics and error codes */ +) ; + +int symamd /* return (1) if OK, (0) otherwise */ +( + int n, /* number of rows and columns of A */ + int A [], /* row indices of A */ + int p [], /* column pointers of A */ + int perm [], /* output permutation, size n_col+1 */ + double knobs [COLAMD_KNOBS], /* parameters (uses defaults if NULL) */ + int stats [COLAMD_STATS], /* output statistics and error codes */ + void * (*allocate) (size_t, size_t), + /* pointer to calloc (ANSI C) or */ + /* mxCalloc (for MATLAB mexFunction) */ + void (*release) (void *) + /* pointer to free (ANSI C) or */ + /* mxFree (for MATLAB mexFunction) */ +) ; + +void colamd_report +( + int stats [COLAMD_STATS] +) ; + +void symamd_report +( + int stats [COLAMD_STATS] +) ; + +#endif /* COLAMD_H */ diff --git a/src/maths/SuperLU/dcolumn_bmod.c b/src/maths/SuperLU/dcolumn_bmod.c new file mode 100644 index 000000000..17febdc48 --- /dev/null +++ b/src/maths/SuperLU/dcolumn_bmod.c @@ -0,0 +1,352 @@ + +/*! @file dcolumn_bmod.c + * \brief performs numeric block updates + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ *  Permission is hereby granted to use or copy this program for any
+ *  purpose, provided the above notices are retained on all copies.
+ *  Permission to modify the code and to distribute modified code is
+ *  granted, provided the above notices are retained, and a notice that
+ *  the code was modified is included with the above copyright notice.
+ * 
+*/ + +#include +#include +#include + +/* + * Function prototypes + */ +void dusolve(int, int, double*, double*); +void dlsolve(int, int, double*, double*); +void dmatvec(int, int, int, double*, double*, double*); + + + +/*! \brief + * + *
+ * Purpose:
+ * ========
+ * Performs numeric block updates (sup-col) in topological order.
+ * It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
+ * Special processing on the supernodal portion of L\U[*,j]
+ * Return value:   0 - successful return
+ *               > 0 - number of bytes allocated when run out of space
+ * 
+ */ +int +dcolumn_bmod ( + const int jcol, /* in */ + const int nseg, /* in */ + double *dense, /* in */ + double *tempv, /* working array */ + int *segrep, /* in */ + int *repfnz, /* in */ + int fpanelc, /* in -- first column in the current panel */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ + +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + double alpha, beta; + + /* krep = representative of current k-th supernode + * fsupc = first supernodal column + * nsupc = no of columns in supernode + * nsupr = no of rows in supernode (used as leading dimension) + * luptr = location of supernodal LU-block in storage + * kfnz = first nonz in the k-th supernodal segment + * no_zeros = no of leading zeros in a supernodal U-segment + */ + double ukj, ukj1, ukj2; + int luptr, luptr1, luptr2; + int fsupc, nsupc, nsupr, segsze; + int nrow; /* No of rows in the matrix of matrix-vector */ + int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; + register int lptr, kfnz, isub, irow, i; + register int no_zeros, new_next; + int ufirst, nextlu; + int fst_col; /* First column within small LU update */ + int d_fsupc; /* Distance between the first column of the current + panel and the first column of the current snode. */ + int *xsup, *supno; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + int nzlumax; + double *tempv1; + double zero = 0.0; + double one = 1.0; + double none = -1.0; + int mem_error; + flops_t *ops = stat->ops; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + nzlumax = Glu->nzlumax; + jcolp1 = jcol + 1; + jsupno = supno[jcol]; + + /* + * For each nonz supernode segment of U[*,j] in topological order + */ + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + + krep = segrep[k]; + k--; + ksupno = supno[krep]; + if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ + + fsupc = xsup[ksupno]; + fst_col = SUPERLU_MAX ( fsupc, fpanelc ); + + /* Distance from the current supernode to the current panel; + d_fsupc=0 if fsupc > fpanelc. */ + d_fsupc = fst_col - fsupc; + + luptr = xlusup[fst_col] + d_fsupc; + lptr = xlsub[fsupc] + d_fsupc; + + kfnz = repfnz[krep]; + kfnz = SUPERLU_MAX ( kfnz, fpanelc ); + + segsze = krep - kfnz + 1; + nsupc = krep - fst_col + 1; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ + nrow = nsupr - d_fsupc - nsupc; + krep_ind = lptr + nsupc - 1; + + ops[TRSV] += segsze * (segsze - 1); + ops[GEMV] += 2 * nrow * segsze; + + + /* + * Case 1: Update U-segment of size 1 -- col-col update + */ + if ( segsze == 1 ) { + ukj = dense[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + dense[irow] -= ukj*lusup[luptr]; + luptr++; + } + + } else if ( segsze <= 3 ) { + ukj = dense[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + ukj1 = dense[lsub[krep_ind - 1]]; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { /* Case 2: 2cols-col update */ + ukj -= ukj1 * lusup[luptr1]; + dense[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; + luptr1++; + dense[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] ); + } + } else { /* Case 3: 3cols-col update */ + ukj2 = dense[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + ukj1 -= ukj2 * lusup[luptr2-1]; + ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; + dense[lsub[krep_ind]] = ukj; + dense[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; + luptr1++; + luptr2++; + dense[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); + } + } + + + + } else { + /* + * Case: sup-col update + * Perform a triangular solve and block update, + * then scatter the result of sup-col update to dense + */ + + no_zeros = kfnz - fst_col; + + /* Copy U[*,j] segment from dense[*] to tempv[*] */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + tempv[i] = dense[irow]; + ++isub; + } + + /* Dense triangular solve -- start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#else + dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#endif + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + alpha = one; + beta = zero; +#ifdef _CRAY + SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#else + dgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#endif +#else + dlsolve ( nsupr, segsze, &lusup[luptr], tempv ); + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + dmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); +#endif + + + /* Scatter tempv[] into SPA dense[] as a temporary storage */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense[irow] = tempv[i]; + tempv[i] = zero; + ++isub; + } + + /* Scatter tempv1[] into SPA dense[] */ + for (i = 0; i < nrow; i++) { + irow = lsub[isub]; + dense[irow] -= tempv1[i]; + tempv1[i] = zero; + ++isub; + } + } + + } /* if jsupno ... */ + + } /* for each segment... */ + + /* + * Process the supernodal portion of L\U[*,j] + */ + nextlu = xlusup[jcol]; + fsupc = xsup[jsupno]; + + /* Copy the SPA dense into L\U[*,j] */ + new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; + while ( new_next > nzlumax ) { + if (mem_error = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) + return (mem_error); + lusup = Glu->lusup; + lsub = Glu->lsub; + } + + for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { + irow = lsub[isub]; + lusup[nextlu] = dense[irow]; + dense[irow] = zero; + ++nextlu; + } + + xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ + + /* For more updates within the panel (also within the current supernode), + * should start from the first column of the panel, or the first column + * of the supernode, whichever is bigger. There are 2 cases: + * 1) fsupc < fpanelc, then fst_col := fpanelc + * 2) fsupc >= fpanelc, then fst_col := fsupc + */ + fst_col = SUPERLU_MAX ( fsupc, fpanelc ); + + if ( fst_col < jcol ) { + + /* Distance between the current supernode and the current panel. + d_fsupc=0 if fsupc >= fpanelc. */ + d_fsupc = fst_col - fsupc; + + lptr = xlsub[fsupc] + d_fsupc; + luptr = xlusup[fst_col] + d_fsupc; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ + nsupc = jcol - fst_col; /* Excluding jcol */ + nrow = nsupr - d_fsupc - nsupc; + + /* Points to the beginning of jcol in snode L\U(jsupno) */ + ufirst = xlusup[jcol] + d_fsupc; + + ops[TRSV] += nsupc * (nsupc - 1); + ops[GEMV] += 2 * nrow * nsupc; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], + &nsupr, &lusup[ufirst], &incx ); +#else + dtrsv_( "L", "N", "U", &nsupc, &lusup[luptr], + &nsupr, &lusup[ufirst], &incx ); +#endif + + alpha = none; beta = one; /* y := beta*y + alpha*A*x */ + +#ifdef _CRAY + SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#else + dgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#endif +#else + dlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); + + dmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], + &lusup[ufirst], tempv ); + + /* Copy updates from tempv[*] into lusup[*] */ + isub = ufirst + nsupc; + for (i = 0; i < nrow; i++) { + lusup[isub] -= tempv[i]; + tempv[i] = 0.0; + ++isub; + } + +#endif + + + } /* if fst_col < jcol ... */ + + return 0; +} diff --git a/src/maths/SuperLU/dcolumn_dfs.c b/src/maths/SuperLU/dcolumn_dfs.c new file mode 100644 index 000000000..258830f58 --- /dev/null +++ b/src/maths/SuperLU/dcolumn_dfs.c @@ -0,0 +1,275 @@ + +/*! @file dcolumn_dfs.c + * \brief Performs a symbolic factorization + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+*/ + +#include + +/*! \brief What type of supernodes we want */ +#define T2_SUPER + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   DCOLUMN_DFS performs a symbolic factorization on column jcol, and
+ *   decide the supernode boundary.
+ *
+ *   This routine does not use numeric values, but only use the RHS 
+ *   row indices to start the dfs.
+ *
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives. The routine returns a list of such supernodal 
+ *   representatives in topological order of the dfs that generates them.
+ *   The location of the first nonzero in each such supernodal segment
+ *   (supernodal entry location) is also returned.
+ *
+ * Local parameters
+ * ================
+ *   nseg: no of segments in current U[*,j]
+ *   jsuper: jsuper=EMPTY if column j does not belong to the same
+ *	supernode as j-1. Otherwise, jsuper=nsuper.
+ *
+ *   marker2: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ *
+ * Return value
+ * ============
+ *     0  success;
+ *   > 0  number of bytes allocated when run out of space.
+ * 
+ */ +int +dcolumn_dfs( + const int m, /* in - number of rows in the matrix */ + const int jcol, /* in */ + int *perm_r, /* in */ + int *nseg, /* modified - with new segments appended */ + int *lsub_col, /* in - defines the RHS vector to start the dfs */ + int *segrep, /* modified - with new segments appended */ + int *repfnz, /* modified */ + int *xprune, /* modified */ + int *marker, /* modified */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + int jcolp1, jcolm1, jsuper, nsuper, nextl; + int k, krep, krow, kmark, kperm; + int *marker2; /* Used for small panel LU */ + int fsupc; /* First column of a snode */ + int myfnz; /* First nonz column of a U-segment */ + int chperm, chmark, chrep, kchild; + int xdfs, maxdfs, kpar, oldrep; + int jptr, jm1ptr; + int ito, ifrom, istop; /* Used to compress row subscripts */ + int mem_error; + int *xsup, *supno, *lsub, *xlsub; + int nzlmax; + static int first = 1, maxsuper; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + if ( first ) { + maxsuper = sp_ienv(3); + first = 0; + } + jcolp1 = jcol + 1; + jcolm1 = jcol - 1; + nsuper = supno[jcol]; + jsuper = nsuper; + nextl = xlsub[jcol]; + marker2 = &marker[2*m]; + + + /* For each nonzero in A[*,jcol] do dfs */ + for (k = 0; lsub_col[k] != EMPTY; k++) { + + krow = lsub_col[k]; + lsub_col[k] = EMPTY; + kmark = marker2[krow]; + + /* krow was visited before, go to the next nonz */ + if ( kmark == jcol ) continue; + + /* For each unmarked nbr krow of jcol + * krow is in L: place it in structure of L[*,jcol] + */ + marker2[krow] = jcol; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + lsub[nextl++] = krow; /* krow is indexed into A */ + if ( nextl >= nzlmax ) { + if ( mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ + } else { + /* krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz[krep]; + + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > kperm ) repfnz[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz[krep] = kperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker2[kchild]; + + if ( chmark != jcol ) { /* Not reached yet */ + marker2[kchild] = jcol; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,k] */ + if ( chperm == EMPTY ) { + lsub[nextl++] = kchild; + if ( nextl >= nzlmax ) { + if ( mem_error = + dLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( chmark != jcolm1 ) jsuper = EMPTY; + } else { + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz[chrep]; + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz[chrep] = chperm; + } else { + /* Continue dfs at super-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L^t) */ + parent[krep] = oldrep; + repfnz[krep] = chperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + } /* else */ + + } /* else */ + + } /* if */ + + } /* while */ + + /* krow has no more unexplored nbrs; + * place supernode-rep krep in postorder DFS. + * backtrack dfs to its parent + */ + segrep[*nseg] = krep; + ++(*nseg); + kpar = parent[krep]; /* Pop from stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xprune[krep]; + + } while ( kpar != EMPTY ); /* Until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonzero ... */ + + /* Check to see if j belongs in the same supernode as j-1 */ + if ( jcol == 0 ) { /* Do nothing for column 0 */ + nsuper = supno[0] = 0; + } else { + fsupc = xsup[nsuper]; + jptr = xlsub[jcol]; /* Not compressed yet */ + jm1ptr = xlsub[jcolm1]; + +#ifdef T2_SUPER + if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; +#endif + /* Make sure the number of columns in a supernode doesn't + exceed threshold. */ + if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; + + /* If jcol starts a new supernode, reclaim storage space in + * lsub from the previous supernode. Note we only store + * the subscript set of the first and last columns of + * a supernode. (first for num values, last for pruning) + */ + if ( jsuper == EMPTY ) { /* starts a new supernode */ + if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */ +#ifdef CHK_COMPRESS + printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); +#endif + ito = xlsub[fsupc+1]; + xlsub[jcolm1] = ito; + istop = ito + jptr - jm1ptr; + xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */ + xlsub[jcol] = istop; + for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) + lsub[ito] = lsub[ifrom]; + nextl = ito; /* = istop + length(jcol) */ + } + nsuper++; + supno[jcol] = nsuper; + } /* if a new supernode */ + + } /* else: jcol > 0 */ + + /* Tidy up the pointers before exit */ + xsup[nsuper+1] = jcolp1; + supno[jcolp1] = nsuper; + xprune[jcol] = nextl; /* Initialize upper bound for pruning */ + xlsub[jcolp1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/dcomplex.c b/src/maths/SuperLU/dcomplex.c new file mode 100644 index 000000000..706612498 --- /dev/null +++ b/src/maths/SuperLU/dcomplex.c @@ -0,0 +1,147 @@ + +/*! @file dcomplex.c + * \brief Common arithmetic for complex type + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * This file defines common arithmetic operations for complex type.
+ * 
+ */ + +#include +#include +#include +#include + + +/*! \brief Complex Division c = a/b */ +void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) +{ + double ratio, den; + double abr, abi, cr, ci; + + if( (abr = b->r) < 0.) + abr = - abr; + if( (abi = b->i) < 0.) + abi = - abi; + if( abr <= abi ) { + if (abi == 0) { + fprintf(stderr, "z_div.c: division by zero\n"); + exit(-1); + } + ratio = b->r / b->i ; + den = b->i * (1 + ratio*ratio); + cr = (a->r*ratio + a->i) / den; + ci = (a->i*ratio - a->r) / den; + } else { + ratio = b->i / b->r ; + den = b->r * (1 + ratio*ratio); + cr = (a->r + a->i*ratio) / den; + ci = (a->i - a->r*ratio) / den; + } + c->r = cr; + c->i = ci; +} + + +/*! \brief Returns sqrt(z.r^2 + z.i^2) */ +double z_abs(doublecomplex *z) +{ + double temp; + double real = z->r; + double imag = z->i; + + if (real < 0) real = -real; + if (imag < 0) imag = -imag; + if (imag > real) { + temp = real; + real = imag; + imag = temp; + } + if ((real+imag) == real) return(real); + + temp = imag/real; + temp = real*sqrt(1.0 + temp*temp); /*overflow!!*/ + return (temp); +} + + +/*! \brief Approximates the abs. Returns abs(z.r) + abs(z.i) */ +double z_abs1(doublecomplex *z) +{ + double real = z->r; + double imag = z->i; + + if (real < 0) real = -real; + if (imag < 0) imag = -imag; + + return (real + imag); +} + +/*! \brief Return the exponentiation */ +void z_exp(doublecomplex *r, doublecomplex *z) +{ + double expx; + + expx = exp(z->r); + r->r = expx * cos(z->i); + r->i = expx * sin(z->i); +} + +/*! \brief Return the complex conjugate */ +void d_cnjg(doublecomplex *r, doublecomplex *z) +{ + r->r = z->r; + r->i = -z->i; +} + +/*! \brief Return the imaginary part */ +double d_imag(doublecomplex *z) +{ + return (z->i); +} + + +/*! \brief SIGN functions for complex number. Returns z/abs(z) */ +doublecomplex z_sgn(doublecomplex *z) +{ + register double t = z_abs(z); + register doublecomplex retval; + + if (t == 0.0) { + retval.r = 1.0, retval.i = 0.0; + } else { + retval.r = z->r / t, retval.i = z->i / t; + } + + return retval; +} + +/*! \brief Square-root of a complex number. */ +doublecomplex z_sqrt(doublecomplex *z) +{ + doublecomplex retval; + register double cr, ci, real, imag; + + real = z->r; + imag = z->i; + + if ( imag == 0.0 ) { + retval.r = sqrt(real); + retval.i = 0.0; + } else { + ci = (sqrt(real*real + imag*imag) - real) / 2.0; + ci = sqrt(ci); + cr = imag / (2.0 * ci); + retval.r = cr; + retval.i = ci; + } + + return retval; +} + + diff --git a/src/maths/SuperLU/dcopy_to_ucol.c b/src/maths/SuperLU/dcopy_to_ucol.c new file mode 100644 index 000000000..513752b3f --- /dev/null +++ b/src/maths/SuperLU/dcopy_to_ucol.c @@ -0,0 +1,103 @@ + +/*! @file dcopy_to_ucol.c + * \brief Copy a computed column of U to the compressed data structure + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + +#include + +int +dcopy_to_ucol( + int jcol, /* in */ + int nseg, /* in */ + int *segrep, /* in */ + int *repfnz, /* in */ + int *perm_r, /* in */ + double *dense, /* modified - reset to zero on return */ + GlobalLU_t *Glu /* modified */ + ) +{ +/* + * Gather from SPA dense[*] to global ucol[*]. + */ + int ksub, krep, ksupno; + int i, k, kfnz, segsze; + int fsupc, isub, irow; + int jsupno, nextu; + int new_next, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + double *ucol; + int *usub, *xusub; + int nzumax; + double zero = 0.0; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + ucol = Glu->ucol; + usub = Glu->usub; + xusub = Glu->xusub; + nzumax = Glu->nzumax; + + jsupno = supno[jcol]; + nextu = xusub[jcol]; + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + krep = segrep[k--]; + ksupno = supno[krep]; + + if ( ksupno != jsupno ) { /* Should go into ucol[] */ + kfnz = repfnz[krep]; + if ( kfnz != EMPTY ) { /* Nonzero U-segment */ + + fsupc = xsup[ksupno]; + isub = xlsub[fsupc] + kfnz - fsupc; + segsze = krep - kfnz + 1; + + new_next = nextu + segsze; + while ( new_next > nzumax ) { + if (mem_error = dLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu)) + return (mem_error); + ucol = Glu->ucol; + if (mem_error = dLUMemXpand(jcol, nextu, USUB, &nzumax, Glu)) + return (mem_error); + usub = Glu->usub; + lsub = Glu->lsub; + } + + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + usub[nextu] = perm_r[irow]; + ucol[nextu] = dense[irow]; + dense[irow] = zero; + nextu++; + isub++; + } + + } + + } + + } /* for each segment... */ + + xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ + return 0; +} diff --git a/src/maths/SuperLU/ddiagonal.c b/src/maths/SuperLU/ddiagonal.c new file mode 100644 index 000000000..cc7357438 --- /dev/null +++ b/src/maths/SuperLU/ddiagonal.c @@ -0,0 +1,129 @@ + +/*! @file ddiagonal.c + * \brief Auxiliary routines to work with diagonal elements + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 30, 2009
+ * 
+ */ + +#include + +int dfill_diag(int n, NCformat *Astore) +/* fill explicit zeros on the diagonal entries, so that the matrix is not + structurally singular. */ +{ + double *nzval = (double *)Astore->nzval; + int *rowind = Astore->rowind; + int *colptr = Astore->colptr; + int nnz = colptr[n]; + int fill = 0; + double *nzval_new; + double zero = 0.0; + int *rowind_new; + int i, j, diag; + + for (i = 0; i < n; i++) + { + diag = -1; + for (j = colptr[i]; j < colptr[i + 1]; j++) + if (rowind[j] == i) diag = j; + if (diag < 0) fill++; + } + if (fill) + { + nzval_new = doubleMalloc(nnz + fill); + rowind_new = intMalloc(nnz + fill); + fill = 0; + for (i = 0; i < n; i++) + { + diag = -1; + for (j = colptr[i] - fill; j < colptr[i + 1]; j++) + { + if ((rowind_new[j + fill] = rowind[j]) == i) diag = j; + nzval_new[j + fill] = nzval[j]; + } + if (diag < 0) + { + rowind_new[colptr[i + 1] + fill] = i; + nzval_new[colptr[i + 1] + fill] = zero; + fill++; + } + colptr[i + 1] += fill; + } + Astore->nzval = nzval_new; + Astore->rowind = rowind_new; + SUPERLU_FREE(nzval); + SUPERLU_FREE(rowind); + } + Astore->nnz += fill; + return fill; +} + +int ddominate(int n, NCformat *Astore) +/* make the matrix diagonally dominant */ +{ + double *nzval = (double *)Astore->nzval; + int *rowind = Astore->rowind; + int *colptr = Astore->colptr; + int nnz = colptr[n]; + int fill = 0; + double *nzval_new; + int *rowind_new; + int i, j, diag; + double s; + + for (i = 0; i < n; i++) + { + diag = -1; + for (j = colptr[i]; j < colptr[i + 1]; j++) + if (rowind[j] == i) diag = j; + if (diag < 0) fill++; + } + if (fill) + { + nzval_new = doubleMalloc(nnz + fill); + rowind_new = intMalloc(nnz+ fill); + fill = 0; + for (i = 0; i < n; i++) + { + s = 1e-6; + diag = -1; + for (j = colptr[i] - fill; j < colptr[i + 1]; j++) + { + if ((rowind_new[j + fill] = rowind[j]) == i) diag = j; + s += fabs(nzval_new[j + fill] = nzval[j]); + } + if (diag >= 0) { + nzval_new[diag+fill] = s * 3.0; + } else { + rowind_new[colptr[i + 1] + fill] = i; + nzval_new[colptr[i + 1] + fill] = s * 3.0; + fill++; + } + colptr[i + 1] += fill; + } + Astore->nzval = nzval_new; + Astore->rowind = rowind_new; + SUPERLU_FREE(nzval); + SUPERLU_FREE(rowind); + } + else + { + for (i = 0; i < n; i++) + { + s = 1e-6; + diag = -1; + for (j = colptr[i]; j < colptr[i + 1]; j++) + { + if (rowind[j] == i) diag = j; + s += fabs(nzval[j]); + } + nzval[diag] = s * 3.0; + } + } + Astore->nnz += fill; + return fill; +} diff --git a/src/maths/SuperLU/dgscon.c b/src/maths/SuperLU/dgscon.c new file mode 100644 index 000000000..8fe88255e --- /dev/null +++ b/src/maths/SuperLU/dgscon.c @@ -0,0 +1,157 @@ + +/*! @file dgscon.c + * \brief Estimates reciprocal of the condition number of a general matrix + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Modified from lapack routines DGECON.
+ * 
+ */ + +/* + * File name: dgscon.c + * History: Modified from lapack routines DGECON. + */ +#include +#include + +/*! \brief + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   DGSCON estimates the reciprocal of the condition number of a general 
+ *   real matrix A, in either the 1-norm or the infinity-norm, using   
+ *   the LU factorization computed by DGETRF.   *
+ *
+ *   An estimate is obtained for norm(inv(A)), and the reciprocal of the   
+ *   condition number is computed as   
+ *      RCOND = 1 / ( norm(A) * norm(inv(A)) ).   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ * 
+ *   Arguments   
+ *   =========   
+ *
+ *    NORM    (input) char*
+ *            Specifies whether the 1-norm condition number or the   
+ *            infinity-norm condition number is required:   
+ *            = '1' or 'O':  1-norm;   
+ *            = 'I':         Infinity-norm.
+ *	    
+ *    L       (input) SuperMatrix*
+ *            The factor L from the factorization Pr*A*Pc=L*U as computed by
+ *            dgstrf(). Use compressed row subscripts storage for supernodes,
+ *            i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
+ * 
+ *    U       (input) SuperMatrix*
+ *            The factor U from the factorization Pr*A*Pc=L*U as computed by
+ *            dgstrf(). Use column-wise storage scheme, i.e., U has types:
+ *            Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
+ *	    
+ *    ANORM   (input) double
+ *            If NORM = '1' or 'O', the 1-norm of the original matrix A.   
+ *            If NORM = 'I', the infinity-norm of the original matrix A.
+ *	    
+ *    RCOND   (output) double*
+ *           The reciprocal of the condition number of the matrix A,   
+ *           computed as RCOND = 1/(norm(A) * norm(inv(A))).
+ *	    
+ *    INFO    (output) int*
+ *           = 0:  successful exit   
+ *           < 0:  if INFO = -i, the i-th argument had an illegal value   
+ *
+ *    ===================================================================== 
+ * 
+ */ + +void +dgscon(char *norm, SuperMatrix *L, SuperMatrix *U, + double anorm, double *rcond, SuperLUStat_t *stat, int *info) +{ + + + /* Local variables */ + int kase, kase1, onenrm, i; + double ainvnm; + double *work; + int *iwork; + extern int drscl_(int *, double *, double *, int *); + + extern int dlacon_(int *, double *, double *, int *, double *, int *); + + + /* Test the input parameters. */ + *info = 0; + onenrm = *(unsigned char *)norm == '1' || lsame_(norm, "O"); + if (! onenrm && ! lsame_(norm, "I")) *info = -1; + else if (L->nrow < 0 || L->nrow != L->ncol || + L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU) + *info = -2; + else if (U->nrow < 0 || U->nrow != U->ncol || + U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU) + *info = -3; + if (*info != 0) { + i = -(*info); + xerbla_("dgscon", &i); + return; + } + + /* Quick return if possible */ + *rcond = 0.; + if ( L->nrow == 0 || U->nrow == 0) { + *rcond = 1.; + return; + } + + work = doubleCalloc( 3*L->nrow ); + iwork = intMalloc( L->nrow ); + + + if ( !work || !iwork ) + ABORT_SuperLU("Malloc fails for work arrays in dgscon."); + + /* Estimate the norm of inv(A). */ + ainvnm = 0.; + if ( onenrm ) kase1 = 1; + else kase1 = 2; + kase = 0; + + do { + dlacon_(&L->nrow, &work[L->nrow], &work[0], &iwork[0], &ainvnm, &kase); + + if (kase == 0) break; + + if (kase == kase1) { + /* Multiply by inv(L). */ + sp_dtrsv("L", "No trans", "Unit", L, U, &work[0], stat, info); + + /* Multiply by inv(U). */ + sp_dtrsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info); + + } else { + + /* Multiply by inv(U'). */ + sp_dtrsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info); + + /* Multiply by inv(L'). */ + sp_dtrsv("L", "Transpose", "Unit", L, U, &work[0], stat, info); + + } + + } while ( kase != 0 ); + + /* Compute the estimate of the reciprocal condition number. */ + if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm; + + SUPERLU_FREE (work); + SUPERLU_FREE (iwork); + return; + +} /* dgscon */ + diff --git a/src/maths/SuperLU/dgsequ.c b/src/maths/SuperLU/dgsequ.c new file mode 100644 index 000000000..35754187a --- /dev/null +++ b/src/maths/SuperLU/dgsequ.c @@ -0,0 +1,195 @@ + +/*! @file dgsequ.c + * \brief Computes row and column scalings + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Modified from LAPACK routine DGEEQU
+ * 
+ */ +/* + * File name: dgsequ.c + * History: Modified from LAPACK routine DGEEQU + */ +#include +#include + + + +/*! \brief + * + *
+ * Purpose   
+ *   =======   
+ *
+ *   DGSEQU computes row and column scalings intended to equilibrate an   
+ *   M-by-N sparse matrix A and reduce its condition number. R returns the row
+ *   scale factors and C the column scale factors, chosen to try to make   
+ *   the largest element in each row and column of the matrix B with   
+ *   elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
+ *
+ *   R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
+ *   number and BIGNUM = largest safe number.  Use of these scaling   
+ *   factors is not guaranteed to reduce the condition number of A but   
+ *   works well in practice.   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   A       (input) SuperMatrix*
+ *           The matrix of dimension (A->nrow, A->ncol) whose equilibration
+ *           factors are to be computed. The type of A can be:
+ *           Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
+ *	    
+ *   R       (output) double*, size A->nrow
+ *           If INFO = 0 or INFO > M, R contains the row scale factors   
+ *           for A.
+ *	    
+ *   C       (output) double*, size A->ncol
+ *           If INFO = 0,  C contains the column scale factors for A.
+ *	    
+ *   ROWCND  (output) double*
+ *           If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
+ *           smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
+ *           AMAX is neither too large nor too small, it is not worth   
+ *           scaling by R.
+ *	    
+ *   COLCND  (output) double*
+ *           If INFO = 0, COLCND contains the ratio of the smallest   
+ *           C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+ *           worth scaling by C.
+ *	    
+ *   AMAX    (output) double*
+ *           Absolute value of largest matrix element.  If AMAX is very   
+ *           close to overflow or very close to underflow, the matrix   
+ *           should be scaled.
+ *	    
+ *   INFO    (output) int*
+ *           = 0:  successful exit   
+ *           < 0:  if INFO = -i, the i-th argument had an illegal value   
+ *           > 0:  if INFO = i,  and i is   
+ *                 <= A->nrow:  the i-th row of A is exactly zero   
+ *                 >  A->ncol:  the (i-M)-th column of A is exactly zero   
+ *
+ *   ===================================================================== 
+ * 
+ */ +void +dgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, + double *colcnd, double *amax, int *info) +{ + + + /* Local variables */ + NCformat *Astore; + double *Aval; + int i, j, irow; + double rcmin, rcmax; + double bignum, smlnum; + extern double dlamch_(char *); + + /* Test the input parameters. */ + *info = 0; + if ( A->nrow < 0 || A->ncol < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) { + i = -(*info); + xerbla_("dgsequ", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || A->ncol == 0 ) { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + + /* Get machine constants. */ + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + + /* Compute row scale factors. */ + for (i = 0; i < A->nrow; ++i) r[i] = 0.; + + /* Find the maximum element in each row. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (i = 0; i < A->nrow; ++i) { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); + } + *amax = rcmax; + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) { + *info = i + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* Compute column scale factors */ + for (j = 0; j < A->ncol; ++j) c[j] = 0.; + + /* Find the maximum element in each column, assuming the row + scalings computed above. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->ncol; ++j) { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); + } + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) { + *info = A->nrow + j + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + return; + +} /* dgsequ */ + + diff --git a/src/maths/SuperLU/dgsisx.c b/src/maths/SuperLU/dgsisx.c new file mode 100644 index 000000000..01883fd13 --- /dev/null +++ b/src/maths/SuperLU/dgsisx.c @@ -0,0 +1,727 @@ + +/*! @file dgsisx.c + * \brief Computes an approximate solutions of linear equations A*X=B or A'*X=B + * + *
+ * -- SuperLU routine (version 4.2) --
+ * Lawrence Berkeley National Laboratory.
+ * November, 2010
+ * August, 2011
+ * 
+ */ +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * DGSISX computes an approximate solutions of linear equations
+ * A*X=B or A'*X=B, using the ILU factorization from dgsitrf().
+ * An estimation of the condition number is provided. 
+ * The routine performs the following steps:
+ *
+ *   1. If A is stored column-wise (A->Stype = SLU_NC):
+ *  
+ *	1.1. If options->Equil = YES or options->RowPerm = LargeDiag, scaling
+ *	     factors are computed to equilibrate the system:
+ *	     options->Trans = NOTRANS:
+ *		 diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
+ *	     options->Trans = TRANS:
+ *		 (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+ *	     options->Trans = CONJ:
+ *		 (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+ *	     Whether or not the system will be equilibrated depends on the
+ *	     scaling of the matrix A, but if equilibration is used, A is
+ *	     overwritten by diag(R)*A*diag(C) and B by diag(R)*B
+ *	     (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans
+ *	     = TRANS or CONJ).
+ *
+ *	1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
+ *	     matrix that usually preserves sparsity.
+ *	     For more details of this step, see sp_preorder.c.
+ *
+ *	1.3. If options->Fact != FACTORED, the LU decomposition is used to
+ *	     factor the matrix A (after equilibration if options->Equil = YES)
+ *	     as Pr*A*Pc = L*U, with Pr determined by partial pivoting.
+ *
+ *	1.4. Compute the reciprocal pivot growth factor.
+ *
+ *	1.5. If some U(i,i) = 0, so that U is exactly singular, then the
+ *	     routine fills a small number on the diagonal entry, that is
+ *		U(i,i) = ||A(:,i)||_oo * options->ILU_FillTol ** (1 - i / n),
+ *	     and info will be increased by 1. The factored form of A is used
+ *	     to estimate the condition number of the preconditioner. If the
+ *	     reciprocal of the condition number is less than machine precision,
+ *	     info = A->ncol+1 is returned as a warning, but the routine still
+ *	     goes on to solve for X.
+ *
+ *	1.6. The system of equations is solved for X using the factored form
+ *	     of A.
+ *
+ *	1.7. options->IterRefine is not used
+ *
+ *	1.8. If equilibration was used, the matrix X is premultiplied by
+ *	     diag(C) (if options->Trans = NOTRANS) or diag(R)
+ *	     (if options->Trans = TRANS or CONJ) so that it solves the
+ *	     original system before equilibration.
+ *
+ *	1.9. options for ILU only
+ *	     1) If options->RowPerm = LargeDiag, MC64 is used to scale and
+ *		permute the matrix to an I-matrix, that is Pr*Dr*A*Dc has
+ *		entries of modulus 1 on the diagonal and off-diagonal entries
+ *		of modulus at most 1. If MC64 fails, dgsequ() is used to
+ *		equilibrate the system.
+ *              ( Default: LargeDiag )
+ *	     2) options->ILU_DropTol = tau is the threshold for dropping.
+ *		For L, it is used directly (for the whole row in a supernode);
+ *		For U, ||A(:,i)||_oo * tau is used as the threshold
+ *	        for the	i-th column.
+ *		If a secondary dropping rule is required, tau will
+ *	        also be used to compute the second threshold.
+ *              ( Default: 1e-4 )
+ *	     3) options->ILU_FillFactor = gamma, used as the initial guess
+ *		of memory growth.
+ *		If a secondary dropping rule is required, it will also
+ *              be used as an upper bound of the memory.
+ *              ( Default: 10 )
+ *	     4) options->ILU_DropRule specifies the dropping rule.
+ *		Option	      Meaning
+ *		======	      ===========
+ *		DROP_BASIC:   Basic dropping rule, supernodal based ILUTP(tau).
+ *		DROP_PROWS:   Supernodal based ILUTP(p,tau), p = gamma*nnz(A)/n.
+ *		DROP_COLUMN:  Variant of ILUTP(p,tau), for j-th column,
+ *			      p = gamma * nnz(A(:,j)).
+ *		DROP_AREA:    Variation of ILUTP, for j-th column, use
+ *			      nnz(F(:,1:j)) / nnz(A(:,1:j)) to control memory.
+ *		DROP_DYNAMIC: Modify the threshold tau during factorizaion:
+ *			      If nnz(L(:,1:j)) / nnz(A(:,1:j)) > gamma
+ *				  tau_L(j) := MIN(tau_0, tau_L(j-1) * 2);
+ *			      Otherwise
+ *				  tau_L(j) := MAX(tau_0, tau_L(j-1) / 2);
+ *			      tau_U(j) uses the similar rule.
+ *			      NOTE: the thresholds used by L and U are separate.
+ *		DROP_INTERP:  Compute the second dropping threshold by
+ *			      interpolation instead of sorting (default).
+ *			      In this case, the actual fill ratio is not
+ *			      guaranteed smaller than gamma.
+ *		DROP_PROWS, DROP_COLUMN and DROP_AREA are mutually exclusive.
+ *		( Default: DROP_BASIC | DROP_AREA )
+ *	     5) options->ILU_Norm is the criterion of measuring the magnitude
+ *		of a row in a supernode of L. ( Default is INF_NORM )
+ *		options->ILU_Norm	RowSize(x[1:n])
+ *		=================	===============
+ *		ONE_NORM		||x||_1 / n
+ *		TWO_NORM		||x||_2 / sqrt(n)
+ *		INF_NORM		max{|x[i]|}
+ *	     6) options->ILU_MILU specifies the type of MILU's variation.
+ *		= SILU: do not perform Modified ILU;
+ *		= SMILU_1 (not recommended):
+ *		    U(i,i) := U(i,i) + sum(dropped entries);
+ *		= SMILU_2:
+ *		    U(i,i) := U(i,i) + SGN(U(i,i)) * sum(dropped entries);
+ *		= SMILU_3:
+ *		    U(i,i) := U(i,i) + SGN(U(i,i)) * sum(|dropped entries|);
+ *		NOTE: Even SMILU_1 does not preserve the column sum because of
+ *		late dropping.
+ *              ( Default: SILU )
+ *	     7) options->ILU_FillTol is used as the perturbation when
+ *		encountering zero pivots. If some U(i,i) = 0, so that U is
+ *		exactly singular, then
+ *		   U(i,i) := ||A(:,i)|| * options->ILU_FillTol ** (1 - i / n).
+ *              ( Default: 1e-2 )
+ *
+ *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
+ *	to the transpose of A:
+ *
+ *	2.1. If options->Equil = YES or options->RowPerm = LargeDiag, scaling
+ *	     factors are computed to equilibrate the system:
+ *	     options->Trans = NOTRANS:
+ *		 diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
+ *	     options->Trans = TRANS:
+ *		 (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+ *	     options->Trans = CONJ:
+ *		 (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+ *	     Whether or not the system will be equilibrated depends on the
+ *	     scaling of the matrix A, but if equilibration is used, A' is
+ *	     overwritten by diag(R)*A'*diag(C) and B by diag(R)*B
+ *	     (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
+ *
+ *	2.2. Permute columns of transpose(A) (rows of A),
+ *	     forming transpose(A)*Pc, where Pc is a permutation matrix that
+ *	     usually preserves sparsity.
+ *	     For more details of this step, see sp_preorder.c.
+ *
+ *	2.3. If options->Fact != FACTORED, the LU decomposition is used to
+ *	     factor the transpose(A) (after equilibration if
+ *	     options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the
+ *	     permutation Pr determined by partial pivoting.
+ *
+ *	2.4. Compute the reciprocal pivot growth factor.
+ *
+ *	2.5. If some U(i,i) = 0, so that U is exactly singular, then the
+ *	     routine fills a small number on the diagonal entry, that is
+ *		 U(i,i) = ||A(:,i)||_oo * options->ILU_FillTol ** (1 - i / n).
+ *	     And info will be increased by 1. The factored form of A is used
+ *	     to estimate the condition number of the preconditioner. If the
+ *	     reciprocal of the condition number is less than machine precision,
+ *	     info = A->ncol+1 is returned as a warning, but the routine still
+ *	     goes on to solve for X.
+ *
+ *	2.6. The system of equations is solved for X using the factored form
+ *	     of transpose(A).
+ *
+ *	2.7. If options->IterRefine is not used.
+ *
+ *	2.8. If equilibration was used, the matrix X is premultiplied by
+ *	     diag(C) (if options->Trans = NOTRANS) or diag(R)
+ *	     (if options->Trans = TRANS or CONJ) so that it solves the
+ *	     original system before equilibration.
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *	   The structure defines the input parameters to control
+ *	   how the LU decomposition will be performed and how the
+ *	   system will be solved.
+ *
+ * A	   (input/output) SuperMatrix*
+ *	   Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *	   of the linear equations is A->nrow. Currently, the type of A can be:
+ *	   Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE.
+ *	   In the future, more general A may be handled.
+ *
+ *	   On entry, If options->Fact = FACTORED and equed is not 'N',
+ *	   then A must have been equilibrated by the scaling factors in
+ *	   R and/or C.
+ *	   On exit, A is not modified
+ *         if options->Equil = NO, or
+ *         if options->Equil = YES but equed = 'N' on exit, or
+ *         if options->RowPerm = NO.
+ *
+ *	   Otherwise, if options->Equil = YES and equed is not 'N',
+ *	   A is scaled as follows:
+ *	   If A->Stype = SLU_NC:
+ *	     equed = 'R':  A := diag(R) * A
+ *	     equed = 'C':  A := A * diag(C)
+ *	     equed = 'B':  A := diag(R) * A * diag(C).
+ *	   If A->Stype = SLU_NR:
+ *	     equed = 'R':  transpose(A) := diag(R) * transpose(A)
+ *	     equed = 'C':  transpose(A) := transpose(A) * diag(C)
+ *	     equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
+ *
+ *         If options->RowPerm = LargeDiag, MC64 is used to scale and permute
+ *            the matrix to an I-matrix, that is A is modified as follows:
+ *            P*Dr*A*Dc has entries of modulus 1 on the diagonal and 
+ *            off-diagonal entries of modulus at most 1. P is a permutation
+ *            obtained from MC64.
+ *            If MC64 fails, dgsequ() is used to equilibrate the system,
+ *            and A is scaled as above, but no permutation is involved.
+ *            On exit, A is restored to the orginal row numbering, so
+ *            Dr*A*Dc is returned.
+ *
+ * perm_c  (input/output) int*
+ *	   If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
+ *	   which defines the permutation matrix Pc; perm_c[i] = j means
+ *	   column i of A is in position j in A*Pc.
+ *	   On exit, perm_c may be overwritten by the product of the input
+ *	   perm_c and a permutation that postorders the elimination tree
+ *	   of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
+ *	   is already in postorder.
+ *
+ *	   If A->Stype = SLU_NR, column permutation vector of size A->nrow,
+ *	   which describes permutation of columns of transpose(A) 
+ *	   (rows of A) as described above.
+ *
+ * perm_r  (input/output) int*
+ *	   If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
+ *	   which defines the permutation matrix Pr, and is determined
+ *	   by MC64 first then followed by partial pivoting.
+ *         perm_r[i] = j means row i of A is in position j in Pr*A.
+ *
+ *	   If A->Stype = SLU_NR, permutation vector of size A->ncol, which
+ *	   determines permutation of rows of transpose(A)
+ *	   (columns of A) as described above.
+ *
+ *	   If options->Fact = SamePattern_SameRowPerm, the pivoting routine
+ *	   will try to use the input perm_r, unless a certain threshold
+ *	   criterion is violated. In that case, perm_r is overwritten by a
+ *	   new permutation determined by partial pivoting or diagonal
+ *	   threshold pivoting.
+ *	   Otherwise, perm_r is output argument.
+ *
+ * etree   (input/output) int*,  dimension (A->ncol)
+ *	   Elimination tree of Pc'*A'*A*Pc.
+ *	   If options->Fact != FACTORED and options->Fact != DOFACT,
+ *	   etree is an input argument, otherwise it is an output argument.
+ *	   Note: etree is a vector of parent pointers for a forest whose
+ *	   vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ * equed   (input/output) char*
+ *	   Specifies the form of equilibration that was done.
+ *	   = 'N': No equilibration.
+ *	   = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
+ *	   = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
+ *	   = 'B': Both row and column equilibration, i.e., A was replaced 
+ *		  by diag(R)*A*diag(C).
+ *	   If options->Fact = FACTORED, equed is an input argument,
+ *	   otherwise it is an output argument.
+ *
+ * R	   (input/output) double*, dimension (A->nrow)
+ *	   The row scale factors for A or transpose(A).
+ *	   If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
+ *	       (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
+ *	   If equed = 'N' or 'C', R is not accessed.
+ *	   If options->Fact = FACTORED, R is an input argument,
+ *	       otherwise, R is output.
+ *	   If options->Fact = FACTORED and equed = 'R' or 'B', each element
+ *	       of R must be positive.
+ *
+ * C	   (input/output) double*, dimension (A->ncol)
+ *	   The column scale factors for A or transpose(A).
+ *	   If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
+ *	       (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
+ *	   If equed = 'N' or 'R', C is not accessed.
+ *	   If options->Fact = FACTORED, C is an input argument,
+ *	       otherwise, C is output.
+ *	   If options->Fact = FACTORED and equed = 'C' or 'B', each element
+ *	       of C must be positive.
+ *
+ * L	   (output) SuperMatrix*
+ *	   The factor L from the factorization
+ *	       Pr*A*Pc=L*U		(if A->Stype SLU_= NC) or
+ *	       Pr*transpose(A)*Pc=L*U	(if A->Stype = SLU_NR).
+ *	   Uses compressed row subscripts storage for supernodes, i.e.,
+ *	   L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
+ *
+ * U	   (output) SuperMatrix*
+ *	   The factor U from the factorization
+ *	       Pr*A*Pc=L*U		(if A->Stype = SLU_NC) or
+ *	       Pr*transpose(A)*Pc=L*U	(if A->Stype = SLU_NR).
+ *	   Uses column-wise storage scheme, i.e., U has types:
+ *	   Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
+ *
+ * work    (workspace/output) void*, size (lwork) (in bytes)
+ *	   User supplied workspace, should be large enough
+ *	   to hold data structures for factors L and U.
+ *	   On exit, if fact is not 'F', L and U point to this array.
+ *
+ * lwork   (input) int
+ *	   Specifies the size of work array in bytes.
+ *	   = 0:  allocate space internally by system malloc;
+ *	   > 0:  use user-supplied work array of length lwork in bytes,
+ *		 returns error if space runs out.
+ *	   = -1: the routine guesses the amount of space needed without
+ *		 performing the factorization, and returns it in
+ *		 mem_usage->total_needed; no other side effects.
+ *
+ *	   See argument 'mem_usage' for memory usage statistics.
+ *
+ * B	   (input/output) SuperMatrix*
+ *	   B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
+ *	   On entry, the right hand side matrix.
+ *	   If B->ncol = 0, only LU decomposition is performed, the triangular
+ *			   solve is skipped.
+ *	   On exit,
+ *	      if equed = 'N', B is not modified; otherwise
+ *	      if A->Stype = SLU_NC:
+ *		 if options->Trans = NOTRANS and equed = 'R' or 'B',
+ *		    B is overwritten by diag(R)*B;
+ *		 if options->Trans = TRANS or CONJ and equed = 'C' of 'B',
+ *		    B is overwritten by diag(C)*B;
+ *	      if A->Stype = SLU_NR:
+ *		 if options->Trans = NOTRANS and equed = 'C' or 'B',
+ *		    B is overwritten by diag(C)*B;
+ *		 if options->Trans = TRANS or CONJ and equed = 'R' of 'B',
+ *		    B is overwritten by diag(R)*B.
+ *
+ * X	   (output) SuperMatrix*
+ *	   X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
+ *	   If info = 0 or info = A->ncol+1, X contains the solution matrix
+ *	   to the original system of equations. Note that A and B are modified
+ *	   on exit if equed is not 'N', and the solution to the equilibrated
+ *	   system is inv(diag(C))*X if options->Trans = NOTRANS and
+ *	   equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C'
+ *	   and equed = 'R' or 'B'.
+ *
+ * recip_pivot_growth (output) double*
+ *	   The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
+ *	   The infinity norm is used. If recip_pivot_growth is much less
+ *	   than 1, the stability of the LU factorization could be poor.
+ *
+ * rcond   (output) double*
+ *	   The estimate of the reciprocal condition number of the matrix A
+ *	   after equilibration (if done). If rcond is less than the machine
+ *	   precision (in particular, if rcond = 0), the matrix is singular
+ *	   to working precision. This condition is indicated by a return
+ *	   code of info > 0.
+ *
+ * mem_usage (output) mem_usage_t*
+ *	   Record the memory usage statistics, consisting of following fields:
+ *	   - for_lu (float)
+ *	     The amount of space used in bytes for L\U data structures.
+ *	   - total_needed (float)
+ *	     The amount of space needed in bytes to perform factorization.
+ *	   - expansions (int)
+ *	     The number of memory expansions during the LU factorization.
+ *
+ * stat   (output) SuperLUStat_t*
+ *	  Record the statistics on runtime and floating-point operation count.
+ *	  See slu_util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ *	   > 0: if info = i, and i is
+ *		<= A->ncol: number of zero pivots. They are replaced by small
+ *		      entries due to options->ILU_FillTol.
+ *		= A->ncol+1: U is nonsingular, but RCOND is less than machine
+ *		      precision, meaning that the matrix is singular to
+ *		      working precision. Nevertheless, the solution and
+ *		      error bounds are computed because there are a number
+ *		      of situations where the computed solution can be more
+ *		      accurate than the value of RCOND would suggest.
+ *		> A->ncol+1: number of bytes allocated when memory allocation
+ *		      failure occurred, plus A->ncol.
+ * 
+ */ + +void +dgsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + int *etree, char *equed, double *R, double *C, + SuperMatrix *L, SuperMatrix *U, void *work, int lwork, + SuperMatrix *B, SuperMatrix *X, + double *recip_pivot_growth, double *rcond, + mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info) +{ + + DNformat *Bstore, *Xstore; + double *Bmat, *Xmat; + int ldb, ldx, nrhs, n; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int colequ, equil, nofact, notran, rowequ, permc_spec, mc64; + trans_t trant; + char norm[1]; + int i, j, info1; + double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; + int relax, panel_size; + double diag_pivot_thresh; + double t0; /* temporary time */ + double *utime; + + int *perm = NULL; /* permutation returned from MC64 */ + + /* External functions */ + extern double dlangs(char *, SuperMatrix *); + + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + n = B->nrow; + + *info = 0; + nofact = (options->Fact != FACTORED); + equil = (options->Equil == YES_SuperLU); + notran = (options->Trans == NOTRANS); + mc64 = (options->RowPerm == LargeDiag); + if ( nofact ) { + *(unsigned char *)equed = 'N'; + rowequ = FALSE; + colequ = FALSE; + } else { + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + smlnum = dlamch_("Safe minimum"); + bignum = 1. / smlnum; + } + + /* Test the input parameters */ + if (options->Fact != DOFACT && options->Fact != SamePattern && + options->Fact != SamePattern_SameRowPerm && + options->Fact != FACTORED && + options->Trans != NOTRANS && options->Trans != TRANS && + options->Trans != CONJ && + options->Equil != NO_SuperLU && options->Equil != YES_SuperLU) + *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -2; + else if (options->Fact == FACTORED && + !(rowequ || colequ || lsame_(equed, "N"))) + *info = -6; + else { + if (rowequ) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, R[j]); + rcmax = SUPERLU_MAX(rcmax, R[j]); + } + if (rcmin <= 0.) *info = -7; + else if ( A->nrow > 0) + rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else rowcnd = 1.; + } + if (colequ && *info == 0) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, C[j]); + rcmax = SUPERLU_MAX(rcmax, C[j]); + } + if (rcmin <= 0.) *info = -8; + else if (A->nrow > 0) + colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else colcnd = 1.; + } + if (*info == 0) { + if ( lwork < -1 ) *info = -12; + else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_D || + B->Mtype != SLU_GE ) + *info = -13; + else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || + (B->ncol != 0 && B->ncol != X->ncol) || + X->Stype != SLU_DN || + X->Dtype != SLU_D || X->Mtype != SLU_GE ) + *info = -14; + } + } + if (*info != 0) { + i = -(*info); + xerbla_("dgsisx", &i); + return; + } + + /* Initialization for factor parameters */ + panel_size = sp_ienv(1); + relax = sp_ienv(2); + diag_pivot_thresh = options->DiagPivotThresh; + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + if ( notran ) { /* Reverse the transpose argument. */ + trant = TRANS; + notran = 0; + } else { + trant = NOTRANS; + notran = 1; + } + } else { /* A->Stype == SLU_NC */ + trant = options->Trans; + AA = A; + } + + if ( nofact ) { + register int i, j; + NCformat *Astore = AA->Store; + int nnz = Astore->nnz; + int *colptr = Astore->colptr; + int *rowind = Astore->rowind; + double *nzval = (double *)Astore->nzval; + + if ( mc64 ) { + t0 = SuperLU_timer_(); + if ((perm = intMalloc(n)) == NULL) + ABORT_SuperLU("SUPERLU_MALLOC fails for perm[]"); + + info1 = dldperm(5, n, nnz, colptr, rowind, nzval, perm, R, C); + + if (info1 != 0) { /* MC64 fails, call dgsequ() later */ + mc64 = 0; + SUPERLU_FREE(perm); + perm = NULL; + } else { + if ( equil ) { + rowequ = colequ = 1; + for (i = 0; i < n; i++) { + R[i] = exp(R[i]); + C[i] = exp(C[i]); + } + /* scale the matrix */ + for (j = 0; j < n; j++) { + for (i = colptr[j]; i < colptr[j + 1]; i++) { + nzval[i] *= R[rowind[i]] * C[j]; + } + } + *equed = 'B'; + } + + /* permute the matrix */ + for (j = 0; j < n; j++) { + for (i = colptr[j]; i < colptr[j + 1]; i++) { + /*nzval[i] *= R[rowind[i]] * C[j];*/ + rowind[i] = perm[rowind[i]]; + } + } + } + utime[EQUIL] = SuperLU_timer_() - t0; + } + + if ( !mc64 & equil ) { /* Only perform equilibration, no row perm */ + t0 = SuperLU_timer_(); + /* Compute row and column scalings to equilibrate the matrix A. */ + dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); + + if ( info1 == 0 ) { + /* Equilibrate matrix A. */ + dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + } + utime[EQUIL] = SuperLU_timer_() - t0; + } + } + + + if ( nofact ) { + + t0 = SuperLU_timer_(); + /* + * Gnet column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t0; + + t0 = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t0; + + /* Compute the LU factorization of A*Pc. */ + t0 = SuperLU_timer_(); + dgsitrf(options, &AC, relax, panel_size, etree, work, lwork, + perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t0; + + if ( lwork == -1 ) { + mem_usage->total_needed = *info - A->ncol; + return; + } + + if ( mc64 ) { /* Fold MC64's perm[] into perm_r[]. */ + NCformat *Astore = AA->Store; + int nnz = Astore->nnz, *rowind = Astore->rowind; + int *perm_tmp, *iperm; + if ((perm_tmp = intMalloc(2*n)) == NULL) + ABORT_SuperLU("SUPERLU_MALLOC fails for perm_tmp[]"); + iperm = perm_tmp + n; + for (i = 0; i < n; ++i) perm_tmp[i] = perm_r[perm[i]]; + for (i = 0; i < n; ++i) { + perm_r[i] = perm_tmp[i]; + iperm[perm[i]] = i; + } + + /* Restore A's original row indices. */ + for (i = 0; i < nnz; ++i) rowind[i] = iperm[rowind[i]]; + + SUPERLU_FREE(perm); /* MC64 permutation */ + SUPERLU_FREE(perm_tmp); + } + } + + if ( options->PivotGrowth ) { + if ( *info > 0 ) return; + + /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ + *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U); + } + + if ( options->ConditionNumber ) { + /* Estimate the reciprocal of the condition number of A. */ + t0 = SuperLU_timer_(); + if ( notran ) { + *(unsigned char *)norm = '1'; + } else { + *(unsigned char *)norm = 'I'; + } + anorm = dlangs(norm, AA); + dgscon(norm, L, U, anorm, rcond, stat, &info1); + utime[RCOND] = SuperLU_timer_() - t0; + } + + if ( nrhs > 0 ) { /* Solve the system */ + double *rhs_work; + + /* Scale and permute the right-hand side if equilibration + and permutation from MC64 were performed. */ + if ( notran ) { + if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) + Bmat[i + j*ldb] *= R[i]; + } + } else if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) { + Bmat[i + j*ldb] *= C[i]; + } + } + + /* Compute the solution matrix X. */ + for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ + for (i = 0; i < B->nrow; i++) + Xmat[i + j*ldx] = Bmat[i + j*ldb]; + + t0 = SuperLU_timer_(); + dgstrs (trant, L, U, perm_c, perm_r, X, stat, &info1); + utime[SOLVE] = SuperLU_timer_() - t0; + + /* Transform the solution matrix X to a solution of the original + system. */ + if ( notran ) { + if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) { + Xmat[i + j*ldx] *= C[i]; + } + } + } else { /* transposed system */ + if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) { + Xmat[i + j*ldx] *= R[i]; + } + } + } + + } /* end if nrhs > 0 */ + + if ( options->ConditionNumber ) { + /* The matrix is singular to working precision. */ + if ( *rcond < dlamch_("E") && *info == 0) *info = A->ncol + 1; + } + + if ( nofact ) { + ilu_dQuerySpace(L, U, mem_usage); + Destroy_CompCol_Permuted(&AC); + } + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/dgsitrf.c b/src/maths/SuperLU/dgsitrf.c new file mode 100644 index 000000000..88a184c71 --- /dev/null +++ b/src/maths/SuperLU/dgsitrf.c @@ -0,0 +1,639 @@ + +/*! @file dgsitrf.c + * \brief Computes an ILU factorization of a general sparse matrix + * + *
+ * -- SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Laboratory.
+ * June 30, 2009
+ *
+ * 
+ */ + +#include + +#ifdef DEBUG +int num_drop_L; +#endif + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * DGSITRF computes an ILU factorization of a general sparse m-by-n
+ * matrix A using partial pivoting with row interchanges.
+ * The factorization has the form
+ *     Pr * A = L * U
+ * where Pr is a row permutation matrix, L is lower triangular with unit
+ * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper
+ * triangular (upper trapezoidal if A->nrow < A->ncol).
+ *
+ * See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *	   The structure defines the input parameters to control
+ *	   how the ILU decomposition will be performed.
+ *
+ * A	    (input) SuperMatrix*
+ *	    Original matrix A, permuted by columns, of dimension
+ *	    (A->nrow, A->ncol). The type of A can be:
+ *	    Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE.
+ *
+ * relax    (input) int
+ *	    To control degree of relaxing supernodes. If the number
+ *	    of nodes (columns) in a subtree of the elimination tree is less
+ *	    than relax, this subtree is considered as one supernode,
+ *	    regardless of the row structures of those columns.
+ *
+ * panel_size (input) int
+ *	    A panel consists of at most panel_size consecutive columns.
+ *
+ * etree    (input) int*, dimension (A->ncol)
+ *	    Elimination tree of A'*A.
+ *	    Note: etree is a vector of parent pointers for a forest whose
+ *	    vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *	    On input, the columns of A should be permuted so that the
+ *	    etree is in a certain postorder.
+ *
+ * work     (input/output) void*, size (lwork) (in bytes)
+ *	    User-supplied work space and space for the output data structures.
+ *	    Not referenced if lwork = 0;
+ *
+ * lwork   (input) int
+ *	   Specifies the size of work array in bytes.
+ *	   = 0:  allocate space internally by system malloc;
+ *	   > 0:  use user-supplied work array of length lwork in bytes,
+ *		 returns error if space runs out.
+ *	   = -1: the routine guesses the amount of space needed without
+ *		 performing the factorization, and returns it in
+ *		 *info; no other side effects.
+ *
+ * perm_c   (input) int*, dimension (A->ncol)
+ *	    Column permutation vector, which defines the
+ *	    permutation matrix Pc; perm_c[i] = j means column i of A is
+ *	    in position j in A*Pc.
+ *	    When searching for diagonal, perm_c[*] is applied to the
+ *	    row subscripts of A, so that diagonal threshold pivoting
+ *	    can find the diagonal of A, rather than that of A*Pc.
+ *
+ * perm_r   (input/output) int*, dimension (A->nrow)
+ *	    Row permutation vector which defines the permutation matrix Pr,
+ *	    perm_r[i] = j means row i of A is in position j in Pr*A.
+ *	    If options->Fact = SamePattern_SameRowPerm, the pivoting routine
+ *	       will try to use the input perm_r, unless a certain threshold
+ *	       criterion is violated. In that case, perm_r is overwritten by
+ *	       a new permutation determined by partial pivoting or diagonal
+ *	       threshold pivoting.
+ *	    Otherwise, perm_r is output argument;
+ *
+ * L	    (output) SuperMatrix*
+ *	    The factor L from the factorization Pr*A=L*U; use compressed row
+ *	    subscripts storage for supernodes, i.e., L has type:
+ *	    Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
+ *
+ * U	    (output) SuperMatrix*
+ *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
+ *	    storage scheme, i.e., U has types: Stype = SLU_NC,
+ *	    Dtype = SLU_D, Mtype = SLU_TRU.
+ *
+ * stat     (output) SuperLUStat_t*
+ *	    Record the statistics on runtime and floating-point operation count.
+ *	    See slu_util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info     (output) int*
+ *	    = 0: successful exit
+ *	    < 0: if info = -i, the i-th argument had an illegal value
+ *	    > 0: if info = i, and i is
+ *	       <= A->ncol: number of zero pivots. They are replaced by small
+ *		  entries according to options->ILU_FillTol.
+ *	       > A->ncol: number of bytes allocated when memory allocation
+ *		  failure occurred, plus A->ncol. If lwork = -1, it is
+ *		  the estimated amount of space needed, plus A->ncol.
+ *
+ * ======================================================================
+ *
+ * Local Working Arrays:
+ * ======================
+ *   m = number of rows in the matrix
+ *   n = number of columns in the matrix
+ *
+ *   marker[0:3*m-1]: marker[i] = j means that node i has been
+ *	reached when working on column j.
+ *	Storage: relative to original row subscripts
+ *	NOTE: There are 4 of them:
+ *	      marker/marker1 are used for panel dfs, see (ilu_)dpanel_dfs.c;
+ *	      marker2 is used for inner-factorization, see (ilu)_dcolumn_dfs.c;
+ *	      marker_relax(has its own space) is used for relaxed supernodes.
+ *
+ *   parent[0:m-1]: parent vector used during dfs
+ *	Storage: relative to new row subscripts
+ *
+ *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs)
+ *	unexplored neighbor of i in lsub[*]
+ *
+ *   segrep[0:nseg-1]: contains the list of supernodal representatives
+ *	in topological order of the dfs. A supernode representative is the
+ *	last column of a supernode.
+ *	The maximum size of segrep[] is n.
+ *
+ *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a
+ *	supernodal representative r, repfnz[r] is the location of the first
+ *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
+ *	indicates the supernode r has been explored.
+ *	NOTE: There are W of them, each used for one column of a panel.
+ *
+ *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below
+ *	the panel diagonal. These are filled in during dpanel_dfs(), and are
+ *	used later in the inner LU factorization within the panel.
+ *	panel_lsub[]/dense[] pair forms the SPA data structure.
+ *	NOTE: There are W of them.
+ *
+ *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
+ *		   NOTE: there are W of them.
+ *
+ *   tempv[0:*]: real temporary used for dense numeric kernels;
+ *	The size of this array is defined by NUM_TEMPV() in slu_util.h.
+ *	It is also used by the dropping routine ilu_ddrop_row().
+ * 
+ */ + +void +dgsitrf(superlu_options_t *options, SuperMatrix *A, int relax, int panel_size, + int *etree, void *work, int lwork, int *perm_c, int *perm_r, + SuperMatrix *L, SuperMatrix *U, SuperLUStat_t *stat, int *info) +{ + /* Local working arrays */ + NCPformat *Astore; + int *iperm_r = NULL; /* inverse of perm_r; used when + options->Fact == SamePattern_SameRowPerm */ + int *iperm_c; /* inverse of perm_c */ + int *swap, *iswap; /* swap is used to store the row permutation + during the factorization. Initially, it is set + to iperm_c (row indeces of Pc*A*Pc'). + iswap is the inverse of swap. After the + factorization, it is equal to perm_r. */ + int *iwork; + double *dwork; + int *segrep, *repfnz, *parent, *xplore; + int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ + int *marker, *marker_relax; + double *dense, *tempv; + int *relax_end, *relax_fsupc; + double *a; + int *asub; + int *xa_begin, *xa_end; + int *xsup, *supno; + int *xlsub, *xlusup, *xusub; + int nzlumax; + double *amax; + double drop_sum; + double alpha, omega; /* used in MILU, mimicing DRIC */ + static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ + double *dwork2; /* used by the second dropping rule */ + + /* Local scalars */ + fact_t fact = options->Fact; + double diag_pivot_thresh = options->DiagPivotThresh; + double drop_tol = options->ILU_DropTol; /* tau */ + double fill_ini = options->ILU_FillTol; /* tau^hat */ + double gamma = options->ILU_FillFactor; + int drop_rule = options->ILU_DropRule; + milu_t milu = options->ILU_MILU; + double fill_tol; + int pivrow; /* pivotal row number in the original matrix A */ + int nseg1; /* no of segments in U-column above panel row jcol */ + int nseg; /* no of segments in each U-column */ + register int jcol; + register int kcol; /* end column of a relaxed snode */ + register int icol; + register int i, k, jj, new_next, iinfo; + int m, n, min_mn, jsupno, fsupc, nextlu, nextu; + int w_def; /* upper bound on panel width */ + int usepr, iperm_r_allocated = 0; + int nnzL, nnzU; + int *panel_histo = stat->panel_histo; + flops_t *ops = stat->ops; + + int last_drop;/* the last column which the dropping rules applied */ + int quota; + int nnzAj; /* number of nonzeros in A(:,1:j) */ + int nnzLj, nnzUj; + double tol_L = drop_tol, tol_U = drop_tol; + double zero = 0.0; + double one = 1.0; + + /* Executable */ + iinfo = 0; + m = A->nrow; + n = A->ncol; + min_mn = SUPERLU_MIN(m, n); + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + + /* Allocate storage common to the factor routines */ + *info = dLUMemInit(fact, work, lwork, m, n, Astore->nnz, panel_size, + gamma, L, U, &Glu, &iwork, &dwork); + if ( *info ) return; + + xsup = Glu.xsup; + supno = Glu.supno; + xlsub = Glu.xlsub; + xlusup = Glu.xlusup; + xusub = Glu.xusub; + + SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, + &repfnz, &panel_lsub, &marker_relax, &marker); + dSetRWork(m, panel_size, dwork, &dense, &tempv); + + usepr = (fact == SamePattern_SameRowPerm); + if ( usepr ) { + /* Compute the inverse of perm_r */ + iperm_r = (int *) intMalloc(m); + for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; + iperm_r_allocated = 1; + } + + iperm_c = (int *) intMalloc(n); + for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; + swap = (int *)intMalloc(n); + for (k = 0; k < n; k++) swap[k] = iperm_c[k]; + iswap = (int *)intMalloc(n); + for (k = 0; k < n; k++) iswap[k] = perm_c[k]; + amax = (double *) doubleMalloc(panel_size); + if (drop_rule & DROP_SECONDARY) + dwork2 = (double *)doubleMalloc(n); + else + dwork2 = NULL; + + nnzAj = 0; + nnzLj = 0; + nnzUj = 0; + last_drop = SUPERLU_MAX(min_mn - 2 * sp_ienv(7), (int)(min_mn * 0.95)); + alpha = pow((double)n, -1.0 / options->ILU_MILU_Dim); + + /* Identify relaxed snodes */ + relax_end = (int *) intMalloc(n); + relax_fsupc = (int *) intMalloc(n); + if ( options->SymmetricMode == YES_SuperLU ) + ilu_heap_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); + else + ilu_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); + + ifill (perm_r, m, EMPTY); + ifill (marker, m * NO_MARKER, EMPTY); + supno[0] = -1; + xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; + w_def = panel_size; + + /* Mark the rows used by relaxed supernodes */ + ifill (marker_relax, m, EMPTY); + i = mark_relax(m, relax_end, relax_fsupc, xa_begin, xa_end, + asub, marker_relax); +#if ( PRNTlevel >= 1) + printf("%d relaxed supernodes.\n", i); +#endif + + /* + * Work on one "panel" at a time. A panel is one of the following: + * (a) a relaxed supernode at the bottom of the etree, or + * (b) panel_size contiguous columns, defined by the user + */ + for (jcol = 0; jcol < min_mn; ) { + + if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ + kcol = relax_end[jcol]; /* end of the relaxed snode */ + panel_histo[kcol-jcol+1]++; + + /* Drop small rows in the previous supernode. */ + if (jcol > 0 && jcol < last_drop) { + int first = xsup[supno[jcol - 1]]; + int last = jcol - 1; + int quota; + + /* Compute the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * (m - first) / m + * (last - first + 1); + else if (drop_rule & DROP_COLUMN) { + int i; + quota = 0; + for (i = first; i <= last; i++) + quota += xa_end[i] - xa_begin[i]; + quota = gamma * quota * (m - first) / m; + } else if (drop_rule & DROP_AREA) + quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + - nnzLj; + else + quota = m * n; + fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / min_mn); + + /* Drop small rows */ + i = ilu_ddrop_row(options, first, last, tol_L, quota, &nnzLj, + &fill_tol, &Glu, tempv, dwork2, 0); + /* Reset the parameters */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + < nnzLj) + tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); + else + tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); + } + if (fill_tol < 0) iinfo -= (int)fill_tol; +#ifdef DEBUG + num_drop_L += i * (last - first + 1); +#endif + } + + /* -------------------------------------- + * Factorize the relaxed supernode(jcol:kcol) + * -------------------------------------- */ + /* Determine the union of the row structure of the snode */ + if ( (*info = ilu_dsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, + marker, &Glu)) != 0 ) + return; + + nextu = xusub[jcol]; + nextlu = xlusup[jcol]; + jsupno = supno[jcol]; + fsupc = xsup[jsupno]; + new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); + nzlumax = Glu.nzlumax; + while ( new_next > nzlumax ) { + if ((*info = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu))) + return; + } + + for (icol = jcol; icol <= kcol; icol++) { + xusub[icol+1] = nextu; + + amax[0] = 0.0; + /* Scatter into SPA dense[*] */ + for (k = xa_begin[icol]; k < xa_end[icol]; k++) { + register double tmp = fabs(a[k]); + if (tmp > amax[0]) amax[0] = tmp; + dense[asub[k]] = a[k]; + } + nnzAj += xa_end[icol] - xa_begin[icol]; + if (amax[0] == 0.0) { + amax[0] = fill_ini; +#if ( PRNTlevel >= 1) + printf("Column %d is entirely zero!\n", icol); + fflush(stdout); +#endif + } + + /* Numeric update within the snode */ + dsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); + + if (usepr) pivrow = iperm_r[icol]; + fill_tol = pow(fill_ini, 1.0 - (double)icol / (double)min_mn); + if ( (*info = ilu_dpivotL(icol, diag_pivot_thresh, &usepr, + perm_r, iperm_c[icol], swap, iswap, + marker_relax, &pivrow, + amax[0] * fill_tol, milu, zero, + &Glu, stat)) ) { + iinfo++; + marker[pivrow] = kcol; + } + + } + + jcol = kcol + 1; + + } else { /* Work on one panel of panel_size columns */ + + /* Adjust panel_size so that a panel won't overlap with the next + * relaxed snode. + */ + panel_size = w_def; + for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) + if ( relax_end[k] != EMPTY ) { + panel_size = k - jcol; + break; + } + if ( k == min_mn ) panel_size = min_mn - jcol; + panel_histo[panel_size]++; + + /* symbolic factor on a panel of columns */ + ilu_dpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, + dense, amax, panel_lsub, segrep, repfnz, + marker, parent, xplore, &Glu); + + /* numeric sup-panel updates in topological order */ + dpanel_bmod(m, panel_size, jcol, nseg1, dense, + tempv, segrep, repfnz, &Glu, stat); + + /* Sparse LU within the panel, and below panel diagonal */ + for (jj = jcol; jj < jcol + panel_size; jj++) { + + k = (jj - jcol) * m; /* column index for w-wide arrays */ + + nseg = nseg1; /* Begin after all the panel segments */ + + nnzAj += xa_end[jj] - xa_begin[jj]; + + if ((*info = ilu_dcolumn_dfs(m, jj, perm_r, &nseg, + &panel_lsub[k], segrep, &repfnz[k], + marker, parent, xplore, &Glu))) + return; + + /* Numeric updates */ + if ((*info = dcolumn_bmod(jj, (nseg - nseg1), &dense[k], + tempv, &segrep[nseg1], &repfnz[k], + jcol, &Glu, stat)) != 0) return; + + /* Make a fill-in position if the column is entirely zero */ + if (xlsub[jj + 1] == xlsub[jj]) { + register int i, row; + int nextl; + int nzlmax = Glu.nzlmax; + int *lsub = Glu.lsub; + int *marker2 = marker + 2 * m; + + /* Allocate memory */ + nextl = xlsub[jj] + 1; + if (nextl >= nzlmax) { + int error = dLUMemXpand(jj, nextl, LSUB, &nzlmax, &Glu); + if (error) { *info = error; return; } + lsub = Glu.lsub; + } + xlsub[jj + 1]++; + assert(xlusup[jj]==xlusup[jj+1]); + xlusup[jj + 1]++; + Glu.lusup[xlusup[jj]] = zero; + + /* Choose a row index (pivrow) for fill-in */ + for (i = jj; i < n; i++) + if (marker_relax[swap[i]] <= jj) break; + row = swap[i]; + marker2[row] = jj; + lsub[xlsub[jj]] = row; +#ifdef DEBUG + printf("Fill col %d.\n", jj); + fflush(stdout); +#endif + } + + /* Computer the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * jj / m; + else if (drop_rule & DROP_COLUMN) + quota = gamma * (xa_end[jj] - xa_begin[jj]) * + (jj + 1) / m; + else if (drop_rule & DROP_AREA) + quota = gamma * 0.9 * nnzAj * 0.5 - nnzUj; + else + quota = m; + + /* Copy the U-segments to ucol[*] and drop small entries */ + if ((*info = ilu_dcopy_to_ucol(jj, nseg, segrep, &repfnz[k], + perm_r, &dense[k], drop_rule, + milu, amax[jj - jcol] * tol_U, + quota, &drop_sum, &nnzUj, &Glu, + dwork2)) != 0) + return; + + /* Reset the dropping threshold if required */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * 0.9 * nnzAj * 0.5 < nnzLj) + tol_U = SUPERLU_MIN(1.0, tol_U * 2.0); + else + tol_U = SUPERLU_MAX(drop_tol, tol_U * 0.5); + } + + if (drop_sum != zero) + { + if (drop_sum > zero) + omega = SUPERLU_MIN(2.0 * (1.0 - alpha) + * amax[jj - jcol] / drop_sum, one); + else + omega = SUPERLU_MAX(2.0 * (1.0 - alpha) + * amax[jj - jcol] / drop_sum, -one); + drop_sum *= omega; + } + if (usepr) pivrow = iperm_r[jj]; + fill_tol = pow(fill_ini, 1.0 - (double)jj / (double)min_mn); + if ( (*info = ilu_dpivotL(jj, diag_pivot_thresh, &usepr, perm_r, + iperm_c[jj], swap, iswap, + marker_relax, &pivrow, + amax[jj - jcol] * fill_tol, milu, + drop_sum, &Glu, stat)) ) { + iinfo++; + marker[m + pivrow] = jj; + marker[2 * m + pivrow] = jj; + } + + /* Reset repfnz[] for this column */ + resetrep_col (nseg, segrep, &repfnz[k]); + + /* Start a new supernode, drop the previous one */ + if (jj > 0 && supno[jj] > supno[jj - 1] && jj < last_drop) { + int first = xsup[supno[jj - 1]]; + int last = jj - 1; + int quota; + + /* Compute the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * (m - first) / m + * (last - first + 1); + else if (drop_rule & DROP_COLUMN) { + int i; + quota = 0; + for (i = first; i <= last; i++) + quota += xa_end[i] - xa_begin[i]; + quota = gamma * quota * (m - first) / m; + } else if (drop_rule & DROP_AREA) + quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) + / m) - nnzLj; + else + quota = m * n; + fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / + (double)min_mn); + + /* Drop small rows */ + i = ilu_ddrop_row(options, first, last, tol_L, quota, + &nnzLj, &fill_tol, &Glu, tempv, dwork2, + 1); + + /* Reset the parameters */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + < nnzLj) + tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); + else + tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); + } + if (fill_tol < 0) iinfo -= (int)fill_tol; +#ifdef DEBUG + num_drop_L += i * (last - first + 1); +#endif + } /* if start a new supernode */ + + } /* for */ + + jcol += panel_size; /* Move to the next panel */ + + } /* else */ + + } /* for */ + + *info = iinfo; + + if ( m > n ) { + k = 0; + for (i = 0; i < m; ++i) + if ( perm_r[i] == EMPTY ) { + perm_r[i] = n + k; + ++k; + } + } + + ilu_countnz(min_mn, &nnzL, &nnzU, &Glu); + fixupL(min_mn, perm_r, &Glu); + + dLUWorkFree(iwork, dwork, &Glu); /* Free work space and compress storage */ + + if ( fact == SamePattern_SameRowPerm ) { + /* L and U structures may have changed due to possibly different + pivoting, even though the storage is available. + There could also be memory expansions, so the array locations + may have changed, */ + ((SCformat *)L->Store)->nnz = nnzL; + ((SCformat *)L->Store)->nsuper = Glu.supno[n]; + ((SCformat *)L->Store)->nzval = Glu.lusup; + ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; + ((SCformat *)L->Store)->rowind = Glu.lsub; + ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; + ((NCformat *)U->Store)->nnz = nnzU; + ((NCformat *)U->Store)->nzval = Glu.ucol; + ((NCformat *)U->Store)->rowind = Glu.usub; + ((NCformat *)U->Store)->colptr = Glu.xusub; + } else { + dCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, + Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, + Glu.xsup, SLU_SC, SLU_D, SLU_TRLU); + dCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, + Glu.usub, Glu.xusub, SLU_NC, SLU_D, SLU_TRU); + } + + ops[FACT] += ops[TRSV] + ops[GEMV]; + stat->expansions = --(Glu.num_expansions); + + if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); + SUPERLU_FREE (iperm_c); + SUPERLU_FREE (relax_end); + SUPERLU_FREE (swap); + SUPERLU_FREE (iswap); + SUPERLU_FREE (relax_fsupc); + SUPERLU_FREE (amax); + if ( dwork2 ) SUPERLU_FREE (dwork2); + +} diff --git a/src/maths/SuperLU/dgsrfs.c b/src/maths/SuperLU/dgsrfs.c new file mode 100644 index 000000000..d24619f85 --- /dev/null +++ b/src/maths/SuperLU/dgsrfs.c @@ -0,0 +1,452 @@ + +/*! @file dgsrfs.c + * \brief Improves computed solution to a system of inear equations + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Modified from lapack routine DGERFS
+ * 
+ */ +/* + * File name: dgsrfs.c + * History: Modified from lapack routine DGERFS + */ +#include +#include + +/*! \brief + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   DGSRFS improves the computed solution to a system of linear   
+ *   equations and provides error bounds and backward error estimates for 
+ *   the solution.   
+ *
+ *   If equilibration was performed, the system becomes:
+ *           (diag(R)*A_original*diag(C)) * X = diag(R)*B_original.
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ * trans   (input) trans_t
+ *          Specifies the form of the system of equations:
+ *          = NOTRANS: A * X = B  (No transpose)
+ *          = TRANS:   A'* X = B  (Transpose)
+ *          = CONJ:    A**H * X = B  (Conjugate transpose)
+ *   
+ *   A       (input) SuperMatrix*
+ *           The original matrix A in the system, or the scaled A if
+ *           equilibration was done. The type of A can be:
+ *           Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_GE.
+ *    
+ *   L       (input) SuperMatrix*
+ *	     The factor L from the factorization Pr*A*Pc=L*U. Use
+ *           compressed row subscripts storage for supernodes, 
+ *           i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
+ * 
+ *   U       (input) SuperMatrix*
+ *           The factor U from the factorization Pr*A*Pc=L*U as computed by
+ *           dgstrf(). Use column-wise storage scheme, 
+ *           i.e., U has types: Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
+ *
+ *   perm_c  (input) int*, dimension (A->ncol)
+ *	     Column permutation vector, which defines the 
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *           in position j in A*Pc.
+ *
+ *   perm_r  (input) int*, dimension (A->nrow)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *
+ *   equed   (input) Specifies the form of equilibration that was done.
+ *           = 'N': No equilibration.
+ *           = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
+ *           = 'C': Column equilibration, i.e., A was postmultiplied by
+ *                  diag(C).
+ *           = 'B': Both row and column equilibration, i.e., A was replaced 
+ *                  by diag(R)*A*diag(C).
+ *
+ *   R       (input) double*, dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If equed = 'R' or 'B', A is premultiplied by diag(R).
+ *           If equed = 'N' or 'C', R is not accessed.
+ * 
+ *   C       (input) double*, dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If equed = 'C' or 'B', A is postmultiplied by diag(C).
+ *           If equed = 'N' or 'R', C is not accessed.
+ *
+ *   B       (input) SuperMatrix*
+ *           B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
+ *           The right hand side matrix B.
+ *           if equed = 'R' or 'B', B is premultiplied by diag(R).
+ *
+ *   X       (input/output) SuperMatrix*
+ *           X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
+ *           On entry, the solution matrix X, as computed by dgstrs().
+ *           On exit, the improved solution matrix X.
+ *           if *equed = 'C' or 'B', X should be premultiplied by diag(C)
+ *               in order to obtain the solution to the original system.
+ *
+ *   FERR    (output) double*, dimension (B->ncol)   
+ *           The estimated forward error bound for each solution vector   
+ *           X(j) (the j-th column of the solution matrix X).   
+ *           If XTRUE is the true solution corresponding to X(j), FERR(j) 
+ *           is an estimated upper bound for the magnitude of the largest 
+ *           element in (X(j) - XTRUE) divided by the magnitude of the   
+ *           largest element in X(j).  The estimate is as reliable as   
+ *           the estimate for RCOND, and is almost always a slight   
+ *           overestimate of the true error.
+ *
+ *   BERR    (output) double*, dimension (B->ncol)   
+ *           The componentwise relative backward error of each solution   
+ *           vector X(j) (i.e., the smallest relative change in   
+ *           any element of A or B that makes X(j) an exact solution).
+ *
+ *   stat     (output) SuperLUStat_t*
+ *            Record the statistics on runtime and floating-point operation count.
+ *            See util.h for the definition of 'SuperLUStat_t'.
+ *
+ *   info    (output) int*   
+ *           = 0:  successful exit   
+ *            < 0:  if INFO = -i, the i-th argument had an illegal value   
+ *
+ *    Internal Parameters   
+ *    ===================   
+ *
+ *    ITMAX is the maximum number of steps of iterative refinement.   
+ *
+ * 
+ */ +void +dgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, + int *perm_c, int *perm_r, char *equed, double *R, double *C, + SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr, + SuperLUStat_t *stat, int *info) +{ + + +#define ITMAX 5 + + /* Table of constant values */ + int ione = 1; + double ndone = -1.; + double done = 1.; + + /* Local variables */ + NCformat *Astore; + double *Aval; + SuperMatrix Bjcol; + DNformat *Bstore, *Xstore, *Bjcol_store; + double *Bmat, *Xmat, *Bptr, *Xptr; + int kase; + double safe1, safe2; + int i, j, k, irow, nz, count, notran, rowequ, colequ; + int ldb, ldx, nrhs; + double s, xk, lstres, eps, safmin; + char transc[1]; + trans_t transt; + double *work; + double *rwork; + int *iwork; + + extern int dlacon_(int *, double *, double *, int *, double *, int *); +#ifdef _CRAY + extern int SCOPY(int *, double *, int *, double *, int *); + extern int SSAXPY(int *, double *, double *, int *, double *, int *); +#else + extern int dcopy_(int *, double *, int *, double *, int *); + extern int daxpy_(int *, double *, double *, int *, double *, int *); +#endif + + Astore = A->Store; + Aval = Astore->nzval; + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + + /* Test the input parameters */ + *info = 0; + notran = (trans == NOTRANS); + if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -2; + else if ( L->nrow != L->ncol || L->nrow < 0 || + L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) + *info = -3; + else if ( U->nrow != U->ncol || U->nrow < 0 || + U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU ) + *info = -4; + else if ( ldb < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) + *info = -10; + else if ( ldx < SUPERLU_MAX(0, A->nrow) || + X->Stype != SLU_DN || X->Dtype != SLU_D || X->Mtype != SLU_GE ) + *info = -11; + if (*info != 0) { + i = -(*info); + xerbla_("dgsrfs", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || nrhs == 0) { + for (j = 0; j < nrhs; ++j) { + ferr[j] = 0.; + berr[j] = 0.; + } + return; + } + + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + + /* Allocate working space */ + work = doubleMalloc(2*A->nrow); + rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) ); + iwork = intMalloc(2*A->nrow); + if ( !work || !rwork || !iwork ) + ABORT_SuperLU("Malloc fails for work/rwork/iwork."); + + if ( notran ) { + *(unsigned char *)transc = 'N'; + transt = TRANS; + } else { + *(unsigned char *)transc = 'T'; + transt = NOTRANS; + } + + /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ + nz = A->ncol + 1; + eps = dlamch_("Epsilon"); + safmin = dlamch_("Safe minimum"); + /* Set SAFE1 essentially to be the underflow threshold times the + number of additions in each row. */ + safe1 = nz * safmin; + safe2 = safe1 / eps; + + /* Compute the number of nonzeros in each row (or column) of A */ + for (i = 0; i < A->nrow; ++i) iwork[i] = 0; + if ( notran ) { + for (k = 0; k < A->ncol; ++k) + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + ++iwork[Astore->rowind[i]]; + } else { + for (k = 0; k < A->ncol; ++k) + iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; + } + + /* Copy one column of RHS B into Bjcol. */ + Bjcol.Stype = B->Stype; + Bjcol.Dtype = B->Dtype; + Bjcol.Mtype = B->Mtype; + Bjcol.nrow = B->nrow; + Bjcol.ncol = 1; + Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); + if ( !Bjcol.Store ) ABORT_SuperLU("SUPERLU_MALLOC fails for Bjcol.Store"); + Bjcol_store = Bjcol.Store; + Bjcol_store->lda = ldb; + Bjcol_store->nzval = work; /* address aliasing */ + + /* Do for each right hand side ... */ + for (j = 0; j < nrhs; ++j) { + count = 0; + lstres = 3.; + Bptr = &Bmat[j*ldb]; + Xptr = &Xmat[j*ldx]; + + while (1) { /* Loop until stopping criterion is satisfied. */ + + /* Compute residual R = B - op(A) * X, + where op(A) = A, A**T, or A**H, depending on TRANS. */ + +#ifdef _CRAY + SCOPY(&A->nrow, Bptr, &ione, work, &ione); +#else + dcopy_(&A->nrow, Bptr, &ione, work, &ione); +#endif + sp_dgemv(transc, ndone, A, Xptr, ione, done, work, ione); + + /* Compute componentwise relative backward error from formula + max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) + where abs(Z) is the componentwise absolute value of the matrix + or vector Z. If the i-th component of the denominator is less + than SAFE2, then SAFE1 is added to the i-th component of the + numerator before dividing. */ + + for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); + + /* Compute abs(op(A))*abs(X) + abs(B). */ + if (notran) { + for (k = 0; k < A->ncol; ++k) { + xk = fabs( Xptr[k] ); + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; + } + } else { + for (k = 0; k < A->ncol; ++k) { + s = 0.; + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { + irow = Astore->rowind[i]; + s += fabs(Aval[i]) * fabs(Xptr[irow]); + } + rwork[k] += s; + } + } + s = 0.; + for (i = 0; i < A->nrow; ++i) { + if (rwork[i] > safe2) { + s = SUPERLU_MAX( s, fabs(work[i]) / rwork[i] ); + } else if ( rwork[i] != 0.0 ) { + /* Adding SAFE1 to the numerator guards against + spuriously zero residuals (underflow). */ + s = SUPERLU_MAX( s, (safe1 + fabs(work[i])) / rwork[i] ); + } + /* If rwork[i] is exactly 0.0, then we know the true + residual also must be exactly 0.0. */ + } + berr[j] = s; + + /* Test stopping criterion. Continue iterating if + 1) The residual BERR(J) is larger than machine epsilon, and + 2) BERR(J) decreased by at least a factor of 2 during the + last iteration, and + 3) At most ITMAX iterations tried. */ + + if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { + /* Update solution and try again. */ + dgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); + +#ifdef _CRAY + SAXPY(&A->nrow, &done, work, &ione, + &Xmat[j*ldx], &ione); +#else + daxpy_(&A->nrow, &done, work, &ione, + &Xmat[j*ldx], &ione); +#endif + lstres = berr[j]; + ++count; + } else { + break; + } + + } /* end while */ + + stat->RefineSteps = count; + + /* Bound error from formula: + norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* + ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) + where + norm(Z) is the magnitude of the largest component of Z + inv(op(A)) is the inverse of op(A) + abs(Z) is the componentwise absolute value of the matrix or + vector Z + NZ is the maximum number of nonzeros in any row of A, plus 1 + EPS is machine epsilon + + The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) + is incremented by SAFE1 if the i-th component of + abs(op(A))*abs(X) + abs(B) is less than SAFE2. + + Use DLACON to estimate the infinity-norm of the matrix + inv(op(A)) * diag(W), + where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ + + for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); + + /* Compute abs(op(A))*abs(X) + abs(B). */ + if ( notran ) { + for (k = 0; k < A->ncol; ++k) { + xk = fabs( Xptr[k] ); + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; + } + } else { + for (k = 0; k < A->ncol; ++k) { + s = 0.; + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { + irow = Astore->rowind[i]; + xk = fabs( Xptr[irow] ); + s += fabs(Aval[i]) * xk; + } + rwork[k] += s; + } + } + + for (i = 0; i < A->nrow; ++i) + if (rwork[i] > safe2) + rwork[i] = fabs(work[i]) + (iwork[i]+1)*eps*rwork[i]; + else + rwork[i] = fabs(work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; + + kase = 0; + + do { + dlacon_(&A->nrow, &work[A->nrow], work, + &iwork[A->nrow], &ferr[j], &kase); + if (kase == 0) break; + + if (kase == 1) { + /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ + if ( notran && colequ ) + for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; + else if ( !notran && rowequ ) + for (i = 0; i < A->nrow; ++i) work[i] *= R[i]; + + dgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); + + for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; + } else { + /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ + for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; + + dgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); + + if ( notran && colequ ) + for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; + else if ( !notran && rowequ ) + for (i = 0; i < A->ncol; ++i) work[i] *= R[i]; + } + + } while ( kase != 0 ); + + + /* Normalize error. */ + lstres = 0.; + if ( notran && colequ ) { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, C[i] * fabs( Xptr[i]) ); + } else if ( !notran && rowequ ) { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, R[i] * fabs( Xptr[i]) ); + } else { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, fabs( Xptr[i]) ); + } + if ( lstres != 0. ) + ferr[j] /= lstres; + + } /* for each RHS j ... */ + + SUPERLU_FREE(work); + SUPERLU_FREE(rwork); + SUPERLU_FREE(iwork); + SUPERLU_FREE(Bjcol.Store); + + return; + +} /* dgsrfs */ diff --git a/src/maths/SuperLU/dgssv.c b/src/maths/SuperLU/dgssv.c new file mode 100644 index 000000000..d62202cd0 --- /dev/null +++ b/src/maths/SuperLU/dgssv.c @@ -0,0 +1,227 @@ + +/*! @file dgssv.c + * \brief Solves the system of linear equations A*X=B + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ * 
+ */ +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * DGSSV solves the system of linear equations A*X=B, using the
+ * LU factorization from DGSTRF. It performs the following steps:
+ *
+ *   1. If A is stored column-wise (A->Stype = SLU_NC):
+ *
+ *      1.1. Permute the columns of A, forming A*Pc, where Pc
+ *           is a permutation matrix. For more details of this step, 
+ *           see sp_preorder.c.
+ *
+ *      1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined
+ *           by Gaussian elimination with partial pivoting.
+ *           L is unit lower triangular with offdiagonal entries
+ *           bounded by 1 in magnitude, and U is upper triangular.
+ *
+ *      1.3. Solve the system of equations A*X=B using the factored
+ *           form of A.
+ *
+ *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the
+ *      above algorithm to the transpose of A:
+ *
+ *      2.1. Permute columns of transpose(A) (rows of A),
+ *           forming transpose(A)*Pc, where Pc is a permutation matrix. 
+ *           For more details of this step, see sp_preorder.c.
+ *
+ *      2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr
+ *           determined by Gaussian elimination with partial pivoting.
+ *           L is unit lower triangular with offdiagonal entries
+ *           bounded by 1 in magnitude, and U is upper triangular.
+ *
+ *      2.3. Solve the system of equations A*X=B using the factored
+ *           form of A.
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ * 
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed and how the
+ *         system will be solved.
+ *
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of linear equations is A->nrow. Currently, the type of A can be:
+ *         Stype = SLU_NC or SLU_NR; Dtype = SLU_D; Mtype = SLU_GE.
+ *         In the future, more general A may be handled.
+ *
+ * perm_c  (input/output) int*
+ *         If A->Stype = SLU_NC, column permutation vector of size A->ncol
+ *         which defines the permutation matrix Pc; perm_c[i] = j means 
+ *         column i of A is in position j in A*Pc.
+ *         If A->Stype = SLU_NR, column permutation vector of size A->nrow
+ *         which describes permutation of columns of transpose(A) 
+ *         (rows of A) as described above.
+ * 
+ *         If options->ColPerm = MY_PERMC or options->Fact = SamePattern or
+ *            options->Fact = SamePattern_SameRowPerm, it is an input argument.
+ *            On exit, perm_c may be overwritten by the product of the input
+ *            perm_c and a permutation that postorders the elimination tree
+ *            of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
+ *            is already in postorder.
+ *         Otherwise, it is an output argument.
+ * 
+ * perm_r  (input/output) int*
+ *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
+ *         which defines the permutation matrix Pr, and is determined 
+ *         by partial pivoting.  perm_r[i] = j means row i of A is in 
+ *         position j in Pr*A.
+ *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
+ *         determines permutation of rows of transpose(A)
+ *         (columns of A) as described above.
+ *
+ *         If options->RowPerm = MY_PERMR or
+ *            options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *            input argument.
+ *         otherwise it is an output argument.
+ *
+ * L       (output) SuperMatrix*
+ *         The factor L from the factorization 
+ *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
+ *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
+ *         Uses compressed row subscripts storage for supernodes, i.e.,
+ *         L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
+ *         
+ * U       (output) SuperMatrix*
+ *	   The factor U from the factorization 
+ *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
+ *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
+ *         Uses column-wise storage scheme, i.e., U has types:
+ *         Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
+ *
+ * B       (input/output) SuperMatrix*
+ *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
+ *         On entry, the right hand side matrix.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *	   = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ * 
+ */ + +void +dgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, + SuperLUStat_t *stat, int *info ) +{ + + DNformat *Bstore; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int lwork = 0, *etree, i; + + /* Set default values for some parameters */ + int panel_size; /* panel size */ + int relax; /* no of columns in a relaxed snodes */ + int permc_spec; + trans_t trans = NOTRANS; + double *utime; + double t; /* Temporary time */ + + /* Test the input parameters ... */ + *info = 0; + Bstore = B->Store; + if ( options->Fact != DOFACT ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -2; + else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) + *info = -7; + if ( *info != 0 ) { + i = -(*info); + xerbla_("dgssv", &i); + return; + } + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + trans = TRANS; + } else { + if ( A->Stype == SLU_NC ) AA = A; + } + + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t; + + etree = intMalloc(A->ncol); + + t = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t; + + panel_size = sp_ienv(1); + relax = sp_ienv(2); + + /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", + relax, panel_size, sp_ienv(3), sp_ienv(4));*/ + t = SuperLU_timer_(); + /* Compute the LU factorization of A. */ + dgstrf(options, &AC, relax, panel_size, etree, + NULL, lwork, perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t; + + t = SuperLU_timer_(); + if ( *info == 0 ) { + /* Solve the system A*X=B, overwriting B with X. */ + dgstrs (trans, L, U, perm_c, perm_r, B, stat, info); + } + utime[SOLVE] = SuperLU_timer_() - t; + + SUPERLU_FREE (etree); + Destroy_CompCol_Permuted(&AC); + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/dgssvx.c b/src/maths/SuperLU/dgssvx.c new file mode 100644 index 000000000..5f06db4fe --- /dev/null +++ b/src/maths/SuperLU/dgssvx.c @@ -0,0 +1,622 @@ + +/*! @file dgssvx.c + * \brief Solves the system of linear equations A*X=B or A'*X=B + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ * 
+ */ +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * DGSSVX solves the system of linear equations A*X=B or A'*X=B, using
+ * the LU factorization from dgstrf(). Error bounds on the solution and
+ * a condition estimate are also provided. It performs the following steps:
+ *
+ *   1. If A is stored column-wise (A->Stype = SLU_NC):
+ *  
+ *      1.1. If options->Equil = YES, scaling factors are computed to
+ *           equilibrate the system:
+ *           options->Trans = NOTRANS:
+ *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
+ *           options->Trans = TRANS:
+ *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+ *           options->Trans = CONJ:
+ *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+ *           Whether or not the system will be equilibrated depends on the
+ *           scaling of the matrix A, but if equilibration is used, A is
+ *           overwritten by diag(R)*A*diag(C) and B by diag(R)*B
+ *           (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans
+ *           = TRANS or CONJ).
+ *
+ *      1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
+ *           matrix that usually preserves sparsity.
+ *           For more details of this step, see sp_preorder.c.
+ *
+ *      1.3. If options->Fact != FACTORED, the LU decomposition is used to
+ *           factor the matrix A (after equilibration if options->Equil = YES)
+ *           as Pr*A*Pc = L*U, with Pr determined by partial pivoting.
+ *
+ *      1.4. Compute the reciprocal pivot growth factor.
+ *
+ *      1.5. If some U(i,i) = 0, so that U is exactly singular, then the
+ *           routine returns with info = i. Otherwise, the factored form of 
+ *           A is used to estimate the condition number of the matrix A. If
+ *           the reciprocal of the condition number is less than machine
+ *           precision, info = A->ncol+1 is returned as a warning, but the
+ *           routine still goes on to solve for X and computes error bounds
+ *           as described below.
+ *
+ *      1.6. The system of equations is solved for X using the factored form
+ *           of A.
+ *
+ *      1.7. If options->IterRefine != NOREFINE, iterative refinement is
+ *           applied to improve the computed solution matrix and calculate
+ *           error bounds and backward error estimates for it.
+ *
+ *      1.8. If equilibration was used, the matrix X is premultiplied by
+ *           diag(C) (if options->Trans = NOTRANS) or diag(R)
+ *           (if options->Trans = TRANS or CONJ) so that it solves the
+ *           original system before equilibration.
+ *
+ *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
+ *      to the transpose of A:
+ *
+ *      2.1. If options->Equil = YES, scaling factors are computed to
+ *           equilibrate the system:
+ *           options->Trans = NOTRANS:
+ *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
+ *           options->Trans = TRANS:
+ *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+ *           options->Trans = CONJ:
+ *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+ *           Whether or not the system will be equilibrated depends on the
+ *           scaling of the matrix A, but if equilibration is used, A' is
+ *           overwritten by diag(R)*A'*diag(C) and B by diag(R)*B 
+ *           (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
+ *
+ *      2.2. Permute columns of transpose(A) (rows of A), 
+ *           forming transpose(A)*Pc, where Pc is a permutation matrix that 
+ *           usually preserves sparsity.
+ *           For more details of this step, see sp_preorder.c.
+ *
+ *      2.3. If options->Fact != FACTORED, the LU decomposition is used to
+ *           factor the transpose(A) (after equilibration if 
+ *           options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the
+ *           permutation Pr determined by partial pivoting.
+ *
+ *      2.4. Compute the reciprocal pivot growth factor.
+ *
+ *      2.5. If some U(i,i) = 0, so that U is exactly singular, then the
+ *           routine returns with info = i. Otherwise, the factored form 
+ *           of transpose(A) is used to estimate the condition number of the
+ *           matrix A. If the reciprocal of the condition number
+ *           is less than machine precision, info = A->nrow+1 is returned as
+ *           a warning, but the routine still goes on to solve for X and
+ *           computes error bounds as described below.
+ *
+ *      2.6. The system of equations is solved for X using the factored form
+ *           of transpose(A).
+ *
+ *      2.7. If options->IterRefine != NOREFINE, iterative refinement is
+ *           applied to improve the computed solution matrix and calculate
+ *           error bounds and backward error estimates for it.
+ *
+ *      2.8. If equilibration was used, the matrix X is premultiplied by
+ *           diag(C) (if options->Trans = NOTRANS) or diag(R) 
+ *           (if options->Trans = TRANS or CONJ) so that it solves the
+ *           original system before equilibration.
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed and how the
+ *         system will be solved.
+ *
+ * A       (input/output) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of the linear equations is A->nrow. Currently, the type of A can be:
+ *         Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE.
+ *         In the future, more general A may be handled.
+ *
+ *         On entry, If options->Fact = FACTORED and equed is not 'N', 
+ *         then A must have been equilibrated by the scaling factors in
+ *         R and/or C.  
+ *         On exit, A is not modified if options->Equil = NO, or if 
+ *         options->Equil = YES but equed = 'N' on exit.
+ *         Otherwise, if options->Equil = YES and equed is not 'N',
+ *         A is scaled as follows:
+ *         If A->Stype = SLU_NC:
+ *           equed = 'R':  A := diag(R) * A
+ *           equed = 'C':  A := A * diag(C)
+ *           equed = 'B':  A := diag(R) * A * diag(C).
+ *         If A->Stype = SLU_NR:
+ *           equed = 'R':  transpose(A) := diag(R) * transpose(A)
+ *           equed = 'C':  transpose(A) := transpose(A) * diag(C)
+ *           equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
+ *
+ * perm_c  (input/output) int*
+ *	   If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
+ *         which defines the permutation matrix Pc; perm_c[i] = j means
+ *         column i of A is in position j in A*Pc.
+ *         On exit, perm_c may be overwritten by the product of the input
+ *         perm_c and a permutation that postorders the elimination tree
+ *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
+ *         is already in postorder.
+ *
+ *         If A->Stype = SLU_NR, column permutation vector of size A->nrow,
+ *         which describes permutation of columns of transpose(A) 
+ *         (rows of A) as described above.
+ * 
+ * perm_r  (input/output) int*
+ *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
+ *         which defines the permutation matrix Pr, and is determined
+ *         by partial pivoting.  perm_r[i] = j means row i of A is in 
+ *         position j in Pr*A.
+ *
+ *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
+ *         determines permutation of rows of transpose(A)
+ *         (columns of A) as described above.
+ *
+ *         If options->Fact = SamePattern_SameRowPerm, the pivoting routine
+ *         will try to use the input perm_r, unless a certain threshold
+ *         criterion is violated. In that case, perm_r is overwritten by a
+ *         new permutation determined by partial pivoting or diagonal
+ *         threshold pivoting.
+ *         Otherwise, perm_r is output argument.
+ * 
+ * etree   (input/output) int*,  dimension (A->ncol)
+ *         Elimination tree of Pc'*A'*A*Pc.
+ *         If options->Fact != FACTORED and options->Fact != DOFACT,
+ *         etree is an input argument, otherwise it is an output argument.
+ *         Note: etree is a vector of parent pointers for a forest whose
+ *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ * equed   (input/output) char*
+ *         Specifies the form of equilibration that was done.
+ *         = 'N': No equilibration.
+ *         = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
+ *         = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
+ *         = 'B': Both row and column equilibration, i.e., A was replaced 
+ *                by diag(R)*A*diag(C).
+ *         If options->Fact = FACTORED, equed is an input argument,
+ *         otherwise it is an output argument.
+ *
+ * R       (input/output) double*, dimension (A->nrow)
+ *         The row scale factors for A or transpose(A).
+ *         If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
+ *             (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
+ *         If equed = 'N' or 'C', R is not accessed.
+ *         If options->Fact = FACTORED, R is an input argument,
+ *             otherwise, R is output.
+ *         If options->zFact = FACTORED and equed = 'R' or 'B', each element
+ *             of R must be positive.
+ * 
+ * C       (input/output) double*, dimension (A->ncol)
+ *         The column scale factors for A or transpose(A).
+ *         If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
+ *             (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
+ *         If equed = 'N' or 'R', C is not accessed.
+ *         If options->Fact = FACTORED, C is an input argument,
+ *             otherwise, C is output.
+ *         If options->Fact = FACTORED and equed = 'C' or 'B', each element
+ *             of C must be positive.
+ *         
+ * L       (output) SuperMatrix*
+ *	   The factor L from the factorization
+ *             Pr*A*Pc=L*U              (if A->Stype SLU_= NC) or
+ *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
+ *         Uses compressed row subscripts storage for supernodes, i.e.,
+ *         L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
+ *
+ * U       (output) SuperMatrix*
+ *	   The factor U from the factorization
+ *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
+ *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
+ *         Uses column-wise storage scheme, i.e., U has types:
+ *         Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
+ *
+ * work    (workspace/output) void*, size (lwork) (in bytes)
+ *         User supplied workspace, should be large enough
+ *         to hold data structures for factors L and U.
+ *         On exit, if fact is not 'F', L and U point to this array.
+ *
+ * lwork   (input) int
+ *         Specifies the size of work array in bytes.
+ *         = 0:  allocate space internally by system malloc;
+ *         > 0:  use user-supplied work array of length lwork in bytes,
+ *               returns error if space runs out.
+ *         = -1: the routine guesses the amount of space needed without
+ *               performing the factorization, and returns it in
+ *               mem_usage->total_needed; no other side effects.
+ *
+ *         See argument 'mem_usage' for memory usage statistics.
+ *
+ * B       (input/output) SuperMatrix*
+ *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
+ *         On entry, the right hand side matrix.
+ *         If B->ncol = 0, only LU decomposition is performed, the triangular
+ *                         solve is skipped.
+ *         On exit,
+ *            if equed = 'N', B is not modified; otherwise
+ *            if A->Stype = SLU_NC:
+ *               if options->Trans = NOTRANS and equed = 'R' or 'B',
+ *                  B is overwritten by diag(R)*B;
+ *               if options->Trans = TRANS or CONJ and equed = 'C' of 'B',
+ *                  B is overwritten by diag(C)*B;
+ *            if A->Stype = SLU_NR:
+ *               if options->Trans = NOTRANS and equed = 'C' or 'B',
+ *                  B is overwritten by diag(C)*B;
+ *               if options->Trans = TRANS or CONJ and equed = 'R' of 'B',
+ *                  B is overwritten by diag(R)*B.
+ *
+ * X       (output) SuperMatrix*
+ *         X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. 
+ *         If info = 0 or info = A->ncol+1, X contains the solution matrix
+ *         to the original system of equations. Note that A and B are modified
+ *         on exit if equed is not 'N', and the solution to the equilibrated
+ *         system is inv(diag(C))*X if options->Trans = NOTRANS and
+ *         equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C'
+ *         and equed = 'R' or 'B'.
+ *
+ * recip_pivot_growth (output) double*
+ *         The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
+ *         The infinity norm is used. If recip_pivot_growth is much less
+ *         than 1, the stability of the LU factorization could be poor.
+ *
+ * rcond   (output) double*
+ *         The estimate of the reciprocal condition number of the matrix A
+ *         after equilibration (if done). If rcond is less than the machine
+ *         precision (in particular, if rcond = 0), the matrix is singular
+ *         to working precision. This condition is indicated by a return
+ *         code of info > 0.
+ *
+ * FERR    (output) double*, dimension (B->ncol)   
+ *         The estimated forward error bound for each solution vector   
+ *         X(j) (the j-th column of the solution matrix X).   
+ *         If XTRUE is the true solution corresponding to X(j), FERR(j) 
+ *         is an estimated upper bound for the magnitude of the largest 
+ *         element in (X(j) - XTRUE) divided by the magnitude of the   
+ *         largest element in X(j).  The estimate is as reliable as   
+ *         the estimate for RCOND, and is almost always a slight   
+ *         overestimate of the true error.
+ *         If options->IterRefine = NOREFINE, ferr = 1.0.
+ *
+ * BERR    (output) double*, dimension (B->ncol)
+ *         The componentwise relative backward error of each solution   
+ *         vector X(j) (i.e., the smallest relative change in   
+ *         any element of A or B that makes X(j) an exact solution).
+ *         If options->IterRefine = NOREFINE, berr = 1.0.
+ *
+ * mem_usage (output) mem_usage_t*
+ *         Record the memory usage statistics, consisting of following fields:
+ *         - for_lu (float)
+ *           The amount of space used in bytes for L\U data structures.
+ *         - total_needed (float)
+ *           The amount of space needed in bytes to perform factorization.
+ *         - expansions (int)
+ *           The number of memory expansions during the LU factorization.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See slu_util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit   
+ *         < 0: if info = -i, the i-th argument had an illegal value   
+ *         > 0: if info = i, and i is   
+ *              <= A->ncol: U(i,i) is exactly zero. The factorization has   
+ *                    been completed, but the factor U is exactly   
+ *                    singular, so the solution and error bounds   
+ *                    could not be computed.   
+ *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
+ *                    precision, meaning that the matrix is singular to
+ *                    working precision. Nevertheless, the solution and
+ *                    error bounds are computed because there are a number
+ *                    of situations where the computed solution can be more
+ *                    accurate than the value of RCOND would suggest.   
+ *              > A->ncol+1: number of bytes allocated when memory allocation
+ *                    failure occurred, plus A->ncol.
+ * 
+ */ + +void +dgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + int *etree, char *equed, double *R, double *C, + SuperMatrix *L, SuperMatrix *U, void *work, int lwork, + SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, + double *rcond, double *ferr, double *berr, + mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info ) +{ + + + DNformat *Bstore, *Xstore; + double *Bmat, *Xmat; + int ldb, ldx, nrhs; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int colequ, equil, nofact, notran, rowequ, permc_spec; + trans_t trant; + char norm[1]; + int i, j, info1; + double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; + int relax, panel_size; + double diag_pivot_thresh; + double t0; /* temporary time */ + double *utime; + + /* External functions */ + extern double dlangs(char *, SuperMatrix *); + + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + + *info = 0; + nofact = (options->Fact != FACTORED); + equil = (options->Equil == YES_SuperLU); + notran = (options->Trans == NOTRANS); + if ( nofact ) { + *(unsigned char *)equed = 'N'; + rowequ = FALSE; + colequ = FALSE; + } else { + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + smlnum = dlamch_("Safe minimum"); + bignum = 1. / smlnum; + } + +#if 0 +printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n", + options->Fact, options->Trans, *equed); +#endif + + /* Test the input parameters */ + if (options->Fact != DOFACT && options->Fact != SamePattern && + options->Fact != SamePattern_SameRowPerm && + options->Fact != FACTORED && + options->Trans != NOTRANS && options->Trans != TRANS && + options->Trans != CONJ && + options->Equil != NO_SuperLU && options->Equil != YES_SuperLU) + *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -2; + else if (options->Fact == FACTORED && + !(rowequ || colequ || lsame_(equed, "N"))) + *info = -6; + else { + if (rowequ) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, R[j]); + rcmax = SUPERLU_MAX(rcmax, R[j]); + } + if (rcmin <= 0.) *info = -7; + else if ( A->nrow > 0) + rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else rowcnd = 1.; + } + if (colequ && *info == 0) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, C[j]); + rcmax = SUPERLU_MAX(rcmax, C[j]); + } + if (rcmin <= 0.) *info = -8; + else if (A->nrow > 0) + colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else colcnd = 1.; + } + if (*info == 0) { + if ( lwork < -1 ) *info = -12; + else if ( B->ncol < 0 ) *info = -13; + else if ( B->ncol > 0 ) { /* no checking if B->ncol=0 */ + if ( Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_D || + B->Mtype != SLU_GE ) + *info = -13; + } + if ( X->ncol < 0 ) *info = -14; + else if ( X->ncol > 0 ) { /* no checking if X->ncol=0 */ + if ( Xstore->lda < SUPERLU_MAX(0, A->nrow) || + (B->ncol != 0 && B->ncol != X->ncol) || + X->Stype != SLU_DN || + X->Dtype != SLU_D || X->Mtype != SLU_GE ) + *info = -14; + } + } + } + if (*info != 0) { + i = -(*info); + xerbla_("dgssvx", &i); + return; + } + + /* Initialization for factor parameters */ + panel_size = sp_ienv(1); + relax = sp_ienv(2); + diag_pivot_thresh = options->DiagPivotThresh; + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + if ( notran ) { /* Reverse the transpose argument. */ + trant = TRANS; + notran = 0; + } else { + trant = NOTRANS; + notran = 1; + } + } else { /* A->Stype == SLU_NC */ + trant = options->Trans; + AA = A; + } + + if ( nofact && equil ) { + t0 = SuperLU_timer_(); + /* Compute row and column scalings to equilibrate the matrix A. */ + dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); + + if ( info1 == 0 ) { + /* Equilibrate matrix A. */ + dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + } + utime[EQUIL] = SuperLU_timer_() - t0; + } + + + if ( nofact ) { + + t0 = SuperLU_timer_(); + /* + * Gnet column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t0; + + t0 = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t0; + +/* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", + relax, panel_size, sp_ienv(3), sp_ienv(4)); + fflush(stdout); */ + + /* Compute the LU factorization of A*Pc. */ + t0 = SuperLU_timer_(); + dgstrf(options, &AC, relax, panel_size, etree, + work, lwork, perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t0; + + if ( lwork == -1 ) { + mem_usage->total_needed = *info - A->ncol; + return; + } + } + + if ( options->PivotGrowth ) { + if ( *info > 0 ) { + if ( *info <= A->ncol ) { + /* Compute the reciprocal pivot growth factor of the leading + rank-deficient *info columns of A. */ + *recip_pivot_growth = dPivotGrowth(*info, AA, perm_c, L, U); + } + return; + } + + /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ + *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U); + } + + if ( options->ConditionNumber ) { + /* Estimate the reciprocal of the condition number of A. */ + t0 = SuperLU_timer_(); + if ( notran ) { + *(unsigned char *)norm = '1'; + } else { + *(unsigned char *)norm = 'I'; + } + anorm = dlangs(norm, AA); + dgscon(norm, L, U, anorm, rcond, stat, info); + utime[RCOND] = SuperLU_timer_() - t0; + } + + if ( nrhs > 0 ) { + /* Scale the right hand side if equilibration was performed. */ + if ( notran ) { + if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + Bmat[i + j*ldb] *= R[i]; + } + } else if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + Bmat[i + j*ldb] *= C[i]; + } + + /* Compute the solution matrix X. */ + for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ + for (i = 0; i < B->nrow; i++) + Xmat[i + j*ldx] = Bmat[i + j*ldb]; + + t0 = SuperLU_timer_(); + dgstrs (trant, L, U, perm_c, perm_r, X, stat, info); + utime[SOLVE] = SuperLU_timer_() - t0; + + /* Use iterative refinement to improve the computed solution and compute + error bounds and backward error estimates for it. */ + t0 = SuperLU_timer_(); + if ( options->IterRefine != NOREFINE ) { + dgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B, + X, ferr, berr, stat, info); + } else { + for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0; + } + utime[REFINE] = SuperLU_timer_() - t0; + + /* Transform the solution matrix X to a solution of the original system. */ + if ( notran ) { + if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + Xmat[i + j*ldx] *= C[i]; + } + } else if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + Xmat[i + j*ldx] *= R[i]; + } + } /* end if nrhs > 0 */ + + if ( options->ConditionNumber ) { + /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ + if ( *rcond < dlamch_("E") ) *info = A->ncol + 1; + } + + if ( nofact ) { + dQuerySpace(L, U, mem_usage); + Destroy_CompCol_Permuted(&AC); + } + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/dgstrf.c b/src/maths/SuperLU/dgstrf.c new file mode 100644 index 000000000..d6e65d648 --- /dev/null +++ b/src/maths/SuperLU/dgstrf.c @@ -0,0 +1,436 @@ + +/*! @file dgstrf.c + * \brief Computes an LU factorization of a general sparse matrix + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ * 
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * DGSTRF computes an LU factorization of a general sparse m-by-n
+ * matrix A using partial pivoting with row interchanges.
+ * The factorization has the form
+ *     Pr * A = L * U
+ * where Pr is a row permutation matrix, L is lower triangular with unit
+ * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper 
+ * triangular (upper trapezoidal if A->nrow < A->ncol).
+ *
+ * See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * A        (input) SuperMatrix*
+ *	    Original matrix A, permuted by columns, of dimension
+ *          (A->nrow, A->ncol). The type of A can be:
+ *          Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE.
+ *
+ * relax    (input) int
+ *          To control degree of relaxing supernodes. If the number
+ *          of nodes (columns) in a subtree of the elimination tree is less
+ *          than relax, this subtree is considered as one supernode,
+ *          regardless of the row structures of those columns.
+ *
+ * panel_size (input) int
+ *          A panel consists of at most panel_size consecutive columns.
+ *
+ * etree    (input) int*, dimension (A->ncol)
+ *          Elimination tree of A'*A.
+ *          Note: etree is a vector of parent pointers for a forest whose
+ *          vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *          On input, the columns of A should be permuted so that the
+ *          etree is in a certain postorder.
+ *
+ * work     (input/output) void*, size (lwork) (in bytes)
+ *          User-supplied work space and space for the output data structures.
+ *          Not referenced if lwork = 0;
+ *
+ * lwork   (input) int
+ *         Specifies the size of work array in bytes.
+ *         = 0:  allocate space internally by system malloc;
+ *         > 0:  use user-supplied work array of length lwork in bytes,
+ *               returns error if space runs out.
+ *         = -1: the routine guesses the amount of space needed without
+ *               performing the factorization, and returns it in
+ *               *info; no other side effects.
+ *
+ * perm_c   (input) int*, dimension (A->ncol)
+ *	    Column permutation vector, which defines the 
+ *          permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *          in position j in A*Pc.
+ *          When searching for diagonal, perm_c[*] is applied to the
+ *          row subscripts of A, so that diagonal threshold pivoting
+ *          can find the diagonal of A, rather than that of A*Pc.
+ *
+ * perm_r   (input/output) int*, dimension (A->nrow)
+ *          Row permutation vector which defines the permutation matrix Pr,
+ *          perm_r[i] = j means row i of A is in position j in Pr*A.
+ *          If options->Fact = SamePattern_SameRowPerm, the pivoting routine
+ *             will try to use the input perm_r, unless a certain threshold
+ *             criterion is violated. In that case, perm_r is overwritten by
+ *             a new permutation determined by partial pivoting or diagonal
+ *             threshold pivoting.
+ *          Otherwise, perm_r is output argument;
+ *
+ * L        (output) SuperMatrix*
+ *          The factor L from the factorization Pr*A=L*U; use compressed row 
+ *          subscripts storage for supernodes, i.e., L has type: 
+ *          Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
+ *
+ * U        (output) SuperMatrix*
+ *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
+ *          storage scheme, i.e., U has types: Stype = SLU_NC, 
+ *          Dtype = SLU_D, Mtype = SLU_TRU.
+ *
+ * stat     (output) SuperLUStat_t*
+ *          Record the statistics on runtime and floating-point operation count.
+ *          See slu_util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info     (output) int*
+ *          = 0: successful exit
+ *          < 0: if info = -i, the i-th argument had an illegal value
+ *          > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                and division by zero will occur if it is used to solve a
+ *                system of equations.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol. If lwork = -1, it is
+ *                the estimated amount of space needed, plus A->ncol.
+ *
+ * ======================================================================
+ *
+ * Local Working Arrays: 
+ * ======================
+ *   m = number of rows in the matrix
+ *   n = number of columns in the matrix
+ *
+ *   xprune[0:n-1]: xprune[*] points to locations in subscript 
+ *	vector lsub[*]. For column i, xprune[i] denotes the point where 
+ *	structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need 
+ *	to be traversed for symbolic factorization.
+ *
+ *   marker[0:3*m-1]: marker[i] = j means that node i has been 
+ *	reached when working on column j.
+ *	Storage: relative to original row subscripts
+ *	NOTE: There are 3 of them: marker/marker1 are used for panel dfs, 
+ *	      see dpanel_dfs.c; marker2 is used for inner-factorization,
+ *            see dcolumn_dfs.c.
+ *
+ *   parent[0:m-1]: parent vector used during dfs
+ *      Storage: relative to new row subscripts
+ *
+ *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs) 
+ *	unexplored neighbor of i in lsub[*]
+ *
+ *   segrep[0:nseg-1]: contains the list of supernodal representatives
+ *	in topological order of the dfs. A supernode representative is the 
+ *	last column of a supernode.
+ *      The maximum size of segrep[] is n.
+ *
+ *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a 
+ *	supernodal representative r, repfnz[r] is the location of the first 
+ *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
+ *	indicates the supernode r has been explored.
+ *	NOTE: There are W of them, each used for one column of a panel. 
+ *
+ *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below 
+ *      the panel diagonal. These are filled in during dpanel_dfs(), and are
+ *      used later in the inner LU factorization within the panel.
+ *	panel_lsub[]/dense[] pair forms the SPA data structure.
+ *	NOTE: There are W of them.
+ *
+ *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
+ *	    	   NOTE: there are W of them.
+ *
+ *   tempv[0:*]: real temporary used for dense numeric kernels;
+ *	The size of this array is defined by NUM_TEMPV() in slu_ddefs.h.
+ * 
+ */ + +void +dgstrf (superlu_options_t *options, SuperMatrix *A, + int relax, int panel_size, int *etree, void *work, int lwork, + int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, + SuperLUStat_t *stat, int *info) +{ + /* Local working arrays */ + NCPformat *Astore; + int *iperm_r = NULL; /* inverse of perm_r; used when + options->Fact == SamePattern_SameRowPerm */ + int *iperm_c; /* inverse of perm_c */ + int *iwork; + double *dwork; + int *segrep, *repfnz, *parent, *xplore; + int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ + int *xprune; + int *marker; + double *dense, *tempv; + int *relax_end; + double *a; + int *asub; + int *xa_begin, *xa_end; + int *xsup, *supno; + int *xlsub, *xlusup, *xusub; + int nzlumax; + double fill_ratio = sp_ienv(6); /* estimated fill ratio */ + static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ + + /* Local scalars */ + fact_t fact = options->Fact; + double diag_pivot_thresh = options->DiagPivotThresh; + int pivrow; /* pivotal row number in the original matrix A */ + int nseg1; /* no of segments in U-column above panel row jcol */ + int nseg; /* no of segments in each U-column */ + register int jcol; + register int kcol; /* end column of a relaxed snode */ + register int icol; + register int i, k, jj, new_next, iinfo; + int m, n, min_mn, jsupno, fsupc, nextlu, nextu; + int w_def; /* upper bound on panel width */ + int usepr, iperm_r_allocated = 0; + int nnzL, nnzU; + int *panel_histo = stat->panel_histo; + flops_t *ops = stat->ops; + + iinfo = 0; + m = A->nrow; + n = A->ncol; + min_mn = SUPERLU_MIN(m, n); + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + + /* Allocate storage common to the factor routines */ + *info = dLUMemInit(fact, work, lwork, m, n, Astore->nnz, + panel_size, fill_ratio, L, U, &Glu, &iwork, &dwork); + if ( *info ) return; + + xsup = Glu.xsup; + supno = Glu.supno; + xlsub = Glu.xlsub; + xlusup = Glu.xlusup; + xusub = Glu.xusub; + + SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, + &repfnz, &panel_lsub, &xprune, &marker); + dSetRWork(m, panel_size, dwork, &dense, &tempv); + + usepr = (fact == SamePattern_SameRowPerm); + if ( usepr ) { + /* Compute the inverse of perm_r */ + iperm_r = (int *) intMalloc(m); + for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; + iperm_r_allocated = 1; + } + iperm_c = (int *) intMalloc(n); + for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; + + /* Identify relaxed snodes */ + relax_end = (int *) intMalloc(n); + if ( options->SymmetricMode == YES_SuperLU ) { + heap_relax_snode(n, etree, relax, marker, relax_end); + } else { + relax_snode(n, etree, relax, marker, relax_end); + } + + ifill (perm_r, m, EMPTY); + ifill (marker, m * NO_MARKER, EMPTY); + supno[0] = -1; + xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; + w_def = panel_size; + + /* + * Work on one "panel" at a time. A panel is one of the following: + * (a) a relaxed supernode at the bottom of the etree, or + * (b) panel_size contiguous columns, defined by the user + */ + for (jcol = 0; jcol < min_mn; ) { + + if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ + kcol = relax_end[jcol]; /* end of the relaxed snode */ + panel_histo[kcol-jcol+1]++; + + /* -------------------------------------- + * Factorize the relaxed supernode(jcol:kcol) + * -------------------------------------- */ + /* Determine the union of the row structure of the snode */ + if ( (*info = dsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, + xprune, marker, &Glu)) != 0 ) + return; + + nextu = xusub[jcol]; + nextlu = xlusup[jcol]; + jsupno = supno[jcol]; + fsupc = xsup[jsupno]; + new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); + nzlumax = Glu.nzlumax; + while ( new_next > nzlumax ) { + if ( (*info = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) ) + return; + } + + for (icol = jcol; icol<= kcol; icol++) { + xusub[icol+1] = nextu; + + /* Scatter into SPA dense[*] */ + for (k = xa_begin[icol]; k < xa_end[icol]; k++) + dense[asub[k]] = a[k]; + + /* Numeric update within the snode */ + dsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); + + if ( (*info = dpivotL(icol, diag_pivot_thresh, &usepr, perm_r, + iperm_r, iperm_c, &pivrow, &Glu, stat)) ) + if ( iinfo == 0 ) iinfo = *info; + +#ifdef DEBUG + dprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); +#endif + + } + + jcol = icol; + + } else { /* Work on one panel of panel_size columns */ + + /* Adjust panel_size so that a panel won't overlap with the next + * relaxed snode. + */ + panel_size = w_def; + for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) + if ( relax_end[k] != EMPTY ) { + panel_size = k - jcol; + break; + } + if ( k == min_mn ) panel_size = min_mn - jcol; + panel_histo[panel_size]++; + + /* symbolic factor on a panel of columns */ + dpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, + dense, panel_lsub, segrep, repfnz, xprune, + marker, parent, xplore, &Glu); + + /* numeric sup-panel updates in topological order */ + dpanel_bmod(m, panel_size, jcol, nseg1, dense, + tempv, segrep, repfnz, &Glu, stat); + + /* Sparse LU within the panel, and below panel diagonal */ + for ( jj = jcol; jj < jcol + panel_size; jj++) { + k = (jj - jcol) * m; /* column index for w-wide arrays */ + + nseg = nseg1; /* Begin after all the panel segments */ + + if ((*info = dcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], + segrep, &repfnz[k], xprune, marker, + parent, xplore, &Glu)) != 0) return; + + /* Numeric updates */ + if ((*info = dcolumn_bmod(jj, (nseg - nseg1), &dense[k], + tempv, &segrep[nseg1], &repfnz[k], + jcol, &Glu, stat)) != 0) return; + + /* Copy the U-segments to ucol[*] */ + if ((*info = dcopy_to_ucol(jj, nseg, segrep, &repfnz[k], + perm_r, &dense[k], &Glu)) != 0) + return; + + if ( (*info = dpivotL(jj, diag_pivot_thresh, &usepr, perm_r, + iperm_r, iperm_c, &pivrow, &Glu, stat)) ) + if ( iinfo == 0 ) iinfo = *info; + + /* Prune columns (0:jj-1) using column jj */ + dpruneL(jj, perm_r, pivrow, nseg, segrep, + &repfnz[k], xprune, &Glu); + + /* Reset repfnz[] for this column */ + resetrep_col (nseg, segrep, &repfnz[k]); + +#ifdef DEBUG + dprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); +#endif + + } + + jcol += panel_size; /* Move to the next panel */ + + } /* else */ + + } /* for */ + + *info = iinfo; + + if ( m > n ) { + k = 0; + for (i = 0; i < m; ++i) + if ( perm_r[i] == EMPTY ) { + perm_r[i] = n + k; + ++k; + } + } + + countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); + fixupL(min_mn, perm_r, &Glu); + + dLUWorkFree(iwork, dwork, &Glu); /* Free work space and compress storage */ + + if ( fact == SamePattern_SameRowPerm ) { + /* L and U structures may have changed due to possibly different + pivoting, even though the storage is available. + There could also be memory expansions, so the array locations + may have changed, */ + ((SCformat *)L->Store)->nnz = nnzL; + ((SCformat *)L->Store)->nsuper = Glu.supno[n]; + ((SCformat *)L->Store)->nzval = Glu.lusup; + ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; + ((SCformat *)L->Store)->rowind = Glu.lsub; + ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; + ((NCformat *)U->Store)->nnz = nnzU; + ((NCformat *)U->Store)->nzval = Glu.ucol; + ((NCformat *)U->Store)->rowind = Glu.usub; + ((NCformat *)U->Store)->colptr = Glu.xusub; + } else { + dCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, + Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, + Glu.xsup, SLU_SC, SLU_D, SLU_TRLU); + dCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, + Glu.usub, Glu.xusub, SLU_NC, SLU_D, SLU_TRU); + } + + ops[FACT] += ops[TRSV] + ops[GEMV]; + stat->expansions = --(Glu.num_expansions); + + if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); + SUPERLU_FREE (iperm_c); + SUPERLU_FREE (relax_end); + +} diff --git a/src/maths/SuperLU/dgstrs.c b/src/maths/SuperLU/dgstrs.c new file mode 100644 index 000000000..30a3975db --- /dev/null +++ b/src/maths/SuperLU/dgstrs.c @@ -0,0 +1,337 @@ + +/*! @file dgstrs.c + * \brief Solves a system using LU factorization + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + +#include + + +/* + * Function prototypes + */ +void dusolve(int, int, double*, double*); +void dlsolve(int, int, double*, double*); +void dmatvec(int, int, int, double*, double*, double*); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * DGSTRS solves a system of linear equations A*X=B or A'*X=B
+ * with A sparse and B dense, using the LU factorization computed by
+ * DGSTRF.
+ *
+ * See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * trans   (input) trans_t
+ *          Specifies the form of the system of equations:
+ *          = NOTRANS: A * X = B  (No transpose)
+ *          = TRANS:   A'* X = B  (Transpose)
+ *          = CONJ:    A**H * X = B  (Conjugate transpose)
+ *
+ * L       (input) SuperMatrix*
+ *         The factor L from the factorization Pr*A*Pc=L*U as computed by
+ *         dgstrf(). Use compressed row subscripts storage for supernodes,
+ *         i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
+ *
+ * U       (input) SuperMatrix*
+ *         The factor U from the factorization Pr*A*Pc=L*U as computed by
+ *         dgstrf(). Use column-wise storage scheme, i.e., U has types:
+ *         Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
+ *
+ * perm_c  (input) int*, dimension (L->ncol)
+ *	   Column permutation vector, which defines the 
+ *         permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *         in position j in A*Pc.
+ *
+ * perm_r  (input) int*, dimension (L->nrow)
+ *         Row permutation vector, which defines the permutation matrix Pr; 
+ *         perm_r[i] = j means row i of A is in position j in Pr*A.
+ *
+ * B       (input/output) SuperMatrix*
+ *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
+ *         On entry, the right hand side matrix.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * stat     (output) SuperLUStat_t*
+ *          Record the statistics on runtime and floating-point operation count.
+ *          See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * 
+ */ + +void +dgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, + int *perm_c, int *perm_r, SuperMatrix *B, + SuperLUStat_t *stat, int *info) +{ + +#ifdef _CRAY + _fcd ftcs1, ftcs2, ftcs3, ftcs4; +#endif + int incx = 1, incy = 1; +#ifdef USE_VENDOR_BLAS + double alpha = 1.0, beta = 1.0; + double *work_col; +#endif + DNformat *Bstore; + double *Bmat; + SCformat *Lstore; + NCformat *Ustore; + double *Lval, *Uval; + int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; + int i, j, k, iptr, jcol, n, ldb, nrhs; + double *work, *rhs_work, *soln; + flops_t solve_ops; + void dprint_soln(); + + /* Test input parameters ... */ + *info = 0; + Bstore = B->Store; + ldb = Bstore->lda; + nrhs = B->ncol; + if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; + else if ( L->nrow != L->ncol || L->nrow < 0 || + L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) + *info = -2; + else if ( U->nrow != U->ncol || U->nrow < 0 || + U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU ) + *info = -3; + else if ( ldb < SUPERLU_MAX(0, L->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) + *info = -6; + if ( *info ) { + i = -(*info); + xerbla_("dgstrs", &i); + return; + } + + n = L->nrow; + work = doubleCalloc(n * nrhs); + if ( !work ) ABORT_SuperLU("Malloc fails for local work[]."); + soln = doubleMalloc(n); + if ( !soln ) ABORT_SuperLU("Malloc fails for local soln[]."); + + Bmat = Bstore->nzval; + Lstore = L->Store; + Lval = Lstore->nzval; + Ustore = U->Store; + Uval = Ustore->nzval; + solve_ops = 0; + + if ( trans == NOTRANS ) { + /* Permute right hand sides to form Pr*B */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + /* Forward solve PLy=Pb. */ + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + nrow = nsupr - nsupc; + + solve_ops += nsupc * (nsupc - 1) * nrhs; + solve_ops += 2 * nrow * nsupc * nrhs; + + if ( nsupc == 1 ) { + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + luptr = L_NZ_START(fsupc); + for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ + irow = L_SUB(iptr); + ++luptr; + rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; + } + } + } else { + luptr = L_NZ_START(fsupc); +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); + STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); + + SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, + &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, + &beta, &work[0], &n ); +#else + dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); + + dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, + &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, + &beta, &work[0], &n ); +#endif + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + work_col = &work[j*n]; + iptr = istart + nsupc; + for (i = 0; i < nrow; i++) { + irow = L_SUB(iptr); + rhs_work[irow] -= work_col[i]; /* Scatter */ + work_col[i] = 0.0; + iptr++; + } + } +#else + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); + dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], + &rhs_work[fsupc], &work[0] ); + + iptr = istart + nsupc; + for (i = 0; i < nrow; i++) { + irow = L_SUB(iptr); + rhs_work[irow] -= work[i]; + work[i] = 0.0; + iptr++; + } + } +#endif + } /* else ... */ + } /* for L-solve */ + +#ifdef DEBUG + printf("After L-solve: y=\n"); + dprint_soln(n, nrhs, Bmat); +#endif + + /* + * Back solve Ux=y. + */ + for (k = Lstore->nsuper; k >= 0; k--) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += nsupc * (nsupc + 1) * nrhs; + + if ( nsupc == 1 ) { + rhs_work = &Bmat[0]; + for (j = 0; j < nrhs; j++) { + rhs_work[fsupc] /= Lval[luptr]; + rhs_work += ldb; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("U", strlen("U")); + ftcs3 = _cptofcd("N", strlen("N")); + STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); +#else + dtrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); +#endif +#else + for (j = 0; j < nrhs; j++) + dusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); +#endif + } + + for (j = 0; j < nrhs; ++j) { + rhs_work = &Bmat[j*ldb]; + for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { + solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ + irow = U_SUB(i); + rhs_work[irow] -= rhs_work[jcol] * Uval[i]; + } + } + } + + } /* for U-solve */ + +#ifdef DEBUG + printf("After U-solve: x=\n"); + dprint_soln(n, nrhs, Bmat); +#endif + + /* Compute the final solution X := Pc*X. */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + stat->ops[SOLVE] = solve_ops; + + } else { /* Solve A'*X=B or CONJ(A)*X=B */ + /* Permute right hand sides to form Pc'*B. */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + stat->ops[SOLVE] = 0; + for (k = 0; k < nrhs; ++k) { + + /* Multiply by inv(U'). */ + sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); + + /* Multiply by inv(L'). */ + sp_dtrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); + + } + /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + } + + SUPERLU_FREE(work); + SUPERLU_FREE(soln); +} + +/* + * Diagnostic print of the solution vector + */ +void +dprint_soln(int n, int nrhs, double *soln) +{ + int i; + + for (i = 0; i < n; i++) + printf("\t%d: %.4f\n", i, soln[i]); +} diff --git a/src/maths/SuperLU/dlacon.c b/src/maths/SuperLU/dlacon.c new file mode 100644 index 000000000..bcbf32815 --- /dev/null +++ b/src/maths/SuperLU/dlacon.c @@ -0,0 +1,236 @@ + +/*! @file dlacon.c + * \brief Estimates the 1-norm + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ */ +#include +#include + +/*! \brief + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   DLACON estimates the 1-norm of a square matrix A.   
+ *   Reverse communication is used for evaluating matrix-vector products. 
+ * 
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   N      (input) INT
+ *          The order of the matrix.  N >= 1.   
+ *
+ *   V      (workspace) DOUBLE PRECISION array, dimension (N)   
+ *          On the final return, V = A*W,  where  EST = norm(V)/norm(W)   
+ *          (W is not returned).   
+ *
+ *   X      (input/output) DOUBLE PRECISION array, dimension (N)   
+ *          On an intermediate return, X should be overwritten by   
+ *                A * X,   if KASE=1,   
+ *                A' * X,  if KASE=2,
+ *         and DLACON must be re-called with all the other parameters   
+ *          unchanged.   
+ *
+ *   ISGN   (workspace) INT array, dimension (N)
+ *
+ *   EST    (output) DOUBLE PRECISION   
+ *          An estimate (a lower bound) for norm(A).   
+ *
+ *   KASE   (input/output) INT
+ *          On the initial call to DLACON, KASE should be 0.   
+ *          On an intermediate return, KASE will be 1 or 2, indicating   
+ *          whether X should be overwritten by A * X  or A' * X.   
+ *          On the final return from DLACON, KASE will again be 0.   
+ *
+ *   Further Details   
+ *   ======= =======   
+ *
+ *   Contributed by Nick Higham, University of Manchester.   
+ *   Originally named CONEST, dated March 16, 1988.   
+ *
+ *   Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of 
+ *   a real or complex matrix, with applications to condition estimation", 
+ *   ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988.   
+ *   ===================================================================== 
+ * 
+ */ + +int +dlacon_(int *n, double *v, double *x, int *isgn, double *est, int *kase) + +{ + + + /* Table of constant values */ + int c__1 = 1; + double zero = 0.0; + double one = 1.0; + + /* Local variables */ + static int iter; + static int jump, jlast; + static double altsgn, estold; + static int i, j; + double temp; +#ifdef _CRAY + extern int ISAMAX(int *, double *, int *); + extern double SASUM(int *, double *, int *); + extern int SCOPY(int *, double *, int *, double *, int *); +#else + extern int idamax_(int *, double *, int *); + extern double dasum_(int *, double *, int *); + extern int dcopy_(int *, double *, int *, double *, int *); +#endif +#define d_sign(a, b) (b >= 0 ? fabs(a) : -fabs(a)) /* Copy sign */ +#define i_dnnt(a) \ + ( a>=0 ? floor(a+.5) : -floor(.5-a) ) /* Round to nearest integer */ + + if ( *kase == 0 ) { + for (i = 0; i < *n; ++i) { + x[i] = 1. / (double) (*n); + } + *kase = 1; + jump = 1; + return 0; + } + + switch (jump) { + case 1: goto L20; + case 2: goto L40; + case 3: goto L70; + case 4: goto L110; + case 5: goto L140; + } + + /* ................ ENTRY (JUMP = 1) + FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X. */ + L20: + if (*n == 1) { + v[0] = x[0]; + *est = fabs(v[0]); + /* ... QUIT */ + goto L150; + } +#ifdef _CRAY + *est = SASUM(n, x, &c__1); +#else + *est = dasum_(n, x, &c__1); +#endif + + for (i = 0; i < *n; ++i) { + x[i] = d_sign(one, x[i]); + isgn[i] = i_dnnt(x[i]); + } + *kase = 2; + jump = 2; + return 0; + + /* ................ ENTRY (JUMP = 2) + FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ +L40: +#ifdef _CRAY + j = ISAMAX(n, &x[0], &c__1); +#else + j = idamax_(n, &x[0], &c__1); +#endif + --j; + iter = 2; + + /* MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */ +L50: + for (i = 0; i < *n; ++i) x[i] = zero; + x[j] = one; + *kase = 1; + jump = 3; + return 0; + + /* ................ ENTRY (JUMP = 3) + X HAS BEEN OVERWRITTEN BY A*X. */ +L70: +#ifdef _CRAY + SCOPY(n, x, &c__1, v, &c__1); +#else + dcopy_(n, x, &c__1, v, &c__1); +#endif + estold = *est; +#ifdef _CRAY + *est = SASUM(n, v, &c__1); +#else + *est = dasum_(n, v, &c__1); +#endif + + for (i = 0; i < *n; ++i) + if (i_dnnt(d_sign(one, x[i])) != isgn[i]) + goto L90; + + /* REPEATED SIGN VECTOR DETECTED, HENCE ALGORITHM HAS CONVERGED. */ + goto L120; + +L90: + /* TEST FOR CYCLING. */ + if (*est <= estold) goto L120; + + for (i = 0; i < *n; ++i) { + x[i] = d_sign(one, x[i]); + isgn[i] = i_dnnt(x[i]); + } + *kase = 2; + jump = 4; + return 0; + + /* ................ ENTRY (JUMP = 4) + X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */ +L110: + jlast = j; +#ifdef _CRAY + j = ISAMAX(n, &x[0], &c__1); +#else + j = idamax_(n, &x[0], &c__1); +#endif + --j; + if (x[jlast] != fabs(x[j]) && iter < 5) { + ++iter; + goto L50; + } + + /* ITERATION COMPLETE. FINAL STAGE. */ +L120: + altsgn = 1.; + for (i = 1; i <= *n; ++i) { + x[i-1] = altsgn * ((double)(i - 1) / (double)(*n - 1) + 1.); + altsgn = -altsgn; + } + *kase = 1; + jump = 5; + return 0; + + /* ................ ENTRY (JUMP = 5) + X HAS BEEN OVERWRITTEN BY A*X. */ +L140: +#ifdef _CRAY + temp = SASUM(n, x, &c__1) / (double)(*n * 3) * 2.; +#else + temp = dasum_(n, x, &c__1) / (double)(*n * 3) * 2.; +#endif + if (temp > *est) { +#ifdef _CRAY + SCOPY(n, &x[0], &c__1, &v[0], &c__1); +#else + dcopy_(n, &x[0], &c__1, &v[0], &c__1); +#endif + *est = temp; + } + +L150: + *kase = 0; + return 0; + +} /* dlacon_ */ diff --git a/src/maths/SuperLU/dlamch.c b/src/maths/SuperLU/dlamch.c new file mode 100644 index 000000000..628cf5307 --- /dev/null +++ b/src/maths/SuperLU/dlamch.c @@ -0,0 +1,971 @@ +/*! @file dlamch.c + * \brief Determines double precision machine parameters + * + *
+ *       -- LAPACK auxiliary routine (version 2.0) --   
+ *       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
+ *       Courant Institute, Argonne National Lab, and Rice University   
+ *       October 31, 1992   
+ * 
+ */ +#include +#include + +#define TRUE_ (1) +#define FALSE_ (0) +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + +/*! \brief + +
+    Purpose   
+    =======   
+
+    DLAMCH determines double precision machine parameters.   
+
+    Arguments   
+    =========   
+
+    CMACH   (input) CHARACTER*1   
+            Specifies the value to be returned by DLAMCH:   
+            = 'E' or 'e',   DLAMCH := eps   
+            = 'S' or 's ,   DLAMCH := sfmin   
+            = 'B' or 'b',   DLAMCH := base   
+            = 'P' or 'p',   DLAMCH := eps*base   
+            = 'N' or 'n',   DLAMCH := t   
+            = 'R' or 'r',   DLAMCH := rnd   
+            = 'M' or 'm',   DLAMCH := emin   
+            = 'U' or 'u',   DLAMCH := rmin   
+            = 'L' or 'l',   DLAMCH := emax   
+            = 'O' or 'o',   DLAMCH := rmax   
+
+            where   
+
+            eps   = relative machine precision   
+            sfmin = safe minimum, such that 1/sfmin does not overflow   
+            base  = base of the machine   
+            prec  = eps*base   
+            t     = number of (base) digits in the mantissa   
+            rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise   
+            emin  = minimum exponent before (gradual) underflow   
+            rmin  = underflow threshold - base**(emin-1)   
+            emax  = largest exponent before overflow   
+            rmax  = overflow threshold  - (base**emax)*(1-eps)   
+
+   ===================================================================== 
+
+*/ +double dlamch_(char *cmach) +{ + + + static int first = TRUE_; + + /* System generated locals */ + int i__1; + double ret_val; + /* Builtin functions */ + double pow_di(double *, int *); + /* Local variables */ + static double base; + static int beta; + static double emin, prec, emax; + static int imin, imax; + static int lrnd; + static double rmin, rmax, t, rmach; + extern int lsame_(char *, char *); + static double small, sfmin; + extern /* Subroutine */ int dlamc2_(int *, int *, int *, + double *, int *, double *, int *, double *); + static int it; + static double rnd, eps; + + if (first) { + first = FALSE_; + dlamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); + base = (double) beta; + t = (double) it; + if (lrnd) { + rnd = 1.; + i__1 = 1 - it; + eps = pow_di(&base, &i__1) / 2; + } else { + rnd = 0.; + i__1 = 1 - it; + eps = pow_di(&base, &i__1); + } + prec = eps * base; + emin = (double) imin; + emax = (double) imax; + sfmin = rmin; + small = 1. / rmax; + if (small >= sfmin) { + + /* Use SMALL plus a bit, to avoid the possibility of rounding + causing overflow when computing 1/sfmin. */ + sfmin = small * (eps + 1.); + } + } + + if (lsame_(cmach, "E")) { + rmach = eps; + } else if (lsame_(cmach, "S")) { + rmach = sfmin; + } else if (lsame_(cmach, "B")) { + rmach = base; + } else if (lsame_(cmach, "P")) { + rmach = prec; + } else if (lsame_(cmach, "N")) { + rmach = t; + } else if (lsame_(cmach, "R")) { + rmach = rnd; + } else if (lsame_(cmach, "M")) { + rmach = emin; + } else if (lsame_(cmach, "U")) { + rmach = rmin; + } else if (lsame_(cmach, "L")) { + rmach = emax; + } else if (lsame_(cmach, "O")) { + rmach = rmax; + } + + ret_val = rmach; + return ret_val; + +/* End of DLAMCH */ + +} /* dlamch_ */ +/* Subroutine */ +/*! \brief + +
+ Purpose   
+    =======   
+
+    DLAMC1 determines the machine parameters given by BETA, T, RND, and   
+    IEEE1.   
+
+    Arguments   
+    =========   
+
+    BETA    (output) INT   
+            The base of the machine.   
+
+    T       (output) INT   
+            The number of ( BETA ) digits in the mantissa.   
+
+    RND     (output) INT   
+            Specifies whether proper rounding  ( RND = .TRUE. )  or   
+            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
+  
+            be a reliable guide to the way in which the machine performs 
+  
+            its arithmetic.   
+
+    IEEE1   (output) INT   
+            Specifies whether rounding appears to be done in the IEEE   
+            'round to nearest' style.   
+
+    Further Details   
+    ===============   
+
+    The routine is based on the routine  ENVRON  by Malcolm and   
+    incorporates suggestions by Gentleman and Marovich. See   
+
+       Malcolm M. A. (1972) Algorithms to reveal properties of   
+          floating-point arithmetic. Comms. of the ACM, 15, 949-951.   
+
+       Gentleman W. M. and Marovich S. B. (1974) More on algorithms   
+          that reveal properties of floating point arithmetic units.   
+          Comms. of the ACM, 17, 276-277.   
+
+   ===================================================================== 
+
+*/ +int dlamc1_(int *beta, int *t, int *rnd, int + *ieee1) +{ + /* Initialized data */ + static int first = TRUE_; + /* System generated locals */ + double d__1, d__2; + /* Local variables */ + static int lrnd; + static double a, b, c, f; + static int lbeta; + static double savec; + extern double dlamc3_(double *, double *); + static int lieee1; + static double t1, t2; + static int lt; + static double one, qtr; + + if (first) { + first = FALSE_; + one = 1.; + +/* LBETA, LIEEE1, LT and LRND are the local values of BE +TA, + IEEE1, T and RND. + + Throughout this routine we use the function DLAMC3 to ens +ure + that relevant values are stored and not held in registers, + or + are not affected by optimizers. + + Compute a = 2.0**m with the smallest positive integer m s +uch + that + + fl( a + 1.0 ) = a. */ + + a = 1.; + c = 1.; + +/* + WHILE( C.EQ.ONE )LOOP */ +L10: + if (c == one) { + a *= 2; + c = dlamc3_(&a, &one); + d__1 = -a; + c = dlamc3_(&c, &d__1); + goto L10; + } +/* + END WHILE + + Now compute b = 2.0**m with the smallest positive integer +m + such that + + fl( a + b ) .gt. a. */ + + b = 1.; + c = dlamc3_(&a, &b); + +/* + WHILE( C.EQ.A )LOOP */ +L20: + if (c == a) { + b *= 2; + c = dlamc3_(&a, &b); + goto L20; + } +/* + END WHILE + + Now compute the base. a and c are neighbouring floating po +int + numbers in the interval ( beta**t, beta**( t + 1 ) ) and + so + their difference is beta. Adding 0.25 to c is to ensure that + it + is truncated to beta and not ( beta - 1 ). */ + + qtr = one / 4; + savec = c; + d__1 = -a; + c = dlamc3_(&c, &d__1); + lbeta = (int) (c + qtr); + +/* Now determine whether rounding or chopping occurs, by addin +g a + bit less than beta/2 and a bit more than beta/2 to + a. */ + + b = (double) lbeta; + d__1 = b / 2; + d__2 = -b / 100; + f = dlamc3_(&d__1, &d__2); + c = dlamc3_(&f, &a); + if (c == a) { + lrnd = TRUE_; + } else { + lrnd = FALSE_; + } + d__1 = b / 2; + d__2 = b / 100; + f = dlamc3_(&d__1, &d__2); + c = dlamc3_(&f, &a); + if (lrnd && c == a) { + lrnd = FALSE_; + } + +/* Try and decide whether rounding is done in the IEEE 'round + to + nearest' style. B/2 is half a unit in the last place of the +two + numbers A and SAVEC. Furthermore, A is even, i.e. has last +bit + zero, and SAVEC is odd. Thus adding B/2 to A should not cha +nge + A, but adding B/2 to SAVEC should change SAVEC. */ + + d__1 = b / 2; + t1 = dlamc3_(&d__1, &a); + d__1 = b / 2; + t2 = dlamc3_(&d__1, &savec); + lieee1 = t1 == a && t2 > savec && lrnd; + +/* Now find the mantissa, t. It should be the integer part + of + log to the base beta of a, however it is safer to determine + t + by powering. So we find t as the smallest positive integer +for + which + + fl( beta**t + 1.0 ) = 1.0. */ + + lt = 0; + a = 1.; + c = 1.; + +/* + WHILE( C.EQ.ONE )LOOP */ +L30: + if (c == one) { + ++lt; + a *= lbeta; + c = dlamc3_(&a, &one); + d__1 = -a; + c = dlamc3_(&c, &d__1); + goto L30; + } +/* + END WHILE */ + + } + + *beta = lbeta; + *t = lt; + *rnd = lrnd; + *ieee1 = lieee1; + return 0; + +/* End of DLAMC1 */ + +} /* dlamc1_ */ + + +/* Subroutine */ +/*! \brief + +
+    Purpose   
+    =======   
+
+    DLAMC2 determines the machine parameters specified in its argument   
+    list.   
+
+    Arguments   
+    =========   
+
+    BETA    (output) INT   
+            The base of the machine.   
+
+    T       (output) INT   
+            The number of ( BETA ) digits in the mantissa.   
+
+    RND     (output) INT   
+            Specifies whether proper rounding  ( RND = .TRUE. )  or   
+            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
+  
+            be a reliable guide to the way in which the machine performs 
+  
+            its arithmetic.   
+
+    EPS     (output) DOUBLE PRECISION   
+            The smallest positive number such that   
+
+               fl( 1.0 - EPS ) .LT. 1.0,   
+
+            where fl denotes the computed value.   
+
+    EMIN    (output) INT   
+            The minimum exponent before (gradual) underflow occurs.   
+
+    RMIN    (output) DOUBLE PRECISION   
+            The smallest normalized number for the machine, given by   
+            BASE**( EMIN - 1 ), where  BASE  is the floating point value 
+  
+            of BETA.   
+
+    EMAX    (output) INT   
+            The maximum exponent before overflow occurs.   
+
+    RMAX    (output) DOUBLE PRECISION   
+            The largest positive number for the machine, given by   
+            BASE**EMAX * ( 1 - EPS ), where  BASE  is the floating point 
+  
+            value of BETA.   
+
+    Further Details   
+    ===============   
+
+    The computation of  EPS  is based on a routine PARANOIA by   
+    W. Kahan of the University of California at Berkeley.   
+
+   ===================================================================== 
+
+*/ +int dlamc2_(int *beta, int *t, int *rnd, + double *eps, int *emin, double *rmin, int *emax, + double *rmax) +{ + + /* Table of constant values */ + static int c__1 = 1; + + /* Initialized data */ + static int first = TRUE_; + static int iwarn = FALSE_; + /* System generated locals */ + int i__1; + double d__1, d__2, d__3, d__4, d__5; + /* Builtin functions */ + double pow_di(double *, int *); + /* Local variables */ + static int ieee; + static double half; + static int lrnd; + static double leps, zero, a, b, c; + static int i, lbeta; + static double rbase; + static int lemin, lemax, gnmin; + static double small; + static int gpmin; + static double third, lrmin, lrmax, sixth; + extern /* Subroutine */ int dlamc1_(int *, int *, int *, + int *); + extern double dlamc3_(double *, double *); + static int lieee1; + extern /* Subroutine */ int dlamc4_(int *, double *, int *), + dlamc5_(int *, int *, int *, int *, int *, + double *); + static int lt, ngnmin, ngpmin; + static double one, two; + + if (first) { + first = FALSE_; + zero = 0.; + one = 1.; + two = 2.; + +/* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values + of + BETA, T, RND, EPS, EMIN and RMIN. + + Throughout this routine we use the function DLAMC3 to ens +ure + that relevant values are stored and not held in registers, + or + are not affected by optimizers. + + DLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. +*/ + + dlamc1_(&lbeta, <, &lrnd, &lieee1); + +/* Start to find EPS. */ + + b = (double) lbeta; + i__1 = -lt; + a = pow_di(&b, &i__1); + leps = a; + +/* Try some tricks to see whether or not this is the correct E +PS. */ + + b = two / 3; + half = one / 2; + d__1 = -half; + sixth = dlamc3_(&b, &d__1); + third = dlamc3_(&sixth, &sixth); + d__1 = -half; + b = dlamc3_(&third, &d__1); + b = dlamc3_(&b, &sixth); + b = abs(b); + if (b < leps) { + b = leps; + } + + leps = 1.; + +/* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */ +L10: + if (leps > b && b > zero) { + leps = b; + d__1 = half * leps; +/* Computing 5th power */ + d__3 = two, d__4 = d__3, d__3 *= d__3; +/* Computing 2nd power */ + d__5 = leps; + d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5); + c = dlamc3_(&d__1, &d__2); + d__1 = -c; + c = dlamc3_(&half, &d__1); + b = dlamc3_(&half, &c); + d__1 = -b; + c = dlamc3_(&half, &d__1); + b = dlamc3_(&half, &c); + goto L10; + } +/* + END WHILE */ + + if (a < leps) { + leps = a; + } + +/* Computation of EPS complete. + + Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3 +)). + Keep dividing A by BETA until (gradual) underflow occurs. T +his + is detected when we cannot recover the previous A. */ + + rbase = one / lbeta; + small = one; + for (i = 1; i <= 3; ++i) { + d__1 = small * rbase; + small = dlamc3_(&d__1, &zero); +/* L20: */ + } + a = dlamc3_(&one, &small); + dlamc4_(&ngpmin, &one, &lbeta); + d__1 = -one; + dlamc4_(&ngnmin, &d__1, &lbeta); + dlamc4_(&gpmin, &a, &lbeta); + d__1 = -a; + dlamc4_(&gnmin, &d__1, &lbeta); + ieee = FALSE_; + + if (ngpmin == ngnmin && gpmin == gnmin) { + if (ngpmin == gpmin) { + lemin = ngpmin; +/* ( Non twos-complement machines, no gradual under +flow; + e.g., VAX ) */ + } else if (gpmin - ngpmin == 3) { + lemin = ngpmin - 1 + lt; + ieee = TRUE_; +/* ( Non twos-complement machines, with gradual und +erflow; + e.g., IEEE standard followers ) */ + } else { + lemin = min(ngpmin,gpmin); +/* ( A guess; no known machine ) */ + iwarn = TRUE_; + } + + } else if (ngpmin == gpmin && ngnmin == gnmin) { + if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) { + lemin = max(ngpmin,ngnmin); +/* ( Twos-complement machines, no gradual underflow +; + e.g., CYBER 205 ) */ + } else { + lemin = min(ngpmin,ngnmin); +/* ( A guess; no known machine ) */ + iwarn = TRUE_; + } + + } else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin) + { + if (gpmin - min(ngpmin,ngnmin) == 3) { + lemin = max(ngpmin,ngnmin) - 1 + lt; +/* ( Twos-complement machines with gradual underflo +w; + no known machine ) */ + } else { + lemin = min(ngpmin,ngnmin); +/* ( A guess; no known machine ) */ + iwarn = TRUE_; + } + + } else { +/* Computing MIN */ + i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin); + lemin = min(i__1,gnmin); +/* ( A guess; no known machine ) */ + iwarn = TRUE_; + } +/* ** + Comment out this if block if EMIN is ok */ + if (iwarn) { + first = TRUE_; + printf("\n\n WARNING. The value EMIN may be incorrect:- "); + printf("EMIN = %8i\n",lemin); + printf("If, after inspection, the value EMIN looks acceptable"); + printf("please comment out \n the IF block as marked within the"); + printf("code of routine DLAMC2, \n otherwise supply EMIN"); + printf("explicitly.\n"); + } +/* ** + + Assume IEEE arithmetic if we found denormalised numbers abo +ve, + or if arithmetic seems to round in the IEEE style, determi +ned + in routine DLAMC1. A true IEEE machine should have both thi +ngs + true; however, faulty machines may have one or the other. */ + + ieee = ieee || lieee1; + +/* Compute RMIN by successive division by BETA. We could comp +ute + RMIN as BASE**( EMIN - 1 ), but some machines underflow dur +ing + this computation. */ + + lrmin = 1.; + i__1 = 1 - lemin; + for (i = 1; i <= 1-lemin; ++i) { + d__1 = lrmin * rbase; + lrmin = dlamc3_(&d__1, &zero); +/* L30: */ + } + +/* Finally, call DLAMC5 to compute EMAX and RMAX. */ + + dlamc5_(&lbeta, <, &lemin, &ieee, &lemax, &lrmax); + } + + *beta = lbeta; + *t = lt; + *rnd = lrnd; + *eps = leps; + *emin = lemin; + *rmin = lrmin; + *emax = lemax; + *rmax = lrmax; + + return 0; + + +/* End of DLAMC2 */ + +} /* dlamc2_ */ + +/*! \brief + +
+    Purpose   
+    =======   
+
+    DLAMC3  is intended to force  A  and  B  to be stored prior to doing 
+  
+    the addition of  A  and  B ,  for use in situations where optimizers 
+  
+    might hold one of these in a register.   
+
+    Arguments   
+    =========   
+
+    A, B    (input) DOUBLE PRECISION   
+            The values A and B.   
+
+   ===================================================================== 
+
+*/ +double dlamc3_(double *a, double *b) +{ +/* >>Start of File<< + System generated locals */ + double ret_val; + + ret_val = *a + *b; + + return ret_val; + +/* End of DLAMC3 */ + +} /* dlamc3_ */ + + +/* Subroutine */ +/*! \brief + +
+    Purpose   
+    =======   
+
+    DLAMC4 is a service routine for DLAMC2.   
+
+    Arguments   
+    =========   
+
+    EMIN    (output) EMIN   
+            The minimum exponent before (gradual) underflow, computed by 
+  
+            setting A = START and dividing by BASE until the previous A   
+            can not be recovered.   
+
+    START   (input) DOUBLE PRECISION   
+            The starting point for determining EMIN.   
+
+    BASE    (input) INT   
+            The base of the machine.   
+
+   ===================================================================== 
+
+*/ + +int dlamc4_(int *emin, double *start, int *base) +{ + /* System generated locals */ + int i__1; + double d__1; + /* Local variables */ + static double zero, a; + static int i; + static double rbase, b1, b2, c1, c2, d1, d2; + extern double dlamc3_(double *, double *); + static double one; + + a = *start; + one = 1.; + rbase = one / *base; + zero = 0.; + *emin = 1; + d__1 = a * rbase; + b1 = dlamc3_(&d__1, &zero); + c1 = a; + c2 = a; + d1 = a; + d2 = a; +/* + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. + $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP */ +L10: + if (c1 == a && c2 == a && d1 == a && d2 == a) { + --(*emin); + a = b1; + d__1 = a / *base; + b1 = dlamc3_(&d__1, &zero); + d__1 = b1 * *base; + c1 = dlamc3_(&d__1, &zero); + d1 = zero; + i__1 = *base; + for (i = 1; i <= *base; ++i) { + d1 += b1; +/* L20: */ + } + d__1 = a * rbase; + b2 = dlamc3_(&d__1, &zero); + d__1 = b2 / rbase; + c2 = dlamc3_(&d__1, &zero); + d2 = zero; + i__1 = *base; + for (i = 1; i <= *base; ++i) { + d2 += b2; +/* L30: */ + } + goto L10; + } +/* + END WHILE */ + + return 0; + +/* End of DLAMC4 */ + +} /* dlamc4_ */ + + +/* Subroutine */ +/*! \brief + +
+    Purpose   
+    =======   
+
+    DLAMC5 attempts to compute RMAX, the largest machine floating-point   
+    number, without overflow.  It assumes that EMAX + abs(EMIN) sum   
+    approximately to a power of 2.  It will fail on machines where this   
+    assumption does not hold, for example, the Cyber 205 (EMIN = -28625, 
+  
+    EMAX = 28718).  It will also fail if the value supplied for EMIN is   
+    too large (i.e. too close to zero), probably with overflow.   
+
+    Arguments   
+    =========   
+
+    BETA    (input) INT   
+            The base of floating-point arithmetic.   
+
+    P       (input) INT   
+            The number of base BETA digits in the mantissa of a   
+            floating-point value.   
+
+    EMIN    (input) INT   
+            The minimum exponent before (gradual) underflow.   
+
+    IEEE    (input) INT   
+            A int flag specifying whether or not the arithmetic   
+            system is thought to comply with the IEEE standard.   
+
+    EMAX    (output) INT   
+            The largest exponent before overflow   
+
+    RMAX    (output) DOUBLE PRECISION   
+            The largest machine floating-point number.   
+
+   ===================================================================== 
+  
+
+
+       First compute LEXP and UEXP, two powers of 2 that bound   
+       abs(EMIN). We then assume that EMAX + abs(EMIN) will sum   
+       approximately to the bound that is closest to abs(EMIN).   
+       (EMAX is the exponent of the required number RMAX).
+
+*/ +int dlamc5_(int *beta, int *p, int *emin, + int *ieee, int *emax, double *rmax) +{ + + /* Table of constant values */ + static double c_b5 = 0.; + + /* System generated locals */ + int i__1; + double d__1; + /* Local variables */ + static int lexp; + static double oldy; + static int uexp, i; + static double y, z; + static int nbits; + extern double dlamc3_(double *, double *); + static double recbas; + static int exbits, expsum, try__; + + + + lexp = 1; + exbits = 1; +L10: + try__ = lexp << 1; + if (try__ <= -(*emin)) { + lexp = try__; + ++exbits; + goto L10; + } + if (lexp == -(*emin)) { + uexp = lexp; + } else { + uexp = try__; + ++exbits; + } + +/* Now -LEXP is less than or equal to EMIN, and -UEXP is greater + than or equal to EMIN. EXBITS is the number of bits needed to + store the exponent. */ + + if (uexp + *emin > -lexp - *emin) { + expsum = lexp << 1; + } else { + expsum = uexp << 1; + } + +/* EXPSUM is the exponent range, approximately equal to + EMAX - EMIN + 1 . */ + + *emax = expsum + *emin - 1; + nbits = exbits + 1 + *p; + +/* NBITS is the total number of bits needed to store a + floating-point number. */ + + if (nbits % 2 == 1 && *beta == 2) { + +/* Either there are an odd number of bits used to store a + floating-point number, which is unlikely, or some bits are + + not used in the representation of numbers, which is possible +, + (e.g. Cray machines) or the mantissa has an implicit bit, + (e.g. IEEE machines, Dec Vax machines), which is perhaps the + + most likely. We have to assume the last alternative. + If this is true, then we need to reduce EMAX by one because + + there must be some way of representing zero in an implicit-b +it + system. On machines like Cray, we are reducing EMAX by one + + unnecessarily. */ + + --(*emax); + } + + if (*ieee) { + +/* Assume we are on an IEEE machine which reserves one exponent + + for infinity and NaN. */ + + --(*emax); + } + +/* Now create RMAX, the largest machine number, which should + be equal to (1.0 - BETA**(-P)) * BETA**EMAX . + + First compute 1.0 - BETA**(-P), being careful that the + result is less than 1.0 . */ + + recbas = 1. / *beta; + z = *beta - 1.; + y = 0.; + i__1 = *p; + for (i = 1; i <= *p; ++i) { + z *= recbas; + if (y < 1.) { + oldy = y; + } + y = dlamc3_(&y, &z); +/* L20: */ + } + if (y >= 1.) { + y = oldy; + } + +/* Now multiply by BETA**EMAX to get RMAX. */ + + i__1 = *emax; + for (i = 1; i <= *emax; ++i) { + d__1 = y * *beta; + y = dlamc3_(&d__1, &c_b5); +/* L30: */ + } + + *rmax = y; + return 0; + +/* End of DLAMC5 */ + +} /* dlamc5_ */ + +double pow_di(double *ap, int *bp) +{ + double pow, x; + int n; + + pow = 1; + x = *ap; + n = *bp; + + if(n != 0){ + if(n < 0) { + n = -n; + x = 1/x; + } + for( ; ; ) { + if(n & 01) pow *= x; + if(n >>= 1) x *= x; + else break; + } + } + return(pow); +} + diff --git a/src/maths/SuperLU/dlangs.c b/src/maths/SuperLU/dlangs.c new file mode 100644 index 000000000..c0e9797cd --- /dev/null +++ b/src/maths/SuperLU/dlangs.c @@ -0,0 +1,119 @@ + +/*! @file dlangs.c + * \brief Returns the value of the one norm + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Modified from lapack routine DLANGE 
+ * 
+ */ +/* + * File name: dlangs.c + * History: Modified from lapack routine DLANGE + */ +#include +#include + +/*! \brief + * + *
+ * Purpose   
+ *   =======   
+ *
+ *   DLANGS returns the value of the one norm, or the Frobenius norm, or 
+ *   the infinity norm, or the element of largest absolute value of a 
+ *   real matrix A.   
+ *
+ *   Description   
+ *   ===========   
+ *
+ *   DLANGE returns the value   
+ *
+ *      DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
+ *               (   
+ *               ( norm1(A),         NORM = '1', 'O' or 'o'   
+ *               (   
+ *               ( normI(A),         NORM = 'I' or 'i'   
+ *               (   
+ *               ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
+ *
+ *   where  norm1  denotes the  one norm of a matrix (maximum column sum), 
+ *   normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
+ *   normF  denotes the  Frobenius norm of a matrix (square root of sum of 
+ *   squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   NORM    (input) CHARACTER*1   
+ *           Specifies the value to be returned in DLANGE as described above.   
+ *   A       (input) SuperMatrix*
+ *           The M by N sparse matrix A. 
+ *
+ *  =====================================================================
+ * 
+ */ + +double dlangs(char *norm, SuperMatrix *A) +{ + + /* Local variables */ + NCformat *Astore; + double *Aval; + int i, j, irow; + double value, sum; + double *rwork; + + Astore = A->Store; + Aval = Astore->nzval; + + if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { + value = 0.; + + } else if (lsame_(norm, "M")) { + /* Find max(abs(A(i,j))). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + value = SUPERLU_MAX( value, fabs( Aval[i]) ); + + } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { + /* Find norm1(A). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) { + sum = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + sum += fabs(Aval[i]); + value = SUPERLU_MAX(value,sum); + } + + } else if (lsame_(norm, "I")) { + /* Find normI(A). */ + if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for rwork."); + for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { + irow = Astore->rowind[i]; + rwork[irow] += fabs(Aval[i]); + } + value = 0.; + for (i = 0; i < A->nrow; ++i) + value = SUPERLU_MAX(value, rwork[i]); + + SUPERLU_FREE (rwork); + + } else if (lsame_(norm, "F") || lsame_(norm, "E")) { + /* Find normF(A). */ + ABORT_SuperLU("Not implemented."); + } else + ABORT_SuperLU("Illegal norm specified."); + + return (value); + +} /* dlangs */ + diff --git a/src/maths/SuperLU/dlaqgs.c b/src/maths/SuperLU/dlaqgs.c new file mode 100644 index 000000000..aebbc0782 --- /dev/null +++ b/src/maths/SuperLU/dlaqgs.c @@ -0,0 +1,145 @@ + +/*! @file dlaqgs.c + * \brief Equlibrates a general sprase matrix + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ * Modified from LAPACK routine DLAQGE
+ * 
+ */ +/* + * File name: dlaqgs.c + * History: Modified from LAPACK routine DLAQGE + */ +#include +#include + +/*! \brief + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   DLAQGS equilibrates a general sparse M by N matrix A using the row and   
+ *   scaling factors in the vectors R and C.   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   A       (input/output) SuperMatrix*
+ *           On exit, the equilibrated matrix.  See EQUED for the form of 
+ *           the equilibrated matrix. The type of A can be:
+ *	    Stype = NC; Dtype = SLU_D; Mtype = GE.
+ *	    
+ *   R       (input) double*, dimension (A->nrow)
+ *           The row scale factors for A.
+ *	    
+ *   C       (input) double*, dimension (A->ncol)
+ *           The column scale factors for A.
+ *	    
+ *   ROWCND  (input) double
+ *           Ratio of the smallest R(i) to the largest R(i).
+ *	    
+ *   COLCND  (input) double
+ *           Ratio of the smallest C(i) to the largest C(i).
+ *	    
+ *   AMAX    (input) double
+ *           Absolute value of largest matrix entry.
+ *	    
+ *   EQUED   (output) char*
+ *           Specifies the form of equilibration that was done.   
+ *           = 'N':  No equilibration   
+ *           = 'R':  Row equilibration, i.e., A has been premultiplied by  
+ *                   diag(R).   
+ *           = 'C':  Column equilibration, i.e., A has been postmultiplied  
+ *                   by diag(C).   
+ *           = 'B':  Both row and column equilibration, i.e., A has been
+ *                   replaced by diag(R) * A * diag(C).   
+ *
+ *   Internal Parameters   
+ *   ===================   
+ *
+ *   THRESH is a threshold value used to decide if row or column scaling   
+ *   should be done based on the ratio of the row or column scaling   
+ *   factors.  If ROWCND < THRESH, row scaling is done, and if   
+ *   COLCND < THRESH, column scaling is done.   
+ *
+ *   LARGE and SMALL are threshold values used to decide if row scaling   
+ *   should be done based on the absolute size of the largest matrix   
+ *   element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+ *
+ *   ===================================================================== 
+ * 
+ */ + +void +dlaqgs(SuperMatrix *A, double *r, double *c, + double rowcnd, double colcnd, double amax, char *equed) +{ + + +#define THRESH (0.1) + + /* Local variables */ + NCformat *Astore; + double *Aval; + int i, j, irow; + double large, small, cj; + + + /* Quick return if possible */ + if (A->nrow <= 0 || A->ncol <= 0) { + *(unsigned char *)equed = 'N'; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + + /* Initialize LARGE and SMALL. */ + small = dlamch_("Safe minimum") / dlamch_("Precision"); + large = 1. / small; + + if (rowcnd >= THRESH && amax >= small && amax <= large) { + if (colcnd >= THRESH) + *(unsigned char *)equed = 'N'; + else { + /* Column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + Aval[i] *= cj; + } + } + *(unsigned char *)equed = 'C'; + } + } else if (colcnd >= THRESH) { + /* Row scaling, no column scaling */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + Aval[i] *= r[irow]; + } + *(unsigned char *)equed = 'R'; + } else { + /* Row and column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + Aval[i] *= cj * r[irow]; + } + } + *(unsigned char *)equed = 'B'; + } + + return; + +} /* dlaqgs */ + diff --git a/src/maths/SuperLU/dldperm.c b/src/maths/SuperLU/dldperm.c new file mode 100644 index 000000000..69c6bce50 --- /dev/null +++ b/src/maths/SuperLU/dldperm.c @@ -0,0 +1,165 @@ + +/*! @file + * \brief Finds a row permutation so that the matrix has large entries on the diagonal + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory.
+ * June 30, 2009
+ * 
+ */ + +#include + +extern int_t mc64id_(int_t*); +extern int_t mc64ad_(int_t*, int_t*, int_t*, int_t [], int_t [], double [], + int_t*, int_t [], int_t*, int_t[], int_t*, double [], + int_t [], int_t []); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ *   DLDPERM finds a row permutation so that the matrix has large
+ *   entries on the diagonal.
+ *
+ * Arguments
+ * =========
+ *
+ * job    (input) int
+ *        Control the action. Possible values for JOB are:
+ *        = 1 : Compute a row permutation of the matrix so that the
+ *              permuted matrix has as many entries on its diagonal as
+ *              possible. The values on the diagonal are of arbitrary size.
+ *              HSL subroutine MC21A/AD is used for this.
+ *        = 2 : Compute a row permutation of the matrix so that the smallest 
+ *              value on the diagonal of the permuted matrix is maximized.
+ *        = 3 : Compute a row permutation of the matrix so that the smallest
+ *              value on the diagonal of the permuted matrix is maximized.
+ *              The algorithm differs from the one used for JOB = 2 and may
+ *              have quite a different performance.
+ *        = 4 : Compute a row permutation of the matrix so that the sum
+ *              of the diagonal entries of the permuted matrix is maximized.
+ *        = 5 : Compute a row permutation of the matrix so that the product
+ *              of the diagonal entries of the permuted matrix is maximized
+ *              and vectors to scale the matrix so that the nonzero diagonal 
+ *              entries of the permuted matrix are one in absolute value and 
+ *              all the off-diagonal entries are less than or equal to one in 
+ *              absolute value.
+ *        Restriction: 1 <= JOB <= 5.
+ *
+ * n      (input) int
+ *        The order of the matrix.
+ *
+ * nnz    (input) int
+ *        The number of nonzeros in the matrix.
+ *
+ * adjncy (input) int*, of size nnz
+ *        The adjacency structure of the matrix, which contains the row
+ *        indices of the nonzeros.
+ *
+ * colptr (input) int*, of size n+1
+ *        The pointers to the beginning of each column in ADJNCY.
+ *
+ * nzval  (input) double*, of size nnz
+ *        The nonzero values of the matrix. nzval[k] is the value of
+ *        the entry corresponding to adjncy[k].
+ *        It is not used if job = 1.
+ *
+ * perm   (output) int*, of size n
+ *        The permutation vector. perm[i] = j means row i in the
+ *        original matrix is in row j of the permuted matrix.
+ *
+ * u      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the row scaling factors. 
+ *
+ * v      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the column scaling factors. 
+ *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
+ * 
+ */ + +int +dldperm(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], + double nzval[], int_t *perm, double u[], double v[]) +{ + int_t i, liw, ldw, num; + int_t *iw, icntl[10], info[10]; + double *dw; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC("Enter dldperm()"); +#endif + liw = 5*n; + if ( job == 3 ) liw = 10*n + nnz; + if ( !(iw = intMalloc(liw)) ) ABORT_SuperLU("Malloc fails for iw[]"); + ldw = 3*n + nnz; + if ( !(dw = (double*) SUPERLU_MALLOC(ldw * sizeof(double))) ) + ABORT_SuperLU("Malloc fails for dw[]"); + + /* Increment one to get 1-based indexing. */ + for (i = 0; i <= n; ++i) ++colptr[i]; + for (i = 0; i < nnz; ++i) ++adjncy[i]; +#if ( DEBUGlevel>=2 ) + printf("LDPERM(): n %d, nnz %d\n", n, nnz); + slu_PrintInt10("colptr", n+1, colptr); + slu_PrintInt10("adjncy", nnz, adjncy); +#endif + + /* + * NOTE: + * ===== + * + * MC64AD assumes that column permutation vector is defined as: + * perm(i) = j means column i of permuted A is in column j of original A. + * + * Since a symmetric permutation preserves the diagonal entries. Then + * by the following relation: + * P'(A*P')P = P'A + * we can apply inverse(perm) to rows of A to get large diagonal entries. + * But, since 'perm' defined in MC64AD happens to be the reverse of + * SuperLU's definition of permutation vector, therefore, it is already + * an inverse for our purpose. We will thus use it directly. + * + */ + mc64id_(icntl); +#if 0 + /* Suppress error and warning messages. */ + icntl[0] = -1; + icntl[1] = -1; +#endif + + mc64ad_(&job, &n, &nnz, colptr, adjncy, nzval, &num, perm, + &liw, iw, &ldw, dw, icntl, info); + +#if ( DEBUGlevel>=2 ) + slu_PrintInt10("perm", n, perm); + printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); +#endif + if ( info[0] == 1 ) { /* Structurally singular */ + printf(".. The last %d permutations:\n", n-num); + slu_PrintInt10("perm", n-num, &perm[num]); + } + + /* Restore to 0-based indexing. */ + for (i = 0; i <= n; ++i) --colptr[i]; + for (i = 0; i < nnz; ++i) --adjncy[i]; + for (i = 0; i < n; ++i) --perm[i]; + + if ( job == 5 ) + for (i = 0; i < n; ++i) { + u[i] = dw[i]; + v[i] = dw[n+i]; + } + + SUPERLU_FREE(iw); + SUPERLU_FREE(dw); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC("Exit dldperm()"); +#endif + + return info[0]; +} diff --git a/src/maths/SuperLU/dmemory.c b/src/maths/SuperLU/dmemory.c new file mode 100644 index 000000000..de3f46349 --- /dev/null +++ b/src/maths/SuperLU/dmemory.c @@ -0,0 +1,701 @@ + +/*! @file dmemory.c + * \brief Memory details + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory.
+ * June 30, 2009
+ * 
+ */ +#include + + +/* Internal prototypes */ +void *dexpand (int *, MemType,int, int, GlobalLU_t *); +int dLUWorkInit (int, int, int, int **, double **, GlobalLU_t *); +void copy_mem_double (int, void *, void *); +void dStackCompress (GlobalLU_t *); +void dSetupSpace (void *, int, GlobalLU_t *); +void *duser_malloc (int, int, GlobalLU_t *); +void duser_free (int, int, GlobalLU_t *); + +/* External prototypes (in memory.c - prec-independent) */ +extern void copy_mem_int (int, void *, void *); +extern void user_bcopy (char *, char *, int); + + +/* Macros to manipulate stack */ +#define StackFull(x) ( x + Glu->stack.used >= Glu->stack.size ) +#define NotDoubleAlign(addr) ( (long int)addr & 7 ) +#define DoubleAlign(addr) ( ((long int)addr + 7) & ~7L ) +#define TempSpace(m, w) ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \ + (w + 1) * m * sizeof(double) ) +#define Reduce(alpha) ((alpha + 1) / 2) /* i.e. (alpha-1)/2 + 1 */ + + + + +/*! \brief Setup the memory model to be used for factorization. + * + * lwork = 0: use system malloc; + * lwork > 0: use user-supplied work[] space. + */ +void dSetupSpace(void *work, int lwork, GlobalLU_t *Glu) +{ + if ( lwork == 0 ) { + Glu->MemModel = SYSTEM; /* malloc/free */ + } else if ( lwork > 0 ) { + Glu->MemModel = USER; /* user provided space */ + Glu->stack.used = 0; + Glu->stack.top1 = 0; + Glu->stack.top2 = (lwork/4)*4; /* must be word addressable */ + Glu->stack.size = Glu->stack.top2; + Glu->stack.array = (void *) work; + } +} + + + +void *duser_malloc(int bytes, int which_end, GlobalLU_t *Glu) +{ + void *buf; + + if ( StackFull(bytes) ) return (NULL); + + if ( which_end == HEAD ) { + buf = (char*) Glu->stack.array + Glu->stack.top1; + Glu->stack.top1 += bytes; + } else { + Glu->stack.top2 -= bytes; + buf = (char*) Glu->stack.array + Glu->stack.top2; + } + + Glu->stack.used += bytes; + return buf; +} + + +void duser_free(int bytes, int which_end, GlobalLU_t *Glu) +{ + if ( which_end == HEAD ) { + Glu->stack.top1 -= bytes; + } else { + Glu->stack.top2 += bytes; + } + Glu->stack.used -= bytes; +} + + + +/*! \brief + * + *
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total_needed (float)
+ *      The amount of space needed in bytes to perform factorization.
+ * 
+ */ +int dQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) +{ + SCformat *Lstore; + NCformat *Ustore; + register int n, iword, dword, panel_size = sp_ienv(1); + + Lstore = L->Store; + Ustore = U->Store; + n = L->ncol; + iword = sizeof(int); + dword = sizeof(double); + + /* For LU factors */ + mem_usage->for_lu = (float)( (4.0*n + 3.0) * iword + + Lstore->nzval_colptr[n] * dword + + Lstore->rowind_colptr[n] * iword ); + mem_usage->for_lu += (float)( (n + 1.0) * iword + + Ustore->colptr[n] * (dword + iword) ); + + /* Working storage to support factorization */ + mem_usage->total_needed = mem_usage->for_lu + + (float)( (2.0 * panel_size + 4.0 + NO_MARKER) * n * iword + + (panel_size + 1.0) * n * dword ); + + return 0; +} /* dQuerySpace */ + + +/*! \brief + * + *
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total_needed (float)
+ *      The amount of space needed in bytes to perform factorization.
+ * 
+ */ +int ilu_dQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) +{ + SCformat *Lstore; + NCformat *Ustore; + register int n, panel_size = sp_ienv(1); + register float iword, dword; + + Lstore = L->Store; + Ustore = U->Store; + n = L->ncol; + iword = sizeof(int); + dword = sizeof(double); + + /* For LU factors */ + mem_usage->for_lu = (float)( (4.0f * n + 3.0f) * iword + + Lstore->nzval_colptr[n] * dword + + Lstore->rowind_colptr[n] * iword ); + mem_usage->for_lu += (float)( (n + 1.0f) * iword + + Ustore->colptr[n] * (dword + iword) ); + + /* Working storage to support factorization. + ILU needs 5*n more integers than LU */ + mem_usage->total_needed = mem_usage->for_lu + + (float)( (2.0f * panel_size + 9.0f + NO_MARKER) * n * iword + + (panel_size + 1.0f) * n * dword ); + + return 0; +} /* ilu_dQuerySpace */ + + +/*! \brief Allocate storage for the data structures common to all factor routines. + * + *
+ * For those unpredictable size, estimate as fill_ratio * nnz(A).
+ * Return value:
+ *     If lwork = -1, return the estimated amount of space required, plus n;
+ *     otherwise, return the amount of space actually allocated when
+ *     memory allocation failure occurred.
+ * 
+ */ +int +dLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz, + int panel_size, double fill_ratio, SuperMatrix *L, SuperMatrix *U, + GlobalLU_t *Glu, int **iwork, double **dwork) +{ + int info, iword, dword; + SCformat *Lstore; + NCformat *Ustore; + int *xsup, *supno; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + double *ucol; + int *usub, *xusub; + int nzlmax, nzumax, nzlumax; + + iword = sizeof(int); + dword = sizeof(double); + Glu->n = n; + Glu->num_expansions = 0; + + if ( !Glu->expanders ) + Glu->expanders = (ExpHeader*)SUPERLU_MALLOC( NO_MEMTYPE * + sizeof(ExpHeader) ); + if ( !Glu->expanders ) ABORT_SuperLU("SUPERLU_MALLOC fails for expanders"); + + if ( fact != SamePattern_SameRowPerm ) { + /* Guess for L\U factors */ + nzumax = nzlumax = fill_ratio * annz; + nzlmax = SUPERLU_MAX(1, fill_ratio/4.) * annz; + + if ( lwork == -1 ) { + return ( GluIntArray(n) * iword + TempSpace(m, panel_size) + + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); + } else { + dSetupSpace(work, lwork, Glu); + } + +#if ( PRNTlevel >= 1 ) + printf("dLUMemInit() called: fill_ratio %.0f, nzlmax %ld, nzumax %ld\n", + fill_ratio, nzlmax, nzumax); + fflush(stdout); +#endif + + /* Integer pointers for L\U factors */ + if ( Glu->MemModel == SYSTEM ) { + xsup = intMalloc(n+1); + supno = intMalloc(n+1); + xlsub = intMalloc(n+1); + xlusup = intMalloc(n+1); + xusub = intMalloc(n+1); + } else { + xsup = (int *)duser_malloc((n+1) * iword, HEAD, Glu); + supno = (int *)duser_malloc((n+1) * iword, HEAD, Glu); + xlsub = (int *)duser_malloc((n+1) * iword, HEAD, Glu); + xlusup = (int *)duser_malloc((n+1) * iword, HEAD, Glu); + xusub = (int *)duser_malloc((n+1) * iword, HEAD, Glu); + } + + lusup = (double *) dexpand( &nzlumax, LUSUP, 0, 0, Glu ); + ucol = (double *) dexpand( &nzumax, UCOL, 0, 0, Glu ); + lsub = (int *) dexpand( &nzlmax, LSUB, 0, 0, Glu ); + usub = (int *) dexpand( &nzumax, USUB, 0, 1, Glu ); + + while ( !lusup || !ucol || !lsub || !usub ) { + if ( Glu->MemModel == SYSTEM ) { + SUPERLU_FREE(lusup); + SUPERLU_FREE(ucol); + SUPERLU_FREE(lsub); + SUPERLU_FREE(usub); + } else { + duser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, + HEAD, Glu); + } + nzlumax /= 2; + nzumax /= 2; + nzlmax /= 2; + if ( nzlumax < annz ) { + printf("Not enough memory to perform factorization.\n"); + return (dmemory_usage(nzlmax, nzumax, nzlumax, n) + n); + } +#if ( PRNTlevel >= 1) + printf("dLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", + nzlmax, nzumax); + fflush(stdout); +#endif + lusup = (double *) dexpand( &nzlumax, LUSUP, 0, 0, Glu ); + ucol = (double *) dexpand( &nzumax, UCOL, 0, 0, Glu ); + lsub = (int *) dexpand( &nzlmax, LSUB, 0, 0, Glu ); + usub = (int *) dexpand( &nzumax, USUB, 0, 1, Glu ); + } + + } else { + /* fact == SamePattern_SameRowPerm */ + Lstore = L->Store; + Ustore = U->Store; + xsup = Lstore->sup_to_col; + supno = Lstore->col_to_sup; + xlsub = Lstore->rowind_colptr; + xlusup = Lstore->nzval_colptr; + xusub = Ustore->colptr; + nzlmax = Glu->nzlmax; /* max from previous factorization */ + nzumax = Glu->nzumax; + nzlumax = Glu->nzlumax; + + if ( lwork == -1 ) { + return ( GluIntArray(n) * iword + TempSpace(m, panel_size) + + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); + } else if ( lwork == 0 ) { + Glu->MemModel = SYSTEM; + } else { + Glu->MemModel = USER; + Glu->stack.top2 = (lwork/4)*4; /* must be word-addressable */ + Glu->stack.size = Glu->stack.top2; + } + + lsub = Glu->expanders[LSUB].mem = Lstore->rowind; + lusup = Glu->expanders[LUSUP].mem = Lstore->nzval; + usub = Glu->expanders[USUB].mem = Ustore->rowind; + ucol = Glu->expanders[UCOL].mem = Ustore->nzval;; + Glu->expanders[LSUB].size = nzlmax; + Glu->expanders[LUSUP].size = nzlumax; + Glu->expanders[USUB].size = nzumax; + Glu->expanders[UCOL].size = nzumax; + } + + Glu->xsup = xsup; + Glu->supno = supno; + Glu->lsub = lsub; + Glu->xlsub = xlsub; + Glu->lusup = lusup; + Glu->xlusup = xlusup; + Glu->ucol = ucol; + Glu->usub = usub; + Glu->xusub = xusub; + Glu->nzlmax = nzlmax; + Glu->nzumax = nzumax; + Glu->nzlumax = nzlumax; + + info = dLUWorkInit(m, n, panel_size, iwork, dwork, Glu); + if ( info ) + return ( info + dmemory_usage(nzlmax, nzumax, nzlumax, n) + n); + + ++Glu->num_expansions; + return 0; + +} /* dLUMemInit */ + +/*! \brief Allocate known working storage. Returns 0 if success, otherwise + returns the number of bytes allocated so far when failure occurred. */ +int +dLUWorkInit(int m, int n, int panel_size, int **iworkptr, + double **dworkptr, GlobalLU_t *Glu) +{ + int isize, dsize, extra; + double *old_ptr; + int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), + rowblk = sp_ienv(4); + + isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int); + dsize = (m * panel_size + + NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(double); + + if ( Glu->MemModel == SYSTEM ) + *iworkptr = (int *) intCalloc(isize/sizeof(int)); + else + *iworkptr = (int *) duser_malloc(isize, TAIL, Glu); + if ( ! *iworkptr ) { + fprintf(stderr, "dLUWorkInit: malloc fails for local iworkptr[]\n"); + return (isize + n); + } + + if ( Glu->MemModel == SYSTEM ) + *dworkptr = (double *) SUPERLU_MALLOC(dsize); + else { + *dworkptr = (double *) duser_malloc(dsize, TAIL, Glu); + if ( NotDoubleAlign(*dworkptr) ) { + old_ptr = *dworkptr; + *dworkptr = (double*) DoubleAlign(*dworkptr); + *dworkptr = (double*) ((double*)*dworkptr - 1); + extra = (char*)old_ptr - (char*)*dworkptr; +#ifdef DEBUG + printf("dLUWorkInit: not aligned, extra %d\n", extra); +#endif + Glu->stack.top2 -= extra; + Glu->stack.used += extra; + } + } + if ( ! *dworkptr ) { + fprintf(stderr, "malloc fails for local dworkptr[]."); + return (isize + dsize + n); + } + + return 0; +} + + +/*! \brief Set up pointers for real working arrays. + */ +void +dSetRWork(int m, int panel_size, double *dworkptr, + double **dense, double **tempv) +{ + double zero = 0.0; + + int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), + rowblk = sp_ienv(4); + *dense = dworkptr; + *tempv = *dense + panel_size*m; + dfill (*dense, m * panel_size, zero); + dfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); +} + +/*! \brief Free the working storage used by factor routines. + */ +void dLUWorkFree(int *iwork, double *dwork, GlobalLU_t *Glu) +{ + if ( Glu->MemModel == SYSTEM ) { + SUPERLU_FREE (iwork); + SUPERLU_FREE (dwork); + } else { + Glu->stack.used -= (Glu->stack.size - Glu->stack.top2); + Glu->stack.top2 = Glu->stack.size; +/* dStackCompress(Glu); */ + } + + SUPERLU_FREE (Glu->expanders); + Glu->expanders = NULL; +} + +/*! \brief Expand the data structures for L and U during the factorization. + * + *
+ * Return value:   0 - successful return
+ *               > 0 - number of bytes allocated when run out of space
+ * 
+ */ +int +dLUMemXpand(int jcol, + int next, /* number of elements currently in the factors */ + MemType mem_type, /* which type of memory to expand */ + int *maxlen, /* modified - maximum length of a data structure */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + void *new_mem; + +#ifdef DEBUG + printf("dLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n", + jcol, next, *maxlen, mem_type); +#endif + + if (mem_type == USUB) + new_mem = dexpand(maxlen, mem_type, next, 1, Glu); + else + new_mem = dexpand(maxlen, mem_type, next, 0, Glu); + + if ( !new_mem ) { + int nzlmax = Glu->nzlmax; + int nzumax = Glu->nzumax; + int nzlumax = Glu->nzlumax; + fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); + return (dmemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n); + } + + switch ( mem_type ) { + case LUSUP: + Glu->lusup = (double *) new_mem; + Glu->nzlumax = *maxlen; + break; + case UCOL: + Glu->ucol = (double *) new_mem; + Glu->nzumax = *maxlen; + break; + case LSUB: + Glu->lsub = (int *) new_mem; + Glu->nzlmax = *maxlen; + break; + case USUB: + Glu->usub = (int *) new_mem; + Glu->nzumax = *maxlen; + break; + } + + return 0; + +} + + + +void +copy_mem_double(int howmany, void *old, void *new) +{ + register int i; + double *dold = old; + double *dnew = new; + for (i = 0; i < howmany; i++) dnew[i] = dold[i]; +} + +/*! \brief Expand the existing storage to accommodate more fill-ins. + */ +void +*dexpand ( + int *prev_len, /* length used from previous call */ + MemType type, /* which part of the memory to expand */ + int len_to_copy, /* size of the memory to be copied to new store */ + int keep_prev, /* = 1: use prev_len; + = 0: compute new_len to expand */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + float EXPAND = 1.5; + float alpha; + void *new_mem, *old_mem; + int new_len, tries, lword, extra, bytes_to_copy; + ExpHeader *expanders = Glu->expanders; /* Array of 4 types of memory */ + + alpha = EXPAND; + + if ( Glu->num_expansions == 0 || keep_prev ) { + /* First time allocate requested */ + new_len = *prev_len; + } else { + new_len = alpha * *prev_len; + } + + if ( type == LSUB || type == USUB ) lword = sizeof(int); + else lword = sizeof(double); + + if ( Glu->MemModel == SYSTEM ) { + new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); + if ( Glu->num_expansions != 0 ) { + tries = 0; + if ( keep_prev ) { + if ( !new_mem ) return (NULL); + } else { + while ( !new_mem ) { + if ( ++tries > 10 ) return (NULL); + alpha = Reduce(alpha); + new_len = alpha * *prev_len; + new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); + } + } + if ( type == LSUB || type == USUB ) { + copy_mem_int(len_to_copy, expanders[type].mem, new_mem); + } else { + copy_mem_double(len_to_copy, expanders[type].mem, new_mem); + } + SUPERLU_FREE (expanders[type].mem); + } + expanders[type].mem = (void *) new_mem; + + } else { /* MemModel == USER */ + if ( Glu->num_expansions == 0 ) { + new_mem = duser_malloc(new_len * lword, HEAD, Glu); + if ( NotDoubleAlign(new_mem) && + (type == LUSUP || type == UCOL) ) { + old_mem = new_mem; + new_mem = (void *)DoubleAlign(new_mem); + extra = (char*)new_mem - (char*)old_mem; +#ifdef DEBUG + printf("expand(): not aligned, extra %d\n", extra); +#endif + Glu->stack.top1 += extra; + Glu->stack.used += extra; + } + expanders[type].mem = (void *) new_mem; + } else { + tries = 0; + extra = (new_len - *prev_len) * lword; + if ( keep_prev ) { + if ( StackFull(extra) ) return (NULL); + } else { + while ( StackFull(extra) ) { + if ( ++tries > 10 ) return (NULL); + alpha = Reduce(alpha); + new_len = alpha * *prev_len; + extra = (new_len - *prev_len) * lword; + } + } + + if ( type != USUB ) { + new_mem = (void*)((char*)expanders[type + 1].mem + extra); + bytes_to_copy = (char*)Glu->stack.array + Glu->stack.top1 + - (char*)expanders[type + 1].mem; + user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); + + if ( type < USUB ) { + Glu->usub = expanders[USUB].mem = + (void*)((char*)expanders[USUB].mem + extra); + } + if ( type < LSUB ) { + Glu->lsub = expanders[LSUB].mem = + (void*)((char*)expanders[LSUB].mem + extra); + } + if ( type < UCOL ) { + Glu->ucol = expanders[UCOL].mem = + (void*)((char*)expanders[UCOL].mem + extra); + } + Glu->stack.top1 += extra; + Glu->stack.used += extra; + if ( type == UCOL ) { + Glu->stack.top1 += extra; /* Add same amount for USUB */ + Glu->stack.used += extra; + } + + } /* if ... */ + + } /* else ... */ + } + + expanders[type].size = new_len; + *prev_len = new_len; + if ( Glu->num_expansions ) ++Glu->num_expansions; + + return (void *) expanders[type].mem; + +} /* dexpand */ + + +/*! \brief Compress the work[] array to remove fragmentation. + */ +void +dStackCompress(GlobalLU_t *Glu) +{ + register int iword, dword, ndim; + char *last, *fragment; + int *ifrom, *ito; + double *dfrom, *dto; + int *xlsub, *lsub, *xusub, *usub, *xlusup; + double *ucol, *lusup; + + iword = sizeof(int); + dword = sizeof(double); + ndim = Glu->n; + + xlsub = Glu->xlsub; + lsub = Glu->lsub; + xusub = Glu->xusub; + usub = Glu->usub; + xlusup = Glu->xlusup; + ucol = Glu->ucol; + lusup = Glu->lusup; + + dfrom = ucol; + dto = (double *)((char*)lusup + xlusup[ndim] * dword); + copy_mem_double(xusub[ndim], dfrom, dto); + ucol = dto; + + ifrom = lsub; + ito = (int *) ((char*)ucol + xusub[ndim] * iword); + copy_mem_int(xlsub[ndim], ifrom, ito); + lsub = ito; + + ifrom = usub; + ito = (int *) ((char*)lsub + xlsub[ndim] * iword); + copy_mem_int(xusub[ndim], ifrom, ito); + usub = ito; + + last = (char*)usub + xusub[ndim] * iword; + fragment = (char*) (((char*)Glu->stack.array + Glu->stack.top1) - last); + Glu->stack.used -= (long int) fragment; + Glu->stack.top1 -= (long int) fragment; + + Glu->ucol = ucol; + Glu->lsub = lsub; + Glu->usub = usub; + +#ifdef DEBUG + printf("dStackCompress: fragment %d\n", fragment); + /* for (last = 0; last < ndim; ++last) + print_lu_col("After compress:", last, 0);*/ +#endif + +} + +/*! \brief Allocate storage for original matrix A + */ +void +dallocateA(int n, int nnz, double **a, int **asub, int **xa) +{ + *a = (double *) doubleMalloc(nnz); + *asub = (int *) intMalloc(nnz); + *xa = (int *) intMalloc(n+1); +} + + +double *doubleMalloc(int n) +{ + double *buf; + buf = (double *) SUPERLU_MALLOC((size_t)n * sizeof(double)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC failed for buf in doubleMalloc()\n"); + } + return (buf); +} + +double *doubleCalloc(int n) +{ + double *buf; + register int i; + double zero = 0.0; + buf = (double *) SUPERLU_MALLOC((size_t)n * sizeof(double)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC failed for buf in doubleCalloc()\n"); + } + for (i = 0; i < n; ++i) buf[i] = zero; + return (buf); +} + + +int dmemory_usage(const int nzlmax, const int nzumax, + const int nzlumax, const int n) +{ + register int iword, dword; + + iword = sizeof(int); + dword = sizeof(double); + + return (10 * n * iword + + nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword); + +} diff --git a/src/maths/SuperLU/dmyblas2.c b/src/maths/SuperLU/dmyblas2.c new file mode 100644 index 000000000..2a58f5bc3 --- /dev/null +++ b/src/maths/SuperLU/dmyblas2.c @@ -0,0 +1,230 @@ + +/*! @file dmyblas2.c + * \brief Level 2 Blas operations + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ * Purpose: + * Level 2 BLAS operations: solves and matvec, written in C. + * Note: + * This is only used when the system lacks an efficient BLAS library. + * + */ +/* + * File name: dmyblas2.c + */ + +/*! \brief Solves a dense UNIT lower triangular system + * + * The unit lower + * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). + * The solution will be returned in the rhs vector. + */ +void dlsolve ( int ldm, int ncol, double *M, double *rhs ) +{ + int k; + double x0, x1, x2, x3, x4, x5, x6, x7; + double *M0; + register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + + M0 = &M[0]; + + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + Mki4 = Mki3 + ldm + 1; + Mki5 = Mki4 + ldm + 1; + Mki6 = Mki5 + ldm + 1; + Mki7 = Mki6 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++; + x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++; + x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; + x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + rhs[++firstcol] = x4; + rhs[++firstcol] = x5; + rhs[++firstcol] = x6; + rhs[++firstcol] = x7; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++ + - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++ - x7 * *Mki7++; + + M0 += 8 * ldm + 8; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++; + + M0 += 4 * ldm + 4; + } + + if ( firstcol < ncol - 1 ) { /* Do 2 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + + rhs[++firstcol] = x1; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; + + } + +} + +/*! \brief Solves a dense upper triangular system + * + * The upper triangular matrix is + * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned + * in the rhs vector. + */ +void +dusolve ( ldm, ncol, M, rhs ) +int ldm; /* in */ +int ncol; /* in */ +double *M; /* in */ +double *rhs; /* modified */ +{ + double xj; + int jcol, j, irow; + + jcol = ncol - 1; + + for (j = 0; j < ncol; j++) { + + xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */ + rhs[jcol] = xj; + + for (irow = 0; irow < jcol; irow++) + rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */ + + jcol--; + + } +} + + +/*! \brief Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. + * + * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. + */ +void dmatvec ( ldm, nrow, ncol, M, vec, Mxvec ) + +int ldm; /* in -- leading dimension of M */ +int nrow; /* in */ +int ncol; /* in */ +double *M; /* in */ +double *vec; /* in */ +double *Mxvec; /* in/out */ + +{ + double vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; + double *M0; + register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + int k; + + M0 = &M[0]; + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + Mki4 = Mki3 + ldm; + Mki5 = Mki4 + ldm; + Mki6 = Mki5 + ldm; + Mki7 = Mki6 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + vi4 = vec[firstcol++]; + vi5 = vec[firstcol++]; + vi6 = vec[firstcol++]; + vi7 = vec[firstcol++]; + + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ + + vi4 * *Mki4++ + vi5 * *Mki5++ + + vi6 * *Mki6++ + vi7 * *Mki7++; + + M0 += 8 * ldm; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ ; + + M0 += 4 * ldm; + } + + while ( firstcol < ncol ) { /* Do 1 column */ + + Mki0 = M0; + vi0 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++; + + M0 += ldm; + } + +} + diff --git a/src/maths/SuperLU/dpanel_bmod.c b/src/maths/SuperLU/dpanel_bmod.c new file mode 100644 index 000000000..b4255db6d --- /dev/null +++ b/src/maths/SuperLU/dpanel_bmod.c @@ -0,0 +1,459 @@ + +/*! @file dpanel_bmod.c + * \brief Performs numeric block updates + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ +/* + +*/ + +#include +#include +#include + +/* + * Function prototypes + */ +void dlsolve(int, int, double *, double *); +void dmatvec(int, int, int, double *, double *, double *); +extern void dcheck_tempv(); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ *    Performs numeric block updates (sup-panel) in topological order.
+ *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
+ *    Special processing on the supernodal portion of L\U[*,j]
+ *
+ *    Before entering this routine, the original nonzeros in the panel 
+ *    were already copied into the spa[m,w].
+ *
+ *    Updated/Output parameters-
+ *    dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned 
+ *    collectively in the m-by-w vector dense[*]. 
+ * 
+ */ + +void +dpanel_bmod ( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + const int nseg, /* in */ + double *dense, /* out, of size n by w */ + double *tempv, /* working array */ + int *segrep, /* in */ + int *repfnz, /* in, of size n by w */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ + + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + double alpha, beta; +#endif + + register int k, ksub; + int fsupc, nsupc, nsupr, nrow; + int krep, krep_ind; + double ukj, ukj1, ukj2; + int luptr, luptr1, luptr2; + int segsze; + int block_nrow; /* no of rows in a block row */ + register int lptr; /* Points to the row subscripts of a supernode */ + int kfnz, irow, no_zeros; + register int isub, isub1, i; + register int jj; /* Index through each column in the panel */ + int *xsup, *supno; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + int *repfnz_col; /* repfnz[] for a column in the panel */ + double *dense_col; /* dense[] for a column in the panel */ + double *tempv1; /* Used in 1-D update */ + double *TriTmp, *MatvecTmp; /* used in 2-D update */ + double zero = 0.0; + double one = 1.0; + register int ldaTmp; + register int r_ind, r_hi; + static int first = 1, maxsuper, rowblk, colblk; + flops_t *ops = stat->ops; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + if ( first ) { + maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ); + rowblk = sp_ienv(4); + colblk = sp_ienv(5); + first = 0; + } + ldaTmp = maxsuper + rowblk; + + /* + * For each nonz supernode segment of U[*,j] in topological order + */ + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ + + /* krep = representative of current k-th supernode + * fsupc = first supernodal column + * nsupc = no of columns in a supernode + * nsupr = no of rows in a supernode + */ + krep = segrep[k--]; + fsupc = xsup[supno[krep]]; + nsupc = krep - fsupc + 1; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; + nrow = nsupr - nsupc; + lptr = xlsub[fsupc]; + krep_ind = lptr + nsupc - 1; + + repfnz_col = repfnz; + dense_col = dense; + + if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ + + TriTmp = tempv; + + /* Sequence through each column in panel -- triangular solves */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + luptr = xlusup[fsupc]; + + ops[TRSV] += segsze * (segsze - 1); + ops[GEMV] += 2 * nrow * segsze; + + /* Case 1: Update U-segment of size 1 -- col-col update */ + if ( segsze == 1 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { + irow = lsub[i]; + dense_col[irow] -= ukj * lusup[luptr]; + ++luptr; + } + + } else if ( segsze <= 3 ) { + ukj = dense_col[lsub[krep_ind]]; + ukj1 = dense_col[lsub[krep_ind - 1]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { + ukj -= ukj1 * lusup[luptr1]; + dense_col[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; luptr1++; + dense_col[irow] -= (ukj*lusup[luptr] + + ukj1*lusup[luptr1]); + } + } else { + ukj2 = dense_col[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + ukj1 -= ukj2 * lusup[luptr2-1]; + ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; + dense_col[lsub[krep_ind]] = ukj; + dense_col[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; luptr1++; luptr2++; + dense_col[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); + } + } + + } else { /* segsze >= 4 */ + + /* Copy U[*,j] segment from dense[*] to TriTmp[*], which + holds the result of triangular solves. */ + no_zeros = kfnz - fsupc; + isub = lptr + no_zeros; + for (i = 0; i < segsze; ++i) { + irow = lsub[isub]; + TriTmp[i] = dense_col[irow]; /* Gather */ + ++isub; + } + + /* start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, TriTmp, &incx ); +#else + dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, TriTmp, &incx ); +#endif +#else + dlsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); +#endif + + + } /* else ... */ + + } /* for jj ... end tri-solves */ + + /* Block row updates; push all the way into dense[*] block */ + for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { + + r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); + block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); + luptr = xlusup[fsupc] + nsupc + r_ind; + isub1 = lptr + nsupc + r_ind; + + repfnz_col = repfnz; + TriTmp = tempv; + dense_col = dense; + + /* Sequence through each column in panel -- matrix-vector */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + if ( segsze <= 3 ) continue; /* skip unrolled cases */ + + /* Perform a block update, and scatter the result of + matrix-vector to dense[]. */ + no_zeros = kfnz - fsupc; + luptr1 = luptr + nsupr * no_zeros; + MatvecTmp = &TriTmp[maxsuper]; + +#ifdef USE_VENDOR_BLAS + alpha = one; + beta = zero; +#ifdef _CRAY + SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], + &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); +#else + dgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], + &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); +#endif +#else + dmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], + TriTmp, MatvecTmp); +#endif + + /* Scatter MatvecTmp[*] into SPA dense[*] temporarily + * such that MatvecTmp[*] can be re-used for the + * the next blok row update. dense[] will be copied into + * global store after the whole panel has been finished. + */ + isub = isub1; + for (i = 0; i < block_nrow; i++) { + irow = lsub[isub]; + dense_col[irow] -= MatvecTmp[i]; + MatvecTmp[i] = zero; + ++isub; + } + + } /* for jj ... */ + + } /* for each block row ... */ + + /* Scatter the triangular solves into SPA dense[*] */ + repfnz_col = repfnz; + TriTmp = tempv; + dense_col = dense; + + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + if ( segsze <= 3 ) continue; /* skip unrolled cases */ + + no_zeros = kfnz - fsupc; + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense_col[irow] = TriTmp[i]; + TriTmp[i] = zero; + ++isub; + } + + } /* for jj ... */ + + } else { /* 1-D block modification */ + + + /* Sequence through each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + luptr = xlusup[fsupc]; + + ops[TRSV] += segsze * (segsze - 1); + ops[GEMV] += 2 * nrow * segsze; + + /* Case 1: Update U-segment of size 1 -- col-col update */ + if ( segsze == 1 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { + irow = lsub[i]; + dense_col[irow] -= ukj * lusup[luptr]; + ++luptr; + } + + } else if ( segsze <= 3 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + ukj1 = dense_col[lsub[krep_ind - 1]]; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { + ukj -= ukj1 * lusup[luptr1]; + dense_col[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + ++luptr; ++luptr1; + dense_col[irow] -= (ukj*lusup[luptr] + + ukj1*lusup[luptr1]); + } + } else { + ukj2 = dense_col[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + ukj1 -= ukj2 * lusup[luptr2-1]; + ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; + dense_col[lsub[krep_ind]] = ukj; + dense_col[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + ++luptr; ++luptr1; ++luptr2; + dense_col[irow] -= ( ukj*lusup[luptr] + + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); + } + } + + } else { /* segsze >= 4 */ + /* + * Perform a triangular solve and block update, + * then scatter the result of sup-col update to dense[]. + */ + no_zeros = kfnz - fsupc; + + /* Copy U[*,j] segment from dense[*] to tempv[*]: + * The result of triangular solve is in tempv[*]; + * The result of matrix vector update is in dense_col[*] + */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; ++i) { + irow = lsub[isub]; + tempv[i] = dense_col[irow]; /* Gather */ + ++isub; + } + + /* start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#else + dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#endif + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + alpha = one; + beta = zero; +#ifdef _CRAY + SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#else + dgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#endif +#else + dlsolve ( nsupr, segsze, &lusup[luptr], tempv ); + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + dmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); +#endif + + /* Scatter tempv[*] into SPA dense[*] temporarily, such + * that tempv[*] can be used for the triangular solve of + * the next column of the panel. They will be copied into + * ucol[*] after the whole panel has been finished. + */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense_col[irow] = tempv[i]; + tempv[i] = zero; + isub++; + } + + /* Scatter the update from tempv1[*] into SPA dense[*] */ + /* Start dense rectangular L */ + for (i = 0; i < nrow; i++) { + irow = lsub[isub]; + dense_col[irow] -= tempv1[i]; + tempv1[i] = zero; + ++isub; + } + + } /* else segsze>=4 ... */ + + } /* for each column in the panel... */ + + } /* else 1-D update ... */ + + } /* for each updating supernode ... */ + +} + + + diff --git a/src/maths/SuperLU/dpanel_dfs.c b/src/maths/SuperLU/dpanel_dfs.c new file mode 100644 index 000000000..6f8bf414c --- /dev/null +++ b/src/maths/SuperLU/dpanel_dfs.c @@ -0,0 +1,254 @@ + +/*! @file dpanel_dfs.c + * \brief Peforms a symbolic factorization on a panel of symbols + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ *   Performs a symbolic factorization on a panel of columns [jcol, jcol+w).
+ *
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives.
+ *
+ *   The routine returns one list of the supernodal representatives
+ *   in topological order of the dfs that generates them. This list is
+ *   a superset of the topological order of each individual column within
+ *   the panel. 
+ *   The location of the first nonzero in each supernodal segment
+ *   (supernodal entry location) is also returned. Each column has a 
+ *   separate list for this purpose.
+ *
+ *   Two marker arrays are used for dfs:
+ *     marker[i] == jj, if i was visited during dfs of current column jj;
+ *     marker1[i] >= jcol, if i was visited by earlier columns in this panel;
+ *
+ *   marker: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ * 
+ */ + +void +dpanel_dfs ( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + SuperMatrix *A, /* in - original matrix */ + int *perm_r, /* in */ + int *nseg, /* out */ + double *dense, /* out */ + int *panel_lsub, /* out */ + int *segrep, /* out */ + int *repfnz, /* out */ + int *xprune, /* out */ + int *marker, /* out */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + NCPformat *Astore; + double *a; + int *asub; + int *xa_begin, *xa_end; + int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; + int k, krow, kmark, kperm; + int xdfs, maxdfs, kpar; + int jj; /* index through each column in the panel */ + int *marker1; /* marker1[jj] >= jcol if vertex jj was visited + by a previous column within this panel. */ + int *repfnz_col; /* start of each column in the panel */ + double *dense_col; /* start of each column in the panel */ + int nextl_col; /* next available position in panel_lsub[*,jj] */ + int *xsup, *supno; + int *lsub, *xlsub; + + /* Initialize pointers */ + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + marker1 = marker + m; + repfnz_col = repfnz; + dense_col = dense; + *nseg = 0; + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + + /* For each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++) { + nextl_col = (jj - jcol) * m; + +#ifdef CHK_DFS + printf("\npanel col %d: ", jj); +#endif + + /* For each nonz in A[*,jj] do dfs */ + for (k = xa_begin[jj]; k < xa_end[jj]; k++) { + krow = asub[k]; + dense_col[krow] = a[k]; + kmark = marker[krow]; + if ( kmark == jj ) + continue; /* krow visited before, go to the next nonzero */ + + /* For each unmarked nbr krow of jj + * krow is in L: place it in structure of L[*,jj] + */ + marker[krow] = jj; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ + } + /* + * krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + else { + + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz_col[krep]; + +#ifdef CHK_DFS + printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); +#endif + if ( myfnz != EMPTY ) { /* Representative visited before */ + if ( myfnz > kperm ) repfnz_col[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz_col[krep] = kperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker[kchild]; + + if ( chmark != jj ) { /* Not reached yet */ + marker[kchild] = jj; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,j] */ + if ( chperm == EMPTY ) { + panel_lsub[nextl_col++] = kchild; + } + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + else { + + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz_col[chrep]; +#ifdef CHK_DFS + printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); +#endif + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz_col[chrep] = chperm; + } + else { + /* Cont. dfs at snode-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L) */ + parent[krep] = oldrep; + repfnz_col[krep] = chperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } /* else */ + + } /* else */ + + } /* if... */ + + } /* while xdfs < maxdfs */ + + /* krow has no more unexplored nbrs: + * Place snode-rep krep in postorder DFS, if this + * segment is seen for the first time. (Note that + * "repfnz[krep]" may change later.) + * Backtrack dfs to its parent. + */ + if ( marker1[krep] < jcol ) { + segrep[*nseg] = krep; + ++(*nseg); + marker1[krep] = jj; + } + + kpar = parent[krep]; /* Pop stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xprune[krep]; + +#ifdef CHK_DFS + printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } while ( kpar != EMPTY ); /* do-while - until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonz in A[*,jj] */ + + repfnz_col += m; /* Move to next column */ + dense_col += m; + + } /* for jj ... */ + +} diff --git a/src/maths/SuperLU/dpivotL.c b/src/maths/SuperLU/dpivotL.c new file mode 100644 index 000000000..4844bbe86 --- /dev/null +++ b/src/maths/SuperLU/dpivotL.c @@ -0,0 +1,184 @@ + +/*! @file dpivotL.c + * \brief Performs numerical pivoting + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include +#include +#include + +#undef DEBUG + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Performs the numerical pivoting on the current column of L,
+ *   and the CDIV operation.
+ *
+ *   Pivot policy:
+ *   (1) Compute thresh = u * max_(i>=j) abs(A_ij);
+ *   (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
+ *           pivot row = k;
+ *       ELSE IF abs(A_jj) >= thresh THEN
+ *           pivot row = j;
+ *       ELSE
+ *           pivot row = m;
+ * 
+ *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
+ *
+ *   Return value: 0      success;
+ *                 i > 0  U(i,i) is exactly zero.
+ * 
+ */ + +int +dpivotL( + const int jcol, /* in */ + const double u, /* in - diagonal pivoting threshold */ + int *usepr, /* re-use the pivot sequence given by perm_r/iperm_r */ + int *perm_r, /* may be modified */ + int *iperm_r, /* in - inverse of perm_r */ + int *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */ + int *pivrow, /* out */ + GlobalLU_t *Glu, /* modified - global LU data structures */ + SuperLUStat_t *stat /* output */ + ) +{ + + int fsupc; /* first column in the supernode */ + int nsupc; /* no of columns in the supernode */ + int nsupr; /* no of rows in the supernode */ + int lptr; /* points to the starting subscript of the supernode */ + int pivptr, old_pivptr, diag, diagind; + double pivmax, rtemp, thresh; + double temp; + double *lu_sup_ptr; + double *lu_col_ptr; + int *lsub_ptr; + int isub, icol, k, itemp; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + flops_t *ops = stat->ops; + + /* Initialize pointers */ + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; + nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ + lptr = xlsub[fsupc]; + nsupr = xlsub[fsupc+1] - lptr; + lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ + lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ + lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ + +#ifdef DEBUG +if ( jcol == MIN_COL ) { + printf("Before cdiv: col %d\n", jcol); + for (k = nsupc; k < nsupr; k++) + printf(" lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]); +} +#endif + + /* Determine the largest abs numerical value for partial pivoting; + Also search for user-specified pivot, and diagonal element. */ + if ( *usepr ) *pivrow = iperm_r[jcol]; + diagind = iperm_c[jcol]; + pivmax = 0.0; + pivptr = nsupc; + diag = EMPTY; + old_pivptr = nsupc; + for (isub = nsupc; isub < nsupr; ++isub) { + rtemp = fabs (lu_col_ptr[isub]); + if ( rtemp > pivmax ) { + pivmax = rtemp; + pivptr = isub; + } + if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub; + if ( lsub_ptr[isub] == diagind ) diag = isub; + } + + /* Test for singularity */ + if ( pivmax == 0.0 ) { +#if 1 + *pivrow = lsub_ptr[pivptr]; + perm_r[*pivrow] = jcol; +#else + perm_r[diagind] = jcol; +#endif + *usepr = 0; + return (jcol+1); + } + + thresh = u * pivmax; + + /* Choose appropriate pivotal element by our policy. */ + if ( *usepr ) { + rtemp = fabs (lu_col_ptr[old_pivptr]); + if ( rtemp != 0.0 && rtemp >= thresh ) + pivptr = old_pivptr; + else + *usepr = 0; + } + if ( *usepr == 0 ) { + /* Use diagonal pivot? */ + if ( diag >= 0 ) { /* diagonal exists */ + rtemp = fabs (lu_col_ptr[diag]); + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; + } + *pivrow = lsub_ptr[pivptr]; + } + + /* Record pivot row */ + perm_r[*pivrow] = jcol; + + /* Interchange row subscripts */ + if ( pivptr != nsupc ) { + itemp = lsub_ptr[pivptr]; + lsub_ptr[pivptr] = lsub_ptr[nsupc]; + lsub_ptr[nsupc] = itemp; + + /* Interchange numerical values as well, for the whole snode, such + * that L is indexed the same way as A. + */ + for (icol = 0; icol <= nsupc; icol++) { + itemp = pivptr + icol * nsupr; + temp = lu_sup_ptr[itemp]; + lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; + lu_sup_ptr[nsupc + icol*nsupr] = temp; + } + } /* if */ + + /* cdiv operation */ + ops[FACT] += nsupr - nsupc; + + temp = 1.0 / lu_col_ptr[nsupc]; + for (k = nsupc+1; k < nsupr; k++) + lu_col_ptr[k] *= temp; + + return 0; +} + diff --git a/src/maths/SuperLU/dpivotgrowth.c b/src/maths/SuperLU/dpivotgrowth.c new file mode 100644 index 000000000..a75061631 --- /dev/null +++ b/src/maths/SuperLU/dpivotgrowth.c @@ -0,0 +1,113 @@ + +/*! @file dpivotgrowth.c + * \brief Computes the reciprocal pivot growth factor + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ */ +#include +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * Compute the reciprocal pivot growth factor of the leading ncols columns
+ * of the matrix, using the formula:
+ *     min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) )
+ *
+ * Arguments
+ * =========
+ *
+ * ncols    (input) int
+ *          The number of columns of matrices A, L and U.
+ *
+ * A        (input) SuperMatrix*
+ *	    Original matrix A, permuted by columns, of dimension
+ *          (A->nrow, A->ncol). The type of A can be:
+ *          Stype = NC; Dtype = SLU_D; Mtype = GE.
+ *
+ * L        (output) SuperMatrix*
+ *          The factor L from the factorization Pr*A=L*U; use compressed row 
+ *          subscripts storage for supernodes, i.e., L has type: 
+ *          Stype = SC; Dtype = SLU_D; Mtype = TRLU.
+ *
+ * U        (output) SuperMatrix*
+ *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
+ *          storage scheme, i.e., U has types: Stype = NC;
+ *          Dtype = SLU_D; Mtype = TRU.
+ * 
+ */ + +double +dPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, + SuperMatrix *L, SuperMatrix *U) +{ + + NCformat *Astore; + SCformat *Lstore; + NCformat *Ustore; + double *Aval, *Lval, *Uval; + int fsupc, nsupr, luptr, nz_in_U; + int i, j, k, oldcol; + int *inv_perm_c; + double rpg, maxaj, maxuj; + double smlnum; + double *luval; + + /* Get machine constants. */ + smlnum = dlamch_("S"); + rpg = 1. / smlnum; + + Astore = A->Store; + Lstore = L->Store; + Ustore = U->Store; + Aval = Astore->nzval; + Lval = Lstore->nzval; + Uval = Ustore->nzval; + + inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int)); + for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j; + + for (k = 0; k <= Lstore->nsuper; ++k) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + luptr = L_NZ_START(fsupc); + luval = &Lval[luptr]; + nz_in_U = 1; + + for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) { + maxaj = 0.; + oldcol = inv_perm_c[j]; + for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i) + maxaj = SUPERLU_MAX( maxaj, fabs(Aval[i]) ); + + maxuj = 0.; + for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++) + maxuj = SUPERLU_MAX( maxuj, fabs(Uval[i]) ); + + /* Supernode */ + for (i = 0; i < nz_in_U; ++i) + maxuj = SUPERLU_MAX( maxuj, fabs(luval[i]) ); + + ++nz_in_U; + luval += nsupr; + + if ( maxuj == 0. ) + rpg = SUPERLU_MIN( rpg, 1.); + else + rpg = SUPERLU_MIN( rpg, maxaj / maxuj ); + } + + if ( j >= ncols ) break; + } + + SUPERLU_FREE(inv_perm_c); + return (rpg); +} diff --git a/src/maths/SuperLU/dpruneL.c b/src/maths/SuperLU/dpruneL.c new file mode 100644 index 000000000..024dc67db --- /dev/null +++ b/src/maths/SuperLU/dpruneL.c @@ -0,0 +1,154 @@ + +/*! @file dpruneL.c + * \brief Prunes the L-structure + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ *
+ */ + + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Prunes the L-structure of supernodes whose L-structure
+ *   contains the current pivot row "pivrow"
+ * 
+ */ + +void +dpruneL( + const int jcol, /* in */ + const int *perm_r, /* in */ + const int pivrow, /* in */ + const int nseg, /* in */ + const int *segrep, /* in */ + const int *repfnz, /* in */ + int *xprune, /* out */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + + double utemp; + int jsupno, irep, irep1, kmin, kmax, krow, movnum; + int i, ktemp, minloc, maxloc; + int do_prune; /* logical variable */ + int *xsup, *supno; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + /* + * For each supernode-rep irep in U[*,j] + */ + jsupno = supno[jcol]; + for (i = 0; i < nseg; i++) { + + irep = segrep[i]; + irep1 = irep + 1; + do_prune = FALSE; + + /* Don't prune with a zero U-segment */ + if ( repfnz[irep] == EMPTY ) + continue; + + /* If a snode overlaps with the next panel, then the U-segment + * is fragmented into two parts -- irep and irep1. We should let + * pruning occur at the rep-column in irep1's snode. + */ + if ( supno[irep] == supno[irep1] ) /* Don't prune */ + continue; + + /* + * If it has not been pruned & it has a nonz in row L[pivrow,i] + */ + if ( supno[irep] != jsupno ) { + if ( xprune[irep] >= xlsub[irep1] ) { + kmin = xlsub[irep]; + kmax = xlsub[irep1] - 1; + for (krow = kmin; krow <= kmax; krow++) + if ( lsub[krow] == pivrow ) { + do_prune = TRUE; + break; + } + } + + if ( do_prune ) { + + /* Do a quicksort-type partition + * movnum=TRUE means that the num values have to be exchanged. + */ + movnum = FALSE; + if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */ + movnum = TRUE; + + while ( kmin <= kmax ) { + + if ( perm_r[lsub[kmax]] == EMPTY ) + kmax--; + else if ( perm_r[lsub[kmin]] != EMPTY ) + kmin++; + else { /* kmin below pivrow (not yet pivoted), and kmax + * above pivrow: interchange the two subscripts + */ + ktemp = lsub[kmin]; + lsub[kmin] = lsub[kmax]; + lsub[kmax] = ktemp; + + /* If the supernode has only one column, then we + * only keep one set of subscripts. For any subscript + * interchange performed, similar interchange must be + * done on the numerical values. + */ + if ( movnum ) { + minloc = xlusup[irep] + (kmin - xlsub[irep]); + maxloc = xlusup[irep] + (kmax - xlsub[irep]); + utemp = lusup[minloc]; + lusup[minloc] = lusup[maxloc]; + lusup[maxloc] = utemp; + } + + kmin++; + kmax--; + + } + + } /* while */ + + xprune[irep] = kmin; /* Pruning */ + +#ifdef CHK_PRUNE + printf(" After dpruneL(),using col %d: xprune[%d] = %d\n", + jcol, irep, kmin); +#endif + } /* if do_prune */ + + } /* if */ + + } /* for each U-segment... */ +} diff --git a/src/maths/SuperLU/dreadhb.c b/src/maths/SuperLU/dreadhb.c new file mode 100644 index 000000000..852e0b7af --- /dev/null +++ b/src/maths/SuperLU/dreadhb.c @@ -0,0 +1,257 @@ + +/*! @file dreadhb.c + * \brief Read a matrix stored in Harwell-Boeing format + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Purpose
+ * =======
+ * 
+ * Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format 
+ * as described below.
+ * 
+ * Line 1 (A72,A8) 
+ *  	Col. 1 - 72   Title (TITLE) 
+ *	Col. 73 - 80  Key (KEY) 
+ * 
+ * Line 2 (5I14) 
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
+ *                    (including starting guesses and solution vectors 
+ *		       if present) 
+ *           	      (zero indicates no right-hand side data is present) 
+ *
+ * Line 3 (A3, 11X, 4I14) 
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
+ *	              (equal to number of entries for assembled matrices) 
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
+ *	              (zero in the case of assembled matrices) 
+ * Line 4 (2A16, 2A20) 
+ * 	Col. 1 - 16   Format for pointers (PTRFMT) 
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
+ *    	Col. 1 	      Right-hand side type: 
+ *	         	  F for full storage or M for same format as matrix 
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
+ *    	Col. 3        X if an exact solution vector(s) is supplied. 
+ *	Col. 15 - 28  Number of right-hand sides (NRHS) 
+ *	Col. 29 - 42  Number of row indices (NRHSIX) 
+ *          	      (ignored in case of unassembled matrices) 
+ *
+ * The three character type field on line 3 describes the matrix type. 
+ * The following table lists the permitted values for each of the three 
+ * characters. As an example of the type field, RSA denotes that the matrix 
+ * is real, symmetric, and assembled. 
+ *
+ * First Character: 
+ *	R Real matrix 
+ *	C Complex matrix 
+ *	P Pattern only (no numerical values supplied) 
+ *
+ * Second Character: 
+ *	S Symmetric 
+ *	U Unsymmetric 
+ *	H Hermitian 
+ *	Z Skew symmetric 
+ *	R Rectangular 
+ *
+ * Third Character: 
+ *	A Assembled 
+ *	E Elemental matrices (unassembled) 
+ *
+ * 
+ */ +#include +#include +#include + + +/*! \brief Eat up the rest of the current line */ +int dDumpLine(FILE *fp) +{ + register int c; + while ((c = fgetc(fp)) != '\n') ; + return 0; +} + +int dParseIntFormat(char *buf, int *num, int *size) +{ + char *tmp; + + tmp = buf; + while (*tmp++ != '(') ; + sscanf(tmp, "%d", num); + while (*tmp != 'I' && *tmp != 'i') ++tmp; + ++tmp; + sscanf(tmp, "%d", size); + return 0; +} + +int dParseFloatFormat(char *buf, int *num, int *size) +{ + char *tmp, *period; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' + && *tmp != 'F' && *tmp != 'f') { + /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the + num picked up refers to P, which should be skipped. */ + if (*tmp=='p' || *tmp=='P') { + ++tmp; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + } else { + ++tmp; + } + } + ++tmp; + period = tmp; + while (*period != '.' && *period != ')') ++period ; + *period = '\0'; + *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ + + return 0; +} + +static int ReadVector(FILE *fp, int n, int *where, int perline, int persize) +{ + register int i, j, item; + char tmp, buf[100]; + + i = 0; + while (i < n) { + fgets(buf, 100, fp); /* read a line at a time */ + for (j=0; j + * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + * + * + * Purpose + * ======= + * + * Read a DOUBLE PRECISION matrix stored in Rutherford-Boeing format + * as described below. + * + * Line 1 (A72, A8) + * Col. 1 - 72 Title (TITLE) + * Col. 73 - 80 Matrix name / identifier (MTRXID) + * + * Line 2 (I14, 3(1X, I13)) + * Col. 1 - 14 Total number of lines excluding header (TOTCRD) + * Col. 16 - 28 Number of lines for pointers (PTRCRD) + * Col. 30 - 42 Number of lines for row (or variable) indices (INDCRD) + * Col. 44 - 56 Number of lines for numerical values (VALCRD) + * + * Line 3 (A3, 11X, 4(1X, I13)) + * Col. 1 - 3 Matrix type (see below) (MXTYPE) + * Col. 15 - 28 Compressed Column: Number of rows (NROW) + * Elemental: Largest integer used to index variable (MVAR) + * Col. 30 - 42 Compressed Column: Number of columns (NCOL) + * Elemental: Number of element matrices (NELT) + * Col. 44 - 56 Compressed Column: Number of entries (NNZERO) + * Elemental: Number of variable indeces (NVARIX) + * Col. 58 - 70 Compressed Column: Unused, explicitly zero + * Elemental: Number of elemental matrix entries (NELTVL) + * + * Line 4 (2A16, A20) + * Col. 1 - 16 Fortran format for pointers (PTRFMT) + * Col. 17 - 32 Fortran format for row (or variable) indices (INDFMT) + * Col. 33 - 52 Fortran format for numerical values of coefficient matrix + * (VALFMT) + * (blank in the case of matrix patterns) + * + * The three character type field on line 3 describes the matrix type. + * The following table lists the permitted values for each of the three + * characters. As an example of the type field, RSA denotes that the matrix + * is real, symmetric, and assembled. + * + * First Character: + * R Real matrix + * C Complex matrix + * I integer matrix + * P Pattern only (no numerical values supplied) + * Q Pattern only (numerical values supplied in associated auxiliary value + * file) + * + * Second Character: + * S Symmetric + * U Unsymmetric + * H Hermitian + * Z Skew symmetric + * R Rectangular + * + * Third Character: + * A Compressed column form + * E Elemental form + * + * + */ + +#include + + +/*! \brief Eat up the rest of the current line */ +static int dDumpLine(FILE *fp) +{ + register int c; + while ((c = fgetc(fp)) != '\n') ; + return 0; +} + +static int dParseIntFormat(char *buf, int *num, int *size) +{ + char *tmp; + + tmp = buf; + while (*tmp++ != '(') ; + sscanf(tmp, "%d", num); + while (*tmp != 'I' && *tmp != 'i') ++tmp; + ++tmp; + sscanf(tmp, "%d", size); + return 0; +} + +static int dParseFloatFormat(char *buf, int *num, int *size) +{ + char *tmp, *period; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' + && *tmp != 'F' && *tmp != 'f') { + /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the + num picked up refers to P, which should be skipped. */ + if (*tmp=='p' || *tmp=='P') { + ++tmp; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + } else { + ++tmp; + } + } + ++tmp; + period = tmp; + while (*period != '.' && *period != ')') ++period ; + *period = '\0'; + *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ + + return 0; +} + +static int ReadVector(FILE *fp, int n, int *where, int perline, int persize) +{ + register int i, j, item; + char tmp, buf[100]; + + i = 0; + while (i < n) { + fgets(buf, 100, fp); /* read a line at a time */ + for (j=0; j + * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + * + */ + +#include + + +void +dreadtriple(int *m, int *n, int *nonz, + double **nzval, int **rowind, int **colptr) +{ +/* + * Output parameters + * ================= + * (a,asub,xa): asub[*] contains the row subscripts of nonzeros + * in columns of matrix A; a[*] the numerical values; + * row i of A is given by a[k],k=xa[i],...,xa[i+1]-1. + * + */ + int j, k, jsize, nnz, nz; + double *a, *val; + int *asub, *xa, *row, *col; + int zero_base = 0; + + /* Matrix format: + * First line: #rows, #cols, #non-zero + * Triplet in the rest of lines: + * row, col, value + */ + + scanf("%d%d", n, nonz); + *m = *n; + printf("m %d, n %d, nonz %d\n", *m, *n, *nonz); + dallocateA(*n, *nonz, nzval, rowind, colptr); /* Allocate storage */ + a = *nzval; + asub = *rowind; + xa = *colptr; + + val = (double *) SUPERLU_MALLOC(*nonz * sizeof(double)); + row = (int *) SUPERLU_MALLOC(*nonz * sizeof(int)); + col = (int *) SUPERLU_MALLOC(*nonz * sizeof(int)); + + for (j = 0; j < *n; ++j) xa[j] = 0; + + /* Read into the triplet array from a file */ + for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { + scanf("%d%d%lf\n", &row[nz], &col[nz], &val[nz]); + + if ( nnz == 0 ) { /* first nonzero */ + if ( row[0] == 0 || col[0] == 0 ) { + zero_base = 1; + printf("triplet file: row/col indices are zero-based.\n"); + } else + printf("triplet file: row/col indices are one-based.\n"); + } + + if ( !zero_base ) { + /* Change to 0-based indexing. */ + --row[nz]; + --col[nz]; + } + + if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n + /*|| val[nz] == 0.*/) { + fprintf(stderr, "nz %d, (%d, %d) = %e out of bound, removed\n", + nz, row[nz], col[nz], val[nz]); + exit(-1); + } else { + ++xa[col[nz]]; + ++nz; + } + } + + *nonz = nz; + + /* Initialize the array of column pointers */ + k = 0; + jsize = xa[0]; + xa[0] = 0; + for (j = 1; j < *n; ++j) { + k += jsize; + jsize = xa[j]; + xa[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (nz = 0; nz < *nonz; ++nz) { + j = col[nz]; + k = xa[j]; + asub[k] = row[nz]; + a[k] = val[nz]; + ++xa[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = *n; j > 0; --j) + xa[j] = xa[j-1]; + xa[0] = 0; + + SUPERLU_FREE(val); + SUPERLU_FREE(row); + SUPERLU_FREE(col); + +#ifdef CHK_INPUT + { + int i; + for (i = 0; i < *n; i++) { + printf("Col %d, xa %d\n", i, xa[i]); + for (k = xa[i]; k < xa[i+1]; k++) + printf("%d\t%16.10f\n", asub[k], a[k]); + } + } +#endif + +} + + +void dreadrhs(int m, double *b) +{ + FILE *fp, *fopen(); + int i; + /*int j;*/ + + if ( !(fp = fopen("b.dat", "r")) ) { + fprintf(stderr, "dreadrhs: file does not exist\n"); + exit(-1); + } + for (i = 0; i < m; ++i) + fscanf(fp, "%lf\n", &b[i]); + + /* readpair_(j, &b[i]);*/ + fclose(fp); +} diff --git a/src/maths/SuperLU/dsnode_bmod.c b/src/maths/SuperLU/dsnode_bmod.c new file mode 100644 index 000000000..bb66c9f9e --- /dev/null +++ b/src/maths/SuperLU/dsnode_bmod.c @@ -0,0 +1,118 @@ + +/*! @file dsnode_bmod.c + * \brief Performs numeric block updates within the relaxed snode. + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include + + +/*! \brief Performs numeric block updates within the relaxed snode. + */ +int +dsnode_bmod ( + const int jcol, /* in */ + const int jsupno, /* in */ + const int fsupc, /* in */ + double *dense, /* in */ + double *tempv, /* working array */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + double alpha = -1.0, beta = 1.0; +#endif + + int luptr, nsupc, nsupr, nrow; + int isub, irow, i, iptr; + register int ufirst, nextlu; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + flops_t *ops = stat->ops; + + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + nextlu = xlusup[jcol]; + + /* + * Process the supernodal portion of L\U[*,j] + */ + for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { + irow = lsub[isub]; + lusup[nextlu] = dense[irow]; + dense[irow] = 0; + ++nextlu; + } + + xlusup[jcol + 1] = nextlu; /* Initialize xlusup for next column */ + + if ( fsupc < jcol ) { + + luptr = xlusup[fsupc]; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; + nsupc = jcol - fsupc; /* Excluding jcol */ + ufirst = xlusup[jcol]; /* Points to the beginning of column + jcol in supernode L\U(jsupno). */ + nrow = nsupr - nsupc; + + ops[TRSV] += nsupc * (nsupc - 1); + ops[GEMV] += 2 * nrow * nsupc; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, + &lusup[ufirst], &incx ); + SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#else + dtrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, + &lusup[ufirst], &incx ); + dgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#endif +#else + dlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); + dmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], + &lusup[ufirst], &tempv[0] ); + + /* Scatter tempv[*] into lusup[*] */ + iptr = ufirst + nsupc; + for (i = 0; i < nrow; i++) { + lusup[iptr++] -= tempv[i]; + tempv[i] = 0.0; + } +#endif + + } + + return 0; +} diff --git a/src/maths/SuperLU/dsnode_dfs.c b/src/maths/SuperLU/dsnode_dfs.c new file mode 100644 index 000000000..84b2cc7ae --- /dev/null +++ b/src/maths/SuperLU/dsnode_dfs.c @@ -0,0 +1,112 @@ + +/*! @file dsnode_dfs.c + * \brief Determines the union of row structures of columns within the relaxed node + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *    dsnode_dfs() - Determine the union of the row structures of those 
+ *    columns within the relaxed snode.
+ *    Note: The relaxed snodes are leaves of the supernodal etree, therefore, 
+ *    the portion outside the rectangular supernode must be zero.
+ *
+ * Return value
+ * ============
+ *     0   success;
+ *    >0   number of bytes allocated when run out of memory.
+ * 
+ */ + +int +dsnode_dfs ( + const int jcol, /* in - start of the supernode */ + const int kcol, /* in - end of the supernode */ + const int *asub, /* in */ + const int *xa_begin, /* in */ + const int *xa_end, /* in */ + int *xprune, /* out */ + int *marker, /* modified */ + GlobalLU_t *Glu /* modified */ + ) +{ + + register int i, k, ifrom, ito, nextl, new_next; + int nsuper, krow, kmark, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + int nzlmax; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + nsuper = ++supno[jcol]; /* Next available supernode number */ + nextl = xlsub[jcol]; + + for (i = jcol; i <= kcol; i++) { + /* For each nonzero in A[*,i] */ + for (k = xa_begin[i]; k < xa_end[i]; k++) { + krow = asub[k]; + kmark = marker[krow]; + if ( kmark != kcol ) { /* First time visit krow */ + marker[krow] = kcol; + lsub[nextl++] = krow; + if ( nextl >= nzlmax ) { + if ( mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + } + } + supno[i] = nsuper; + } + + /* Supernode > 1, then make a copy of the subscripts for pruning */ + if ( jcol < kcol ) { + new_next = nextl + (nextl - xlsub[jcol]); + while ( new_next > nzlmax ) { + if ( mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + ito = nextl; + for (ifrom = xlsub[jcol]; ifrom < nextl; ) + lsub[ito++] = lsub[ifrom++]; + for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; + nextl = ito; + } + + xsup[nsuper+1] = kcol + 1; + supno[kcol+1] = nsuper; + xprune[kcol] = nextl; + xlsub[kcol+1] = nextl; + + return 0; +} + diff --git a/src/maths/SuperLU/dsp_blas2.c b/src/maths/SuperLU/dsp_blas2.c new file mode 100644 index 000000000..49a625a7e --- /dev/null +++ b/src/maths/SuperLU/dsp_blas2.c @@ -0,0 +1,477 @@ + +/*! @file dsp_blas2.c + * \brief Sparse BLAS 2, using some dense BLAS 2 operations + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ * 
+ */ +/* + * File name: dsp_blas2.c + * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations. + */ + +#include + +/* + * Function prototypes + */ +void dusolve(int, int, double*, double*); +void dlsolve(int, int, double*, double*); +void dmatvec(int, int, int, double*, double*, double*); + +/*! \brief Solves one of the systems of equations A*x = b, or A'*x = b + * + *
+ *   Purpose
+ *   =======
+ *
+ *   sp_dtrsv() solves one of the systems of equations   
+ *       A*x = b,   or   A'*x = b,
+ *   where b and x are n element vectors and A is a sparse unit , or   
+ *   non-unit, upper or lower triangular matrix.   
+ *   No test for singularity or near-singularity is included in this   
+ *   routine. Such tests must be performed before calling this routine.   
+ *
+ *   Parameters   
+ *   ==========   
+ *
+ *   uplo   - (input) char*
+ *            On entry, uplo specifies whether the matrix is an upper or   
+ *             lower triangular matrix as follows:   
+ *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
+ *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
+ *
+ *   trans  - (input) char*
+ *             On entry, trans specifies the equations to be solved as   
+ *             follows:   
+ *                trans = 'N' or 'n'   A*x = b.   
+ *                trans = 'T' or 't'   A'*x = b.
+ *                trans = 'C' or 'c'   A'*x = b.   
+ *
+ *   diag   - (input) char*
+ *             On entry, diag specifies whether or not A is unit   
+ *             triangular as follows:   
+ *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
+ *                diag = 'N' or 'n'   A is not assumed to be unit   
+ *                                    triangular.   
+ *	     
+ *   L       - (input) SuperMatrix*
+ *	       The factor L from the factorization Pr*A*Pc=L*U. Use
+ *             compressed row subscripts storage for supernodes,
+ *             i.e., L has types: Stype = SC, Dtype = SLU_D, Mtype = TRLU.
+ *
+ *   U       - (input) SuperMatrix*
+ *	        The factor U from the factorization Pr*A*Pc=L*U.
+ *	        U has types: Stype = NC, Dtype = SLU_D, Mtype = TRU.
+ *    
+ *   x       - (input/output) double*
+ *             Before entry, the incremented array X must contain the n   
+ *             element right-hand side vector b. On exit, X is overwritten 
+ *             with the solution vector x.
+ *
+ *   info    - (output) int*
+ *             If *info = -i, the i-th argument had an illegal value.
+ * 
+ */ +int +sp_dtrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, + SuperMatrix *U, double *x, SuperLUStat_t *stat, int *info) +{ +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + SCformat *Lstore; + NCformat *Ustore; + double *Lval, *Uval; + int incx = 1, incy = 1; + double alpha = 1.0, beta = 1.0; + int nrow; + int fsupc, nsupr, nsupc, luptr, istart, irow; + int i, k, iptr, jcol; + double *work; + flops_t solve_ops; + + /* Test the input parameters */ + *info = 0; + if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; + else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && + !lsame_(trans, "C")) *info = -2; + else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; + else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; + else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; + if ( *info ) { + i = -(*info); + xerbla_("sp_dtrsv", &i); + return 0; + } + + Lstore = L->Store; + Lval = Lstore->nzval; + Ustore = U->Store; + Uval = Ustore->nzval; + solve_ops = 0; + + if ( !(work = doubleCalloc(L->nrow)) ) + ABORT_SuperLU("Malloc fails for work in sp_dtrsv()."); + + if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ + + if ( lsame_(uplo, "L") ) { + /* Form x := inv(L)*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + nrow = nsupr - nsupc; + + solve_ops += nsupc * (nsupc - 1); + solve_ops += 2 * nrow * nsupc; + + if ( nsupc == 1 ) { + for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { + irow = L_SUB(iptr); + ++luptr; + x[irow] -= x[fsupc] * Lval[luptr]; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); + + SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], + &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); +#else + dtrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); + + dgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], + &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); +#endif +#else + dlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); + + dmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], + &x[fsupc], &work[0] ); +#endif + + iptr = istart + nsupc; + for (i = 0; i < nrow; ++i, ++iptr) { + irow = L_SUB(iptr); + x[irow] -= work[i]; /* Scatter */ + work[i] = 0.0; + + } + } + } /* for k ... */ + + } else { + /* Form x := inv(U)*x */ + + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; k--) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += nsupc * (nsupc + 1); + + if ( nsupc == 1 ) { + x[fsupc] /= Lval[luptr]; + for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { + irow = U_SUB(i); + x[irow] -= x[fsupc] * Uval[i]; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + dtrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif +#else + dusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); +#endif + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); + i++) { + irow = U_SUB(i); + x[irow] -= x[jcol] * Uval[i]; + } + } + } + } /* for k ... */ + + } + } else { /* Form x := inv(A')*x */ + + if ( lsame_(uplo, "L") ) { + /* Form x := inv(L')*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; --k) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += 2 * (nsupr - nsupc) * nsupc; + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + iptr = istart + nsupc; + for (i = L_NZ_START(jcol) + nsupc; + i < L_NZ_START(jcol+1); i++) { + irow = L_SUB(iptr); + x[jcol] -= x[irow] * Lval[i]; + iptr++; + } + } + + if ( nsupc > 1 ) { + solve_ops += nsupc * (nsupc - 1); +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("T", strlen("T")); + ftcs3 = _cptofcd("U", strlen("U")); + STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + dtrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } + } else { + /* Form x := inv(U')*x */ + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { + irow = U_SUB(i); + x[jcol] -= x[irow] * Uval[i]; + } + } + + solve_ops += nsupc * (nsupc + 1); + + if ( nsupc == 1 ) { + x[fsupc] /= Lval[luptr]; + } else { +#ifdef _CRAY + ftcs1 = _cptofcd("U", strlen("U")); + ftcs2 = _cptofcd("T", strlen("T")); + ftcs3 = _cptofcd("N", strlen("N")); + STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + dtrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } /* for k ... */ + } + } + + stat->ops[SOLVE] += solve_ops; + SUPERLU_FREE(work); + return 0; +} + + + +/*! \brief Performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   sp_dgemv()  performs one of the matrix-vector operations   
+ *      y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
+ *   where alpha and beta are scalars, x and y are vectors and A is a
+ *   sparse A->nrow by A->ncol matrix.   
+ *
+ *   Parameters   
+ *   ==========   
+ *
+ *   TRANS  - (input) char*
+ *            On entry, TRANS specifies the operation to be performed as   
+ *            follows:   
+ *               TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
+ *               TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
+ *               TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
+ *
+ *   ALPHA  - (input) double
+ *            On entry, ALPHA specifies the scalar alpha.   
+ *
+ *   A      - (input) SuperMatrix*
+ *            Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+ *            Currently, the type of A can be:
+ *                Stype = NC or NCP; Dtype = SLU_D; Mtype = GE. 
+ *            In the future, more general A can be handled.
+ *
+ *   X      - (input) double*, array of DIMENSION at least   
+ *            ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
+ *            and at least   
+ *            ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
+ *            Before entry, the incremented array X must contain the   
+ *            vector x.   
+ *
+ *   INCX   - (input) int
+ *            On entry, INCX specifies the increment for the elements of   
+ *            X. INCX must not be zero.   
+ *
+ *   BETA   - (input) double
+ *            On entry, BETA specifies the scalar beta. When BETA is   
+ *            supplied as zero then Y need not be set on input.   
+ *
+ *   Y      - (output) double*,  array of DIMENSION at least   
+ *            ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
+ *            and at least   
+ *            ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
+ *            Before entry with BETA non-zero, the incremented array Y   
+ *            must contain the vector y. On exit, Y is overwritten by the 
+ *            updated vector y.
+ *	     
+ *   INCY   - (input) int
+ *            On entry, INCY specifies the increment for the elements of   
+ *            Y. INCY must not be zero.   
+ *
+ *   ==== Sparse Level 2 Blas routine.   
+ * 
+ */ + +int +sp_dgemv(char *trans, double alpha, SuperMatrix *A, double *x, + int incx, double beta, double *y, int incy) +{ + /* Local variables */ + NCformat *Astore; + double *Aval; + int info; + double temp; + int lenx, leny, i, j, irow; + int iy, jx, jy, kx, ky; + int notran; + + notran = lsame_(trans, "N"); + Astore = A->Store; + Aval = Astore->nzval; + + /* Test the input parameters */ + info = 0; + if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; + else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; + else if (incx == 0) info = 5; + else if (incy == 0) info = 8; + if (info != 0) { + xerbla_("sp_dgemv ", &info); + return 0; + } + + /* Quick return if possible. */ + if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.)) + return 0; + + /* Set LENX and LENY, the lengths of the vectors x and y, and set + up the start points in X and Y. */ + if (lsame_(trans, "N")) { + lenx = A->ncol; + leny = A->nrow; + } else { + lenx = A->nrow; + leny = A->ncol; + } + if (incx > 0) kx = 0; + else kx = - (lenx - 1) * incx; + if (incy > 0) ky = 0; + else ky = - (leny - 1) * incy; + + /* Start the operations. In this version the elements of A are + accessed sequentially with one pass through A. */ + /* First form y := beta*y. */ + if (beta != 1.) { + if (incy == 1) { + if (beta == 0.) + for (i = 0; i < leny; ++i) y[i] = 0.; + else + for (i = 0; i < leny; ++i) y[i] = beta * y[i]; + } else { + iy = ky; + if (beta == 0.) + for (i = 0; i < leny; ++i) { + y[iy] = 0.; + iy += incy; + } + else + for (i = 0; i < leny; ++i) { + y[iy] = beta * y[iy]; + iy += incy; + } + } + } + + if (alpha == 0.) return 0; + + if ( notran ) { + /* Form y := alpha*A*x + y. */ + jx = kx; + if (incy == 1) { + for (j = 0; j < A->ncol; ++j) { + if (x[jx] != 0.) { + temp = alpha * x[jx]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + y[irow] += temp * Aval[i]; + } + } + jx += incx; + } + } else { + ABORT_SuperLU("Not implemented."); + } + } else { + /* Form y := alpha*A'*x + y. */ + jy = ky; + if (incx == 1) { + for (j = 0; j < A->ncol; ++j) { + temp = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + temp += Aval[i] * x[irow]; + } + y[jy] += alpha * temp; + jy += incy; + } + } else { + ABORT_SuperLU("Not implemented."); + } + } + return 0; +} /* sp_dgemv */ + + + diff --git a/src/maths/SuperLU/dsp_blas3.c b/src/maths/SuperLU/dsp_blas3.c new file mode 100644 index 000000000..56a94f901 --- /dev/null +++ b/src/maths/SuperLU/dsp_blas3.c @@ -0,0 +1,127 @@ + +/*! @file dsp_blas3.c + * \brief Sparse BLAS3, using some dense BLAS3 operations + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ */ +/* + * File name: sp_blas3.c + * Purpose: Sparse BLAS3, using some dense BLAS3 operations. + */ + +#include + +/*! \brief + * + *
+ * Purpose   
+ *   =======   
+ * 
+ *   sp_d performs one of the matrix-matrix operations   
+ * 
+ *      C := alpha*op( A )*op( B ) + beta*C,   
+ * 
+ *   where  op( X ) is one of 
+ * 
+ *      op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
+ * 
+ *   alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
+ *   an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
+ *   
+ * 
+ *   Parameters   
+ *   ==========   
+ * 
+ *   TRANSA - (input) char*
+ *            On entry, TRANSA specifies the form of op( A ) to be used in 
+ *            the matrix multiplication as follows:   
+ *               TRANSA = 'N' or 'n',  op( A ) = A.   
+ *               TRANSA = 'T' or 't',  op( A ) = A'.   
+ *               TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
+ *            Unchanged on exit.   
+ * 
+ *   TRANSB - (input) char*
+ *            On entry, TRANSB specifies the form of op( B ) to be used in 
+ *            the matrix multiplication as follows:   
+ *               TRANSB = 'N' or 'n',  op( B ) = B.   
+ *               TRANSB = 'T' or 't',  op( B ) = B'.   
+ *               TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
+ *            Unchanged on exit.   
+ * 
+ *   M      - (input) int   
+ *            On entry,  M  specifies  the number of rows of the matrix 
+ *	     op( A ) and of the matrix C.  M must be at least zero. 
+ *	     Unchanged on exit.   
+ * 
+ *   N      - (input) int
+ *            On entry,  N specifies the number of columns of the matrix 
+ *	     op( B ) and the number of columns of the matrix C. N must be 
+ *	     at least zero.
+ *	     Unchanged on exit.   
+ * 
+ *   K      - (input) int
+ *            On entry, K specifies the number of columns of the matrix 
+ *	     op( A ) and the number of rows of the matrix op( B ). K must 
+ *	     be at least  zero.   
+ *           Unchanged on exit.
+ *      
+ *   ALPHA  - (input) double
+ *            On entry, ALPHA specifies the scalar alpha.   
+ * 
+ *   A      - (input) SuperMatrix*
+ *            Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+ *            Currently, the type of A can be:
+ *                Stype = NC or NCP; Dtype = SLU_D; Mtype = GE. 
+ *            In the future, more general A can be handled.
+ * 
+ *   B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is 
+ *            n when TRANSB = 'N' or 'n',  and is  k otherwise.   
+ *            Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
+ *            part of the array B must contain the matrix B, otherwise 
+ *            the leading n by k part of the array B must contain the 
+ *            matrix B.   
+ *            Unchanged on exit.   
+ * 
+ *   LDB    - (input) int
+ *            On entry, LDB specifies the first dimension of B as declared 
+ *            in the calling (sub) program. LDB must be at least max( 1, n ).  
+ *            Unchanged on exit.   
+ * 
+ *   BETA   - (input) double
+ *            On entry, BETA specifies the scalar beta. When BETA is   
+ *            supplied as zero then C need not be set on input.   
+ *  
+ *   C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).   
+ *            Before entry, the leading m by n part of the array C must 
+ *            contain the matrix C,  except when beta is zero, in which 
+ *            case C need not be set on entry.   
+ *            On exit, the array C is overwritten by the m by n matrix 
+ *	     ( alpha*op( A )*B + beta*C ).   
+ *  
+ *   LDC    - (input) int
+ *            On entry, LDC specifies the first dimension of C as declared 
+ *            in the calling (sub)program. LDC must be at least max(1,m).   
+ *            Unchanged on exit.   
+ *  
+ *   ==== Sparse Level 3 Blas routine.   
+ * 
+ */ + +int +sp_dgemm(char *transa, char *transb, int m, int n, int k, + double alpha, SuperMatrix *A, double *b, int ldb, + double beta, double *c, int ldc) +{ + int incx = 1, incy = 1; + int j; + + for (j = 0; j < n; ++j) { + sp_dgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); + } + return 0; +} diff --git a/src/maths/SuperLU/dutil.c b/src/maths/SuperLU/dutil.c new file mode 100644 index 000000000..ea850fff3 --- /dev/null +++ b/src/maths/SuperLU/dutil.c @@ -0,0 +1,471 @@ + +/*! @file dutil.c + * \brief Matrix utility functions + * + *
+ * -- SuperLU routine (version 3.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * August 1, 2008
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include +#include + +void +dCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, + double *nzval, int *rowind, int *colptr, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + NCformat *Astore; + + A->Stype = stype; + A->Dtype = dtype; + A->Mtype = mtype; + A->nrow = m; + A->ncol = n; + A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); + if ( !(A->Store) ) ABORT_SuperLU("SUPERLU_MALLOC fails for A->Store"); + Astore = A->Store; + Astore->nnz = nnz; + Astore->nzval = nzval; + Astore->rowind = rowind; + Astore->colptr = colptr; +} + +void +dCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, + double *nzval, int *colind, int *rowptr, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + NRformat *Astore; + + A->Stype = stype; + A->Dtype = dtype; + A->Mtype = mtype; + A->nrow = m; + A->ncol = n; + A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) ); + if ( !(A->Store) ) ABORT_SuperLU("SUPERLU_MALLOC fails for A->Store"); + Astore = A->Store; + Astore->nnz = nnz; + Astore->nzval = nzval; + Astore->colind = colind; + Astore->rowptr = rowptr; +} + +/*! \brief Copy matrix A into matrix B. */ +void +dCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B) +{ + NCformat *Astore, *Bstore; + int ncol, nnz, i; + + B->Stype = A->Stype; + B->Dtype = A->Dtype; + B->Mtype = A->Mtype; + B->nrow = A->nrow;; + B->ncol = ncol = A->ncol; + Astore = (NCformat *) A->Store; + Bstore = (NCformat *) B->Store; + Bstore->nnz = nnz = Astore->nnz; + for (i = 0; i < nnz; ++i) + ((double *)Bstore->nzval)[i] = ((double *)Astore->nzval)[i]; + for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i]; + for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i]; +} + + +void +dCreate_Dense_Matrix(SuperMatrix *X, int m, int n, double *x, int ldx, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + DNformat *Xstore; + + X->Stype = stype; + X->Dtype = dtype; + X->Mtype = mtype; + X->nrow = m; + X->ncol = n; + X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); + if ( !(X->Store) ) ABORT_SuperLU("SUPERLU_MALLOC fails for X->Store"); + Xstore = (DNformat *) X->Store; + Xstore->lda = ldx; + Xstore->nzval = (double *) x; +} + +void +dCopy_Dense_Matrix(int M, int N, double *X, int ldx, + double *Y, int ldy) +{ +/*! \brief Copies a two-dimensional matrix X to another matrix Y. + */ + int i, j; + + for (j = 0; j < N; ++j) + for (i = 0; i < M; ++i) + Y[i + j*ldy] = X[i + j*ldx]; +} + +void +dCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, + double *nzval, int *nzval_colptr, int *rowind, + int *rowind_colptr, int *col_to_sup, int *sup_to_col, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + SCformat *Lstore; + + L->Stype = stype; + L->Dtype = dtype; + L->Mtype = mtype; + L->nrow = m; + L->ncol = n; + L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); + if ( !(L->Store) ) ABORT_SuperLU("SUPERLU_MALLOC fails for L->Store"); + Lstore = L->Store; + Lstore->nnz = nnz; + Lstore->nsuper = col_to_sup[n]; + Lstore->nzval = nzval; + Lstore->nzval_colptr = nzval_colptr; + Lstore->rowind = rowind; + Lstore->rowind_colptr = rowind_colptr; + Lstore->col_to_sup = col_to_sup; + Lstore->sup_to_col = sup_to_col; + +} + + +/*! \brief Convert a row compressed storage into a column compressed storage. + */ +void +dCompRow_to_CompCol(int m, int n, int nnz, + double *a, int *colind, int *rowptr, + double **at, int **rowind, int **colptr) +{ + register int i, j, col, relpos; + int *marker; + + /* Allocate storage for another copy of the matrix. */ + *at = (double *) doubleMalloc(nnz); + *rowind = (int *) intMalloc(nnz); + *colptr = (int *) intMalloc(n+1); + marker = (int *) intCalloc(n); + + /* Get counts of each column of A, and set up column pointers */ + for (i = 0; i < m; ++i) + for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; + (*colptr)[0] = 0; + for (j = 0; j < n; ++j) { + (*colptr)[j+1] = (*colptr)[j] + marker[j]; + marker[j] = (*colptr)[j]; + } + + /* Transfer the matrix into the compressed column storage. */ + for (i = 0; i < m; ++i) { + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + col = colind[j]; + relpos = marker[col]; + (*rowind)[relpos] = i; + (*at)[relpos] = a[j]; + ++marker[col]; + } + } + + SUPERLU_FREE(marker); +} + + +void +dPrint_CompCol_Matrix(char *what, SuperMatrix *A) +{ + NCformat *Astore; + register int i,n; + double *dp; + + printf("\nCompCol matrix %s:\n", what); + printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); + n = A->ncol; + Astore = (NCformat *) A->Store; + dp = (double *) Astore->nzval; + printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz); + printf("nzval: "); + for (i = 0; i < Astore->colptr[n]; ++i) printf("%f ", dp[i]); + printf("\nrowind: "); + for (i = 0; i < Astore->colptr[n]; ++i) printf("%d ", Astore->rowind[i]); + printf("\ncolptr: "); + for (i = 0; i <= n; ++i) printf("%d ", Astore->colptr[i]); + printf("\n"); + fflush(stdout); +} + +void +dPrint_SuperNode_Matrix(char *what, SuperMatrix *A) +{ + SCformat *Astore; + register int i, j, k, c, d, n, nsup; + double *dp; + int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr; + + printf("\nSuperNode matrix %s:\n", what); + printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); + n = A->ncol; + Astore = (SCformat *) A->Store; + dp = (double *) Astore->nzval; + col_to_sup = Astore->col_to_sup; + sup_to_col = Astore->sup_to_col; + rowind_colptr = Astore->rowind_colptr; + rowind = Astore->rowind; + printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", + A->nrow,A->ncol,Astore->nnz,Astore->nsuper); + printf("nzval:\n"); + for (k = 0; k <= Astore->nsuper; ++k) { + c = sup_to_col[k]; + nsup = sup_to_col[k+1] - c; + for (j = c; j < c + nsup; ++j) { + d = Astore->nzval_colptr[j]; + for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) { + printf("%d\t%d\t%e\n", rowind[i], j, dp[d++]); + } + } + } +#if 0 + for (i = 0; i < Astore->nzval_colptr[n]; ++i) printf("%f ", dp[i]); +#endif + printf("\nnzval_colptr: "); + for (i = 0; i <= n; ++i) printf("%d ", Astore->nzval_colptr[i]); + printf("\nrowind: "); + for (i = 0; i < Astore->rowind_colptr[n]; ++i) + printf("%d ", Astore->rowind[i]); + printf("\nrowind_colptr: "); + for (i = 0; i <= n; ++i) printf("%d ", Astore->rowind_colptr[i]); + printf("\ncol_to_sup: "); + for (i = 0; i < n; ++i) printf("%d ", col_to_sup[i]); + printf("\nsup_to_col: "); + for (i = 0; i <= Astore->nsuper+1; ++i) + printf("%d ", sup_to_col[i]); + printf("\n"); + fflush(stdout); +} + +void +dPrint_Dense_Matrix(char *what, SuperMatrix *A) +{ + DNformat *Astore = (DNformat *) A->Store; + register int i, j, lda = Astore->lda; + double *dp; + + printf("\nDense matrix %s:\n", what); + printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); + dp = (double *) Astore->nzval; + printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,lda); + printf("\nnzval: "); + for (j = 0; j < A->ncol; ++j) { + for (i = 0; i < A->nrow; ++i) printf("%f ", dp[i + j*lda]); + printf("\n"); + } + printf("\n"); + fflush(stdout); +} + +/*! \brief Diagnostic print of column "jcol" in the U/L factor. + */ +void +dprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu) +{ + int i, k, fsupc; + int *xsup, *supno; + int *xlsub, *lsub; + double *lusup; + int *xlusup; + double *ucol; + int *usub, *xusub; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + ucol = Glu->ucol; + usub = Glu->usub; + xusub = Glu->xusub; + + printf("%s", msg); + printf("col %d: pivrow %d, supno %d, xprune %d\n", + jcol, pivrow, supno[jcol], xprune[jcol]); + + printf("\tU-col:\n"); + for (i = xusub[jcol]; i < xusub[jcol+1]; i++) + printf("\t%d%10.4f\n", usub[i], ucol[i]); + printf("\tL-col in rectangular snode:\n"); + fsupc = xsup[supno[jcol]]; /* first col of the snode */ + i = xlsub[fsupc]; + k = xlusup[jcol]; + while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) { + printf("\t%d\t%10.4f\n", lsub[i], lusup[k]); + i++; k++; + } + fflush(stdout); +} + + +/*! \brief Check whether tempv[] == 0. This should be true before and after calling any numeric routines, i.e., "panel_bmod" and "column_bmod". + */ +void dcheck_tempv(int n, double *tempv) +{ + int i; + + for (i = 0; i < n; i++) { + if (tempv[i] != 0.0) + { + fprintf(stderr,"tempv[%d] = %f\n", i,tempv[i]); + ABORT_SuperLU("dcheck_tempv"); + } + } +} + + +void +dGenXtrue(int n, int nrhs, double *x, int ldx) +{ + int i, j; + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) { + x[i + j*ldx] = 1.0;/* + (double)(i+1.)/n;*/ + } +} + +/*! \brief Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's + */ +void +dFillRHS(trans_t trans, int nrhs, double *x, int ldx, + SuperMatrix *A, SuperMatrix *B) +{ + NCformat *Astore; + double *Aval; + DNformat *Bstore; + double *rhs; + double one = 1.0; + double zero = 0.0; + int ldc; + char transc[1]; + + Astore = A->Store; + Aval = (double *) Astore->nzval; + Bstore = B->Store; + rhs = Bstore->nzval; + ldc = Bstore->lda; + + if ( trans == NOTRANS ) *(unsigned char *)transc = 'N'; + else *(unsigned char *)transc = 'T'; + + sp_dgemm(transc, "N", A->nrow, nrhs, A->ncol, one, A, + x, ldx, zero, rhs, ldc); + +} + +/*! \brief Fills a double precision array with a given value. + */ +void +dfill(double *a, int alen, double dval) +{ + register int i; + for (i = 0; i < alen; i++) a[i] = dval; +} + + + +/*! \brief Check the inf-norm of the error vector + */ +void dinf_norm_error(int nrhs, SuperMatrix *X, double *xtrue) +{ + DNformat *Xstore; + double err, xnorm; + double *Xmat, *soln_work; + int i, j; + + Xstore = X->Store; + Xmat = Xstore->nzval; + + for (j = 0; j < nrhs; j++) { + soln_work = &Xmat[j*Xstore->lda]; + err = xnorm = 0.0; + for (i = 0; i < X->nrow; i++) { + err = SUPERLU_MAX(err, fabs(soln_work[i] - xtrue[i])); + xnorm = SUPERLU_MAX(xnorm, fabs(soln_work[i])); + } + err = err / xnorm; + printf("||X - Xtrue||/||X|| = %e\n", err); + } +} + + + +/*! \brief Print performance of the code. */ +void +dPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage, + double rpg, double rcond, double *ferr, + double *berr, char *equed, SuperLUStat_t *stat) +{ + SCformat *Lstore; + NCformat *Ustore; + double *utime; + flops_t *ops; + + utime = stat->utime; + ops = stat->ops; + + if ( utime[FACT] != 0. ) + printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], + ops[FACT]*1e-6/utime[FACT]); + printf("Identify relaxed snodes = %8.2f\n", utime[RELAX]); + if ( utime[SOLVE] != 0. ) + printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE], + ops[SOLVE]*1e-6/utime[SOLVE]); + + Lstore = (SCformat *) L->Store; + Ustore = (NCformat *) U->Store; + printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz); + printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz); + printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz); + + printf("L\\U MB %.3f\ttotal MB needed %.3f\n", + mem_usage->for_lu/1e6, mem_usage->total_needed/1e6); + printf("Number of memory expansions: %d\n", stat->expansions); + + printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n"); + printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n", + utime[FACT], ops[FACT]*1e-6/utime[FACT], + utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE], + utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]); + + printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n"); + printf("NUM:\t%e\t%e\t%e\t%e\t%s\n", + rpg, rcond, ferr[0], berr[0], equed); + +} + + + + +print_double_vec(char *what, int n, double *vec) +{ + int i; + printf("%s: n %d\n", what, n); + for (i = 0; i < n; ++i) printf("%d\t%f\n", i, vec[i]); + return 0; +} + diff --git a/src/maths/SuperLU/dzsum1.c b/src/maths/SuperLU/dzsum1.c new file mode 100644 index 000000000..62f3a4a40 --- /dev/null +++ b/src/maths/SuperLU/dzsum1.c @@ -0,0 +1,94 @@ +/*! @file dzsum1.c + * \brief Takes sum of the absolute values of a complex vector and returns a double precision result + * + *
+ *     -- LAPACK auxiliary routine (version 2.0) --   
+ *     Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
+ *     Courant Institute, Argonne National Lab, and Rice University   
+ *     October 31, 1992   
+ * 
+ */ + +#include +#include + +/*! \brief + +
+    Purpose   
+    =======   
+
+    DZSUM1 takes the sum of the absolute values of a complex   
+    vector and returns a double precision result.   
+
+    Based on DZASUM from the Level 1 BLAS.   
+    The change is to use the 'genuine' absolute value.   
+
+    Contributed by Nick Higham for use with ZLACON.   
+
+    Arguments   
+    =========   
+
+    N       (input) INT   
+            The number of elements in the vector CX.   
+
+    CX      (input) COMPLEX*16 array, dimension (N)   
+            The vector whose elements will be summed.   
+
+    INCX    (input) INT   
+            The spacing between successive values of CX.  INCX > 0.   
+
+    ===================================================================== 
+
+*/ +double dzsum1_(int *n, doublecomplex *cx, int *incx) +{ + + /* Builtin functions */ + double z_abs(doublecomplex *); + + /* Local variables */ + int i, nincx; + double stemp; + + +#define CX(I) cx[(I)-1] + + stemp = 0.; + if (*n <= 0) { + return stemp; + } + if (*incx == 1) { + goto L20; + } + + /* CODE FOR INCREMENT NOT EQUAL TO 1 */ + + nincx = *n * *incx; + for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) { + + /* NEXT LINE MODIFIED. */ + + stemp += z_abs(&CX(i)); +/* L10: */ + } + + return stemp; + + /* CODE FOR INCREMENT EQUAL TO 1 */ + +L20: + for (i = 1; i <= *n; ++i) { + + /* NEXT LINE MODIFIED. */ + + stemp += z_abs(&CX(i)); +/* L30: */ + } + + return stemp; + + /* End of DZSUM1 */ + +} /* dzsum1_ */ + diff --git a/src/maths/SuperLU/get_perm_c.c b/src/maths/SuperLU/get_perm_c.c new file mode 100644 index 000000000..c0d5cbb6c --- /dev/null +++ b/src/maths/SuperLU/get_perm_c.c @@ -0,0 +1,457 @@ +/*! @file get_perm_c.c + * \brief Matrix permutation operations + * + *
+ * -- SuperLU routine (version 3.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * August 1, 2008
+ * 
+ */ +#include +#include + +extern int genmmd_(int *, int *, int *, int *, int *, int *, int *, + int *, int *, int *, int *, int *); + +void +get_colamd( + const int m, /* number of rows in matrix A. */ + const int n, /* number of columns in matrix A. */ + const int nnz,/* number of nonzeros in matrix A. */ + int *colptr, /* column pointer of size n+1 for matrix A. */ + int *rowind, /* row indices of size nz for matrix A. */ + int *perm_c /* out - the column permutation vector. */ + ) +{ + int Alen, *A, i, info, *p; + double knobs[COLAMD_KNOBS]; + int stats[COLAMD_STATS]; + + Alen = colamd_recommended(nnz, m, n); + + colamd_set_defaults(knobs); + + if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) ) + ABORT_SuperLU("Malloc fails for A[]"); + if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) ) + ABORT_SuperLU("Malloc fails for p[]"); + for (i = 0; i <= n; ++i) p[i] = colptr[i]; + for (i = 0; i < nnz; ++i) A[i] = rowind[i]; + info = colamd(m, n, Alen, A, p, knobs, stats); + if ( info == FALSE ) ABORT_SuperLU("COLAMD failed"); + + for (i = 0; i < n; ++i) perm_c[p[i]] = i; + + SUPERLU_FREE(A); + SUPERLU_FREE(p); +} +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * Form the structure of A'*A. A is an m-by-n matrix in column oriented
+ * format represented by (colptr, rowind). The output A'*A is in column
+ * oriented format (symmetrically, also row oriented), represented by
+ * (ata_colptr, ata_rowind).
+ *
+ * This routine is modified from GETATA routine by Tim Davis.
+ * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2,
+ * i.e., the sum of the square of the row counts.
+ *
+ * Questions
+ * =========
+ *     o  Do I need to withhold the *dense* rows?
+ *     o  How do I know the number of nonzeros in A'*A?
+ * 
+ */ +void +getata( + const int m, /* number of rows in matrix A. */ + const int n, /* number of columns in matrix A. */ + const int nz, /* number of nonzeros in matrix A */ + int *colptr, /* column pointer of size n+1 for matrix A. */ + int *rowind, /* row indices of size nz for matrix A. */ + int *atanz, /* out - on exit, returns the actual number of + nonzeros in matrix A'*A. */ + int **ata_colptr, /* out - size n+1 */ + int **ata_rowind /* out - size *atanz */ + ) +{ + register int i, j, k, col, num_nz, ti, trow; + int *marker, *b_colptr, *b_rowind; + int *t_colptr, *t_rowind; /* a column oriented form of T = A' */ + + if ( !(marker = (int*) SUPERLU_MALLOC((SUPERLU_MAX(m,n)+1)*sizeof(int))) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int*) SUPERLU_MALLOC((m+1) * sizeof(int))) ) + ABORT_SuperLU("SUPERLU_MALLOC t_colptr[]"); + if ( !(t_rowind = (int*) SUPERLU_MALLOC(nz * sizeof(int))) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for t_rowind[]"); + + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < m; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) + ++marker[rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < m; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose the matrix from A to T */ + for (j = 0; j < n; ++j) + for (i = colptr[j]; i < colptr[j+1]; ++i) { + col = rowind[i]; + t_rowind[marker[col]] = j; + ++marker[col]; + } + + + /* ---------------------------------------------------------------- + compute B = T * A, where column j of B is: + + Struct (B_*j) = UNION ( Struct (T_*k) ) + A_kj != 0 + + do not include the diagonal entry + + ( Partition A as: A = (A_*1, ..., A_*n) + Then B = T * A = (T * A_*1, ..., T * A_*n), where + T * A_*j = (T_*1, ..., T_*m) * A_*j. ) + ---------------------------------------------------------------- */ + + /* Zero the diagonal flag */ + for (i = 0; i < n; ++i) marker[i] = -1; + + /* First pass determines number of nonzeros in B */ + num_nz = 0; + for (j = 0; j < n; ++j) { + /* Flag the diagonal so it's not included in the B matrix */ + marker[j] = j; + + for (i = colptr[j]; i < colptr[j+1]; ++i) { + /* A_kj is nonzero, add pattern of column T_*k to B_*j */ + k = rowind[i]; + for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) { + trow = t_rowind[ti]; + if ( marker[trow] != j ) { + marker[trow] = j; + num_nz++; + } + } + } + } + *atanz = num_nz; + + /* Allocate storage for A'*A */ + if ( !(*ata_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for ata_colptr[]"); + if ( *atanz ) { + if ( !(*ata_rowind = (int*) SUPERLU_MALLOC( *atanz * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for ata_rowind[]"); + } + b_colptr = *ata_colptr; /* aliasing */ + b_rowind = *ata_rowind; + + /* Zero the diagonal flag */ + for (i = 0; i < n; ++i) marker[i] = -1; + + /* Compute each column of B, one at a time */ + num_nz = 0; + for (j = 0; j < n; ++j) { + b_colptr[j] = num_nz; + + /* Flag the diagonal so it's not included in the B matrix */ + marker[j] = j; + + for (i = colptr[j]; i < colptr[j+1]; ++i) { + /* A_kj is nonzero, add pattern of column T_*k to B_*j */ + k = rowind[i]; + for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) { + trow = t_rowind[ti]; + if ( marker[trow] != j ) { + marker[trow] = j; + b_rowind[num_nz++] = trow; + } + } + } + } + b_colptr[n] = num_nz; + + SUPERLU_FREE(marker); + SUPERLU_FREE(t_colptr); + SUPERLU_FREE(t_rowind); +} + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * Form the structure of A'+A. A is an n-by-n matrix in column oriented
+ * format represented by (colptr, rowind). The output A'+A is in column
+ * oriented format (symmetrically, also row oriented), represented by
+ * (b_colptr, b_rowind).
+ * 
+ */ +void +at_plus_a( + const int n, /* number of columns in matrix A. */ + const int nz, /* number of nonzeros in matrix A */ + int *colptr, /* column pointer of size n+1 for matrix A. */ + int *rowind, /* row indices of size nz for matrix A. */ + int *bnz, /* out - on exit, returns the actual number of + nonzeros in matrix A'*A. */ + int **b_colptr, /* out - size n+1 */ + int **b_rowind /* out - size *bnz */ + ) +{ + register int i, j, k, col, num_nz; + int *t_colptr, *t_rowind; /* a column oriented form of T = A' */ + int *marker; + + if ( !(marker = (int*) SUPERLU_MALLOC( n * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for t_colptr[]"); + if ( !(t_rowind = (int*) SUPERLU_MALLOC( nz * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails t_rowind[]"); + + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < n; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) + ++marker[rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < n; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose the matrix from A to T */ + for (j = 0; j < n; ++j) + for (i = colptr[j]; i < colptr[j+1]; ++i) { + col = rowind[i]; + t_rowind[marker[col]] = j; + ++marker[col]; + } + + + /* ---------------------------------------------------------------- + compute B = A + T, where column j of B is: + + Struct (B_*j) = Struct (A_*k) UNION Struct (T_*k) + + do not include the diagonal entry + ---------------------------------------------------------------- */ + + /* Zero the diagonal flag */ + for (i = 0; i < n; ++i) marker[i] = -1; + + /* First pass determines number of nonzeros in B */ + num_nz = 0; + for (j = 0; j < n; ++j) { + /* Flag the diagonal so it's not included in the B matrix */ + marker[j] = j; + + /* Add pattern of column A_*k to B_*j */ + for (i = colptr[j]; i < colptr[j+1]; ++i) { + k = rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + ++num_nz; + } + } + + /* Add pattern of column T_*k to B_*j */ + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + k = t_rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + ++num_nz; + } + } + } + *bnz = num_nz; + + /* Allocate storage for A+A' */ + if ( !(*b_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for b_colptr[]"); + if ( *bnz) { + if ( !(*b_rowind = (int*) SUPERLU_MALLOC( *bnz * sizeof(int)) ) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for b_rowind[]"); + } + + /* Zero the diagonal flag */ + for (i = 0; i < n; ++i) marker[i] = -1; + + /* Compute each column of B, one at a time */ + num_nz = 0; + for (j = 0; j < n; ++j) { + (*b_colptr)[j] = num_nz; + + /* Flag the diagonal so it's not included in the B matrix */ + marker[j] = j; + + /* Add pattern of column A_*k to B_*j */ + for (i = colptr[j]; i < colptr[j+1]; ++i) { + k = rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + (*b_rowind)[num_nz++] = k; + } + } + + /* Add pattern of column T_*k to B_*j */ + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + k = t_rowind[i]; + if ( marker[k] != j ) { + marker[k] = j; + (*b_rowind)[num_nz++] = k; + } + } + } + (*b_colptr)[n] = num_nz; + + SUPERLU_FREE(marker); + SUPERLU_FREE(t_colptr); + SUPERLU_FREE(t_rowind); +} + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * GET_PERM_C obtains a permutation matrix Pc, by applying the multiple
+ * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A'.
+ * or using approximate minimum degree column ordering by Davis et. al.
+ * The LU factorization of A*Pc tends to have less fill than the LU 
+ * factorization of A.
+ *
+ * Arguments
+ * =========
+ *
+ * ispec   (input) int
+ *         Specifies the type of column ordering to reduce fill:
+ *         = 1: minimum degree on the structure of A^T * A
+ *         = 2: minimum degree on the structure of A^T + A
+ *         = 3: approximate minimum degree for unsymmetric matrices
+ *         If ispec == 0, the natural ordering (i.e., Pc = I) is returned.
+ * 
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of the linear equations is A->nrow. Currently, the type of A 
+ *         can be: Stype = NC; Dtype = _D; Mtype = GE. In the future,
+ *         more general A can be handled.
+ *
+ * perm_c  (output) int*
+ *	   Column permutation vector of size A->ncol, which defines the 
+ *         permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *         in position j in A*Pc.
+ * 
+ */ +void +get_perm_c(int ispec, SuperMatrix *A, int *perm_c) +{ + NCformat *Astore = A->Store; + int m, n, bnz = 0, *b_colptr, i; + int delta, maxint, nofsub, *invp; + int *b_rowind, *dhead, *qsize, *llist, *marker; + double t, SuperLU_timer_(); + + m = A->nrow; + n = A->ncol; + + t = SuperLU_timer_(); + switch ( ispec ) { + case (NATURAL): /* Natural ordering */ + for (i = 0; i < n; ++i) perm_c[i] = i; +#if ( PRNTlevel>=1 ) + printf("Use natural column ordering.\n"); +#endif + return; + case (MMD_ATA): /* Minimum degree ordering on A'*A */ + getata(m, n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); +#if ( PRNTlevel>=1 ) + printf("Use minimum degree ordering on A'*A.\n"); +#endif + t = SuperLU_timer_() - t; + /*printf("Form A'*A time = %8.3f\n", t);*/ + break; + case (MMD_AT_PLUS_A): /* Minimum degree ordering on A'+A */ + if ( m != n ) ABORT_SuperLU("Matrix is not square"); + at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); +#if ( PRNTlevel>=1 ) + printf("Use minimum degree ordering on A'+A.\n"); +#endif + t = SuperLU_timer_() - t; + /*printf("Form A'+A time = %8.3f\n", t);*/ + break; + case (COLAMD): /* Approximate minimum degree column ordering. */ + get_colamd(m, n, Astore->nnz, Astore->colptr, Astore->rowind, perm_c); +#if ( PRNTlevel>=1 ) + printf(".. Use approximate minimum degree column ordering.\n"); +#endif + return; + default: + ABORT_SuperLU("Invalid ISPEC"); + } + + if ( bnz != 0 ) { + t = SuperLU_timer_(); + + /* Initialize and allocate storage for GENMMD. */ + delta = 0; /* DELTA is a parameter to allow the choice of nodes + whose degree <= min-degree + DELTA. */ + maxint = 2147483647; /* 2**31 - 1 */ + invp = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); + if ( !invp ) ABORT_SuperLU("SUPERLU_MALLOC fails for invp."); + dhead = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); + if ( !dhead ) ABORT_SuperLU("SUPERLU_MALLOC fails for dhead."); + qsize = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); + if ( !qsize ) ABORT_SuperLU("SUPERLU_MALLOC fails for qsize."); + llist = (int *) SUPERLU_MALLOC(n*sizeof(int)); + if ( !llist ) ABORT_SuperLU("SUPERLU_MALLOC fails for llist."); + marker = (int *) SUPERLU_MALLOC(n*sizeof(int)); + if ( !marker ) ABORT_SuperLU("SUPERLU_MALLOC fails for marker."); + + /* Transform adjacency list into 1-based indexing required by GENMMD.*/ + for (i = 0; i <= n; ++i) ++b_colptr[i]; + for (i = 0; i < bnz; ++i) ++b_rowind[i]; + + genmmd_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, + qsize, llist, marker, &maxint, &nofsub); + + /* Transform perm_c into 0-based indexing. */ + for (i = 0; i < n; ++i) --perm_c[i]; + + SUPERLU_FREE(invp); + SUPERLU_FREE(dhead); + SUPERLU_FREE(qsize); + SUPERLU_FREE(llist); + SUPERLU_FREE(marker); + SUPERLU_FREE(b_rowind); + + t = SuperLU_timer_() - t; + /* printf("call GENMMD time = %8.3f\n", t);*/ + + } else { /* Empty adjacency structure */ + for (i = 0; i < n; ++i) perm_c[i] = i; + } + + SUPERLU_FREE(b_colptr); +} diff --git a/src/maths/SuperLU/heap_relax_snode.c b/src/maths/SuperLU/heap_relax_snode.c new file mode 100644 index 000000000..b3e3bd42d --- /dev/null +++ b/src/maths/SuperLU/heap_relax_snode.c @@ -0,0 +1,124 @@ +/*! @file heap_relax_snode.c + * \brief Identify the initial relaxed supernodes + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *    relax_snode() - Identify the initial relaxed supernodes, assuming that 
+ *    the matrix has been reordered according to the postorder of the etree.
+ * 
+ */ + +void +heap_relax_snode ( + const int n, + int *et, /* column elimination tree */ + const int relax_columns, /* max no of columns allowed in a + relaxed snode */ + int *descendants, /* no of descendants of each node + in the etree */ + int *relax_end /* last column in a supernode */ + ) +{ + register int i, j, k, l, parent; + register int snode_start; /* beginning of a snode */ + int *et_save, *post, *inv_post, *iwork; + int nsuper_et = 0, nsuper_et_post = 0; + + /* The etree may not be postordered, but is heap ordered. */ + + iwork = (int*) intMalloc(3*n+2); + if ( !iwork ) ABORT_SuperLU("SUPERLU_MALLOC fails for iwork[]"); + inv_post = iwork + n+1; + et_save = inv_post + n+1; + + /* Post order etree */ + post = (int *) TreePostorder(n, et); + for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; + + /* Renumber etree in postorder */ + for (i = 0; i < n; ++i) { + iwork[post[i]] = post[et[i]]; + et_save[i] = et[i]; /* Save the original etree */ + } + for (i = 0; i < n; ++i) et[i] = iwork[i]; + + /* Compute the number of descendants of each node in the etree */ + ifill (relax_end, n, EMPTY); + for (j = 0; j < n; j++) descendants[j] = 0; + for (j = 0; j < n; j++) { + parent = et[j]; + if ( parent != n ) /* not the dummy root */ + descendants[parent] += descendants[j] + 1; + } + + /* Identify the relaxed supernodes by postorder traversal of the etree. */ + for (j = 0; j < n; ) { + parent = et[j]; + snode_start = j; + while ( parent != n && descendants[parent] < relax_columns ) { + j = parent; + parent = et[j]; + } + /* Found a supernode in postordered etree; j is the last column. */ + ++nsuper_et_post; + k = n; + for (i = snode_start; i <= j; ++i) + k = SUPERLU_MIN(k, inv_post[i]); + l = inv_post[j]; + if ( (l - k) == (j - snode_start) ) { + /* It's also a supernode in the original etree */ + relax_end[k] = l; /* Last column is recorded */ + ++nsuper_et; + } else { + for (i = snode_start; i <= j; ++i) { + l = inv_post[i]; + if ( descendants[i] == 0 ) { + relax_end[l] = l; + ++nsuper_et; + } + } + } + j++; + /* Search for a new leaf */ + while ( descendants[j] != 0 && j < n ) j++; + } + +#if ( PRNTlevel>=1 ) + printf(".. heap_snode_relax:\n" + "\tNo of relaxed snodes in postordered etree:\t%d\n" + "\tNo of relaxed snodes in original etree:\t%d\n", + nsuper_et_post, nsuper_et); +#endif + + /* Recover the original etree */ + for (i = 0; i < n; ++i) et[i] = et_save[i]; + + SUPERLU_FREE(post); + SUPERLU_FREE(iwork); +} + + diff --git a/src/maths/SuperLU/ilu_dcolumn_dfs.c b/src/maths/SuperLU/ilu_dcolumn_dfs.c new file mode 100644 index 000000000..3423bb9b1 --- /dev/null +++ b/src/maths/SuperLU/ilu_dcolumn_dfs.c @@ -0,0 +1,258 @@ + +/*! @file ilu_dcolumn_dfs.c + * \brief Performs a symbolic factorization + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 30, 2009
+ * 
+*/ + +#include + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   ILU_DCOLUMN_DFS performs a symbolic factorization on column jcol, and
+ *   decide the supernode boundary.
+ *
+ *   This routine does not use numeric values, but only use the RHS
+ *   row indices to start the dfs.
+ *
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives. The routine returns a list of such supernodal
+ *   representatives in topological order of the dfs that generates them.
+ *   The location of the first nonzero in each such supernodal segment
+ *   (supernodal entry location) is also returned.
+ *
+ * Local parameters
+ * ================
+ *   nseg: no of segments in current U[*,j]
+ *   jsuper: jsuper=EMPTY if column j does not belong to the same
+ *	supernode as j-1. Otherwise, jsuper=nsuper.
+ *
+ *   marker2: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ *
+ * Return value
+ * ============
+ *     0  success;
+ *   > 0  number of bytes allocated when run out of space.
+ * 
+ */ +int +ilu_dcolumn_dfs( + const int m, /* in - number of rows in the matrix */ + const int jcol, /* in */ + int *perm_r, /* in */ + int *nseg, /* modified - with new segments appended */ + int *lsub_col, /* in - defines the RHS vector to start the + dfs */ + int *segrep, /* modified - with new segments appended */ + int *repfnz, /* modified */ + int *marker, /* modified */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + int jcolp1, jcolm1, jsuper, nsuper, nextl; + int k, krep, krow, kmark, kperm; + int *marker2; /* Used for small panel LU */ + int fsupc; /* First column of a snode */ + int myfnz; /* First nonz column of a U-segment */ + int chperm, chmark, chrep, kchild; + int xdfs, maxdfs, kpar, oldrep; + int jptr, jm1ptr; + int ito, ifrom; /* Used to compress row subscripts */ + int mem_error; + int *xsup, *supno, *lsub, *xlsub; + int nzlmax; + static int first = 1, maxsuper; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + if ( first ) { + maxsuper = sp_ienv(7); + first = 0; + } + jcolp1 = jcol + 1; + jcolm1 = jcol - 1; + nsuper = supno[jcol]; + jsuper = nsuper; + nextl = xlsub[jcol]; + marker2 = &marker[2*m]; + + + /* For each nonzero in A[*,jcol] do dfs */ + for (k = 0; lsub_col[k] != EMPTY; k++) { + + krow = lsub_col[k]; + lsub_col[k] = EMPTY; + kmark = marker2[krow]; + + /* krow was visited before, go to the next nonzero */ + if ( kmark == jcol ) continue; + + /* For each unmarked nbr krow of jcol + * krow is in L: place it in structure of L[*,jcol] + */ + marker2[krow] = jcol; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + lsub[nextl++] = krow; /* krow is indexed into A */ + if ( nextl >= nzlmax ) { + if ((mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu))) + return (mem_error); + lsub = Glu->lsub; + } + if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ + } else { + /* krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz[krep]; + + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > kperm ) repfnz[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz[krep] = kperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker2[kchild]; + + if ( chmark != jcol ) { /* Not reached yet */ + marker2[kchild] = jcol; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,k] */ + if ( chperm == EMPTY ) { + lsub[nextl++] = kchild; + if ( nextl >= nzlmax ) { + if ( (mem_error = dLUMemXpand(jcol,nextl, + LSUB,&nzlmax,Glu)) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( chmark != jcolm1 ) jsuper = EMPTY; + } else { + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz[chrep]; + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz[chrep] = chperm; + } else { + /* Continue dfs at super-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L^t) */ + parent[krep] = oldrep; + repfnz[krep] = chperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + } /* else */ + + } /* else */ + + } /* if */ + + } /* while */ + + /* krow has no more unexplored nbrs; + * place supernode-rep krep in postorder DFS. + * backtrack dfs to its parent + */ + segrep[*nseg] = krep; + ++(*nseg); + kpar = parent[krep]; /* Pop from stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xlsub[krep + 1]; + + } while ( kpar != EMPTY ); /* Until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonzero ... */ + + /* Check to see if j belongs in the same supernode as j-1 */ + if ( jcol == 0 ) { /* Do nothing for column 0 */ + nsuper = supno[0] = 0; + } else { + fsupc = xsup[nsuper]; + jptr = xlsub[jcol]; /* Not compressed yet */ + jm1ptr = xlsub[jcolm1]; + + if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; + + /* Always start a new supernode for a singular column */ + if ( nextl == jptr ) jsuper = EMPTY; + + /* Make sure the number of columns in a supernode doesn't + exceed threshold. */ + if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; + + /* If jcol starts a new supernode, reclaim storage space in + * lsub from the previous supernode. Note we only store + * the subscript set of the first columns of the supernode. + */ + if ( jsuper == EMPTY ) { /* starts a new supernode */ + if ( (fsupc < jcolm1) ) { /* >= 2 columns in nsuper */ +#ifdef CHK_COMPRESS + printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); +#endif + ito = xlsub[fsupc+1]; + xlsub[jcolm1] = ito; + xlsub[jcol] = ito; + for (ifrom = jptr; ifrom < nextl; ++ifrom, ++ito) + lsub[ito] = lsub[ifrom]; + nextl = ito; + } + nsuper++; + supno[jcol] = nsuper; + } /* if a new supernode */ + + } /* else: jcol > 0 */ + + /* Tidy up the pointers before exit */ + xsup[nsuper+1] = jcolp1; + supno[jcolp1] = nsuper; + xlsub[jcolp1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/ilu_dcopy_to_ucol.c b/src/maths/SuperLU/ilu_dcopy_to_ucol.c new file mode 100644 index 000000000..5dba8baee --- /dev/null +++ b/src/maths/SuperLU/ilu_dcopy_to_ucol.c @@ -0,0 +1,207 @@ + +/*! @file ilu_dcopy_to_ucol.c + * \brief Copy a computed column of U to the compressed data structure + * and drop some small entries + * + *
+ * -- SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Laboratory
+ * November, 2010
+ * 
+ */ + +#include + +#ifdef DEBUG +int num_drop_U; +#endif + +extern void dcopy_(int *, double [], int *, double [], int *); + +#if 0 +static double *A; /* used in _compare_ only */ +static int _compare_(const void *a, const void *b) +{ + register int *x = (int *)a, *y = (int *)b; + register double xx = fabs(A[*x]), yy = fabs(A[*y]); + if (xx > yy) return -1; + else if (xx < yy) return 1; + else return 0; +} +#endif + +int +ilu_dcopy_to_ucol( + int jcol, /* in */ + int nseg, /* in */ + int *segrep, /* in */ + int *repfnz, /* in */ + int *perm_r, /* in */ + double *dense, /* modified - reset to zero on return */ + int drop_rule,/* in */ + milu_t milu, /* in */ + double drop_tol, /* in */ + int quota, /* maximum nonzero entries allowed */ + double *sum, /* out - the sum of dropped entries */ + int *nnzUj, /* in - out */ + GlobalLU_t *Glu, /* modified */ + double *work /* working space with minimum size n, + * used by the second dropping rule */ + ) +{ +/* + * Gather from SPA dense[*] to global ucol[*]. + */ + int ksub, krep, ksupno; + int i, k, kfnz, segsze; + int fsupc, isub, irow; + int jsupno, nextu; + int new_next, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + double *ucol; + int *usub, *xusub; + int nzumax; + int m; /* number of entries in the nonzero U-segments */ + register double d_max = 0.0, d_min = 1.0 / dlamch_("Safe minimum"); + register double tmp; + double zero = 0.0; + int i_1 = 1; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + ucol = Glu->ucol; + usub = Glu->usub; + xusub = Glu->xusub; + nzumax = Glu->nzumax; + + *sum = zero; + if (drop_rule == NODROP) { + drop_tol = -1.0, quota = Glu->n; + } + + jsupno = supno[jcol]; + nextu = xusub[jcol]; + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + krep = segrep[k--]; + ksupno = supno[krep]; + + if ( ksupno != jsupno ) { /* Should go into ucol[] */ + kfnz = repfnz[krep]; + if ( kfnz != EMPTY ) { /* Nonzero U-segment */ + + fsupc = xsup[ksupno]; + isub = xlsub[fsupc] + kfnz - fsupc; + segsze = krep - kfnz + 1; + + new_next = nextu + segsze; + while ( new_next > nzumax ) { + if ((mem_error = dLUMemXpand(jcol, nextu, UCOL, &nzumax, + Glu)) != 0) + return (mem_error); + ucol = Glu->ucol; + if ((mem_error = dLUMemXpand(jcol, nextu, USUB, &nzumax, + Glu)) != 0) + return (mem_error); + usub = Glu->usub; + lsub = Glu->lsub; + } + + for (i = 0; i < segsze; i++) { + irow = lsub[isub++]; + tmp = fabs(dense[irow]); + + /* first dropping rule */ + if (quota > 0 && tmp >= drop_tol) { + if (tmp > d_max) d_max = tmp; + if (tmp < d_min) d_min = tmp; + usub[nextu] = perm_r[irow]; + ucol[nextu] = dense[irow]; + nextu++; + } else { + switch (milu) { + case SMILU_1: + case SMILU_2: + *sum += dense[irow]; + break; + case SMILU_3: + /* *sum += fabs(dense[irow]);*/ + *sum += tmp; + break; + case SILU: + default: + break; + } +#ifdef DEBUG + num_drop_U++; +#endif + } + dense[irow] = zero; + } + + } + + } + + } /* for each segment... */ + + xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ + m = xusub[jcol + 1] - xusub[jcol]; + + /* second dropping rule */ + if (drop_rule & DROP_SECONDARY && m > quota) { + register double tol = d_max; + register int m0 = xusub[jcol] + m - 1; + + if (quota > 0) { + if (drop_rule & DROP_INTERP) { + d_max = 1.0 / d_max; d_min = 1.0 / d_min; + tol = 1.0 / (d_max + (d_min - d_max) * quota / m); + } else { + dcopy_(&m, &ucol[xusub[jcol]], &i_1, work, &i_1); + tol = dqselect(m, work, quota); +#if 0 + A = &ucol[xusub[jcol]]; + for (i = 0; i < m; i++) work[i] = i; + qsort(work, m, sizeof(int), _compare_); + tol = fabs(usub[xusub[jcol] + work[quota]]); +#endif + } + } + for (i = xusub[jcol]; i <= m0; ) { + if (fabs(ucol[i]) <= tol) { + switch (milu) { + case SMILU_1: + case SMILU_2: + *sum += ucol[i]; + break; + case SMILU_3: + *sum += fabs(ucol[i]); + break; + case SILU: + default: + break; + } + ucol[i] = ucol[m0]; + usub[i] = usub[m0]; + m0--; + m--; +#ifdef DEBUG + num_drop_U++; +#endif + xusub[jcol + 1]--; + continue; + } + i++; + } + } + + if (milu == SMILU_2) *sum = fabs(*sum); + + *nnzUj += m; + + return 0; +} diff --git a/src/maths/SuperLU/ilu_ddrop_row.c b/src/maths/SuperLU/ilu_ddrop_row.c new file mode 100644 index 000000000..31c4b4a1a --- /dev/null +++ b/src/maths/SuperLU/ilu_ddrop_row.c @@ -0,0 +1,329 @@ + +/*! @file ilu_ddrop_row.c + * \brief Drop small rows from L + * + *
+ * -- SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Laboratory.
+ * June 30, 2009
+ * 
+ */ + +#include +#include +#include + +extern void dswap_(int *, double [], int *, double [], int *); +extern void daxpy_(int *, double *, double [], int *, double [], int *); +extern void dcopy_(int *, double [], int *, double [], int *); +extern double dasum_(int *, double *, int *); +extern double dnrm2_(int *, double *, int *); +extern double dnrm2_(int *, double [], int *); +extern int idamax_(int *, double [], int *); + +static double *A; /* used in _compare_ only */ +static int _compare_(const void *a, const void *b) +{ + register int *x = (int *)a, *y = (int *)b; + if (A[*x] - A[*y] > 0.0) return -1; + else if (A[*x] - A[*y] < 0.0) return 1; + else return 0; +} + +/*! \brief + *
+ * Purpose
+ * =======
+ *    ilu_ddrop_row() - Drop some small rows from the previous 
+ *    supernode (L-part only).
+ * 
+ */ +int ilu_ddrop_row( + superlu_options_t *options, /* options */ + int first, /* index of the first column in the supernode */ + int last, /* index of the last column in the supernode */ + double drop_tol, /* dropping parameter */ + int quota, /* maximum nonzero entries allowed */ + int *nnzLj, /* in/out number of nonzeros in L(:, 1:last) */ + double *fill_tol, /* in/out - on exit, fill_tol=-num_zero_pivots, + * does not change if options->ILU_MILU != SMILU1 */ + GlobalLU_t *Glu, /* modified */ + double dwork[], /* working space + * the length of dwork[] should be no less than + * the number of rows in the supernode */ + double dwork2[], /* working space with the same size as dwork[], + * used only by the second dropping rule */ + int lastc /* if lastc == 0, there is nothing after the + * working supernode [first:last]; + * if lastc == 1, there is one more column after + * the working supernode. */ ) +{ + register int i, j, k, m1; + register int nzlc; /* number of nonzeros in column last+1 */ + register int xlusup_first, xlsub_first; + int m, n; /* m x n is the size of the supernode */ + int r = 0; /* number of dropped rows */ + register double *temp; + register double *lusup = Glu->lusup; + register int *lsub = Glu->lsub; + register int *xlsub = Glu->xlsub; + register int *xlusup = Glu->xlusup; + register double d_max = 0.0, d_min = 1.0; + int drop_rule = options->ILU_DropRule; + milu_t milu = options->ILU_MILU; + norm_t nrm = options->ILU_Norm; + double zero = 0.0; + double one = 1.0; + double none = -1.0; + int i_1 = 1; + int inc_diag; /* inc_diag = m + 1 */ + int nzp = 0; /* number of zero pivots */ + double alpha = pow((double)(Glu->n), -1.0 / options->ILU_MILU_Dim); + + xlusup_first = xlusup[first]; + xlsub_first = xlsub[first]; + m = xlusup[first + 1] - xlusup_first; + n = last - first + 1; + m1 = m - 1; + inc_diag = m + 1; + nzlc = lastc ? (xlusup[last + 2] - xlusup[last + 1]) : 0; + temp = dwork - n; + + /* Quick return if nothing to do. */ + if (m == 0 || m == n || drop_rule == NODROP) + { + *nnzLj += m * n; + return 0; + } + + /* basic dropping: ILU(tau) */ + for (i = n; i <= m1; ) + { + /* the average abs value of ith row */ + switch (nrm) + { + case ONE_NORM: + temp[i] = dasum_(&n, &lusup[xlusup_first + i], &m) / (double)n; + break; + case TWO_NORM: + temp[i] = dnrm2_(&n, &lusup[xlusup_first + i], &m) + / sqrt((double)n); + break; + case INF_NORM: + default: + k = idamax_(&n, &lusup[xlusup_first + i], &m) - 1; + temp[i] = fabs(lusup[xlusup_first + i + m * k]); + break; + } + + /* drop small entries due to drop_tol */ + if (drop_rule & DROP_BASIC && temp[i] < drop_tol) + { + r++; + /* drop the current row and move the last undropped row here */ + if (r > 1) /* add to last row */ + { + /* accumulate the sum (for MILU) */ + switch (milu) + { + case SMILU_1: + case SMILU_2: + daxpy_(&n, &one, &lusup[xlusup_first + i], &m, + &lusup[xlusup_first + m - 1], &m); + break; + case SMILU_3: + for (j = 0; j < n; j++) + lusup[xlusup_first + (m - 1) + j * m] += + fabs(lusup[xlusup_first + i + j * m]); + break; + case SILU: + default: + break; + } + dcopy_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + } /* if (r > 1) */ + else /* move to last row */ + { + dswap_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + if (milu == SMILU_3) + for (j = 0; j < n; j++) { + lusup[xlusup_first + m1 + j * m] = + fabs(lusup[xlusup_first + m1 + j * m]); + } + } + lsub[xlsub_first + i] = lsub[xlsub_first + m1]; + m1--; + continue; + } /* if dropping */ + else + { + if (temp[i] > d_max) d_max = temp[i]; + if (temp[i] < d_min) d_min = temp[i]; + } + i++; + } /* for */ + + /* Secondary dropping: drop more rows according to the quota. */ + quota = ceil((double)quota / (double)n); + if (drop_rule & DROP_SECONDARY && m - r > quota) + { + register double tol = d_max; + + /* Calculate the second dropping tolerance */ + if (quota > n) + { + if (drop_rule & DROP_INTERP) /* by interpolation */ + { + d_max = 1.0 / d_max; d_min = 1.0 / d_min; + tol = 1.0 / (d_max + (d_min - d_max) * quota / (m - n - r)); + } + else /* by quick select */ + { + int len = m1 - n + 1; + dcopy_(&len, dwork, &i_1, dwork2, &i_1); + tol = dqselect(len, dwork2, quota - n); +#if 0 + register int *itemp = iwork - n; + A = temp; + for (i = n; i <= m1; i++) itemp[i] = i; + qsort(iwork, m1 - n + 1, sizeof(int), _compare_); + tol = temp[itemp[quota]]; +#endif + } + } + + for (i = n; i <= m1; ) + { + if (temp[i] <= tol) + { + register int j; + r++; + /* drop the current row and move the last undropped row here */ + if (r > 1) /* add to last row */ + { + /* accumulate the sum (for MILU) */ + switch (milu) + { + case SMILU_1: + case SMILU_2: + daxpy_(&n, &one, &lusup[xlusup_first + i], &m, + &lusup[xlusup_first + m - 1], &m); + break; + case SMILU_3: + for (j = 0; j < n; j++) + lusup[xlusup_first + (m - 1) + j * m] += + fabs(lusup[xlusup_first + i + j * m]); + break; + case SILU: + default: + break; + } + dcopy_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + } /* if (r > 1) */ + else /* move to last row */ + { + dswap_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + if (milu == SMILU_3) + for (j = 0; j < n; j++) { + lusup[xlusup_first + m1 + j * m] = + fabs(lusup[xlusup_first + m1 + j * m]); + } + } + lsub[xlsub_first + i] = lsub[xlsub_first + m1]; + m1--; + temp[i] = temp[m1]; + + continue; + } + i++; + + } /* for */ + + } /* if secondary dropping */ + + for (i = n; i < m; i++) temp[i] = 0.0; + + if (r == 0) + { + *nnzLj += m * n; + return 0; + } + + /* add dropped entries to the diagnal */ + if (milu != SILU) + { + register int j; + double t; + double omega; + for (j = 0; j < n; j++) + { + t = lusup[xlusup_first + (m - 1) + j * m]; + if (t == zero) continue; + if (t > zero) + omega = SUPERLU_MIN(2.0 * (1.0 - alpha) / t, 1.0); + else + omega = SUPERLU_MAX(2.0 * (1.0 - alpha) / t, -1.0); + t *= omega; + + switch (milu) + { + case SMILU_1: + if (t != none) { + lusup[xlusup_first + j * inc_diag] *= (one + t); + } + else + { + lusup[xlusup_first + j * inc_diag] *= *fill_tol; +#ifdef DEBUG + printf("[1] ZERO PIVOT: FILL col %d.\n", first + j); + fflush(stdout); +#endif + nzp++; + } + break; + case SMILU_2: + lusup[xlusup_first + j * inc_diag] *= (1.0 + fabs(t)); + break; + case SMILU_3: + lusup[xlusup_first + j * inc_diag] *= (one + t); + break; + case SILU: + default: + break; + } + } + if (nzp > 0) *fill_tol = -nzp; + } + + /* Remove dropped entries from the memory and fix the pointers. */ + m1 = m - r; + for (j = 1; j < n; j++) + { + register int tmp1, tmp2; + tmp1 = xlusup_first + j * m1; + tmp2 = xlusup_first + j * m; + for (i = 0; i < m1; i++) + lusup[i + tmp1] = lusup[i + tmp2]; + } + for (i = 0; i < nzlc; i++) + lusup[xlusup_first + i + n * m1] = lusup[xlusup_first + i + n * m]; + for (i = 0; i < nzlc; i++) + lsub[xlsub[last + 1] - r + i] = lsub[xlsub[last + 1] + i]; + for (i = first + 1; i <= last + 1; i++) + { + xlusup[i] -= r * (i - first); + xlsub[i] -= r; + } + if (lastc) + { + xlusup[last + 2] -= r * n; + xlsub[last + 2] -= r; + } + + *nnzLj += (m - r) * n; + return r; +} diff --git a/src/maths/SuperLU/ilu_dpanel_dfs.c b/src/maths/SuperLU/ilu_dpanel_dfs.c new file mode 100644 index 000000000..ac5686670 --- /dev/null +++ b/src/maths/SuperLU/ilu_dpanel_dfs.c @@ -0,0 +1,248 @@ + +/*! @file ilu_dpanel_dfs.c + * \brief Peforms a symbolic factorization on a panel of symbols and + * record the entries with maximum absolute value in each column + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 30, 2009
+ * 
+ */ + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ *   Performs a symbolic factorization on a panel of columns [jcol, jcol+w).
+ *
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives.
+ *
+ *   The routine returns one list of the supernodal representatives
+ *   in topological order of the dfs that generates them. This list is
+ *   a superset of the topological order of each individual column within
+ *   the panel.
+ *   The location of the first nonzero in each supernodal segment
+ *   (supernodal entry location) is also returned. Each column has a
+ *   separate list for this purpose.
+ *
+ *   Two marker arrays are used for dfs:
+ *     marker[i] == jj, if i was visited during dfs of current column jj;
+ *     marker1[i] >= jcol, if i was visited by earlier columns in this panel;
+ *
+ *   marker: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ * 
+ */ +void +ilu_dpanel_dfs( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + SuperMatrix *A, /* in - original matrix */ + int *perm_r, /* in */ + int *nseg, /* out */ + double *dense, /* out */ + double *amax, /* out - max. abs. value of each column in panel */ + int *panel_lsub, /* out */ + int *segrep, /* out */ + int *repfnz, /* out */ + int *marker, /* out */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ +) +{ + + NCPformat *Astore; + double *a; + int *asub; + int *xa_begin, *xa_end; + int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; + int k, krow, kmark, kperm; + int xdfs, maxdfs, kpar; + int jj; /* index through each column in the panel */ + int *marker1; /* marker1[jj] >= jcol if vertex jj was visited + by a previous column within this panel. */ + int *repfnz_col; /* start of each column in the panel */ + double *dense_col; /* start of each column in the panel */ + int nextl_col; /* next available position in panel_lsub[*,jj] */ + int *xsup, *supno; + int *lsub, *xlsub; + double *amax_col; + register double tmp; + + /* Initialize pointers */ + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + marker1 = marker + m; + repfnz_col = repfnz; + dense_col = dense; + amax_col = amax; + *nseg = 0; + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + + /* For each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++) { + nextl_col = (jj - jcol) * m; + +#ifdef CHK_DFS + printf("\npanel col %d: ", jj); +#endif + + *amax_col = 0.0; + /* For each nonz in A[*,jj] do dfs */ + for (k = xa_begin[jj]; k < xa_end[jj]; k++) { + krow = asub[k]; + tmp = fabs(a[k]); + if (tmp > *amax_col) *amax_col = tmp; + dense_col[krow] = a[k]; + kmark = marker[krow]; + if ( kmark == jj ) + continue; /* krow visited before, go to the next nonzero */ + + /* For each unmarked nbr krow of jj + * krow is in L: place it in structure of L[*,jj] + */ + marker[krow] = jj; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ + } + /* + * krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + else { + + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz_col[krep]; + +#ifdef CHK_DFS + printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); +#endif + if ( myfnz != EMPTY ) { /* Representative visited before */ + if ( myfnz > kperm ) repfnz_col[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz_col[krep] = kperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker[kchild]; + + if ( chmark != jj ) { /* Not reached yet */ + marker[kchild] = jj; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,j] */ + if ( chperm == EMPTY ) { + panel_lsub[nextl_col++] = kchild; + } + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + else { + + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz_col[chrep]; +#ifdef CHK_DFS + printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); +#endif + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz_col[chrep] = chperm; + } + else { + /* Cont. dfs at snode-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L) */ + parent[krep] = oldrep; + repfnz_col[krep] = chperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } /* else */ + + } /* else */ + + } /* if... */ + + } /* while xdfs < maxdfs */ + + /* krow has no more unexplored nbrs: + * Place snode-rep krep in postorder DFS, if this + * segment is seen for the first time. (Note that + * "repfnz[krep]" may change later.) + * Backtrack dfs to its parent. + */ + if ( marker1[krep] < jcol ) { + segrep[*nseg] = krep; + ++(*nseg); + marker1[krep] = jj; + } + + kpar = parent[krep]; /* Pop stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xlsub[krep + 1]; + +#ifdef CHK_DFS + printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } while ( kpar != EMPTY ); /* do-while - until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonz in A[*,jj] */ + + repfnz_col += m; /* Move to next column */ + dense_col += m; + amax_col++; + + } /* for jj ... */ + +} diff --git a/src/maths/SuperLU/ilu_dpivotL.c b/src/maths/SuperLU/ilu_dpivotL.c new file mode 100644 index 000000000..bd929932a --- /dev/null +++ b/src/maths/SuperLU/ilu_dpivotL.c @@ -0,0 +1,266 @@ + +/*! @file ilu_dpivotL.c + * \brief Performs numerical pivoting + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 30, 2009
+ * 
+ */ + + +#include +#include +#include + +#ifndef SGN +#define SGN(x) ((x)>=0?1:-1) +#endif + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Performs the numerical pivoting on the current column of L,
+ *   and the CDIV operation.
+ *
+ *   Pivot policy:
+ *   (1) Compute thresh = u * max_(i>=j) abs(A_ij);
+ *   (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
+ *	     pivot row = k;
+ *	 ELSE IF abs(A_jj) >= thresh THEN
+ *	     pivot row = j;
+ *	 ELSE
+ *	     pivot row = m;
+ *
+ *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
+ *
+ *   Return value: 0	  success;
+ *		   i > 0  U(i,i) is exactly zero.
+ * 
+ */ + +int +ilu_dpivotL( + const int jcol, /* in */ + const double u, /* in - diagonal pivoting threshold */ + int *usepr, /* re-use the pivot sequence given by + * perm_r/iperm_r */ + int *perm_r, /* may be modified */ + int diagind, /* diagonal of Pc*A*Pc' */ + int *swap, /* in/out record the row permutation */ + int *iswap, /* in/out inverse of swap, it is the same as + perm_r after the factorization */ + int *marker, /* in */ + int *pivrow, /* in/out, as an input if *usepr!=0 */ + double fill_tol, /* in - fill tolerance of current column + * used for a singular column */ + milu_t milu, /* in */ + double drop_sum, /* in - computed in ilu_dcopy_to_ucol() + (MILU only) */ + GlobalLU_t *Glu, /* modified - global LU data structures */ + SuperLUStat_t *stat /* output */ + ) +{ + + int n; /* number of columns */ + int fsupc; /* first column in the supernode */ + int nsupc; /* no of columns in the supernode */ + int nsupr; /* no of rows in the supernode */ + int lptr; /* points to the starting subscript of the supernode */ + register int pivptr; + int old_pivptr, diag, ptr0; + register double pivmax, rtemp; + double thresh; + double temp; + double *lu_sup_ptr; + double *lu_col_ptr; + int *lsub_ptr; + register int isub, icol, k, itemp; + int *lsub, *xlsub; + double *lusup; + int *xlusup; + flops_t *ops = stat->ops; + int info; + + /* Initialize pointers */ + n = Glu->n; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; + nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ + lptr = xlsub[fsupc]; + nsupr = xlsub[fsupc+1] - lptr; + lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ + lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ + lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ + + /* Determine the largest abs numerical value for partial pivoting; + Also search for user-specified pivot, and diagonal element. */ + pivmax = -1.0; + pivptr = nsupc; + diag = EMPTY; + old_pivptr = nsupc; + ptr0 = EMPTY; + for (isub = nsupc; isub < nsupr; ++isub) { + if (marker[lsub_ptr[isub]] > jcol) + continue; /* do not overlap with a later relaxed supernode */ + + switch (milu) { + case SMILU_1: + rtemp = fabs(lu_col_ptr[isub] + drop_sum); + break; + case SMILU_2: + case SMILU_3: + /* In this case, drop_sum contains the sum of the abs. value */ + rtemp = fabs(lu_col_ptr[isub]); + break; + case SILU: + default: + rtemp = fabs(lu_col_ptr[isub]); + break; + } + if (rtemp > pivmax) { pivmax = rtemp; pivptr = isub; } + if (*usepr && lsub_ptr[isub] == *pivrow) old_pivptr = isub; + if (lsub_ptr[isub] == diagind) diag = isub; + if (ptr0 == EMPTY) ptr0 = isub; + } + + if (milu == SMILU_2 || milu == SMILU_3) pivmax += drop_sum; + + /* Test for singularity */ + if (pivmax < 0.0) { + fprintf(stderr, "[0]: jcol=%d, SINGULAR!!!\n", jcol); + fflush(stderr); + exit(1); + } + if ( pivmax == 0.0 ) { + if (diag != EMPTY) + *pivrow = lsub_ptr[pivptr = diag]; + else if (ptr0 != EMPTY) + *pivrow = lsub_ptr[pivptr = ptr0]; + else { + /* look for the first row which does not + belong to any later supernodes */ + for (icol = jcol; icol < n; icol++) + if (marker[swap[icol]] <= jcol) break; + if (icol >= n) { + fprintf(stderr, "[1]: jcol=%d, SINGULAR!!!\n", jcol); + fflush(stderr); + exit(1); + } + + *pivrow = swap[icol]; + + /* pick up the pivot row */ + for (isub = nsupc; isub < nsupr; ++isub) + if ( lsub_ptr[isub] == *pivrow ) { pivptr = isub; break; } + } + pivmax = fill_tol; + lu_col_ptr[pivptr] = pivmax; + *usepr = 0; +#ifdef DEBUG + printf("[0] ZERO PIVOT: FILL (%d, %d).\n", *pivrow, jcol); + fflush(stdout); +#endif + info =jcol + 1; + } /* if (*pivrow == 0.0) */ + else { + thresh = u * pivmax; + + /* Choose appropriate pivotal element by our policy. */ + if ( *usepr ) { + switch (milu) { + case SMILU_1: + rtemp = fabs(lu_col_ptr[old_pivptr] + drop_sum); + break; + case SMILU_2: + case SMILU_3: + rtemp = fabs(lu_col_ptr[old_pivptr]) + drop_sum; + break; + case SILU: + default: + rtemp = fabs(lu_col_ptr[old_pivptr]); + break; + } + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = old_pivptr; + else *usepr = 0; + } + if ( *usepr == 0 ) { + /* Use diagonal pivot? */ + if ( diag >= 0 ) { /* diagonal exists */ + switch (milu) { + case SMILU_1: + rtemp = fabs(lu_col_ptr[diag] + drop_sum); + break; + case SMILU_2: + case SMILU_3: + rtemp = fabs(lu_col_ptr[diag]) + drop_sum; + break; + case SILU: + default: + rtemp = fabs(lu_col_ptr[diag]); + break; + } + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; + } + *pivrow = lsub_ptr[pivptr]; + } + info = 0; + + /* Reset the diagonal */ + switch (milu) { + case SMILU_1: + lu_col_ptr[pivptr] += drop_sum; + break; + case SMILU_2: + case SMILU_3: + lu_col_ptr[pivptr] += SGN(lu_col_ptr[pivptr]) * drop_sum; + break; + case SILU: + default: + break; + } + + } /* else */ + + /* Record pivot row */ + perm_r[*pivrow] = jcol; + if (jcol < n - 1) { + register int t1, t2, t; + t1 = iswap[*pivrow]; t2 = jcol; + if (t1 != t2) { + t = swap[t1]; swap[t1] = swap[t2]; swap[t2] = t; + t1 = swap[t1]; t2 = t; + t = iswap[t1]; iswap[t1] = iswap[t2]; iswap[t2] = t; + } + } /* if (jcol < n - 1) */ + + /* Interchange row subscripts */ + if ( pivptr != nsupc ) { + itemp = lsub_ptr[pivptr]; + lsub_ptr[pivptr] = lsub_ptr[nsupc]; + lsub_ptr[nsupc] = itemp; + + /* Interchange numerical values as well, for the whole snode, such + * that L is indexed the same way as A. + */ + for (icol = 0; icol <= nsupc; icol++) { + itemp = pivptr + icol * nsupr; + temp = lu_sup_ptr[itemp]; + lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; + lu_sup_ptr[nsupc + icol*nsupr] = temp; + } + } /* if */ + + /* cdiv operation */ + ops[FACT] += nsupr - nsupc; + temp = 1.0 / lu_col_ptr[nsupc]; + for (k = nsupc+1; k < nsupr; k++) lu_col_ptr[k] *= temp; + + return info; +} diff --git a/src/maths/SuperLU/ilu_dsnode_dfs.c b/src/maths/SuperLU/ilu_dsnode_dfs.c new file mode 100644 index 000000000..87a5cd4e4 --- /dev/null +++ b/src/maths/SuperLU/ilu_dsnode_dfs.c @@ -0,0 +1,90 @@ + +/*! @file ilu_dsnode_dfs.c + * \brief Determines the union of row structures of columns within the relaxed node + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 30, 2009
+ * 
+ */ + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *    ilu_dsnode_dfs() - Determine the union of the row structures of those
+ *    columns within the relaxed snode.
+ *    Note: The relaxed snodes are leaves of the supernodal etree, therefore,
+ *    the portion outside the rectangular supernode must be zero.
+ *
+ * Return value
+ * ============
+ *     0   success;
+ *    >0   number of bytes allocated when run out of memory.
+ * 
+ */ + +int +ilu_dsnode_dfs( + const int jcol, /* in - start of the supernode */ + const int kcol, /* in - end of the supernode */ + const int *asub, /* in */ + const int *xa_begin, /* in */ + const int *xa_end, /* in */ + int *marker, /* modified */ + GlobalLU_t *Glu /* modified */ + ) +{ + + register int i, k, nextl; + int nsuper, krow, kmark, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + int nzlmax; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + nsuper = ++supno[jcol]; /* Next available supernode number */ + nextl = xlsub[jcol]; + + for (i = jcol; i <= kcol; i++) + { + /* For each nonzero in A[*,i] */ + for (k = xa_begin[i]; k < xa_end[i]; k++) + { + krow = asub[k]; + kmark = marker[krow]; + if ( kmark != kcol ) + { /* First time visit krow */ + marker[krow] = kcol; + lsub[nextl++] = krow; + if ( nextl >= nzlmax ) + { + if ( (mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, + Glu)) != 0) + return (mem_error); + lsub = Glu->lsub; + } + } + } + supno[i] = nsuper; + } + + /* Supernode > 1 */ + if ( jcol < kcol ) + for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; + + xsup[nsuper+1] = kcol + 1; + supno[kcol+1] = nsuper; + xlsub[kcol+1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/ilu_heap_relax_snode.c b/src/maths/SuperLU/ilu_heap_relax_snode.c new file mode 100644 index 000000000..b9940de6f --- /dev/null +++ b/src/maths/SuperLU/ilu_heap_relax_snode.c @@ -0,0 +1,120 @@ +/*! @file ilu_heap_relax_snode.c + * \brief Identify the initial relaxed supernodes + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 1, 2009
+ * 
+ */ + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *    ilu_heap_relax_snode() - Identify the initial relaxed supernodes,
+ *    assuming that the matrix has been reordered according to the postorder
+ *    of the etree.
+ * 
+ */ + +void +ilu_heap_relax_snode ( + const int n, + int *et, /* column elimination tree */ + const int relax_columns, /* max no of columns allowed in a + relaxed snode */ + int *descendants, /* no of descendants of each node + in the etree */ + int *relax_end, /* last column in a supernode + * if j-th column starts a relaxed + * supernode, relax_end[j] represents + * the last column of this supernode */ + int *relax_fsupc /* first column in a supernode + * relax_fsupc[j] represents the first + * column of j-th supernode */ + ) +{ + register int i, j, k, l, f, parent; + register int snode_start; /* beginning of a snode */ + int *et_save, *post, *inv_post, *iwork; + int nsuper_et = 0, nsuper_et_post = 0; + + /* The etree may not be postordered, but is heap ordered. */ + + iwork = (int*) intMalloc(3*n+2); + if ( !iwork ) ABORT_SuperLU("SUPERLU_MALLOC fails for iwork[]"); + inv_post = iwork + n+1; + et_save = inv_post + n+1; + + /* Post order etree */ + post = (int *) TreePostorder(n, et); + for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; + + /* Renumber etree in postorder */ + for (i = 0; i < n; ++i) { + iwork[post[i]] = post[et[i]]; + et_save[i] = et[i]; /* Save the original etree */ + } + for (i = 0; i < n; ++i) et[i] = iwork[i]; + + /* Compute the number of descendants of each node in the etree */ + ifill (relax_end, n, EMPTY); + ifill (relax_fsupc, n, EMPTY); + for (j = 0; j < n; j++) descendants[j] = 0; + for (j = 0; j < n; j++) { + parent = et[j]; + if ( parent != n ) /* not the dummy root */ + descendants[parent] += descendants[j] + 1; + } + + /* Identify the relaxed supernodes by postorder traversal of the etree. */ + for ( f = j = 0; j < n; ) { + parent = et[j]; + snode_start = j; + while ( parent != n && descendants[parent] < relax_columns ) { + j = parent; + parent = et[j]; + } + /* Found a supernode in postordered etree; j is the last column. */ + ++nsuper_et_post; + k = n; + for (i = snode_start; i <= j; ++i) + k = SUPERLU_MIN(k, inv_post[i]); + l = inv_post[j]; + if ( (l - k) == (j - snode_start) ) { + /* It's also a supernode in the original etree */ + relax_end[k] = l; /* Last column is recorded */ + relax_fsupc[f++] = k; + ++nsuper_et; + } else { + for (i = snode_start; i <= j; ++i) { + l = inv_post[i]; + if ( descendants[i] == 0 ) { + relax_end[l] = l; + relax_fsupc[f++] = l; + ++nsuper_et; + } + } + } + j++; + /* Search for a new leaf */ + while ( descendants[j] != 0 && j < n ) j++; + } + +#if ( PRNTlevel>=1 ) + printf(".. heap_snode_relax:\n" + "\tNo of relaxed snodes in postordered etree:\t%d\n" + "\tNo of relaxed snodes in original etree:\t%d\n", + nsuper_et_post, nsuper_et); +#endif + + /* Recover the original etree */ + for (i = 0; i < n; ++i) et[i] = et_save[i]; + + SUPERLU_FREE(post); + SUPERLU_FREE(iwork); +} diff --git a/src/maths/SuperLU/ilu_relax_snode.c b/src/maths/SuperLU/ilu_relax_snode.c new file mode 100644 index 000000000..5634098a5 --- /dev/null +++ b/src/maths/SuperLU/ilu_relax_snode.c @@ -0,0 +1,69 @@ +/*! @file ilu_relax_snode.c + * \brief Identify initial relaxed supernodes + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 1, 2009
+ * 
+ */ + +#include +/*! \brief + * + *
+ * Purpose
+ * =======
+ *    ilu_relax_snode() - Identify the initial relaxed supernodes, assuming
+ *    that the matrix has been reordered according to the postorder of the
+ *    etree.
+ * 
+ */ +void +ilu_relax_snode ( + const int n, + int *et, /* column elimination tree */ + const int relax_columns, /* max no of columns allowed in a + relaxed snode */ + int *descendants, /* no of descendants of each node + in the etree */ + int *relax_end, /* last column in a supernode + * if j-th column starts a relaxed + * supernode, relax_end[j] represents + * the last column of this supernode */ + int *relax_fsupc /* first column in a supernode + * relax_fsupc[j] represents the first + * column of j-th supernode */ + ) +{ + + register int j, f, parent; + register int snode_start; /* beginning of a snode */ + + ifill (relax_end, n, EMPTY); + ifill (relax_fsupc, n, EMPTY); + for (j = 0; j < n; j++) descendants[j] = 0; + + /* Compute the number of descendants of each node in the etree */ + for (j = 0; j < n; j++) { + parent = et[j]; + if ( parent != n ) /* not the dummy root */ + descendants[parent] += descendants[j] + 1; + } + + /* Identify the relaxed supernodes by postorder traversal of the etree. */ + for (j = f = 0; j < n; ) { + parent = et[j]; + snode_start = j; + while ( parent != n && descendants[parent] < relax_columns ) { + j = parent; + parent = et[j]; + } + /* Found a supernode with j being the last column. */ + relax_end[snode_start] = j; /* Last column is recorded */ + j++; + relax_fsupc[f++] = snode_start; + /* Search for a new leaf */ + while ( descendants[j] != 0 && j < n ) j++; + } +} diff --git a/src/maths/SuperLU/ilu_zcolumn_dfs.c b/src/maths/SuperLU/ilu_zcolumn_dfs.c new file mode 100644 index 000000000..a8e1e22d6 --- /dev/null +++ b/src/maths/SuperLU/ilu_zcolumn_dfs.c @@ -0,0 +1,258 @@ + +/*! @file ilu_zcolumn_dfs.c + * \brief Performs a symbolic factorization + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 30, 2009
+ * 
+*/ + +#include + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   ILU_ZCOLUMN_DFS performs a symbolic factorization on column jcol, and
+ *   decide the supernode boundary.
+ *
+ *   This routine does not use numeric values, but only use the RHS
+ *   row indices to start the dfs.
+ *
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives. The routine returns a list of such supernodal
+ *   representatives in topological order of the dfs that generates them.
+ *   The location of the first nonzero in each such supernodal segment
+ *   (supernodal entry location) is also returned.
+ *
+ * Local parameters
+ * ================
+ *   nseg: no of segments in current U[*,j]
+ *   jsuper: jsuper=EMPTY if column j does not belong to the same
+ *	supernode as j-1. Otherwise, jsuper=nsuper.
+ *
+ *   marker2: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ *
+ * Return value
+ * ============
+ *     0  success;
+ *   > 0  number of bytes allocated when run out of space.
+ * 
+ */ +int +ilu_zcolumn_dfs( + const int m, /* in - number of rows in the matrix */ + const int jcol, /* in */ + int *perm_r, /* in */ + int *nseg, /* modified - with new segments appended */ + int *lsub_col, /* in - defines the RHS vector to start the + dfs */ + int *segrep, /* modified - with new segments appended */ + int *repfnz, /* modified */ + int *marker, /* modified */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + int jcolp1, jcolm1, jsuper, nsuper, nextl; + int k, krep, krow, kmark, kperm; + int *marker2; /* Used for small panel LU */ + int fsupc; /* First column of a snode */ + int myfnz; /* First nonz column of a U-segment */ + int chperm, chmark, chrep, kchild; + int xdfs, maxdfs, kpar, oldrep; + int jptr, jm1ptr; + int ito, ifrom; /* Used to compress row subscripts */ + int mem_error; + int *xsup, *supno, *lsub, *xlsub; + int nzlmax; + static int first = 1, maxsuper; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + if ( first ) { + maxsuper = sp_ienv(7); + first = 0; + } + jcolp1 = jcol + 1; + jcolm1 = jcol - 1; + nsuper = supno[jcol]; + jsuper = nsuper; + nextl = xlsub[jcol]; + marker2 = &marker[2*m]; + + + /* For each nonzero in A[*,jcol] do dfs */ + for (k = 0; lsub_col[k] != EMPTY; k++) { + + krow = lsub_col[k]; + lsub_col[k] = EMPTY; + kmark = marker2[krow]; + + /* krow was visited before, go to the next nonzero */ + if ( kmark == jcol ) continue; + + /* For each unmarked nbr krow of jcol + * krow is in L: place it in structure of L[*,jcol] + */ + marker2[krow] = jcol; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + lsub[nextl++] = krow; /* krow is indexed into A */ + if ( nextl >= nzlmax ) { + if ((mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu))) + return (mem_error); + lsub = Glu->lsub; + } + if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ + } else { + /* krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz[krep]; + + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > kperm ) repfnz[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz[krep] = kperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker2[kchild]; + + if ( chmark != jcol ) { /* Not reached yet */ + marker2[kchild] = jcol; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,k] */ + if ( chperm == EMPTY ) { + lsub[nextl++] = kchild; + if ( nextl >= nzlmax ) { + if ( (mem_error = zLUMemXpand(jcol,nextl, + LSUB,&nzlmax,Glu)) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( chmark != jcolm1 ) jsuper = EMPTY; + } else { + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz[chrep]; + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz[chrep] = chperm; + } else { + /* Continue dfs at super-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L^t) */ + parent[krep] = oldrep; + repfnz[krep] = chperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + } /* else */ + + } /* else */ + + } /* if */ + + } /* while */ + + /* krow has no more unexplored nbrs; + * place supernode-rep krep in postorder DFS. + * backtrack dfs to its parent + */ + segrep[*nseg] = krep; + ++(*nseg); + kpar = parent[krep]; /* Pop from stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xlsub[krep + 1]; + + } while ( kpar != EMPTY ); /* Until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonzero ... */ + + /* Check to see if j belongs in the same supernode as j-1 */ + if ( jcol == 0 ) { /* Do nothing for column 0 */ + nsuper = supno[0] = 0; + } else { + fsupc = xsup[nsuper]; + jptr = xlsub[jcol]; /* Not compressed yet */ + jm1ptr = xlsub[jcolm1]; + + if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; + + /* Always start a new supernode for a singular column */ + if ( nextl == jptr ) jsuper = EMPTY; + + /* Make sure the number of columns in a supernode doesn't + exceed threshold. */ + if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; + + /* If jcol starts a new supernode, reclaim storage space in + * lsub from the previous supernode. Note we only store + * the subscript set of the first columns of the supernode. + */ + if ( jsuper == EMPTY ) { /* starts a new supernode */ + if ( (fsupc < jcolm1) ) { /* >= 2 columns in nsuper */ +#ifdef CHK_COMPRESS + printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); +#endif + ito = xlsub[fsupc+1]; + xlsub[jcolm1] = ito; + xlsub[jcol] = ito; + for (ifrom = jptr; ifrom < nextl; ++ifrom, ++ito) + lsub[ito] = lsub[ifrom]; + nextl = ito; + } + nsuper++; + supno[jcol] = nsuper; + } /* if a new supernode */ + + } /* else: jcol > 0 */ + + /* Tidy up the pointers before exit */ + xsup[nsuper+1] = jcolp1; + supno[jcolp1] = nsuper; + xlsub[jcolp1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/ilu_zcopy_to_ucol.c b/src/maths/SuperLU/ilu_zcopy_to_ucol.c new file mode 100644 index 000000000..59f01ee9c --- /dev/null +++ b/src/maths/SuperLU/ilu_zcopy_to_ucol.c @@ -0,0 +1,211 @@ + +/*! @file ilu_zcopy_to_ucol.c + * \brief Copy a computed column of U to the compressed data structure + * and drop some small entries + * + *
+ * -- SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Laboratory
+ * November, 2010
+ * 
+ */ + +#include + +#ifdef DEBUG +int num_drop_U; +#endif + +extern void zcopy_(int *, doublecomplex [], int *, doublecomplex [], int *); + +#if 0 +static doublecomplex *A; /* used in _compare_ only */ +static int _compare_(const void *a, const void *b) +{ + register int *x = (int *)a, *y = (int *)b; + register double xx = z_abs1(&A[*x]), yy = z_abs1(&A[*y]); + if (xx > yy) return -1; + else if (xx < yy) return 1; + else return 0; +} +#endif + +int +ilu_zcopy_to_ucol( + int jcol, /* in */ + int nseg, /* in */ + int *segrep, /* in */ + int *repfnz, /* in */ + int *perm_r, /* in */ + doublecomplex *dense, /* modified - reset to zero on return */ + int drop_rule,/* in */ + milu_t milu, /* in */ + double drop_tol, /* in */ + int quota, /* maximum nonzero entries allowed */ + doublecomplex *sum, /* out - the sum of dropped entries */ + int *nnzUj, /* in - out */ + GlobalLU_t *Glu, /* modified */ + double *work /* working space with minimum size n, + * used by the second dropping rule */ + ) +{ +/* + * Gather from SPA dense[*] to global ucol[*]. + */ + int ksub, krep, ksupno; + int i, k, kfnz, segsze; + int fsupc, isub, irow; + int jsupno, nextu; + int new_next, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *ucol; + int *usub, *xusub; + int nzumax; + int m; /* number of entries in the nonzero U-segments */ + register double d_max = 0.0, d_min = 1.0 / dlamch_("Safe minimum"); + register double tmp; + doublecomplex zero = {0.0, 0.0}; + int i_1 = 1; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + ucol = Glu->ucol; + usub = Glu->usub; + xusub = Glu->xusub; + nzumax = Glu->nzumax; + + *sum = zero; + if (drop_rule == NODROP) { + drop_tol = -1.0, quota = Glu->n; + } + + jsupno = supno[jcol]; + nextu = xusub[jcol]; + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + krep = segrep[k--]; + ksupno = supno[krep]; + + if ( ksupno != jsupno ) { /* Should go into ucol[] */ + kfnz = repfnz[krep]; + if ( kfnz != EMPTY ) { /* Nonzero U-segment */ + + fsupc = xsup[ksupno]; + isub = xlsub[fsupc] + kfnz - fsupc; + segsze = krep - kfnz + 1; + + new_next = nextu + segsze; + while ( new_next > nzumax ) { + if ((mem_error = zLUMemXpand(jcol, nextu, UCOL, &nzumax, + Glu)) != 0) + return (mem_error); + ucol = Glu->ucol; + if ((mem_error = zLUMemXpand(jcol, nextu, USUB, &nzumax, + Glu)) != 0) + return (mem_error); + usub = Glu->usub; + lsub = Glu->lsub; + } + + for (i = 0; i < segsze; i++) { + irow = lsub[isub++]; + tmp = z_abs1(&dense[irow]); + + /* first dropping rule */ + if (quota > 0 && tmp >= drop_tol) { + if (tmp > d_max) d_max = tmp; + if (tmp < d_min) d_min = tmp; + usub[nextu] = perm_r[irow]; + ucol[nextu] = dense[irow]; + nextu++; + } else { + switch (milu) { + case SMILU_1: + case SMILU_2: + z_add(sum, sum, &dense[irow]); + break; + case SMILU_3: + /* *sum += fabs(dense[irow]);*/ + sum->r += tmp; + break; + case SILU: + default: + break; + } +#ifdef DEBUG + num_drop_U++; +#endif + } + dense[irow] = zero; + } + + } + + } + + } /* for each segment... */ + + xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ + m = xusub[jcol + 1] - xusub[jcol]; + + /* second dropping rule */ + if (drop_rule & DROP_SECONDARY && m > quota) { + register double tol = d_max; + register int m0 = xusub[jcol] + m - 1; + + if (quota > 0) { + if (drop_rule & DROP_INTERP) { + d_max = 1.0 / d_max; d_min = 1.0 / d_min; + tol = 1.0 / (d_max + (d_min - d_max) * quota / m); + } else { + i_1 = xusub[jcol]; + for (i = 0; i < m; ++i, ++i_1) work[i] = z_abs1(&ucol[i_1]); + tol = dqselect(m, work, quota); +#if 0 + A = &ucol[xusub[jcol]]; + for (i = 0; i < m; i++) work[i] = i; + qsort(work, m, sizeof(int), _compare_); + tol = fabs(usub[xusub[jcol] + work[quota]]); +#endif + } + } + for (i = xusub[jcol]; i <= m0; ) { + if (z_abs1(&ucol[i]) <= tol) { + switch (milu) { + case SMILU_1: + case SMILU_2: + z_add(sum, sum, &ucol[i]); + break; + case SMILU_3: + sum->r += tmp; + break; + case SILU: + default: + break; + } + ucol[i] = ucol[m0]; + usub[i] = usub[m0]; + m0--; + m--; +#ifdef DEBUG + num_drop_U++; +#endif + xusub[jcol + 1]--; + continue; + } + i++; + } + } + + if (milu == SMILU_2) { + sum->r = z_abs1(sum); sum->i = 0.0; + } + if (milu == SMILU_3) sum->i = 0.0; + + *nnzUj += m; + + return 0; +} diff --git a/src/maths/SuperLU/ilu_zdrop_row.c b/src/maths/SuperLU/ilu_zdrop_row.c new file mode 100644 index 000000000..9436d0365 --- /dev/null +++ b/src/maths/SuperLU/ilu_zdrop_row.c @@ -0,0 +1,339 @@ + +/*! @file ilu_zdrop_row.c + * \brief Drop small rows from L + * + *
+ * -- SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Laboratory.
+ * June 30, 2009
+ * 
+ */ + +#include +#include +#include + +extern void zswap_(int *, doublecomplex [], int *, doublecomplex [], int *); +extern void zaxpy_(int *, doublecomplex *, doublecomplex [], int *, doublecomplex [], int *); +extern void zcopy_(int *, doublecomplex [], int *, doublecomplex [], int *); +extern double dzasum_(int *, doublecomplex *, int *); +extern double dznrm2_(int *, doublecomplex *, int *); +extern double dnrm2_(int *, double [], int *); +extern int izamax_(int *, doublecomplex [], int *); + +static double *A; /* used in _compare_ only */ +static int _compare_(const void *a, const void *b) +{ + register int *x = (int *)a, *y = (int *)b; + if (A[*x] - A[*y] > 0.0) return -1; + else if (A[*x] - A[*y] < 0.0) return 1; + else return 0; +} + +/*! \brief + *
+ * Purpose
+ * =======
+ *    ilu_zdrop_row() - Drop some small rows from the previous 
+ *    supernode (L-part only).
+ * 
+ */ +int ilu_zdrop_row( + superlu_options_t *options, /* options */ + int first, /* index of the first column in the supernode */ + int last, /* index of the last column in the supernode */ + double drop_tol, /* dropping parameter */ + int quota, /* maximum nonzero entries allowed */ + int *nnzLj, /* in/out number of nonzeros in L(:, 1:last) */ + double *fill_tol, /* in/out - on exit, fill_tol=-num_zero_pivots, + * does not change if options->ILU_MILU != SMILU1 */ + GlobalLU_t *Glu, /* modified */ + double dwork[], /* working space + * the length of dwork[] should be no less than + * the number of rows in the supernode */ + double dwork2[], /* working space with the same size as dwork[], + * used only by the second dropping rule */ + int lastc /* if lastc == 0, there is nothing after the + * working supernode [first:last]; + * if lastc == 1, there is one more column after + * the working supernode. */ ) +{ + register int i, j, k, m1; + register int nzlc; /* number of nonzeros in column last+1 */ + register int xlusup_first, xlsub_first; + int m, n; /* m x n is the size of the supernode */ + int r = 0; /* number of dropped rows */ + register double *temp; + register doublecomplex *lusup = Glu->lusup; + register int *lsub = Glu->lsub; + register int *xlsub = Glu->xlsub; + register int *xlusup = Glu->xlusup; + register double d_max = 0.0, d_min = 1.0; + int drop_rule = options->ILU_DropRule; + milu_t milu = options->ILU_MILU; + norm_t nrm = options->ILU_Norm; + doublecomplex zero = {0.0, 0.0}; + doublecomplex one = {1.0, 0.0}; + doublecomplex none = {-1.0, 0.0}; + int i_1 = 1; + int inc_diag; /* inc_diag = m + 1 */ + int nzp = 0; /* number of zero pivots */ + double alpha = pow((double)(Glu->n), -1.0 / options->ILU_MILU_Dim); + + xlusup_first = xlusup[first]; + xlsub_first = xlsub[first]; + m = xlusup[first + 1] - xlusup_first; + n = last - first + 1; + m1 = m - 1; + inc_diag = m + 1; + nzlc = lastc ? (xlusup[last + 2] - xlusup[last + 1]) : 0; + temp = dwork - n; + + /* Quick return if nothing to do. */ + if (m == 0 || m == n || drop_rule == NODROP) + { + *nnzLj += m * n; + return 0; + } + + /* basic dropping: ILU(tau) */ + for (i = n; i <= m1; ) + { + /* the average abs value of ith row */ + switch (nrm) + { + case ONE_NORM: + temp[i] = dzasum_(&n, &lusup[xlusup_first + i], &m) / (double)n; + break; + case TWO_NORM: + temp[i] = dznrm2_(&n, &lusup[xlusup_first + i], &m) + / sqrt((double)n); + break; + case INF_NORM: + default: + k = izamax_(&n, &lusup[xlusup_first + i], &m) - 1; + temp[i] = z_abs1(&lusup[xlusup_first + i + m * k]); + break; + } + + /* drop small entries due to drop_tol */ + if (drop_rule & DROP_BASIC && temp[i] < drop_tol) + { + r++; + /* drop the current row and move the last undropped row here */ + if (r > 1) /* add to last row */ + { + /* accumulate the sum (for MILU) */ + switch (milu) + { + case SMILU_1: + case SMILU_2: + zaxpy_(&n, &one, &lusup[xlusup_first + i], &m, + &lusup[xlusup_first + m - 1], &m); + break; + case SMILU_3: + for (j = 0; j < n; j++) + lusup[xlusup_first + (m - 1) + j * m].r += + z_abs1(&lusup[xlusup_first + i + j * m]); + break; + case SILU: + default: + break; + } + zcopy_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + } /* if (r > 1) */ + else /* move to last row */ + { + zswap_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + if (milu == SMILU_3) + for (j = 0; j < n; j++) { + lusup[xlusup_first + m1 + j * m].r = + z_abs1(&lusup[xlusup_first + m1 + j * m]); + lusup[xlusup_first + m1 + j * m].i = 0.0; + } + } + lsub[xlsub_first + i] = lsub[xlsub_first + m1]; + m1--; + continue; + } /* if dropping */ + else + { + if (temp[i] > d_max) d_max = temp[i]; + if (temp[i] < d_min) d_min = temp[i]; + } + i++; + } /* for */ + + /* Secondary dropping: drop more rows according to the quota. */ + quota = ceil((double)quota / (double)n); + if (drop_rule & DROP_SECONDARY && m - r > quota) + { + register double tol = d_max; + + /* Calculate the second dropping tolerance */ + if (quota > n) + { + if (drop_rule & DROP_INTERP) /* by interpolation */ + { + d_max = 1.0 / d_max; d_min = 1.0 / d_min; + tol = 1.0 / (d_max + (d_min - d_max) * quota / (m - n - r)); + } + else /* by quick select */ + { + int len = m1 - n + 1; + dcopy_(&len, dwork, &i_1, dwork2, &i_1); + tol = dqselect(len, dwork2, quota - n); +#if 0 + register int *itemp = iwork - n; + A = temp; + for (i = n; i <= m1; i++) itemp[i] = i; + qsort(iwork, m1 - n + 1, sizeof(int), _compare_); + tol = temp[itemp[quota]]; +#endif + } + } + + for (i = n; i <= m1; ) + { + if (temp[i] <= tol) + { + register int j; + r++; + /* drop the current row and move the last undropped row here */ + if (r > 1) /* add to last row */ + { + /* accumulate the sum (for MILU) */ + switch (milu) + { + case SMILU_1: + case SMILU_2: + zaxpy_(&n, &one, &lusup[xlusup_first + i], &m, + &lusup[xlusup_first + m - 1], &m); + break; + case SMILU_3: + for (j = 0; j < n; j++) + lusup[xlusup_first + (m - 1) + j * m].r += + z_abs1(&lusup[xlusup_first + i + j * m]); + break; + case SILU: + default: + break; + } + zcopy_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + } /* if (r > 1) */ + else /* move to last row */ + { + zswap_(&n, &lusup[xlusup_first + m1], &m, + &lusup[xlusup_first + i], &m); + if (milu == SMILU_3) + for (j = 0; j < n; j++) { + lusup[xlusup_first + m1 + j * m].r = + z_abs1(&lusup[xlusup_first + m1 + j * m]); + lusup[xlusup_first + m1 + j * m].i = 0.0; + } + } + lsub[xlsub_first + i] = lsub[xlsub_first + m1]; + m1--; + temp[i] = temp[m1]; + + continue; + } + i++; + + } /* for */ + + } /* if secondary dropping */ + + for (i = n; i < m; i++) temp[i] = 0.0; + + if (r == 0) + { + *nnzLj += m * n; + return 0; + } + + /* add dropped entries to the diagnal */ + if (milu != SILU) + { + register int j; + doublecomplex t; + double omega; + for (j = 0; j < n; j++) + { + t = lusup[xlusup_first + (m - 1) + j * m]; + if (t.r == 0.0 && t.i == 0.0) continue; + omega = SUPERLU_MIN(2.0 * (1.0 - alpha) / z_abs1(&t), 1.0); + zd_mult(&t, &t, omega); + + switch (milu) + { + case SMILU_1: + if ( !(z_eq(&t, &none)) ) { + z_add(&t, &t, &one); + zz_mult(&lusup[xlusup_first + j * inc_diag], + &lusup[xlusup_first + j * inc_diag], + &t); + } + else + { + zd_mult( + &lusup[xlusup_first + j * inc_diag], + &lusup[xlusup_first + j * inc_diag], + *fill_tol); +#ifdef DEBUG + printf("[1] ZERO PIVOT: FILL col %d.\n", first + j); + fflush(stdout); +#endif + nzp++; + } + break; + case SMILU_2: + zd_mult(&lusup[xlusup_first + j * inc_diag], + &lusup[xlusup_first + j * inc_diag], + 1.0 + z_abs1(&t)); + break; + case SMILU_3: + z_add(&t, &t, &one); + zz_mult(&lusup[xlusup_first + j * inc_diag], + &lusup[xlusup_first + j * inc_diag], + &t); + break; + case SILU: + default: + break; + } + } + if (nzp > 0) *fill_tol = -nzp; + } + + /* Remove dropped entries from the memory and fix the pointers. */ + m1 = m - r; + for (j = 1; j < n; j++) + { + register int tmp1, tmp2; + tmp1 = xlusup_first + j * m1; + tmp2 = xlusup_first + j * m; + for (i = 0; i < m1; i++) + lusup[i + tmp1] = lusup[i + tmp2]; + } + for (i = 0; i < nzlc; i++) + lusup[xlusup_first + i + n * m1] = lusup[xlusup_first + i + n * m]; + for (i = 0; i < nzlc; i++) + lsub[xlsub[last + 1] - r + i] = lsub[xlsub[last + 1] + i]; + for (i = first + 1; i <= last + 1; i++) + { + xlusup[i] -= r * (i - first); + xlsub[i] -= r; + } + if (lastc) + { + xlusup[last + 2] -= r * n; + xlsub[last + 2] -= r; + } + + *nnzLj += (m - r) * n; + return r; +} diff --git a/src/maths/SuperLU/ilu_zpanel_dfs.c b/src/maths/SuperLU/ilu_zpanel_dfs.c new file mode 100644 index 000000000..471c64751 --- /dev/null +++ b/src/maths/SuperLU/ilu_zpanel_dfs.c @@ -0,0 +1,248 @@ + +/*! @file ilu_zpanel_dfs.c + * \brief Peforms a symbolic factorization on a panel of symbols and + * record the entries with maximum absolute value in each column + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 30, 2009
+ * 
+ */ + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ *   Performs a symbolic factorization on a panel of columns [jcol, jcol+w).
+ *
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives.
+ *
+ *   The routine returns one list of the supernodal representatives
+ *   in topological order of the dfs that generates them. This list is
+ *   a superset of the topological order of each individual column within
+ *   the panel.
+ *   The location of the first nonzero in each supernodal segment
+ *   (supernodal entry location) is also returned. Each column has a
+ *   separate list for this purpose.
+ *
+ *   Two marker arrays are used for dfs:
+ *     marker[i] == jj, if i was visited during dfs of current column jj;
+ *     marker1[i] >= jcol, if i was visited by earlier columns in this panel;
+ *
+ *   marker: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ * 
+ */ +void +ilu_zpanel_dfs( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + SuperMatrix *A, /* in - original matrix */ + int *perm_r, /* in */ + int *nseg, /* out */ + doublecomplex *dense, /* out */ + double *amax, /* out - max. abs. value of each column in panel */ + int *panel_lsub, /* out */ + int *segrep, /* out */ + int *repfnz, /* out */ + int *marker, /* out */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ +) +{ + + NCPformat *Astore; + doublecomplex *a; + int *asub; + int *xa_begin, *xa_end; + int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; + int k, krow, kmark, kperm; + int xdfs, maxdfs, kpar; + int jj; /* index through each column in the panel */ + int *marker1; /* marker1[jj] >= jcol if vertex jj was visited + by a previous column within this panel. */ + int *repfnz_col; /* start of each column in the panel */ + doublecomplex *dense_col; /* start of each column in the panel */ + int nextl_col; /* next available position in panel_lsub[*,jj] */ + int *xsup, *supno; + int *lsub, *xlsub; + double *amax_col; + register double tmp; + + /* Initialize pointers */ + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + marker1 = marker + m; + repfnz_col = repfnz; + dense_col = dense; + amax_col = amax; + *nseg = 0; + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + + /* For each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++) { + nextl_col = (jj - jcol) * m; + +#ifdef CHK_DFS + printf("\npanel col %d: ", jj); +#endif + + *amax_col = 0.0; + /* For each nonz in A[*,jj] do dfs */ + for (k = xa_begin[jj]; k < xa_end[jj]; k++) { + krow = asub[k]; + tmp = z_abs1(&a[k]); + if (tmp > *amax_col) *amax_col = tmp; + dense_col[krow] = a[k]; + kmark = marker[krow]; + if ( kmark == jj ) + continue; /* krow visited before, go to the next nonzero */ + + /* For each unmarked nbr krow of jj + * krow is in L: place it in structure of L[*,jj] + */ + marker[krow] = jj; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ + } + /* + * krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + else { + + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz_col[krep]; + +#ifdef CHK_DFS + printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); +#endif + if ( myfnz != EMPTY ) { /* Representative visited before */ + if ( myfnz > kperm ) repfnz_col[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz_col[krep] = kperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; + +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker[kchild]; + + if ( chmark != jj ) { /* Not reached yet */ + marker[kchild] = jj; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,j] */ + if ( chperm == EMPTY ) { + panel_lsub[nextl_col++] = kchild; + } + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + else { + + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz_col[chrep]; +#ifdef CHK_DFS + printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); +#endif + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz_col[chrep] = chperm; + } + else { + /* Cont. dfs at snode-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L) */ + parent[krep] = oldrep; + repfnz_col[krep] = chperm; + xdfs = xlsub[xsup[supno[krep]]]; + maxdfs = xlsub[krep + 1]; +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } /* else */ + + } /* else */ + + } /* if... */ + + } /* while xdfs < maxdfs */ + + /* krow has no more unexplored nbrs: + * Place snode-rep krep in postorder DFS, if this + * segment is seen for the first time. (Note that + * "repfnz[krep]" may change later.) + * Backtrack dfs to its parent. + */ + if ( marker1[krep] < jcol ) { + segrep[*nseg] = krep; + ++(*nseg); + marker1[krep] = jj; + } + + kpar = parent[krep]; /* Pop stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xlsub[krep + 1]; + +#ifdef CHK_DFS + printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } while ( kpar != EMPTY ); /* do-while - until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonz in A[*,jj] */ + + repfnz_col += m; /* Move to next column */ + dense_col += m; + amax_col++; + + } /* for jj ... */ + +} diff --git a/src/maths/SuperLU/ilu_zpivotL.c b/src/maths/SuperLU/ilu_zpivotL.c new file mode 100644 index 000000000..bf658b90a --- /dev/null +++ b/src/maths/SuperLU/ilu_zpivotL.c @@ -0,0 +1,274 @@ + +/*! @file ilu_zpivotL.c + * \brief Performs numerical pivoting + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 30, 2009
+ * 
+ */ + + +#include +#include +#include + +#ifndef SGN +#define SGN(x) ((x)>=0?1:-1) +#endif + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Performs the numerical pivoting on the current column of L,
+ *   and the CDIV operation.
+ *
+ *   Pivot policy:
+ *   (1) Compute thresh = u * max_(i>=j) abs(A_ij);
+ *   (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
+ *	     pivot row = k;
+ *	 ELSE IF abs(A_jj) >= thresh THEN
+ *	     pivot row = j;
+ *	 ELSE
+ *	     pivot row = m;
+ *
+ *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
+ *
+ *   Return value: 0	  success;
+ *		   i > 0  U(i,i) is exactly zero.
+ * 
+ */ + +int +ilu_zpivotL( + const int jcol, /* in */ + const double u, /* in - diagonal pivoting threshold */ + int *usepr, /* re-use the pivot sequence given by + * perm_r/iperm_r */ + int *perm_r, /* may be modified */ + int diagind, /* diagonal of Pc*A*Pc' */ + int *swap, /* in/out record the row permutation */ + int *iswap, /* in/out inverse of swap, it is the same as + perm_r after the factorization */ + int *marker, /* in */ + int *pivrow, /* in/out, as an input if *usepr!=0 */ + double fill_tol, /* in - fill tolerance of current column + * used for a singular column */ + milu_t milu, /* in */ + doublecomplex drop_sum, /* in - computed in ilu_zcopy_to_ucol() + (MILU only) */ + GlobalLU_t *Glu, /* modified - global LU data structures */ + SuperLUStat_t *stat /* output */ + ) +{ + + int n; /* number of columns */ + int fsupc; /* first column in the supernode */ + int nsupc; /* no of columns in the supernode */ + int nsupr; /* no of rows in the supernode */ + int lptr; /* points to the starting subscript of the supernode */ + register int pivptr; + int old_pivptr, diag, ptr0; + register double pivmax, rtemp; + double thresh; + doublecomplex temp; + doublecomplex *lu_sup_ptr; + doublecomplex *lu_col_ptr; + int *lsub_ptr; + register int isub, icol, k, itemp; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + flops_t *ops = stat->ops; + int info; + doublecomplex one = {1.0, 0.0}; + + /* Initialize pointers */ + n = Glu->n; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; + nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ + lptr = xlsub[fsupc]; + nsupr = xlsub[fsupc+1] - lptr; + lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ + lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ + lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ + + /* Determine the largest abs numerical value for partial pivoting; + Also search for user-specified pivot, and diagonal element. */ + pivmax = -1.0; + pivptr = nsupc; + diag = EMPTY; + old_pivptr = nsupc; + ptr0 = EMPTY; + for (isub = nsupc; isub < nsupr; ++isub) { + if (marker[lsub_ptr[isub]] > jcol) + continue; /* do not overlap with a later relaxed supernode */ + + switch (milu) { + case SMILU_1: + z_add(&temp, &lu_col_ptr[isub], &drop_sum); + rtemp = z_abs1(&temp); + break; + case SMILU_2: + case SMILU_3: + /* In this case, drop_sum contains the sum of the abs. value */ + rtemp = z_abs1(&lu_col_ptr[isub]); + break; + case SILU: + default: + rtemp = z_abs1(&lu_col_ptr[isub]); + break; + } + if (rtemp > pivmax) { pivmax = rtemp; pivptr = isub; } + if (*usepr && lsub_ptr[isub] == *pivrow) old_pivptr = isub; + if (lsub_ptr[isub] == diagind) diag = isub; + if (ptr0 == EMPTY) ptr0 = isub; + } + + if (milu == SMILU_2 || milu == SMILU_3) pivmax += drop_sum.r; + + /* Test for singularity */ + if (pivmax < 0.0) { + fprintf(stderr, "[0]: jcol=%d, SINGULAR!!!\n", jcol); + fflush(stderr); + exit(1); + } + if ( pivmax == 0.0 ) { + if (diag != EMPTY) + *pivrow = lsub_ptr[pivptr = diag]; + else if (ptr0 != EMPTY) + *pivrow = lsub_ptr[pivptr = ptr0]; + else { + /* look for the first row which does not + belong to any later supernodes */ + for (icol = jcol; icol < n; icol++) + if (marker[swap[icol]] <= jcol) break; + if (icol >= n) { + fprintf(stderr, "[1]: jcol=%d, SINGULAR!!!\n", jcol); + fflush(stderr); + exit(1); + } + + *pivrow = swap[icol]; + + /* pick up the pivot row */ + for (isub = nsupc; isub < nsupr; ++isub) + if ( lsub_ptr[isub] == *pivrow ) { pivptr = isub; break; } + } + pivmax = fill_tol; + lu_col_ptr[pivptr].r = pivmax; + lu_col_ptr[pivptr].i = 0.0; + *usepr = 0; +#ifdef DEBUG + printf("[0] ZERO PIVOT: FILL (%d, %d).\n", *pivrow, jcol); + fflush(stdout); +#endif + info =jcol + 1; + } /* if (*pivrow == 0.0) */ + else { + thresh = u * pivmax; + + /* Choose appropriate pivotal element by our policy. */ + if ( *usepr ) { + switch (milu) { + case SMILU_1: + z_add(&temp, &lu_col_ptr[old_pivptr], &drop_sum); + rtemp = z_abs1(&temp); + break; + case SMILU_2: + case SMILU_3: + rtemp = z_abs1(&lu_col_ptr[old_pivptr]) + drop_sum.r; + break; + case SILU: + default: + rtemp = z_abs1(&lu_col_ptr[old_pivptr]); + break; + } + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = old_pivptr; + else *usepr = 0; + } + if ( *usepr == 0 ) { + /* Use diagonal pivot? */ + if ( diag >= 0 ) { /* diagonal exists */ + switch (milu) { + case SMILU_1: + z_add(&temp, &lu_col_ptr[diag], &drop_sum); + rtemp = z_abs1(&temp); + break; + case SMILU_2: + case SMILU_3: + rtemp = z_abs1(&lu_col_ptr[diag]) + drop_sum.r; + break; + case SILU: + default: + rtemp = z_abs1(&lu_col_ptr[diag]); + break; + } + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; + } + *pivrow = lsub_ptr[pivptr]; + } + info = 0; + + /* Reset the diagonal */ + switch (milu) { + case SMILU_1: + z_add(&lu_col_ptr[pivptr], &lu_col_ptr[pivptr], &drop_sum); + break; + case SMILU_2: + case SMILU_3: + temp = z_sgn(&lu_col_ptr[pivptr]); + zz_mult(&temp, &temp, &drop_sum); + z_add(&lu_col_ptr[pivptr], &lu_col_ptr[pivptr], &drop_sum); + break; + case SILU: + default: + break; + } + + } /* else */ + + /* Record pivot row */ + perm_r[*pivrow] = jcol; + if (jcol < n - 1) { + register int t1, t2, t; + t1 = iswap[*pivrow]; t2 = jcol; + if (t1 != t2) { + t = swap[t1]; swap[t1] = swap[t2]; swap[t2] = t; + t1 = swap[t1]; t2 = t; + t = iswap[t1]; iswap[t1] = iswap[t2]; iswap[t2] = t; + } + } /* if (jcol < n - 1) */ + + /* Interchange row subscripts */ + if ( pivptr != nsupc ) { + itemp = lsub_ptr[pivptr]; + lsub_ptr[pivptr] = lsub_ptr[nsupc]; + lsub_ptr[nsupc] = itemp; + + /* Interchange numerical values as well, for the whole snode, such + * that L is indexed the same way as A. + */ + for (icol = 0; icol <= nsupc; icol++) { + itemp = pivptr + icol * nsupr; + temp = lu_sup_ptr[itemp]; + lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; + lu_sup_ptr[nsupc + icol*nsupr] = temp; + } + } /* if */ + + /* cdiv operation */ + ops[FACT] += 10 * (nsupr - nsupc); + z_div(&temp, &one, &lu_col_ptr[nsupc]); + for (k = nsupc+1; k < nsupr; k++) + zz_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp); + + return info; +} diff --git a/src/maths/SuperLU/ilu_zsnode_dfs.c b/src/maths/SuperLU/ilu_zsnode_dfs.c new file mode 100644 index 000000000..0a8975819 --- /dev/null +++ b/src/maths/SuperLU/ilu_zsnode_dfs.c @@ -0,0 +1,90 @@ + +/*! @file ilu_zsnode_dfs.c + * \brief Determines the union of row structures of columns within the relaxed node + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 30, 2009
+ * 
+ */ + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *    ilu_zsnode_dfs() - Determine the union of the row structures of those
+ *    columns within the relaxed snode.
+ *    Note: The relaxed snodes are leaves of the supernodal etree, therefore,
+ *    the portion outside the rectangular supernode must be zero.
+ *
+ * Return value
+ * ============
+ *     0   success;
+ *    >0   number of bytes allocated when run out of memory.
+ * 
+ */ + +int +ilu_zsnode_dfs( + const int jcol, /* in - start of the supernode */ + const int kcol, /* in - end of the supernode */ + const int *asub, /* in */ + const int *xa_begin, /* in */ + const int *xa_end, /* in */ + int *marker, /* modified */ + GlobalLU_t *Glu /* modified */ + ) +{ + + register int i, k, nextl; + int nsuper, krow, kmark, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + int nzlmax; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + nsuper = ++supno[jcol]; /* Next available supernode number */ + nextl = xlsub[jcol]; + + for (i = jcol; i <= kcol; i++) + { + /* For each nonzero in A[*,i] */ + for (k = xa_begin[i]; k < xa_end[i]; k++) + { + krow = asub[k]; + kmark = marker[krow]; + if ( kmark != kcol ) + { /* First time visit krow */ + marker[krow] = kcol; + lsub[nextl++] = krow; + if ( nextl >= nzlmax ) + { + if ( (mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, + Glu)) != 0) + return (mem_error); + lsub = Glu->lsub; + } + } + } + supno[i] = nsuper; + } + + /* Supernode > 1 */ + if ( jcol < kcol ) + for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; + + xsup[nsuper+1] = kcol + 1; + supno[kcol+1] = nsuper; + xlsub[kcol+1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/izmax1.c b/src/maths/SuperLU/izmax1.c new file mode 100644 index 000000000..13272a9d6 --- /dev/null +++ b/src/maths/SuperLU/izmax1.c @@ -0,0 +1,113 @@ +/*! @file izmax1.c + * \brief Finds the index of the element whose real part has maximum absolute value + * + *
+ *     -- LAPACK auxiliary routine (version 2.0) --   
+ *     Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
+ *     Courant Institute, Argonne National Lab, and Rice University   
+ *     October 31, 1992   
+ * 
+ */ +#include +#include +#include + +/*! \brief + +
+    Purpose   
+    =======   
+
+    IZMAX1 finds the index of the element whose real part has maximum   
+    absolute value.   
+
+    Based on IZAMAX from Level 1 BLAS.   
+    The change is to use the 'genuine' absolute value.   
+
+    Contributed by Nick Higham for use with ZLACON.   
+
+    Arguments   
+    =========   
+
+    N       (input) INT   
+            The number of elements in the vector CX.   
+
+    CX      (input) COMPLEX*16 array, dimension (N)   
+            The vector whose elements will be summed.   
+
+    INCX    (input) INT   
+            The spacing between successive values of CX.  INCX >= 1.   
+
+   ===================================================================== 
+
+*/ + +int +izmax1_(int *n, doublecomplex *cx, int *incx) +{ + + + /* System generated locals */ + int ret_val, i__1, i__2; + double d__1; + + /* Local variables */ + double smax; + int i, ix; + +#define CX(I) cx[(I)-1] + + ret_val = 0; + if (*n < 1) { + return ret_val; + } + ret_val = 1; + if (*n == 1) { + return ret_val; + } + if (*incx == 1) { + goto L30; + } + +/* CODE FOR INCREMENT NOT EQUAL TO 1 */ + + ix = 1; + smax = (d__1 = CX(1).r, fabs(d__1)); + ix += *incx; + i__1 = *n; + for (i = 2; i <= *n; ++i) { + i__2 = ix; + if ((d__1 = CX(ix).r, fabs(d__1)) <= smax) { + goto L10; + } + ret_val = i; + i__2 = ix; + smax = (d__1 = CX(ix).r, fabs(d__1)); +L10: + ix += *incx; +/* L20: */ + } + return ret_val; + +/* CODE FOR INCREMENT EQUAL TO 1 */ + +L30: + smax = (d__1 = CX(1).r, fabs(d__1)); + i__1 = *n; + for (i = 2; i <= *n; ++i) { + i__2 = i; + if ((d__1 = CX(i).r, fabs(d__1)) <= smax) { + goto L40; + } + ret_val = i; + i__2 = i; + smax = (d__1 = CX(i).r, fabs(d__1)); +L40: + ; + } + return ret_val; + +/* End of IZMAX1 */ + +} /* izmax1_ */ + diff --git a/src/maths/SuperLU/lsame.c b/src/maths/SuperLU/lsame.c new file mode 100644 index 000000000..cd7fa00d0 --- /dev/null +++ b/src/maths/SuperLU/lsame.c @@ -0,0 +1,83 @@ +/*! @file lsame.c + * \brief Check if CA is the same letter as CB regardless of case. + * + *
+ * -- LAPACK auxiliary routine (version 2.0) --   
+ *      Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
+ *      Courant Institute, Argonne National Lab, and Rice University   
+ *      September 30, 1994   
+ * 
+ */ +#include + +/*! \brief + +
+    Purpose   
+    =======   
+
+    LSAME returns .TRUE. if CA is the same letter as CB regardless of case.   
+
+    Arguments   
+    =========   
+
+    CA      (input) CHARACTER*1   
+    CB      (input) CHARACTER*1   
+            CA and CB specify the single characters to be compared.   
+
+   ===================================================================== 
+
+*/ + +int lsame_(char *ca, char *cb) +{ + + + /* System generated locals */ + int ret_val; + + /* Local variables */ + int inta, intb, zcode; + + ret_val = *(unsigned char *)ca == *(unsigned char *)cb; + if (ret_val) { + return ret_val; + } + + /* Now test for equivalence if both characters are alphabetic. */ + + zcode = 'Z'; + + /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime + machines, on which ICHAR returns a value with bit 8 set. + ICHAR('A') on Prime machines returns 193 which is the same as + ICHAR('A') on an EBCDIC machine. */ + + inta = *(unsigned char *)ca; + intb = *(unsigned char *)cb; + + if (zcode == 90 || zcode == 122) { + /* ASCII is assumed - ZCODE is the ASCII code of either lower or + upper case 'Z'. */ + if (inta >= 97 && inta <= 122) inta += -32; + if (intb >= 97 && intb <= 122) intb += -32; + + } else if (zcode == 233 || zcode == 169) { + /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or + upper case 'Z'. */ + if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta + >= 162 && inta <= 169) + inta += 64; + if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb + >= 162 && intb <= 169) + intb += 64; + } else if (zcode == 218 || zcode == 250) { + /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code + plus 128 of either lower or upper case 'Z'. */ + if (inta >= 225 && inta <= 250) inta += -32; + if (intb >= 225 && intb <= 250) intb += -32; + } + ret_val = inta == intb; + return ret_val; + +} /* lsame_ */ diff --git a/src/maths/SuperLU/mark_relax.c b/src/maths/SuperLU/mark_relax.c new file mode 100644 index 000000000..16664dcd5 --- /dev/null +++ b/src/maths/SuperLU/mark_relax.c @@ -0,0 +1,47 @@ +/*! @file mark_relax.c + * \brief Record the rows pivoted by the relaxed supernodes. + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 1, 2009
+ * <\pre>
+ */
+#include 
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *    mark_relax() - record the rows used by the relaxed supernodes.
+ * 
+ */ +int mark_relax( + int n, /* order of the matrix A */ + int *relax_end, /* last column in a relaxed supernode. + * if j-th column starts a relaxed supernode, + * relax_end[j] represents the last column of + * this supernode. */ + int *relax_fsupc, /* first column in a relaxed supernode. + * relax_fsupc[j] represents the first column of + * j-th supernode. */ + int *xa_begin, /* Astore->colbeg */ + int *xa_end, /* Astore->colend */ + int *asub, /* row index of A */ + int *marker /* marker[j] is the maximum column index if j-th + * row belongs to a relaxed supernode. */ ) +{ + register int jcol, kcol; + register int i, j, k; + + for (i = 0; i < n && relax_fsupc[i] != EMPTY; i++) + { + jcol = relax_fsupc[i]; /* first column */ + kcol = relax_end[jcol]; /* last column */ + for (j = jcol; j <= kcol; j++) + for (k = xa_begin[j]; k < xa_end[j]; k++) + marker[asub[k]] = jcol; + } + return i; +} diff --git a/src/maths/SuperLU/mc64ad.c b/src/maths/SuperLU/mc64ad.c new file mode 100644 index 000000000..160155529 --- /dev/null +++ b/src/maths/SuperLU/mc64ad.c @@ -0,0 +1,2641 @@ +/* mc64ad.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include + +#define abs(a) ((a) >= 0) ? (a) : -(a) +#define min(a,b) ((a) < (b)) ? (a) : (b) + +/* Table of constant values */ + +static int_t c__1 = 1; +static int_t c__2 = 2; + +/* CCCC COPYRIGHT (c) 1999 Council for the Central Laboratory of the */ +/* CCCC Research Councils. All rights reserved. */ +/* CCCC PACKAGE MC64A/AD */ +/* CCCC AUTHORS Iain Duff (i.duff@rl.ac.uk) and Jacko Koster (jak@ii.uib.no) */ +/* CCCC LAST UPDATE 20/09/99 */ +/* CCCC */ +/* *** Conditions on external use *** */ + +/* The user shall acknowledge the contribution of this */ +/* package in any publication of material dependent upon the use of */ +/* the package. The user shall use reasonable endeavours to notify */ +/* the authors of the package of this publication. */ + +/* The user can modify this code but, at no time */ +/* shall the right or title to all or any part of this package pass */ +/* to the user. The user shall make available free of charge */ +/* to the authors for any purpose all information relating to any */ +/* alteration or addition made to this package for the purposes of */ +/* extending the capabilities or enhancing the performance of this */ +/* package. */ + +/* The user shall not pass this code directly to a third party without the */ +/* express prior consent of the authors. Users wanting to licence their */ +/* own copy of these routines should send email to hsl@aeat.co.uk */ + +/* None of the comments from the Copyright notice up to and including this */ +/* one shall be removed or altered in any way. */ +/* ********************************************************************** */ +/* Subroutine */ int_t mc64id_(int_t *icntl) +{ + int_t i__; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* Purpose */ +/* ======= */ + +/* The components of the array ICNTL control the action of MC64A/AD. */ +/* Default values for these are set in this subroutine. */ + +/* Parameters */ +/* ========== */ + + +/* Local variables */ + +/* ICNTL(1) has default value 6. */ +/* It is the output stream for error messages. If it */ +/* is negative, these messages will be suppressed. */ + +/* ICNTL(2) has default value 6. */ +/* It is the output stream for warning messages. */ +/* If it is negative, these messages are suppressed. */ + +/* ICNTL(3) has default value -1. */ +/* It is the output stream for monitoring printing. */ +/* If it is negative, these messages are suppressed. */ + +/* ICNTL(4) has default value 0. */ +/* If left at the defaut value, the incoming data is checked for */ +/* out-of-range indices and duplicates. Setting ICNTL(4) to any */ +/* other will avoid the checks but is likely to cause problems */ +/* later if out-of-range indices or duplicates are present. */ +/* The user should only set ICNTL(4) non-zero, if the data is */ +/* known to avoid these problems. */ + +/* ICNTL(5) to ICNTL(10) are not used by MC64A/AD but are set to */ +/* zero in this routine. */ +/* Initialization of the ICNTL array. */ + /* Parameter adjustments */ + --icntl; + + /* Function Body */ + icntl[1] = 6; + icntl[2] = 6; + icntl[3] = -1; + for (i__ = 4; i__ <= 10; ++i__) { + icntl[i__] = 0; +/* L10: */ + } + return 0; +} /* mc64id_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64ad_(int_t *job, int_t *n, int_t *ne, int_t * + ip, int_t *irn, double *a, int_t *num, int_t *cperm, + int_t *liw, int_t *iw, int_t *ldw, double *dw, int_t * + icntl, int_t *info) +{ + /* System generated locals */ + int_t i__1, i__2; + double d__1, d__2; + + /* Builtin functions */ + double log(double); + + /* Local variables */ + int_t i__, j, k; + double fact, rinf; + + extern /* Subroutine */ int_t mc21ad_(int_t *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *, int_t *), mc64bd_( + int_t *, int_t *, int_t *, int_t *, double *, int_t + *, int_t *, int_t *, int_t *, int_t *, int_t *, + double *), mc64rd_(int_t *, int_t *, int_t *, int_t *, + double *), mc64sd_(int_t *, int_t *, int_t *, int_t * + , double *, int_t *, int_t *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *, int_t *), mc64wd_( + int_t *, int_t *, int_t *, int_t *, double *, int_t + *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t + *, double *, double *); + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* Purpose */ +/* ======= */ + +/* This subroutine attempts to find a column permutation for an NxN */ +/* sparse matrix A = {a_ij} that makes the permuted matrix have N */ +/* entries on its diagonal. */ +/* If the matrix is structurally nonsingular, the subroutine optionally */ +/* returns a column permutation that maximizes the smallest element */ +/* on the diagonal, maximizes the sum of the diagonal entries, or */ +/* maximizes the product of the diagonal entries of the permuted matrix. */ +/* For the latter option, the subroutine also finds scaling factors */ +/* that may be used to scale the matrix so that the nonzero diagonal */ +/* entries of the permuted matrix are one in absolute value and all the */ +/* off-diagonal entries are less than or equal to one in absolute value. */ +/* The natural logarithms of the scaling factors u(i), i=1..N, for the */ +/* rows and v(j), j=1..N, for the columns are returned so that the */ +/* scaled matrix B = {b_ij} has entries b_ij = a_ij * EXP(u_i + v_j). */ + +/* Parameters */ +/* ========== */ + + +/* JOB is an INT_T variable which must be set by the user to */ +/* control the action. It is not altered by the subroutine. */ +/* Possible values for JOB are: */ +/* 1 Compute a column permutation of the matrix so that the */ +/* permuted matrix has as many entries on its diagonal as possible. */ +/* The values on the diagonal are of arbitrary size. HSL subroutine */ +/* MC21A/AD is used for this. See [1]. */ +/* 2 Compute a column permutation of the matrix so that the smallest */ +/* value on the diagonal of the permuted matrix is maximized. */ +/* See [3]. */ +/* 3 Compute a column permutation of the matrix so that the smallest */ +/* value on the diagonal of the permuted matrix is maximized. */ +/* The algorithm differs from the one used for JOB = 2 and may */ +/* have quite a different performance. See [2]. */ +/* 4 Compute a column permutation of the matrix so that the sum */ +/* of the diagonal entries of the permuted matrix is maximized. */ +/* See [3]. */ +/* 5 Compute a column permutation of the matrix so that the product */ +/* of the diagonal entries of the permuted matrix is maximized */ +/* and vectors to scale the matrix so that the nonzero diagonal */ +/* entries of the permuted matrix are one in absolute value and */ +/* all the off-diagonal entries are less than or equal to one in */ +/* absolute value. See [3]. */ +/* Restriction: 1 <= JOB <= 5. */ + +/* N is an INT_T variable which must be set by the user to the */ +/* order of the matrix A. It is not altered by the subroutine. */ +/* Restriction: N >= 1. */ + +/* NE is an INT_T variable which must be set by the user to the */ +/* number of entries in the matrix. It is not altered by the */ +/* subroutine. */ +/* Restriction: NE >= 1. */ + +/* IP is an INT_T array of length N+1. */ +/* IP(J), J=1..N, must be set by the user to the position in array IRN */ +/* of the first row index of an entry in column J. IP(N+1) must be set */ +/* to NE+1. It is not altered by the subroutine. */ + +/* IRN is an INT_T array of length NE. */ +/* IRN(K), K=1..NE, must be set by the user to hold the row indices of */ +/* the entries of the matrix. Those belonging to column J must be */ +/* stored contiguously in the positions IP(J)..IP(J+1)-1. The ordering */ +/* of the row indices within each column is unimportant. Repeated */ +/* entries are not allowed. The array IRN is not altered by the */ +/* subroutine. */ + +/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */ +/* The user must set A(K), K=1..NE, to the numerical value of the */ +/* entry that corresponds to IRN(K). */ +/* It is not used by the subroutine when JOB = 1. */ +/* It is not altered by the subroutine. */ + +/* NUM is an INT_T variable that need not be set by the user. */ +/* On successful exit, NUM will be the number of entries on the */ +/* diagonal of the permuted matrix. */ +/* If NUM < N, the matrix is structurally singular. */ + +/* CPERM is an INT_T array of length N that need not be set by the */ +/* user. On successful exit, CPERM contains the column permutation. */ +/* Column CPERM(J) of the original matrix is column J in the permuted */ +/* matrix, J=1..N. */ + +/* LIW is an INT_T variable that must be set by the user to */ +/* the dimension of array IW. It is not altered by the subroutine. */ +/* Restriction: */ +/* JOB = 1 : LIW >= 5N */ +/* JOB = 2 : LIW >= 4N */ +/* JOB = 3 : LIW >= 10N + NE */ +/* JOB = 4 : LIW >= 5N */ +/* JOB = 5 : LIW >= 5N */ + +/* IW is an INT_T array of length LIW that is used for workspace. */ + +/* LDW is an INT_T variable that must be set by the user to the */ +/* dimension of array DW. It is not altered by the subroutine. */ +/* Restriction: */ +/* JOB = 1 : LDW is not used */ +/* JOB = 2 : LDW >= N */ +/* JOB = 3 : LDW >= NE */ +/* JOB = 4 : LDW >= 2N + NE */ +/* JOB = 5 : LDW >= 3N + NE */ + +/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length LDW */ +/* that is used for workspace. If JOB = 5, on return, */ +/* DW(i) contains u_i, i=1..N, and DW(N+j) contains v_j, j=1..N. */ + +/* ICNTL is an INT_T array of length 10. Its components control the */ +/* output of MC64A/AD and must be set by the user before calling */ +/* MC64A/AD. They are not altered by the subroutine. */ + +/* ICNTL(1) must be set to specify the output stream for */ +/* error messages. If ICNTL(1) < 0, messages are suppressed. */ +/* The default value set by MC46I/ID is 6. */ + +/* ICNTL(2) must be set by the user to specify the output stream for */ +/* warning messages. If ICNTL(2) < 0, messages are suppressed. */ +/* The default value set by MC46I/ID is 6. */ + +/* ICNTL(3) must be set by the user to specify the output stream for */ +/* diagnostic messages. If ICNTL(3) < 0, messages are suppressed. */ +/* The default value set by MC46I/ID is -1. */ + +/* ICNTL(4) must be set by the user to a value other than 0 to avoid */ +/* checking of the input data. */ +/* The default value set by MC46I/ID is 0. */ + +/* INFO is an INT_T array of length 10 which need not be set by the */ +/* user. INFO(1) is set non-negative to indicate success. A negative */ +/* value is returned if an error occurred, a positive value if a */ +/* warning occurred. INFO(2) holds further information on the error. */ +/* On exit from the subroutine, INFO(1) will take one of the */ +/* following values: */ +/* 0 : successful entry (for structurally nonsingular matrix). */ +/* +1 : successful entry (for structurally singular matrix). */ +/* +2 : the returned scaling factors are large and may cause */ +/* overflow when used to scale the matrix. */ +/* (For JOB = 5 entry only.) */ +/* -1 : JOB < 1 or JOB > 5. Value of JOB held in INFO(2). */ +/* -2 : N < 1. Value of N held in INFO(2). */ +/* -3 : NE < 1. Value of NE held in INFO(2). */ +/* -4 : the defined length LIW violates the restriction on LIW. */ +/* Value of LIW required given by INFO(2). */ +/* -5 : the defined length LDW violates the restriction on LDW. */ +/* Value of LDW required given by INFO(2). */ +/* -6 : entries are found whose row indices are out of range. INFO(2) */ +/* contains the index of a column in which such an entry is found. */ +/* -7 : repeated entries are found. INFO(2) contains the index of a */ +/* column in which such entries are found. */ +/* INFO(3) to INFO(10) are not currently used and are set to zero by */ +/* the routine. */ + +/* References: */ +/* [1] I. S. Duff, (1981), */ +/* "Algorithm 575. Permutations for a zero-free diagonal", */ +/* ACM Trans. Math. Software 7(3), 387-390. */ +/* [2] I. S. Duff and J. Koster, (1998), */ +/* "The design and use of algorithms for permuting large */ +/* entries to the diagonal of sparse matrices", */ +/* SIAM J. Matrix Anal. Appl., vol. 20, no. 4, pp. 889-901. */ +/* [3] I. S. Duff and J. Koster, (1999), */ +/* "On algorithms for permuting large entries to the diagonal */ +/* of sparse matrices", */ +/* Technical Report RAL-TR-1999-030, RAL, Oxfordshire, England. */ +/* Local variables and parameters */ +/* External routines and functions */ +/* EXTERNAL FD05AD */ +/* DOUBLE PRECISION FD05AD */ +/* Intrinsic functions */ +/* Set RINF to largest positive real number (infinity) */ +/* XSL RINF = FD05AD(5) */ + /* Parameter adjustments */ + --cperm; + --ip; + --a; + --irn; + --iw; + --dw; + --icntl; + --info; + + /* Function Body */ + rinf = dlamch_("Overflow"); +/* Check value of JOB */ + if (*job < 1 || *job > 5) { + info[1] = -1; + info[2] = *job; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " because JOB = %d\n", *job); + } + goto L99; + } +/* Check value of N */ + if (*n < 1) { + info[1] = -2; + info[2] = *n; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " because N = %d\n", *job); + } + goto L99; + } +/* Check value of NE */ + if (*ne < 1) { + info[1] = -3; + info[2] = *ne; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " because NE = %d\n", *job); + } + goto L99; + } +/* Check LIW */ + if (*job == 1) { + k = *n * 5; + } + if (*job == 2) { + k = *n << 2; + } + if (*job == 3) { + k = *n * 10 + *ne; + } + if (*job == 4) { + k = *n * 5; + } + if (*job == 5) { + k = *n * 5; + } + if (*liw < k) { + info[1] = -4; + info[2] = k; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " LIW too small, must be at least %8d\n", k); + } + goto L99; + } +/* Check LDW */ +/* If JOB = 1, do not check */ + if (*job > 1) { + if (*job == 2) { + k = *n; + } + if (*job == 3) { + k = *ne; + } + if (*job == 4) { + k = (*n << 1) + *ne; + } + if (*job == 5) { + k = *n * 3 + *ne; + } + if (*ldw < k) { + info[1] = -5; + info[2] = k; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " LDW too small, must be at least %8d\n", k); + } + goto L99; + } + } + if (icntl[4] == 0) { +/* Check row indices. Use IW(1:N) as workspace */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + iw[i__] = 0; +/* L3: */ + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; +/* Check for row indices that are out of range */ + if (i__ < 1 || i__ > *n) { + info[1] = -6; + info[2] = j; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", + info[1], " Column %8d", j, + " contains an entry with invalid row index %8d\n", i__); + } + goto L99; + } +/* Check for repeated row indices within a column */ + if (iw[i__] == j) { + info[1] = -7; + info[2] = j; + if (icntl[1] >= 0) { + printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], + " Column %8d", j, + " contains two or more entries with row index %8d\n", i__); + } + goto L99; + } else { + iw[i__] = j; + } +/* L4: */ + } +/* L6: */ + } + } +/* Print diagnostics on input */ + if (icntl[3] >= 0) { + printf(" ****** Input parameters for MC64A/AD: JOB = %8d," + " N = %d, NE = %8d\n", *job, *n, *ne); + printf(" IP(1:N+1) = "); + for (j=1; j<=(*n+1); ++j) { + printf("%8d", ip[j]); + if (j%8 == 0) printf("\n"); + } + printf("\n IRN(1:NE) = "); + for (j=1; j<=(*ne); ++j) { + printf("%8d", irn[j]); + if (j%8 == 0) printf("\n"); + } + printf("\n"); + + if (*job > 1) { + printf(" A(1:NE) = "); + for (j=1; j<=(*ne); ++j) { + printf("%f14.4", a[j]); + if (j%4 == 0) printf("\n"); + } + printf("\n"); + } + } +/* Set components of INFO to zero */ + for (i__ = 1; i__ <= 10; ++i__) { + info[i__] = 0; +/* L8: */ + } +/* Compute maximum matching with MC21A/AD */ + if (*job == 1) { +/* Put length of column J in IW(J) */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + iw[j] = ip[j + 1] - ip[j]; +/* L10: */ + } +/* IW(N+1:5N) is workspace */ +#if 0 + mc21ad_(n, &irn[1], ne, &ip[1], &iw[1], &cperm[1], num, &iw[*n+1]); +#else + printf(" ****** Warning from MC64A/AD. Need to link mc21ad.\n"); +#endif + goto L90; + } +/* Compute bottleneck matching */ + if (*job == 2) { +/* IW(1:5N), DW(1:N) are workspaces */ + mc64bd_(n, ne, &ip[1], &irn[1], &a[1], &cperm[1], num, &iw[1], &iw[*n + + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &dw[1]); + goto L90; + } +/* Compute bottleneck matching */ + if (*job == 3) { +/* Copy IRN(K) into IW(K), ABS(A(K)) into DW(K), K=1..NE */ + i__1 = *ne; + for (k = 1; k <= i__1; ++k) { + iw[k] = irn[k]; + dw[k] = (d__1 = a[k], abs(d__1)); +/* L20: */ + } +/* Sort entries in each column by decreasing value. */ + mc64rd_(n, ne, &ip[1], &iw[1], &dw[1]); +/* IW(NE+1:NE+10N) is workspace */ + mc64sd_(n, ne, &ip[1], &iw[1], &dw[1], &cperm[1], num, &iw[*ne + 1], & + iw[*ne + *n + 1], &iw[*ne + (*n << 1) + 1], &iw[*ne + *n * 3 + + 1], &iw[*ne + (*n << 2) + 1], &iw[*ne + *n * 5 + 1], &iw[* + ne + *n * 6 + 1]); + goto L90; + } + if (*job == 4) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + fact = 0.; + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + if ((d__1 = a[k], abs(d__1)) > fact) { + fact = (d__2 = a[k], abs(d__2)); + } +/* L30: */ + } + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + dw[(*n << 1) + k] = fact - (d__1 = a[k], abs(d__1)); +/* L40: */ + } +/* L50: */ + } +/* B = DW(2N+1:2N+NE); IW(1:5N) and DW(1:2N) are workspaces */ + mc64wd_(n, ne, &ip[1], &irn[1], &dw[(*n << 1) + 1], &cperm[1], num, & + iw[1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &iw[( + *n << 2) + 1], &dw[1], &dw[*n + 1]); + goto L90; + } + if (*job == 5) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + fact = 0.; + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + dw[*n * 3 + k] = (d__1 = a[k], abs(d__1)); + if (dw[*n * 3 + k] > fact) { + fact = dw[*n * 3 + k]; + } +/* L60: */ + } + dw[(*n << 1) + j] = fact; + if (fact != 0.) { + fact = log(fact); + } else { + fact = rinf / *n; + } + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + if (dw[*n * 3 + k] != 0.) { + dw[*n * 3 + k] = fact - log(dw[*n * 3 + k]); + } else { + dw[*n * 3 + k] = rinf / *n; + } +/* L70: */ + } +/* L75: */ + } +/* B = DW(3N+1:3N+NE); IW(1:5N) and DW(1:2N) are workspaces */ + mc64wd_(n, ne, &ip[1], &irn[1], &dw[*n * 3 + 1], &cperm[1], num, &iw[ + 1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &iw[(*n + << 2) + 1], &dw[1], &dw[*n + 1]); + if (*num == *n) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (dw[(*n << 1) + j] != 0.) { + dw[*n + j] -= log(dw[(*n << 1) + j]); + } else { + dw[*n + j] = 0.; + } +/* L80: */ + } + } +/* Check size of scaling factors */ + fact = log(rinf) * .5f; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (dw[j] < fact && dw[*n + j] < fact) { + goto L86; + } + info[1] = 2; + goto L90; +L86: + ; + } +/* GO TO 90 */ + } +L90: + if (info[1] == 0 && *num < *n) { +/* Matrix is structurally singular, return with warning */ + info[1] = 1; + if (icntl[2] >= 0) { + printf(" ****** Warning from MC64A/AD. INFO(1) = %2d", info[1], + " The matrix is structurally singular.\n"); + } + } + if (info[1] == 2) { +/* Scaling factors are large, return with warning */ + if (icntl[2] >= 0) { + printf(" ****** Warning from MC64A/AD. INFO(1) = %2d\n", info[1], + " Some scaling factors may be too large.\n"); + } + } +/* Print diagnostics on output */ + if (icntl[3] >= 0) { + printf(" ****** Output parameters for MC64A/AD: INFO(1:2) = %8d%8d\n", + info[1], info[2]); + printf(" NUM = ", *num); + printf(" CPERM(1:N) = "); + for (j=1; j<=*n; ++j) { + printf("%8d", cperm[j]); + if (j%8 == 0) printf("\n"); + } + if (*job == 5) { + printf("\n DW(1:N) = "); + for (j=1; j<=*n; ++j) { + printf("%11.3f", dw[j]); + if (j%5 == 0) printf("\n"); + } + printf("\n DW(N+1:2N) = "); + for (j=1; j<=*n; ++j) { + printf("%11.3f", dw[*n+j]); + if (j%5 == 0) printf("\n"); + } + printf("\n"); + } + } +/* Return from subroutine. */ +L99: + return 0; +} /* mc64ad_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64bd_(int_t *n, int_t *ne, int_t *ip, int_t * + irn, double *a, int_t *iperm, int_t *num, int_t *jperm, + int_t *pr, int_t *q, int_t *l, double *d__) +{ + /* System generated locals */ + int_t i__1, i__2, i__3; + double d__1, d__2, d__3; + + /* Local variables */ + int_t i__, j, k; + double a0; + int_t i0, q0; + double ai, di; + int_t ii, jj, kk; + double bv; + int_t up; + double dq0; + int_t kk1, kk2; + double csp; + int_t isp, jsp, low; + double dnew; + int_t jord, qlen, idum, jdum; + double rinf; + extern /* Subroutine */ int_t mc64dd_(int_t *, int_t *, int_t *, + double *, int_t *, int_t *), mc64ed_(int_t *, int_t *, + int_t *, double *, int_t *, int_t *), mc64fd_(int_t * + , int_t *, int_t *, int_t *, double *, int_t *, int_t *); + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* N, NE, IP, IRN are described in MC64A/AD. */ +/* A is a REAL (DOUBLE PRECISION in the D-version) array of length */ +/* NE. A(K), K=1..NE, must be set to the value of the entry */ +/* that corresponds to IRN(K). It is not altered. */ +/* IPERM is an INT_T array of length N. On exit, it contains the */ +/* matching: IPERM(I) = 0 or row I is matched to column IPERM(I). */ +/* NUM is INT_T variable. On exit, it contains the cardinality of the */ +/* matching stored in IPERM. */ +/* IW is an INT_T work array of length 4N. */ +/* DW is a REAL (DOUBLE PRECISION in D-version) work array of length N. */ +/* Local variables */ +/* Local parameters */ +/* Intrinsic functions */ +/* External subroutines and/or functions */ +/* EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD, DLAMCH */ +/* DOUBLE PRECISION FD05AD, DLAMCH */ +/* Set RINF to largest positive real number */ +/* XSL RINF = FD05AD(5) */ + /* Parameter adjustments */ + --d__; + --l; + --q; + --pr; + --jperm; + --iperm; + --ip; + --a; + --irn; + + /* Function Body */ + rinf = dlamch_("Overflow"); +/* Initialization */ + *num = 0; + bv = rinf; + i__1 = *n; + for (k = 1; k <= i__1; ++k) { + iperm[k] = 0; + jperm[k] = 0; + pr[k] = ip[k]; + d__[k] = 0.; +/* L10: */ + } +/* Scan columns of matrix; */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + a0 = -1.; + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; + ai = (d__1 = a[k], abs(d__1)); + if (ai > d__[i__]) { + d__[i__] = ai; + } + if (jperm[j] != 0) { + goto L30; + } + if (ai >= bv) { + a0 = bv; + if (iperm[i__] != 0) { + goto L30; + } + jperm[j] = i__; + iperm[i__] = j; + ++(*num); + } else { + if (ai <= a0) { + goto L30; + } + a0 = ai; + i0 = i__; + } +L30: + ; + } + if (a0 != -1. && a0 < bv) { + bv = a0; + if (iperm[i0] != 0) { + goto L20; + } + iperm[i0] = j; + jperm[j] = i0; + ++(*num); + } +L20: + ; + } +/* Update BV with smallest of all the largest maximum absolute values */ +/* of the rows. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Computing MIN */ + d__1 = bv, d__2 = d__[i__]; + bv = min(d__1,d__2); +/* L25: */ + } + if (*num == *n) { + goto L1000; + } +/* Rescan unassigned columns; improve initial assignment */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (jperm[j] != 0) { + goto L95; + } + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; + ai = (d__1 = a[k], abs(d__1)); + if (ai < bv) { + goto L50; + } + if (iperm[i__] == 0) { + goto L90; + } + jj = iperm[i__]; + kk1 = pr[jj]; + kk2 = ip[jj + 1] - 1; + if (kk1 > kk2) { + goto L50; + } + i__3 = kk2; + for (kk = kk1; kk <= i__3; ++kk) { + ii = irn[kk]; + if (iperm[ii] != 0) { + goto L70; + } + if ((d__1 = a[kk], abs(d__1)) >= bv) { + goto L80; + } +L70: + ; + } + pr[jj] = kk2 + 1; +L50: + ; + } + goto L95; +L80: + jperm[jj] = ii; + iperm[ii] = jj; + pr[jj] = kk + 1; +L90: + ++(*num); + jperm[j] = i__; + iperm[i__] = j; + pr[j] = k + 1; +L95: + ; + } + if (*num == *n) { + goto L1000; + } +/* Prepare for main loop */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + d__[i__] = -1.; + l[i__] = 0; +/* L99: */ + } +/* Main loop ... each pass round this loop is similar to Dijkstra's */ +/* algorithm for solving the single source shortest path problem */ + i__1 = *n; + for (jord = 1; jord <= i__1; ++jord) { + if (jperm[jord] != 0) { + goto L100; + } + qlen = 0; + low = *n + 1; + up = *n + 1; +/* CSP is cost of shortest path to any unassigned row */ +/* ISP is matrix position of unassigned row element in shortest path */ +/* JSP is column index of unassigned row element in shortest path */ + csp = -1.; +/* Build shortest path tree starting from unassigned column JORD */ + j = jord; + pr[j] = -1; +/* Scan column J */ + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; + dnew = (d__1 = a[k], abs(d__1)); + if (csp >= dnew) { + goto L115; + } + if (iperm[i__] == 0) { +/* Row I is unassigned; update shortest path info */ + csp = dnew; + isp = i__; + jsp = j; + if (csp >= bv) { + goto L160; + } + } else { + d__[i__] = dnew; + if (dnew >= bv) { +/* Add row I to Q2 */ + --low; + q[low] = i__; + } else { +/* Add row I to Q, and push it */ + ++qlen; + l[i__] = qlen; + mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__1); + } + jj = iperm[i__]; + pr[jj] = j; + } +L115: + ; + } + i__2 = *num; + for (jdum = 1; jdum <= i__2; ++jdum) { +/* If Q2 is empty, extract new rows from Q */ + if (low == up) { + if (qlen == 0) { + goto L160; + } + i__ = q[1]; + if (csp >= d__[i__]) { + goto L160; + } + bv = d__[i__]; + i__3 = *n; + for (idum = 1; idum <= i__3; ++idum) { + mc64ed_(&qlen, n, &q[1], &d__[1], &l[1], &c__1); + l[i__] = 0; + --low; + q[low] = i__; + if (qlen == 0) { + goto L153; + } + i__ = q[1]; + if (d__[i__] != bv) { + goto L153; + } +/* L152: */ + } +/* End of dummy loop; this point is never reached */ + } +/* Move row Q0 */ +L153: + --up; + q0 = q[up]; + dq0 = d__[q0]; + l[q0] = up; +/* Scan column that matches with row Q0 */ + j = iperm[q0]; + i__3 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__3; ++k) { + i__ = irn[k]; +/* Update D(I) */ + if (l[i__] >= up) { + goto L155; + } +/* Computing MIN */ + d__2 = dq0, d__3 = (d__1 = a[k], abs(d__1)); + dnew = min(d__2,d__3); + if (csp >= dnew) { + goto L155; + } + if (iperm[i__] == 0) { +/* Row I is unassigned; update shortest path info */ + csp = dnew; + isp = i__; + jsp = j; + if (csp >= bv) { + goto L160; + } + } else { + di = d__[i__]; + if (di >= bv || di >= dnew) { + goto L155; + } + d__[i__] = dnew; + if (dnew >= bv) { +/* Delete row I from Q (if necessary); add row I to Q2 */ + if (di != -1.) { + mc64fd_(&l[i__], &qlen, n, &q[1], &d__[1], &l[1], + &c__1); + } + l[i__] = 0; + --low; + q[low] = i__; + } else { +/* Add row I to Q (if necessary); push row I up Q */ + if (di == -1.) { + ++qlen; + l[i__] = qlen; + } + mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__1); + } +/* Update tree */ + jj = iperm[i__]; + pr[jj] = j; + } +L155: + ; + } +/* L150: */ + } +/* If CSP = MINONE, no augmenting path is found */ +L160: + if (csp == -1.) { + goto L190; + } +/* Update bottleneck value */ + bv = min(bv,csp); +/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */ + ++(*num); + i__ = isp; + j = jsp; + i__2 = *num + 1; + for (jdum = 1; jdum <= i__2; ++jdum) { + i0 = jperm[j]; + jperm[j] = i__; + iperm[i__] = j; + j = pr[j]; + if (j == -1) { + goto L190; + } + i__ = i0; +/* L170: */ + } +/* End of dummy loop; this point is never reached */ +L190: + i__2 = *n; + for (kk = up; kk <= i__2; ++kk) { + i__ = q[kk]; + d__[i__] = -1.; + l[i__] = 0; +/* L191: */ + } + i__2 = up - 1; + for (kk = low; kk <= i__2; ++kk) { + i__ = q[kk]; + d__[i__] = -1.; +/* L192: */ + } + i__2 = qlen; + for (kk = 1; kk <= i__2; ++kk) { + i__ = q[kk]; + d__[i__] = -1.; + l[i__] = 0; +/* L193: */ + } +L100: + ; + } +/* End of main loop */ +/* BV is bottleneck value of final matching */ + if (*num == *n) { + goto L1000; + } +/* Matrix is structurally singular, complete IPERM. */ +/* JPERM, PR are work arrays */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + jperm[j] = 0; +/* L300: */ + } + k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (iperm[i__] == 0) { + ++k; + pr[k] = i__; + } else { + j = iperm[i__]; + jperm[j] = i__; + } +/* L310: */ + } + k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (jperm[i__] != 0) { + goto L320; + } + ++k; + jdum = pr[k]; + iperm[jdum] = i__; +L320: + ; + } +L1000: + return 0; +} /* mc64bd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64dd_(int_t *i__, int_t *n, int_t *q, double + *d__, int_t *l, int_t *iway) +{ + /* System generated locals */ + int_t i__1; + + /* Local variables */ + double di; + int_t qk, pos, idum, posk; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* Variables N,Q,D,L are described in MC64B/BD */ +/* IF IWAY is equal to 1, then */ +/* node I is pushed from its current position upwards */ +/* IF IWAY is not equal to 1, then */ +/* node I is pushed from its current position downwards */ +/* Local variables and parameters */ + /* Parameter adjustments */ + --l; + --d__; + --q; + + /* Function Body */ + di = d__[*i__]; + pos = l[*i__]; +/* POS is index of current position of I in the tree */ + if (*iway == 1) { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + if (pos <= 1) { + goto L20; + } + posk = pos / 2; + qk = q[posk]; + if (di <= d__[qk]) { + goto L20; + } + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L10: */ + } +/* End of dummy loop; this point is never reached */ + } else { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + if (pos <= 1) { + goto L20; + } + posk = pos / 2; + qk = q[posk]; + if (di >= d__[qk]) { + goto L20; + } + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L15: */ + } +/* End of dummy loop; this point is never reached */ + } +/* End of dummy if; this point is never reached */ +L20: + q[pos] = *i__; + l[*i__] = pos; + return 0; +} /* mc64dd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64ed_(int_t *qlen, int_t *n, int_t *q, + double *d__, int_t *l, int_t *iway) +{ + /* System generated locals */ + int_t i__1; + + /* Local variables */ + int_t i__; + double di, dk, dr; + int_t pos, idum, posk; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */ +/* MC64W/WD (IWAY = 2) */ +/* The root node is deleted from the binary heap. */ +/* Local variables and parameters */ +/* Move last element to begin of Q */ + /* Parameter adjustments */ + --l; + --d__; + --q; + + /* Function Body */ + i__ = q[*qlen]; + di = d__[i__]; + --(*qlen); + pos = 1; + if (*iway == 1) { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + posk = pos << 1; + if (posk > *qlen) { + goto L20; + } + dk = d__[q[posk]]; + if (posk < *qlen) { + dr = d__[q[posk + 1]]; + if (dk < dr) { + ++posk; + dk = dr; + } + } + if (di >= dk) { + goto L20; + } +/* Exchange old last element with larger priority child */ + q[pos] = q[posk]; + l[q[pos]] = pos; + pos = posk; +/* L10: */ + } +/* End of dummy loop; this point is never reached */ + } else { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + posk = pos << 1; + if (posk > *qlen) { + goto L20; + } + dk = d__[q[posk]]; + if (posk < *qlen) { + dr = d__[q[posk + 1]]; + if (dk > dr) { + ++posk; + dk = dr; + } + } + if (di <= dk) { + goto L20; + } +/* Exchange old last element with smaller child */ + q[pos] = q[posk]; + l[q[pos]] = pos; + pos = posk; +/* L15: */ + } +/* End of dummy loop; this point is never reached */ + } +/* End of dummy if; this point is never reached */ +L20: + q[pos] = i__; + l[i__] = pos; + return 0; +} /* mc64ed_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64fd_(int_t *pos0, int_t *qlen, int_t *n, + int_t *q, double *d__, int_t *l, int_t *iway) +{ + /* System generated locals */ + int_t i__1; + + /* Local variables */ + int_t i__; + double di, dk, dr; + int_t qk, pos, idum, posk; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */ +/* MC64WD (IWAY = 2). */ +/* Move last element in the heap */ +/* Quick return, if possible */ + /* Parameter adjustments */ + --l; + --d__; + --q; + + /* Function Body */ + if (*qlen == *pos0) { + --(*qlen); + return 0; + } +/* Move last element from queue Q to position POS0 */ +/* POS is current position of node I in the tree */ + i__ = q[*qlen]; + di = d__[i__]; + --(*qlen); + pos = *pos0; + if (*iway == 1) { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + if (pos <= 1) { + goto L20; + } + posk = pos / 2; + qk = q[posk]; + if (di <= d__[qk]) { + goto L20; + } + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L10: */ + } +/* End of dummy loop; this point is never reached */ +L20: + q[pos] = i__; + l[i__] = pos; + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + posk = pos << 1; + if (posk > *qlen) { + goto L40; + } + dk = d__[q[posk]]; + if (posk < *qlen) { + dr = d__[q[posk + 1]]; + if (dk < dr) { + ++posk; + dk = dr; + } + } + if (di >= dk) { + goto L40; + } + qk = q[posk]; + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L30: */ + } +/* End of dummy loop; this point is never reached */ + } else { + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + if (pos <= 1) { + goto L34; + } + posk = pos / 2; + qk = q[posk]; + if (di >= d__[qk]) { + goto L34; + } + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L32: */ + } +/* End of dummy loop; this point is never reached */ +L34: + q[pos] = i__; + l[i__] = pos; + i__1 = *n; + for (idum = 1; idum <= i__1; ++idum) { + posk = pos << 1; + if (posk > *qlen) { + goto L40; + } + dk = d__[q[posk]]; + if (posk < *qlen) { + dr = d__[q[posk + 1]]; + if (dk > dr) { + ++posk; + dk = dr; + } + } + if (di <= dk) { + goto L40; + } + qk = q[posk]; + q[pos] = qk; + l[qk] = pos; + pos = posk; +/* L36: */ + } +/* End of dummy loop; this point is never reached */ + } +/* End of dummy if; this point is never reached */ +L40: + q[pos] = i__; + l[i__] = pos; + return 0; +} /* mc64fd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64rd_(int_t *n, int_t *ne, int_t *ip, int_t * + irn, double *a) +{ + /* System generated locals */ + int_t i__1, i__2, i__3; + + /* Local variables */ + int_t j, k, r__, s; + double ha; + int_t hi, td, mid, len, ipj; + double key; + int_t last, todo[50], first; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* This subroutine sorts the entries in each column of the */ +/* sparse matrix (defined by N,NE,IP,IRN,A) by decreasing */ +/* numerical value. */ +/* Local constants */ +/* Local variables */ +/* Local arrays */ + /* Parameter adjustments */ + --ip; + --a; + --irn; + + /* Function Body */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + len = ip[j + 1] - ip[j]; + if (len <= 1) { + goto L100; + } + ipj = ip[j]; +/* Sort array roughly with partial quicksort */ + if (len < 15) { + goto L400; + } + todo[0] = ipj; + todo[1] = ipj + len; + td = 2; +L500: + first = todo[td - 2]; + last = todo[td - 1]; +/* KEY is the smallest of two values present in interval [FIRST,LAST) */ + key = a[(first + last) / 2]; + i__2 = last - 1; + for (k = first; k <= i__2; ++k) { + ha = a[k]; + if (ha == key) { + goto L475; + } + if (ha > key) { + goto L470; + } + key = ha; + goto L470; +L475: + ; + } +/* Only one value found in interval, so it is already sorted */ + td += -2; + goto L425; +/* Reorder interval [FIRST,LAST) such that entries before MID are gt KEY */ +L470: + mid = first; + i__2 = last - 1; + for (k = first; k <= i__2; ++k) { + if (a[k] <= key) { + goto L450; + } + ha = a[mid]; + a[mid] = a[k]; + a[k] = ha; + hi = irn[mid]; + irn[mid] = irn[k]; + irn[k] = hi; + ++mid; +L450: + ; + } +/* Both subintervals [FIRST,MID), [MID,LAST) are nonempty */ +/* Stack the longest of the two subintervals first */ + if (mid - first >= last - mid) { + todo[td + 1] = last; + todo[td] = mid; + todo[td - 1] = mid; +/* TODO(TD-1) = FIRST */ + } else { + todo[td + 1] = mid; + todo[td] = first; + todo[td - 1] = last; + todo[td - 2] = mid; + } + td += 2; +L425: + if (td == 0) { + goto L400; + } +/* There is still work to be done */ + if (todo[td - 1] - todo[td - 2] >= 15) { + goto L500; + } +/* Next interval is already short enough for straightforward insertion */ + td += -2; + goto L425; +/* Complete sorting with straightforward insertion */ +L400: + i__2 = ipj + len - 1; + for (r__ = ipj + 1; r__ <= i__2; ++r__) { + if (a[r__ - 1] < a[r__]) { + ha = a[r__]; + hi = irn[r__]; + a[r__] = a[r__ - 1]; + irn[r__] = irn[r__ - 1]; + i__3 = ipj + 1; + for (s = r__ - 1; s >= i__3; --s) { + if (a[s - 1] < ha) { + a[s] = a[s - 1]; + irn[s] = irn[s - 1]; + } else { + a[s] = ha; + irn[s] = hi; + goto L200; + } +/* L300: */ + } + a[ipj] = ha; + irn[ipj] = hi; + } +L200: + ; + } +L100: + ; + } + return 0; +} /* mc64rd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64sd_(int_t *n, int_t *ne, int_t *ip, int_t * + irn, double *a, int_t *iperm, int_t *numx, int_t *w, + int_t *len, int_t *lenl, int_t *lenh, int_t *fc, int_t *iw, + int_t *iw4) +{ + /* System generated locals */ + int_t i__1, i__2, i__3, i__4; + + /* Local variables */ + int_t i__, j, k, l, ii, mod, cnt, num; + double bval, bmin, bmax, rinf; + int_t nval, wlen, idum1, idum2, idum3; + extern /* Subroutine */ int_t mc64qd_(int_t *, int_t *, int_t *, + int_t *, int_t *, double *, int_t *, double *), + mc64ud_(int_t *, int_t *, int_t *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *); + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* N, NE, IP, IRN, are described in MC64A/AD. */ +/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */ +/* A(K), K=1..NE, must be set to the value of the entry that */ +/* corresponds to IRN(k). The entries in each column must be */ +/* non-negative and ordered by decreasing value. */ +/* IPERM is an INT_T array of length N. On exit, it contains the */ +/* bottleneck matching: IPERM(I) - 0 or row I is matched to column */ +/* IPERM(I). */ +/* NUMX is an INT_T variable. On exit, it contains the cardinality */ +/* of the matching stored in IPERM. */ +/* IW is an INT_T work array of length 10N. */ +/* FC is an int_t array of length N that contains the list of */ +/* unmatched columns. */ +/* LEN(J), LENL(J), LENH(J) are int_t arrays of length N that point */ +/* to entries in matrix column J. */ +/* In the matrix defined by the column parts IP(J)+LENL(J) we know */ +/* a matching does not exist; in the matrix defined by the column */ +/* parts IP(J)+LENH(J) we know one exists. */ +/* LEN(J) lies between LENL(J) and LENH(J) and determines the matrix */ +/* that is tested for a maximum matching. */ +/* W is an int_t array of length N and contains the indices of the */ +/* columns for which LENL ne LENH. */ +/* WLEN is number of indices stored in array W. */ +/* IW is int_t work array of length N. */ +/* IW4 is int_t work array of length 4N used by MC64U/UD. */ +/* EXTERNAL FD05AD,MC64QD,MC64UD */ +/* DOUBLE PRECISION FD05AD */ +/* BMIN and BMAX are such that a maximum matching exists for the input */ +/* matrix in which all entries smaller than BMIN are dropped. */ +/* For BMAX, a maximum matching does not exist. */ +/* BVAL is a value between BMIN and BMAX. */ +/* CNT is the number of calls made to MC64U/UD so far. */ +/* NUM is the cardinality of last matching found. */ +/* Set RINF to largest positive real number */ +/* XSL RINF = FD05AD(5) */ + /* Parameter adjustments */ + --iw4; + --iw; + --fc; + --lenh; + --lenl; + --len; + --w; + --iperm; + --ip; + --a; + --irn; + + /* Function Body */ + rinf = dlamch_("Overflow"); +/* Compute a first maximum matching from scratch on whole matrix. */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + fc[j] = j; + iw[j] = 0; + len[j] = ip[j + 1] - ip[j]; +/* L20: */ + } +/* The first call to MC64U/UD */ + cnt = 1; + mod = 1; + *numx = 0; + mc64ud_(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1], numx, + n, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1], &iw4[*n * 3 + 1]); +/* IW contains a maximum matching of length NUMX. */ + num = *numx; + if (num != *n) { +/* Matrix is structurally singular */ + bmax = rinf; + } else { +/* Matrix is structurally nonsingular, NUM=NUMX=N; */ +/* Set BMAX just above the smallest of all the maximum absolute */ +/* values of the columns */ + bmax = rinf; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + bval = 0.f; + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + if (a[k] > bval) { + bval = a[k]; + } +/* L25: */ + } + if (bval < bmax) { + bmax = bval; + } +/* L30: */ + } + bmax *= 1.001f; + } +/* Initialize BVAL,BMIN */ + bval = 0.f; + bmin = 0.f; +/* Initialize LENL,LEN,LENH,W,WLEN according to BMAX. */ +/* Set LEN(J), LENH(J) just after last entry in column J. */ +/* Set LENL(J) just after last entry in column J with value ge BMAX. */ + wlen = 0; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + l = ip[j + 1] - ip[j]; + lenh[j] = l; + len[j] = l; + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + if (a[k] < bmax) { + goto L46; + } +/* L45: */ + } +/* Column J is empty or all entries are ge BMAX */ + k = ip[j + 1]; +L46: + lenl[j] = k - ip[j]; +/* Add J to W if LENL(J) ne LENH(J) */ + if (lenl[j] == l) { + goto L48; + } + ++wlen; + w[wlen] = j; +L48: + ; + } +/* Main loop */ + i__1 = *ne; + for (idum1 = 1; idum1 <= i__1; ++idum1) { + if (num == *numx) { +/* We have a maximum matching in IW; store IW in IPERM */ + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + iperm[i__] = iw[i__]; +/* L50: */ + } +/* Keep going round this loop until matching IW is no longer maximum. */ + i__2 = *ne; + for (idum2 = 1; idum2 <= i__2; ++idum2) { + bmin = bval; + if (bmax == bmin) { + goto L99; + } +/* Find splitting value BVAL */ + mc64qd_(&ip[1], &lenl[1], &len[1], &w[1], &wlen, &a[1], &nval, + &bval); + if (nval <= 1) { + goto L99; + } +/* Set LEN such that all matrix entries with value lt BVAL are */ +/* discarded. Store old LEN in LENH. Do this for all columns W(K). */ +/* Each step, either K is incremented or WLEN is decremented. */ + k = 1; + i__3 = *n; + for (idum3 = 1; idum3 <= i__3; ++idum3) { + if (k > wlen) { + goto L71; + } + j = w[k]; + i__4 = ip[j] + lenl[j]; + for (ii = ip[j] + len[j] - 1; ii >= i__4; --ii) { + if (a[ii] >= bval) { + goto L60; + } + i__ = irn[ii]; + if (iw[i__] != j) { + goto L55; + } +/* Remove entry from matching */ + iw[i__] = 0; + --num; + fc[*n - num] = j; +L55: + ; + } +L60: + lenh[j] = len[j]; +/* IP(J)+LEN(J)-1 is last entry in column ge BVAL */ + len[j] = ii - ip[j] + 1; +/* If LENH(J) = LENL(J), remove J from W */ + if (lenl[j] == lenh[j]) { + w[k] = w[wlen]; + --wlen; + } else { + ++k; + } +/* L70: */ + } +L71: + if (num < *numx) { + goto L81; + } +/* L80: */ + } +/* End of dummy loop; this point is never reached */ +/* Set mode for next call to MC64U/UD */ +L81: + mod = 1; + } else { +/* We do not have a maximum matching in IW. */ + bmax = bval; +/* BMIN is the bottleneck value of a maximum matching; */ +/* for BMAX the matching is not maximum, so BMAX>BMIN */ +/* IF (BMAX .EQ. BMIN) GO TO 99 */ +/* Find splitting value BVAL */ + mc64qd_(&ip[1], &len[1], &lenh[1], &w[1], &wlen, &a[1], &nval, & + bval); + if (nval == 0 || bval == bmin) { + goto L99; + } +/* Set LEN such that all matrix entries with value ge BVAL are */ +/* inside matrix. Store old LEN in LENL. Do this for all columns W(K). */ +/* Each step, either K is incremented or WLEN is decremented. */ + k = 1; + i__2 = *n; + for (idum3 = 1; idum3 <= i__2; ++idum3) { + if (k > wlen) { + goto L88; + } + j = w[k]; + i__3 = ip[j] + lenh[j] - 1; + for (ii = ip[j] + len[j]; ii <= i__3; ++ii) { + if (a[ii] < bval) { + goto L86; + } +/* L85: */ + } +L86: + lenl[j] = len[j]; + len[j] = ii - ip[j]; + if (lenl[j] == lenh[j]) { + w[k] = w[wlen]; + --wlen; + } else { + ++k; + } +/* L87: */ + } +/* End of dummy loop; this point is never reached */ +/* Set mode for next call to MC64U/UD */ +L88: + mod = 0; + } + ++cnt; + mc64ud_(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1], & + num, numx, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1], &iw4[* + n * 3 + 1]); +/* IW contains maximum matching of length NUM */ +/* L90: */ + } +/* End of dummy loop; this point is never reached */ +/* BMIN is bottleneck value of final matching */ +L99: + if (*numx == *n) { + goto L1000; + } +/* The matrix is structurally singular, complete IPERM */ +/* W, IW are work arrays */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + w[j] = 0; +/* L300: */ + } + k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (iperm[i__] == 0) { + ++k; + iw[k] = i__; + } else { + j = iperm[i__]; + w[j] = i__; + } +/* L310: */ + } + k = 0; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (w[j] != 0) { + goto L320; + } + ++k; + idum1 = iw[k]; + iperm[idum1] = j; +L320: + ; + } +L1000: + return 0; +} /* mc64sd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64qd_(int_t *ip, int_t *lenl, int_t *lenh, + int_t *w, int_t *wlen, double *a, int_t *nval, double * + val) +{ + /* System generated locals */ + int_t i__1, i__2, i__3; + + /* Local variables */ + int_t j, k, s; + double ha; + int_t ii, pos; + double split[10]; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* This routine searches for at most XX different numerical values */ +/* in the columns W(1:WLEN). XX>=2. */ +/* Each column J is scanned between IP(J)+LENL(J) and IP(J)+LENH(J)-1 */ +/* until XX values are found or all columns have been considered. */ +/* On output, NVAL is the number of different values that is found */ +/* and SPLIT(1:NVAL) contains the values in decreasing order. */ +/* If NVAL > 0, the routine returns VAL = SPLIT((NVAL+1)/2). */ + +/* Scan columns in W(1:WLEN). For each encountered value, if value not */ +/* already present in SPLIT(1:NVAL), insert value such that SPLIT */ +/* remains sorted by decreasing value. */ +/* The sorting is done by straightforward insertion; therefore the use */ +/* of this routine should be avoided for large XX (XX < 20). */ + /* Parameter adjustments */ + --a; + --w; + --lenh; + --lenl; + --ip; + + /* Function Body */ + *nval = 0; + i__1 = *wlen; + for (k = 1; k <= i__1; ++k) { + j = w[k]; + i__2 = ip[j] + lenh[j] - 1; + for (ii = ip[j] + lenl[j]; ii <= i__2; ++ii) { + ha = a[ii]; + if (*nval == 0) { + split[0] = ha; + *nval = 1; + } else { +/* Check presence of HA in SPLIT */ + for (s = *nval; s >= 1; --s) { + if (split[s - 1] == ha) { + goto L15; + } + if (split[s - 1] > ha) { + pos = s + 1; + goto L21; + } +/* L20: */ + } + pos = 1; +/* The insertion */ +L21: + i__3 = pos; + for (s = *nval; s >= i__3; --s) { + split[s] = split[s - 1]; +/* L22: */ + } + split[pos - 1] = ha; + ++(*nval); + } +/* Exit loop if XX values are found */ + if (*nval == 10) { + goto L11; + } +L15: + ; + } +/* L10: */ + } +/* Determine VAL */ +L11: + if (*nval > 0) { + *val = split[(*nval + 1) / 2 - 1]; + } + return 0; +} /* mc64qd_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64ud_(int_t *id, int_t *mod, int_t *n, int_t * + irn, int_t *lirn, int_t *ip, int_t *lenc, int_t *fc, int_t * + iperm, int_t *num, int_t *numx, int_t *pr, int_t *arp, + int_t *cv, int_t *out) +{ + /* System generated locals */ + int_t i__1, i__2, i__3, i__4; + + /* Local variables */ + int_t i__, j, k, j1, ii, kk, id0, id1, in1, in2, nfc, num0, num1, num2, + jord, last; + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* PR(J) is the previous column to J in the depth first search. */ +/* Array PR is used as workspace in the sorting algorithm. */ +/* Elements (I,IPERM(I)) I=1,..,N are entries at the end of the */ +/* algorithm unless N assignments have not been made in which case */ +/* N-NUM pairs (I,IPERM(I)) will not be entries in the matrix. */ +/* CV(I) is the most recent loop number (ID+JORD) at which row I */ +/* was visited. */ +/* ARP(J) is the number of entries in column J which have been scanned */ +/* when looking for a cheap assignment. */ +/* OUT(J) is one less than the number of entries in column J which have */ +/* not been scanned during one pass through the main loop. */ +/* NUMX is maximum possible size of matching. */ + /* Parameter adjustments */ + --out; + --cv; + --arp; + --pr; + --iperm; + --fc; + --lenc; + --ip; + --irn; + + /* Function Body */ + if (*id == 1) { +/* The first call to MC64U/UD. */ +/* Initialize CV and ARP; parameters MOD, NUMX are not accessed */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + cv[i__] = 0; + arp[i__] = 0; +/* L5: */ + } + num1 = *n; + num2 = *n; + } else { +/* Not the first call to MC64U/UD. */ +/* Re-initialize ARP if entries were deleted since last call to MC64U/UD */ + if (*mod == 1) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + arp[i__] = 0; +/* L8: */ + } + } + num1 = *numx; + num2 = *n - *numx; + } + num0 = *num; +/* NUM0 is size of input matching */ +/* NUM1 is maximum possible size of matching */ +/* NUM2 is maximum allowed number of unassigned rows/columns */ +/* NUM is size of current matching */ +/* Quick return if possible */ +/* IF (NUM.EQ.N) GO TO 199 */ +/* NFC is number of rows/columns that could not be assigned */ + nfc = 0; +/* Integers ID0+1 to ID0+N are unique numbers for call ID to MC64U/UD, */ +/* so 1st call uses 1..N, 2nd call uses N+1..2N, etc */ + id0 = (*id - 1) * *n; +/* Main loop. Each pass round this loop either results in a new */ +/* assignment or gives a column with no assignment */ + i__1 = *n; + for (jord = num0 + 1; jord <= i__1; ++jord) { +/* Each pass uses unique number ID1 */ + id1 = id0 + jord; +/* J is unmatched column */ + j = fc[jord - num0]; + pr[j] = -1; + i__2 = jord; + for (k = 1; k <= i__2; ++k) { +/* Look for a cheap assignment */ + if (arp[j] >= lenc[j]) { + goto L30; + } + in1 = ip[j] + arp[j]; + in2 = ip[j] + lenc[j] - 1; + i__3 = in2; + for (ii = in1; ii <= i__3; ++ii) { + i__ = irn[ii]; + if (iperm[i__] == 0) { + goto L80; + } +/* L20: */ + } +/* No cheap assignment in row */ + arp[j] = lenc[j]; +/* Begin looking for assignment chain starting with row J */ +L30: + out[j] = lenc[j] - 1; +/* Inner loop. Extends chain by one or backtracks */ + i__3 = jord; + for (kk = 1; kk <= i__3; ++kk) { + in1 = out[j]; + if (in1 < 0) { + goto L50; + } + in2 = ip[j] + lenc[j] - 1; + in1 = in2 - in1; +/* Forward scan */ + i__4 = in2; + for (ii = in1; ii <= i__4; ++ii) { + i__ = irn[ii]; + if (cv[i__] == id1) { + goto L40; + } +/* Column J has not yet been accessed during this pass */ + j1 = j; + j = iperm[i__]; + cv[i__] = id1; + pr[j] = j1; + out[j1] = in2 - ii - 1; + goto L70; +L40: + ; + } +/* Backtracking step. */ +L50: + j1 = pr[j]; + if (j1 == -1) { +/* No augmenting path exists for column J. */ + ++nfc; + fc[nfc] = j; + if (nfc > num2) { +/* A matching of maximum size NUM1 is not possible */ + last = jord; + goto L101; + } + goto L100; + } + j = j1; +/* L60: */ + } +/* End of dummy loop; this point is never reached */ +L70: + ; + } +/* End of dummy loop; this point is never reached */ +/* New assignment is made. */ +L80: + iperm[i__] = j; + arp[j] = ii - ip[j] + 1; + ++(*num); + i__2 = jord; + for (k = 1; k <= i__2; ++k) { + j = pr[j]; + if (j == -1) { + goto L95; + } + ii = ip[j] + lenc[j] - out[j] - 2; + i__ = irn[ii]; + iperm[i__] = j; +/* L90: */ + } +/* End of dummy loop; this point is never reached */ +L95: + if (*num == num1) { +/* A matching of maximum size NUM1 is found */ + last = jord; + goto L101; + } + +L100: + ; + } +/* All unassigned columns have been considered */ + last = *n; +/* Now, a transversal is computed or is not possible. */ +/* Complete FC before returning. */ +L101: + i__1 = *n; + for (jord = last + 1; jord <= i__1; ++jord) { + ++nfc; + fc[nfc] = fc[jord - num0]; +/* L110: */ + } +/* 199 RETURN */ + return 0; +} /* mc64ud_ */ + +/* ********************************************************************** */ +/* Subroutine */ int_t mc64wd_(int_t *n, int_t *ne, int_t *ip, int_t * + irn, double *a, int_t *iperm, int_t *num, int_t *jperm, + int_t *out, int_t *pr, int_t *q, int_t *l, double *u, + double *d__) +{ + /* System generated locals */ + int_t i__1, i__2, i__3; + + /* Local variables */ + int_t i__, j, k, i0, k0, k1, k2, q0; + double di; + int_t ii, jj, kk; + double vj; + int_t up; + double dq0; + int_t kk1, kk2; + double csp; + int_t isp, jsp, low; + double dmin__, dnew; + int_t jord, qlen, jdum; + double rinf; + extern /* Subroutine */ int_t mc64dd_(int_t *, int_t *, int_t *, + double *, int_t *, int_t *), mc64ed_(int_t *, int_t *, + int_t *, double *, int_t *, int_t *), mc64fd_(int_t * + , int_t *, int_t *, int_t *, double *, int_t *, + int_t *); + + +/* *** Copyright (c) 1999 Council for the Central Laboratory of the */ +/* Research Councils *** */ +/* *** Although every effort has been made to ensure robustness and *** */ +/* *** reliability of the subroutines in this MC64 suite, we *** */ +/* *** disclaim any liability arising through the use or misuse of *** */ +/* *** any of the subroutines. *** */ +/* *** Any problems? Contact ... */ +/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ + +/* N, NE, IP, IRN are described in MC64A/AD. */ +/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */ +/* A(K), K=1..NE, must be set to the value of the entry that */ +/* corresponds to IRN(K). It is not altered. */ +/* All values A(K) must be non-negative. */ +/* IPERM is an INT_T array of length N. On exit, it contains the */ +/* weighted matching: IPERM(I) = 0 or row I is matched to column */ +/* IPERM(I). */ +/* NUM is an INT_T variable. On exit, it contains the cardinality of */ +/* the matching stored in IPERM. */ +/* IW is an INT_T work array of length 5N. */ +/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length 2N. */ +/* On exit, U = D(1:N) contains the dual row variable and */ +/* V = D(N+1:2N) contains the dual column variable. If the matrix */ +/* is structurally nonsingular (NUM = N), the following holds: */ +/* U(I)+V(J) <= A(I,J) if IPERM(I) |= J */ +/* U(I)+V(J) = A(I,J) if IPERM(I) = J */ +/* U(I) = 0 if IPERM(I) = 0 */ +/* V(J) = 0 if there is no I for which IPERM(I) = J */ +/* Local variables */ +/* Local parameters */ +/* External subroutines and/or functions */ +/* EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD */ +/* DOUBLE PRECISION FD05AD */ +/* Set RINF to largest positive real number */ +/* XSL RINF = FD05AD(5) */ + /* Parameter adjustments */ + --d__; + --u; + --l; + --q; + --pr; + --out; + --jperm; + --iperm; + --ip; + --a; + --irn; + + /* Function Body */ + rinf = dlamch_("Overflow"); +/* Initialization */ + *num = 0; + i__1 = *n; + for (k = 1; k <= i__1; ++k) { + u[k] = rinf; + d__[k] = 0.; + iperm[k] = 0; + jperm[k] = 0; + pr[k] = ip[k]; + l[k] = 0; +/* L10: */ + } +/* Initialize U(I) */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; + if (a[k] > u[i__]) { + goto L20; + } + u[i__] = a[k]; + iperm[i__] = j; + l[i__] = k; +L20: + ; + } +/* L30: */ + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + j = iperm[i__]; + if (j == 0) { + goto L40; + } +/* Row I is not empty */ + iperm[i__] = 0; + if (jperm[j] != 0) { + goto L40; + } +/* Assignment of column J to row I */ + ++(*num); + iperm[i__] = j; + jperm[j] = l[i__]; +L40: + ; + } + if (*num == *n) { + goto L1000; + } +/* Scan unassigned columns; improve assignment */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { +/* JPERM(J) ne 0 iff column J is already assigned */ + if (jperm[j] != 0) { + goto L95; + } + k1 = ip[j]; + k2 = ip[j + 1] - 1; +/* Continue only if column J is not empty */ + if (k1 > k2) { + goto L95; + } + vj = rinf; + i__2 = k2; + for (k = k1; k <= i__2; ++k) { + i__ = irn[k]; + di = a[k] - u[i__]; + if (di > vj) { + goto L50; + } + if (di < vj || di == rinf) { + goto L55; + } + if (iperm[i__] != 0 || iperm[i0] == 0) { + goto L50; + } +L55: + vj = di; + i0 = i__; + k0 = k; +L50: + ; + } + d__[j] = vj; + k = k0; + i__ = i0; + if (iperm[i__] == 0) { + goto L90; + } + i__2 = k2; + for (k = k0; k <= i__2; ++k) { + i__ = irn[k]; + if (a[k] - u[i__] > vj) { + goto L60; + } + jj = iperm[i__]; +/* Scan remaining part of assigned column JJ */ + kk1 = pr[jj]; + kk2 = ip[jj + 1] - 1; + if (kk1 > kk2) { + goto L60; + } + i__3 = kk2; + for (kk = kk1; kk <= i__3; ++kk) { + ii = irn[kk]; + if (iperm[ii] > 0) { + goto L70; + } + if (a[kk] - u[ii] <= d__[jj]) { + goto L80; + } +L70: + ; + } + pr[jj] = kk2 + 1; +L60: + ; + } + goto L95; +L80: + jperm[jj] = kk; + iperm[ii] = jj; + pr[jj] = kk + 1; +L90: + ++(*num); + jperm[j] = k; + iperm[i__] = j; + pr[j] = k + 1; +L95: + ; + } + if (*num == *n) { + goto L1000; + } +/* Prepare for main loop */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + d__[i__] = rinf; + l[i__] = 0; +/* L99: */ + } +/* Main loop ... each pass round this loop is similar to Dijkstra's */ +/* algorithm for solving the single source shortest path problem */ + i__1 = *n; + for (jord = 1; jord <= i__1; ++jord) { + if (jperm[jord] != 0) { + goto L100; + } +/* JORD is next unmatched column */ +/* DMIN is the length of shortest path in the tree */ + dmin__ = rinf; + qlen = 0; + low = *n + 1; + up = *n + 1; +/* CSP is the cost of the shortest augmenting path to unassigned row */ +/* IRN(ISP). The corresponding column index is JSP. */ + csp = rinf; +/* Build shortest path tree starting from unassigned column (root) JORD */ + j = jord; + pr[j] = -1; +/* Scan column J */ + i__2 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__2; ++k) { + i__ = irn[k]; + dnew = a[k] - u[i__]; + if (dnew >= csp) { + goto L115; + } + if (iperm[i__] == 0) { + csp = dnew; + isp = k; + jsp = j; + } else { + if (dnew < dmin__) { + dmin__ = dnew; + } + d__[i__] = dnew; + ++qlen; + q[qlen] = k; + } +L115: + ; + } +/* Initialize heap Q and Q2 with rows held in Q(1:QLEN) */ + q0 = qlen; + qlen = 0; + i__2 = q0; + for (kk = 1; kk <= i__2; ++kk) { + k = q[kk]; + i__ = irn[k]; + if (csp <= d__[i__]) { + d__[i__] = rinf; + goto L120; + } + if (d__[i__] <= dmin__) { + --low; + q[low] = i__; + l[i__] = low; + } else { + ++qlen; + l[i__] = qlen; + mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__2); + } +/* Update tree */ + jj = iperm[i__]; + out[jj] = k; + pr[jj] = j; +L120: + ; + } + i__2 = *num; + for (jdum = 1; jdum <= i__2; ++jdum) { +/* If Q2 is empty, extract rows from Q */ + if (low == up) { + if (qlen == 0) { + goto L160; + } + i__ = q[1]; + if (d__[i__] >= csp) { + goto L160; + } + dmin__ = d__[i__]; +L152: + mc64ed_(&qlen, n, &q[1], &d__[1], &l[1], &c__2); + --low; + q[low] = i__; + l[i__] = low; + if (qlen == 0) { + goto L153; + } + i__ = q[1]; + if (d__[i__] > dmin__) { + goto L153; + } + goto L152; + } +/* Q0 is row whose distance D(Q0) to the root is smallest */ +L153: + q0 = q[up - 1]; + dq0 = d__[q0]; +/* Exit loop if path to Q0 is longer than the shortest augmenting path */ + if (dq0 >= csp) { + goto L160; + } + --up; +/* Scan column that matches with row Q0 */ + j = iperm[q0]; + vj = dq0 - a[jperm[j]] + u[q0]; + i__3 = ip[j + 1] - 1; + for (k = ip[j]; k <= i__3; ++k) { + i__ = irn[k]; + if (l[i__] >= up) { + goto L155; + } +/* DNEW is new cost */ + dnew = vj + a[k] - u[i__]; +/* Do not update D(I) if DNEW ge cost of shortest path */ + if (dnew >= csp) { + goto L155; + } + if (iperm[i__] == 0) { +/* Row I is unmatched; update shortest path info */ + csp = dnew; + isp = k; + jsp = j; + } else { +/* Row I is matched; do not update D(I) if DNEW is larger */ + di = d__[i__]; + if (di <= dnew) { + goto L155; + } + if (l[i__] >= low) { + goto L155; + } + d__[i__] = dnew; + if (dnew <= dmin__) { + if (l[i__] != 0) { + mc64fd_(&l[i__], &qlen, n, &q[1], &d__[1], &l[1], + &c__2); + } + --low; + q[low] = i__; + l[i__] = low; + } else { + if (l[i__] == 0) { + ++qlen; + l[i__] = qlen; + } + mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__2); + } +/* Update tree */ + jj = iperm[i__]; + out[jj] = k; + pr[jj] = j; + } +L155: + ; + } +/* L150: */ + } +/* If CSP = RINF, no augmenting path is found */ +L160: + if (csp == rinf) { + goto L190; + } +/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */ + ++(*num); + i__ = irn[isp]; + iperm[i__] = jsp; + jperm[jsp] = isp; + j = jsp; + i__2 = *num; + for (jdum = 1; jdum <= i__2; ++jdum) { + jj = pr[j]; + if (jj == -1) { + goto L180; + } + k = out[j]; + i__ = irn[k]; + iperm[i__] = jj; + jperm[jj] = k; + j = jj; +/* L170: */ + } +/* End of dummy loop; this point is never reached */ +/* Update U for rows in Q(UP:N) */ +L180: + i__2 = *n; + for (kk = up; kk <= i__2; ++kk) { + i__ = q[kk]; + u[i__] = u[i__] + d__[i__] - csp; +/* L185: */ + } +L190: + i__2 = *n; + for (kk = low; kk <= i__2; ++kk) { + i__ = q[kk]; + d__[i__] = rinf; + l[i__] = 0; +/* L191: */ + } + i__2 = qlen; + for (kk = 1; kk <= i__2; ++kk) { + i__ = q[kk]; + d__[i__] = rinf; + l[i__] = 0; +/* L193: */ + } +L100: + ; + } +/* End of main loop */ +/* Set dual column variable in D(1:N) */ +L1000: + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + k = jperm[j]; + if (k != 0) { + d__[j] = a[k] - u[irn[k]]; + } else { + d__[j] = 0.; + } + if (iperm[j] == 0) { + u[j] = 0.; + } +/* L200: */ + } + if (*num == *n) { + goto L1100; + } +/* The matrix is structurally singular, complete IPERM. */ +/* JPERM, OUT are work arrays */ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + jperm[j] = 0; +/* L300: */ + } + k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (iperm[i__] == 0) { + ++k; + out[k] = i__; + } else { + j = iperm[i__]; + jperm[j] = i__; + } +/* L310: */ + } + k = 0; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (jperm[j] != 0) { + goto L320; + } + ++k; + jdum = out[k]; + iperm[jdum] = j; +L320: + ; + } +L1100: + return 0; +} /* mc64wd_ */ + + diff --git a/src/maths/SuperLU/memory.c b/src/maths/SuperLU/memory.c new file mode 100644 index 000000000..f7fdccf44 --- /dev/null +++ b/src/maths/SuperLU/memory.c @@ -0,0 +1,210 @@ +/*! @file memory.c + * \brief Precision-independent memory-related routines + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ */ +/** Precision-independent memory-related routines. + (Shared by [sdcz]memory.c) **/ + +#include + + +#if ( DEBUGlevel>=1 ) /* Debug malloc/free. */ +int superlu_malloc_total = 0; + +#define PAD_FACTOR 2 +#define DWORD (sizeof(double)) /* Be sure it's no smaller than double. */ +/* size_t is usually defined as 'unsigned long' */ + +void *superlu_malloc(size_t size) +{ + char *buf; + + buf = (char *) malloc(size + DWORD); + if ( !buf ) { + printf("superlu_malloc fails: malloc_total %.0f MB, size %ld\n", + superlu_malloc_total*1e-6, size); + ABORT_SuperLU("superlu_malloc: out of memory"); + } + + ((int_t *) buf)[0] = size; +#if 0 + superlu_malloc_total += size + DWORD; +#else + superlu_malloc_total += size; +#endif + return (void *) (buf + DWORD); +} + +void superlu_free(void *addr) +{ + char *p = ((char *) addr) - DWORD; + + if ( !addr ) + ABORT_SuperLU("superlu_free: tried to free NULL pointer"); + + if ( !p ) + ABORT_SuperLU("superlu_free: tried to free NULL+DWORD pointer"); + + { + int_t n = ((int_t *) p)[0]; + + if ( !n ) + ABORT_SuperLU("superlu_free: tried to free a freed pointer"); + *((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */ +#if 0 + superlu_malloc_total -= (n + DWORD); +#else + superlu_malloc_total -= n; +#endif + + if ( superlu_malloc_total < 0 ) + ABORT_SuperLU("superlu_malloc_total went negative!"); + + /*free (addr);*/ + free (p); + } + +} + +#else /* production mode */ + +void *superlu_malloc(size_t size) +{ + void *buf; + buf = (void *) malloc(size); + return (buf); +} + +void superlu_free(void *addr) +{ + free (addr); +} + +#endif + + +/*! \brief Set up pointers for integer working arrays. + */ +void +SetIWork(int m, int n, int panel_size, int *iworkptr, int **segrep, + int **parent, int **xplore, int **repfnz, int **panel_lsub, + int **xprune, int **marker) +{ + *segrep = iworkptr; + *parent = iworkptr + m; + *xplore = *parent + m; + *repfnz = *xplore + m; + *panel_lsub = *repfnz + panel_size * m; + *xprune = *panel_lsub + panel_size * m; + *marker = *xprune + n; + ifill (*repfnz, m * panel_size, EMPTY); + ifill (*panel_lsub, m * panel_size, EMPTY); +} + + +void +copy_mem_int(int howmany, void *old, void *new) +{ + register int i; + int *iold = old; + int *inew = new; + for (i = 0; i < howmany; i++) inew[i] = iold[i]; +} + + +void +user_bcopy(char *src, char *dest, int bytes) +{ + char *s_ptr, *d_ptr; + + s_ptr = src + bytes - 1; + d_ptr = dest + bytes - 1; + for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr; +} + + + +int *intMalloc(int n) +{ + int *buf; + buf = (int *) SUPERLU_MALLOC((size_t) n * sizeof(int)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC fails for buf in intMalloc()"); + } + return (buf); +} + +int *intCalloc(int n) +{ + int *buf; + register int i; + buf = (int *) SUPERLU_MALLOC(n * sizeof(int)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC fails for buf in intCalloc()"); + } + for (i = 0; i < n; ++i) buf[i] = 0; + return (buf); +} + + + +#if 0 +check_expanders() +{ + int p; + printf("Check expanders:\n"); + for (p = 0; p < NO_MEMTYPE; p++) { + printf("type %d, size %d, mem %d\n", + p, expanders[p].size, (int)expanders[p].mem); + } + + return 0; +} + + +StackInfo() +{ + printf("Stack: size %d, used %d, top1 %d, top2 %d\n", + stack.size, stack.used, stack.top1, stack.top2); + return 0; +} + + + +PrintStack(char *msg, GlobalLU_t *Glu) +{ + int i; + int *xlsub, *lsub, *xusub, *usub; + + xlsub = Glu->xlsub; + lsub = Glu->lsub; + xusub = Glu->xusub; + usub = Glu->usub; + + printf("%s\n", msg); + +/* printf("\nUCOL: "); + for (i = 0; i < xusub[ndim]; ++i) + printf("%f ", ucol[i]); + + printf("\nLSUB: "); + for (i = 0; i < xlsub[ndim]; ++i) + printf("%d ", lsub[i]); + + printf("\nUSUB: "); + for (i = 0; i < xusub[ndim]; ++i) + printf("%d ", usub[i]); + + printf("\n");*/ + return 0; +} +#endif + + + diff --git a/src/maths/SuperLU/mmd.c b/src/maths/SuperLU/mmd.c new file mode 100644 index 000000000..05f26ce09 --- /dev/null +++ b/src/maths/SuperLU/mmd.c @@ -0,0 +1,1012 @@ + +typedef int shortint; + +/* *************************************************************** */ +/* *************************************************************** */ +/* **** GENMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE **** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE IMPLEMENTS THE MINIMUM DEGREE */ +/* ALGORITHM. IT MAKES USE OF THE IMPLICIT REPRESENTATION */ +/* OF ELIMINATION GRAPHS BY QUOTIENT GRAPHS, AND THE */ +/* NOTION OF INDISTINGUISHABLE NODES. IT ALSO IMPLEMENTS */ +/* THE MODIFICATIONS BY MULTIPLE ELIMINATION AND MINIMUM */ +/* EXTERNAL DEGREE. */ +/* --------------------------------------------- */ +/* CAUTION - THE ADJACENCY VECTOR ADJNCY WILL BE */ +/* DESTROYED. */ +/* --------------------------------------------- */ + +/* INPUT PARAMETERS - */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. */ +/* DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */ +/* MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER */ +/* (ANY SMALLER ESTIMATE WILL DO) FOR MARKING */ +/* NODES. */ + +/* OUTPUT PARAMETERS - */ +/* PERM - THE MINIMUM DEGREE ORDERING. */ +/* INVP - THE INVERSE OF PERM. */ +/* NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO */ +/* SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. */ + +/* WORKING PARAMETERS - */ +/* DHEAD - VECTOR FOR HEAD OF DEGREE LISTS. */ +/* INVP - USED TEMPORARILY FOR DEGREE FORWARD LINK. */ +/* PERM - USED TEMPORARILY FOR DEGREE BACKWARD LINK. */ +/* QSIZE - VECTOR FOR SIZE OF SUPERNODES. */ +/* LLIST - VECTOR FOR TEMPORARY LINKED LISTS. */ +/* MARKER - A TEMPORARY MARKER VECTOR. */ + +/* PROGRAM SUBROUTINES - */ +/* MMDELM, MMDINT, MMDNUM, MMDUPD. */ + +/* *************************************************************** */ + +/* Subroutine */ int genmmd_(int *neqns, int *xadj, shortint *adjncy, + shortint *invp, shortint *perm, int *delta, shortint *dhead, + shortint *qsize, shortint *llist, shortint *marker, int *maxint, + int *nofsub) +{ + /* System generated locals */ + int i__1; + + /* Local variables */ + static int mdeg, ehead, i, mdlmt, mdnode; + extern /* Subroutine */ int mmdelm_(int *, int *, shortint *, + shortint *, shortint *, shortint *, shortint *, shortint *, + shortint *, int *, int *), mmdupd_(int *, int *, + int *, shortint *, int *, int *, shortint *, shortint + *, shortint *, shortint *, shortint *, shortint *, int *, + int *), mmdint_(int *, int *, shortint *, shortint *, + shortint *, shortint *, shortint *, shortint *, shortint *), + mmdnum_(int *, shortint *, shortint *, shortint *); + static int nextmd, tag, num; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dhead; + --perm; + --invp; + --adjncy; + --xadj; + + /* Function Body */ + if (*neqns <= 0) { + return 0; + } + +/* ------------------------------------------------ */ +/* INITIALIZATION FOR THE MINIMUM DEGREE ALGORITHM. */ +/* ------------------------------------------------ */ + *nofsub = 0; + mmdint_(neqns, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], & + qsize[1], &llist[1], &marker[1]); + +/* ---------------------------------------------- */ +/* NUM COUNTS THE NUMBER OF ORDERED NODES PLUS 1. */ +/* ---------------------------------------------- */ + num = 1; + +/* ----------------------------- */ +/* ELIMINATE ALL ISOLATED NODES. */ +/* ----------------------------- */ + nextmd = dhead[1]; +L100: + if (nextmd <= 0) { + goto L200; + } + mdnode = nextmd; + nextmd = invp[mdnode]; + marker[mdnode] = *maxint; + invp[mdnode] = -num; + ++num; + goto L100; + +L200: +/* ---------------------------------------- */ +/* SEARCH FOR NODE OF THE MINIMUM DEGREE. */ +/* MDEG IS THE CURRENT MINIMUM DEGREE; */ +/* TAG IS USED TO FACILITATE MARKING NODES. */ +/* ---------------------------------------- */ + if (num > *neqns) { + goto L1000; + } + tag = 1; + dhead[1] = 0; + mdeg = 2; +L300: + if (dhead[mdeg] > 0) { + goto L400; + } + ++mdeg; + goto L300; +L400: +/* ------------------------------------------------- */ +/* USE VALUE OF DELTA TO SET UP MDLMT, WHICH GOVERNS */ +/* WHEN A DEGREE UPDATE IS TO BE PERFORMED. */ +/* ------------------------------------------------- */ + mdlmt = mdeg + *delta; + ehead = 0; + +L500: + mdnode = dhead[mdeg]; + if (mdnode > 0) { + goto L600; + } + ++mdeg; + if (mdeg > mdlmt) { + goto L900; + } + goto L500; +L600: +/* ---------------------------------------- */ +/* REMOVE MDNODE FROM THE DEGREE STRUCTURE. */ +/* ---------------------------------------- */ + nextmd = invp[mdnode]; + dhead[mdeg] = nextmd; + if (nextmd > 0) { + perm[nextmd] = -mdeg; + } + invp[mdnode] = -num; + *nofsub = *nofsub + mdeg + qsize[mdnode] - 2; + if (num + qsize[mdnode] > *neqns) { + goto L1000; + } +/* ---------------------------------------------- */ +/* ELIMINATE MDNODE AND PERFORM QUOTIENT GRAPH */ +/* TRANSFORMATION. RESET TAG VALUE IF NECESSARY. */ +/* ---------------------------------------------- */ + ++tag; + if (tag < *maxint) { + goto L800; + } + tag = 1; + i__1 = *neqns; + for (i = 1; i <= i__1; ++i) { + if (marker[i] < *maxint) { + marker[i] = 0; + } +/* L700: */ + } +L800: + mmdelm_(&mdnode, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], & + qsize[1], &llist[1], &marker[1], maxint, &tag); + num += qsize[mdnode]; + llist[mdnode] = ehead; + ehead = mdnode; + if (*delta >= 0) { + goto L500; + } +L900: +/* ------------------------------------------- */ +/* UPDATE DEGREES OF THE NODES INVOLVED IN THE */ +/* MINIMUM DEGREE NODES ELIMINATION. */ +/* ------------------------------------------- */ + if (num > *neqns) { + goto L1000; + } + mmdupd_(&ehead, neqns, &xadj[1], &adjncy[1], delta, &mdeg, &dhead[1], & + invp[1], &perm[1], &qsize[1], &llist[1], &marker[1], maxint, &tag) + ; + goto L300; + +L1000: + mmdnum_(neqns, &perm[1], &invp[1], &qsize[1]); + return 0; + +} /* genmmd_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* *** MMDINT ..... MULT MINIMUM DEGREE INITIALIZATION *** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE PERFORMS INITIALIZATION FOR THE */ +/* MULTIPLE ELIMINATION VERSION OF THE MINIMUM DEGREE */ +/* ALGORITHM. */ + +/* INPUT PARAMETERS - */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */ + +/* OUTPUT PARAMETERS - */ +/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ +/* QSIZE - SIZE OF SUPERNODE (INITIALIZED TO ONE). */ +/* LLIST - LINKED LIST. */ +/* MARKER - MARKER VECTOR. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdint_(int *neqns, int *xadj, shortint *adjncy, + shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, + shortint *llist, shortint *marker) +{ + /* System generated locals */ + int i__1; + + /* Local variables */ + static int ndeg, node, fnode; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dbakw; + --dforw; + --dhead; + --adjncy; + --xadj; + + /* Function Body */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + dhead[node] = 0; + qsize[node] = 1; + marker[node] = 0; + llist[node] = 0; +/* L100: */ + } +/* ------------------------------------------ */ +/* INITIALIZE THE DEGREE DOUBLY LINKED LISTS. */ +/* ------------------------------------------ */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + ndeg = xadj[node + 1] - xadj[node] + 1; + fnode = dhead[ndeg]; + dforw[node] = fnode; + dhead[ndeg] = node; + if (fnode > 0) { + dbakw[fnode] = node; + } + dbakw[node] = -ndeg; +/* L200: */ + } + return 0; + +} /* mmdint_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* ** MMDELM ..... MULTIPLE MINIMUM DEGREE ELIMINATION *** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE ELIMINATES THE NODE MDNODE OF */ +/* MINIMUM DEGREE FROM THE ADJACENCY STRUCTURE, WHICH */ +/* IS STORED IN THE QUOTIENT GRAPH FORMAT. IT ALSO */ +/* TRANSFORMS THE QUOTIENT GRAPH REPRESENTATION OF THE */ +/* ELIMINATION GRAPH. */ + +/* INPUT PARAMETERS - */ +/* MDNODE - NODE OF MINIMUM DEGREE. */ +/* MAXINT - ESTIMATE OF MAXIMUM REPRESENTABLE (SHORT) */ +/* INT. */ +/* TAG - TAG VALUE. */ + +/* UPDATED PARAMETERS - */ +/* (XADJ,ADJNCY) - UPDATED ADJACENCY STRUCTURE. */ +/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ +/* QSIZE - SIZE OF SUPERNODE. */ +/* MARKER - MARKER VECTOR. */ +/* LLIST - TEMPORARY LINKED LIST OF ELIMINATED NABORS. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdelm_(int *mdnode, int *xadj, shortint *adjncy, + shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, + shortint *llist, shortint *marker, int *maxint, int *tag) +{ + /* System generated locals */ + int i__1, i__2; + + /* Local variables */ + static int node, link, rloc, rlmt, i, j, nabor, rnode, elmnt, xqnbr, + istop, jstop, istrt, jstrt, nxnode, pvnode, nqnbrs, npv; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + +/* ----------------------------------------------- */ +/* FIND REACHABLE SET AND PLACE IN DATA STRUCTURE. */ +/* ----------------------------------------------- */ + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dbakw; + --dforw; + --dhead; + --adjncy; + --xadj; + + /* Function Body */ + marker[*mdnode] = *tag; + istrt = xadj[*mdnode]; + istop = xadj[*mdnode + 1] - 1; +/* ------------------------------------------------------- */ +/* ELMNT POINTS TO THE BEGINNING OF THE LIST OF ELIMINATED */ +/* NABORS OF MDNODE, AND RLOC GIVES THE STORAGE LOCATION */ +/* FOR THE NEXT REACHABLE NODE. */ +/* ------------------------------------------------------- */ + elmnt = 0; + rloc = istrt; + rlmt = istop; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + nabor = adjncy[i]; + if (nabor == 0) { + goto L300; + } + if (marker[nabor] >= *tag) { + goto L200; + } + marker[nabor] = *tag; + if (dforw[nabor] < 0) { + goto L100; + } + adjncy[rloc] = nabor; + ++rloc; + goto L200; +L100: + llist[nabor] = elmnt; + elmnt = nabor; +L200: + ; + } +L300: +/* ----------------------------------------------------- */ +/* MERGE WITH REACHABLE NODES FROM GENERALIZED ELEMENTS. */ +/* ----------------------------------------------------- */ + if (elmnt <= 0) { + goto L1000; + } + adjncy[rlmt] = -elmnt; + link = elmnt; +L400: + jstrt = xadj[link]; + jstop = xadj[link + 1] - 1; + i__1 = jstop; + for (j = jstrt; j <= i__1; ++j) { + node = adjncy[j]; + link = -node; + if (node < 0) { + goto L400; + } else if (node == 0) { + goto L900; + } else { + goto L500; + } +L500: + if (marker[node] >= *tag || dforw[node] < 0) { + goto L800; + } + marker[node] = *tag; +/* --------------------------------- */ +/* USE STORAGE FROM ELIMINATED NODES */ +/* IF NECESSARY. */ +/* --------------------------------- */ +L600: + if (rloc < rlmt) { + goto L700; + } + link = -adjncy[rlmt]; + rloc = xadj[link]; + rlmt = xadj[link + 1] - 1; + goto L600; +L700: + adjncy[rloc] = node; + ++rloc; +L800: + ; + } +L900: + elmnt = llist[elmnt]; + goto L300; +L1000: + if (rloc <= rlmt) { + adjncy[rloc] = 0; + } +/* -------------------------------------------------------- */ +/* FOR EACH NODE IN THE REACHABLE SET, DO THE FOLLOWING ... */ +/* -------------------------------------------------------- */ + link = *mdnode; +L1100: + istrt = xadj[link]; + istop = xadj[link + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + rnode = adjncy[i]; + link = -rnode; + if (rnode < 0) { + goto L1100; + } else if (rnode == 0) { + goto L1800; + } else { + goto L1200; + } +L1200: +/* -------------------------------------------- */ +/* IF RNODE IS IN THE DEGREE LIST STRUCTURE ... */ +/* -------------------------------------------- */ + pvnode = dbakw[rnode]; + if (pvnode == 0 || pvnode == -(*maxint)) { + goto L1300; + } +/* ------------------------------------- */ +/* THEN REMOVE RNODE FROM THE STRUCTURE. */ +/* ------------------------------------- */ + nxnode = dforw[rnode]; + if (nxnode > 0) { + dbakw[nxnode] = pvnode; + } + if (pvnode > 0) { + dforw[pvnode] = nxnode; + } + npv = -pvnode; + if (pvnode < 0) { + dhead[npv] = nxnode; + } +L1300: +/* ---------------------------------------- */ +/* PURGE INACTIVE QUOTIENT NABORS OF RNODE. */ +/* ---------------------------------------- */ + jstrt = xadj[rnode]; + jstop = xadj[rnode + 1] - 1; + xqnbr = jstrt; + i__2 = jstop; + for (j = jstrt; j <= i__2; ++j) { + nabor = adjncy[j]; + if (nabor == 0) { + goto L1500; + } + if (marker[nabor] >= *tag) { + goto L1400; + } + adjncy[xqnbr] = nabor; + ++xqnbr; +L1400: + ; + } +L1500: +/* ---------------------------------------- */ +/* IF NO ACTIVE NABOR AFTER THE PURGING ... */ +/* ---------------------------------------- */ + nqnbrs = xqnbr - jstrt; + if (nqnbrs > 0) { + goto L1600; + } +/* ----------------------------- */ +/* THEN MERGE RNODE WITH MDNODE. */ +/* ----------------------------- */ + qsize[*mdnode] += qsize[rnode]; + qsize[rnode] = 0; + marker[rnode] = *maxint; + dforw[rnode] = -(*mdnode); + dbakw[rnode] = -(*maxint); + goto L1700; +L1600: +/* -------------------------------------- */ +/* ELSE FLAG RNODE FOR DEGREE UPDATE, AND */ +/* ADD MDNODE AS A NABOR OF RNODE. */ +/* -------------------------------------- */ + dforw[rnode] = nqnbrs + 1; + dbakw[rnode] = 0; + adjncy[xqnbr] = *mdnode; + ++xqnbr; + if (xqnbr <= jstop) { + adjncy[xqnbr] = 0; + } + +L1700: + ; + } +L1800: + return 0; + +} /* mmdelm_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* ***** MMDUPD ..... MULTIPLE MINIMUM DEGREE UPDATE ***** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE UPDATES THE DEGREES OF NODES */ +/* AFTER A MULTIPLE ELIMINATION STEP. */ + +/* INPUT PARAMETERS - */ +/* EHEAD - THE BEGINNING OF THE LIST OF ELIMINATED */ +/* NODES (I.E., NEWLY FORMED ELEMENTS). */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */ +/* DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */ +/* MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) */ +/* INTEGER. */ + +/* UPDATED PARAMETERS - */ +/* MDEG - NEW MINIMUM DEGREE AFTER DEGREE UPDATE. */ +/* (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */ +/* QSIZE - SIZE OF SUPERNODE. */ +/* LLIST - WORKING LINKED LIST. */ +/* MARKER - MARKER VECTOR FOR DEGREE UPDATE. */ +/* TAG - TAG VALUE. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdupd_(int *ehead, int *neqns, int *xadj, + shortint *adjncy, int *delta, int *mdeg, shortint *dhead, + shortint *dforw, shortint *dbakw, shortint *qsize, shortint *llist, + shortint *marker, int *maxint, int *tag) +{ + /* System generated locals */ + int i__1, i__2; + + /* Local variables */ + static int node, mtag, link, mdeg0, i, j, enode, fnode, nabor, elmnt, + istop, jstop, q2head, istrt, jstrt, qxhead, iq2, deg, deg0; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --marker; + --llist; + --qsize; + --dbakw; + --dforw; + --dhead; + --adjncy; + --xadj; + + /* Function Body */ + mdeg0 = *mdeg + *delta; + elmnt = *ehead; +L100: +/* ------------------------------------------------------- */ +/* FOR EACH OF THE NEWLY FORMED ELEMENT, DO THE FOLLOWING. */ +/* (RESET TAG VALUE IF NECESSARY.) */ +/* ------------------------------------------------------- */ + if (elmnt <= 0) { + return 0; + } + mtag = *tag + mdeg0; + if (mtag < *maxint) { + goto L300; + } + *tag = 1; + i__1 = *neqns; + for (i = 1; i <= i__1; ++i) { + if (marker[i] < *maxint) { + marker[i] = 0; + } +/* L200: */ + } + mtag = *tag + mdeg0; +L300: +/* --------------------------------------------- */ +/* CREATE TWO LINKED LISTS FROM NODES ASSOCIATED */ +/* WITH ELMNT: ONE WITH TWO NABORS (Q2HEAD) IN */ +/* ADJACENCY STRUCTURE, AND THE OTHER WITH MORE */ +/* THAN TWO NABORS (QXHEAD). ALSO COMPUTE DEG0, */ +/* NUMBER OF NODES IN THIS ELEMENT. */ +/* --------------------------------------------- */ + q2head = 0; + qxhead = 0; + deg0 = 0; + link = elmnt; +L400: + istrt = xadj[link]; + istop = xadj[link + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + enode = adjncy[i]; + link = -enode; + if (enode < 0) { + goto L400; + } else if (enode == 0) { + goto L800; + } else { + goto L500; + } + +L500: + if (qsize[enode] == 0) { + goto L700; + } + deg0 += qsize[enode]; + marker[enode] = mtag; +/* ---------------------------------- */ +/* IF ENODE REQUIRES A DEGREE UPDATE, */ +/* THEN DO THE FOLLOWING. */ +/* ---------------------------------- */ + if (dbakw[enode] != 0) { + goto L700; + } +/* --------------------------------------- +*/ +/* PLACE EITHER IN QXHEAD OR Q2HEAD LISTS. +*/ +/* --------------------------------------- +*/ + if (dforw[enode] == 2) { + goto L600; + } + llist[enode] = qxhead; + qxhead = enode; + goto L700; +L600: + llist[enode] = q2head; + q2head = enode; +L700: + ; + } +L800: +/* -------------------------------------------- */ +/* FOR EACH ENODE IN Q2 LIST, DO THE FOLLOWING. */ +/* -------------------------------------------- */ + enode = q2head; + iq2 = 1; +L900: + if (enode <= 0) { + goto L1500; + } + if (dbakw[enode] != 0) { + goto L2200; + } + ++(*tag); + deg = deg0; +/* ------------------------------------------ */ +/* IDENTIFY THE OTHER ADJACENT ELEMENT NABOR. */ +/* ------------------------------------------ */ + istrt = xadj[enode]; + nabor = adjncy[istrt]; + if (nabor == elmnt) { + nabor = adjncy[istrt + 1]; + } +/* ------------------------------------------------ */ +/* IF NABOR IS UNELIMINATED, INCREASE DEGREE COUNT. */ +/* ------------------------------------------------ */ + link = nabor; + if (dforw[nabor] < 0) { + goto L1000; + } + deg += qsize[nabor]; + goto L2100; +L1000: +/* -------------------------------------------- */ +/* OTHERWISE, FOR EACH NODE IN THE 2ND ELEMENT, */ +/* DO THE FOLLOWING. */ +/* -------------------------------------------- */ + istrt = xadj[link]; + istop = xadj[link + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + node = adjncy[i]; + link = -node; + if (node == enode) { + goto L1400; + } + if (node < 0) { + goto L1000; + } else if (node == 0) { + goto L2100; + } else { + goto L1100; + } + +L1100: + if (qsize[node] == 0) { + goto L1400; + } + if (marker[node] >= *tag) { + goto L1200; + } +/* ----------------------------------- +-- */ +/* CASE WHEN NODE IS NOT YET CONSIDERED +. */ +/* ----------------------------------- +-- */ + marker[node] = *tag; + deg += qsize[node]; + goto L1400; +L1200: +/* ---------------------------------------- + */ +/* CASE WHEN NODE IS INDISTINGUISHABLE FROM + */ +/* ENODE. MERGE THEM INTO A NEW SUPERNODE. + */ +/* ---------------------------------------- + */ + if (dbakw[node] != 0) { + goto L1400; + } + if (dforw[node] != 2) { + goto L1300; + } + qsize[enode] += qsize[node]; + qsize[node] = 0; + marker[node] = *maxint; + dforw[node] = -enode; + dbakw[node] = -(*maxint); + goto L1400; +L1300: +/* -------------------------------------- +*/ +/* CASE WHEN NODE IS OUTMATCHED BY ENODE. +*/ +/* -------------------------------------- +*/ + if (dbakw[node] == 0) { + dbakw[node] = -(*maxint); + } +L1400: + ; + } + goto L2100; +L1500: +/* ------------------------------------------------ */ +/* FOR EACH ENODE IN THE QX LIST, DO THE FOLLOWING. */ +/* ------------------------------------------------ */ + enode = qxhead; + iq2 = 0; +L1600: + if (enode <= 0) { + goto L2300; + } + if (dbakw[enode] != 0) { + goto L2200; + } + ++(*tag); + deg = deg0; +/* --------------------------------- */ +/* FOR EACH UNMARKED NABOR OF ENODE, */ +/* DO THE FOLLOWING. */ +/* --------------------------------- */ + istrt = xadj[enode]; + istop = xadj[enode + 1] - 1; + i__1 = istop; + for (i = istrt; i <= i__1; ++i) { + nabor = adjncy[i]; + if (nabor == 0) { + goto L2100; + } + if (marker[nabor] >= *tag) { + goto L2000; + } + marker[nabor] = *tag; + link = nabor; +/* ------------------------------ */ +/* IF UNELIMINATED, INCLUDE IT IN */ +/* DEG COUNT. */ +/* ------------------------------ */ + if (dforw[nabor] < 0) { + goto L1700; + } + deg += qsize[nabor]; + goto L2000; +L1700: +/* ------------------------------- +*/ +/* IF ELIMINATED, INCLUDE UNMARKED +*/ +/* NODES IN THIS ELEMENT INTO THE +*/ +/* DEGREE COUNT. */ +/* ------------------------------- +*/ + jstrt = xadj[link]; + jstop = xadj[link + 1] - 1; + i__2 = jstop; + for (j = jstrt; j <= i__2; ++j) { + node = adjncy[j]; + link = -node; + if (node < 0) { + goto L1700; + } else if (node == 0) { + goto L2000; + } else { + goto L1800; + } + +L1800: + if (marker[node] >= *tag) { + goto L1900; + } + marker[node] = *tag; + deg += qsize[node]; +L1900: + ; + } +L2000: + ; + } +L2100: +/* ------------------------------------------- */ +/* UPDATE EXTERNAL DEGREE OF ENODE IN DEGREE */ +/* STRUCTURE, AND MDEG (MIN DEG) IF NECESSARY. */ +/* ------------------------------------------- */ + deg = deg - qsize[enode] + 1; + fnode = dhead[deg]; + dforw[enode] = fnode; + dbakw[enode] = -deg; + if (fnode > 0) { + dbakw[fnode] = enode; + } + dhead[deg] = enode; + if (deg < *mdeg) { + *mdeg = deg; + } +L2200: +/* ---------------------------------- */ +/* GET NEXT ENODE IN CURRENT ELEMENT. */ +/* ---------------------------------- */ + enode = llist[enode]; + if (iq2 == 1) { + goto L900; + } + goto L1600; +L2300: +/* ----------------------------- */ +/* GET NEXT ELEMENT IN THE LIST. */ +/* ----------------------------- */ + *tag = mtag; + elmnt = llist[elmnt]; + goto L100; + +} /* mmdupd_ */ + +/* *************************************************************** */ +/* *************************************************************** */ +/* ***** MMDNUM ..... MULTI MINIMUM DEGREE NUMBERING ***** */ +/* *************************************************************** */ +/* *************************************************************** */ + +/* AUTHOR - JOSEPH W.H. LIU */ +/* DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */ + +/* PURPOSE - THIS ROUTINE PERFORMS THE FINAL STEP IN */ +/* PRODUCING THE PERMUTATION AND INVERSE PERMUTATION */ +/* VECTORS IN THE MULTIPLE ELIMINATION VERSION OF THE */ +/* MINIMUM DEGREE ORDERING ALGORITHM. */ + +/* INPUT PARAMETERS - */ +/* NEQNS - NUMBER OF EQUATIONS. */ +/* QSIZE - SIZE OF SUPERNODES AT ELIMINATION. */ + +/* UPDATED PARAMETERS - */ +/* INVP - INVERSE PERMUTATION VECTOR. ON INPUT, */ +/* IF QSIZE(NODE)=0, THEN NODE HAS BEEN MERGED */ +/* INTO THE NODE -INVP(NODE); OTHERWISE, */ +/* -INVP(NODE) IS ITS INVERSE LABELLING. */ + +/* OUTPUT PARAMETERS - */ +/* PERM - THE PERMUTATION VECTOR. */ + +/* *************************************************************** */ + +/* Subroutine */ int mmdnum_(int *neqns, shortint *perm, shortint *invp, + shortint *qsize) +{ + /* System generated locals */ + int i__1; + + /* Local variables */ + static int node, root, nextf, father, nqsize, num; + + +/* *************************************************************** */ + + +/* *************************************************************** */ + + /* Parameter adjustments */ + --qsize; + --invp; + --perm; + + /* Function Body */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + nqsize = qsize[node]; + if (nqsize <= 0) { + perm[node] = invp[node]; + } + if (nqsize > 0) { + perm[node] = -invp[node]; + } +/* L100: */ + } +/* ------------------------------------------------------ */ +/* FOR EACH NODE WHICH HAS BEEN MERGED, DO THE FOLLOWING. */ +/* ------------------------------------------------------ */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + if (perm[node] > 0) { + goto L500; + } +/* ----------------------------------------- */ +/* TRACE THE MERGED TREE UNTIL ONE WHICH HAS */ +/* NOT BEEN MERGED, CALL IT ROOT. */ +/* ----------------------------------------- */ + father = node; +L200: + if (perm[father] > 0) { + goto L300; + } + father = -perm[father]; + goto L200; +L300: +/* ----------------------- */ +/* NUMBER NODE AFTER ROOT. */ +/* ----------------------- */ + root = father; + num = perm[root] + 1; + invp[node] = -num; + perm[root] = num; +/* ------------------------ */ +/* SHORTEN THE MERGED TREE. */ +/* ------------------------ */ + father = node; +L400: + nextf = -perm[father]; + if (nextf <= 0) { + goto L500; + } + perm[father] = -root; + father = nextf; + goto L400; +L500: + ; + } +/* ---------------------- */ +/* READY TO COMPUTE PERM. */ +/* ---------------------- */ + i__1 = *neqns; + for (node = 1; node <= i__1; ++node) { + num = -invp[node]; + invp[node] = num; + perm[num] = node; +/* L600: */ + } + return 0; + +} /* mmdnum_ */ + diff --git a/src/maths/SuperLU/qselect.c b/src/maths/SuperLU/qselect.c new file mode 100644 index 000000000..bb04dd44c --- /dev/null +++ b/src/maths/SuperLU/qselect.c @@ -0,0 +1,74 @@ + +/*! @file qselect.c + * \brief Quickselect: returns the k-th (zero-based) largest value in A[]. + * + *
+ * -- SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Laboratory.
+ * November, 2010
+ * 
+ */ + +#include + +double dqselect(int n, double A[], int k) +{ + register int i, j, p; + register double val; + + k = SUPERLU_MAX(k, 0); + k = SUPERLU_MIN(k, n - 1); + while (n > 1) + { + i = 0; j = n-1; + p = j; val = A[p]; + while (i < j) + { + for (; A[i] >= val && i < p; i++); + if (A[i] < val) { A[p] = A[i]; p = i; } + for (; A[j] <= val && j > p; j--); + if (A[j] > val) { A[p] = A[j]; p = j; } + } + A[p] = val; + if (p == k) return val; + else if (p > k) n = p; + else + { + p++; + n -= p; A += p; k -= p; + } + } + + return A[0]; +} + +float sqselect(int n, float A[], int k) +{ + register int i, j, p; + register float val; + + k = SUPERLU_MAX(k, 0); + k = SUPERLU_MIN(k, n - 1); + while (n > 1) + { + i = 0; j = n-1; + p = j; val = A[p]; + while (i < j) + { + for (; A[i] >= val && i < p; i++); + if (A[i] < val) { A[p] = A[i]; p = i; } + for (; A[j] <= val && j > p; j--); + if (A[j] > val) { A[p] = A[j]; p = j; } + } + A[p] = val; + if (p == k) return val; + else if (p > k) n = p; + else + { + p++; + n -= p; A += p; k -= p; + } + } + + return A[0]; +} diff --git a/src/maths/SuperLU/relax_snode.c b/src/maths/SuperLU/relax_snode.c new file mode 100644 index 000000000..f9b06bcbd --- /dev/null +++ b/src/maths/SuperLU/relax_snode.c @@ -0,0 +1,75 @@ +/*! @file relax_snode.c + * \brief Identify initial relaxed supernodes + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + +#include +/*! \brief + * + *
+ * Purpose
+ * =======
+ *    relax_snode() - Identify the initial relaxed supernodes, assuming that 
+ *    the matrix has been reordered according to the postorder of the etree.
+ * 
+ */ +void +relax_snode ( + const int n, + int *et, /* column elimination tree */ + const int relax_columns, /* max no of columns allowed in a + relaxed snode */ + int *descendants, /* no of descendants of each node + in the etree */ + int *relax_end /* last column in a supernode */ + ) +{ + + register int j, parent; + register int snode_start; /* beginning of a snode */ + + ifill (relax_end, n, EMPTY); + for (j = 0; j < n; j++) descendants[j] = 0; + + /* Compute the number of descendants of each node in the etree */ + for (j = 0; j < n; j++) { + parent = et[j]; + if ( parent != n ) /* not the dummy root */ + descendants[parent] += descendants[j] + 1; + } + + /* Identify the relaxed supernodes by postorder traversal of the etree. */ + for (j = 0; j < n; ) { + parent = et[j]; + snode_start = j; + while ( parent != n && descendants[parent] < relax_columns ) { + j = parent; + parent = et[j]; + } + /* Found a supernode with j being the last column. */ + relax_end[snode_start] = j; /* Last column is recorded */ + j++; + /* Search for a new leaf */ + while ( descendants[j] != 0 && j < n ) j++; + } + + /*printf("No of relaxed snodes: %d; relaxed columns: %d\n", + nsuper, no_relaxed_col); */ +} diff --git a/src/maths/SuperLU/sp_coletree.c b/src/maths/SuperLU/sp_coletree.c new file mode 100644 index 000000000..d02dfdd6d --- /dev/null +++ b/src/maths/SuperLU/sp_coletree.c @@ -0,0 +1,419 @@ +/*! @file sp_coletree.c + * \brief Tree layout and computation routines + * + *
+ * -- SuperLU routine (version 3.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * August 1, 2008
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+*/ + +/* Elimination tree computation and layout routines */ + +#include +#include +#include + +/* + * Implementation of disjoint set union routines. + * Elements are integers in 0..n-1, and the + * names of the sets themselves are of type int. + * + * Calls are: + * initialize_disjoint_sets (n) initial call. + * s = make_set (i) returns a set containing only i. + * s = link (t, u) returns s = t union u, destroying t and u. + * s = find (i) return name of set containing i. + * finalize_disjoint_sets final call. + * + * This implementation uses path compression but not weighted union. + * See Tarjan's book for details. + * John Gilbert, CMI, 1987. + * + * Implemented path-halving by XSL 07/05/95. + */ + + +static +int *mxCallocInt(int n) +{ + register int i; + int *buf; + + buf = (int *) SUPERLU_MALLOC( n * sizeof(int) ); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC fails for buf in mxCallocInt()"); + } + for (i = 0; i < n; i++) buf[i] = 0; + return (buf); +} + +static +void initialize_disjoint_sets ( + int n, + int **pp + ) +{ + (*pp) = mxCallocInt(n); +} + + +static +int make_set ( + int i, + int *pp + ) +{ + pp[i] = i; + return i; +} + + +static +int link ( + int s, + int t, + int *pp + ) +{ + pp[s] = t; + return t; +} + + +/* PATH HALVING */ +static +int find ( + int i, + int *pp + ) +{ + register int p, gp; + + p = pp[i]; + gp = pp[p]; + while (gp != p) { + pp[i] = gp; + i = gp; + p = pp[i]; + gp = pp[p]; + } + return (p); +} + +#if 0 +/* PATH COMPRESSION */ +static +int find ( + int i + ) +{ + if (pp[i] != i) + pp[i] = find (pp[i]); + return pp[i]; +} +#endif + +static +void finalize_disjoint_sets ( + int *pp + ) +{ + SUPERLU_FREE(pp); +} + + +/* + * Find the elimination tree for A'*A. + * This uses something similar to Liu's algorithm. + * It runs in time O(nz(A)*log n) and does not form A'*A. + * + * Input: + * Sparse matrix A. Numeric values are ignored, so any + * explicit zeros are treated as nonzero. + * Output: + * Integer array of parents representing the elimination + * tree of the symbolic product A'*A. Each vertex is a + * column of A, and nc means a root of the elimination forest. + * + * John R. Gilbert, Xerox, 10 Dec 1990 + * Based on code by JRG dated 1987, 1988, and 1990. + */ + +/* + * Nonsymmetric elimination tree + */ +int +sp_coletree( + int *acolst, int *acolend, /* column start and end past 1 */ + int *arow, /* row indices of A */ + int nr, int nc, /* dimension of A */ + int *parent /* parent in elim tree */ + ) +{ + int *root; /* root of subtee of etree */ + int *firstcol; /* first nonzero col in each row*/ + int rset, cset; + int row, col; + int rroot; + int p; + int *pp; + + root = mxCallocInt (nc); + initialize_disjoint_sets (nc, &pp); + + /* Compute firstcol[row] = first nonzero column in row */ + + firstcol = mxCallocInt (nr); + for (row = 0; row < nr; firstcol[row++] = nc); + for (col = 0; col < nc; col++) + for (p = acolst[col]; p < acolend[col]; p++) { + row = arow[p]; + firstcol[row] = SUPERLU_MIN(firstcol[row], col); + } + + /* Compute etree by Liu's algorithm for symmetric matrices, + except use (firstcol[r],c) in place of an edge (r,c) of A. + Thus each row clique in A'*A is replaced by a star + centered at its first vertex, which has the same fill. */ + + for (col = 0; col < nc; col++) { + cset = make_set (col, pp); + root[cset] = col; + parent[col] = nc; /* Matlab */ + for (p = acolst[col]; p < acolend[col]; p++) { + row = firstcol[arow[p]]; + if (row >= col) continue; + rset = find (row, pp); + rroot = root[rset]; + if (rroot != col) { + parent[rroot] = col; + cset = link (cset, rset, pp); + root[cset] = col; + } + } + } + + SUPERLU_FREE (root); + SUPERLU_FREE (firstcol); + finalize_disjoint_sets (pp); + return 0; +} + +/* + * q = TreePostorder (n, p); + * + * Postorder a tree. + * Input: + * p is a vector of parent pointers for a forest whose + * vertices are the integers 0 to n-1; p[root]==n. + * Output: + * q is a vector indexed by 0..n-1 such that q[i] is the + * i-th vertex in a postorder numbering of the tree. + * + * ( 2/7/95 modified by X.Li: + * q is a vector indexed by 0:n-1 such that vertex i is the + * q[i]-th vertex in a postorder numbering of the tree. + * That is, this is the inverse of the previous q. ) + * + * In the child structure, lower-numbered children are represented + * first, so that a tree which is already numbered in postorder + * will not have its order changed. + * + * Written by John Gilbert, Xerox, 10 Dec 1990. + * Based on code written by John Gilbert at CMI in 1987. + */ + +static +/* + * Depth-first search from vertex v. + */ +void etdfs ( + int v, + int first_kid[], + int next_kid[], + int post[], + int *postnum + ) +{ + int w; + + for (w = first_kid[v]; w != -1; w = next_kid[w]) { + etdfs (w, first_kid, next_kid, post, postnum); + } + /* post[postnum++] = v; in Matlab */ + post[v] = (*postnum)++; /* Modified by X. Li on 08/10/07 */ +} + + +static +/* + * Depth-first search from vertex n. No recursion. + * This routine was contributed by Cédric Doucet, CEDRAT Group, Meylan, France. + */ +void nr_etdfs (int n, int *parent, + int *first_kid, int *next_kid, + int *post, int postnum) +{ + int current = n, first, next; + + while (postnum != n){ + + /* no kid for the current node */ + first = first_kid[current]; + + /* no first kid for the current node */ + if (first == -1){ + + /* numbering this node because it has no kid */ + post[current] = postnum++; + + /* looking for the next kid */ + next = next_kid[current]; + + while (next == -1){ + + /* no more kids : back to the parent node */ + current = parent[current]; + + /* numbering the parent node */ + post[current] = postnum++; + + /* get the next kid */ + next = next_kid[current]; + } + + /* stopping criterion */ + if (postnum==n+1) return; + + /* updating current node */ + current = next; + } + /* updating current node */ + else { + current = first; + } + } +} + +/* + * Post order a tree + */ +int *TreePostorder( + int n, + int *parent + ) +{ + int *first_kid, *next_kid; /* Linked list of children. */ + int *post, postnum; + int v, dad; + + /* Allocate storage for working arrays and results */ + first_kid = mxCallocInt (n+1); + next_kid = mxCallocInt (n+1); + post = mxCallocInt (n+1); + + /* Set up structure describing children */ + for (v = 0; v <= n; first_kid[v++] = -1); + for (v = n-1; v >= 0; v--) { + dad = parent[v]; + next_kid[v] = first_kid[dad]; + first_kid[dad] = v; + } + + /* Depth-first search from dummy root vertex #n */ + postnum = 0; +#if 0 + /* recursion */ + etdfs (n, first_kid, next_kid, post, &postnum); +#else + /* no recursion */ + nr_etdfs(n, parent, first_kid, next_kid, post, postnum); +#endif + + SUPERLU_FREE (first_kid); + SUPERLU_FREE (next_kid); + return post; +} + + +/* + * p = spsymetree (A); + * + * Find the elimination tree for symmetric matrix A. + * This uses Liu's algorithm, and runs in time O(nz*log n). + * + * Input: + * Square sparse matrix A. No check is made for symmetry; + * elements below and on the diagonal are ignored. + * Numeric values are ignored, so any explicit zeros are + * treated as nonzero. + * Output: + * Integer array of parents representing the etree, with n + * meaning a root of the elimination forest. + * Note: + * This routine uses only the upper triangle, while sparse + * Cholesky (as in spchol.c) uses only the lower. Matlab's + * dense Cholesky uses only the upper. This routine could + * be modified to use the lower triangle either by transposing + * the matrix or by traversing it by rows with auxiliary + * pointer and link arrays. + * + * John R. Gilbert, Xerox, 10 Dec 1990 + * Based on code by JRG dated 1987, 1988, and 1990. + * Modified by X.S. Li, November 1999. + */ + +/* + * Symmetric elimination tree + */ +int +sp_symetree( + int *acolst, int *acolend, /* column starts and ends past 1 */ + int *arow, /* row indices of A */ + int n, /* dimension of A */ + int *parent /* parent in elim tree */ + ) +{ + int *root; /* root of subtree of etree */ + int rset, cset; + int row, col; + int rroot; + int p; + int *pp; + + root = mxCallocInt (n); + initialize_disjoint_sets (n, &pp); + + for (col = 0; col < n; col++) { + cset = make_set (col, pp); + root[cset] = col; + parent[col] = n; /* Matlab */ + for (p = acolst[col]; p < acolend[col]; p++) { + row = arow[p]; + if (row >= col) continue; + rset = find (row, pp); + rroot = root[rset]; + if (rroot != col) { + parent[rroot] = col; + cset = link (cset, rset, pp); + root[cset] = col; + } + } + } + SUPERLU_FREE (root); + finalize_disjoint_sets (pp); + return 0; +} /* SP_SYMETREE */ diff --git a/src/maths/SuperLU/sp_ienv.c b/src/maths/SuperLU/sp_ienv.c new file mode 100644 index 000000000..883932fdc --- /dev/null +++ b/src/maths/SuperLU/sp_ienv.c @@ -0,0 +1,79 @@ +/*! @file sp_ienv.c + * \brief Chooses machine-dependent parameters for the local environment. + * + *
+ * -- SuperLU routine (version 4.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November, 2010
+ * 
+*/ + +/* + * File name: sp_ienv.c + * History: Modified from lapack routine ILAENV + */ +#include + +/*! \brief + +
+    Purpose   
+    =======   
+
+    sp_ienv() is inquired to choose machine-dependent parameters for the
+    local environment. See ISPEC for a description of the parameters.   
+
+    This version provides a set of parameters which should give good,   
+    but not optimal, performance on many of the currently available   
+    computers.  Users are encouraged to modify this subroutine to set   
+    the tuning parameters for their particular machine using the option   
+    and problem size information in the arguments.   
+
+    Arguments   
+    =========   
+
+    ISPEC   (input) int
+            Specifies the parameter to be returned as the value of SP_IENV.   
+            = 1: the panel size w; a panel consists of w consecutive
+	         columns of matrix A in the process of Gaussian elimination.
+		 The best value depends on machine's cache characters.
+            = 2: the relaxation parameter relax; if the number of
+	         nodes (columns) in a subtree of the elimination tree is less
+		 than relax, this subtree is considered as one supernode,
+		 regardless of their row structures.
+            = 3: the maximum size for a supernode in complete LU;
+	    = 4: the minimum row dimension for 2-D blocking to be used;
+	    = 5: the minimum column dimension for 2-D blocking to be used;
+	    = 6: the estimated fills factor for L and U, compared with A;
+	    = 7: the maximum size for a supernode in ILU.
+	    
+   (SP_IENV) (output) int
+            >= 0: the value of the parameter specified by ISPEC   
+            < 0:  if SP_IENV = -k, the k-th argument had an illegal value. 
+  
+    ===================================================================== 
+
+*/ +int +sp_ienv(int ispec) +{ + int i; + + switch (ispec) { + case 1: return (12); + case 2: return (6); + case 3: return (100); + case 4: return (200); + case 5: return (60); + case 6: return (20); + case 7: return (10); + } + + /* Invalid value for ISPEC */ + i = 1; + xerbla_("sp_ienv", &i); + return 0; + +} /* sp_ienv_ */ + diff --git a/src/maths/SuperLU/sp_preorder.c b/src/maths/SuperLU/sp_preorder.c new file mode 100644 index 000000000..d147e5050 --- /dev/null +++ b/src/maths/SuperLU/sp_preorder.c @@ -0,0 +1,208 @@ +/*! @file sp_preorder.c + * \brief Permute and performs functions on columns of orginal matrix + */ +#include + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * sp_preorder() permutes the columns of the original matrix. It performs
+ * the following steps:
+ *
+ *    1. Apply column permutation perm_c[] to A's column pointers to form AC;
+ *
+ *    2. If options->Fact = DOFACT, then
+ *       (1) Compute column elimination tree etree[] of AC'AC;
+ *       (2) Post order etree[] to get a postordered elimination tree etree[],
+ *           and a postorder permutation post[];
+ *       (3) Apply post[] permutation to columns of AC;
+ *       (4) Overwrite perm_c[] with the product perm_c * post.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *         Specifies whether or not the elimination tree will be re-used.
+ *         If options->Fact == DOFACT, this means first time factor A, 
+ *         etree is computed, postered, and output.
+ *         Otherwise, re-factor A, etree is input, unchanged on exit.
+ *
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of the linear equations is A->nrow. Currently, the type of A can be:
+ *         Stype = NC or SLU_NCP; Mtype = SLU_GE.
+ *         In the future, more general A may be handled.
+ *
+ * perm_c  (input/output) int*
+ *	   Column permutation vector of size A->ncol, which defines the 
+ *         permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *         in position j in A*Pc.
+ *         If options->Fact == DOFACT, perm_c is both input and output.
+ *         On output, it is changed according to a postorder of etree.
+ *         Otherwise, perm_c is input.
+ *
+ * etree   (input/output) int*
+ *         Elimination tree of Pc'*A'*A*Pc, dimension A->ncol.
+ *         If options->Fact == DOFACT, etree is an output argument,
+ *         otherwise it is an input argument.
+ *         Note: etree is a vector of parent pointers for a forest whose
+ *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ * AC      (output) SuperMatrix*
+ *         The resulting matrix after applied the column permutation
+ *         perm_c[] to matrix A. The type of AC can be:
+ *         Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE.
+ * 
+ */ +void +sp_preorder(superlu_options_t *options, SuperMatrix *A, int *perm_c, + int *etree, SuperMatrix *AC) +{ + NCformat *Astore; + NCPformat *ACstore; + int *iwork, *post; + register int n, i; + + n = A->ncol; + + /* Apply column permutation perm_c to A's column pointers so to + obtain NCP format in AC = A*Pc. */ + AC->Stype = SLU_NCP; + AC->Dtype = A->Dtype; + AC->Mtype = A->Mtype; + AC->nrow = A->nrow; + AC->ncol = A->ncol; + Astore = A->Store; + ACstore = AC->Store = (void *) SUPERLU_MALLOC( sizeof(NCPformat) ); + if ( !ACstore ) ABORT_SuperLU("SUPERLU_MALLOC fails for ACstore"); + ACstore->nnz = Astore->nnz; + ACstore->nzval = Astore->nzval; + ACstore->rowind = Astore->rowind; + ACstore->colbeg = (int*) SUPERLU_MALLOC(n*sizeof(int)); + if ( !(ACstore->colbeg) ) ABORT_SuperLU("SUPERLU_MALLOC fails for ACstore->colbeg"); + ACstore->colend = (int*) SUPERLU_MALLOC(n*sizeof(int)); + if ( !(ACstore->colend) ) ABORT_SuperLU("SUPERLU_MALLOC fails for ACstore->colend"); + +#ifdef DEBUG + print_int_vec("pre_order:", n, perm_c); + check_perm("Initial perm_c", n, perm_c); +#endif + + for (i = 0; i < n; i++) { + ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; + ACstore->colend[perm_c[i]] = Astore->colptr[i+1]; + } + + if ( options->Fact == DOFACT ) { +#undef ETREE_ATplusA +#ifdef ETREE_ATplusA + /*-------------------------------------------- + COMPUTE THE ETREE OF Pc*(A'+A)*Pc'. + --------------------------------------------*/ + int *b_colptr, *b_rowind, bnz, j; + int *c_colbeg, *c_colend; + + /*printf("Use etree(A'+A)\n");*/ + + /* Form B = A + A'. */ + at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); + + /* Form C = Pc*B*Pc'. */ + c_colbeg = (int*) SUPERLU_MALLOC(2*n*sizeof(int)); + c_colend = c_colbeg + n; + if (!c_colbeg ) ABORT_SuperLU("SUPERLU_MALLOC fails for c_colbeg/c_colend"); + for (i = 0; i < n; i++) { + c_colbeg[perm_c[i]] = b_colptr[i]; + c_colend[perm_c[i]] = b_colptr[i+1]; + } + for (j = 0; j < n; ++j) { + for (i = c_colbeg[j]; i < c_colend[j]; ++i) { + b_rowind[i] = perm_c[b_rowind[i]]; + } + } + + /* Compute etree of C. */ + sp_symetree(c_colbeg, c_colend, b_rowind, n, etree); + + SUPERLU_FREE(b_colptr); + if ( bnz ) SUPERLU_FREE(b_rowind); + SUPERLU_FREE(c_colbeg); + +#else + /*-------------------------------------------- + COMPUTE THE COLUMN ELIMINATION TREE. + --------------------------------------------*/ + sp_coletree(ACstore->colbeg, ACstore->colend, ACstore->rowind, + A->nrow, A->ncol, etree); +#endif +#ifdef DEBUG + print_int_vec("etree:", n, etree); +#endif + + /* In symmetric mode, do not do postorder here. */ + if ( options->SymmetricMode == NO_SuperLU ) { + /* Post order etree */ + post = (int *) TreePostorder(n, etree); + /* for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; + iwork = post; */ + +#ifdef DEBUG + print_int_vec("post:", n+1, post); + check_perm("post", n, post); +#endif + iwork = (int*) SUPERLU_MALLOC((n+1)*sizeof(int)); + if ( !iwork ) ABORT_SuperLU("SUPERLU_MALLOC fails for iwork[]"); + + /* Renumber etree in postorder */ + for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]]; + for (i = 0; i < n; ++i) etree[i] = iwork[i]; + +#ifdef DEBUG + print_int_vec("postorder etree:", n, etree); +#endif + + /* Postmultiply A*Pc by post[] */ + for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i]; + for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i]; + for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i]; + for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i]; + + for (i = 0; i < n; ++i) + iwork[i] = post[perm_c[i]]; /* product of perm_c and post */ + for (i = 0; i < n; ++i) perm_c[i] = iwork[i]; + +#ifdef DEBUG + print_int_vec("Pc*post:", n, perm_c); + check_perm("final perm_c", n, perm_c); +#endif + SUPERLU_FREE (post); + SUPERLU_FREE (iwork); + } /* end postordering */ + + } /* if options->Fact == DOFACT ... */ + +} + +int check_perm(char *what, int n, int *perm) +{ + register int i; + int *marker; + marker = (int *) calloc(n, sizeof(int)); + + for (i = 0; i < n; ++i) { + if ( marker[perm[i]] == 1 || perm[i] >= n ) { + printf("%s: Not a valid PERM[%d] = %d\n", what, i, perm[i]); + ABORT_SuperLU("check_perm"); + } else { + marker[perm[i]] = 1; + } + } + + SUPERLU_FREE(marker); + return 0; +} diff --git a/src/maths/SuperLU/superlu_timer.c b/src/maths/SuperLU/superlu_timer.c new file mode 100644 index 000000000..e888f0823 --- /dev/null +++ b/src/maths/SuperLU/superlu_timer.c @@ -0,0 +1,72 @@ +/*! @file superlu_timer.c + * \brief Returns the time used + * + *
+ * Purpose
+ * ======= 
+ * 
+ * Returns the time in seconds used by the process.
+ *
+ * Note: the timer function call is machine dependent. Use conditional
+ *       compilation to choose the appropriate function.
+ * 
+ */ + + +#ifdef SUN +/* + * It uses the system call gethrtime(3C), which is accurate to + * nanoseconds. +*/ +#include + +double SuperLU_timer_() { + return ( (double)gethrtime() / 1e9 ); +} + +#elif _WIN32 + +#include + +double SuperLU_timer_() +{ + clock_t t; + t=clock(); + + return ((double)t)/CLOCKS_PER_SEC; +} + +#else + +#ifndef NO_TIMER +#include +#include +#include +#include +#endif + +/*! \brief Timer function + */ + +double SuperLU_timer_() +{ +#ifdef NO_TIMER + /* no sys/times.h on WIN32 */ + double tmp; + tmp = 0.0; + /* return (double)(tmp) / CLK_TCK;*/ + return 0.0; +#else + struct tms use; + double tmp; + int clocks_per_sec = sysconf(_SC_CLK_TCK); + + times ( &use ); + tmp = use.tms_utime; + tmp += use.tms_stime; + return (double)(tmp) / clocks_per_sec; +#endif +} + +#endif + diff --git a/src/maths/SuperLU/superlusmp.c b/src/maths/SuperLU/superlusmp.c new file mode 100644 index 000000000..31cceb9e1 --- /dev/null +++ b/src/maths/SuperLU/superlusmp.c @@ -0,0 +1,722 @@ +/* + * Spice3 COMPATIBILITY MODULE + * + * Author: Advising professor: + * Kenneth S. Kundert Alberto Sangiovanni-Vincentelli + * UC Berkeley + * + * This module contains routines that make Sparse1.3 a direct + * replacement for the SMP sparse matrix package in Spice3c1 or Spice3d1. + * Sparse1.3 is in general a faster and more robust package than SMP. + * These advantages become significant on large circuits. + * + * >>> User accessible functions contained in this file: + * SMPaddElt + * SMPmakeElt + * SMPcClear + * SMPclear + * SMPcLUfac + * SMPluFac + * SMPcReorder + * SMPreorder + * SMPcaSolve + * SMPcSolve + * SMPsolve + * SMPmatSize + * SMPnewMatrix + * SMPdestroy + * SMPpreOrder + * SMPprint + * SMPgetError + * SMPcProdDiag + * LoadGmin + * SMPfindElt + * SMPcombine + * SMPcCombine + */ + +/* + * To replace SMP with Sparse, rename the file spSpice3.h to + * spMatrix.h and place Sparse in a subdirectory of SPICE called + * `sparse'. Then on UNIX compile Sparse by executing `make spice'. + * If not on UNIX, after compiling Sparse and creating the sparse.a + * archive, compile this file (spSMP.c) and spSMP.o to the archive, + * then copy sparse.a into the SPICE main directory and rename it + * SMP.a. Finally link SPICE. + * + * To be compatible with SPICE, the following Sparse compiler options + * (in spConfig.h) should be set as shown below: + * + * EXPANDABLE YES + * TRANSLATE NO + * INITIALIZE NO or YES, YES for use with test prog. + * DIAGONAL_PIVOTING YES + * MODIFIED_MARKOWITZ NO + * DELETE NO + * STRIP NO + * MODIFIED_NODAL YES + * QUAD_ELEMENT NO + * TRANSPOSE YES + * SCALING NO + * DOCUMENTATION YES + * MULTIPLICATION NO + * DETERMINANT YES + * STABILITY NO + * CONDITION NO + * PSEUDOCONDITION NO + * DEBUG YES + * + * spREAL double + */ + +/* + * Revision and copyright information. + * + * Copyright (c) 1985,86,87,88,89,90 + * by Kenneth S. Kundert and the University of California. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation for any purpose and without fee is hereby granted, provided + * that the above copyright notice appear in all copies and supporting + * documentation and that the authors and the University of California + * are properly credited. The authors and the University of California + * make no representations as to the suitability of this software for + * any purpose. It is provided `as is', without express or implied warranty. + */ + +/* + * IMPORTS + * + * >>> Import descriptions: + * spMatrix.h + * Sparse macros and declarations. + * SMPdefs.h + * Spice3's matrix macro definitions. + */ + +#include "ngspice/config.h" +#include +#include +#include +#include "ngspice/spmatrix.h" +#include "../sparse/spdefs.h" +#include "ngspice/smpdefs.h" + +#if defined (_MSC_VER) +extern double scalbn(double, int); +#define logb _logb +extern double logb(double); +#endif + +static void LoadGmin_CSC (double **diag, int n, double Gmin) ; +static void LoadGmin(SMPmatrix *eMatrix, double Gmin); + +void +SMPmatrix_CSC (SMPmatrix *Matrix) +{ + spMatrix_CSC (Matrix->SPmatrix, Matrix->CKTsuperluAp, Matrix->CKTsuperluAi, Matrix->CKTsuperluAx, + Matrix->CKTsuperluN, Matrix->CKTbind_Sparse, Matrix->CKTbind_CSC, Matrix->CKTdiag_CSC) ; + return ; +} + +void +SMPnnz (SMPmatrix *Matrix) +{ + Matrix->CKTsuperluN = spGetSize (Matrix->SPmatrix, 1) ; + Matrix->CKTsuperlunz = Matrix->SPmatrix->Elements ; + return ; +} + +/* + * SMPaddElt() + */ +int +SMPaddElt (SMPmatrix *Matrix, int Row, int Col, double Value) +{ + *spGetElement (Matrix->SPmatrix, Row, Col) = Value; + return spError (Matrix->SPmatrix) ; +} + +/* + * SMPmakeElt() + */ +double * +SMPmakeElt (SMPmatrix *Matrix, int Row, int Col) +{ + return spGetElement (Matrix->SPmatrix, Row, Col) ; +} + + +/* + * SMPcClear() + */ + +void +SMPcClear (SMPmatrix *Matrix) +{ + spClear (Matrix->SPmatrix) ; +} + +/* + * SMPclear() + */ + +void +SMPclear (SMPmatrix *Matrix) +{ + int i ; + if (Matrix->CKTsuperluMODE) + { + spClear (Matrix->SPmatrix) ; + if (Matrix->CKTsuperluAx != NULL) + { + for (i = 0 ; i < Matrix->CKTsuperlunz ; i++) + Matrix->CKTsuperluAx [i] = 0 ; + } + } else { + spClear (Matrix->SPmatrix) ; + } +} + +#define NG_IGNORE(x) (void)x + +/* + * SMPcLUfac() + */ +/*ARGSUSED*/ + +int +SMPcLUfac (SMPmatrix *Matrix, double PivTol) +{ + NG_IGNORE(PivTol); + + spSetComplex (Matrix->SPmatrix) ; + return spFactor (Matrix->SPmatrix) ; +} + +/* + * SMPluFac() + */ +/*ARGSUSED*/ + +int +SMPluFac (SMPmatrix *Matrix, double PivTol, double Gmin) +{//printf("ReFactor\n"); + int relax, panel_size, lwork = 0 ; + + NG_IGNORE(PivTol) ; + + if (Matrix->CKTsuperluMODE) + { + spSetReal (Matrix->SPmatrix) ; + LoadGmin_CSC (Matrix->CKTdiag_CSC, Matrix->CKTsuperluN, Gmin) ; + Matrix->CKTsuperluOptions.Fact = SamePattern_SameRowPerm ; /* IMPORTANT */ + panel_size = sp_ienv (1) ; + relax = sp_ienv (2) ; +// t = SuperLU_timer_(); + dgstrf (&(Matrix->CKTsuperluOptions), &(Matrix->CKTsuperluAC), relax, panel_size, Matrix->CKTsuperluEtree, + NULL, lwork, Matrix->CKTsuperluPerm_c, Matrix->CKTsuperluPerm_r, &(Matrix->CKTsuperluL), + &(Matrix->CKTsuperluU), &(Matrix->CKTsuperluStat), &(Matrix->CKTsuperluInfo)) ; +// utime[FACT] = SuperLU_timer_() - t; + if (Matrix->CKTsuperluInfo == 0) return 0 ; /* ReFactorization DONE */ + else return 1 ; + } else { + spSetReal (Matrix->SPmatrix) ; + LoadGmin (Matrix, Gmin) ; + return spFactor (Matrix->SPmatrix) ; + } +} + +/* + * SMPcReorder() + */ + +int +SMPcReorder (SMPmatrix *Matrix, double PivTol, double PivRel, int *NumSwaps) +{ + *NumSwaps = 1; + spSetComplex (Matrix->SPmatrix) ; + return spOrderAndFactor (Matrix->SPmatrix, NULL, + (spREAL)PivRel, (spREAL)PivTol, YES) ; +} + +/* + * SMPreorder() + */ + +int +SMPreorder (SMPmatrix *Matrix, double PivTol, double PivRel, double Gmin) +{//printf("Factor\n"); +// int permc_spec ; + int relax, panel_size, lwork = 0 ; + + if (Matrix->CKTsuperluMODE) + { + spSetReal (Matrix->SPmatrix) ; + LoadGmin_CSC (Matrix->CKTdiag_CSC, Matrix->CKTsuperluN, Gmin) ; + Matrix->CKTsuperluOptions.Fact = SamePattern ; /* IMPORTANT */ +// permc_spec = options->ColPerm; +// if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) get_perm_c(permc_spec, A, perm_c); +// sp_preorder(options, A, perm_c, etree, AC); + panel_size = sp_ienv (1) ; + relax = sp_ienv (2) ; +// t = SuperLU_timer_(); + dgstrf (&(Matrix->CKTsuperluOptions), &(Matrix->CKTsuperluAC), relax, panel_size, Matrix->CKTsuperluEtree, + NULL, lwork, Matrix->CKTsuperluPerm_c, Matrix->CKTsuperluPerm_r, &(Matrix->CKTsuperluL), + &(Matrix->CKTsuperluU), &(Matrix->CKTsuperluStat), &(Matrix->CKTsuperluInfo)) ; +// utime[FACT] = SuperLU_timer_() - t; + return 0 ; + } + else { + spSetReal (Matrix->SPmatrix) ; + LoadGmin (Matrix, Gmin) ; + return spOrderAndFactor (Matrix->SPmatrix, NULL, + (spREAL)PivRel, (spREAL)PivTol, YES) ; + } +} + +/* + * SMPcaSolve() + */ +void +SMPcaSolve (SMPmatrix *Matrix, double RHS[], double iRHS[], + double Spare[], double iSpare[]) +{ + printf ("SMPcaSolve\n") ; + NG_IGNORE (iSpare) ; + NG_IGNORE (Spare) ; + + spSolveTransposed (Matrix->SPmatrix, RHS, RHS, iRHS, iRHS) ; +} + +/* + * SMPcSolve() + */ + +void +SMPcSolve (SMPmatrix *Matrix, double RHS[], double iRHS[], double Spare[], double iSpare[]) +{ + NG_IGNORE (iSpare) ; + NG_IGNORE (Spare) ; + + spSolve (Matrix->SPmatrix, RHS, RHS, iRHS, iRHS) ; +} + +/* + * SMPsolve() + */ + +void +SMPsolve(SMPmatrix *Matrix, double RHS[], double Spare[]) +{//printf("Solve\n"); + int i, *pExtOrder ; + + NG_IGNORE (Spare) ; + + if (Matrix->CKTsuperluMODE) + { + pExtOrder = &Matrix->SPmatrix->IntToExtRowMap[Matrix->CKTsuperluN] ; + for (i = Matrix->CKTsuperluN - 1 ; i >= 0 ; i--) + Matrix->CKTsuperluIntermediate [i] = RHS [*(pExtOrder--)] ; + + dgstrs (NOTRANS, &(Matrix->CKTsuperluL), &(Matrix->CKTsuperluU), Matrix->CKTsuperluPerm_c, + Matrix->CKTsuperluPerm_r, &(Matrix->CKTsuperluI), &(Matrix->CKTsuperluStat), + &(Matrix->CKTsuperluInfo)) ; + + pExtOrder = &Matrix->SPmatrix->IntToExtColMap[Matrix->CKTsuperluN] ; + for (i = Matrix->CKTsuperluN - 1 ; i >= 0 ; i--) + RHS [*(pExtOrder--)] = Matrix->CKTsuperluIntermediate [i] ; + } else { + spSolve (Matrix->SPmatrix, RHS, RHS, NULL, NULL) ; + } +} + +/* + * SMPmatSize() + */ +int +SMPmatSize (SMPmatrix *Matrix) +{ + return spGetSize (Matrix->SPmatrix, 1) ; +} + +/* + * SMPnewMatrix() + */ +int +SMPnewMatrix(SMPmatrix *Matrix) +{ + int Error; + Matrix->SPmatrix = spCreate (0, 1, &Error) ; + return Error ; +} + +/* + * SMPdestroy() + */ + +void +SMPdestroy (SMPmatrix *Matrix) +{ + spDestroy (Matrix->SPmatrix) ; +} + +/* + * SMPpreOrder() + */ + +int +SMPpreOrder (SMPmatrix *Matrix) +{//printf("PreOrder\n"); +// DNformat *Bstore ; + int permc_spec ; +// int n, i ; + if (Matrix->CKTsuperluMODE) + { + Matrix->CKTsuperluOptions.Fact = DOFACT ; + Matrix->CKTsuperluOptions.Equil = NO_SuperLU ; + Matrix->CKTsuperluOptions.ColPerm = MMD_AT_PLUS_A ; + Matrix->CKTsuperluOptions.Trans = NOTRANS ; + Matrix->CKTsuperluOptions.IterRefine = NOREFINE ; + Matrix->CKTsuperluOptions.DiagPivotThresh = 0.001 ; + Matrix->CKTsuperluOptions.SymmetricMode = YES_SuperLU ; + Matrix->CKTsuperluOptions.PivotGrowth = NO_SuperLU ; + Matrix->CKTsuperluOptions.ConditionNumber = NO_SuperLU ; + Matrix->CKTsuperluOptions.PrintStat = NO_SuperLU ; + + /* Test the input parameters ... */ +// *info = 0 ; +// Bstore = B->Store; +// if ( options->Fact != DOFACT ) *info = -1 ; +// else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2 ; +// else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) *info = -7 ; +// if ( *info != 0 ) { +// i = -(*info); +// xerbla_("dgssv", &i); +// return 0 ; /* Make sure that 0 is correct - Francesco Lannutti */ +// } + +// utime = stat->utime; + +// t = SuperLU_timer_(); + permc_spec = Matrix->CKTsuperluOptions.ColPerm; + if ( permc_spec != MY_PERMC && Matrix->CKTsuperluOptions.Fact == DOFACT ) + get_perm_c (permc_spec, &(Matrix->CKTsuperluA), Matrix->CKTsuperluPerm_c) ; +// utime[COLPERM] = SuperLU_timer_() - t; + +// t = SuperLU_timer_(); + sp_preorder (&(Matrix->CKTsuperluOptions), &(Matrix->CKTsuperluA), Matrix->CKTsuperluPerm_c, + Matrix->CKTsuperluEtree, &(Matrix->CKTsuperluAC)) ; +// utime[ETREE] = SuperLU_timer_() - t; + + return 0 ; + } else { + spMNA_Preorder (Matrix->SPmatrix) ; + return spError (Matrix->SPmatrix) ; + } +} + +/* + * SMPprint() + */ +/*ARGSUSED*/ +void +SMPprint (SMPmatrix *Matrix, FILE *File) +{ + NG_IGNORE (File) ; + + spPrint (Matrix->SPmatrix, 0, 1, 1) ; +} + +/* + * SMPgetError() + */ +void +SMPgetError (SMPmatrix *Matrix, int *Col, int *Row) +{ + spWhereSingular (Matrix->SPmatrix, Row, Col) ; +} + +/* + * SMPcProdDiag() + * note: obsolete for Spice3d2 and later + */ +int +SMPcProdDiag (SMPmatrix *Matrix, SPcomplex *pMantissa, int *pExponent) +{ + spDeterminant (Matrix->SPmatrix, pExponent, &(pMantissa->real), &(pMantissa->imag)) ; + return spError (Matrix->SPmatrix) ; +} + +/* + * SMPcDProd() + */ +int +SMPcDProd (SMPmatrix *Matrix, SPcomplex *pMantissa, int *pExponent) +{ + double re, im, x, y, z; + int p; + + spDeterminant (Matrix->SPmatrix, &p, &re, &im) ; + +#ifndef M_LN2 +#define M_LN2 0.69314718055994530942 +#endif +#ifndef M_LN10 +#define M_LN10 2.30258509299404568402 +#endif + +#ifdef debug_print + printf("Determinant 10: (%20g,%20g)^%d\n", re, im, p); +#endif + + /* Convert base 10 numbers to base 2 numbers, for comparison */ + y = p * M_LN10 / M_LN2; + x = (int) y; + y -= x; + + /* ASSERT + * x = integral part of exponent, y = fraction part of exponent + */ + + /* Fold in the fractional part */ +#ifdef debug_print + printf(" ** base10 -> base2 int = %g, frac = %20g\n", x, y); +#endif + z = pow(2.0, y); + re *= z; + im *= z; +#ifdef debug_print + printf(" ** multiplier = %20g\n", z); +#endif + + /* Re-normalize (re or im may be > 2.0 or both < 1.0 */ + if (re != 0.0) { + y = logb(re); + if (im != 0.0) + z = logb(im); + else + z = 0; + } else if (im != 0.0) { + z = logb(im); + y = 0; + } else { + /* Singular */ + /*printf("10 -> singular\n");*/ + y = 0; + z = 0; + } + +#ifdef debug_print + printf(" ** renormalize changes = %g,%g\n", y, z); +#endif + if (y < z) + y = z; + + *pExponent = (int)(x + y); + x = scalbn(re, (int) -y); + z = scalbn(im, (int) -y); +#ifdef debug_print + printf(" ** values are: re %g, im %g, y %g, re' %g, im' %g\n", + re, im, y, x, z); +#endif + pMantissa->real = scalbn(re, (int) -y); + pMantissa->imag = scalbn(im, (int) -y); + +#ifdef debug_print + printf("Determinant 10->2: (%20g,%20g)^%d\n", pMantissa->real, + pMantissa->imag, *pExponent); +#endif + return spError (Matrix->SPmatrix) ; +} + + + +/* + * The following routines need internal knowledge of the Sparse data + * structures. + */ + +/* + * LOAD GMIN + * + * This routine adds Gmin to each diagonal element. Because Gmin is + * added to the current diagonal, which may bear little relation to + * what the outside world thinks is a diagonal, and because the + * elements that are diagonals may change after calling spOrderAndFactor, + * use of this routine is not recommended. It is included here simply + * for compatibility with Spice3. + */ + + +static void +LoadGmin_CSC (double **diag, int n, double Gmin) +{ + + int i ; + if (Gmin != 0.0) { + for (i = 0 ; i < n ; i++) { + if (diag [i] != NULL) *(diag [i]) += Gmin ; + } + } + + return ; +} + +static void +LoadGmin(SMPmatrix *eMatrix, double Gmin) +{ + MatrixPtr Matrix = eMatrix->SPmatrix ; + int I; + ArrayOfElementPtrs Diag; + ElementPtr diag; + + /* Begin `LoadGmin'. */ + assert (IS_SPARSE (Matrix)) ; + + if (Gmin != 0.0) { + Diag = Matrix->Diag; + for (I = Matrix->Size; I > 0; I--) { + if ((diag = Diag[I]) != NULL) + diag->Real += Gmin; + } + } + return; +} + + +/* + * FIND ELEMENT + * + * This routine finds an element in the matrix by row and column number. + * If the element exists, a pointer to it is returned. If not, then NULL + * is returned unless the CreateIfMissing flag is TRUE, in which case a + * pointer to the new element is returned. + */ + +SMPelement * +SMPfindElt(SMPmatrix *eMatrix, int Row, int Col, int CreateIfMissing) +{ + MatrixPtr Matrix = eMatrix->SPmatrix ; + ElementPtr Element; + + /* Begin `SMPfindElt'. */ + assert( IS_SPARSE( Matrix ) ); + Row = Matrix->ExtToIntRowMap[Row]; + Col = Matrix->ExtToIntColMap[Col]; + Element = Matrix->FirstInCol[Col]; + Element = spcFindElementInCol(Matrix, &Element, Row, Col, CreateIfMissing); + return (SMPelement *)Element; +} + +/* XXX The following should probably be implemented in spUtils */ + +/* + * SMPcZeroCol() + */ +int +SMPcZeroCol(SMPmatrix *eMatrix, int Col) +{ + MatrixPtr Matrix = eMatrix->SPmatrix ; + ElementPtr Element; + + Col = Matrix->ExtToIntColMap[Col]; + + for (Element = Matrix->FirstInCol[Col]; + Element != NULL; + Element = Element->NextInCol) + { + Element->Real = 0.0; + Element->Imag = 0.0; + } + + return spError( Matrix ); +} + +/* + * SMPcAddCol() + */ +int +SMPcAddCol(SMPmatrix *eMatrix, int Accum_Col, int Addend_Col) +{ + MatrixPtr Matrix = eMatrix->SPmatrix ; + ElementPtr Accum, Addend, *Prev; + + Accum_Col = Matrix->ExtToIntColMap[Accum_Col]; + Addend_Col = Matrix->ExtToIntColMap[Addend_Col]; + + Addend = Matrix->FirstInCol[Addend_Col]; + Prev = &Matrix->FirstInCol[Accum_Col]; + Accum = *Prev;; + + while (Addend != NULL) { + while (Accum && Accum->Row < Addend->Row) { + Prev = &Accum->NextInCol; + Accum = *Prev; + } + if (!Accum || Accum->Row > Addend->Row) { + Accum = spcCreateElement(Matrix, Addend->Row, Accum_Col, Prev, 0); + } + Accum->Real += Addend->Real; + Accum->Imag += Addend->Imag; + Addend = Addend->NextInCol; + } + + return spError( Matrix ); +} + +/* + * SMPzeroRow() + */ +int +SMPzeroRow(SMPmatrix *eMatrix, int Row) +{ + MatrixPtr Matrix = eMatrix->SPmatrix ; + ElementPtr Element; + + Row = Matrix->ExtToIntColMap[Row]; + + if (Matrix->RowsLinked == NO) + spcLinkRows(Matrix); + + if (Matrix->PreviousMatrixWasComplex || Matrix->Complex) { + for (Element = Matrix->FirstInRow[Row]; + Element != NULL; + Element = Element->NextInRow) + { + Element->Real = 0.0; + Element->Imag = 0.0; + } + } else { + for (Element = Matrix->FirstInRow[Row]; + Element != NULL; + Element = Element->NextInRow) + { + Element->Real = 0.0; + } + } + + return spError( Matrix ); +} + +#ifdef PARALLEL_ARCH +/* + * SMPcombine() + */ +void +SMPcombine(SMPmatrix *Matrix, double RHS[], double Spare[]) +{ + spSetReal (Matrix->SPmatrix) ; + spCombine (Matrix->SPmatrix, RHS, Spare, NULL, NULL) ; +} + +/* + * SMPcCombine() + */ +void +SMPcCombine (SMPmatrix *Matrix, double RHS[], double Spare[], double iRHS[], double iSpare[]) +{ + spSetComplex (Matrix->SPmatrix) ; + spCombine (Matrix->SPmatrix, RHS, Spare, iRHS, iSpare) ; +} +#endif /* PARALLEL_ARCH */ diff --git a/src/maths/SuperLU/util.c b/src/maths/SuperLU/util.c new file mode 100644 index 000000000..9e82056f0 --- /dev/null +++ b/src/maths/SuperLU/util.c @@ -0,0 +1,495 @@ +/*! @file util.c + * \brief Utility functions + * + *
+ * -- SuperLU routine (version 4.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November, 2010
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include +#include + +/*! \brief Global statistics variale + */ + +void superlu_abort_and_exit(char* msg) +{ + fprintf(stderr, msg); + exit (-1); +} + +/*! \brief Set the default values for the options argument. + */ +void set_default_options(superlu_options_t *options) +{ + options->Fact = DOFACT; + options->Equil = YES_SuperLU; + options->ColPerm = COLAMD; + options->Trans = NOTRANS; + options->IterRefine = NOREFINE; + options->DiagPivotThresh = 1.0; + options->SymmetricMode = NO_SuperLU; + options->PivotGrowth = NO_SuperLU; + options->ConditionNumber = NO_SuperLU; + options->PrintStat = YES_SuperLU; +} + +/*! \brief Set the default values for the options argument for ILU. + */ +void ilu_set_default_options(superlu_options_t *options) +{ + set_default_options(options); + + /* further options for incomplete factorization */ + options->DiagPivotThresh = 0.1; + options->RowPerm = LargeDiag; + options->ILU_DropRule = DROP_BASIC | DROP_AREA; + options->ILU_DropTol = 1e-4; + options->ILU_FillFactor = 10.0; + options->ILU_Norm = INF_NORM; + options->ILU_MILU = SILU; + options->ILU_MILU_Dim = 3.0; /* -log(n)/log(h) is perfect */ + options->ILU_FillTol = 1e-2; +} + +/*! \brief Print the options setting. + */ +void print_options(superlu_options_t *options) +{ + printf(".. options:\n"); + printf("\tFact\t %8d\n", options->Fact); + printf("\tEquil\t %8d\n", options->Equil); + printf("\tColPerm\t %8d\n", options->ColPerm); + printf("\tDiagPivotThresh %8.4f\n", options->DiagPivotThresh); + printf("\tTrans\t %8d\n", options->Trans); + printf("\tIterRefine\t%4d\n", options->IterRefine); + printf("\tSymmetricMode\t%4d\n", options->SymmetricMode); + printf("\tPivotGrowth\t%4d\n", options->PivotGrowth); + printf("\tConditionNumber\t%4d\n", options->ConditionNumber); + printf("..\n"); +} + +/*! \brief Print the options setting. + */ +void print_ilu_options(superlu_options_t *options) +{ + printf(".. ILU options:\n"); + printf("\tDiagPivotThresh\t%6.2e\n", options->DiagPivotThresh); + printf("\ttau\t%6.2e\n", options->ILU_DropTol); + printf("\tgamma\t%6.2f\n", options->ILU_FillFactor); + printf("\tDropRule\t%0x\n", options->ILU_DropRule); + printf("\tMILU\t%d\n", options->ILU_MILU); + printf("\tMILU_ALPHA\t%6.2e\n", MILU_ALPHA); + printf("\tDiagFillTol\t%6.2e\n", options->ILU_FillTol); + printf("..\n"); +} + +/*! \brief Deallocate the structure pointing to the actual storage of the matrix. */ +void +Destroy_SuperMatrix_Store(SuperMatrix *A) +{ + SUPERLU_FREE ( A->Store ); +} + +void +Destroy_CompCol_Matrix(SuperMatrix *A) +{ + SUPERLU_FREE( ((NCformat *)A->Store)->rowind ); + SUPERLU_FREE( ((NCformat *)A->Store)->colptr ); + SUPERLU_FREE( ((NCformat *)A->Store)->nzval ); + SUPERLU_FREE( A->Store ); +} + +void +Destroy_CompRow_Matrix(SuperMatrix *A) +{ + SUPERLU_FREE( ((NRformat *)A->Store)->colind ); + SUPERLU_FREE( ((NRformat *)A->Store)->rowptr ); + SUPERLU_FREE( ((NRformat *)A->Store)->nzval ); + SUPERLU_FREE( A->Store ); +} + +void +Destroy_SuperNode_Matrix(SuperMatrix *A) +{ + SUPERLU_FREE ( ((SCformat *)A->Store)->rowind ); + SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr ); + SUPERLU_FREE ( ((SCformat *)A->Store)->nzval ); + SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr ); + SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup ); + SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col ); + SUPERLU_FREE ( A->Store ); +} + +/*! \brief A is of type Stype==NCP */ +void +Destroy_CompCol_Permuted(SuperMatrix *A) +{ + SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg ); + SUPERLU_FREE ( ((NCPformat *)A->Store)->colend ); + SUPERLU_FREE ( A->Store ); +} + +/*! \brief A is of type Stype==DN */ +void +Destroy_Dense_Matrix(SuperMatrix *A) +{ + DNformat* Astore = A->Store; + SUPERLU_FREE (Astore->nzval); + SUPERLU_FREE ( A->Store ); +} + +/*! \brief Reset repfnz[] for the current column + */ +void +resetrep_col (const int nseg, const int *segrep, int *repfnz) +{ + int i, irep; + + for (i = 0; i < nseg; i++) { + irep = segrep[i]; + repfnz[irep] = EMPTY; + } +} + + +/*! \brief Count the total number of nonzeros in factors L and U, and in the symmetrically reduced L. + */ +void +countnz(const int n, int *xprune, int *nnzL, int *nnzU, GlobalLU_t *Glu) +{ + int nsuper, fsupc, i, j; + int nnzL0, jlen, irep; + int *xsup, *xlsub; + + xsup = Glu->xsup; + xlsub = Glu->xlsub; + *nnzL = 0; + *nnzU = (Glu->xusub)[n]; + nnzL0 = 0; + nsuper = (Glu->supno)[n]; + + if ( n <= 0 ) return; + + /* + * For each supernode + */ + for (i = 0; i <= nsuper; i++) { + fsupc = xsup[i]; + jlen = xlsub[fsupc+1] - xlsub[fsupc]; + + for (j = fsupc; j < xsup[i+1]; j++) { + *nnzL += jlen; + *nnzU += j - fsupc + 1; + jlen--; + } + irep = xsup[i+1] - 1; + nnzL0 += xprune[irep] - xlsub[irep]; + } + + /* printf("\tNo of nonzeros in symm-reduced L = %d\n", nnzL0);*/ +} + +/*! \brief Count the total number of nonzeros in factors L and U. + */ +void +ilu_countnz(const int n, int *nnzL, int *nnzU, GlobalLU_t *Glu) +{ + int nsuper, fsupc, i, j; + int jlen, irep; + int *xsup, *xlsub; + + xsup = Glu->xsup; + xlsub = Glu->xlsub; + *nnzL = 0; + *nnzU = (Glu->xusub)[n]; + nsuper = (Glu->supno)[n]; + + if ( n <= 0 ) return; + + /* + * For each supernode + */ + for (i = 0; i <= nsuper; i++) { + fsupc = xsup[i]; + jlen = xlsub[fsupc+1] - xlsub[fsupc]; + + for (j = fsupc; j < xsup[i+1]; j++) { + *nnzL += jlen; + *nnzU += j - fsupc + 1; + jlen--; + } + irep = xsup[i+1] - 1; + } +} + + +/*! \brief Fix up the data storage lsub for L-subscripts. It removes the subscript sets for structural pruning, and applies permuation to the remaining subscripts. + */ +void +fixupL(const int n, const int *perm_r, GlobalLU_t *Glu) +{ + register int nsuper, fsupc, nextl, i, j, k, jstrt; + int *xsup, *lsub, *xlsub; + + if ( n <= 1 ) return; + + xsup = Glu->xsup; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nextl = 0; + nsuper = (Glu->supno)[n]; + + /* + * For each supernode ... + */ + for (i = 0; i <= nsuper; i++) { + fsupc = xsup[i]; + jstrt = xlsub[fsupc]; + xlsub[fsupc] = nextl; + for (j = jstrt; j < xlsub[fsupc+1]; j++) { + lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ + nextl++; + } + for (k = fsupc+1; k < xsup[i+1]; k++) + xlsub[k] = nextl; /* Other columns in supernode i */ + + } + + xlsub[n] = nextl; +} + + +/*! \brief Diagnostic print of segment info after panel_dfs(). + */ +void print_panel_seg(int n, int w, int jcol, int nseg, + int *segrep, int *repfnz) +{ + int j, k; + + for (j = jcol; j < jcol+w; j++) { + printf("\tcol %d:\n", j); + for (k = 0; k < nseg; k++) + printf("\t\tseg %d, segrep %d, repfnz %d\n", k, + segrep[k], repfnz[(j-jcol)*n + segrep[k]]); + } + +} + + +void +StatInit(SuperLUStat_t *stat) +{ + register int i, w, panel_size, relax; + + panel_size = sp_ienv(1); + relax = sp_ienv(2); + w = SUPERLU_MAX(panel_size, relax); + stat->panel_histo = intCalloc(w+1); + stat->utime = (double *) SUPERLU_MALLOC(NPHASES * sizeof(double)); + if (!stat->utime) ABORT_SuperLU("SUPERLU_MALLOC fails for stat->utime"); + stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t)); + if (!stat->ops) ABORT_SuperLU("SUPERLU_MALLOC fails for stat->ops"); + for (i = 0; i < NPHASES; ++i) { + stat->utime[i] = 0.; + stat->ops[i] = 0.; + } + stat->TinyPivots = 0; + stat->RefineSteps = 0; + stat->expansions = 0; +#if ( PRNTlevel >= 1 ) + printf(".. parameters in sp_ienv():\n"); + printf("\t 1: panel size \t %4d \n" + "\t 2: relax \t %4d \n" + "\t 3: max. super \t %4d \n" + "\t 4: row-dim 2D \t %4d \n" + "\t 5: col-dim 2D \t %4d \n" + "\t 6: fill ratio \t %4d \n", + sp_ienv(1), sp_ienv(2), sp_ienv(3), + sp_ienv(4), sp_ienv(5), sp_ienv(6)); +#endif +} + + +void +StatPrint(SuperLUStat_t *stat) +{ + double *utime; + flops_t *ops; + + utime = stat->utime; + ops = stat->ops; + printf("Factor time = %8.2f\n", utime[FACT]); + if ( utime[FACT] != 0.0 ) + printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], + ops[FACT]*1e-6/utime[FACT]); + + printf("Solve time = %8.2f\n", utime[SOLVE]); + if ( utime[SOLVE] != 0.0 ) + printf("Solve flops = %e\tMflops = %8.2f\n", ops[SOLVE], + ops[SOLVE]*1e-6/utime[SOLVE]); + + printf("Number of memory expansions: %d\n", stat->expansions); + +} + + +void +StatFree(SuperLUStat_t *stat) +{ + SUPERLU_FREE(stat->panel_histo); + SUPERLU_FREE(stat->utime); + SUPERLU_FREE(stat->ops); +} + + +flops_t +LUFactFlops(SuperLUStat_t *stat) +{ + return (stat->ops[FACT]); +} + +flops_t +LUSolveFlops(SuperLUStat_t *stat) +{ + return (stat->ops[SOLVE]); +} + + + + + +/*! \brief Fills an integer array with a given value. + */ +void ifill(int *a, int alen, int ival) +{ + register int i; + for (i = 0; i < alen; i++) a[i] = ival; +} + + + +/*! \brief Get the statistics of the supernodes + */ +#define NBUCKS 10 +static int max_sup_size; + +void super_stats(int nsuper, int *xsup) +{ + register int nsup1 = 0; + int i, isize, whichb, bl, bh; + int bucket[NBUCKS]; + + max_sup_size = 0; + + for (i = 0; i <= nsuper; i++) { + isize = xsup[i+1] - xsup[i]; + if ( isize == 1 ) nsup1++; + if ( max_sup_size < isize ) max_sup_size = isize; + } + + printf(" Supernode statistics:\n\tno of super = %d\n", nsuper+1); + printf("\tmax supernode size = %d\n", max_sup_size); + printf("\tno of size 1 supernodes = %d\n", nsup1); + + /* Histogram of the supernode sizes */ + ifill (bucket, NBUCKS, 0); + + for (i = 0; i <= nsuper; i++) { + isize = xsup[i+1] - xsup[i]; + whichb = (float) isize / max_sup_size * NBUCKS; + if (whichb >= NBUCKS) whichb = NBUCKS - 1; + bucket[whichb]++; + } + + printf("\tHistogram of supernode sizes:\n"); + for (i = 0; i < NBUCKS; i++) { + bl = (float) i * max_sup_size / NBUCKS; + bh = (float) (i+1) * max_sup_size / NBUCKS; + printf("\tsnode: %d-%d\t\t%d\n", bl+1, bh, bucket[i]); + } + +} + + +float SpaSize(int n, int np, float sum_npw) +{ + return (sum_npw*8 + np*8 + n*4)/1024.; +} + +float DenseSize(int n, float sum_nw) +{ + return (sum_nw*8 + n*8)/1024.;; +} + + + +/*! \brief Check whether repfnz[] == EMPTY after reset. + */ +void check_repfnz(int n, int w, int jcol, int *repfnz) +{ + int jj, k; + + for (jj = jcol; jj < jcol+w; jj++) + for (k = 0; k < n; k++) + if ( repfnz[(jj-jcol)*n + k] != EMPTY ) { + fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj, + k, repfnz[(jj-jcol)*n + k]); + ABORT_SuperLU("check_repfnz"); + } +} + + +/*! \brief Print a summary of the testing results. */ +void +PrintSumm(char *type, int nfail, int nrun, int nerrs) +{ + if ( nfail > 0 ) + printf("%3s driver: %d out of %d tests failed to pass the threshold\n", + type, nfail, nrun); + else + printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun); + + if ( nerrs > 0 ) + printf("%6d error messages recorded\n", nerrs); +} + + +int print_int_vec(char *what, int n, int *vec) +{ + int i; + printf("%s\n", what); + for (i = 0; i < n; ++i) printf("%d\t%d\n", i, vec[i]); + return 0; +} + +int slu_PrintInt10(char *name, int len, int *x) +{ + register int i; + + printf("%10s:", name); + for (i = 0; i < len; ++i) + { + if ( i % 10 == 0 ) printf("\n\t[%2d-%2d]", i, i + 9); + printf("%6d", x[i]); + } + printf("\n"); + return 0; +} + + diff --git a/src/maths/SuperLU/xerbla.c b/src/maths/SuperLU/xerbla.c new file mode 100644 index 000000000..b00a69cfe --- /dev/null +++ b/src/maths/SuperLU/xerbla.c @@ -0,0 +1,43 @@ +#include +#include + +/* Subroutine */ int xerbla_(char *srname, int *info) +{ +/* -- LAPACK auxiliary routine (version 2.0) -- + Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., + Courant Institute, Argonne National Lab, and Rice University + September 30, 1994 + + + Purpose + ======= + + XERBLA is an error handler for the LAPACK routines. + It is called by an LAPACK routine if an input parameter has an + invalid value. A message is printed and execution stops. + + Installers may consider modifying the STOP statement in order to + call system-specific exception-handling facilities. + + Arguments + ========= + + SRNAME (input) CHARACTER*6 + The name of the routine which called XERBLA. + + INFO (input) INT + The position of the invalid parameter in the parameter list + + of the calling routine. + + ===================================================================== +*/ + + printf("** On entry to %6s, parameter number %2d had an illegal value\n", + srname, *info); + +/* End of XERBLA */ + + return 0; +} /* xerbla_ */ + diff --git a/src/maths/SuperLU/zcolumn_bmod.c b/src/maths/SuperLU/zcolumn_bmod.c new file mode 100644 index 000000000..68c0b020b --- /dev/null +++ b/src/maths/SuperLU/zcolumn_bmod.c @@ -0,0 +1,367 @@ + +/*! @file zcolumn_bmod.c + * \brief performs numeric block updates + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ *  Permission is hereby granted to use or copy this program for any
+ *  purpose, provided the above notices are retained on all copies.
+ *  Permission to modify the code and to distribute modified code is
+ *  granted, provided the above notices are retained, and a notice that
+ *  the code was modified is included with the above copyright notice.
+ * 
+*/ + +#include +#include +#include + +/* + * Function prototypes + */ +void zusolve(int, int, doublecomplex*, doublecomplex*); +void zlsolve(int, int, doublecomplex*, doublecomplex*); +void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*); + + + +/*! \brief + * + *
+ * Purpose:
+ * ========
+ * Performs numeric block updates (sup-col) in topological order.
+ * It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
+ * Special processing on the supernodal portion of L\U[*,j]
+ * Return value:   0 - successful return
+ *               > 0 - number of bytes allocated when run out of space
+ * 
+ */ +int +zcolumn_bmod ( + const int jcol, /* in */ + const int nseg, /* in */ + doublecomplex *dense, /* in */ + doublecomplex *tempv, /* working array */ + int *segrep, /* in */ + int *repfnz, /* in */ + int fpanelc, /* in -- first column in the current panel */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ + +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + doublecomplex alpha, beta; + + /* krep = representative of current k-th supernode + * fsupc = first supernodal column + * nsupc = no of columns in supernode + * nsupr = no of rows in supernode (used as leading dimension) + * luptr = location of supernodal LU-block in storage + * kfnz = first nonz in the k-th supernodal segment + * no_zeros = no of leading zeros in a supernodal U-segment + */ + doublecomplex ukj, ukj1, ukj2; + int luptr, luptr1, luptr2; + int fsupc, nsupc, nsupr, segsze; + int nrow; /* No of rows in the matrix of matrix-vector */ + int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; + register int lptr, kfnz, isub, irow, i; + register int no_zeros, new_next; + int ufirst, nextlu; + int fst_col; /* First column within small LU update */ + int d_fsupc; /* Distance between the first column of the current + panel and the first column of the current snode. */ + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + int nzlumax; + doublecomplex *tempv1; + doublecomplex zero = {0.0, 0.0}; + doublecomplex one = {1.0, 0.0}; + doublecomplex none = {-1.0, 0.0}; + doublecomplex comp_temp, comp_temp1; + int mem_error; + flops_t *ops = stat->ops; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + nzlumax = Glu->nzlumax; + jcolp1 = jcol + 1; + jsupno = supno[jcol]; + + /* + * For each nonz supernode segment of U[*,j] in topological order + */ + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + + krep = segrep[k]; + k--; + ksupno = supno[krep]; + if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ + + fsupc = xsup[ksupno]; + fst_col = SUPERLU_MAX ( fsupc, fpanelc ); + + /* Distance from the current supernode to the current panel; + d_fsupc=0 if fsupc > fpanelc. */ + d_fsupc = fst_col - fsupc; + + luptr = xlusup[fst_col] + d_fsupc; + lptr = xlsub[fsupc] + d_fsupc; + + kfnz = repfnz[krep]; + kfnz = SUPERLU_MAX ( kfnz, fpanelc ); + + segsze = krep - kfnz + 1; + nsupc = krep - fst_col + 1; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ + nrow = nsupr - d_fsupc - nsupc; + krep_ind = lptr + nsupc - 1; + + ops[TRSV] += 4 * segsze * (segsze - 1); + ops[GEMV] += 8 * nrow * segsze; + + + + /* + * Case 1: Update U-segment of size 1 -- col-col update + */ + if ( segsze == 1 ) { + ukj = dense[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + z_sub(&dense[irow], &dense[irow], &comp_temp); + luptr++; + } + + } else if ( segsze <= 3 ) { + ukj = dense[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + ukj1 = dense[lsub[krep_ind - 1]]; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { /* Case 2: 2cols-col update */ + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + z_sub(&ukj, &ukj, &comp_temp); + dense[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; + luptr1++; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense[irow], &dense[irow], &comp_temp); + } + } else { /* Case 3: 3cols-col update */ + ukj2 = dense[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); + z_sub(&ukj1, &ukj1, &comp_temp); + + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&ukj, &ukj, &comp_temp); + + dense[lsub[krep_ind]] = ukj; + dense[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; + luptr1++; + luptr2++; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense[irow], &dense[irow], &comp_temp); + } + } + + + } else { + /* + * Case: sup-col update + * Perform a triangular solve and block update, + * then scatter the result of sup-col update to dense + */ + + no_zeros = kfnz - fst_col; + + /* Copy U[*,j] segment from dense[*] to tempv[*] */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + tempv[i] = dense[irow]; + ++isub; + } + + /* Dense triangular solve -- start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#else + ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#endif + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + alpha = one; + beta = zero; +#ifdef _CRAY + CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#else + zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#endif +#else + zlsolve ( nsupr, segsze, &lusup[luptr], tempv ); + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + zmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); +#endif + + + /* Scatter tempv[] into SPA dense[] as a temporary storage */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense[irow] = tempv[i]; + tempv[i] = zero; + ++isub; + } + + /* Scatter tempv1[] into SPA dense[] */ + for (i = 0; i < nrow; i++) { + irow = lsub[isub]; + z_sub(&dense[irow], &dense[irow], &tempv1[i]); + tempv1[i] = zero; + ++isub; + } + } + + } /* if jsupno ... */ + + } /* for each segment... */ + + /* + * Process the supernodal portion of L\U[*,j] + */ + nextlu = xlusup[jcol]; + fsupc = xsup[jsupno]; + + /* Copy the SPA dense into L\U[*,j] */ + new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; + while ( new_next > nzlumax ) { + if (mem_error = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) + return (mem_error); + lusup = Glu->lusup; + lsub = Glu->lsub; + } + + for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { + irow = lsub[isub]; + lusup[nextlu] = dense[irow]; + dense[irow] = zero; + ++nextlu; + } + + xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ + + /* For more updates within the panel (also within the current supernode), + * should start from the first column of the panel, or the first column + * of the supernode, whichever is bigger. There are 2 cases: + * 1) fsupc < fpanelc, then fst_col := fpanelc + * 2) fsupc >= fpanelc, then fst_col := fsupc + */ + fst_col = SUPERLU_MAX ( fsupc, fpanelc ); + + if ( fst_col < jcol ) { + + /* Distance between the current supernode and the current panel. + d_fsupc=0 if fsupc >= fpanelc. */ + d_fsupc = fst_col - fsupc; + + lptr = xlsub[fsupc] + d_fsupc; + luptr = xlusup[fst_col] + d_fsupc; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ + nsupc = jcol - fst_col; /* Excluding jcol */ + nrow = nsupr - d_fsupc - nsupc; + + /* Points to the beginning of jcol in snode L\U(jsupno) */ + ufirst = xlusup[jcol] + d_fsupc; + + ops[TRSV] += 4 * nsupc * (nsupc - 1); + ops[GEMV] += 8 * nrow * nsupc; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], + &nsupr, &lusup[ufirst], &incx ); +#else + ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], + &nsupr, &lusup[ufirst], &incx ); +#endif + + alpha = none; beta = one; /* y := beta*y + alpha*A*x */ + +#ifdef _CRAY + CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#else + zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#endif +#else + zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); + + zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], + &lusup[ufirst], tempv ); + + /* Copy updates from tempv[*] into lusup[*] */ + isub = ufirst + nsupc; + for (i = 0; i < nrow; i++) { + z_sub(&lusup[isub], &lusup[isub], &tempv[i]); + tempv[i] = zero; + ++isub; + } + +#endif + + + } /* if fst_col < jcol ... */ + + return 0; +} diff --git a/src/maths/SuperLU/zcolumn_dfs.c b/src/maths/SuperLU/zcolumn_dfs.c new file mode 100644 index 000000000..e7e5eaec0 --- /dev/null +++ b/src/maths/SuperLU/zcolumn_dfs.c @@ -0,0 +1,275 @@ + +/*! @file zcolumn_dfs.c + * \brief Performs a symbolic factorization + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+*/ + +#include + +/*! \brief What type of supernodes we want */ +#define T2_SUPER + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   ZCOLUMN_DFS performs a symbolic factorization on column jcol, and
+ *   decide the supernode boundary.
+ *
+ *   This routine does not use numeric values, but only use the RHS 
+ *   row indices to start the dfs.
+ *
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives. The routine returns a list of such supernodal 
+ *   representatives in topological order of the dfs that generates them.
+ *   The location of the first nonzero in each such supernodal segment
+ *   (supernodal entry location) is also returned.
+ *
+ * Local parameters
+ * ================
+ *   nseg: no of segments in current U[*,j]
+ *   jsuper: jsuper=EMPTY if column j does not belong to the same
+ *	supernode as j-1. Otherwise, jsuper=nsuper.
+ *
+ *   marker2: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ *
+ * Return value
+ * ============
+ *     0  success;
+ *   > 0  number of bytes allocated when run out of space.
+ * 
+ */ +int +zcolumn_dfs( + const int m, /* in - number of rows in the matrix */ + const int jcol, /* in */ + int *perm_r, /* in */ + int *nseg, /* modified - with new segments appended */ + int *lsub_col, /* in - defines the RHS vector to start the dfs */ + int *segrep, /* modified - with new segments appended */ + int *repfnz, /* modified */ + int *xprune, /* modified */ + int *marker, /* modified */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + int jcolp1, jcolm1, jsuper, nsuper, nextl; + int k, krep, krow, kmark, kperm; + int *marker2; /* Used for small panel LU */ + int fsupc; /* First column of a snode */ + int myfnz; /* First nonz column of a U-segment */ + int chperm, chmark, chrep, kchild; + int xdfs, maxdfs, kpar, oldrep; + int jptr, jm1ptr; + int ito, ifrom, istop; /* Used to compress row subscripts */ + int mem_error; + int *xsup, *supno, *lsub, *xlsub; + int nzlmax; + static int first = 1, maxsuper; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + if ( first ) { + maxsuper = sp_ienv(3); + first = 0; + } + jcolp1 = jcol + 1; + jcolm1 = jcol - 1; + nsuper = supno[jcol]; + jsuper = nsuper; + nextl = xlsub[jcol]; + marker2 = &marker[2*m]; + + + /* For each nonzero in A[*,jcol] do dfs */ + for (k = 0; lsub_col[k] != EMPTY; k++) { + + krow = lsub_col[k]; + lsub_col[k] = EMPTY; + kmark = marker2[krow]; + + /* krow was visited before, go to the next nonz */ + if ( kmark == jcol ) continue; + + /* For each unmarked nbr krow of jcol + * krow is in L: place it in structure of L[*,jcol] + */ + marker2[krow] = jcol; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + lsub[nextl++] = krow; /* krow is indexed into A */ + if ( nextl >= nzlmax ) { + if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */ + } else { + /* krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz[krep]; + + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > kperm ) repfnz[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz[krep] = kperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker2[kchild]; + + if ( chmark != jcol ) { /* Not reached yet */ + marker2[kchild] = jcol; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,k] */ + if ( chperm == EMPTY ) { + lsub[nextl++] = kchild; + if ( nextl >= nzlmax ) { + if ( mem_error = + zLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + if ( chmark != jcolm1 ) jsuper = EMPTY; + } else { + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz[chrep]; + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz[chrep] = chperm; + } else { + /* Continue dfs at super-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L^t) */ + parent[krep] = oldrep; + repfnz[krep] = chperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + } /* else */ + + } /* else */ + + } /* if */ + + } /* while */ + + /* krow has no more unexplored nbrs; + * place supernode-rep krep in postorder DFS. + * backtrack dfs to its parent + */ + segrep[*nseg] = krep; + ++(*nseg); + kpar = parent[krep]; /* Pop from stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xprune[krep]; + + } while ( kpar != EMPTY ); /* Until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonzero ... */ + + /* Check to see if j belongs in the same supernode as j-1 */ + if ( jcol == 0 ) { /* Do nothing for column 0 */ + nsuper = supno[0] = 0; + } else { + fsupc = xsup[nsuper]; + jptr = xlsub[jcol]; /* Not compressed yet */ + jm1ptr = xlsub[jcolm1]; + +#ifdef T2_SUPER + if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY; +#endif + /* Make sure the number of columns in a supernode doesn't + exceed threshold. */ + if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY; + + /* If jcol starts a new supernode, reclaim storage space in + * lsub from the previous supernode. Note we only store + * the subscript set of the first and last columns of + * a supernode. (first for num values, last for pruning) + */ + if ( jsuper == EMPTY ) { /* starts a new supernode */ + if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */ +#ifdef CHK_COMPRESS + printf(" Compress lsub[] at super %d-%d\n", fsupc, jcolm1); +#endif + ito = xlsub[fsupc+1]; + xlsub[jcolm1] = ito; + istop = ito + jptr - jm1ptr; + xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */ + xlsub[jcol] = istop; + for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) + lsub[ito] = lsub[ifrom]; + nextl = ito; /* = istop + length(jcol) */ + } + nsuper++; + supno[jcol] = nsuper; + } /* if a new supernode */ + + } /* else: jcol > 0 */ + + /* Tidy up the pointers before exit */ + xsup[nsuper+1] = jcolp1; + supno[jcolp1] = nsuper; + xprune[jcol] = nextl; /* Initialize upper bound for pruning */ + xlsub[jcolp1] = nextl; + + return 0; +} diff --git a/src/maths/SuperLU/zcopy_to_ucol.c b/src/maths/SuperLU/zcopy_to_ucol.c new file mode 100644 index 000000000..33e8251e4 --- /dev/null +++ b/src/maths/SuperLU/zcopy_to_ucol.c @@ -0,0 +1,103 @@ + +/*! @file zcopy_to_ucol.c + * \brief Copy a computed column of U to the compressed data structure + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + +#include + +int +zcopy_to_ucol( + int jcol, /* in */ + int nseg, /* in */ + int *segrep, /* in */ + int *repfnz, /* in */ + int *perm_r, /* in */ + doublecomplex *dense, /* modified - reset to zero on return */ + GlobalLU_t *Glu /* modified */ + ) +{ +/* + * Gather from SPA dense[*] to global ucol[*]. + */ + int ksub, krep, ksupno; + int i, k, kfnz, segsze; + int fsupc, isub, irow; + int jsupno, nextu; + int new_next, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *ucol; + int *usub, *xusub; + int nzumax; + doublecomplex zero = {0.0, 0.0}; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + ucol = Glu->ucol; + usub = Glu->usub; + xusub = Glu->xusub; + nzumax = Glu->nzumax; + + jsupno = supno[jcol]; + nextu = xusub[jcol]; + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { + krep = segrep[k--]; + ksupno = supno[krep]; + + if ( ksupno != jsupno ) { /* Should go into ucol[] */ + kfnz = repfnz[krep]; + if ( kfnz != EMPTY ) { /* Nonzero U-segment */ + + fsupc = xsup[ksupno]; + isub = xlsub[fsupc] + kfnz - fsupc; + segsze = krep - kfnz + 1; + + new_next = nextu + segsze; + while ( new_next > nzumax ) { + if (mem_error = zLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu)) + return (mem_error); + ucol = Glu->ucol; + if (mem_error = zLUMemXpand(jcol, nextu, USUB, &nzumax, Glu)) + return (mem_error); + usub = Glu->usub; + lsub = Glu->lsub; + } + + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + usub[nextu] = perm_r[irow]; + ucol[nextu] = dense[irow]; + dense[irow] = zero; + nextu++; + isub++; + } + + } + + } + + } /* for each segment... */ + + xusub[jcol + 1] = nextu; /* Close U[*,jcol] */ + return 0; +} diff --git a/src/maths/SuperLU/zdiagonal.c b/src/maths/SuperLU/zdiagonal.c new file mode 100644 index 000000000..05b3a6daf --- /dev/null +++ b/src/maths/SuperLU/zdiagonal.c @@ -0,0 +1,133 @@ + +/*! @file zdiagonal.c + * \brief Auxiliary routines to work with diagonal elements + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory
+ * June 30, 2009
+ * 
+ */ + +#include + +int zfill_diag(int n, NCformat *Astore) +/* fill explicit zeros on the diagonal entries, so that the matrix is not + structurally singular. */ +{ + doublecomplex *nzval = (doublecomplex *)Astore->nzval; + int *rowind = Astore->rowind; + int *colptr = Astore->colptr; + int nnz = colptr[n]; + int fill = 0; + doublecomplex *nzval_new; + doublecomplex zero = {1.0, 0.0}; + int *rowind_new; + int i, j, diag; + + for (i = 0; i < n; i++) + { + diag = -1; + for (j = colptr[i]; j < colptr[i + 1]; j++) + if (rowind[j] == i) diag = j; + if (diag < 0) fill++; + } + if (fill) + { + nzval_new = doublecomplexMalloc(nnz + fill); + rowind_new = intMalloc(nnz + fill); + fill = 0; + for (i = 0; i < n; i++) + { + diag = -1; + for (j = colptr[i] - fill; j < colptr[i + 1]; j++) + { + if ((rowind_new[j + fill] = rowind[j]) == i) diag = j; + nzval_new[j + fill] = nzval[j]; + } + if (diag < 0) + { + rowind_new[colptr[i + 1] + fill] = i; + nzval_new[colptr[i + 1] + fill] = zero; + fill++; + } + colptr[i + 1] += fill; + } + Astore->nzval = nzval_new; + Astore->rowind = rowind_new; + SUPERLU_FREE(nzval); + SUPERLU_FREE(rowind); + } + Astore->nnz += fill; + return fill; +} + +int zdominate(int n, NCformat *Astore) +/* make the matrix diagonally dominant */ +{ + doublecomplex *nzval = (doublecomplex *)Astore->nzval; + int *rowind = Astore->rowind; + int *colptr = Astore->colptr; + int nnz = colptr[n]; + int fill = 0; + doublecomplex *nzval_new; + int *rowind_new; + int i, j, diag; + double s; + + for (i = 0; i < n; i++) + { + diag = -1; + for (j = colptr[i]; j < colptr[i + 1]; j++) + if (rowind[j] == i) diag = j; + if (diag < 0) fill++; + } + if (fill) + { + nzval_new = doublecomplexMalloc(nnz + fill); + rowind_new = intMalloc(nnz+ fill); + fill = 0; + for (i = 0; i < n; i++) + { + s = 1e-6; + diag = -1; + for (j = colptr[i] - fill; j < colptr[i + 1]; j++) + { + if ((rowind_new[j + fill] = rowind[j]) == i) diag = j; + nzval_new[j + fill] = nzval[j]; + s += z_abs1(&nzval_new[j + fill]); + } + if (diag >= 0) { + nzval_new[diag+fill].r = s * 3.0; + nzval_new[diag+fill].i = 0.0; + } else { + rowind_new[colptr[i + 1] + fill] = i; + nzval_new[colptr[i + 1] + fill].r = s * 3.0; + nzval_new[colptr[i + 1] + fill].i = 0.0; + fill++; + } + colptr[i + 1] += fill; + } + Astore->nzval = nzval_new; + Astore->rowind = rowind_new; + SUPERLU_FREE(nzval); + SUPERLU_FREE(rowind); + } + else + { + for (i = 0; i < n; i++) + { + s = 1e-6; + diag = -1; + for (j = colptr[i]; j < colptr[i + 1]; j++) + { + if (rowind[j] == i) diag = j; + s += z_abs1(&nzval[j]); + } + nzval[diag].r = s * 3.0; + nzval[diag].i = 0.0; + } + } + Astore->nnz += fill; + return fill; +} diff --git a/src/maths/SuperLU/zgscon.c b/src/maths/SuperLU/zgscon.c new file mode 100644 index 000000000..52a3939a6 --- /dev/null +++ b/src/maths/SuperLU/zgscon.c @@ -0,0 +1,154 @@ + +/*! @file zgscon.c + * \brief Estimates reciprocal of the condition number of a general matrix + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Modified from lapack routines ZGECON.
+ * 
+ */ + +/* + * File name: zgscon.c + * History: Modified from lapack routines ZGECON. + */ +#include +#include + +/*! \brief + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   ZGSCON estimates the reciprocal of the condition number of a general 
+ *   real matrix A, in either the 1-norm or the infinity-norm, using   
+ *   the LU factorization computed by ZGETRF.   *
+ *
+ *   An estimate is obtained for norm(inv(A)), and the reciprocal of the   
+ *   condition number is computed as   
+ *      RCOND = 1 / ( norm(A) * norm(inv(A)) ).   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ * 
+ *   Arguments   
+ *   =========   
+ *
+ *    NORM    (input) char*
+ *            Specifies whether the 1-norm condition number or the   
+ *            infinity-norm condition number is required:   
+ *            = '1' or 'O':  1-norm;   
+ *            = 'I':         Infinity-norm.
+ *	    
+ *    L       (input) SuperMatrix*
+ *            The factor L from the factorization Pr*A*Pc=L*U as computed by
+ *            zgstrf(). Use compressed row subscripts storage for supernodes,
+ *            i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
+ * 
+ *    U       (input) SuperMatrix*
+ *            The factor U from the factorization Pr*A*Pc=L*U as computed by
+ *            zgstrf(). Use column-wise storage scheme, i.e., U has types:
+ *            Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
+ *	    
+ *    ANORM   (input) double
+ *            If NORM = '1' or 'O', the 1-norm of the original matrix A.   
+ *            If NORM = 'I', the infinity-norm of the original matrix A.
+ *	    
+ *    RCOND   (output) double*
+ *           The reciprocal of the condition number of the matrix A,   
+ *           computed as RCOND = 1/(norm(A) * norm(inv(A))).
+ *	    
+ *    INFO    (output) int*
+ *           = 0:  successful exit   
+ *           < 0:  if INFO = -i, the i-th argument had an illegal value   
+ *
+ *    ===================================================================== 
+ * 
+ */ + +void +zgscon(char *norm, SuperMatrix *L, SuperMatrix *U, + double anorm, double *rcond, SuperLUStat_t *stat, int *info) +{ + + + /* Local variables */ + int kase, kase1, onenrm, i; + double ainvnm; + doublecomplex *work; + extern int zrscl_(int *, doublecomplex *, doublecomplex *, int *); + + extern int zlacon_(int *, doublecomplex *, doublecomplex *, double *, int *); + + + /* Test the input parameters. */ + *info = 0; + onenrm = *(unsigned char *)norm == '1' || lsame_(norm, "O"); + if (! onenrm && ! lsame_(norm, "I")) *info = -1; + else if (L->nrow < 0 || L->nrow != L->ncol || + L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU) + *info = -2; + else if (U->nrow < 0 || U->nrow != U->ncol || + U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU) + *info = -3; + if (*info != 0) { + i = -(*info); + xerbla_("zgscon", &i); + return; + } + + /* Quick return if possible */ + *rcond = 0.; + if ( L->nrow == 0 || U->nrow == 0) { + *rcond = 1.; + return; + } + + work = doublecomplexCalloc( 3*L->nrow ); + + + if ( !work ) + ABORT_SuperLU("Malloc fails for work arrays in zgscon."); + + /* Estimate the norm of inv(A). */ + ainvnm = 0.; + if ( onenrm ) kase1 = 1; + else kase1 = 2; + kase = 0; + + do { + zlacon_(&L->nrow, &work[L->nrow], &work[0], &ainvnm, &kase); + + if (kase == 0) break; + + if (kase == kase1) { + /* Multiply by inv(L). */ + sp_ztrsv("L", "No trans", "Unit", L, U, &work[0], stat, info); + + /* Multiply by inv(U). */ + sp_ztrsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info); + + } else { + + /* Multiply by inv(U'). */ + sp_ztrsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info); + + /* Multiply by inv(L'). */ + sp_ztrsv("L", "Transpose", "Unit", L, U, &work[0], stat, info); + + } + + } while ( kase != 0 ); + + /* Compute the estimate of the reciprocal condition number. */ + if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm; + + SUPERLU_FREE (work); + return; + +} /* zgscon */ + diff --git a/src/maths/SuperLU/zgsequ.c b/src/maths/SuperLU/zgsequ.c new file mode 100644 index 000000000..2ca1e4abe --- /dev/null +++ b/src/maths/SuperLU/zgsequ.c @@ -0,0 +1,195 @@ + +/*! @file zgsequ.c + * \brief Computes row and column scalings + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Modified from LAPACK routine ZGEEQU
+ * 
+ */ +/* + * File name: zgsequ.c + * History: Modified from LAPACK routine ZGEEQU + */ +#include +#include + + + +/*! \brief + * + *
+ * Purpose   
+ *   =======   
+ *
+ *   ZGSEQU computes row and column scalings intended to equilibrate an   
+ *   M-by-N sparse matrix A and reduce its condition number. R returns the row
+ *   scale factors and C the column scale factors, chosen to try to make   
+ *   the largest element in each row and column of the matrix B with   
+ *   elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
+ *
+ *   R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
+ *   number and BIGNUM = largest safe number.  Use of these scaling   
+ *   factors is not guaranteed to reduce the condition number of A but   
+ *   works well in practice.   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   A       (input) SuperMatrix*
+ *           The matrix of dimension (A->nrow, A->ncol) whose equilibration
+ *           factors are to be computed. The type of A can be:
+ *           Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE.
+ *	    
+ *   R       (output) double*, size A->nrow
+ *           If INFO = 0 or INFO > M, R contains the row scale factors   
+ *           for A.
+ *	    
+ *   C       (output) double*, size A->ncol
+ *           If INFO = 0,  C contains the column scale factors for A.
+ *	    
+ *   ROWCND  (output) double*
+ *           If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
+ *           smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
+ *           AMAX is neither too large nor too small, it is not worth   
+ *           scaling by R.
+ *	    
+ *   COLCND  (output) double*
+ *           If INFO = 0, COLCND contains the ratio of the smallest   
+ *           C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+ *           worth scaling by C.
+ *	    
+ *   AMAX    (output) double*
+ *           Absolute value of largest matrix element.  If AMAX is very   
+ *           close to overflow or very close to underflow, the matrix   
+ *           should be scaled.
+ *	    
+ *   INFO    (output) int*
+ *           = 0:  successful exit   
+ *           < 0:  if INFO = -i, the i-th argument had an illegal value   
+ *           > 0:  if INFO = i,  and i is   
+ *                 <= A->nrow:  the i-th row of A is exactly zero   
+ *                 >  A->ncol:  the (i-M)-th column of A is exactly zero   
+ *
+ *   ===================================================================== 
+ * 
+ */ +void +zgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, + double *colcnd, double *amax, int *info) +{ + + + /* Local variables */ + NCformat *Astore; + doublecomplex *Aval; + int i, j, irow; + double rcmin, rcmax; + double bignum, smlnum; + extern double dlamch_(char *); + + /* Test the input parameters. */ + *info = 0; + if ( A->nrow < 0 || A->ncol < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) { + i = -(*info); + xerbla_("zgsequ", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || A->ncol == 0 ) { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + + /* Get machine constants. */ + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + + /* Compute row scale factors. */ + for (i = 0; i < A->nrow; ++i) r[i] = 0.; + + /* Find the maximum element in each row. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + r[irow] = SUPERLU_MAX( r[irow], z_abs1(&Aval[i]) ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (i = 0; i < A->nrow; ++i) { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); + } + *amax = rcmax; + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) { + *info = i + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* Compute column scale factors */ + for (j = 0; j < A->ncol; ++j) c[j] = 0.; + + /* Find the maximum element in each column, assuming the row + scalings computed above. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + c[j] = SUPERLU_MAX( c[j], z_abs1(&Aval[i]) * r[irow] ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->ncol; ++j) { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); + } + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) { + *info = A->nrow + j + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + return; + +} /* zgsequ */ + + diff --git a/src/maths/SuperLU/zgsisx.c b/src/maths/SuperLU/zgsisx.c new file mode 100644 index 000000000..aaee79f66 --- /dev/null +++ b/src/maths/SuperLU/zgsisx.c @@ -0,0 +1,727 @@ + +/*! @file zgsisx.c + * \brief Computes an approximate solutions of linear equations A*X=B or A'*X=B + * + *
+ * -- SuperLU routine (version 4.2) --
+ * Lawrence Berkeley National Laboratory.
+ * November, 2010
+ * August, 2011
+ * 
+ */ +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * ZGSISX computes an approximate solutions of linear equations
+ * A*X=B or A'*X=B, using the ILU factorization from zgsitrf().
+ * An estimation of the condition number is provided. 
+ * The routine performs the following steps:
+ *
+ *   1. If A is stored column-wise (A->Stype = SLU_NC):
+ *  
+ *	1.1. If options->Equil = YES or options->RowPerm = LargeDiag, scaling
+ *	     factors are computed to equilibrate the system:
+ *	     options->Trans = NOTRANS:
+ *		 diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
+ *	     options->Trans = TRANS:
+ *		 (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+ *	     options->Trans = CONJ:
+ *		 (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+ *	     Whether or not the system will be equilibrated depends on the
+ *	     scaling of the matrix A, but if equilibration is used, A is
+ *	     overwritten by diag(R)*A*diag(C) and B by diag(R)*B
+ *	     (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans
+ *	     = TRANS or CONJ).
+ *
+ *	1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
+ *	     matrix that usually preserves sparsity.
+ *	     For more details of this step, see sp_preorder.c.
+ *
+ *	1.3. If options->Fact != FACTORED, the LU decomposition is used to
+ *	     factor the matrix A (after equilibration if options->Equil = YES)
+ *	     as Pr*A*Pc = L*U, with Pr determined by partial pivoting.
+ *
+ *	1.4. Compute the reciprocal pivot growth factor.
+ *
+ *	1.5. If some U(i,i) = 0, so that U is exactly singular, then the
+ *	     routine fills a small number on the diagonal entry, that is
+ *		U(i,i) = ||A(:,i)||_oo * options->ILU_FillTol ** (1 - i / n),
+ *	     and info will be increased by 1. The factored form of A is used
+ *	     to estimate the condition number of the preconditioner. If the
+ *	     reciprocal of the condition number is less than machine precision,
+ *	     info = A->ncol+1 is returned as a warning, but the routine still
+ *	     goes on to solve for X.
+ *
+ *	1.6. The system of equations is solved for X using the factored form
+ *	     of A.
+ *
+ *	1.7. options->IterRefine is not used
+ *
+ *	1.8. If equilibration was used, the matrix X is premultiplied by
+ *	     diag(C) (if options->Trans = NOTRANS) or diag(R)
+ *	     (if options->Trans = TRANS or CONJ) so that it solves the
+ *	     original system before equilibration.
+ *
+ *	1.9. options for ILU only
+ *	     1) If options->RowPerm = LargeDiag, MC64 is used to scale and
+ *		permute the matrix to an I-matrix, that is Pr*Dr*A*Dc has
+ *		entries of modulus 1 on the diagonal and off-diagonal entries
+ *		of modulus at most 1. If MC64 fails, dgsequ() is used to
+ *		equilibrate the system.
+ *              ( Default: LargeDiag )
+ *	     2) options->ILU_DropTol = tau is the threshold for dropping.
+ *		For L, it is used directly (for the whole row in a supernode);
+ *		For U, ||A(:,i)||_oo * tau is used as the threshold
+ *	        for the	i-th column.
+ *		If a secondary dropping rule is required, tau will
+ *	        also be used to compute the second threshold.
+ *              ( Default: 1e-4 )
+ *	     3) options->ILU_FillFactor = gamma, used as the initial guess
+ *		of memory growth.
+ *		If a secondary dropping rule is required, it will also
+ *              be used as an upper bound of the memory.
+ *              ( Default: 10 )
+ *	     4) options->ILU_DropRule specifies the dropping rule.
+ *		Option	      Meaning
+ *		======	      ===========
+ *		DROP_BASIC:   Basic dropping rule, supernodal based ILUTP(tau).
+ *		DROP_PROWS:   Supernodal based ILUTP(p,tau), p = gamma*nnz(A)/n.
+ *		DROP_COLUMN:  Variant of ILUTP(p,tau), for j-th column,
+ *			      p = gamma * nnz(A(:,j)).
+ *		DROP_AREA:    Variation of ILUTP, for j-th column, use
+ *			      nnz(F(:,1:j)) / nnz(A(:,1:j)) to control memory.
+ *		DROP_DYNAMIC: Modify the threshold tau during factorizaion:
+ *			      If nnz(L(:,1:j)) / nnz(A(:,1:j)) > gamma
+ *				  tau_L(j) := MIN(tau_0, tau_L(j-1) * 2);
+ *			      Otherwise
+ *				  tau_L(j) := MAX(tau_0, tau_L(j-1) / 2);
+ *			      tau_U(j) uses the similar rule.
+ *			      NOTE: the thresholds used by L and U are separate.
+ *		DROP_INTERP:  Compute the second dropping threshold by
+ *			      interpolation instead of sorting (default).
+ *			      In this case, the actual fill ratio is not
+ *			      guaranteed smaller than gamma.
+ *		DROP_PROWS, DROP_COLUMN and DROP_AREA are mutually exclusive.
+ *		( Default: DROP_BASIC | DROP_AREA )
+ *	     5) options->ILU_Norm is the criterion of measuring the magnitude
+ *		of a row in a supernode of L. ( Default is INF_NORM )
+ *		options->ILU_Norm	RowSize(x[1:n])
+ *		=================	===============
+ *		ONE_NORM		||x||_1 / n
+ *		TWO_NORM		||x||_2 / sqrt(n)
+ *		INF_NORM		max{|x[i]|}
+ *	     6) options->ILU_MILU specifies the type of MILU's variation.
+ *		= SILU: do not perform Modified ILU;
+ *		= SMILU_1 (not recommended):
+ *		    U(i,i) := U(i,i) + sum(dropped entries);
+ *		= SMILU_2:
+ *		    U(i,i) := U(i,i) + SGN(U(i,i)) * sum(dropped entries);
+ *		= SMILU_3:
+ *		    U(i,i) := U(i,i) + SGN(U(i,i)) * sum(|dropped entries|);
+ *		NOTE: Even SMILU_1 does not preserve the column sum because of
+ *		late dropping.
+ *              ( Default: SILU )
+ *	     7) options->ILU_FillTol is used as the perturbation when
+ *		encountering zero pivots. If some U(i,i) = 0, so that U is
+ *		exactly singular, then
+ *		   U(i,i) := ||A(:,i)|| * options->ILU_FillTol ** (1 - i / n).
+ *              ( Default: 1e-2 )
+ *
+ *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
+ *	to the transpose of A:
+ *
+ *	2.1. If options->Equil = YES or options->RowPerm = LargeDiag, scaling
+ *	     factors are computed to equilibrate the system:
+ *	     options->Trans = NOTRANS:
+ *		 diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
+ *	     options->Trans = TRANS:
+ *		 (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+ *	     options->Trans = CONJ:
+ *		 (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+ *	     Whether or not the system will be equilibrated depends on the
+ *	     scaling of the matrix A, but if equilibration is used, A' is
+ *	     overwritten by diag(R)*A'*diag(C) and B by diag(R)*B
+ *	     (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
+ *
+ *	2.2. Permute columns of transpose(A) (rows of A),
+ *	     forming transpose(A)*Pc, where Pc is a permutation matrix that
+ *	     usually preserves sparsity.
+ *	     For more details of this step, see sp_preorder.c.
+ *
+ *	2.3. If options->Fact != FACTORED, the LU decomposition is used to
+ *	     factor the transpose(A) (after equilibration if
+ *	     options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the
+ *	     permutation Pr determined by partial pivoting.
+ *
+ *	2.4. Compute the reciprocal pivot growth factor.
+ *
+ *	2.5. If some U(i,i) = 0, so that U is exactly singular, then the
+ *	     routine fills a small number on the diagonal entry, that is
+ *		 U(i,i) = ||A(:,i)||_oo * options->ILU_FillTol ** (1 - i / n).
+ *	     And info will be increased by 1. The factored form of A is used
+ *	     to estimate the condition number of the preconditioner. If the
+ *	     reciprocal of the condition number is less than machine precision,
+ *	     info = A->ncol+1 is returned as a warning, but the routine still
+ *	     goes on to solve for X.
+ *
+ *	2.6. The system of equations is solved for X using the factored form
+ *	     of transpose(A).
+ *
+ *	2.7. If options->IterRefine is not used.
+ *
+ *	2.8. If equilibration was used, the matrix X is premultiplied by
+ *	     diag(C) (if options->Trans = NOTRANS) or diag(R)
+ *	     (if options->Trans = TRANS or CONJ) so that it solves the
+ *	     original system before equilibration.
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *	   The structure defines the input parameters to control
+ *	   how the LU decomposition will be performed and how the
+ *	   system will be solved.
+ *
+ * A	   (input/output) SuperMatrix*
+ *	   Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *	   of the linear equations is A->nrow. Currently, the type of A can be:
+ *	   Stype = SLU_NC or SLU_NR, Dtype = SLU_Z, Mtype = SLU_GE.
+ *	   In the future, more general A may be handled.
+ *
+ *	   On entry, If options->Fact = FACTORED and equed is not 'N',
+ *	   then A must have been equilibrated by the scaling factors in
+ *	   R and/or C.
+ *	   On exit, A is not modified
+ *         if options->Equil = NO, or
+ *         if options->Equil = YES but equed = 'N' on exit, or
+ *         if options->RowPerm = NO.
+ *
+ *	   Otherwise, if options->Equil = YES and equed is not 'N',
+ *	   A is scaled as follows:
+ *	   If A->Stype = SLU_NC:
+ *	     equed = 'R':  A := diag(R) * A
+ *	     equed = 'C':  A := A * diag(C)
+ *	     equed = 'B':  A := diag(R) * A * diag(C).
+ *	   If A->Stype = SLU_NR:
+ *	     equed = 'R':  transpose(A) := diag(R) * transpose(A)
+ *	     equed = 'C':  transpose(A) := transpose(A) * diag(C)
+ *	     equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
+ *
+ *         If options->RowPerm = LargeDiag, MC64 is used to scale and permute
+ *            the matrix to an I-matrix, that is A is modified as follows:
+ *            P*Dr*A*Dc has entries of modulus 1 on the diagonal and 
+ *            off-diagonal entries of modulus at most 1. P is a permutation
+ *            obtained from MC64.
+ *            If MC64 fails, zgsequ() is used to equilibrate the system,
+ *            and A is scaled as above, but no permutation is involved.
+ *            On exit, A is restored to the orginal row numbering, so
+ *            Dr*A*Dc is returned.
+ *
+ * perm_c  (input/output) int*
+ *	   If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
+ *	   which defines the permutation matrix Pc; perm_c[i] = j means
+ *	   column i of A is in position j in A*Pc.
+ *	   On exit, perm_c may be overwritten by the product of the input
+ *	   perm_c and a permutation that postorders the elimination tree
+ *	   of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
+ *	   is already in postorder.
+ *
+ *	   If A->Stype = SLU_NR, column permutation vector of size A->nrow,
+ *	   which describes permutation of columns of transpose(A) 
+ *	   (rows of A) as described above.
+ *
+ * perm_r  (input/output) int*
+ *	   If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
+ *	   which defines the permutation matrix Pr, and is determined
+ *	   by MC64 first then followed by partial pivoting.
+ *         perm_r[i] = j means row i of A is in position j in Pr*A.
+ *
+ *	   If A->Stype = SLU_NR, permutation vector of size A->ncol, which
+ *	   determines permutation of rows of transpose(A)
+ *	   (columns of A) as described above.
+ *
+ *	   If options->Fact = SamePattern_SameRowPerm, the pivoting routine
+ *	   will try to use the input perm_r, unless a certain threshold
+ *	   criterion is violated. In that case, perm_r is overwritten by a
+ *	   new permutation determined by partial pivoting or diagonal
+ *	   threshold pivoting.
+ *	   Otherwise, perm_r is output argument.
+ *
+ * etree   (input/output) int*,  dimension (A->ncol)
+ *	   Elimination tree of Pc'*A'*A*Pc.
+ *	   If options->Fact != FACTORED and options->Fact != DOFACT,
+ *	   etree is an input argument, otherwise it is an output argument.
+ *	   Note: etree is a vector of parent pointers for a forest whose
+ *	   vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ * equed   (input/output) char*
+ *	   Specifies the form of equilibration that was done.
+ *	   = 'N': No equilibration.
+ *	   = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
+ *	   = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
+ *	   = 'B': Both row and column equilibration, i.e., A was replaced 
+ *		  by diag(R)*A*diag(C).
+ *	   If options->Fact = FACTORED, equed is an input argument,
+ *	   otherwise it is an output argument.
+ *
+ * R	   (input/output) double*, dimension (A->nrow)
+ *	   The row scale factors for A or transpose(A).
+ *	   If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
+ *	       (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
+ *	   If equed = 'N' or 'C', R is not accessed.
+ *	   If options->Fact = FACTORED, R is an input argument,
+ *	       otherwise, R is output.
+ *	   If options->Fact = FACTORED and equed = 'R' or 'B', each element
+ *	       of R must be positive.
+ *
+ * C	   (input/output) double*, dimension (A->ncol)
+ *	   The column scale factors for A or transpose(A).
+ *	   If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
+ *	       (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
+ *	   If equed = 'N' or 'R', C is not accessed.
+ *	   If options->Fact = FACTORED, C is an input argument,
+ *	       otherwise, C is output.
+ *	   If options->Fact = FACTORED and equed = 'C' or 'B', each element
+ *	       of C must be positive.
+ *
+ * L	   (output) SuperMatrix*
+ *	   The factor L from the factorization
+ *	       Pr*A*Pc=L*U		(if A->Stype SLU_= NC) or
+ *	       Pr*transpose(A)*Pc=L*U	(if A->Stype = SLU_NR).
+ *	   Uses compressed row subscripts storage for supernodes, i.e.,
+ *	   L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
+ *
+ * U	   (output) SuperMatrix*
+ *	   The factor U from the factorization
+ *	       Pr*A*Pc=L*U		(if A->Stype = SLU_NC) or
+ *	       Pr*transpose(A)*Pc=L*U	(if A->Stype = SLU_NR).
+ *	   Uses column-wise storage scheme, i.e., U has types:
+ *	   Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
+ *
+ * work    (workspace/output) void*, size (lwork) (in bytes)
+ *	   User supplied workspace, should be large enough
+ *	   to hold data structures for factors L and U.
+ *	   On exit, if fact is not 'F', L and U point to this array.
+ *
+ * lwork   (input) int
+ *	   Specifies the size of work array in bytes.
+ *	   = 0:  allocate space internally by system malloc;
+ *	   > 0:  use user-supplied work array of length lwork in bytes,
+ *		 returns error if space runs out.
+ *	   = -1: the routine guesses the amount of space needed without
+ *		 performing the factorization, and returns it in
+ *		 mem_usage->total_needed; no other side effects.
+ *
+ *	   See argument 'mem_usage' for memory usage statistics.
+ *
+ * B	   (input/output) SuperMatrix*
+ *	   B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
+ *	   On entry, the right hand side matrix.
+ *	   If B->ncol = 0, only LU decomposition is performed, the triangular
+ *			   solve is skipped.
+ *	   On exit,
+ *	      if equed = 'N', B is not modified; otherwise
+ *	      if A->Stype = SLU_NC:
+ *		 if options->Trans = NOTRANS and equed = 'R' or 'B',
+ *		    B is overwritten by diag(R)*B;
+ *		 if options->Trans = TRANS or CONJ and equed = 'C' of 'B',
+ *		    B is overwritten by diag(C)*B;
+ *	      if A->Stype = SLU_NR:
+ *		 if options->Trans = NOTRANS and equed = 'C' or 'B',
+ *		    B is overwritten by diag(C)*B;
+ *		 if options->Trans = TRANS or CONJ and equed = 'R' of 'B',
+ *		    B is overwritten by diag(R)*B.
+ *
+ * X	   (output) SuperMatrix*
+ *	   X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
+ *	   If info = 0 or info = A->ncol+1, X contains the solution matrix
+ *	   to the original system of equations. Note that A and B are modified
+ *	   on exit if equed is not 'N', and the solution to the equilibrated
+ *	   system is inv(diag(C))*X if options->Trans = NOTRANS and
+ *	   equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C'
+ *	   and equed = 'R' or 'B'.
+ *
+ * recip_pivot_growth (output) double*
+ *	   The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
+ *	   The infinity norm is used. If recip_pivot_growth is much less
+ *	   than 1, the stability of the LU factorization could be poor.
+ *
+ * rcond   (output) double*
+ *	   The estimate of the reciprocal condition number of the matrix A
+ *	   after equilibration (if done). If rcond is less than the machine
+ *	   precision (in particular, if rcond = 0), the matrix is singular
+ *	   to working precision. This condition is indicated by a return
+ *	   code of info > 0.
+ *
+ * mem_usage (output) mem_usage_t*
+ *	   Record the memory usage statistics, consisting of following fields:
+ *	   - for_lu (float)
+ *	     The amount of space used in bytes for L\U data structures.
+ *	   - total_needed (float)
+ *	     The amount of space needed in bytes to perform factorization.
+ *	   - expansions (int)
+ *	     The number of memory expansions during the LU factorization.
+ *
+ * stat   (output) SuperLUStat_t*
+ *	  Record the statistics on runtime and floating-point operation count.
+ *	  See slu_util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ *	   > 0: if info = i, and i is
+ *		<= A->ncol: number of zero pivots. They are replaced by small
+ *		      entries due to options->ILU_FillTol.
+ *		= A->ncol+1: U is nonsingular, but RCOND is less than machine
+ *		      precision, meaning that the matrix is singular to
+ *		      working precision. Nevertheless, the solution and
+ *		      error bounds are computed because there are a number
+ *		      of situations where the computed solution can be more
+ *		      accurate than the value of RCOND would suggest.
+ *		> A->ncol+1: number of bytes allocated when memory allocation
+ *		      failure occurred, plus A->ncol.
+ * 
+ */ + +void +zgsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + int *etree, char *equed, double *R, double *C, + SuperMatrix *L, SuperMatrix *U, void *work, int lwork, + SuperMatrix *B, SuperMatrix *X, + double *recip_pivot_growth, double *rcond, + mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info) +{ + + DNformat *Bstore, *Xstore; + doublecomplex *Bmat, *Xmat; + int ldb, ldx, nrhs, n; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int colequ, equil, nofact, notran, rowequ, permc_spec, mc64; + trans_t trant; + char norm[1]; + int i, j, info1; + double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; + int relax, panel_size; + double diag_pivot_thresh; + double t0; /* temporary time */ + double *utime; + + int *perm = NULL; /* permutation returned from MC64 */ + + /* External functions */ + extern double zlangs(char *, SuperMatrix *); + + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + n = B->nrow; + + *info = 0; + nofact = (options->Fact != FACTORED); + equil = (options->Equil == YES_SuperLU); + notran = (options->Trans == NOTRANS); + mc64 = (options->RowPerm == LargeDiag); + if ( nofact ) { + *(unsigned char *)equed = 'N'; + rowequ = FALSE; + colequ = FALSE; + } else { + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + smlnum = dlamch_("Safe minimum"); + bignum = 1. / smlnum; + } + + /* Test the input parameters */ + if (options->Fact != DOFACT && options->Fact != SamePattern && + options->Fact != SamePattern_SameRowPerm && + options->Fact != FACTORED && + options->Trans != NOTRANS && options->Trans != TRANS && + options->Trans != CONJ && + options->Equil != NO_SuperLU && options->Equil != YES_SuperLU) + *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_Z || A->Mtype != SLU_GE ) + *info = -2; + else if (options->Fact == FACTORED && + !(rowequ || colequ || lsame_(equed, "N"))) + *info = -6; + else { + if (rowequ) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, R[j]); + rcmax = SUPERLU_MAX(rcmax, R[j]); + } + if (rcmin <= 0.) *info = -7; + else if ( A->nrow > 0) + rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else rowcnd = 1.; + } + if (colequ && *info == 0) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, C[j]); + rcmax = SUPERLU_MAX(rcmax, C[j]); + } + if (rcmin <= 0.) *info = -8; + else if (A->nrow > 0) + colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else colcnd = 1.; + } + if (*info == 0) { + if ( lwork < -1 ) *info = -12; + else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_Z || + B->Mtype != SLU_GE ) + *info = -13; + else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || + (B->ncol != 0 && B->ncol != X->ncol) || + X->Stype != SLU_DN || + X->Dtype != SLU_Z || X->Mtype != SLU_GE ) + *info = -14; + } + } + if (*info != 0) { + i = -(*info); + xerbla_("zgsisx", &i); + return; + } + + /* Initialization for factor parameters */ + panel_size = sp_ienv(1); + relax = sp_ienv(2); + diag_pivot_thresh = options->DiagPivotThresh; + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + if ( notran ) { /* Reverse the transpose argument. */ + trant = TRANS; + notran = 0; + } else { + trant = NOTRANS; + notran = 1; + } + } else { /* A->Stype == SLU_NC */ + trant = options->Trans; + AA = A; + } + + if ( nofact ) { + register int i, j; + NCformat *Astore = AA->Store; + int nnz = Astore->nnz; + int *colptr = Astore->colptr; + int *rowind = Astore->rowind; + doublecomplex *nzval = (doublecomplex *)Astore->nzval; + + if ( mc64 ) { + t0 = SuperLU_timer_(); + if ((perm = intMalloc(n)) == NULL) + ABORT_SuperLU("SUPERLU_MALLOC fails for perm[]"); + + info1 = zldperm(5, n, nnz, colptr, rowind, nzval, perm, R, C); + + if (info1 != 0) { /* MC64 fails, call zgsequ() later */ + mc64 = 0; + SUPERLU_FREE(perm); + perm = NULL; + } else { + if ( equil ) { + rowequ = colequ = 1; + for (i = 0; i < n; i++) { + R[i] = exp(R[i]); + C[i] = exp(C[i]); + } + /* scale the matrix */ + for (j = 0; j < n; j++) { + for (i = colptr[j]; i < colptr[j + 1]; i++) { + zd_mult(&nzval[i], &nzval[i], R[rowind[i]] * C[j]); + } + } + *equed = 'B'; + } + + /* permute the matrix */ + for (j = 0; j < n; j++) { + for (i = colptr[j]; i < colptr[j + 1]; i++) { + /*nzval[i] *= R[rowind[i]] * C[j];*/ + rowind[i] = perm[rowind[i]]; + } + } + } + utime[EQUIL] = SuperLU_timer_() - t0; + } + + if ( !mc64 & equil ) { /* Only perform equilibration, no row perm */ + t0 = SuperLU_timer_(); + /* Compute row and column scalings to equilibrate the matrix A. */ + zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); + + if ( info1 == 0 ) { + /* Equilibrate matrix A. */ + zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + } + utime[EQUIL] = SuperLU_timer_() - t0; + } + } + + + if ( nofact ) { + + t0 = SuperLU_timer_(); + /* + * Gnet column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t0; + + t0 = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t0; + + /* Compute the LU factorization of A*Pc. */ + t0 = SuperLU_timer_(); + zgsitrf(options, &AC, relax, panel_size, etree, work, lwork, + perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t0; + + if ( lwork == -1 ) { + mem_usage->total_needed = *info - A->ncol; + return; + } + + if ( mc64 ) { /* Fold MC64's perm[] into perm_r[]. */ + NCformat *Astore = AA->Store; + int nnz = Astore->nnz, *rowind = Astore->rowind; + int *perm_tmp, *iperm; + if ((perm_tmp = intMalloc(2*n)) == NULL) + ABORT_SuperLU("SUPERLU_MALLOC fails for perm_tmp[]"); + iperm = perm_tmp + n; + for (i = 0; i < n; ++i) perm_tmp[i] = perm_r[perm[i]]; + for (i = 0; i < n; ++i) { + perm_r[i] = perm_tmp[i]; + iperm[perm[i]] = i; + } + + /* Restore A's original row indices. */ + for (i = 0; i < nnz; ++i) rowind[i] = iperm[rowind[i]]; + + SUPERLU_FREE(perm); /* MC64 permutation */ + SUPERLU_FREE(perm_tmp); + } + } + + if ( options->PivotGrowth ) { + if ( *info > 0 ) return; + + /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ + *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U); + } + + if ( options->ConditionNumber ) { + /* Estimate the reciprocal of the condition number of A. */ + t0 = SuperLU_timer_(); + if ( notran ) { + *(unsigned char *)norm = '1'; + } else { + *(unsigned char *)norm = 'I'; + } + anorm = zlangs(norm, AA); + zgscon(norm, L, U, anorm, rcond, stat, &info1); + utime[RCOND] = SuperLU_timer_() - t0; + } + + if ( nrhs > 0 ) { /* Solve the system */ + doublecomplex *rhs_work; + + /* Scale and permute the right-hand side if equilibration + and permutation from MC64 were performed. */ + if ( notran ) { + if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) + zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]); + } + } else if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) { + zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]); + } + } + + /* Compute the solution matrix X. */ + for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ + for (i = 0; i < B->nrow; i++) + Xmat[i + j*ldx] = Bmat[i + j*ldb]; + + t0 = SuperLU_timer_(); + zgstrs (trant, L, U, perm_c, perm_r, X, stat, &info1); + utime[SOLVE] = SuperLU_timer_() - t0; + + /* Transform the solution matrix X to a solution of the original + system. */ + if ( notran ) { + if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) { + zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]); + } + } + } else { /* transposed system */ + if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) { + zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]); + } + } + } + + } /* end if nrhs > 0 */ + + if ( options->ConditionNumber ) { + /* The matrix is singular to working precision. */ + if ( *rcond < dlamch_("E") && *info == 0) *info = A->ncol + 1; + } + + if ( nofact ) { + ilu_zQuerySpace(L, U, mem_usage); + Destroy_CompCol_Permuted(&AC); + } + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/zgsitrf.c b/src/maths/SuperLU/zgsitrf.c new file mode 100644 index 000000000..c8cc256ee --- /dev/null +++ b/src/maths/SuperLU/zgsitrf.c @@ -0,0 +1,637 @@ + +/*! @file zgsitrf.c + * \brief Computes an ILU factorization of a general sparse matrix + * + *
+ * -- SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Laboratory.
+ * June 30, 2009
+ *
+ * 
+ */ + +#include + +#ifdef DEBUG +int num_drop_L; +#endif + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * ZGSITRF computes an ILU factorization of a general sparse m-by-n
+ * matrix A using partial pivoting with row interchanges.
+ * The factorization has the form
+ *     Pr * A = L * U
+ * where Pr is a row permutation matrix, L is lower triangular with unit
+ * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper
+ * triangular (upper trapezoidal if A->nrow < A->ncol).
+ *
+ * See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *	   The structure defines the input parameters to control
+ *	   how the ILU decomposition will be performed.
+ *
+ * A	    (input) SuperMatrix*
+ *	    Original matrix A, permuted by columns, of dimension
+ *	    (A->nrow, A->ncol). The type of A can be:
+ *	    Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE.
+ *
+ * relax    (input) int
+ *	    To control degree of relaxing supernodes. If the number
+ *	    of nodes (columns) in a subtree of the elimination tree is less
+ *	    than relax, this subtree is considered as one supernode,
+ *	    regardless of the row structures of those columns.
+ *
+ * panel_size (input) int
+ *	    A panel consists of at most panel_size consecutive columns.
+ *
+ * etree    (input) int*, dimension (A->ncol)
+ *	    Elimination tree of A'*A.
+ *	    Note: etree is a vector of parent pointers for a forest whose
+ *	    vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *	    On input, the columns of A should be permuted so that the
+ *	    etree is in a certain postorder.
+ *
+ * work     (input/output) void*, size (lwork) (in bytes)
+ *	    User-supplied work space and space for the output data structures.
+ *	    Not referenced if lwork = 0;
+ *
+ * lwork   (input) int
+ *	   Specifies the size of work array in bytes.
+ *	   = 0:  allocate space internally by system malloc;
+ *	   > 0:  use user-supplied work array of length lwork in bytes,
+ *		 returns error if space runs out.
+ *	   = -1: the routine guesses the amount of space needed without
+ *		 performing the factorization, and returns it in
+ *		 *info; no other side effects.
+ *
+ * perm_c   (input) int*, dimension (A->ncol)
+ *	    Column permutation vector, which defines the
+ *	    permutation matrix Pc; perm_c[i] = j means column i of A is
+ *	    in position j in A*Pc.
+ *	    When searching for diagonal, perm_c[*] is applied to the
+ *	    row subscripts of A, so that diagonal threshold pivoting
+ *	    can find the diagonal of A, rather than that of A*Pc.
+ *
+ * perm_r   (input/output) int*, dimension (A->nrow)
+ *	    Row permutation vector which defines the permutation matrix Pr,
+ *	    perm_r[i] = j means row i of A is in position j in Pr*A.
+ *	    If options->Fact = SamePattern_SameRowPerm, the pivoting routine
+ *	       will try to use the input perm_r, unless a certain threshold
+ *	       criterion is violated. In that case, perm_r is overwritten by
+ *	       a new permutation determined by partial pivoting or diagonal
+ *	       threshold pivoting.
+ *	    Otherwise, perm_r is output argument;
+ *
+ * L	    (output) SuperMatrix*
+ *	    The factor L from the factorization Pr*A=L*U; use compressed row
+ *	    subscripts storage for supernodes, i.e., L has type:
+ *	    Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
+ *
+ * U	    (output) SuperMatrix*
+ *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
+ *	    storage scheme, i.e., U has types: Stype = SLU_NC,
+ *	    Dtype = SLU_Z, Mtype = SLU_TRU.
+ *
+ * stat     (output) SuperLUStat_t*
+ *	    Record the statistics on runtime and floating-point operation count.
+ *	    See slu_util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info     (output) int*
+ *	    = 0: successful exit
+ *	    < 0: if info = -i, the i-th argument had an illegal value
+ *	    > 0: if info = i, and i is
+ *	       <= A->ncol: number of zero pivots. They are replaced by small
+ *		  entries according to options->ILU_FillTol.
+ *	       > A->ncol: number of bytes allocated when memory allocation
+ *		  failure occurred, plus A->ncol. If lwork = -1, it is
+ *		  the estimated amount of space needed, plus A->ncol.
+ *
+ * ======================================================================
+ *
+ * Local Working Arrays:
+ * ======================
+ *   m = number of rows in the matrix
+ *   n = number of columns in the matrix
+ *
+ *   marker[0:3*m-1]: marker[i] = j means that node i has been
+ *	reached when working on column j.
+ *	Storage: relative to original row subscripts
+ *	NOTE: There are 4 of them:
+ *	      marker/marker1 are used for panel dfs, see (ilu_)dpanel_dfs.c;
+ *	      marker2 is used for inner-factorization, see (ilu)_dcolumn_dfs.c;
+ *	      marker_relax(has its own space) is used for relaxed supernodes.
+ *
+ *   parent[0:m-1]: parent vector used during dfs
+ *	Storage: relative to new row subscripts
+ *
+ *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs)
+ *	unexplored neighbor of i in lsub[*]
+ *
+ *   segrep[0:nseg-1]: contains the list of supernodal representatives
+ *	in topological order of the dfs. A supernode representative is the
+ *	last column of a supernode.
+ *	The maximum size of segrep[] is n.
+ *
+ *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a
+ *	supernodal representative r, repfnz[r] is the location of the first
+ *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
+ *	indicates the supernode r has been explored.
+ *	NOTE: There are W of them, each used for one column of a panel.
+ *
+ *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below
+ *	the panel diagonal. These are filled in during dpanel_dfs(), and are
+ *	used later in the inner LU factorization within the panel.
+ *	panel_lsub[]/dense[] pair forms the SPA data structure.
+ *	NOTE: There are W of them.
+ *
+ *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
+ *		   NOTE: there are W of them.
+ *
+ *   tempv[0:*]: real temporary used for dense numeric kernels;
+ *	The size of this array is defined by NUM_TEMPV() in slu_util.h.
+ *	It is also used by the dropping routine ilu_ddrop_row().
+ * 
+ */ + +void +zgsitrf(superlu_options_t *options, SuperMatrix *A, int relax, int panel_size, + int *etree, void *work, int lwork, int *perm_c, int *perm_r, + SuperMatrix *L, SuperMatrix *U, SuperLUStat_t *stat, int *info) +{ + /* Local working arrays */ + NCPformat *Astore; + int *iperm_r = NULL; /* inverse of perm_r; used when + options->Fact == SamePattern_SameRowPerm */ + int *iperm_c; /* inverse of perm_c */ + int *swap, *iswap; /* swap is used to store the row permutation + during the factorization. Initially, it is set + to iperm_c (row indeces of Pc*A*Pc'). + iswap is the inverse of swap. After the + factorization, it is equal to perm_r. */ + int *iwork; + doublecomplex *zwork; + int *segrep, *repfnz, *parent, *xplore; + int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ + int *marker, *marker_relax; + doublecomplex *dense, *tempv; + double *dtempv; + int *relax_end, *relax_fsupc; + doublecomplex *a; + int *asub; + int *xa_begin, *xa_end; + int *xsup, *supno; + int *xlsub, *xlusup, *xusub; + int nzlumax; + double *amax; + doublecomplex drop_sum; + double alpha, omega; /* used in MILU, mimicing DRIC */ + static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ + double *dwork2; /* used by the second dropping rule */ + + /* Local scalars */ + fact_t fact = options->Fact; + double diag_pivot_thresh = options->DiagPivotThresh; + double drop_tol = options->ILU_DropTol; /* tau */ + double fill_ini = options->ILU_FillTol; /* tau^hat */ + double gamma = options->ILU_FillFactor; + int drop_rule = options->ILU_DropRule; + milu_t milu = options->ILU_MILU; + double fill_tol; + int pivrow; /* pivotal row number in the original matrix A */ + int nseg1; /* no of segments in U-column above panel row jcol */ + int nseg; /* no of segments in each U-column */ + register int jcol; + register int kcol; /* end column of a relaxed snode */ + register int icol; + register int i, k, jj, new_next, iinfo; + int m, n, min_mn, jsupno, fsupc, nextlu, nextu; + int w_def; /* upper bound on panel width */ + int usepr, iperm_r_allocated = 0; + int nnzL, nnzU; + int *panel_histo = stat->panel_histo; + flops_t *ops = stat->ops; + + int last_drop;/* the last column which the dropping rules applied */ + int quota; + int nnzAj; /* number of nonzeros in A(:,1:j) */ + int nnzLj, nnzUj; + double tol_L = drop_tol, tol_U = drop_tol; + doublecomplex zero = {0.0, 0.0}; + double one = 1.0; + + /* Executable */ + iinfo = 0; + m = A->nrow; + n = A->ncol; + min_mn = SUPERLU_MIN(m, n); + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + + /* Allocate storage common to the factor routines */ + *info = zLUMemInit(fact, work, lwork, m, n, Astore->nnz, panel_size, + gamma, L, U, &Glu, &iwork, &zwork); + if ( *info ) return; + + xsup = Glu.xsup; + supno = Glu.supno; + xlsub = Glu.xlsub; + xlusup = Glu.xlusup; + xusub = Glu.xusub; + + SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, + &repfnz, &panel_lsub, &marker_relax, &marker); + zSetRWork(m, panel_size, zwork, &dense, &tempv); + + usepr = (fact == SamePattern_SameRowPerm); + if ( usepr ) { + /* Compute the inverse of perm_r */ + iperm_r = (int *) intMalloc(m); + for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; + iperm_r_allocated = 1; + } + + iperm_c = (int *) intMalloc(n); + for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; + swap = (int *)intMalloc(n); + for (k = 0; k < n; k++) swap[k] = iperm_c[k]; + iswap = (int *)intMalloc(n); + for (k = 0; k < n; k++) iswap[k] = perm_c[k]; + amax = (double *) doubleMalloc(panel_size); + if (drop_rule & DROP_SECONDARY) + dwork2 = (double *)doubleMalloc(n); + else + dwork2 = NULL; + + nnzAj = 0; + nnzLj = 0; + nnzUj = 0; + last_drop = SUPERLU_MAX(min_mn - 2 * sp_ienv(7), (int)(min_mn * 0.95)); + alpha = pow((double)n, -1.0 / options->ILU_MILU_Dim); + + /* Identify relaxed snodes */ + relax_end = (int *) intMalloc(n); + relax_fsupc = (int *) intMalloc(n); + if ( options->SymmetricMode == YES_SuperLU ) + ilu_heap_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); + else + ilu_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); + + ifill (perm_r, m, EMPTY); + ifill (marker, m * NO_MARKER, EMPTY); + supno[0] = -1; + xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; + w_def = panel_size; + + /* Mark the rows used by relaxed supernodes */ + ifill (marker_relax, m, EMPTY); + i = mark_relax(m, relax_end, relax_fsupc, xa_begin, xa_end, + asub, marker_relax); +#if ( PRNTlevel >= 1) + printf("%d relaxed supernodes.\n", i); +#endif + + /* + * Work on one "panel" at a time. A panel is one of the following: + * (a) a relaxed supernode at the bottom of the etree, or + * (b) panel_size contiguous columns, defined by the user + */ + for (jcol = 0; jcol < min_mn; ) { + + if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ + kcol = relax_end[jcol]; /* end of the relaxed snode */ + panel_histo[kcol-jcol+1]++; + + /* Drop small rows in the previous supernode. */ + if (jcol > 0 && jcol < last_drop) { + int first = xsup[supno[jcol - 1]]; + int last = jcol - 1; + int quota; + + /* Compute the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * (m - first) / m + * (last - first + 1); + else if (drop_rule & DROP_COLUMN) { + int i; + quota = 0; + for (i = first; i <= last; i++) + quota += xa_end[i] - xa_begin[i]; + quota = gamma * quota * (m - first) / m; + } else if (drop_rule & DROP_AREA) + quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + - nnzLj; + else + quota = m * n; + fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / min_mn); + + /* Drop small rows */ + dtempv = (double *) tempv; + i = ilu_zdrop_row(options, first, last, tol_L, quota, &nnzLj, + &fill_tol, &Glu, dtempv, dwork2, 0); + /* Reset the parameters */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + < nnzLj) + tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); + else + tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); + } + if (fill_tol < 0) iinfo -= (int)fill_tol; +#ifdef DEBUG + num_drop_L += i * (last - first + 1); +#endif + } + + /* -------------------------------------- + * Factorize the relaxed supernode(jcol:kcol) + * -------------------------------------- */ + /* Determine the union of the row structure of the snode */ + if ( (*info = ilu_zsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, + marker, &Glu)) != 0 ) + return; + + nextu = xusub[jcol]; + nextlu = xlusup[jcol]; + jsupno = supno[jcol]; + fsupc = xsup[jsupno]; + new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); + nzlumax = Glu.nzlumax; + while ( new_next > nzlumax ) { + if ((*info = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu))) + return; + } + + for (icol = jcol; icol <= kcol; icol++) { + xusub[icol+1] = nextu; + + amax[0] = 0.0; + /* Scatter into SPA dense[*] */ + for (k = xa_begin[icol]; k < xa_end[icol]; k++) { + register double tmp = z_abs1 (&a[k]); + if (tmp > amax[0]) amax[0] = tmp; + dense[asub[k]] = a[k]; + } + nnzAj += xa_end[icol] - xa_begin[icol]; + if (amax[0] == 0.0) { + amax[0] = fill_ini; +#if ( PRNTlevel >= 1) + printf("Column %d is entirely zero!\n", icol); + fflush(stdout); +#endif + } + + /* Numeric update within the snode */ + zsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); + + if (usepr) pivrow = iperm_r[icol]; + fill_tol = pow(fill_ini, 1.0 - (double)icol / (double)min_mn); + if ( (*info = ilu_zpivotL(icol, diag_pivot_thresh, &usepr, + perm_r, iperm_c[icol], swap, iswap, + marker_relax, &pivrow, + amax[0] * fill_tol, milu, zero, + &Glu, stat)) ) { + iinfo++; + marker[pivrow] = kcol; + } + + } + + jcol = kcol + 1; + + } else { /* Work on one panel of panel_size columns */ + + /* Adjust panel_size so that a panel won't overlap with the next + * relaxed snode. + */ + panel_size = w_def; + for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) + if ( relax_end[k] != EMPTY ) { + panel_size = k - jcol; + break; + } + if ( k == min_mn ) panel_size = min_mn - jcol; + panel_histo[panel_size]++; + + /* symbolic factor on a panel of columns */ + ilu_zpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, + dense, amax, panel_lsub, segrep, repfnz, + marker, parent, xplore, &Glu); + + /* numeric sup-panel updates in topological order */ + zpanel_bmod(m, panel_size, jcol, nseg1, dense, + tempv, segrep, repfnz, &Glu, stat); + + /* Sparse LU within the panel, and below panel diagonal */ + for (jj = jcol; jj < jcol + panel_size; jj++) { + + k = (jj - jcol) * m; /* column index for w-wide arrays */ + + nseg = nseg1; /* Begin after all the panel segments */ + + nnzAj += xa_end[jj] - xa_begin[jj]; + + if ((*info = ilu_zcolumn_dfs(m, jj, perm_r, &nseg, + &panel_lsub[k], segrep, &repfnz[k], + marker, parent, xplore, &Glu))) + return; + + /* Numeric updates */ + if ((*info = zcolumn_bmod(jj, (nseg - nseg1), &dense[k], + tempv, &segrep[nseg1], &repfnz[k], + jcol, &Glu, stat)) != 0) return; + + /* Make a fill-in position if the column is entirely zero */ + if (xlsub[jj + 1] == xlsub[jj]) { + register int i, row; + int nextl; + int nzlmax = Glu.nzlmax; + int *lsub = Glu.lsub; + int *marker2 = marker + 2 * m; + + /* Allocate memory */ + nextl = xlsub[jj] + 1; + if (nextl >= nzlmax) { + int error = zLUMemXpand(jj, nextl, LSUB, &nzlmax, &Glu); + if (error) { *info = error; return; } + lsub = Glu.lsub; + } + xlsub[jj + 1]++; + assert(xlusup[jj]==xlusup[jj+1]); + xlusup[jj + 1]++; + Glu.lusup[xlusup[jj]] = zero; + + /* Choose a row index (pivrow) for fill-in */ + for (i = jj; i < n; i++) + if (marker_relax[swap[i]] <= jj) break; + row = swap[i]; + marker2[row] = jj; + lsub[xlsub[jj]] = row; +#ifdef DEBUG + printf("Fill col %d.\n", jj); + fflush(stdout); +#endif + } + + /* Computer the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * jj / m; + else if (drop_rule & DROP_COLUMN) + quota = gamma * (xa_end[jj] - xa_begin[jj]) * + (jj + 1) / m; + else if (drop_rule & DROP_AREA) + quota = gamma * 0.9 * nnzAj * 0.5 - nnzUj; + else + quota = m; + + /* Copy the U-segments to ucol[*] and drop small entries */ + if ((*info = ilu_zcopy_to_ucol(jj, nseg, segrep, &repfnz[k], + perm_r, &dense[k], drop_rule, + milu, amax[jj - jcol] * tol_U, + quota, &drop_sum, &nnzUj, &Glu, + dwork2)) != 0) + return; + + /* Reset the dropping threshold if required */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * 0.9 * nnzAj * 0.5 < nnzLj) + tol_U = SUPERLU_MIN(1.0, tol_U * 2.0); + else + tol_U = SUPERLU_MAX(drop_tol, tol_U * 0.5); + } + + if (drop_sum.r != 0.0 && drop_sum.i != 0.0) + { + omega = SUPERLU_MIN(2.0*(1.0-alpha)/z_abs1(&drop_sum), 1.0); + zd_mult(&drop_sum, &drop_sum, omega); + } + if (usepr) pivrow = iperm_r[jj]; + fill_tol = pow(fill_ini, 1.0 - (double)jj / (double)min_mn); + if ( (*info = ilu_zpivotL(jj, diag_pivot_thresh, &usepr, perm_r, + iperm_c[jj], swap, iswap, + marker_relax, &pivrow, + amax[jj - jcol] * fill_tol, milu, + drop_sum, &Glu, stat)) ) { + iinfo++; + marker[m + pivrow] = jj; + marker[2 * m + pivrow] = jj; + } + + /* Reset repfnz[] for this column */ + resetrep_col (nseg, segrep, &repfnz[k]); + + /* Start a new supernode, drop the previous one */ + if (jj > 0 && supno[jj] > supno[jj - 1] && jj < last_drop) { + int first = xsup[supno[jj - 1]]; + int last = jj - 1; + int quota; + + /* Compute the quota */ + if (drop_rule & DROP_PROWS) + quota = gamma * Astore->nnz / m * (m - first) / m + * (last - first + 1); + else if (drop_rule & DROP_COLUMN) { + int i; + quota = 0; + for (i = first; i <= last; i++) + quota += xa_end[i] - xa_begin[i]; + quota = gamma * quota * (m - first) / m; + } else if (drop_rule & DROP_AREA) + quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) + / m) - nnzLj; + else + quota = m * n; + fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / + (double)min_mn); + + /* Drop small rows */ + dtempv = (double *) tempv; + i = ilu_zdrop_row(options, first, last, tol_L, quota, + &nnzLj, &fill_tol, &Glu, dtempv, dwork2, + 1); + + /* Reset the parameters */ + if (drop_rule & DROP_DYNAMIC) { + if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) + < nnzLj) + tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); + else + tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); + } + if (fill_tol < 0) iinfo -= (int)fill_tol; +#ifdef DEBUG + num_drop_L += i * (last - first + 1); +#endif + } /* if start a new supernode */ + + } /* for */ + + jcol += panel_size; /* Move to the next panel */ + + } /* else */ + + } /* for */ + + *info = iinfo; + + if ( m > n ) { + k = 0; + for (i = 0; i < m; ++i) + if ( perm_r[i] == EMPTY ) { + perm_r[i] = n + k; + ++k; + } + } + + ilu_countnz(min_mn, &nnzL, &nnzU, &Glu); + fixupL(min_mn, perm_r, &Glu); + + zLUWorkFree(iwork, zwork, &Glu); /* Free work space and compress storage */ + + if ( fact == SamePattern_SameRowPerm ) { + /* L and U structures may have changed due to possibly different + pivoting, even though the storage is available. + There could also be memory expansions, so the array locations + may have changed, */ + ((SCformat *)L->Store)->nnz = nnzL; + ((SCformat *)L->Store)->nsuper = Glu.supno[n]; + ((SCformat *)L->Store)->nzval = Glu.lusup; + ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; + ((SCformat *)L->Store)->rowind = Glu.lsub; + ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; + ((NCformat *)U->Store)->nnz = nnzU; + ((NCformat *)U->Store)->nzval = Glu.ucol; + ((NCformat *)U->Store)->rowind = Glu.usub; + ((NCformat *)U->Store)->colptr = Glu.xusub; + } else { + zCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, + Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, + Glu.xsup, SLU_SC, SLU_Z, SLU_TRLU); + zCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, + Glu.usub, Glu.xusub, SLU_NC, SLU_Z, SLU_TRU); + } + + ops[FACT] += ops[TRSV] + ops[GEMV]; + stat->expansions = --(Glu.num_expansions); + + if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); + SUPERLU_FREE (iperm_c); + SUPERLU_FREE (relax_end); + SUPERLU_FREE (swap); + SUPERLU_FREE (iswap); + SUPERLU_FREE (relax_fsupc); + SUPERLU_FREE (amax); + if ( dwork2 ) SUPERLU_FREE (dwork2); + +} diff --git a/src/maths/SuperLU/zgsrfs.c b/src/maths/SuperLU/zgsrfs.c new file mode 100644 index 000000000..99b3af680 --- /dev/null +++ b/src/maths/SuperLU/zgsrfs.c @@ -0,0 +1,460 @@ + +/*! @file zgsrfs.c + * \brief Improves computed solution to a system of inear equations + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Modified from lapack routine ZGERFS
+ * 
+ */ +/* + * File name: zgsrfs.c + * History: Modified from lapack routine ZGERFS + */ +#include +#include + +/*! \brief + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   ZGSRFS improves the computed solution to a system of linear   
+ *   equations and provides error bounds and backward error estimates for 
+ *   the solution.   
+ *
+ *   If equilibration was performed, the system becomes:
+ *           (diag(R)*A_original*diag(C)) * X = diag(R)*B_original.
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ * trans   (input) trans_t
+ *          Specifies the form of the system of equations:
+ *          = NOTRANS: A * X = B  (No transpose)
+ *          = TRANS:   A'* X = B  (Transpose)
+ *          = CONJ:    A**H * X = B  (Conjugate transpose)
+ *   
+ *   A       (input) SuperMatrix*
+ *           The original matrix A in the system, or the scaled A if
+ *           equilibration was done. The type of A can be:
+ *           Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_GE.
+ *    
+ *   L       (input) SuperMatrix*
+ *	     The factor L from the factorization Pr*A*Pc=L*U. Use
+ *           compressed row subscripts storage for supernodes, 
+ *           i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
+ * 
+ *   U       (input) SuperMatrix*
+ *           The factor U from the factorization Pr*A*Pc=L*U as computed by
+ *           zgstrf(). Use column-wise storage scheme, 
+ *           i.e., U has types: Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
+ *
+ *   perm_c  (input) int*, dimension (A->ncol)
+ *	     Column permutation vector, which defines the 
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *           in position j in A*Pc.
+ *
+ *   perm_r  (input) int*, dimension (A->nrow)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *
+ *   equed   (input) Specifies the form of equilibration that was done.
+ *           = 'N': No equilibration.
+ *           = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
+ *           = 'C': Column equilibration, i.e., A was postmultiplied by
+ *                  diag(C).
+ *           = 'B': Both row and column equilibration, i.e., A was replaced 
+ *                  by diag(R)*A*diag(C).
+ *
+ *   R       (input) double*, dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If equed = 'R' or 'B', A is premultiplied by diag(R).
+ *           If equed = 'N' or 'C', R is not accessed.
+ * 
+ *   C       (input) double*, dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If equed = 'C' or 'B', A is postmultiplied by diag(C).
+ *           If equed = 'N' or 'R', C is not accessed.
+ *
+ *   B       (input) SuperMatrix*
+ *           B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
+ *           The right hand side matrix B.
+ *           if equed = 'R' or 'B', B is premultiplied by diag(R).
+ *
+ *   X       (input/output) SuperMatrix*
+ *           X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
+ *           On entry, the solution matrix X, as computed by zgstrs().
+ *           On exit, the improved solution matrix X.
+ *           if *equed = 'C' or 'B', X should be premultiplied by diag(C)
+ *               in order to obtain the solution to the original system.
+ *
+ *   FERR    (output) double*, dimension (B->ncol)   
+ *           The estimated forward error bound for each solution vector   
+ *           X(j) (the j-th column of the solution matrix X).   
+ *           If XTRUE is the true solution corresponding to X(j), FERR(j) 
+ *           is an estimated upper bound for the magnitude of the largest 
+ *           element in (X(j) - XTRUE) divided by the magnitude of the   
+ *           largest element in X(j).  The estimate is as reliable as   
+ *           the estimate for RCOND, and is almost always a slight   
+ *           overestimate of the true error.
+ *
+ *   BERR    (output) double*, dimension (B->ncol)   
+ *           The componentwise relative backward error of each solution   
+ *           vector X(j) (i.e., the smallest relative change in   
+ *           any element of A or B that makes X(j) an exact solution).
+ *
+ *   stat     (output) SuperLUStat_t*
+ *            Record the statistics on runtime and floating-point operation count.
+ *            See util.h for the definition of 'SuperLUStat_t'.
+ *
+ *   info    (output) int*   
+ *           = 0:  successful exit   
+ *            < 0:  if INFO = -i, the i-th argument had an illegal value   
+ *
+ *    Internal Parameters   
+ *    ===================   
+ *
+ *    ITMAX is the maximum number of steps of iterative refinement.   
+ *
+ * 
+ */ +void +zgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, + int *perm_c, int *perm_r, char *equed, double *R, double *C, + SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr, + SuperLUStat_t *stat, int *info) +{ + + +#define ITMAX 5 + + /* Table of constant values */ + int ione = 1; + doublecomplex ndone = {-1., 0.}; + doublecomplex done = {1., 0.}; + + /* Local variables */ + NCformat *Astore; + doublecomplex *Aval; + SuperMatrix Bjcol; + DNformat *Bstore, *Xstore, *Bjcol_store; + doublecomplex *Bmat, *Xmat, *Bptr, *Xptr; + int kase; + double safe1, safe2; + int i, j, k, irow, nz, count, notran, rowequ, colequ; + int ldb, ldx, nrhs; + double s, xk, lstres, eps, safmin; + char transc[1]; + trans_t transt; + doublecomplex *work; + double *rwork; + int *iwork; + + extern int zlacon_(int *, doublecomplex *, doublecomplex *, double *, int *); +#ifdef _CRAY + extern int CCOPY(int *, doublecomplex *, int *, doublecomplex *, int *); + extern int CSAXPY(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *); +#else + extern int zcopy_(int *, doublecomplex *, int *, doublecomplex *, int *); + extern int zaxpy_(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *); +#endif + + Astore = A->Store; + Aval = Astore->nzval; + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + + /* Test the input parameters */ + *info = 0; + notran = (trans == NOTRANS); + if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) + *info = -2; + else if ( L->nrow != L->ncol || L->nrow < 0 || + L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU ) + *info = -3; + else if ( U->nrow != U->ncol || U->nrow < 0 || + U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU ) + *info = -4; + else if ( ldb < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) + *info = -10; + else if ( ldx < SUPERLU_MAX(0, A->nrow) || + X->Stype != SLU_DN || X->Dtype != SLU_Z || X->Mtype != SLU_GE ) + *info = -11; + if (*info != 0) { + i = -(*info); + xerbla_("zgsrfs", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || nrhs == 0) { + for (j = 0; j < nrhs; ++j) { + ferr[j] = 0.; + berr[j] = 0.; + } + return; + } + + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + + /* Allocate working space */ + work = doublecomplexMalloc(2*A->nrow); + rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) ); + iwork = intMalloc(A->nrow); + if ( !work || !rwork || !iwork ) + ABORT_SuperLU("Malloc fails for work/rwork/iwork."); + + if ( notran ) { + *(unsigned char *)transc = 'N'; + transt = TRANS; + } else { + *(unsigned char *)transc = 'T'; + transt = NOTRANS; + } + + /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ + nz = A->ncol + 1; + eps = dlamch_("Epsilon"); + safmin = dlamch_("Safe minimum"); + /* Set SAFE1 essentially to be the underflow threshold times the + number of additions in each row. */ + safe1 = nz * safmin; + safe2 = safe1 / eps; + + /* Compute the number of nonzeros in each row (or column) of A */ + for (i = 0; i < A->nrow; ++i) iwork[i] = 0; + if ( notran ) { + for (k = 0; k < A->ncol; ++k) + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + ++iwork[Astore->rowind[i]]; + } else { + for (k = 0; k < A->ncol; ++k) + iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; + } + + /* Copy one column of RHS B into Bjcol. */ + Bjcol.Stype = B->Stype; + Bjcol.Dtype = B->Dtype; + Bjcol.Mtype = B->Mtype; + Bjcol.nrow = B->nrow; + Bjcol.ncol = 1; + Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); + if ( !Bjcol.Store ) ABORT_SuperLU("SUPERLU_MALLOC fails for Bjcol.Store"); + Bjcol_store = Bjcol.Store; + Bjcol_store->lda = ldb; + Bjcol_store->nzval = work; /* address aliasing */ + + /* Do for each right hand side ... */ + for (j = 0; j < nrhs; ++j) { + count = 0; + lstres = 3.; + Bptr = &Bmat[j*ldb]; + Xptr = &Xmat[j*ldx]; + + while (1) { /* Loop until stopping criterion is satisfied. */ + + /* Compute residual R = B - op(A) * X, + where op(A) = A, A**T, or A**H, depending on TRANS. */ + +#ifdef _CRAY + CCOPY(&A->nrow, Bptr, &ione, work, &ione); +#else + zcopy_(&A->nrow, Bptr, &ione, work, &ione); +#endif + sp_zgemv(transc, ndone, A, Xptr, ione, done, work, ione); + + /* Compute componentwise relative backward error from formula + max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) + where abs(Z) is the componentwise absolute value of the matrix + or vector Z. If the i-th component of the denominator is less + than SAFE2, then SAFE1 is added to the i-th component of the + numerator before dividing. */ + + for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] ); + + /* Compute abs(op(A))*abs(X) + abs(B). */ + if (notran) { + for (k = 0; k < A->ncol; ++k) { + xk = z_abs1( &Xptr[k] ); + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk; + } + } else { + for (k = 0; k < A->ncol; ++k) { + s = 0.; + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { + irow = Astore->rowind[i]; + s += z_abs1(&Aval[i]) * z_abs1(&Xptr[irow]); + } + rwork[k] += s; + } + } + s = 0.; + for (i = 0; i < A->nrow; ++i) { + if (rwork[i] > safe2) { + s = SUPERLU_MAX( s, z_abs1(&work[i]) / rwork[i] ); + } else if ( rwork[i] != 0.0 ) { + s = SUPERLU_MAX( s, (z_abs1(&work[i]) + safe1) / rwork[i] ); + } + /* If rwork[i] is exactly 0.0, then we know the true + residual also must be exactly 0.0. */ + } + berr[j] = s; + + /* Test stopping criterion. Continue iterating if + 1) The residual BERR(J) is larger than machine epsilon, and + 2) BERR(J) decreased by at least a factor of 2 during the + last iteration, and + 3) At most ITMAX iterations tried. */ + + if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { + /* Update solution and try again. */ + zgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); + +#ifdef _CRAY + CAXPY(&A->nrow, &done, work, &ione, + &Xmat[j*ldx], &ione); +#else + zaxpy_(&A->nrow, &done, work, &ione, + &Xmat[j*ldx], &ione); +#endif + lstres = berr[j]; + ++count; + } else { + break; + } + + } /* end while */ + + stat->RefineSteps = count; + + /* Bound error from formula: + norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* + ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) + where + norm(Z) is the magnitude of the largest component of Z + inv(op(A)) is the inverse of op(A) + abs(Z) is the componentwise absolute value of the matrix or + vector Z + NZ is the maximum number of nonzeros in any row of A, plus 1 + EPS is machine epsilon + + The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) + is incremented by SAFE1 if the i-th component of + abs(op(A))*abs(X) + abs(B) is less than SAFE2. + + Use ZLACON to estimate the infinity-norm of the matrix + inv(op(A)) * diag(W), + where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ + + for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] ); + + /* Compute abs(op(A))*abs(X) + abs(B). */ + if ( notran ) { + for (k = 0; k < A->ncol; ++k) { + xk = z_abs1( &Xptr[k] ); + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) + rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk; + } + } else { + for (k = 0; k < A->ncol; ++k) { + s = 0.; + for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { + irow = Astore->rowind[i]; + xk = z_abs1( &Xptr[irow] ); + s += z_abs1(&Aval[i]) * xk; + } + rwork[k] += s; + } + } + + for (i = 0; i < A->nrow; ++i) + if (rwork[i] > safe2) + rwork[i] = z_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i]; + else + rwork[i] = z_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; + kase = 0; + + do { + zlacon_(&A->nrow, &work[A->nrow], work, + &ferr[j], &kase); + if (kase == 0) break; + + if (kase == 1) { + /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ + if ( notran && colequ ) + for (i = 0; i < A->ncol; ++i) { + zd_mult(&work[i], &work[i], C[i]); + } + else if ( !notran && rowequ ) + for (i = 0; i < A->nrow; ++i) { + zd_mult(&work[i], &work[i], R[i]); + } + + zgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); + + for (i = 0; i < A->nrow; ++i) { + zd_mult(&work[i], &work[i], rwork[i]); + } + } else { + /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ + for (i = 0; i < A->nrow; ++i) { + zd_mult(&work[i], &work[i], rwork[i]); + } + + zgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); + + if ( notran && colequ ) + for (i = 0; i < A->ncol; ++i) { + zd_mult(&work[i], &work[i], C[i]); + } + else if ( !notran && rowequ ) + for (i = 0; i < A->ncol; ++i) { + zd_mult(&work[i], &work[i], R[i]); + } + } + + } while ( kase != 0 ); + + /* Normalize error. */ + lstres = 0.; + if ( notran && colequ ) { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, C[i] * z_abs1( &Xptr[i]) ); + } else if ( !notran && rowequ ) { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, R[i] * z_abs1( &Xptr[i]) ); + } else { + for (i = 0; i < A->nrow; ++i) + lstres = SUPERLU_MAX( lstres, z_abs1( &Xptr[i]) ); + } + if ( lstres != 0. ) + ferr[j] /= lstres; + + } /* for each RHS j ... */ + + SUPERLU_FREE(work); + SUPERLU_FREE(rwork); + SUPERLU_FREE(iwork); + SUPERLU_FREE(Bjcol.Store); + + return; + +} /* zgsrfs */ diff --git a/src/maths/SuperLU/zgssv.c b/src/maths/SuperLU/zgssv.c new file mode 100644 index 000000000..b99b3d3d9 --- /dev/null +++ b/src/maths/SuperLU/zgssv.c @@ -0,0 +1,227 @@ + +/*! @file zgssv.c + * \brief Solves the system of linear equations A*X=B + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ * 
+ */ +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * ZGSSV solves the system of linear equations A*X=B, using the
+ * LU factorization from ZGSTRF. It performs the following steps:
+ *
+ *   1. If A is stored column-wise (A->Stype = SLU_NC):
+ *
+ *      1.1. Permute the columns of A, forming A*Pc, where Pc
+ *           is a permutation matrix. For more details of this step, 
+ *           see sp_preorder.c.
+ *
+ *      1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined
+ *           by Gaussian elimination with partial pivoting.
+ *           L is unit lower triangular with offdiagonal entries
+ *           bounded by 1 in magnitude, and U is upper triangular.
+ *
+ *      1.3. Solve the system of equations A*X=B using the factored
+ *           form of A.
+ *
+ *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the
+ *      above algorithm to the transpose of A:
+ *
+ *      2.1. Permute columns of transpose(A) (rows of A),
+ *           forming transpose(A)*Pc, where Pc is a permutation matrix. 
+ *           For more details of this step, see sp_preorder.c.
+ *
+ *      2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr
+ *           determined by Gaussian elimination with partial pivoting.
+ *           L is unit lower triangular with offdiagonal entries
+ *           bounded by 1 in magnitude, and U is upper triangular.
+ *
+ *      2.3. Solve the system of equations A*X=B using the factored
+ *           form of A.
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ * 
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed and how the
+ *         system will be solved.
+ *
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of linear equations is A->nrow. Currently, the type of A can be:
+ *         Stype = SLU_NC or SLU_NR; Dtype = SLU_Z; Mtype = SLU_GE.
+ *         In the future, more general A may be handled.
+ *
+ * perm_c  (input/output) int*
+ *         If A->Stype = SLU_NC, column permutation vector of size A->ncol
+ *         which defines the permutation matrix Pc; perm_c[i] = j means 
+ *         column i of A is in position j in A*Pc.
+ *         If A->Stype = SLU_NR, column permutation vector of size A->nrow
+ *         which describes permutation of columns of transpose(A) 
+ *         (rows of A) as described above.
+ * 
+ *         If options->ColPerm = MY_PERMC or options->Fact = SamePattern or
+ *            options->Fact = SamePattern_SameRowPerm, it is an input argument.
+ *            On exit, perm_c may be overwritten by the product of the input
+ *            perm_c and a permutation that postorders the elimination tree
+ *            of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
+ *            is already in postorder.
+ *         Otherwise, it is an output argument.
+ * 
+ * perm_r  (input/output) int*
+ *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
+ *         which defines the permutation matrix Pr, and is determined 
+ *         by partial pivoting.  perm_r[i] = j means row i of A is in 
+ *         position j in Pr*A.
+ *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
+ *         determines permutation of rows of transpose(A)
+ *         (columns of A) as described above.
+ *
+ *         If options->RowPerm = MY_PERMR or
+ *            options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *            input argument.
+ *         otherwise it is an output argument.
+ *
+ * L       (output) SuperMatrix*
+ *         The factor L from the factorization 
+ *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
+ *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
+ *         Uses compressed row subscripts storage for supernodes, i.e.,
+ *         L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
+ *         
+ * U       (output) SuperMatrix*
+ *	   The factor U from the factorization 
+ *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
+ *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
+ *         Uses column-wise storage scheme, i.e., U has types:
+ *         Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
+ *
+ * B       (input/output) SuperMatrix*
+ *         B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
+ *         On entry, the right hand side matrix.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *	   = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ * 
+ */ + +void +zgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, + SuperLUStat_t *stat, int *info ) +{ + + DNformat *Bstore; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int lwork = 0, *etree, i; + + /* Set default values for some parameters */ + int panel_size; /* panel size */ + int relax; /* no of columns in a relaxed snodes */ + int permc_spec; + trans_t trans = NOTRANS; + double *utime; + double t; /* Temporary time */ + + /* Test the input parameters ... */ + *info = 0; + Bstore = B->Store; + if ( options->Fact != DOFACT ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_Z || A->Mtype != SLU_GE ) + *info = -2; + else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) + *info = -7; + if ( *info != 0 ) { + i = -(*info); + xerbla_("zgssv", &i); + return; + } + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + trans = TRANS; + } else { + if ( A->Stype == SLU_NC ) AA = A; + } + + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t; + + etree = intMalloc(A->ncol); + + t = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t; + + panel_size = sp_ienv(1); + relax = sp_ienv(2); + + /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", + relax, panel_size, sp_ienv(3), sp_ienv(4));*/ + t = SuperLU_timer_(); + /* Compute the LU factorization of A. */ + zgstrf(options, &AC, relax, panel_size, etree, + NULL, lwork, perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t; + + t = SuperLU_timer_(); + if ( *info == 0 ) { + /* Solve the system A*X=B, overwriting B with X. */ + zgstrs (trans, L, U, perm_c, perm_r, B, stat, info); + } + utime[SOLVE] = SuperLU_timer_() - t; + + SUPERLU_FREE (etree); + Destroy_CompCol_Permuted(&AC); + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/zgssvx.c b/src/maths/SuperLU/zgssvx.c new file mode 100644 index 000000000..ac2924fa3 --- /dev/null +++ b/src/maths/SuperLU/zgssvx.c @@ -0,0 +1,622 @@ + +/*! @file zgssvx.c + * \brief Solves the system of linear equations A*X=B or A'*X=B + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ * 
+ */ +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * ZGSSVX solves the system of linear equations A*X=B or A'*X=B, using
+ * the LU factorization from zgstrf(). Error bounds on the solution and
+ * a condition estimate are also provided. It performs the following steps:
+ *
+ *   1. If A is stored column-wise (A->Stype = SLU_NC):
+ *  
+ *      1.1. If options->Equil = YES, scaling factors are computed to
+ *           equilibrate the system:
+ *           options->Trans = NOTRANS:
+ *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
+ *           options->Trans = TRANS:
+ *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+ *           options->Trans = CONJ:
+ *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+ *           Whether or not the system will be equilibrated depends on the
+ *           scaling of the matrix A, but if equilibration is used, A is
+ *           overwritten by diag(R)*A*diag(C) and B by diag(R)*B
+ *           (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans
+ *           = TRANS or CONJ).
+ *
+ *      1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
+ *           matrix that usually preserves sparsity.
+ *           For more details of this step, see sp_preorder.c.
+ *
+ *      1.3. If options->Fact != FACTORED, the LU decomposition is used to
+ *           factor the matrix A (after equilibration if options->Equil = YES)
+ *           as Pr*A*Pc = L*U, with Pr determined by partial pivoting.
+ *
+ *      1.4. Compute the reciprocal pivot growth factor.
+ *
+ *      1.5. If some U(i,i) = 0, so that U is exactly singular, then the
+ *           routine returns with info = i. Otherwise, the factored form of 
+ *           A is used to estimate the condition number of the matrix A. If
+ *           the reciprocal of the condition number is less than machine
+ *           precision, info = A->ncol+1 is returned as a warning, but the
+ *           routine still goes on to solve for X and computes error bounds
+ *           as described below.
+ *
+ *      1.6. The system of equations is solved for X using the factored form
+ *           of A.
+ *
+ *      1.7. If options->IterRefine != NOREFINE, iterative refinement is
+ *           applied to improve the computed solution matrix and calculate
+ *           error bounds and backward error estimates for it.
+ *
+ *      1.8. If equilibration was used, the matrix X is premultiplied by
+ *           diag(C) (if options->Trans = NOTRANS) or diag(R)
+ *           (if options->Trans = TRANS or CONJ) so that it solves the
+ *           original system before equilibration.
+ *
+ *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
+ *      to the transpose of A:
+ *
+ *      2.1. If options->Equil = YES, scaling factors are computed to
+ *           equilibrate the system:
+ *           options->Trans = NOTRANS:
+ *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
+ *           options->Trans = TRANS:
+ *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+ *           options->Trans = CONJ:
+ *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+ *           Whether or not the system will be equilibrated depends on the
+ *           scaling of the matrix A, but if equilibration is used, A' is
+ *           overwritten by diag(R)*A'*diag(C) and B by diag(R)*B 
+ *           (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
+ *
+ *      2.2. Permute columns of transpose(A) (rows of A), 
+ *           forming transpose(A)*Pc, where Pc is a permutation matrix that 
+ *           usually preserves sparsity.
+ *           For more details of this step, see sp_preorder.c.
+ *
+ *      2.3. If options->Fact != FACTORED, the LU decomposition is used to
+ *           factor the transpose(A) (after equilibration if 
+ *           options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the
+ *           permutation Pr determined by partial pivoting.
+ *
+ *      2.4. Compute the reciprocal pivot growth factor.
+ *
+ *      2.5. If some U(i,i) = 0, so that U is exactly singular, then the
+ *           routine returns with info = i. Otherwise, the factored form 
+ *           of transpose(A) is used to estimate the condition number of the
+ *           matrix A. If the reciprocal of the condition number
+ *           is less than machine precision, info = A->nrow+1 is returned as
+ *           a warning, but the routine still goes on to solve for X and
+ *           computes error bounds as described below.
+ *
+ *      2.6. The system of equations is solved for X using the factored form
+ *           of transpose(A).
+ *
+ *      2.7. If options->IterRefine != NOREFINE, iterative refinement is
+ *           applied to improve the computed solution matrix and calculate
+ *           error bounds and backward error estimates for it.
+ *
+ *      2.8. If equilibration was used, the matrix X is premultiplied by
+ *           diag(C) (if options->Trans = NOTRANS) or diag(R) 
+ *           (if options->Trans = TRANS or CONJ) so that it solves the
+ *           original system before equilibration.
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed and how the
+ *         system will be solved.
+ *
+ * A       (input/output) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of the linear equations is A->nrow. Currently, the type of A can be:
+ *         Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE.
+ *         In the future, more general A may be handled.
+ *
+ *         On entry, If options->Fact = FACTORED and equed is not 'N', 
+ *         then A must have been equilibrated by the scaling factors in
+ *         R and/or C.  
+ *         On exit, A is not modified if options->Equil = NO, or if 
+ *         options->Equil = YES but equed = 'N' on exit.
+ *         Otherwise, if options->Equil = YES and equed is not 'N',
+ *         A is scaled as follows:
+ *         If A->Stype = SLU_NC:
+ *           equed = 'R':  A := diag(R) * A
+ *           equed = 'C':  A := A * diag(C)
+ *           equed = 'B':  A := diag(R) * A * diag(C).
+ *         If A->Stype = SLU_NR:
+ *           equed = 'R':  transpose(A) := diag(R) * transpose(A)
+ *           equed = 'C':  transpose(A) := transpose(A) * diag(C)
+ *           equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
+ *
+ * perm_c  (input/output) int*
+ *	   If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
+ *         which defines the permutation matrix Pc; perm_c[i] = j means
+ *         column i of A is in position j in A*Pc.
+ *         On exit, perm_c may be overwritten by the product of the input
+ *         perm_c and a permutation that postorders the elimination tree
+ *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
+ *         is already in postorder.
+ *
+ *         If A->Stype = SLU_NR, column permutation vector of size A->nrow,
+ *         which describes permutation of columns of transpose(A) 
+ *         (rows of A) as described above.
+ * 
+ * perm_r  (input/output) int*
+ *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
+ *         which defines the permutation matrix Pr, and is determined
+ *         by partial pivoting.  perm_r[i] = j means row i of A is in 
+ *         position j in Pr*A.
+ *
+ *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
+ *         determines permutation of rows of transpose(A)
+ *         (columns of A) as described above.
+ *
+ *         If options->Fact = SamePattern_SameRowPerm, the pivoting routine
+ *         will try to use the input perm_r, unless a certain threshold
+ *         criterion is violated. In that case, perm_r is overwritten by a
+ *         new permutation determined by partial pivoting or diagonal
+ *         threshold pivoting.
+ *         Otherwise, perm_r is output argument.
+ * 
+ * etree   (input/output) int*,  dimension (A->ncol)
+ *         Elimination tree of Pc'*A'*A*Pc.
+ *         If options->Fact != FACTORED and options->Fact != DOFACT,
+ *         etree is an input argument, otherwise it is an output argument.
+ *         Note: etree is a vector of parent pointers for a forest whose
+ *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ * equed   (input/output) char*
+ *         Specifies the form of equilibration that was done.
+ *         = 'N': No equilibration.
+ *         = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
+ *         = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
+ *         = 'B': Both row and column equilibration, i.e., A was replaced 
+ *                by diag(R)*A*diag(C).
+ *         If options->Fact = FACTORED, equed is an input argument,
+ *         otherwise it is an output argument.
+ *
+ * R       (input/output) double*, dimension (A->nrow)
+ *         The row scale factors for A or transpose(A).
+ *         If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
+ *             (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
+ *         If equed = 'N' or 'C', R is not accessed.
+ *         If options->Fact = FACTORED, R is an input argument,
+ *             otherwise, R is output.
+ *         If options->zFact = FACTORED and equed = 'R' or 'B', each element
+ *             of R must be positive.
+ * 
+ * C       (input/output) double*, dimension (A->ncol)
+ *         The column scale factors for A or transpose(A).
+ *         If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
+ *             (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
+ *         If equed = 'N' or 'R', C is not accessed.
+ *         If options->Fact = FACTORED, C is an input argument,
+ *             otherwise, C is output.
+ *         If options->Fact = FACTORED and equed = 'C' or 'B', each element
+ *             of C must be positive.
+ *         
+ * L       (output) SuperMatrix*
+ *	   The factor L from the factorization
+ *             Pr*A*Pc=L*U              (if A->Stype SLU_= NC) or
+ *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
+ *         Uses compressed row subscripts storage for supernodes, i.e.,
+ *         L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
+ *
+ * U       (output) SuperMatrix*
+ *	   The factor U from the factorization
+ *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
+ *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
+ *         Uses column-wise storage scheme, i.e., U has types:
+ *         Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
+ *
+ * work    (workspace/output) void*, size (lwork) (in bytes)
+ *         User supplied workspace, should be large enough
+ *         to hold data structures for factors L and U.
+ *         On exit, if fact is not 'F', L and U point to this array.
+ *
+ * lwork   (input) int
+ *         Specifies the size of work array in bytes.
+ *         = 0:  allocate space internally by system malloc;
+ *         > 0:  use user-supplied work array of length lwork in bytes,
+ *               returns error if space runs out.
+ *         = -1: the routine guesses the amount of space needed without
+ *               performing the factorization, and returns it in
+ *               mem_usage->total_needed; no other side effects.
+ *
+ *         See argument 'mem_usage' for memory usage statistics.
+ *
+ * B       (input/output) SuperMatrix*
+ *         B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
+ *         On entry, the right hand side matrix.
+ *         If B->ncol = 0, only LU decomposition is performed, the triangular
+ *                         solve is skipped.
+ *         On exit,
+ *            if equed = 'N', B is not modified; otherwise
+ *            if A->Stype = SLU_NC:
+ *               if options->Trans = NOTRANS and equed = 'R' or 'B',
+ *                  B is overwritten by diag(R)*B;
+ *               if options->Trans = TRANS or CONJ and equed = 'C' of 'B',
+ *                  B is overwritten by diag(C)*B;
+ *            if A->Stype = SLU_NR:
+ *               if options->Trans = NOTRANS and equed = 'C' or 'B',
+ *                  B is overwritten by diag(C)*B;
+ *               if options->Trans = TRANS or CONJ and equed = 'R' of 'B',
+ *                  B is overwritten by diag(R)*B.
+ *
+ * X       (output) SuperMatrix*
+ *         X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. 
+ *         If info = 0 or info = A->ncol+1, X contains the solution matrix
+ *         to the original system of equations. Note that A and B are modified
+ *         on exit if equed is not 'N', and the solution to the equilibrated
+ *         system is inv(diag(C))*X if options->Trans = NOTRANS and
+ *         equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C'
+ *         and equed = 'R' or 'B'.
+ *
+ * recip_pivot_growth (output) double*
+ *         The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
+ *         The infinity norm is used. If recip_pivot_growth is much less
+ *         than 1, the stability of the LU factorization could be poor.
+ *
+ * rcond   (output) double*
+ *         The estimate of the reciprocal condition number of the matrix A
+ *         after equilibration (if done). If rcond is less than the machine
+ *         precision (in particular, if rcond = 0), the matrix is singular
+ *         to working precision. This condition is indicated by a return
+ *         code of info > 0.
+ *
+ * FERR    (output) double*, dimension (B->ncol)   
+ *         The estimated forward error bound for each solution vector   
+ *         X(j) (the j-th column of the solution matrix X).   
+ *         If XTRUE is the true solution corresponding to X(j), FERR(j) 
+ *         is an estimated upper bound for the magnitude of the largest 
+ *         element in (X(j) - XTRUE) divided by the magnitude of the   
+ *         largest element in X(j).  The estimate is as reliable as   
+ *         the estimate for RCOND, and is almost always a slight   
+ *         overestimate of the true error.
+ *         If options->IterRefine = NOREFINE, ferr = 1.0.
+ *
+ * BERR    (output) double*, dimension (B->ncol)
+ *         The componentwise relative backward error of each solution   
+ *         vector X(j) (i.e., the smallest relative change in   
+ *         any element of A or B that makes X(j) an exact solution).
+ *         If options->IterRefine = NOREFINE, berr = 1.0.
+ *
+ * mem_usage (output) mem_usage_t*
+ *         Record the memory usage statistics, consisting of following fields:
+ *         - for_lu (float)
+ *           The amount of space used in bytes for L\U data structures.
+ *         - total_needed (float)
+ *           The amount of space needed in bytes to perform factorization.
+ *         - expansions (int)
+ *           The number of memory expansions during the LU factorization.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See slu_util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit   
+ *         < 0: if info = -i, the i-th argument had an illegal value   
+ *         > 0: if info = i, and i is   
+ *              <= A->ncol: U(i,i) is exactly zero. The factorization has   
+ *                    been completed, but the factor U is exactly   
+ *                    singular, so the solution and error bounds   
+ *                    could not be computed.   
+ *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
+ *                    precision, meaning that the matrix is singular to
+ *                    working precision. Nevertheless, the solution and
+ *                    error bounds are computed because there are a number
+ *                    of situations where the computed solution can be more
+ *                    accurate than the value of RCOND would suggest.   
+ *              > A->ncol+1: number of bytes allocated when memory allocation
+ *                    failure occurred, plus A->ncol.
+ * 
+ */ + +void +zgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, + int *etree, char *equed, double *R, double *C, + SuperMatrix *L, SuperMatrix *U, void *work, int lwork, + SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, + double *rcond, double *ferr, double *berr, + mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info ) +{ + + + DNformat *Bstore, *Xstore; + doublecomplex *Bmat, *Xmat; + int ldb, ldx, nrhs; + SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ + SuperMatrix AC; /* Matrix postmultiplied by Pc */ + int colequ, equil, nofact, notran, rowequ, permc_spec; + trans_t trant; + char norm[1]; + int i, j, info1; + double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; + int relax, panel_size; + double diag_pivot_thresh; + double t0; /* temporary time */ + double *utime; + + /* External functions */ + extern double zlangs(char *, SuperMatrix *); + + Bstore = B->Store; + Xstore = X->Store; + Bmat = Bstore->nzval; + Xmat = Xstore->nzval; + ldb = Bstore->lda; + ldx = Xstore->lda; + nrhs = B->ncol; + + *info = 0; + nofact = (options->Fact != FACTORED); + equil = (options->Equil == YES_SuperLU); + notran = (options->Trans == NOTRANS); + if ( nofact ) { + *(unsigned char *)equed = 'N'; + rowequ = FALSE; + colequ = FALSE; + } else { + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + smlnum = dlamch_("Safe minimum"); + bignum = 1. / smlnum; + } + +#if 0 +printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n", + options->Fact, options->Trans, *equed); +#endif + + /* Test the input parameters */ + if (options->Fact != DOFACT && options->Fact != SamePattern && + options->Fact != SamePattern_SameRowPerm && + options->Fact != FACTORED && + options->Trans != NOTRANS && options->Trans != TRANS && + options->Trans != CONJ && + options->Equil != NO_SuperLU && options->Equil != YES_SuperLU) + *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + (A->Stype != SLU_NC && A->Stype != SLU_NR) || + A->Dtype != SLU_Z || A->Mtype != SLU_GE ) + *info = -2; + else if (options->Fact == FACTORED && + !(rowequ || colequ || lsame_(equed, "N"))) + *info = -6; + else { + if (rowequ) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, R[j]); + rcmax = SUPERLU_MAX(rcmax, R[j]); + } + if (rcmin <= 0.) *info = -7; + else if ( A->nrow > 0) + rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else rowcnd = 1.; + } + if (colequ && *info == 0) { + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->nrow; ++j) { + rcmin = SUPERLU_MIN(rcmin, C[j]); + rcmax = SUPERLU_MAX(rcmax, C[j]); + } + if (rcmin <= 0.) *info = -8; + else if (A->nrow > 0) + colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); + else colcnd = 1.; + } + if (*info == 0) { + if ( lwork < -1 ) *info = -12; + else if ( B->ncol < 0 ) *info = -13; + else if ( B->ncol > 0 ) { /* no checking if B->ncol=0 */ + if ( Bstore->lda < SUPERLU_MAX(0, A->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_Z || + B->Mtype != SLU_GE ) + *info = -13; + } + if ( X->ncol < 0 ) *info = -14; + else if ( X->ncol > 0 ) { /* no checking if X->ncol=0 */ + if ( Xstore->lda < SUPERLU_MAX(0, A->nrow) || + (B->ncol != 0 && B->ncol != X->ncol) || + X->Stype != SLU_DN || + X->Dtype != SLU_Z || X->Mtype != SLU_GE ) + *info = -14; + } + } + } + if (*info != 0) { + i = -(*info); + xerbla_("zgssvx", &i); + return; + } + + /* Initialization for factor parameters */ + panel_size = sp_ienv(1); + relax = sp_ienv(2); + diag_pivot_thresh = options->DiagPivotThresh; + + utime = stat->utime; + + /* Convert A to SLU_NC format when necessary. */ + if ( A->Stype == SLU_NR ) { + NRformat *Astore = A->Store; + AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); + zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, + Astore->nzval, Astore->colind, Astore->rowptr, + SLU_NC, A->Dtype, A->Mtype); + if ( notran ) { /* Reverse the transpose argument. */ + trant = TRANS; + notran = 0; + } else { + trant = NOTRANS; + notran = 1; + } + } else { /* A->Stype == SLU_NC */ + trant = options->Trans; + AA = A; + } + + if ( nofact && equil ) { + t0 = SuperLU_timer_(); + /* Compute row and column scalings to equilibrate the matrix A. */ + zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); + + if ( info1 == 0 ) { + /* Equilibrate matrix A. */ + zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); + rowequ = lsame_(equed, "R") || lsame_(equed, "B"); + colequ = lsame_(equed, "C") || lsame_(equed, "B"); + } + utime[EQUIL] = SuperLU_timer_() - t0; + } + + + if ( nofact ) { + + t0 = SuperLU_timer_(); + /* + * Gnet column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = COLAMD: approximate minimum degree column ordering + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) + get_perm_c(permc_spec, AA, perm_c); + utime[COLPERM] = SuperLU_timer_() - t0; + + t0 = SuperLU_timer_(); + sp_preorder(options, AA, perm_c, etree, &AC); + utime[ETREE] = SuperLU_timer_() - t0; + +/* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", + relax, panel_size, sp_ienv(3), sp_ienv(4)); + fflush(stdout); */ + + /* Compute the LU factorization of A*Pc. */ + t0 = SuperLU_timer_(); + zgstrf(options, &AC, relax, panel_size, etree, + work, lwork, perm_c, perm_r, L, U, stat, info); + utime[FACT] = SuperLU_timer_() - t0; + + if ( lwork == -1 ) { + mem_usage->total_needed = *info - A->ncol; + return; + } + } + + if ( options->PivotGrowth ) { + if ( *info > 0 ) { + if ( *info <= A->ncol ) { + /* Compute the reciprocal pivot growth factor of the leading + rank-deficient *info columns of A. */ + *recip_pivot_growth = zPivotGrowth(*info, AA, perm_c, L, U); + } + return; + } + + /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ + *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U); + } + + if ( options->ConditionNumber ) { + /* Estimate the reciprocal of the condition number of A. */ + t0 = SuperLU_timer_(); + if ( notran ) { + *(unsigned char *)norm = '1'; + } else { + *(unsigned char *)norm = 'I'; + } + anorm = zlangs(norm, AA); + zgscon(norm, L, U, anorm, rcond, stat, info); + utime[RCOND] = SuperLU_timer_() - t0; + } + + if ( nrhs > 0 ) { + /* Scale the right hand side if equilibration was performed. */ + if ( notran ) { + if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]); + } + } else if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]); + } + + /* Compute the solution matrix X. */ + for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ + for (i = 0; i < B->nrow; i++) + Xmat[i + j*ldx] = Bmat[i + j*ldb]; + + t0 = SuperLU_timer_(); + zgstrs (trant, L, U, perm_c, perm_r, X, stat, info); + utime[SOLVE] = SuperLU_timer_() - t0; + + /* Use iterative refinement to improve the computed solution and compute + error bounds and backward error estimates for it. */ + t0 = SuperLU_timer_(); + if ( options->IterRefine != NOREFINE ) { + zgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B, + X, ferr, berr, stat, info); + } else { + for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0; + } + utime[REFINE] = SuperLU_timer_() - t0; + + /* Transform the solution matrix X to a solution of the original system. */ + if ( notran ) { + if ( colequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]); + } + } else if ( rowequ ) { + for (j = 0; j < nrhs; ++j) + for (i = 0; i < A->nrow; ++i) + zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]); + } + } /* end if nrhs > 0 */ + + if ( options->ConditionNumber ) { + /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ + if ( *rcond < dlamch_("E") ) *info = A->ncol + 1; + } + + if ( nofact ) { + zQuerySpace(L, U, mem_usage); + Destroy_CompCol_Permuted(&AC); + } + if ( A->Stype == SLU_NR ) { + Destroy_SuperMatrix_Store(AA); + SUPERLU_FREE(AA); + } + +} diff --git a/src/maths/SuperLU/zgstrf.c b/src/maths/SuperLU/zgstrf.c new file mode 100644 index 000000000..ed79815e4 --- /dev/null +++ b/src/maths/SuperLU/zgstrf.c @@ -0,0 +1,436 @@ + +/*! @file zgstrf.c + * \brief Computes an LU factorization of a general sparse matrix + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ * 
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * ZGSTRF computes an LU factorization of a general sparse m-by-n
+ * matrix A using partial pivoting with row interchanges.
+ * The factorization has the form
+ *     Pr * A = L * U
+ * where Pr is a row permutation matrix, L is lower triangular with unit
+ * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper 
+ * triangular (upper trapezoidal if A->nrow < A->ncol).
+ *
+ * See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * A        (input) SuperMatrix*
+ *	    Original matrix A, permuted by columns, of dimension
+ *          (A->nrow, A->ncol). The type of A can be:
+ *          Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE.
+ *
+ * relax    (input) int
+ *          To control degree of relaxing supernodes. If the number
+ *          of nodes (columns) in a subtree of the elimination tree is less
+ *          than relax, this subtree is considered as one supernode,
+ *          regardless of the row structures of those columns.
+ *
+ * panel_size (input) int
+ *          A panel consists of at most panel_size consecutive columns.
+ *
+ * etree    (input) int*, dimension (A->ncol)
+ *          Elimination tree of A'*A.
+ *          Note: etree is a vector of parent pointers for a forest whose
+ *          vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *          On input, the columns of A should be permuted so that the
+ *          etree is in a certain postorder.
+ *
+ * work     (input/output) void*, size (lwork) (in bytes)
+ *          User-supplied work space and space for the output data structures.
+ *          Not referenced if lwork = 0;
+ *
+ * lwork   (input) int
+ *         Specifies the size of work array in bytes.
+ *         = 0:  allocate space internally by system malloc;
+ *         > 0:  use user-supplied work array of length lwork in bytes,
+ *               returns error if space runs out.
+ *         = -1: the routine guesses the amount of space needed without
+ *               performing the factorization, and returns it in
+ *               *info; no other side effects.
+ *
+ * perm_c   (input) int*, dimension (A->ncol)
+ *	    Column permutation vector, which defines the 
+ *          permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *          in position j in A*Pc.
+ *          When searching for diagonal, perm_c[*] is applied to the
+ *          row subscripts of A, so that diagonal threshold pivoting
+ *          can find the diagonal of A, rather than that of A*Pc.
+ *
+ * perm_r   (input/output) int*, dimension (A->nrow)
+ *          Row permutation vector which defines the permutation matrix Pr,
+ *          perm_r[i] = j means row i of A is in position j in Pr*A.
+ *          If options->Fact = SamePattern_SameRowPerm, the pivoting routine
+ *             will try to use the input perm_r, unless a certain threshold
+ *             criterion is violated. In that case, perm_r is overwritten by
+ *             a new permutation determined by partial pivoting or diagonal
+ *             threshold pivoting.
+ *          Otherwise, perm_r is output argument;
+ *
+ * L        (output) SuperMatrix*
+ *          The factor L from the factorization Pr*A=L*U; use compressed row 
+ *          subscripts storage for supernodes, i.e., L has type: 
+ *          Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
+ *
+ * U        (output) SuperMatrix*
+ *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
+ *          storage scheme, i.e., U has types: Stype = SLU_NC, 
+ *          Dtype = SLU_Z, Mtype = SLU_TRU.
+ *
+ * stat     (output) SuperLUStat_t*
+ *          Record the statistics on runtime and floating-point operation count.
+ *          See slu_util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info     (output) int*
+ *          = 0: successful exit
+ *          < 0: if info = -i, the i-th argument had an illegal value
+ *          > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                and division by zero will occur if it is used to solve a
+ *                system of equations.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol. If lwork = -1, it is
+ *                the estimated amount of space needed, plus A->ncol.
+ *
+ * ======================================================================
+ *
+ * Local Working Arrays: 
+ * ======================
+ *   m = number of rows in the matrix
+ *   n = number of columns in the matrix
+ *
+ *   xprune[0:n-1]: xprune[*] points to locations in subscript 
+ *	vector lsub[*]. For column i, xprune[i] denotes the point where 
+ *	structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need 
+ *	to be traversed for symbolic factorization.
+ *
+ *   marker[0:3*m-1]: marker[i] = j means that node i has been 
+ *	reached when working on column j.
+ *	Storage: relative to original row subscripts
+ *	NOTE: There are 3 of them: marker/marker1 are used for panel dfs, 
+ *	      see zpanel_dfs.c; marker2 is used for inner-factorization,
+ *            see zcolumn_dfs.c.
+ *
+ *   parent[0:m-1]: parent vector used during dfs
+ *      Storage: relative to new row subscripts
+ *
+ *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs) 
+ *	unexplored neighbor of i in lsub[*]
+ *
+ *   segrep[0:nseg-1]: contains the list of supernodal representatives
+ *	in topological order of the dfs. A supernode representative is the 
+ *	last column of a supernode.
+ *      The maximum size of segrep[] is n.
+ *
+ *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a 
+ *	supernodal representative r, repfnz[r] is the location of the first 
+ *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
+ *	indicates the supernode r has been explored.
+ *	NOTE: There are W of them, each used for one column of a panel. 
+ *
+ *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below 
+ *      the panel diagonal. These are filled in during zpanel_dfs(), and are
+ *      used later in the inner LU factorization within the panel.
+ *	panel_lsub[]/dense[] pair forms the SPA data structure.
+ *	NOTE: There are W of them.
+ *
+ *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
+ *	    	   NOTE: there are W of them.
+ *
+ *   tempv[0:*]: real temporary used for dense numeric kernels;
+ *	The size of this array is defined by NUM_TEMPV() in slu_zdefs.h.
+ * 
+ */ + +void +zgstrf (superlu_options_t *options, SuperMatrix *A, + int relax, int panel_size, int *etree, void *work, int lwork, + int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, + SuperLUStat_t *stat, int *info) +{ + /* Local working arrays */ + NCPformat *Astore; + int *iperm_r = NULL; /* inverse of perm_r; used when + options->Fact == SamePattern_SameRowPerm */ + int *iperm_c; /* inverse of perm_c */ + int *iwork; + doublecomplex *zwork; + int *segrep, *repfnz, *parent, *xplore; + int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ + int *xprune; + int *marker; + doublecomplex *dense, *tempv; + int *relax_end; + doublecomplex *a; + int *asub; + int *xa_begin, *xa_end; + int *xsup, *supno; + int *xlsub, *xlusup, *xusub; + int nzlumax; + double fill_ratio = sp_ienv(6); /* estimated fill ratio */ + static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ + + /* Local scalars */ + fact_t fact = options->Fact; + double diag_pivot_thresh = options->DiagPivotThresh; + int pivrow; /* pivotal row number in the original matrix A */ + int nseg1; /* no of segments in U-column above panel row jcol */ + int nseg; /* no of segments in each U-column */ + register int jcol; + register int kcol; /* end column of a relaxed snode */ + register int icol; + register int i, k, jj, new_next, iinfo; + int m, n, min_mn, jsupno, fsupc, nextlu, nextu; + int w_def; /* upper bound on panel width */ + int usepr, iperm_r_allocated = 0; + int nnzL, nnzU; + int *panel_histo = stat->panel_histo; + flops_t *ops = stat->ops; + + iinfo = 0; + m = A->nrow; + n = A->ncol; + min_mn = SUPERLU_MIN(m, n); + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + + /* Allocate storage common to the factor routines */ + *info = zLUMemInit(fact, work, lwork, m, n, Astore->nnz, + panel_size, fill_ratio, L, U, &Glu, &iwork, &zwork); + if ( *info ) return; + + xsup = Glu.xsup; + supno = Glu.supno; + xlsub = Glu.xlsub; + xlusup = Glu.xlusup; + xusub = Glu.xusub; + + SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, + &repfnz, &panel_lsub, &xprune, &marker); + zSetRWork(m, panel_size, zwork, &dense, &tempv); + + usepr = (fact == SamePattern_SameRowPerm); + if ( usepr ) { + /* Compute the inverse of perm_r */ + iperm_r = (int *) intMalloc(m); + for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; + iperm_r_allocated = 1; + } + iperm_c = (int *) intMalloc(n); + for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; + + /* Identify relaxed snodes */ + relax_end = (int *) intMalloc(n); + if ( options->SymmetricMode == YES_SuperLU ) { + heap_relax_snode(n, etree, relax, marker, relax_end); + } else { + relax_snode(n, etree, relax, marker, relax_end); + } + + ifill (perm_r, m, EMPTY); + ifill (marker, m * NO_MARKER, EMPTY); + supno[0] = -1; + xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; + w_def = panel_size; + + /* + * Work on one "panel" at a time. A panel is one of the following: + * (a) a relaxed supernode at the bottom of the etree, or + * (b) panel_size contiguous columns, defined by the user + */ + for (jcol = 0; jcol < min_mn; ) { + + if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ + kcol = relax_end[jcol]; /* end of the relaxed snode */ + panel_histo[kcol-jcol+1]++; + + /* -------------------------------------- + * Factorize the relaxed supernode(jcol:kcol) + * -------------------------------------- */ + /* Determine the union of the row structure of the snode */ + if ( (*info = zsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, + xprune, marker, &Glu)) != 0 ) + return; + + nextu = xusub[jcol]; + nextlu = xlusup[jcol]; + jsupno = supno[jcol]; + fsupc = xsup[jsupno]; + new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); + nzlumax = Glu.nzlumax; + while ( new_next > nzlumax ) { + if ( (*info = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) ) + return; + } + + for (icol = jcol; icol<= kcol; icol++) { + xusub[icol+1] = nextu; + + /* Scatter into SPA dense[*] */ + for (k = xa_begin[icol]; k < xa_end[icol]; k++) + dense[asub[k]] = a[k]; + + /* Numeric update within the snode */ + zsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); + + if ( (*info = zpivotL(icol, diag_pivot_thresh, &usepr, perm_r, + iperm_r, iperm_c, &pivrow, &Glu, stat)) ) + if ( iinfo == 0 ) iinfo = *info; + +#ifdef DEBUG + zprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); +#endif + + } + + jcol = icol; + + } else { /* Work on one panel of panel_size columns */ + + /* Adjust panel_size so that a panel won't overlap with the next + * relaxed snode. + */ + panel_size = w_def; + for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) + if ( relax_end[k] != EMPTY ) { + panel_size = k - jcol; + break; + } + if ( k == min_mn ) panel_size = min_mn - jcol; + panel_histo[panel_size]++; + + /* symbolic factor on a panel of columns */ + zpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, + dense, panel_lsub, segrep, repfnz, xprune, + marker, parent, xplore, &Glu); + + /* numeric sup-panel updates in topological order */ + zpanel_bmod(m, panel_size, jcol, nseg1, dense, + tempv, segrep, repfnz, &Glu, stat); + + /* Sparse LU within the panel, and below panel diagonal */ + for ( jj = jcol; jj < jcol + panel_size; jj++) { + k = (jj - jcol) * m; /* column index for w-wide arrays */ + + nseg = nseg1; /* Begin after all the panel segments */ + + if ((*info = zcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], + segrep, &repfnz[k], xprune, marker, + parent, xplore, &Glu)) != 0) return; + + /* Numeric updates */ + if ((*info = zcolumn_bmod(jj, (nseg - nseg1), &dense[k], + tempv, &segrep[nseg1], &repfnz[k], + jcol, &Glu, stat)) != 0) return; + + /* Copy the U-segments to ucol[*] */ + if ((*info = zcopy_to_ucol(jj, nseg, segrep, &repfnz[k], + perm_r, &dense[k], &Glu)) != 0) + return; + + if ( (*info = zpivotL(jj, diag_pivot_thresh, &usepr, perm_r, + iperm_r, iperm_c, &pivrow, &Glu, stat)) ) + if ( iinfo == 0 ) iinfo = *info; + + /* Prune columns (0:jj-1) using column jj */ + zpruneL(jj, perm_r, pivrow, nseg, segrep, + &repfnz[k], xprune, &Glu); + + /* Reset repfnz[] for this column */ + resetrep_col (nseg, segrep, &repfnz[k]); + +#ifdef DEBUG + zprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); +#endif + + } + + jcol += panel_size; /* Move to the next panel */ + + } /* else */ + + } /* for */ + + *info = iinfo; + + if ( m > n ) { + k = 0; + for (i = 0; i < m; ++i) + if ( perm_r[i] == EMPTY ) { + perm_r[i] = n + k; + ++k; + } + } + + countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); + fixupL(min_mn, perm_r, &Glu); + + zLUWorkFree(iwork, zwork, &Glu); /* Free work space and compress storage */ + + if ( fact == SamePattern_SameRowPerm ) { + /* L and U structures may have changed due to possibly different + pivoting, even though the storage is available. + There could also be memory expansions, so the array locations + may have changed, */ + ((SCformat *)L->Store)->nnz = nnzL; + ((SCformat *)L->Store)->nsuper = Glu.supno[n]; + ((SCformat *)L->Store)->nzval = Glu.lusup; + ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; + ((SCformat *)L->Store)->rowind = Glu.lsub; + ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; + ((NCformat *)U->Store)->nnz = nnzU; + ((NCformat *)U->Store)->nzval = Glu.ucol; + ((NCformat *)U->Store)->rowind = Glu.usub; + ((NCformat *)U->Store)->colptr = Glu.xusub; + } else { + zCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, + Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, + Glu.xsup, SLU_SC, SLU_Z, SLU_TRLU); + zCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, + Glu.usub, Glu.xusub, SLU_NC, SLU_Z, SLU_TRU); + } + + ops[FACT] += ops[TRSV] + ops[GEMV]; + stat->expansions = --(Glu.num_expansions); + + if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); + SUPERLU_FREE (iperm_c); + SUPERLU_FREE (relax_end); + +} diff --git a/src/maths/SuperLU/zgstrs.c b/src/maths/SuperLU/zgstrs.c new file mode 100644 index 000000000..7b2a80437 --- /dev/null +++ b/src/maths/SuperLU/zgstrs.c @@ -0,0 +1,350 @@ + +/*! @file zgstrs.c + * \brief Solves a system using LU factorization + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + +#include + + +/* + * Function prototypes + */ +void zusolve(int, int, doublecomplex*, doublecomplex*); +void zlsolve(int, int, doublecomplex*, doublecomplex*); +void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * ZGSTRS solves a system of linear equations A*X=B or A'*X=B
+ * with A sparse and B dense, using the LU factorization computed by
+ * ZGSTRF.
+ *
+ * See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * trans   (input) trans_t
+ *          Specifies the form of the system of equations:
+ *          = NOTRANS: A * X = B  (No transpose)
+ *          = TRANS:   A'* X = B  (Transpose)
+ *          = CONJ:    A**H * X = B  (Conjugate transpose)
+ *
+ * L       (input) SuperMatrix*
+ *         The factor L from the factorization Pr*A*Pc=L*U as computed by
+ *         zgstrf(). Use compressed row subscripts storage for supernodes,
+ *         i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
+ *
+ * U       (input) SuperMatrix*
+ *         The factor U from the factorization Pr*A*Pc=L*U as computed by
+ *         zgstrf(). Use column-wise storage scheme, i.e., U has types:
+ *         Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
+ *
+ * perm_c  (input) int*, dimension (L->ncol)
+ *	   Column permutation vector, which defines the 
+ *         permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *         in position j in A*Pc.
+ *
+ * perm_r  (input) int*, dimension (L->nrow)
+ *         Row permutation vector, which defines the permutation matrix Pr; 
+ *         perm_r[i] = j means row i of A is in position j in Pr*A.
+ *
+ * B       (input/output) SuperMatrix*
+ *         B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
+ *         On entry, the right hand side matrix.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * stat     (output) SuperLUStat_t*
+ *          Record the statistics on runtime and floating-point operation count.
+ *          See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * 
+ */ + +void +zgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, + int *perm_c, int *perm_r, SuperMatrix *B, + SuperLUStat_t *stat, int *info) +{ + +#ifdef _CRAY + _fcd ftcs1, ftcs2, ftcs3, ftcs4; +#endif + int incx = 1, incy = 1; +#ifdef USE_VENDOR_BLAS + doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; + doublecomplex *work_col; +#endif + doublecomplex temp_comp; + DNformat *Bstore; + doublecomplex *Bmat; + SCformat *Lstore; + NCformat *Ustore; + doublecomplex *Lval, *Uval; + int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; + int i, j, k, iptr, jcol, n, ldb, nrhs; + doublecomplex *work, *rhs_work, *soln; + flops_t solve_ops; + void zprint_soln(); + + /* Test input parameters ... */ + *info = 0; + Bstore = B->Store; + ldb = Bstore->lda; + nrhs = B->ncol; + if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; + else if ( L->nrow != L->ncol || L->nrow < 0 || + L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU ) + *info = -2; + else if ( U->nrow != U->ncol || U->nrow < 0 || + U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU ) + *info = -3; + else if ( ldb < SUPERLU_MAX(0, L->nrow) || + B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) + *info = -6; + if ( *info ) { + i = -(*info); + xerbla_("zgstrs", &i); + return; + } + + n = L->nrow; + work = doublecomplexCalloc(n * nrhs); + if ( !work ) ABORT_SuperLU("Malloc fails for local work[]."); + soln = doublecomplexMalloc(n); + if ( !soln ) ABORT_SuperLU("Malloc fails for local soln[]."); + + Bmat = Bstore->nzval; + Lstore = L->Store; + Lval = Lstore->nzval; + Ustore = U->Store; + Uval = Ustore->nzval; + solve_ops = 0; + + if ( trans == NOTRANS ) { + /* Permute right hand sides to form Pr*B */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + /* Forward solve PLy=Pb. */ + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + nrow = nsupr - nsupc; + + solve_ops += 4 * nsupc * (nsupc - 1) * nrhs; + solve_ops += 8 * nrow * nsupc * nrhs; + + if ( nsupc == 1 ) { + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + luptr = L_NZ_START(fsupc); + for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ + irow = L_SUB(iptr); + ++luptr; + zz_mult(&temp_comp, &rhs_work[fsupc], &Lval[luptr]); + z_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); + } + } + } else { + luptr = L_NZ_START(fsupc); +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); + CTRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); + + CGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, + &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, + &beta, &work[0], &n ); +#else + ztrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); + + zgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, + &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, + &beta, &work[0], &n ); +#endif + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + work_col = &work[j*n]; + iptr = istart + nsupc; + for (i = 0; i < nrow; i++) { + irow = L_SUB(iptr); + z_sub(&rhs_work[irow], &rhs_work[irow], &work_col[i]); + work_col[i].r = 0.0; + work_col[i].i = 0.0; + iptr++; + } + } +#else + for (j = 0; j < nrhs; j++) { + rhs_work = &Bmat[j*ldb]; + zlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); + zmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], + &rhs_work[fsupc], &work[0] ); + + iptr = istart + nsupc; + for (i = 0; i < nrow; i++) { + irow = L_SUB(iptr); + z_sub(&rhs_work[irow], &rhs_work[irow], &work[i]); + work[i].r = 0.; + work[i].i = 0.; + iptr++; + } + } +#endif + } /* else ... */ + } /* for L-solve */ + +#ifdef DEBUG + printf("After L-solve: y=\n"); + zprint_soln(n, nrhs, Bmat); +#endif + + /* + * Back solve Ux=y. + */ + for (k = Lstore->nsuper; k >= 0; k--) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += 4 * nsupc * (nsupc + 1) * nrhs; + + if ( nsupc == 1 ) { + rhs_work = &Bmat[0]; + for (j = 0; j < nrhs; j++) { + z_div(&rhs_work[fsupc], &rhs_work[fsupc], &Lval[luptr]); + rhs_work += ldb; + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("U", strlen("U")); + ftcs3 = _cptofcd("N", strlen("N")); + CTRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); +#else + ztrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, + &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); +#endif +#else + for (j = 0; j < nrhs; j++) + zusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); +#endif + } + + for (j = 0; j < nrhs; ++j) { + rhs_work = &Bmat[j*ldb]; + for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { + solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ + irow = U_SUB(i); + zz_mult(&temp_comp, &rhs_work[jcol], &Uval[i]); + z_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); + } + } + } + + } /* for U-solve */ + +#ifdef DEBUG + printf("After U-solve: x=\n"); + zprint_soln(n, nrhs, Bmat); +#endif + + /* Compute the final solution X := Pc*X. */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + stat->ops[SOLVE] = solve_ops; + + } else { /* Solve A'*X=B or CONJ(A)*X=B */ + /* Permute right hand sides to form Pc'*B. */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + stat->ops[SOLVE] = 0; + if (trans == TRANS) { + for (k = 0; k < nrhs; ++k) { + /* Multiply by inv(U'). */ + sp_ztrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); + + /* Multiply by inv(L'). */ + sp_ztrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); + } + } else { /* trans == CONJ */ + for (k = 0; k < nrhs; ++k) { + /* Multiply by conj(inv(U')). */ + sp_ztrsv("U", "C", "N", L, U, &Bmat[k*ldb], stat, info); + + /* Multiply by conj(inv(L')). */ + sp_ztrsv("L", "C", "U", L, U, &Bmat[k*ldb], stat, info); + } + } + /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ + for (i = 0; i < nrhs; i++) { + rhs_work = &Bmat[i*ldb]; + for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; + for (k = 0; k < n; k++) rhs_work[k] = soln[k]; + } + + } + + SUPERLU_FREE(work); + SUPERLU_FREE(soln); +} + +/* + * Diagnostic print of the solution vector + */ +void +zprint_soln(int n, int nrhs, doublecomplex *soln) +{ + int i; + + for (i = 0; i < n; i++) + printf("\t%d: %.4f\n", i, soln[i]); +} diff --git a/src/maths/SuperLU/zlacon.c b/src/maths/SuperLU/zlacon.c new file mode 100644 index 000000000..40240d5d3 --- /dev/null +++ b/src/maths/SuperLU/zlacon.c @@ -0,0 +1,221 @@ + +/*! @file zlacon.c + * \brief Estimates the 1-norm + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ */ +#include +#include +#include + +/*! \brief + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   ZLACON estimates the 1-norm of a square matrix A.   
+ *   Reverse communication is used for evaluating matrix-vector products. 
+ * 
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   N      (input) INT
+ *          The order of the matrix.  N >= 1.   
+ *
+ *   V      (workspace) DOUBLE COMPLEX PRECISION array, dimension (N)   
+ *          On the final return, V = A*W,  where  EST = norm(V)/norm(W)   
+ *          (W is not returned).   
+ *
+ *   X      (input/output) DOUBLE COMPLEX PRECISION array, dimension (N)   
+ *          On an intermediate return, X should be overwritten by   
+ *                A * X,   if KASE=1,   
+ *                A' * X,  if KASE=2,
+ *          where A' is the conjugate transpose of A,
+ *         and ZLACON must be re-called with all the other parameters   
+ *          unchanged.   
+ *
+ *
+ *   EST    (output) DOUBLE PRECISION   
+ *          An estimate (a lower bound) for norm(A).   
+ *
+ *   KASE   (input/output) INT
+ *          On the initial call to ZLACON, KASE should be 0.   
+ *          On an intermediate return, KASE will be 1 or 2, indicating   
+ *          whether X should be overwritten by A * X  or A' * X.   
+ *          On the final return from ZLACON, KASE will again be 0.   
+ *
+ *   Further Details   
+ *   ======= =======   
+ *
+ *   Contributed by Nick Higham, University of Manchester.   
+ *   Originally named CONEST, dated March 16, 1988.   
+ *
+ *   Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of 
+ *   a real or complex matrix, with applications to condition estimation", 
+ *   ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988.   
+ *   ===================================================================== 
+ * 
+ */ + +int +zlacon_(int *n, doublecomplex *v, doublecomplex *x, double *est, int *kase) + +{ + + + /* Table of constant values */ + int c__1 = 1; + doublecomplex zero = {0.0, 0.0}; + doublecomplex one = {1.0, 0.0}; + + /* System generated locals */ + double d__1; + + /* Local variables */ + static int iter; + static int jump, jlast; + static double altsgn, estold; + static int i, j; + double temp; + double safmin; + extern double dlamch_(char *); + extern int izmax1_(int *, doublecomplex *, int *); + extern double dzsum1_(int *, doublecomplex *, int *); + + safmin = dlamch_("Safe minimum"); + if ( *kase == 0 ) { + for (i = 0; i < *n; ++i) { + x[i].r = 1. / (double) (*n); + x[i].i = 0.; + } + *kase = 1; + jump = 1; + return 0; + } + + switch (jump) { + case 1: goto L20; + case 2: goto L40; + case 3: goto L70; + case 4: goto L110; + case 5: goto L140; + } + + /* ................ ENTRY (JUMP = 1) + FIRST ITERATION. X HAS BEEN OVERWRITTEN BY A*X. */ + L20: + if (*n == 1) { + v[0] = x[0]; + *est = z_abs(&v[0]); + /* ... QUIT */ + goto L150; + } + *est = dzsum1_(n, x, &c__1); + + for (i = 0; i < *n; ++i) { + d__1 = z_abs(&x[i]); + if (d__1 > safmin) { + d__1 = 1 / d__1; + x[i].r *= d__1; + x[i].i *= d__1; + } else { + x[i] = one; + } + } + *kase = 2; + jump = 2; + return 0; + + /* ................ ENTRY (JUMP = 2) + FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ +L40: + j = izmax1_(n, &x[0], &c__1); + --j; + iter = 2; + + /* MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */ +L50: + for (i = 0; i < *n; ++i) x[i] = zero; + x[j] = one; + *kase = 1; + jump = 3; + return 0; + + /* ................ ENTRY (JUMP = 3) + X HAS BEEN OVERWRITTEN BY A*X. */ +L70: +#ifdef _CRAY + CCOPY(n, x, &c__1, v, &c__1); +#else + zcopy_(n, x, &c__1, v, &c__1); +#endif + estold = *est; + *est = dzsum1_(n, v, &c__1); + + +L90: + /* TEST FOR CYCLING. */ + if (*est <= estold) goto L120; + + for (i = 0; i < *n; ++i) { + d__1 = z_abs(&x[i]); + if (d__1 > safmin) { + d__1 = 1 / d__1; + x[i].r *= d__1; + x[i].i *= d__1; + } else { + x[i] = one; + } + } + *kase = 2; + jump = 4; + return 0; + + /* ................ ENTRY (JUMP = 4) + X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */ +L110: + jlast = j; + j = izmax1_(n, &x[0], &c__1); + --j; + if (x[jlast].r != (d__1 = x[j].r, fabs(d__1)) && iter < 5) { + ++iter; + goto L50; + } + + /* ITERATION COMPLETE. FINAL STAGE. */ +L120: + altsgn = 1.; + for (i = 1; i <= *n; ++i) { + x[i-1].r = altsgn * ((double)(i - 1) / (double)(*n - 1) + 1.); + x[i-1].i = 0.; + altsgn = -altsgn; + } + *kase = 1; + jump = 5; + return 0; + + /* ................ ENTRY (JUMP = 5) + X HAS BEEN OVERWRITTEN BY A*X. */ +L140: + temp = dzsum1_(n, x, &c__1) / (double)(*n * 3) * 2.; + if (temp > *est) { +#ifdef _CRAY + CCOPY(n, &x[0], &c__1, &v[0], &c__1); +#else + zcopy_(n, &x[0], &c__1, &v[0], &c__1); +#endif + *est = temp; + } + +L150: + *kase = 0; + return 0; + +} /* zlacon_ */ diff --git a/src/maths/SuperLU/zlangs.c b/src/maths/SuperLU/zlangs.c new file mode 100644 index 000000000..f3afd65d7 --- /dev/null +++ b/src/maths/SuperLU/zlangs.c @@ -0,0 +1,119 @@ + +/*! @file zlangs.c + * \brief Returns the value of the one norm + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Modified from lapack routine ZLANGE 
+ * 
+ */ +/* + * File name: zlangs.c + * History: Modified from lapack routine ZLANGE + */ +#include +#include + +/*! \brief + * + *
+ * Purpose   
+ *   =======   
+ *
+ *   ZLANGS returns the value of the one norm, or the Frobenius norm, or 
+ *   the infinity norm, or the element of largest absolute value of a 
+ *   real matrix A.   
+ *
+ *   Description   
+ *   ===========   
+ *
+ *   ZLANGE returns the value   
+ *
+ *      ZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
+ *               (   
+ *               ( norm1(A),         NORM = '1', 'O' or 'o'   
+ *               (   
+ *               ( normI(A),         NORM = 'I' or 'i'   
+ *               (   
+ *               ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
+ *
+ *   where  norm1  denotes the  one norm of a matrix (maximum column sum), 
+ *   normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
+ *   normF  denotes the  Frobenius norm of a matrix (square root of sum of 
+ *   squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   NORM    (input) CHARACTER*1   
+ *           Specifies the value to be returned in ZLANGE as described above.   
+ *   A       (input) SuperMatrix*
+ *           The M by N sparse matrix A. 
+ *
+ *  =====================================================================
+ * 
+ */ + +double zlangs(char *norm, SuperMatrix *A) +{ + + /* Local variables */ + NCformat *Astore; + doublecomplex *Aval; + int i, j, irow; + double value, sum; + double *rwork; + + Astore = A->Store; + Aval = Astore->nzval; + + if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { + value = 0.; + + } else if (lsame_(norm, "M")) { + /* Find max(abs(A(i,j))). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + value = SUPERLU_MAX( value, z_abs( &Aval[i]) ); + + } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { + /* Find norm1(A). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) { + sum = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + sum += z_abs( &Aval[i] ); + value = SUPERLU_MAX(value,sum); + } + + } else if (lsame_(norm, "I")) { + /* Find normI(A). */ + if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) ) + ABORT_SuperLU("SUPERLU_MALLOC fails for rwork."); + for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { + irow = Astore->rowind[i]; + rwork[irow] += z_abs( &Aval[i] ); + } + value = 0.; + for (i = 0; i < A->nrow; ++i) + value = SUPERLU_MAX(value, rwork[i]); + + SUPERLU_FREE (rwork); + + } else if (lsame_(norm, "F") || lsame_(norm, "E")) { + /* Find normF(A). */ + ABORT_SuperLU("Not implemented."); + } else + ABORT_SuperLU("Illegal norm specified."); + + return (value); + +} /* zlangs */ + diff --git a/src/maths/SuperLU/zlaqgs.c b/src/maths/SuperLU/zlaqgs.c new file mode 100644 index 000000000..c381cb43a --- /dev/null +++ b/src/maths/SuperLU/zlaqgs.c @@ -0,0 +1,147 @@ + +/*! @file zlaqgs.c + * \brief Equlibrates a general sprase matrix + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ * Modified from LAPACK routine ZLAQGE
+ * 
+ */ +/* + * File name: zlaqgs.c + * History: Modified from LAPACK routine ZLAQGE + */ +#include +#include + +/*! \brief + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   ZLAQGS equilibrates a general sparse M by N matrix A using the row and   
+ *   scaling factors in the vectors R and C.   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   A       (input/output) SuperMatrix*
+ *           On exit, the equilibrated matrix.  See EQUED for the form of 
+ *           the equilibrated matrix. The type of A can be:
+ *	    Stype = NC; Dtype = SLU_Z; Mtype = GE.
+ *	    
+ *   R       (input) double*, dimension (A->nrow)
+ *           The row scale factors for A.
+ *	    
+ *   C       (input) double*, dimension (A->ncol)
+ *           The column scale factors for A.
+ *	    
+ *   ROWCND  (input) double
+ *           Ratio of the smallest R(i) to the largest R(i).
+ *	    
+ *   COLCND  (input) double
+ *           Ratio of the smallest C(i) to the largest C(i).
+ *	    
+ *   AMAX    (input) double
+ *           Absolute value of largest matrix entry.
+ *	    
+ *   EQUED   (output) char*
+ *           Specifies the form of equilibration that was done.   
+ *           = 'N':  No equilibration   
+ *           = 'R':  Row equilibration, i.e., A has been premultiplied by  
+ *                   diag(R).   
+ *           = 'C':  Column equilibration, i.e., A has been postmultiplied  
+ *                   by diag(C).   
+ *           = 'B':  Both row and column equilibration, i.e., A has been
+ *                   replaced by diag(R) * A * diag(C).   
+ *
+ *   Internal Parameters   
+ *   ===================   
+ *
+ *   THRESH is a threshold value used to decide if row or column scaling   
+ *   should be done based on the ratio of the row or column scaling   
+ *   factors.  If ROWCND < THRESH, row scaling is done, and if   
+ *   COLCND < THRESH, column scaling is done.   
+ *
+ *   LARGE and SMALL are threshold values used to decide if row scaling   
+ *   should be done based on the absolute size of the largest matrix   
+ *   element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+ *
+ *   ===================================================================== 
+ * 
+ */ + +void +zlaqgs(SuperMatrix *A, double *r, double *c, + double rowcnd, double colcnd, double amax, char *equed) +{ + + +#define THRESH (0.1) + + /* Local variables */ + NCformat *Astore; + doublecomplex *Aval; + int i, j, irow; + double large, small, cj; + double temp; + + + /* Quick return if possible */ + if (A->nrow <= 0 || A->ncol <= 0) { + *(unsigned char *)equed = 'N'; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + + /* Initialize LARGE and SMALL. */ + small = dlamch_("Safe minimum") / dlamch_("Precision"); + large = 1. / small; + + if (rowcnd >= THRESH && amax >= small && amax <= large) { + if (colcnd >= THRESH) + *(unsigned char *)equed = 'N'; + else { + /* Column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + zd_mult(&Aval[i], &Aval[i], cj); + } + } + *(unsigned char *)equed = 'C'; + } + } else if (colcnd >= THRESH) { + /* Row scaling, no column scaling */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + zd_mult(&Aval[i], &Aval[i], r[irow]); + } + *(unsigned char *)equed = 'R'; + } else { + /* Row and column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + temp = cj * r[irow]; + zd_mult(&Aval[i], &Aval[i], temp); + } + } + *(unsigned char *)equed = 'B'; + } + + return; + +} /* zlaqgs */ + diff --git a/src/maths/SuperLU/zldperm.c b/src/maths/SuperLU/zldperm.c new file mode 100644 index 000000000..7600ae2dc --- /dev/null +++ b/src/maths/SuperLU/zldperm.c @@ -0,0 +1,168 @@ + +/*! @file + * \brief Finds a row permutation so that the matrix has large entries on the diagonal + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory.
+ * June 30, 2009
+ * 
+ */ + +#include + +extern int_t mc64id_(int_t*); +extern int_t mc64ad_(int_t*, int_t*, int_t*, int_t [], int_t [], double [], + int_t*, int_t [], int_t*, int_t[], int_t*, double [], + int_t [], int_t []); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ *   ZLDPERM finds a row permutation so that the matrix has large
+ *   entries on the diagonal.
+ *
+ * Arguments
+ * =========
+ *
+ * job    (input) int
+ *        Control the action. Possible values for JOB are:
+ *        = 1 : Compute a row permutation of the matrix so that the
+ *              permuted matrix has as many entries on its diagonal as
+ *              possible. The values on the diagonal are of arbitrary size.
+ *              HSL subroutine MC21A/AD is used for this.
+ *        = 2 : Compute a row permutation of the matrix so that the smallest 
+ *              value on the diagonal of the permuted matrix is maximized.
+ *        = 3 : Compute a row permutation of the matrix so that the smallest
+ *              value on the diagonal of the permuted matrix is maximized.
+ *              The algorithm differs from the one used for JOB = 2 and may
+ *              have quite a different performance.
+ *        = 4 : Compute a row permutation of the matrix so that the sum
+ *              of the diagonal entries of the permuted matrix is maximized.
+ *        = 5 : Compute a row permutation of the matrix so that the product
+ *              of the diagonal entries of the permuted matrix is maximized
+ *              and vectors to scale the matrix so that the nonzero diagonal 
+ *              entries of the permuted matrix are one in absolute value and 
+ *              all the off-diagonal entries are less than or equal to one in 
+ *              absolute value.
+ *        Restriction: 1 <= JOB <= 5.
+ *
+ * n      (input) int
+ *        The order of the matrix.
+ *
+ * nnz    (input) int
+ *        The number of nonzeros in the matrix.
+ *
+ * adjncy (input) int*, of size nnz
+ *        The adjacency structure of the matrix, which contains the row
+ *        indices of the nonzeros.
+ *
+ * colptr (input) int*, of size n+1
+ *        The pointers to the beginning of each column in ADJNCY.
+ *
+ * nzval  (input) doublecomplex*, of size nnz
+ *        The nonzero values of the matrix. nzval[k] is the value of
+ *        the entry corresponding to adjncy[k].
+ *        It is not used if job = 1.
+ *
+ * perm   (output) int*, of size n
+ *        The permutation vector. perm[i] = j means row i in the
+ *        original matrix is in row j of the permuted matrix.
+ *
+ * u      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the row scaling factors. 
+ *
+ * v      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the column scaling factors. 
+ *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
+ * 
+ */ + +int +zldperm(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], + doublecomplex nzval[], int_t *perm, double u[], double v[]) +{ + int_t i, liw, ldw, num; + int_t *iw, icntl[10], info[10]; + double *dw; + double *nzval_d = (double *) SUPERLU_MALLOC(nnz * sizeof(double)); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC("Enter zldperm()"); +#endif + liw = 5*n; + if ( job == 3 ) liw = 10*n + nnz; + if ( !(iw = intMalloc(liw)) ) ABORT_SuperLU("Malloc fails for iw[]"); + ldw = 3*n + nnz; + if ( !(dw = (double*) SUPERLU_MALLOC(ldw * sizeof(double))) ) + ABORT_SuperLU("Malloc fails for dw[]"); + + /* Increment one to get 1-based indexing. */ + for (i = 0; i <= n; ++i) ++colptr[i]; + for (i = 0; i < nnz; ++i) ++adjncy[i]; +#if ( DEBUGlevel>=2 ) + printf("LDPERM(): n %d, nnz %d\n", n, nnz); + slu_PrintInt10("colptr", n+1, colptr); + slu_PrintInt10("adjncy", nnz, adjncy); +#endif + + /* + * NOTE: + * ===== + * + * MC64AD assumes that column permutation vector is defined as: + * perm(i) = j means column i of permuted A is in column j of original A. + * + * Since a symmetric permutation preserves the diagonal entries. Then + * by the following relation: + * P'(A*P')P = P'A + * we can apply inverse(perm) to rows of A to get large diagonal entries. + * But, since 'perm' defined in MC64AD happens to be the reverse of + * SuperLU's definition of permutation vector, therefore, it is already + * an inverse for our purpose. We will thus use it directly. + * + */ + mc64id_(icntl); +#if 0 + /* Suppress error and warning messages. */ + icntl[0] = -1; + icntl[1] = -1; +#endif + + for (i = 0; i < nnz; ++i) nzval_d[i] = z_abs1(&nzval[i]); + mc64ad_(&job, &n, &nnz, colptr, adjncy, nzval_d, &num, perm, + &liw, iw, &ldw, dw, icntl, info); + +#if ( DEBUGlevel>=2 ) + slu_PrintInt10("perm", n, perm); + printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); +#endif + if ( info[0] == 1 ) { /* Structurally singular */ + printf(".. The last %d permutations:\n", n-num); + slu_PrintInt10("perm", n-num, &perm[num]); + } + + /* Restore to 0-based indexing. */ + for (i = 0; i <= n; ++i) --colptr[i]; + for (i = 0; i < nnz; ++i) --adjncy[i]; + for (i = 0; i < n; ++i) --perm[i]; + + if ( job == 5 ) + for (i = 0; i < n; ++i) { + u[i] = dw[i]; + v[i] = dw[n+i]; + } + + SUPERLU_FREE(iw); + SUPERLU_FREE(dw); + SUPERLU_FREE(nzval_d); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC("Exit zldperm()"); +#endif + + return info[0]; +} diff --git a/src/maths/SuperLU/zmemory.c b/src/maths/SuperLU/zmemory.c new file mode 100644 index 000000000..c76ba2b14 --- /dev/null +++ b/src/maths/SuperLU/zmemory.c @@ -0,0 +1,701 @@ + +/*! @file zmemory.c + * \brief Memory details + * + *
+ * -- SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Laboratory.
+ * June 30, 2009
+ * 
+ */ +#include + + +/* Internal prototypes */ +void *zexpand (int *, MemType,int, int, GlobalLU_t *); +int zLUWorkInit (int, int, int, int **, doublecomplex **, GlobalLU_t *); +void copy_mem_doublecomplex (int, void *, void *); +void zStackCompress (GlobalLU_t *); +void zSetupSpace (void *, int, GlobalLU_t *); +void *zuser_malloc (int, int, GlobalLU_t *); +void zuser_free (int, int, GlobalLU_t *); + +/* External prototypes (in memory.c - prec-independent) */ +extern void copy_mem_int (int, void *, void *); +extern void user_bcopy (char *, char *, int); + + +/* Macros to manipulate stack */ +#define StackFull(x) ( x + Glu->stack.used >= Glu->stack.size ) +#define NotDoubleAlign(addr) ( (long int)addr & 7 ) +#define DoubleAlign(addr) ( ((long int)addr + 7) & ~7L ) +#define TempSpace(m, w) ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \ + (w + 1) * m * sizeof(doublecomplex) ) +#define Reduce(alpha) ((alpha + 1) / 2) /* i.e. (alpha-1)/2 + 1 */ + + + + +/*! \brief Setup the memory model to be used for factorization. + * + * lwork = 0: use system malloc; + * lwork > 0: use user-supplied work[] space. + */ +void zSetupSpace(void *work, int lwork, GlobalLU_t *Glu) +{ + if ( lwork == 0 ) { + Glu->MemModel = SYSTEM; /* malloc/free */ + } else if ( lwork > 0 ) { + Glu->MemModel = USER; /* user provided space */ + Glu->stack.used = 0; + Glu->stack.top1 = 0; + Glu->stack.top2 = (lwork/4)*4; /* must be word addressable */ + Glu->stack.size = Glu->stack.top2; + Glu->stack.array = (void *) work; + } +} + + + +void *zuser_malloc(int bytes, int which_end, GlobalLU_t *Glu) +{ + void *buf; + + if ( StackFull(bytes) ) return (NULL); + + if ( which_end == HEAD ) { + buf = (char*) Glu->stack.array + Glu->stack.top1; + Glu->stack.top1 += bytes; + } else { + Glu->stack.top2 -= bytes; + buf = (char*) Glu->stack.array + Glu->stack.top2; + } + + Glu->stack.used += bytes; + return buf; +} + + +void zuser_free(int bytes, int which_end, GlobalLU_t *Glu) +{ + if ( which_end == HEAD ) { + Glu->stack.top1 -= bytes; + } else { + Glu->stack.top2 += bytes; + } + Glu->stack.used -= bytes; +} + + + +/*! \brief + * + *
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total_needed (float)
+ *      The amount of space needed in bytes to perform factorization.
+ * 
+ */ +int zQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) +{ + SCformat *Lstore; + NCformat *Ustore; + register int n, iword, dword, panel_size = sp_ienv(1); + + Lstore = L->Store; + Ustore = U->Store; + n = L->ncol; + iword = sizeof(int); + dword = sizeof(doublecomplex); + + /* For LU factors */ + mem_usage->for_lu = (float)( (4.0*n + 3.0) * iword + + Lstore->nzval_colptr[n] * dword + + Lstore->rowind_colptr[n] * iword ); + mem_usage->for_lu += (float)( (n + 1.0) * iword + + Ustore->colptr[n] * (dword + iword) ); + + /* Working storage to support factorization */ + mem_usage->total_needed = mem_usage->for_lu + + (float)( (2.0 * panel_size + 4.0 + NO_MARKER) * n * iword + + (panel_size + 1.0) * n * dword ); + + return 0; +} /* zQuerySpace */ + + +/*! \brief + * + *
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total_needed (float)
+ *      The amount of space needed in bytes to perform factorization.
+ * 
+ */ +int ilu_zQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage) +{ + SCformat *Lstore; + NCformat *Ustore; + register int n, panel_size = sp_ienv(1); + register float iword, dword; + + Lstore = L->Store; + Ustore = U->Store; + n = L->ncol; + iword = sizeof(int); + dword = sizeof(double); + + /* For LU factors */ + mem_usage->for_lu = (float)( (4.0f * n + 3.0f) * iword + + Lstore->nzval_colptr[n] * dword + + Lstore->rowind_colptr[n] * iword ); + mem_usage->for_lu += (float)( (n + 1.0f) * iword + + Ustore->colptr[n] * (dword + iword) ); + + /* Working storage to support factorization. + ILU needs 5*n more integers than LU */ + mem_usage->total_needed = mem_usage->for_lu + + (float)( (2.0f * panel_size + 9.0f + NO_MARKER) * n * iword + + (panel_size + 1.0f) * n * dword ); + + return 0; +} /* ilu_zQuerySpace */ + + +/*! \brief Allocate storage for the data structures common to all factor routines. + * + *
+ * For those unpredictable size, estimate as fill_ratio * nnz(A).
+ * Return value:
+ *     If lwork = -1, return the estimated amount of space required, plus n;
+ *     otherwise, return the amount of space actually allocated when
+ *     memory allocation failure occurred.
+ * 
+ */ +int +zLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz, + int panel_size, double fill_ratio, SuperMatrix *L, SuperMatrix *U, + GlobalLU_t *Glu, int **iwork, doublecomplex **dwork) +{ + int info, iword, dword; + SCformat *Lstore; + NCformat *Ustore; + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + doublecomplex *ucol; + int *usub, *xusub; + int nzlmax, nzumax, nzlumax; + + iword = sizeof(int); + dword = sizeof(doublecomplex); + Glu->n = n; + Glu->num_expansions = 0; + + if ( !Glu->expanders ) + Glu->expanders = (ExpHeader*)SUPERLU_MALLOC( NO_MEMTYPE * + sizeof(ExpHeader) ); + if ( !Glu->expanders ) ABORT_SuperLU("SUPERLU_MALLOC fails for expanders"); + + if ( fact != SamePattern_SameRowPerm ) { + /* Guess for L\U factors */ + nzumax = nzlumax = fill_ratio * annz; + nzlmax = SUPERLU_MAX(1, fill_ratio/4.) * annz; + + if ( lwork == -1 ) { + return ( GluIntArray(n) * iword + TempSpace(m, panel_size) + + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); + } else { + zSetupSpace(work, lwork, Glu); + } + +#if ( PRNTlevel >= 1 ) + printf("zLUMemInit() called: fill_ratio %.0f, nzlmax %ld, nzumax %ld\n", + fill_ratio, nzlmax, nzumax); + fflush(stdout); +#endif + + /* Integer pointers for L\U factors */ + if ( Glu->MemModel == SYSTEM ) { + xsup = intMalloc(n+1); + supno = intMalloc(n+1); + xlsub = intMalloc(n+1); + xlusup = intMalloc(n+1); + xusub = intMalloc(n+1); + } else { + xsup = (int *)zuser_malloc((n+1) * iword, HEAD, Glu); + supno = (int *)zuser_malloc((n+1) * iword, HEAD, Glu); + xlsub = (int *)zuser_malloc((n+1) * iword, HEAD, Glu); + xlusup = (int *)zuser_malloc((n+1) * iword, HEAD, Glu); + xusub = (int *)zuser_malloc((n+1) * iword, HEAD, Glu); + } + + lusup = (doublecomplex *) zexpand( &nzlumax, LUSUP, 0, 0, Glu ); + ucol = (doublecomplex *) zexpand( &nzumax, UCOL, 0, 0, Glu ); + lsub = (int *) zexpand( &nzlmax, LSUB, 0, 0, Glu ); + usub = (int *) zexpand( &nzumax, USUB, 0, 1, Glu ); + + while ( !lusup || !ucol || !lsub || !usub ) { + if ( Glu->MemModel == SYSTEM ) { + SUPERLU_FREE(lusup); + SUPERLU_FREE(ucol); + SUPERLU_FREE(lsub); + SUPERLU_FREE(usub); + } else { + zuser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, + HEAD, Glu); + } + nzlumax /= 2; + nzumax /= 2; + nzlmax /= 2; + if ( nzlumax < annz ) { + printf("Not enough memory to perform factorization.\n"); + return (zmemory_usage(nzlmax, nzumax, nzlumax, n) + n); + } +#if ( PRNTlevel >= 1) + printf("zLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", + nzlmax, nzumax); + fflush(stdout); +#endif + lusup = (doublecomplex *) zexpand( &nzlumax, LUSUP, 0, 0, Glu ); + ucol = (doublecomplex *) zexpand( &nzumax, UCOL, 0, 0, Glu ); + lsub = (int *) zexpand( &nzlmax, LSUB, 0, 0, Glu ); + usub = (int *) zexpand( &nzumax, USUB, 0, 1, Glu ); + } + + } else { + /* fact == SamePattern_SameRowPerm */ + Lstore = L->Store; + Ustore = U->Store; + xsup = Lstore->sup_to_col; + supno = Lstore->col_to_sup; + xlsub = Lstore->rowind_colptr; + xlusup = Lstore->nzval_colptr; + xusub = Ustore->colptr; + nzlmax = Glu->nzlmax; /* max from previous factorization */ + nzumax = Glu->nzumax; + nzlumax = Glu->nzlumax; + + if ( lwork == -1 ) { + return ( GluIntArray(n) * iword + TempSpace(m, panel_size) + + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n ); + } else if ( lwork == 0 ) { + Glu->MemModel = SYSTEM; + } else { + Glu->MemModel = USER; + Glu->stack.top2 = (lwork/4)*4; /* must be word-addressable */ + Glu->stack.size = Glu->stack.top2; + } + + lsub = Glu->expanders[LSUB].mem = Lstore->rowind; + lusup = Glu->expanders[LUSUP].mem = Lstore->nzval; + usub = Glu->expanders[USUB].mem = Ustore->rowind; + ucol = Glu->expanders[UCOL].mem = Ustore->nzval;; + Glu->expanders[LSUB].size = nzlmax; + Glu->expanders[LUSUP].size = nzlumax; + Glu->expanders[USUB].size = nzumax; + Glu->expanders[UCOL].size = nzumax; + } + + Glu->xsup = xsup; + Glu->supno = supno; + Glu->lsub = lsub; + Glu->xlsub = xlsub; + Glu->lusup = lusup; + Glu->xlusup = xlusup; + Glu->ucol = ucol; + Glu->usub = usub; + Glu->xusub = xusub; + Glu->nzlmax = nzlmax; + Glu->nzumax = nzumax; + Glu->nzlumax = nzlumax; + + info = zLUWorkInit(m, n, panel_size, iwork, dwork, Glu); + if ( info ) + return ( info + zmemory_usage(nzlmax, nzumax, nzlumax, n) + n); + + ++Glu->num_expansions; + return 0; + +} /* zLUMemInit */ + +/*! \brief Allocate known working storage. Returns 0 if success, otherwise + returns the number of bytes allocated so far when failure occurred. */ +int +zLUWorkInit(int m, int n, int panel_size, int **iworkptr, + doublecomplex **dworkptr, GlobalLU_t *Glu) +{ + int isize, dsize, extra; + doublecomplex *old_ptr; + int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), + rowblk = sp_ienv(4); + + isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int); + dsize = (m * panel_size + + NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(doublecomplex); + + if ( Glu->MemModel == SYSTEM ) + *iworkptr = (int *) intCalloc(isize/sizeof(int)); + else + *iworkptr = (int *) zuser_malloc(isize, TAIL, Glu); + if ( ! *iworkptr ) { + fprintf(stderr, "zLUWorkInit: malloc fails for local iworkptr[]\n"); + return (isize + n); + } + + if ( Glu->MemModel == SYSTEM ) + *dworkptr = (doublecomplex *) SUPERLU_MALLOC(dsize); + else { + *dworkptr = (doublecomplex *) zuser_malloc(dsize, TAIL, Glu); + if ( NotDoubleAlign(*dworkptr) ) { + old_ptr = *dworkptr; + *dworkptr = (doublecomplex*) DoubleAlign(*dworkptr); + *dworkptr = (doublecomplex*) ((double*)*dworkptr - 1); + extra = (char*)old_ptr - (char*)*dworkptr; +#ifdef DEBUG + printf("zLUWorkInit: not aligned, extra %d\n", extra); +#endif + Glu->stack.top2 -= extra; + Glu->stack.used += extra; + } + } + if ( ! *dworkptr ) { + fprintf(stderr, "malloc fails for local dworkptr[]."); + return (isize + dsize + n); + } + + return 0; +} + + +/*! \brief Set up pointers for real working arrays. + */ +void +zSetRWork(int m, int panel_size, doublecomplex *dworkptr, + doublecomplex **dense, doublecomplex **tempv) +{ + doublecomplex zero = {0.0, 0.0}; + + int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), + rowblk = sp_ienv(4); + *dense = dworkptr; + *tempv = *dense + panel_size*m; + zfill (*dense, m * panel_size, zero); + zfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); +} + +/*! \brief Free the working storage used by factor routines. + */ +void zLUWorkFree(int *iwork, doublecomplex *dwork, GlobalLU_t *Glu) +{ + if ( Glu->MemModel == SYSTEM ) { + SUPERLU_FREE (iwork); + SUPERLU_FREE (dwork); + } else { + Glu->stack.used -= (Glu->stack.size - Glu->stack.top2); + Glu->stack.top2 = Glu->stack.size; +/* zStackCompress(Glu); */ + } + + SUPERLU_FREE (Glu->expanders); + Glu->expanders = NULL; +} + +/*! \brief Expand the data structures for L and U during the factorization. + * + *
+ * Return value:   0 - successful return
+ *               > 0 - number of bytes allocated when run out of space
+ * 
+ */ +int +zLUMemXpand(int jcol, + int next, /* number of elements currently in the factors */ + MemType mem_type, /* which type of memory to expand */ + int *maxlen, /* modified - maximum length of a data structure */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + void *new_mem; + +#ifdef DEBUG + printf("zLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n", + jcol, next, *maxlen, mem_type); +#endif + + if (mem_type == USUB) + new_mem = zexpand(maxlen, mem_type, next, 1, Glu); + else + new_mem = zexpand(maxlen, mem_type, next, 0, Glu); + + if ( !new_mem ) { + int nzlmax = Glu->nzlmax; + int nzumax = Glu->nzumax; + int nzlumax = Glu->nzlumax; + fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); + return (zmemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n); + } + + switch ( mem_type ) { + case LUSUP: + Glu->lusup = (doublecomplex *) new_mem; + Glu->nzlumax = *maxlen; + break; + case UCOL: + Glu->ucol = (doublecomplex *) new_mem; + Glu->nzumax = *maxlen; + break; + case LSUB: + Glu->lsub = (int *) new_mem; + Glu->nzlmax = *maxlen; + break; + case USUB: + Glu->usub = (int *) new_mem; + Glu->nzumax = *maxlen; + break; + } + + return 0; + +} + + + +void +copy_mem_doublecomplex(int howmany, void *old, void *new) +{ + register int i; + doublecomplex *dold = old; + doublecomplex *dnew = new; + for (i = 0; i < howmany; i++) dnew[i] = dold[i]; +} + +/*! \brief Expand the existing storage to accommodate more fill-ins. + */ +void +*zexpand ( + int *prev_len, /* length used from previous call */ + MemType type, /* which part of the memory to expand */ + int len_to_copy, /* size of the memory to be copied to new store */ + int keep_prev, /* = 1: use prev_len; + = 0: compute new_len to expand */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + float EXPAND = 1.5; + float alpha; + void *new_mem, *old_mem; + int new_len, tries, lword, extra, bytes_to_copy; + ExpHeader *expanders = Glu->expanders; /* Array of 4 types of memory */ + + alpha = EXPAND; + + if ( Glu->num_expansions == 0 || keep_prev ) { + /* First time allocate requested */ + new_len = *prev_len; + } else { + new_len = alpha * *prev_len; + } + + if ( type == LSUB || type == USUB ) lword = sizeof(int); + else lword = sizeof(doublecomplex); + + if ( Glu->MemModel == SYSTEM ) { + new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); + if ( Glu->num_expansions != 0 ) { + tries = 0; + if ( keep_prev ) { + if ( !new_mem ) return (NULL); + } else { + while ( !new_mem ) { + if ( ++tries > 10 ) return (NULL); + alpha = Reduce(alpha); + new_len = alpha * *prev_len; + new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword); + } + } + if ( type == LSUB || type == USUB ) { + copy_mem_int(len_to_copy, expanders[type].mem, new_mem); + } else { + copy_mem_doublecomplex(len_to_copy, expanders[type].mem, new_mem); + } + SUPERLU_FREE (expanders[type].mem); + } + expanders[type].mem = (void *) new_mem; + + } else { /* MemModel == USER */ + if ( Glu->num_expansions == 0 ) { + new_mem = zuser_malloc(new_len * lword, HEAD, Glu); + if ( NotDoubleAlign(new_mem) && + (type == LUSUP || type == UCOL) ) { + old_mem = new_mem; + new_mem = (void *)DoubleAlign(new_mem); + extra = (char*)new_mem - (char*)old_mem; +#ifdef DEBUG + printf("expand(): not aligned, extra %d\n", extra); +#endif + Glu->stack.top1 += extra; + Glu->stack.used += extra; + } + expanders[type].mem = (void *) new_mem; + } else { + tries = 0; + extra = (new_len - *prev_len) * lword; + if ( keep_prev ) { + if ( StackFull(extra) ) return (NULL); + } else { + while ( StackFull(extra) ) { + if ( ++tries > 10 ) return (NULL); + alpha = Reduce(alpha); + new_len = alpha * *prev_len; + extra = (new_len - *prev_len) * lword; + } + } + + if ( type != USUB ) { + new_mem = (void*)((char*)expanders[type + 1].mem + extra); + bytes_to_copy = (char*)Glu->stack.array + Glu->stack.top1 + - (char*)expanders[type + 1].mem; + user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); + + if ( type < USUB ) { + Glu->usub = expanders[USUB].mem = + (void*)((char*)expanders[USUB].mem + extra); + } + if ( type < LSUB ) { + Glu->lsub = expanders[LSUB].mem = + (void*)((char*)expanders[LSUB].mem + extra); + } + if ( type < UCOL ) { + Glu->ucol = expanders[UCOL].mem = + (void*)((char*)expanders[UCOL].mem + extra); + } + Glu->stack.top1 += extra; + Glu->stack.used += extra; + if ( type == UCOL ) { + Glu->stack.top1 += extra; /* Add same amount for USUB */ + Glu->stack.used += extra; + } + + } /* if ... */ + + } /* else ... */ + } + + expanders[type].size = new_len; + *prev_len = new_len; + if ( Glu->num_expansions ) ++Glu->num_expansions; + + return (void *) expanders[type].mem; + +} /* zexpand */ + + +/*! \brief Compress the work[] array to remove fragmentation. + */ +void +zStackCompress(GlobalLU_t *Glu) +{ + register int iword, dword, ndim; + char *last, *fragment; + int *ifrom, *ito; + doublecomplex *dfrom, *dto; + int *xlsub, *lsub, *xusub, *usub, *xlusup; + doublecomplex *ucol, *lusup; + + iword = sizeof(int); + dword = sizeof(doublecomplex); + ndim = Glu->n; + + xlsub = Glu->xlsub; + lsub = Glu->lsub; + xusub = Glu->xusub; + usub = Glu->usub; + xlusup = Glu->xlusup; + ucol = Glu->ucol; + lusup = Glu->lusup; + + dfrom = ucol; + dto = (doublecomplex *)((char*)lusup + xlusup[ndim] * dword); + copy_mem_doublecomplex(xusub[ndim], dfrom, dto); + ucol = dto; + + ifrom = lsub; + ito = (int *) ((char*)ucol + xusub[ndim] * iword); + copy_mem_int(xlsub[ndim], ifrom, ito); + lsub = ito; + + ifrom = usub; + ito = (int *) ((char*)lsub + xlsub[ndim] * iword); + copy_mem_int(xusub[ndim], ifrom, ito); + usub = ito; + + last = (char*)usub + xusub[ndim] * iword; + fragment = (char*) (((char*)Glu->stack.array + Glu->stack.top1) - last); + Glu->stack.used -= (long int) fragment; + Glu->stack.top1 -= (long int) fragment; + + Glu->ucol = ucol; + Glu->lsub = lsub; + Glu->usub = usub; + +#ifdef DEBUG + printf("zStackCompress: fragment %d\n", fragment); + /* for (last = 0; last < ndim; ++last) + print_lu_col("After compress:", last, 0);*/ +#endif + +} + +/*! \brief Allocate storage for original matrix A + */ +void +zallocateA(int n, int nnz, doublecomplex **a, int **asub, int **xa) +{ + *a = (doublecomplex *) doublecomplexMalloc(nnz); + *asub = (int *) intMalloc(nnz); + *xa = (int *) intMalloc(n+1); +} + + +doublecomplex *doublecomplexMalloc(int n) +{ + doublecomplex *buf; + buf = (doublecomplex *) SUPERLU_MALLOC((size_t)n * sizeof(doublecomplex)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC failed for buf in doublecomplexMalloc()\n"); + } + return (buf); +} + +doublecomplex *doublecomplexCalloc(int n) +{ + doublecomplex *buf; + register int i; + doublecomplex zero = {0.0, 0.0}; + buf = (doublecomplex *) SUPERLU_MALLOC((size_t)n * sizeof(doublecomplex)); + if ( !buf ) { + ABORT_SuperLU("SUPERLU_MALLOC failed for buf in doublecomplexCalloc()\n"); + } + for (i = 0; i < n; ++i) buf[i] = zero; + return (buf); +} + + +int zmemory_usage(const int nzlmax, const int nzumax, + const int nzlumax, const int n) +{ + register int iword, dword; + + iword = sizeof(int); + dword = sizeof(doublecomplex); + + return (10 * n * iword + + nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword); + +} diff --git a/src/maths/SuperLU/zmyblas2.c b/src/maths/SuperLU/zmyblas2.c new file mode 100644 index 000000000..e50505433 --- /dev/null +++ b/src/maths/SuperLU/zmyblas2.c @@ -0,0 +1,188 @@ + +/*! @file zmyblas2.c + * \brief Level 2 Blas operations + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ * Purpose: + * Level 2 BLAS operations: solves and matvec, written in C. + * Note: + * This is only used when the system lacks an efficient BLAS library. + *
+ */ +/* + * File name: zmyblas2.c + */ +#include + +/*! \brief Solves a dense UNIT lower triangular system + * + * The unit lower + * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). + * The solution will be returned in the rhs vector. + */ +void zlsolve ( int ldm, int ncol, doublecomplex *M, doublecomplex *rhs ) +{ + int k; + doublecomplex x0, x1, x2, x3, temp; + doublecomplex *M0; + doublecomplex *Mki0, *Mki1, *Mki2, *Mki3; + register int firstcol = 0; + + M0 = &M[0]; + + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + + x0 = rhs[firstcol]; + zz_mult(&temp, &x0, Mki0); Mki0++; + z_sub(&x1, &rhs[firstcol+1], &temp); + zz_mult(&temp, &x0, Mki0); Mki0++; + z_sub(&x2, &rhs[firstcol+2], &temp); + zz_mult(&temp, &x1, Mki1); Mki1++; + z_sub(&x2, &x2, &temp); + zz_mult(&temp, &x0, Mki0); Mki0++; + z_sub(&x3, &rhs[firstcol+3], &temp); + zz_mult(&temp, &x1, Mki1); Mki1++; + z_sub(&x3, &x3, &temp); + zz_mult(&temp, &x2, Mki2); Mki2++; + z_sub(&x3, &x3, &temp); + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + ++firstcol; + + for (k = firstcol; k < ncol; k++) { + zz_mult(&temp, &x0, Mki0); Mki0++; + z_sub(&rhs[k], &rhs[k], &temp); + zz_mult(&temp, &x1, Mki1); Mki1++; + z_sub(&rhs[k], &rhs[k], &temp); + zz_mult(&temp, &x2, Mki2); Mki2++; + z_sub(&rhs[k], &rhs[k], &temp); + zz_mult(&temp, &x3, Mki3); Mki3++; + z_sub(&rhs[k], &rhs[k], &temp); + } + + M0 += 4 * ldm + 4; + } + + if ( firstcol < ncol - 1 ) { /* Do 2 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + + x0 = rhs[firstcol]; + zz_mult(&temp, &x0, Mki0); Mki0++; + z_sub(&x1, &rhs[firstcol+1], &temp); + + rhs[++firstcol] = x1; + ++firstcol; + + for (k = firstcol; k < ncol; k++) { + zz_mult(&temp, &x0, Mki0); Mki0++; + z_sub(&rhs[k], &rhs[k], &temp); + zz_mult(&temp, &x1, Mki1); Mki1++; + z_sub(&rhs[k], &rhs[k], &temp); + } + } + +} + +/*! \brief Solves a dense upper triangular system. + * + * The upper triangular matrix is + * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned + * in the rhs vector. + */ +void +zusolve ( ldm, ncol, M, rhs ) +int ldm; /* in */ +int ncol; /* in */ +doublecomplex *M; /* in */ +doublecomplex *rhs; /* modified */ +{ + doublecomplex xj, temp; + int jcol, j, irow; + + jcol = ncol - 1; + + for (j = 0; j < ncol; j++) { + + z_div(&xj, &rhs[jcol], &M[jcol + jcol*ldm]); /* M(jcol, jcol) */ + rhs[jcol] = xj; + + for (irow = 0; irow < jcol; irow++) { + zz_mult(&temp, &xj, &M[irow+jcol*ldm]); /* M(irow, jcol) */ + z_sub(&rhs[irow], &rhs[irow], &temp); + } + + jcol--; + + } +} + + +/*! \brief Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. + * + * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. + */ +void zmatvec ( ldm, nrow, ncol, M, vec, Mxvec ) +int ldm; /* in -- leading dimension of M */ +int nrow; /* in */ +int ncol; /* in */ +doublecomplex *M; /* in */ +doublecomplex *vec; /* in */ +doublecomplex *Mxvec; /* in/out */ +{ + doublecomplex vi0, vi1, vi2, vi3; + doublecomplex *M0, temp; + doublecomplex *Mki0, *Mki1, *Mki2, *Mki3; + register int firstcol = 0; + int k; + + M0 = &M[0]; + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + for (k = 0; k < nrow; k++) { + zz_mult(&temp, &vi0, Mki0); Mki0++; + z_add(&Mxvec[k], &Mxvec[k], &temp); + zz_mult(&temp, &vi1, Mki1); Mki1++; + z_add(&Mxvec[k], &Mxvec[k], &temp); + zz_mult(&temp, &vi2, Mki2); Mki2++; + z_add(&Mxvec[k], &Mxvec[k], &temp); + zz_mult(&temp, &vi3, Mki3); Mki3++; + z_add(&Mxvec[k], &Mxvec[k], &temp); + } + + M0 += 4 * ldm; + } + + while ( firstcol < ncol ) { /* Do 1 column */ + Mki0 = M0; + vi0 = vec[firstcol++]; + for (k = 0; k < nrow; k++) { + zz_mult(&temp, &vi0, Mki0); Mki0++; + z_add(&Mxvec[k], &Mxvec[k], &temp); + } + M0 += ldm; + } + +} + diff --git a/src/maths/SuperLU/zpanel_bmod.c b/src/maths/SuperLU/zpanel_bmod.c new file mode 100644 index 000000000..39aead642 --- /dev/null +++ b/src/maths/SuperLU/zpanel_bmod.c @@ -0,0 +1,487 @@ + +/*! @file zpanel_bmod.c + * \brief Performs numeric block updates + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ +/* + +*/ + +#include +#include +#include + +/* + * Function prototypes + */ +void zlsolve(int, int, doublecomplex *, doublecomplex *); +void zmatvec(int, int, int, doublecomplex *, doublecomplex *, doublecomplex *); +extern void zcheck_tempv(); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ *    Performs numeric block updates (sup-panel) in topological order.
+ *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
+ *    Special processing on the supernodal portion of L\U[*,j]
+ *
+ *    Before entering this routine, the original nonzeros in the panel 
+ *    were already copied into the spa[m,w].
+ *
+ *    Updated/Output parameters-
+ *    dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned 
+ *    collectively in the m-by-w vector dense[*]. 
+ * 
+ */ + +void +zpanel_bmod ( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + const int nseg, /* in */ + doublecomplex *dense, /* out, of size n by w */ + doublecomplex *tempv, /* working array */ + int *segrep, /* in */ + int *repfnz, /* in, of size n by w */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ + + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + doublecomplex alpha, beta; +#endif + + register int k, ksub; + int fsupc, nsupc, nsupr, nrow; + int krep, krep_ind; + doublecomplex ukj, ukj1, ukj2; + int luptr, luptr1, luptr2; + int segsze; + int block_nrow; /* no of rows in a block row */ + register int lptr; /* Points to the row subscripts of a supernode */ + int kfnz, irow, no_zeros; + register int isub, isub1, i; + register int jj; /* Index through each column in the panel */ + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + int *repfnz_col; /* repfnz[] for a column in the panel */ + doublecomplex *dense_col; /* dense[] for a column in the panel */ + doublecomplex *tempv1; /* Used in 1-D update */ + doublecomplex *TriTmp, *MatvecTmp; /* used in 2-D update */ + doublecomplex zero = {0.0, 0.0}; + doublecomplex one = {1.0, 0.0}; + doublecomplex comp_temp, comp_temp1; + register int ldaTmp; + register int r_ind, r_hi; + static int first = 1, maxsuper, rowblk, colblk; + flops_t *ops = stat->ops; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + if ( first ) { + maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ); + rowblk = sp_ienv(4); + colblk = sp_ienv(5); + first = 0; + } + ldaTmp = maxsuper + rowblk; + + /* + * For each nonz supernode segment of U[*,j] in topological order + */ + k = nseg - 1; + for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ + + /* krep = representative of current k-th supernode + * fsupc = first supernodal column + * nsupc = no of columns in a supernode + * nsupr = no of rows in a supernode + */ + krep = segrep[k--]; + fsupc = xsup[supno[krep]]; + nsupc = krep - fsupc + 1; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; + nrow = nsupr - nsupc; + lptr = xlsub[fsupc]; + krep_ind = lptr + nsupc - 1; + + repfnz_col = repfnz; + dense_col = dense; + + if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ + + TriTmp = tempv; + + /* Sequence through each column in panel -- triangular solves */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + luptr = xlusup[fsupc]; + + ops[TRSV] += 4 * segsze * (segsze - 1); + ops[GEMV] += 8 * nrow * segsze; + + /* Case 1: Update U-segment of size 1 -- col-col update */ + if ( segsze == 1 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { + irow = lsub[i]; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + ++luptr; + } + + } else if ( segsze <= 3 ) { + ukj = dense_col[lsub[krep_ind]]; + ukj1 = dense_col[lsub[krep_ind - 1]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + z_sub(&ukj, &ukj, &comp_temp); + dense_col[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; luptr1++; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + } + } else { + ukj2 = dense_col[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); + z_sub(&ukj1, &ukj1, &comp_temp); + + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&ukj, &ukj, &comp_temp); + dense_col[lsub[krep_ind]] = ukj; + dense_col[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + luptr++; luptr1++; luptr2++; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + } + } + + } else { /* segsze >= 4 */ + + /* Copy U[*,j] segment from dense[*] to TriTmp[*], which + holds the result of triangular solves. */ + no_zeros = kfnz - fsupc; + isub = lptr + no_zeros; + for (i = 0; i < segsze; ++i) { + irow = lsub[isub]; + TriTmp[i] = dense_col[irow]; /* Gather */ + ++isub; + } + + /* start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, TriTmp, &incx ); +#else + ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, TriTmp, &incx ); +#endif +#else + zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); +#endif + + + } /* else ... */ + + } /* for jj ... end tri-solves */ + + /* Block row updates; push all the way into dense[*] block */ + for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { + + r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); + block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); + luptr = xlusup[fsupc] + nsupc + r_ind; + isub1 = lptr + nsupc + r_ind; + + repfnz_col = repfnz; + TriTmp = tempv; + dense_col = dense; + + /* Sequence through each column in panel -- matrix-vector */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + if ( segsze <= 3 ) continue; /* skip unrolled cases */ + + /* Perform a block update, and scatter the result of + matrix-vector to dense[]. */ + no_zeros = kfnz - fsupc; + luptr1 = luptr + nsupr * no_zeros; + MatvecTmp = &TriTmp[maxsuper]; + +#ifdef USE_VENDOR_BLAS + alpha = one; + beta = zero; +#ifdef _CRAY + CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], + &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); +#else + zgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], + &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); +#endif +#else + zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], + TriTmp, MatvecTmp); +#endif + + /* Scatter MatvecTmp[*] into SPA dense[*] temporarily + * such that MatvecTmp[*] can be re-used for the + * the next blok row update. dense[] will be copied into + * global store after the whole panel has been finished. + */ + isub = isub1; + for (i = 0; i < block_nrow; i++) { + irow = lsub[isub]; + z_sub(&dense_col[irow], &dense_col[irow], + &MatvecTmp[i]); + MatvecTmp[i] = zero; + ++isub; + } + + } /* for jj ... */ + + } /* for each block row ... */ + + /* Scatter the triangular solves into SPA dense[*] */ + repfnz_col = repfnz; + TriTmp = tempv; + dense_col = dense; + + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + if ( segsze <= 3 ) continue; /* skip unrolled cases */ + + no_zeros = kfnz - fsupc; + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense_col[irow] = TriTmp[i]; + TriTmp[i] = zero; + ++isub; + } + + } /* for jj ... */ + + } else { /* 1-D block modification */ + + + /* Sequence through each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++, + repfnz_col += m, dense_col += m) { + + kfnz = repfnz_col[krep]; + if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ + + segsze = krep - kfnz + 1; + luptr = xlusup[fsupc]; + + ops[TRSV] += 4 * segsze * (segsze - 1); + ops[GEMV] += 8 * nrow * segsze; + + /* Case 1: Update U-segment of size 1 -- col-col update */ + if ( segsze == 1 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc; + + for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { + irow = lsub[i]; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + ++luptr; + } + + } else if ( segsze <= 3 ) { + ukj = dense_col[lsub[krep_ind]]; + luptr += nsupr*(nsupc-1) + nsupc-1; + ukj1 = dense_col[lsub[krep_ind - 1]]; + luptr1 = luptr - nsupr; + + if ( segsze == 2 ) { + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + z_sub(&ukj, &ukj, &comp_temp); + dense_col[lsub[krep_ind]] = ukj; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + ++luptr; ++luptr1; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + } + } else { + ukj2 = dense_col[lsub[krep_ind - 2]]; + luptr2 = luptr1 - nsupr; + zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); + z_sub(&ukj1, &ukj1, &comp_temp); + + zz_mult(&comp_temp, &ukj1, &lusup[luptr1]); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&ukj, &ukj, &comp_temp); + dense_col[lsub[krep_ind]] = ukj; + dense_col[lsub[krep_ind-1]] = ukj1; + for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + ++luptr; ++luptr1; ++luptr2; + zz_mult(&comp_temp, &ukj, &lusup[luptr]); + zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]); + z_add(&comp_temp, &comp_temp, &comp_temp1); + z_sub(&dense_col[irow], &dense_col[irow], &comp_temp); + } + } + + } else { /* segsze >= 4 */ + /* + * Perform a triangular solve and block update, + * then scatter the result of sup-col update to dense[]. + */ + no_zeros = kfnz - fsupc; + + /* Copy U[*,j] segment from dense[*] to tempv[*]: + * The result of triangular solve is in tempv[*]; + * The result of matrix vector update is in dense_col[*] + */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; ++i) { + irow = lsub[isub]; + tempv[i] = dense_col[irow]; /* Gather */ + ++isub; + } + + /* start effective triangle */ + luptr += nsupr * no_zeros + no_zeros; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#else + ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], + &nsupr, tempv, &incx ); +#endif + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + alpha = one; + beta = zero; +#ifdef _CRAY + CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#else + zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], + &nsupr, tempv, &incx, &beta, tempv1, &incy ); +#endif +#else + zlsolve ( nsupr, segsze, &lusup[luptr], tempv ); + + luptr += segsze; /* Dense matrix-vector */ + tempv1 = &tempv[segsze]; + zmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); +#endif + + /* Scatter tempv[*] into SPA dense[*] temporarily, such + * that tempv[*] can be used for the triangular solve of + * the next column of the panel. They will be copied into + * ucol[*] after the whole panel has been finished. + */ + isub = lptr + no_zeros; + for (i = 0; i < segsze; i++) { + irow = lsub[isub]; + dense_col[irow] = tempv[i]; + tempv[i] = zero; + isub++; + } + + /* Scatter the update from tempv1[*] into SPA dense[*] */ + /* Start dense rectangular L */ + for (i = 0; i < nrow; i++) { + irow = lsub[isub]; + z_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]); + tempv1[i] = zero; + ++isub; + } + + } /* else segsze>=4 ... */ + + } /* for each column in the panel... */ + + } /* else 1-D update ... */ + + } /* for each updating supernode ... */ + +} + + + diff --git a/src/maths/SuperLU/zpanel_dfs.c b/src/maths/SuperLU/zpanel_dfs.c new file mode 100644 index 000000000..e160a2f2a --- /dev/null +++ b/src/maths/SuperLU/zpanel_dfs.c @@ -0,0 +1,254 @@ + +/*! @file zpanel_dfs.c + * \brief Peforms a symbolic factorization on a panel of symbols + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ *   Performs a symbolic factorization on a panel of columns [jcol, jcol+w).
+ *
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives.
+ *
+ *   The routine returns one list of the supernodal representatives
+ *   in topological order of the dfs that generates them. This list is
+ *   a superset of the topological order of each individual column within
+ *   the panel. 
+ *   The location of the first nonzero in each supernodal segment
+ *   (supernodal entry location) is also returned. Each column has a 
+ *   separate list for this purpose.
+ *
+ *   Two marker arrays are used for dfs:
+ *     marker[i] == jj, if i was visited during dfs of current column jj;
+ *     marker1[i] >= jcol, if i was visited by earlier columns in this panel;
+ *
+ *   marker: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ * 
+ */ + +void +zpanel_dfs ( + const int m, /* in - number of rows in the matrix */ + const int w, /* in */ + const int jcol, /* in */ + SuperMatrix *A, /* in - original matrix */ + int *perm_r, /* in */ + int *nseg, /* out */ + doublecomplex *dense, /* out */ + int *panel_lsub, /* out */ + int *segrep, /* out */ + int *repfnz, /* out */ + int *xprune, /* out */ + int *marker, /* out */ + int *parent, /* working array */ + int *xplore, /* working array */ + GlobalLU_t *Glu /* modified */ + ) +{ + + NCPformat *Astore; + doublecomplex *a; + int *asub; + int *xa_begin, *xa_end; + int krep, chperm, chmark, chrep, oldrep, kchild, myfnz; + int k, krow, kmark, kperm; + int xdfs, maxdfs, kpar; + int jj; /* index through each column in the panel */ + int *marker1; /* marker1[jj] >= jcol if vertex jj was visited + by a previous column within this panel. */ + int *repfnz_col; /* start of each column in the panel */ + doublecomplex *dense_col; /* start of each column in the panel */ + int nextl_col; /* next available position in panel_lsub[*,jj] */ + int *xsup, *supno; + int *lsub, *xlsub; + + /* Initialize pointers */ + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; + marker1 = marker + m; + repfnz_col = repfnz; + dense_col = dense; + *nseg = 0; + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + + /* For each column in the panel */ + for (jj = jcol; jj < jcol + w; jj++) { + nextl_col = (jj - jcol) * m; + +#ifdef CHK_DFS + printf("\npanel col %d: ", jj); +#endif + + /* For each nonz in A[*,jj] do dfs */ + for (k = xa_begin[jj]; k < xa_end[jj]; k++) { + krow = asub[k]; + dense_col[krow] = a[k]; + kmark = marker[krow]; + if ( kmark == jj ) + continue; /* krow visited before, go to the next nonzero */ + + /* For each unmarked nbr krow of jj + * krow is in L: place it in structure of L[*,jj] + */ + marker[krow] = jj; + kperm = perm_r[krow]; + + if ( kperm == EMPTY ) { + panel_lsub[nextl_col++] = krow; /* krow is indexed into A */ + } + /* + * krow is in U: if its supernode-rep krep + * has been explored, update repfnz[*] + */ + else { + + krep = xsup[supno[kperm]+1] - 1; + myfnz = repfnz_col[krep]; + +#ifdef CHK_DFS + printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm); +#endif + if ( myfnz != EMPTY ) { /* Representative visited before */ + if ( myfnz > kperm ) repfnz_col[krep] = kperm; + /* continue; */ + } + else { + /* Otherwise, perform dfs starting at krep */ + oldrep = EMPTY; + parent[krep] = oldrep; + repfnz_col[krep] = kperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; + +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + do { + /* + * For each unmarked kchild of krep + */ + while ( xdfs < maxdfs ) { + + kchild = lsub[xdfs]; + xdfs++; + chmark = marker[kchild]; + + if ( chmark != jj ) { /* Not reached yet */ + marker[kchild] = jj; + chperm = perm_r[kchild]; + + /* Case kchild is in L: place it in L[*,j] */ + if ( chperm == EMPTY ) { + panel_lsub[nextl_col++] = kchild; + } + /* Case kchild is in U: + * chrep = its supernode-rep. If its rep has + * been explored, update its repfnz[*] + */ + else { + + chrep = xsup[supno[chperm]+1] - 1; + myfnz = repfnz_col[chrep]; +#ifdef CHK_DFS + printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm); +#endif + if ( myfnz != EMPTY ) { /* Visited before */ + if ( myfnz > chperm ) + repfnz_col[chrep] = chperm; + } + else { + /* Cont. dfs at snode-rep of kchild */ + xplore[krep] = xdfs; + oldrep = krep; + krep = chrep; /* Go deeper down G(L) */ + parent[krep] = oldrep; + repfnz_col[krep] = chperm; + xdfs = xlsub[krep]; + maxdfs = xprune[krep]; +#ifdef CHK_DFS + printf(" xdfs %d, maxdfs %d: ", xdfs, maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } /* else */ + + } /* else */ + + } /* if... */ + + } /* while xdfs < maxdfs */ + + /* krow has no more unexplored nbrs: + * Place snode-rep krep in postorder DFS, if this + * segment is seen for the first time. (Note that + * "repfnz[krep]" may change later.) + * Backtrack dfs to its parent. + */ + if ( marker1[krep] < jcol ) { + segrep[*nseg] = krep; + ++(*nseg); + marker1[krep] = jj; + } + + kpar = parent[krep]; /* Pop stack, mimic recursion */ + if ( kpar == EMPTY ) break; /* dfs done */ + krep = kpar; + xdfs = xplore[krep]; + maxdfs = xprune[krep]; + +#ifdef CHK_DFS + printf(" pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs); + for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]); + printf("\n"); +#endif + } while ( kpar != EMPTY ); /* do-while - until empty stack */ + + } /* else */ + + } /* else */ + + } /* for each nonz in A[*,jj] */ + + repfnz_col += m; /* Move to next column */ + dense_col += m; + + } /* for jj ... */ + +} diff --git a/src/maths/SuperLU/zpivotL.c b/src/maths/SuperLU/zpivotL.c new file mode 100644 index 000000000..59af91c72 --- /dev/null +++ b/src/maths/SuperLU/zpivotL.c @@ -0,0 +1,185 @@ + +/*! @file zpivotL.c + * \brief Performs numerical pivoting + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include +#include +#include + +#undef DEBUG + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Performs the numerical pivoting on the current column of L,
+ *   and the CDIV operation.
+ *
+ *   Pivot policy:
+ *   (1) Compute thresh = u * max_(i>=j) abs(A_ij);
+ *   (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
+ *           pivot row = k;
+ *       ELSE IF abs(A_jj) >= thresh THEN
+ *           pivot row = j;
+ *       ELSE
+ *           pivot row = m;
+ * 
+ *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
+ *
+ *   Return value: 0      success;
+ *                 i > 0  U(i,i) is exactly zero.
+ * 
+ */ + +int +zpivotL( + const int jcol, /* in */ + const double u, /* in - diagonal pivoting threshold */ + int *usepr, /* re-use the pivot sequence given by perm_r/iperm_r */ + int *perm_r, /* may be modified */ + int *iperm_r, /* in - inverse of perm_r */ + int *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */ + int *pivrow, /* out */ + GlobalLU_t *Glu, /* modified - global LU data structures */ + SuperLUStat_t *stat /* output */ + ) +{ + + doublecomplex one = {1.0, 0.0}; + int fsupc; /* first column in the supernode */ + int nsupc; /* no of columns in the supernode */ + int nsupr; /* no of rows in the supernode */ + int lptr; /* points to the starting subscript of the supernode */ + int pivptr, old_pivptr, diag, diagind; + double pivmax, rtemp, thresh; + doublecomplex temp; + doublecomplex *lu_sup_ptr; + doublecomplex *lu_col_ptr; + int *lsub_ptr; + int isub, icol, k, itemp; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + flops_t *ops = stat->ops; + + /* Initialize pointers */ + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; + nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ + lptr = xlsub[fsupc]; + nsupr = xlsub[fsupc+1] - lptr; + lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ + lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ + lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ + +#ifdef DEBUG +if ( jcol == MIN_COL ) { + printf("Before cdiv: col %d\n", jcol); + for (k = nsupc; k < nsupr; k++) + printf(" lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]); +} +#endif + + /* Determine the largest abs numerical value for partial pivoting; + Also search for user-specified pivot, and diagonal element. */ + if ( *usepr ) *pivrow = iperm_r[jcol]; + diagind = iperm_c[jcol]; + pivmax = 0.0; + pivptr = nsupc; + diag = EMPTY; + old_pivptr = nsupc; + for (isub = nsupc; isub < nsupr; ++isub) { + rtemp = z_abs1 (&lu_col_ptr[isub]); + if ( rtemp > pivmax ) { + pivmax = rtemp; + pivptr = isub; + } + if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub; + if ( lsub_ptr[isub] == diagind ) diag = isub; + } + + /* Test for singularity */ + if ( pivmax == 0.0 ) { +#if 1 + *pivrow = lsub_ptr[pivptr]; + perm_r[*pivrow] = jcol; +#else + perm_r[diagind] = jcol; +#endif + *usepr = 0; + return (jcol+1); + } + + thresh = u * pivmax; + + /* Choose appropriate pivotal element by our policy. */ + if ( *usepr ) { + rtemp = z_abs1 (&lu_col_ptr[old_pivptr]); + if ( rtemp != 0.0 && rtemp >= thresh ) + pivptr = old_pivptr; + else + *usepr = 0; + } + if ( *usepr == 0 ) { + /* Use diagonal pivot? */ + if ( diag >= 0 ) { /* diagonal exists */ + rtemp = z_abs1 (&lu_col_ptr[diag]); + if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; + } + *pivrow = lsub_ptr[pivptr]; + } + + /* Record pivot row */ + perm_r[*pivrow] = jcol; + + /* Interchange row subscripts */ + if ( pivptr != nsupc ) { + itemp = lsub_ptr[pivptr]; + lsub_ptr[pivptr] = lsub_ptr[nsupc]; + lsub_ptr[nsupc] = itemp; + + /* Interchange numerical values as well, for the whole snode, such + * that L is indexed the same way as A. + */ + for (icol = 0; icol <= nsupc; icol++) { + itemp = pivptr + icol * nsupr; + temp = lu_sup_ptr[itemp]; + lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; + lu_sup_ptr[nsupc + icol*nsupr] = temp; + } + } /* if */ + + /* cdiv operation */ + ops[FACT] += 10 * (nsupr - nsupc); + + z_div(&temp, &one, &lu_col_ptr[nsupc]); + for (k = nsupc+1; k < nsupr; k++) + zz_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp); + + return 0; +} + diff --git a/src/maths/SuperLU/zpivotgrowth.c b/src/maths/SuperLU/zpivotgrowth.c new file mode 100644 index 000000000..ecf9993b7 --- /dev/null +++ b/src/maths/SuperLU/zpivotgrowth.c @@ -0,0 +1,114 @@ + +/*! @file zpivotgrowth.c + * \brief Computes the reciprocal pivot growth factor + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ */ +#include +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * Compute the reciprocal pivot growth factor of the leading ncols columns
+ * of the matrix, using the formula:
+ *     min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) )
+ *
+ * Arguments
+ * =========
+ *
+ * ncols    (input) int
+ *          The number of columns of matrices A, L and U.
+ *
+ * A        (input) SuperMatrix*
+ *	    Original matrix A, permuted by columns, of dimension
+ *          (A->nrow, A->ncol). The type of A can be:
+ *          Stype = NC; Dtype = SLU_Z; Mtype = GE.
+ *
+ * L        (output) SuperMatrix*
+ *          The factor L from the factorization Pr*A=L*U; use compressed row 
+ *          subscripts storage for supernodes, i.e., L has type: 
+ *          Stype = SC; Dtype = SLU_Z; Mtype = TRLU.
+ *
+ * U        (output) SuperMatrix*
+ *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
+ *          storage scheme, i.e., U has types: Stype = NC;
+ *          Dtype = SLU_Z; Mtype = TRU.
+ * 
+ */ + +double +zPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, + SuperMatrix *L, SuperMatrix *U) +{ + + NCformat *Astore; + SCformat *Lstore; + NCformat *Ustore; + doublecomplex *Aval, *Lval, *Uval; + int fsupc, nsupr, luptr, nz_in_U; + int i, j, k, oldcol; + int *inv_perm_c; + double rpg, maxaj, maxuj; + double smlnum; + doublecomplex *luval; + doublecomplex temp_comp; + + /* Get machine constants. */ + smlnum = dlamch_("S"); + rpg = 1. / smlnum; + + Astore = A->Store; + Lstore = L->Store; + Ustore = U->Store; + Aval = Astore->nzval; + Lval = Lstore->nzval; + Uval = Ustore->nzval; + + inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int)); + for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j; + + for (k = 0; k <= Lstore->nsuper; ++k) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + luptr = L_NZ_START(fsupc); + luval = &Lval[luptr]; + nz_in_U = 1; + + for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) { + maxaj = 0.; + oldcol = inv_perm_c[j]; + for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i) + maxaj = SUPERLU_MAX( maxaj, z_abs1( &Aval[i]) ); + + maxuj = 0.; + for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++) + maxuj = SUPERLU_MAX( maxuj, z_abs1( &Uval[i]) ); + + /* Supernode */ + for (i = 0; i < nz_in_U; ++i) + maxuj = SUPERLU_MAX( maxuj, z_abs1( &luval[i]) ); + + ++nz_in_U; + luval += nsupr; + + if ( maxuj == 0. ) + rpg = SUPERLU_MIN( rpg, 1.); + else + rpg = SUPERLU_MIN( rpg, maxaj / maxuj ); + } + + if ( j >= ncols ) break; + } + + SUPERLU_FREE(inv_perm_c); + return (rpg); +} diff --git a/src/maths/SuperLU/zpruneL.c b/src/maths/SuperLU/zpruneL.c new file mode 100644 index 000000000..e16099b89 --- /dev/null +++ b/src/maths/SuperLU/zpruneL.c @@ -0,0 +1,154 @@ + +/*! @file zpruneL.c + * \brief Prunes the L-structure + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ *
+ */ + + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Prunes the L-structure of supernodes whose L-structure
+ *   contains the current pivot row "pivrow"
+ * 
+ */ + +void +zpruneL( + const int jcol, /* in */ + const int *perm_r, /* in */ + const int pivrow, /* in */ + const int nseg, /* in */ + const int *segrep, /* in */ + const int *repfnz, /* in */ + int *xprune, /* out */ + GlobalLU_t *Glu /* modified - global LU data structures */ + ) +{ + + doublecomplex utemp; + int jsupno, irep, irep1, kmin, kmax, krow, movnum; + int i, ktemp, minloc, maxloc; + int do_prune; /* logical variable */ + int *xsup, *supno; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + /* + * For each supernode-rep irep in U[*,j] + */ + jsupno = supno[jcol]; + for (i = 0; i < nseg; i++) { + + irep = segrep[i]; + irep1 = irep + 1; + do_prune = FALSE; + + /* Don't prune with a zero U-segment */ + if ( repfnz[irep] == EMPTY ) + continue; + + /* If a snode overlaps with the next panel, then the U-segment + * is fragmented into two parts -- irep and irep1. We should let + * pruning occur at the rep-column in irep1's snode. + */ + if ( supno[irep] == supno[irep1] ) /* Don't prune */ + continue; + + /* + * If it has not been pruned & it has a nonz in row L[pivrow,i] + */ + if ( supno[irep] != jsupno ) { + if ( xprune[irep] >= xlsub[irep1] ) { + kmin = xlsub[irep]; + kmax = xlsub[irep1] - 1; + for (krow = kmin; krow <= kmax; krow++) + if ( lsub[krow] == pivrow ) { + do_prune = TRUE; + break; + } + } + + if ( do_prune ) { + + /* Do a quicksort-type partition + * movnum=TRUE means that the num values have to be exchanged. + */ + movnum = FALSE; + if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */ + movnum = TRUE; + + while ( kmin <= kmax ) { + + if ( perm_r[lsub[kmax]] == EMPTY ) + kmax--; + else if ( perm_r[lsub[kmin]] != EMPTY ) + kmin++; + else { /* kmin below pivrow (not yet pivoted), and kmax + * above pivrow: interchange the two subscripts + */ + ktemp = lsub[kmin]; + lsub[kmin] = lsub[kmax]; + lsub[kmax] = ktemp; + + /* If the supernode has only one column, then we + * only keep one set of subscripts. For any subscript + * interchange performed, similar interchange must be + * done on the numerical values. + */ + if ( movnum ) { + minloc = xlusup[irep] + (kmin - xlsub[irep]); + maxloc = xlusup[irep] + (kmax - xlsub[irep]); + utemp = lusup[minloc]; + lusup[minloc] = lusup[maxloc]; + lusup[maxloc] = utemp; + } + + kmin++; + kmax--; + + } + + } /* while */ + + xprune[irep] = kmin; /* Pruning */ + +#ifdef CHK_PRUNE + printf(" After zpruneL(),using col %d: xprune[%d] = %d\n", + jcol, irep, kmin); +#endif + } /* if do_prune */ + + } /* if */ + + } /* for each U-segment... */ +} diff --git a/src/maths/SuperLU/zreadhb.c b/src/maths/SuperLU/zreadhb.c new file mode 100644 index 000000000..90debee41 --- /dev/null +++ b/src/maths/SuperLU/zreadhb.c @@ -0,0 +1,267 @@ + +/*! @file zreadhb.c + * \brief Read a matrix stored in Harwell-Boeing format + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Purpose
+ * =======
+ * 
+ * Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format 
+ * as described below.
+ * 
+ * Line 1 (A72,A8) 
+ *  	Col. 1 - 72   Title (TITLE) 
+ *	Col. 73 - 80  Key (KEY) 
+ * 
+ * Line 2 (5I14) 
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
+ *                    (including starting guesses and solution vectors 
+ *		       if present) 
+ *           	      (zero indicates no right-hand side data is present) 
+ *
+ * Line 3 (A3, 11X, 4I14) 
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
+ *	              (equal to number of entries for assembled matrices) 
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
+ *	              (zero in the case of assembled matrices) 
+ * Line 4 (2A16, 2A20) 
+ * 	Col. 1 - 16   Format for pointers (PTRFMT) 
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
+ *    	Col. 1 	      Right-hand side type: 
+ *	         	  F for full storage or M for same format as matrix 
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
+ *    	Col. 3        X if an exact solution vector(s) is supplied. 
+ *	Col. 15 - 28  Number of right-hand sides (NRHS) 
+ *	Col. 29 - 42  Number of row indices (NRHSIX) 
+ *          	      (ignored in case of unassembled matrices) 
+ *
+ * The three character type field on line 3 describes the matrix type. 
+ * The following table lists the permitted values for each of the three 
+ * characters. As an example of the type field, RSA denotes that the matrix 
+ * is real, symmetric, and assembled. 
+ *
+ * First Character: 
+ *	R Real matrix 
+ *	C Complex matrix 
+ *	P Pattern only (no numerical values supplied) 
+ *
+ * Second Character: 
+ *	S Symmetric 
+ *	U Unsymmetric 
+ *	H Hermitian 
+ *	Z Skew symmetric 
+ *	R Rectangular 
+ *
+ * Third Character: 
+ *	A Assembled 
+ *	E Elemental matrices (unassembled) 
+ *
+ * 
+ */ +#include +#include +#include + + +/*! \brief Eat up the rest of the current line */ +int zDumpLine(FILE *fp) +{ + register int c; + while ((c = fgetc(fp)) != '\n') ; + return 0; +} + +int zParseIntFormat(char *buf, int *num, int *size) +{ + char *tmp; + + tmp = buf; + while (*tmp++ != '(') ; + sscanf(tmp, "%d", num); + while (*tmp != 'I' && *tmp != 'i') ++tmp; + ++tmp; + sscanf(tmp, "%d", size); + return 0; +} + +int zParseFloatFormat(char *buf, int *num, int *size) +{ + char *tmp, *period; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' + && *tmp != 'F' && *tmp != 'f') { + /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the + num picked up refers to P, which should be skipped. */ + if (*tmp=='p' || *tmp=='P') { + ++tmp; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + } else { + ++tmp; + } + } + ++tmp; + period = tmp; + while (*period != '.' && *period != ')') ++period ; + *period = '\0'; + *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ + + return 0; +} + +static int ReadVector(FILE *fp, int n, int *where, int perline, int persize) +{ + register int i, j, item; + char tmp, buf[100]; + + i = 0; + while (i < n) { + fgets(buf, 100, fp); /* read a line at a time */ + for (j=0; j + * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + * + * + * Purpose + * ======= + * + * Read a DOUBLE COMPLEX PRECISION matrix stored in Rutherford-Boeing format + * as described below. + * + * Line 1 (A72, A8) + * Col. 1 - 72 Title (TITLE) + * Col. 73 - 80 Matrix name / identifier (MTRXID) + * + * Line 2 (I14, 3(1X, I13)) + * Col. 1 - 14 Total number of lines excluding header (TOTCRD) + * Col. 16 - 28 Number of lines for pointers (PTRCRD) + * Col. 30 - 42 Number of lines for row (or variable) indices (INDCRD) + * Col. 44 - 56 Number of lines for numerical values (VALCRD) + * + * Line 3 (A3, 11X, 4(1X, I13)) + * Col. 1 - 3 Matrix type (see below) (MXTYPE) + * Col. 15 - 28 Compressed Column: Number of rows (NROW) + * Elemental: Largest integer used to index variable (MVAR) + * Col. 30 - 42 Compressed Column: Number of columns (NCOL) + * Elemental: Number of element matrices (NELT) + * Col. 44 - 56 Compressed Column: Number of entries (NNZERO) + * Elemental: Number of variable indeces (NVARIX) + * Col. 58 - 70 Compressed Column: Unused, explicitly zero + * Elemental: Number of elemental matrix entries (NELTVL) + * + * Line 4 (2A16, A20) + * Col. 1 - 16 Fortran format for pointers (PTRFMT) + * Col. 17 - 32 Fortran format for row (or variable) indices (INDFMT) + * Col. 33 - 52 Fortran format for numerical values of coefficient matrix + * (VALFMT) + * (blank in the case of matrix patterns) + * + * The three character type field on line 3 describes the matrix type. + * The following table lists the permitted values for each of the three + * characters. As an example of the type field, RSA denotes that the matrix + * is real, symmetric, and assembled. + * + * First Character: + * R Real matrix + * C Complex matrix + * I integer matrix + * P Pattern only (no numerical values supplied) + * Q Pattern only (numerical values supplied in associated auxiliary value + * file) + * + * Second Character: + * S Symmetric + * U Unsymmetric + * H Hermitian + * Z Skew symmetric + * R Rectangular + * + * Third Character: + * A Compressed column form + * E Elemental form + * + * + */ + +#include + + +/*! \brief Eat up the rest of the current line */ +static int zDumpLine(FILE *fp) +{ + register int c; + while ((c = fgetc(fp)) != '\n') ; + return 0; +} + +static int zParseIntFormat(char *buf, int *num, int *size) +{ + char *tmp; + + tmp = buf; + while (*tmp++ != '(') ; + sscanf(tmp, "%d", num); + while (*tmp != 'I' && *tmp != 'i') ++tmp; + ++tmp; + sscanf(tmp, "%d", size); + return 0; +} + +static int zParseFloatFormat(char *buf, int *num, int *size) +{ + char *tmp, *period; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' + && *tmp != 'F' && *tmp != 'f') { + /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the + num picked up refers to P, which should be skipped. */ + if (*tmp=='p' || *tmp=='P') { + ++tmp; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + } else { + ++tmp; + } + } + ++tmp; + period = tmp; + while (*period != '.' && *period != ')') ++period ; + *period = '\0'; + *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ + + return 0; +} + +static int ReadVector(FILE *fp, int n, int *where, int perline, int persize) +{ + register int i, j, item; + char tmp, buf[100]; + + i = 0; + while (i < n) { + fgets(buf, 100, fp); /* read a line at a time */ + for (j=0; j + * -- SuperLU routine (version 4.0) -- + * Lawrence Berkeley National Laboratory. + * June 30, 2009 + * + */ + +#include + + +void +zreadtriple(int *m, int *n, int *nonz, + doublecomplex **nzval, int **rowind, int **colptr) +{ +/* + * Output parameters + * ================= + * (a,asub,xa): asub[*] contains the row subscripts of nonzeros + * in columns of matrix A; a[*] the numerical values; + * row i of A is given by a[k],k=xa[i],...,xa[i+1]-1. + * + */ + int j, k, jsize, nnz, nz; + doublecomplex *a, *val; + int *asub, *xa, *row, *col; + int zero_base = 0; + + /* Matrix format: + * First line: #rows, #cols, #non-zero + * Triplet in the rest of lines: + * row, col, value + */ + + scanf("%d%d", n, nonz); + *m = *n; + printf("m %d, n %d, nonz %d\n", *m, *n, *nonz); + zallocateA(*n, *nonz, nzval, rowind, colptr); /* Allocate storage */ + a = *nzval; + asub = *rowind; + xa = *colptr; + + val = (doublecomplex *) SUPERLU_MALLOC(*nonz * sizeof(doublecomplex)); + row = (int *) SUPERLU_MALLOC(*nonz * sizeof(int)); + col = (int *) SUPERLU_MALLOC(*nonz * sizeof(int)); + + for (j = 0; j < *n; ++j) xa[j] = 0; + + /* Read into the triplet array from a file */ + for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { + scanf("%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i); + + if ( nnz == 0 ) { /* first nonzero */ + if ( row[0] == 0 || col[0] == 0 ) { + zero_base = 1; + printf("triplet file: row/col indices are zero-based.\n"); + } else + printf("triplet file: row/col indices are one-based.\n"); + } + + if ( !zero_base ) { + /* Change to 0-based indexing. */ + --row[nz]; + --col[nz]; + } + + if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n + /*|| val[nz] == 0.*/) { + fprintf(stderr, "nz %d, (%d, %d) = (%e,%e) out of bound, removed\n", + nz, row[nz], col[nz], val[nz].r, val[nz].i); + exit(-1); + } else { + ++xa[col[nz]]; + ++nz; + } + } + + *nonz = nz; + + /* Initialize the array of column pointers */ + k = 0; + jsize = xa[0]; + xa[0] = 0; + for (j = 1; j < *n; ++j) { + k += jsize; + jsize = xa[j]; + xa[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (nz = 0; nz < *nonz; ++nz) { + j = col[nz]; + k = xa[j]; + asub[k] = row[nz]; + a[k] = val[nz]; + ++xa[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = *n; j > 0; --j) + xa[j] = xa[j-1]; + xa[0] = 0; + + SUPERLU_FREE(val); + SUPERLU_FREE(row); + SUPERLU_FREE(col); + +#ifdef CHK_INPUT + { + int i; + for (i = 0; i < *n; i++) { + printf("Col %d, xa %d\n", i, xa[i]); + for (k = xa[i]; k < xa[i+1]; k++) + printf("%d\t%16.10f\n", asub[k], a[k]); + } + } +#endif + +} + + +void zreadrhs(int m, doublecomplex *b) +{ + FILE *fp, *fopen(); + int i; + /*int j;*/ + + if ( !(fp = fopen("b.dat", "r")) ) { + fprintf(stderr, "dreadrhs: file does not exist\n"); + exit(-1); + } + for (i = 0; i < m; ++i) + fscanf(fp, "%lf%lf\n", &b[i].r, &b[i].i); + + /* readpair_(j, &b[i]);*/ + fclose(fp); +} diff --git a/src/maths/SuperLU/zsnode_bmod.c b/src/maths/SuperLU/zsnode_bmod.c new file mode 100644 index 000000000..9f30eed6c --- /dev/null +++ b/src/maths/SuperLU/zsnode_bmod.c @@ -0,0 +1,120 @@ + +/*! @file zsnode_bmod.c + * \brief Performs numeric block updates within the relaxed snode. + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include + + +/*! \brief Performs numeric block updates within the relaxed snode. + */ +int +zsnode_bmod ( + const int jcol, /* in */ + const int jsupno, /* in */ + const int fsupc, /* in */ + doublecomplex *dense, /* in */ + doublecomplex *tempv, /* working array */ + GlobalLU_t *Glu, /* modified */ + SuperLUStat_t *stat /* output */ + ) +{ +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + int incx = 1, incy = 1; + doublecomplex alpha = {-1.0, 0.0}, beta = {1.0, 0.0}; +#endif + + doublecomplex comp_zero = {0.0, 0.0}; + int luptr, nsupc, nsupr, nrow; + int isub, irow, i, iptr; + register int ufirst, nextlu; + int *lsub, *xlsub; + doublecomplex *lusup; + int *xlusup; + flops_t *ops = stat->ops; + + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + + nextlu = xlusup[jcol]; + + /* + * Process the supernodal portion of L\U[*,j] + */ + for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { + irow = lsub[isub]; + lusup[nextlu] = dense[irow]; + dense[irow] = comp_zero; + ++nextlu; + } + + xlusup[jcol + 1] = nextlu; /* Initialize xlusup for next column */ + + if ( fsupc < jcol ) { + + luptr = xlusup[fsupc]; + nsupr = xlsub[fsupc+1] - xlsub[fsupc]; + nsupc = jcol - fsupc; /* Excluding jcol */ + ufirst = xlusup[jcol]; /* Points to the beginning of column + jcol in supernode L\U(jsupno). */ + nrow = nsupr - nsupc; + + ops[TRSV] += 4 * nsupc * (nsupc - 1); + ops[GEMV] += 8 * nrow * nsupc; + +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, + &lusup[ufirst], &incx ); + CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#else + ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, + &lusup[ufirst], &incx ); + zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, + &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); +#endif +#else + zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); + zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], + &lusup[ufirst], &tempv[0] ); + + /* Scatter tempv[*] into lusup[*] */ + iptr = ufirst + nsupc; + for (i = 0; i < nrow; i++) { + z_sub(&lusup[iptr], &lusup[iptr], &tempv[i]); + ++iptr; + tempv[i] = comp_zero; + } +#endif + + } + + return 0; +} diff --git a/src/maths/SuperLU/zsnode_dfs.c b/src/maths/SuperLU/zsnode_dfs.c new file mode 100644 index 000000000..0c4a2d8ef --- /dev/null +++ b/src/maths/SuperLU/zsnode_dfs.c @@ -0,0 +1,112 @@ + +/*! @file zsnode_dfs.c + * \brief Determines the union of row structures of columns within the relaxed node + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *    zsnode_dfs() - Determine the union of the row structures of those 
+ *    columns within the relaxed snode.
+ *    Note: The relaxed snodes are leaves of the supernodal etree, therefore, 
+ *    the portion outside the rectangular supernode must be zero.
+ *
+ * Return value
+ * ============
+ *     0   success;
+ *    >0   number of bytes allocated when run out of memory.
+ * 
+ */ + +int +zsnode_dfs ( + const int jcol, /* in - start of the supernode */ + const int kcol, /* in - end of the supernode */ + const int *asub, /* in */ + const int *xa_begin, /* in */ + const int *xa_end, /* in */ + int *xprune, /* out */ + int *marker, /* modified */ + GlobalLU_t *Glu /* modified */ + ) +{ + + register int i, k, ifrom, ito, nextl, new_next; + int nsuper, krow, kmark, mem_error; + int *xsup, *supno; + int *lsub, *xlsub; + int nzlmax; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + nzlmax = Glu->nzlmax; + + nsuper = ++supno[jcol]; /* Next available supernode number */ + nextl = xlsub[jcol]; + + for (i = jcol; i <= kcol; i++) { + /* For each nonzero in A[*,i] */ + for (k = xa_begin[i]; k < xa_end[i]; k++) { + krow = asub[k]; + kmark = marker[krow]; + if ( kmark != kcol ) { /* First time visit krow */ + marker[krow] = kcol; + lsub[nextl++] = krow; + if ( nextl >= nzlmax ) { + if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + } + } + supno[i] = nsuper; + } + + /* Supernode > 1, then make a copy of the subscripts for pruning */ + if ( jcol < kcol ) { + new_next = nextl + (nextl - xlsub[jcol]); + while ( new_next > nzlmax ) { + if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) ) + return (mem_error); + lsub = Glu->lsub; + } + ito = nextl; + for (ifrom = xlsub[jcol]; ifrom < nextl; ) + lsub[ito++] = lsub[ifrom++]; + for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl; + nextl = ito; + } + + xsup[nsuper+1] = kcol + 1; + supno[kcol+1] = nsuper; + xprune[kcol] = nextl; + xlsub[kcol+1] = nextl; + + return 0; +} + diff --git a/src/maths/SuperLU/zsp_blas2.c b/src/maths/SuperLU/zsp_blas2.c new file mode 100644 index 000000000..c094f2d69 --- /dev/null +++ b/src/maths/SuperLU/zsp_blas2.c @@ -0,0 +1,573 @@ + +/*! @file zsp_blas2.c + * \brief Sparse BLAS 2, using some dense BLAS 2 operations + * + *
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ * 
+ */ +/* + * File name: zsp_blas2.c + * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations. + */ + +#include + +/* + * Function prototypes + */ +void zusolve(int, int, doublecomplex*, doublecomplex*); +void zlsolve(int, int, doublecomplex*, doublecomplex*); +void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*); + +/*! \brief Solves one of the systems of equations A*x = b, or A'*x = b + * + *
+ *   Purpose
+ *   =======
+ *
+ *   sp_ztrsv() solves one of the systems of equations   
+ *       A*x = b,   or   A'*x = b,
+ *   where b and x are n element vectors and A is a sparse unit , or   
+ *   non-unit, upper or lower triangular matrix.   
+ *   No test for singularity or near-singularity is included in this   
+ *   routine. Such tests must be performed before calling this routine.   
+ *
+ *   Parameters   
+ *   ==========   
+ *
+ *   uplo   - (input) char*
+ *            On entry, uplo specifies whether the matrix is an upper or   
+ *             lower triangular matrix as follows:   
+ *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
+ *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
+ *
+ *   trans  - (input) char*
+ *             On entry, trans specifies the equations to be solved as   
+ *             follows:   
+ *                trans = 'N' or 'n'   A*x = b.   
+ *                trans = 'T' or 't'   A'*x = b.
+ *                trans = 'C' or 'c'   A^H*x = b.   
+ *
+ *   diag   - (input) char*
+ *             On entry, diag specifies whether or not A is unit   
+ *             triangular as follows:   
+ *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
+ *                diag = 'N' or 'n'   A is not assumed to be unit   
+ *                                    triangular.   
+ *	     
+ *   L       - (input) SuperMatrix*
+ *	       The factor L from the factorization Pr*A*Pc=L*U. Use
+ *             compressed row subscripts storage for supernodes,
+ *             i.e., L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU.
+ *
+ *   U       - (input) SuperMatrix*
+ *	        The factor U from the factorization Pr*A*Pc=L*U.
+ *	        U has types: Stype = NC, Dtype = SLU_Z, Mtype = TRU.
+ *    
+ *   x       - (input/output) doublecomplex*
+ *             Before entry, the incremented array X must contain the n   
+ *             element right-hand side vector b. On exit, X is overwritten 
+ *             with the solution vector x.
+ *
+ *   info    - (output) int*
+ *             If *info = -i, the i-th argument had an illegal value.
+ * 
+ */ +int +sp_ztrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, + SuperMatrix *U, doublecomplex *x, SuperLUStat_t *stat, int *info) +{ +#ifdef _CRAY + _fcd ftcs1 = _cptofcd("L", strlen("L")), + ftcs2 = _cptofcd("N", strlen("N")), + ftcs3 = _cptofcd("U", strlen("U")); +#endif + SCformat *Lstore; + NCformat *Ustore; + doublecomplex *Lval, *Uval; + int incx = 1, incy = 1; + doublecomplex temp; + doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; + doublecomplex comp_zero = {0.0, 0.0}; + int nrow; + int fsupc, nsupr, nsupc, luptr, istart, irow; + int i, k, iptr, jcol; + doublecomplex *work; + flops_t solve_ops; + + /* Test the input parameters */ + *info = 0; + if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; + else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && + !lsame_(trans, "C")) *info = -2; + else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; + else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; + else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; + if ( *info ) { + i = -(*info); + xerbla_("sp_ztrsv", &i); + return 0; + } + + Lstore = L->Store; + Lval = Lstore->nzval; + Ustore = U->Store; + Uval = Ustore->nzval; + solve_ops = 0; + + if ( !(work = doublecomplexCalloc(L->nrow)) ) + ABORT_SuperLU("Malloc fails for work in sp_ztrsv()."); + + if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ + + if ( lsame_(uplo, "L") ) { + /* Form x := inv(L)*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + nrow = nsupr - nsupc; + + /* 1 z_div costs 10 flops */ + solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc; + solve_ops += 8 * nrow * nsupc; + + if ( nsupc == 1 ) { + for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { + irow = L_SUB(iptr); + ++luptr; + zz_mult(&comp_zero, &x[fsupc], &Lval[luptr]); + z_sub(&x[irow], &x[irow], &comp_zero); + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); + + CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], + &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); +#else + ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); + + zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], + &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); +#endif +#else + zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); + + zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], + &x[fsupc], &work[0] ); +#endif + + iptr = istart + nsupc; + for (i = 0; i < nrow; ++i, ++iptr) { + irow = L_SUB(iptr); + z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */ + work[i] = comp_zero; + + } + } + } /* for k ... */ + + } else { + /* Form x := inv(U)*x */ + + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; k--) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + /* 1 z_div costs 10 flops */ + solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; + + if ( nsupc == 1 ) { + z_div(&x[fsupc], &x[fsupc], &Lval[luptr]); + for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { + irow = U_SUB(i); + zz_mult(&comp_zero, &x[fsupc], &Uval[i]); + z_sub(&x[irow], &x[irow], &comp_zero); + } + } else { +#ifdef USE_VENDOR_BLAS +#ifdef _CRAY + CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif +#else + zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); +#endif + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); + i++) { + irow = U_SUB(i); + zz_mult(&comp_zero, &x[jcol], &Uval[i]); + z_sub(&x[irow], &x[irow], &comp_zero); + } + } + } + } /* for k ... */ + + } + } else if ( lsame_(trans, "T") ) { /* Form x := inv(A')*x */ + + if ( lsame_(uplo, "L") ) { + /* Form x := inv(L')*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; --k) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += 8 * (nsupr - nsupc) * nsupc; + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + iptr = istart + nsupc; + for (i = L_NZ_START(jcol) + nsupc; + i < L_NZ_START(jcol+1); i++) { + irow = L_SUB(iptr); + zz_mult(&comp_zero, &x[irow], &Lval[i]); + z_sub(&x[jcol], &x[jcol], &comp_zero); + iptr++; + } + } + + if ( nsupc > 1 ) { + solve_ops += 4 * nsupc * (nsupc - 1); +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("T", strlen("T")); + ftcs3 = _cptofcd("U", strlen("U")); + CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } + } else { + /* Form x := inv(U')*x */ + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { + irow = U_SUB(i); + zz_mult(&comp_zero, &x[irow], &Uval[i]); + z_sub(&x[jcol], &x[jcol], &comp_zero); + } + } + + /* 1 z_div costs 10 flops */ + solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; + + if ( nsupc == 1 ) { + z_div(&x[fsupc], &x[fsupc], &Lval[luptr]); + } else { +#ifdef _CRAY + ftcs1 = _cptofcd("U", strlen("U")); + ftcs2 = _cptofcd("T", strlen("T")); + ftcs3 = _cptofcd("N", strlen("N")); + CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } /* for k ... */ + } + } else { /* Form x := conj(inv(A'))*x */ + + if ( lsame_(uplo, "L") ) { + /* Form x := conj(inv(L'))*x */ + if ( L->nrow == 0 ) return 0; /* Quick return */ + + for (k = Lstore->nsuper; k >= 0; --k) { + fsupc = L_FST_SUPC(k); + istart = L_SUB_START(fsupc); + nsupr = L_SUB_START(fsupc+1) - istart; + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + solve_ops += 8 * (nsupr - nsupc) * nsupc; + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + iptr = istart + nsupc; + for (i = L_NZ_START(jcol) + nsupc; + i < L_NZ_START(jcol+1); i++) { + irow = L_SUB(iptr); + zz_conj(&temp, &Lval[i]); + zz_mult(&comp_zero, &x[irow], &temp); + z_sub(&x[jcol], &x[jcol], &comp_zero); + iptr++; + } + } + + if ( nsupc > 1 ) { + solve_ops += 4 * nsupc * (nsupc - 1); +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd(trans, strlen("T")); + ftcs3 = _cptofcd("U", strlen("U")); + ZTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + ztrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } + } else { + /* Form x := conj(inv(U'))*x */ + if ( U->nrow == 0 ) return 0; /* Quick return */ + + for (k = 0; k <= Lstore->nsuper; k++) { + fsupc = L_FST_SUPC(k); + nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); + nsupc = L_FST_SUPC(k+1) - fsupc; + luptr = L_NZ_START(fsupc); + + for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { + solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); + for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { + irow = U_SUB(i); + zz_conj(&temp, &Uval[i]); + zz_mult(&comp_zero, &x[irow], &temp); + z_sub(&x[jcol], &x[jcol], &comp_zero); + } + } + + /* 1 z_div costs 10 flops */ + solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; + + if ( nsupc == 1 ) { + zz_conj(&temp, &Lval[luptr]); + z_div(&x[fsupc], &x[fsupc], &temp); + } else { +#ifdef _CRAY + ftcs1 = _cptofcd("U", strlen("U")); + ftcs2 = _cptofcd(trans, strlen("T")); + ftcs3 = _cptofcd("N", strlen("N")); + ZTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#else + ztrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr, + &x[fsupc], &incx); +#endif + } + } /* for k ... */ + } + } + + stat->ops[SOLVE] += solve_ops; + SUPERLU_FREE(work); + return 0; +} + + + +/*! \brief Performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y + * + *
  
+ *   Purpose   
+ *   =======   
+ *
+ *   sp_zgemv()  performs one of the matrix-vector operations   
+ *      y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
+ *   where alpha and beta are scalars, x and y are vectors and A is a
+ *   sparse A->nrow by A->ncol matrix.   
+ *
+ *   Parameters   
+ *   ==========   
+ *
+ *   TRANS  - (input) char*
+ *            On entry, TRANS specifies the operation to be performed as   
+ *            follows:   
+ *               TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
+ *               TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
+ *               TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
+ *
+ *   ALPHA  - (input) doublecomplex
+ *            On entry, ALPHA specifies the scalar alpha.   
+ *
+ *   A      - (input) SuperMatrix*
+ *            Before entry, the leading m by n part of the array A must   
+ *            contain the matrix of coefficients.   
+ *
+ *   X      - (input) doublecomplex*, array of DIMENSION at least   
+ *            ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
+ *           and at least   
+ *            ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
+ *            Before entry, the incremented array X must contain the   
+ *            vector x.   
+ * 
+ *   INCX   - (input) int
+ *            On entry, INCX specifies the increment for the elements of   
+ *            X. INCX must not be zero.   
+ *
+ *   BETA   - (input) doublecomplex
+ *            On entry, BETA specifies the scalar beta. When BETA is   
+ *            supplied as zero then Y need not be set on input.   
+ *
+ *   Y      - (output) doublecomplex*,  array of DIMENSION at least   
+ *            ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
+ *            and at least   
+ *            ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
+ *            Before entry with BETA non-zero, the incremented array Y   
+ *            must contain the vector y. On exit, Y is overwritten by the 
+ *            updated vector y.
+ *	      
+ *   INCY   - (input) int
+ *            On entry, INCY specifies the increment for the elements of   
+ *            Y. INCY must not be zero.   
+ *
+ *    ==== Sparse Level 2 Blas routine.   
+ * 
+*/ +int +sp_zgemv(char *trans, doublecomplex alpha, SuperMatrix *A, doublecomplex *x, + int incx, doublecomplex beta, doublecomplex *y, int incy) +{ + + /* Local variables */ + NCformat *Astore; + doublecomplex *Aval; + int info; + doublecomplex temp, temp1; + int lenx, leny, i, j, irow; + int iy, jx, jy, kx, ky; + int notran; + doublecomplex comp_zero = {0.0, 0.0}; + doublecomplex comp_one = {1.0, 0.0}; + + notran = lsame_(trans, "N"); + Astore = A->Store; + Aval = Astore->nzval; + + /* Test the input parameters */ + info = 0; + if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; + else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; + else if (incx == 0) info = 5; + else if (incy == 0) info = 8; + if (info != 0) { + xerbla_("sp_zgemv ", &info); + return 0; + } + + /* Quick return if possible. */ + if (A->nrow == 0 || A->ncol == 0 || + z_eq(&alpha, &comp_zero) && + z_eq(&beta, &comp_one)) + return 0; + + + /* Set LENX and LENY, the lengths of the vectors x and y, and set + up the start points in X and Y. */ + if (lsame_(trans, "N")) { + lenx = A->ncol; + leny = A->nrow; + } else { + lenx = A->nrow; + leny = A->ncol; + } + if (incx > 0) kx = 0; + else kx = - (lenx - 1) * incx; + if (incy > 0) ky = 0; + else ky = - (leny - 1) * incy; + + /* Start the operations. In this version the elements of A are + accessed sequentially with one pass through A. */ + /* First form y := beta*y. */ + if ( !z_eq(&beta, &comp_one) ) { + if (incy == 1) { + if ( z_eq(&beta, &comp_zero) ) + for (i = 0; i < leny; ++i) y[i] = comp_zero; + else + for (i = 0; i < leny; ++i) + zz_mult(&y[i], &beta, &y[i]); + } else { + iy = ky; + if ( z_eq(&beta, &comp_zero) ) + for (i = 0; i < leny; ++i) { + y[iy] = comp_zero; + iy += incy; + } + else + for (i = 0; i < leny; ++i) { + zz_mult(&y[iy], &beta, &y[iy]); + iy += incy; + } + } + } + + if ( z_eq(&alpha, &comp_zero) ) return 0; + + if ( notran ) { + /* Form y := alpha*A*x + y. */ + jx = kx; + if (incy == 1) { + for (j = 0; j < A->ncol; ++j) { + if ( !z_eq(&x[jx], &comp_zero) ) { + zz_mult(&temp, &alpha, &x[jx]); + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + zz_mult(&temp1, &temp, &Aval[i]); + z_add(&y[irow], &y[irow], &temp1); + } + } + jx += incx; + } + } else { + ABORT_SuperLU("Not implemented."); + } + } else { + /* Form y := alpha*A'*x + y. */ + jy = ky; + if (incx == 1) { + for (j = 0; j < A->ncol; ++j) { + temp = comp_zero; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + zz_mult(&temp1, &Aval[i], &x[irow]); + z_add(&temp, &temp, &temp1); + } + zz_mult(&temp1, &alpha, &temp); + z_add(&y[jy], &y[jy], &temp1); + jy += incy; + } + } else { + ABORT_SuperLU("Not implemented."); + } + } + return 0; +} /* sp_zgemv */ + diff --git a/src/maths/SuperLU/zsp_blas3.c b/src/maths/SuperLU/zsp_blas3.c new file mode 100644 index 000000000..f208433bf --- /dev/null +++ b/src/maths/SuperLU/zsp_blas3.c @@ -0,0 +1,127 @@ + +/*! @file zsp_blas3.c + * \brief Sparse BLAS3, using some dense BLAS3 operations + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ */ +/* + * File name: sp_blas3.c + * Purpose: Sparse BLAS3, using some dense BLAS3 operations. + */ + +#include + +/*! \brief + * + *
+ * Purpose   
+ *   =======   
+ * 
+ *   sp_z performs one of the matrix-matrix operations   
+ * 
+ *      C := alpha*op( A )*op( B ) + beta*C,   
+ * 
+ *   where  op( X ) is one of 
+ * 
+ *      op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
+ * 
+ *   alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
+ *   an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
+ *   
+ * 
+ *   Parameters   
+ *   ==========   
+ * 
+ *   TRANSA - (input) char*
+ *            On entry, TRANSA specifies the form of op( A ) to be used in 
+ *            the matrix multiplication as follows:   
+ *               TRANSA = 'N' or 'n',  op( A ) = A.   
+ *               TRANSA = 'T' or 't',  op( A ) = A'.   
+ *               TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
+ *            Unchanged on exit.   
+ * 
+ *   TRANSB - (input) char*
+ *            On entry, TRANSB specifies the form of op( B ) to be used in 
+ *            the matrix multiplication as follows:   
+ *               TRANSB = 'N' or 'n',  op( B ) = B.   
+ *               TRANSB = 'T' or 't',  op( B ) = B'.   
+ *               TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
+ *            Unchanged on exit.   
+ * 
+ *   M      - (input) int   
+ *            On entry,  M  specifies  the number of rows of the matrix 
+ *	     op( A ) and of the matrix C.  M must be at least zero. 
+ *	     Unchanged on exit.   
+ * 
+ *   N      - (input) int
+ *            On entry,  N specifies the number of columns of the matrix 
+ *	     op( B ) and the number of columns of the matrix C. N must be 
+ *	     at least zero.
+ *	     Unchanged on exit.   
+ * 
+ *   K      - (input) int
+ *            On entry, K specifies the number of columns of the matrix 
+ *	     op( A ) and the number of rows of the matrix op( B ). K must 
+ *	     be at least  zero.   
+ *           Unchanged on exit.
+ *      
+ *   ALPHA  - (input) doublecomplex
+ *            On entry, ALPHA specifies the scalar alpha.   
+ * 
+ *   A      - (input) SuperMatrix*
+ *            Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+ *            Currently, the type of A can be:
+ *                Stype = NC or NCP; Dtype = SLU_Z; Mtype = GE. 
+ *            In the future, more general A can be handled.
+ * 
+ *   B      - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDB, kb ), where kb is 
+ *            n when TRANSB = 'N' or 'n',  and is  k otherwise.   
+ *            Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
+ *            part of the array B must contain the matrix B, otherwise 
+ *            the leading n by k part of the array B must contain the 
+ *            matrix B.   
+ *            Unchanged on exit.   
+ * 
+ *   LDB    - (input) int
+ *            On entry, LDB specifies the first dimension of B as declared 
+ *            in the calling (sub) program. LDB must be at least max( 1, n ).  
+ *            Unchanged on exit.   
+ * 
+ *   BETA   - (input) doublecomplex
+ *            On entry, BETA specifies the scalar beta. When BETA is   
+ *            supplied as zero then C need not be set on input.   
+ *  
+ *   C      - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDC, n ).   
+ *            Before entry, the leading m by n part of the array C must 
+ *            contain the matrix C,  except when beta is zero, in which 
+ *            case C need not be set on entry.   
+ *            On exit, the array C is overwritten by the m by n matrix 
+ *	     ( alpha*op( A )*B + beta*C ).   
+ *  
+ *   LDC    - (input) int
+ *            On entry, LDC specifies the first dimension of C as declared 
+ *            in the calling (sub)program. LDC must be at least max(1,m).   
+ *            Unchanged on exit.   
+ *  
+ *   ==== Sparse Level 3 Blas routine.   
+ * 
+ */ + +int +sp_zgemm(char *transa, char *transb, int m, int n, int k, + doublecomplex alpha, SuperMatrix *A, doublecomplex *b, int ldb, + doublecomplex beta, doublecomplex *c, int ldc) +{ + int incx = 1, incy = 1; + int j; + + for (j = 0; j < n; ++j) { + sp_zgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); + } + return 0; +} diff --git a/src/maths/SuperLU/zutil.c b/src/maths/SuperLU/zutil.c new file mode 100644 index 000000000..20478c743 --- /dev/null +++ b/src/maths/SuperLU/zutil.c @@ -0,0 +1,475 @@ + +/*! @file zutil.c + * \brief Matrix utility functions + * + *
+ * -- SuperLU routine (version 3.1) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * August 1, 2008
+ *
+ * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+ * EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ * 
+ * Permission is hereby granted to use or copy this program for any
+ * purpose, provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is
+ * granted, provided the above notices are retained, and a notice that
+ * the code was modified is included with the above copyright notice.
+ * 
+ */ + + +#include +#include + +void +zCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, + doublecomplex *nzval, int *rowind, int *colptr, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + NCformat *Astore; + + A->Stype = stype; + A->Dtype = dtype; + A->Mtype = mtype; + A->nrow = m; + A->ncol = n; + A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); + if ( !(A->Store) ) ABORT_SuperLU("SUPERLU_MALLOC fails for A->Store"); + Astore = A->Store; + Astore->nnz = nnz; + Astore->nzval = nzval; + Astore->rowind = rowind; + Astore->colptr = colptr; +} + +void +zCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, + doublecomplex *nzval, int *colind, int *rowptr, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + NRformat *Astore; + + A->Stype = stype; + A->Dtype = dtype; + A->Mtype = mtype; + A->nrow = m; + A->ncol = n; + A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) ); + if ( !(A->Store) ) ABORT_SuperLU("SUPERLU_MALLOC fails for A->Store"); + Astore = A->Store; + Astore->nnz = nnz; + Astore->nzval = nzval; + Astore->colind = colind; + Astore->rowptr = rowptr; +} + +/*! \brief Copy matrix A into matrix B. */ +void +zCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B) +{ + NCformat *Astore, *Bstore; + int ncol, nnz, i; + + B->Stype = A->Stype; + B->Dtype = A->Dtype; + B->Mtype = A->Mtype; + B->nrow = A->nrow;; + B->ncol = ncol = A->ncol; + Astore = (NCformat *) A->Store; + Bstore = (NCformat *) B->Store; + Bstore->nnz = nnz = Astore->nnz; + for (i = 0; i < nnz; ++i) + ((doublecomplex *)Bstore->nzval)[i] = ((doublecomplex *)Astore->nzval)[i]; + for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i]; + for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i]; +} + + +void +zCreate_Dense_Matrix(SuperMatrix *X, int m, int n, doublecomplex *x, int ldx, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + DNformat *Xstore; + + X->Stype = stype; + X->Dtype = dtype; + X->Mtype = mtype; + X->nrow = m; + X->ncol = n; + X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); + if ( !(X->Store) ) ABORT_SuperLU("SUPERLU_MALLOC fails for X->Store"); + Xstore = (DNformat *) X->Store; + Xstore->lda = ldx; + Xstore->nzval = (doublecomplex *) x; +} + +void +zCopy_Dense_Matrix(int M, int N, doublecomplex *X, int ldx, + doublecomplex *Y, int ldy) +{ +/*! \brief Copies a two-dimensional matrix X to another matrix Y. + */ + int i, j; + + for (j = 0; j < N; ++j) + for (i = 0; i < M; ++i) + Y[i + j*ldy] = X[i + j*ldx]; +} + +void +zCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, + doublecomplex *nzval, int *nzval_colptr, int *rowind, + int *rowind_colptr, int *col_to_sup, int *sup_to_col, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + SCformat *Lstore; + + L->Stype = stype; + L->Dtype = dtype; + L->Mtype = mtype; + L->nrow = m; + L->ncol = n; + L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); + if ( !(L->Store) ) ABORT_SuperLU("SUPERLU_MALLOC fails for L->Store"); + Lstore = L->Store; + Lstore->nnz = nnz; + Lstore->nsuper = col_to_sup[n]; + Lstore->nzval = nzval; + Lstore->nzval_colptr = nzval_colptr; + Lstore->rowind = rowind; + Lstore->rowind_colptr = rowind_colptr; + Lstore->col_to_sup = col_to_sup; + Lstore->sup_to_col = sup_to_col; + +} + + +/*! \brief Convert a row compressed storage into a column compressed storage. + */ +void +zCompRow_to_CompCol(int m, int n, int nnz, + doublecomplex *a, int *colind, int *rowptr, + doublecomplex **at, int **rowind, int **colptr) +{ + register int i, j, col, relpos; + int *marker; + + /* Allocate storage for another copy of the matrix. */ + *at = (doublecomplex *) doublecomplexMalloc(nnz); + *rowind = (int *) intMalloc(nnz); + *colptr = (int *) intMalloc(n+1); + marker = (int *) intCalloc(n); + + /* Get counts of each column of A, and set up column pointers */ + for (i = 0; i < m; ++i) + for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; + (*colptr)[0] = 0; + for (j = 0; j < n; ++j) { + (*colptr)[j+1] = (*colptr)[j] + marker[j]; + marker[j] = (*colptr)[j]; + } + + /* Transfer the matrix into the compressed column storage. */ + for (i = 0; i < m; ++i) { + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + col = colind[j]; + relpos = marker[col]; + (*rowind)[relpos] = i; + (*at)[relpos] = a[j]; + ++marker[col]; + } + } + + SUPERLU_FREE(marker); +} + + +void +zPrint_CompCol_Matrix(char *what, SuperMatrix *A) +{ + NCformat *Astore; + register int i,n; + double *dp; + + printf("\nCompCol matrix %s:\n", what); + printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); + n = A->ncol; + Astore = (NCformat *) A->Store; + dp = (double *) Astore->nzval; + printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz); + printf("nzval: "); + for (i = 0; i < 2*Astore->colptr[n]; ++i) printf("%f ", dp[i]); + printf("\nrowind: "); + for (i = 0; i < Astore->colptr[n]; ++i) printf("%d ", Astore->rowind[i]); + printf("\ncolptr: "); + for (i = 0; i <= n; ++i) printf("%d ", Astore->colptr[i]); + printf("\n"); + fflush(stdout); +} + +void +zPrint_SuperNode_Matrix(char *what, SuperMatrix *A) +{ + SCformat *Astore; + register int i, j, k, c, d, n, nsup; + double *dp; + int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr; + + printf("\nSuperNode matrix %s:\n", what); + printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); + n = A->ncol; + Astore = (SCformat *) A->Store; + dp = (double *) Astore->nzval; + col_to_sup = Astore->col_to_sup; + sup_to_col = Astore->sup_to_col; + rowind_colptr = Astore->rowind_colptr; + rowind = Astore->rowind; + printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", + A->nrow,A->ncol,Astore->nnz,Astore->nsuper); + printf("nzval:\n"); + for (k = 0; k <= Astore->nsuper; ++k) { + c = sup_to_col[k]; + nsup = sup_to_col[k+1] - c; + for (j = c; j < c + nsup; ++j) { + d = Astore->nzval_colptr[j]; + for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) { + printf("%d\t%d\t%e\t%e\n", rowind[i], j, dp[d], dp[d+1]); + d += 2; + } + } + } +#if 0 + for (i = 0; i < 2*Astore->nzval_colptr[n]; ++i) printf("%f ", dp[i]); +#endif + printf("\nnzval_colptr: "); + for (i = 0; i <= n; ++i) printf("%d ", Astore->nzval_colptr[i]); + printf("\nrowind: "); + for (i = 0; i < Astore->rowind_colptr[n]; ++i) + printf("%d ", Astore->rowind[i]); + printf("\nrowind_colptr: "); + for (i = 0; i <= n; ++i) printf("%d ", Astore->rowind_colptr[i]); + printf("\ncol_to_sup: "); + for (i = 0; i < n; ++i) printf("%d ", col_to_sup[i]); + printf("\nsup_to_col: "); + for (i = 0; i <= Astore->nsuper+1; ++i) + printf("%d ", sup_to_col[i]); + printf("\n"); + fflush(stdout); +} + +void +zPrint_Dense_Matrix(char *what, SuperMatrix *A) +{ + DNformat *Astore = (DNformat *) A->Store; + register int i, j, lda = Astore->lda; + double *dp; + + printf("\nDense matrix %s:\n", what); + printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype); + dp = (double *) Astore->nzval; + printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,lda); + printf("\nnzval: "); + for (j = 0; j < A->ncol; ++j) { + for (i = 0; i < 2*A->nrow; ++i) printf("%f ", dp[i + j*2*lda]); + printf("\n"); + } + printf("\n"); + fflush(stdout); +} + +/*! \brief Diagnostic print of column "jcol" in the U/L factor. + */ +void +zprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu) +{ + int i, k, fsupc; + int *xsup, *supno; + int *xlsub, *lsub; + doublecomplex *lusup; + int *xlusup; + doublecomplex *ucol; + int *usub, *xusub; + + xsup = Glu->xsup; + supno = Glu->supno; + lsub = Glu->lsub; + xlsub = Glu->xlsub; + lusup = Glu->lusup; + xlusup = Glu->xlusup; + ucol = Glu->ucol; + usub = Glu->usub; + xusub = Glu->xusub; + + printf("%s", msg); + printf("col %d: pivrow %d, supno %d, xprune %d\n", + jcol, pivrow, supno[jcol], xprune[jcol]); + + printf("\tU-col:\n"); + for (i = xusub[jcol]; i < xusub[jcol+1]; i++) + printf("\t%d%10.4f, %10.4f\n", usub[i], ucol[i].r, ucol[i].i); + printf("\tL-col in rectangular snode:\n"); + fsupc = xsup[supno[jcol]]; /* first col of the snode */ + i = xlsub[fsupc]; + k = xlusup[jcol]; + while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) { + printf("\t%d\t%10.4f, %10.4f\n", lsub[i], lusup[k].r, lusup[k].i); + i++; k++; + } + fflush(stdout); +} + + +/*! \brief Check whether tempv[] == 0. This should be true before and after calling any numeric routines, i.e., "panel_bmod" and "column_bmod". + */ +void zcheck_tempv(int n, doublecomplex *tempv) +{ + int i; + + for (i = 0; i < n; i++) { + if ((tempv[i].r != 0.0) || (tempv[i].i != 0.0)) + { + fprintf(stderr,"tempv[%d] = {%f, %f}\n", i, tempv[i].r, tempv[i].i); + ABORT_SuperLU("zcheck_tempv"); + } + } +} + + +void +zGenXtrue(int n, int nrhs, doublecomplex *x, int ldx) +{ + int i, j; + for (j = 0; j < nrhs; ++j) + for (i = 0; i < n; ++i) { + x[i + j*ldx].r = 1.0; + x[i + j*ldx].i = 0.0; + } +} + +/*! \brief Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's + */ +void +zFillRHS(trans_t trans, int nrhs, doublecomplex *x, int ldx, + SuperMatrix *A, SuperMatrix *B) +{ + NCformat *Astore; + doublecomplex *Aval; + DNformat *Bstore; + doublecomplex *rhs; + doublecomplex one = {1.0, 0.0}; + doublecomplex zero = {0.0, 0.0}; + int ldc; + char transc[1]; + + Astore = A->Store; + Aval = (doublecomplex *) Astore->nzval; + Bstore = B->Store; + rhs = Bstore->nzval; + ldc = Bstore->lda; + + if ( trans == NOTRANS ) *(unsigned char *)transc = 'N'; + else *(unsigned char *)transc = 'T'; + + sp_zgemm(transc, "N", A->nrow, nrhs, A->ncol, one, A, + x, ldx, zero, rhs, ldc); + +} + +/*! \brief Fills a doublecomplex precision array with a given value. + */ +void +zfill(doublecomplex *a, int alen, doublecomplex dval) +{ + register int i; + for (i = 0; i < alen; i++) a[i] = dval; +} + + + +/*! \brief Check the inf-norm of the error vector + */ +void zinf_norm_error(int nrhs, SuperMatrix *X, doublecomplex *xtrue) +{ + DNformat *Xstore; + double err, xnorm; + doublecomplex *Xmat, *soln_work; + doublecomplex temp; + int i, j; + + Xstore = X->Store; + Xmat = Xstore->nzval; + + for (j = 0; j < nrhs; j++) { + soln_work = &Xmat[j*Xstore->lda]; + err = xnorm = 0.0; + for (i = 0; i < X->nrow; i++) { + z_sub(&temp, &soln_work[i], &xtrue[i]); + err = SUPERLU_MAX(err, z_abs(&temp)); + xnorm = SUPERLU_MAX(xnorm, z_abs(&soln_work[i])); + } + err = err / xnorm; + printf("||X - Xtrue||/||X|| = %e\n", err); + } +} + + + +/*! \brief Print performance of the code. */ +void +zPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage, + double rpg, double rcond, double *ferr, + double *berr, char *equed, SuperLUStat_t *stat) +{ + SCformat *Lstore; + NCformat *Ustore; + double *utime; + flops_t *ops; + + utime = stat->utime; + ops = stat->ops; + + if ( utime[FACT] != 0. ) + printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT], + ops[FACT]*1e-6/utime[FACT]); + printf("Identify relaxed snodes = %8.2f\n", utime[RELAX]); + if ( utime[SOLVE] != 0. ) + printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE], + ops[SOLVE]*1e-6/utime[SOLVE]); + + Lstore = (SCformat *) L->Store; + Ustore = (NCformat *) U->Store; + printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz); + printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz); + printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz); + + printf("L\\U MB %.3f\ttotal MB needed %.3f\n", + mem_usage->for_lu/1e6, mem_usage->total_needed/1e6); + printf("Number of memory expansions: %d\n", stat->expansions); + + printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n"); + printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n", + utime[FACT], ops[FACT]*1e-6/utime[FACT], + utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE], + utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]); + + printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n"); + printf("NUM:\t%e\t%e\t%e\t%e\t%s\n", + rpg, rcond, ferr[0], berr[0], equed); + +} + + + + +print_doublecomplex_vec(char *what, int n, doublecomplex *vec) +{ + int i; + printf("%s: n %d\n", what, n); + for (i = 0; i < n; ++i) printf("%d\t%f%f\n", i, vec[i].r, vec[i].i); + return 0; +} +