/*
 * Algorithms implementing unblocked and blocked symmetric indefinite
 * factorization of matrices representing linear systems.  Unblocked algorithms
 * include the outer product method and SAXPY operation, while blocked 
 * algorithms include simple blocking and an implementation that uses tuned
 * BLAS (Basic Linear Algebra Subroutines).  Also, function wrappers facilitate
 * calling unblocked and blocked LAPACK symmetric indefinite factorization 
 * routines DSYTF2 and DSYTRF, respectively.
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <float.h>

#include "ldltfact.h"
#include "lapack.h"
#include "matcom.h"
#include "timing.h"

// Forward declarations of helpers with internal linkage (all declared static).
// By name prefix: eval_pivot_* select a 1-by-1 or 2-by-2 pivot and record it 
// in piv[]/ord[]; pivot_sym_* apply a symmetric row/ column interchange; 
// reduce_ldlt_* apply trailing sub-matrix updates; ldlt_* are the 
// factorization drivers (defined outside the functions shown alongside the
// pivot selection routines).
static void eval_pivot_bk( int n, int d, int ldim, const double *A, 
	int *piv, int *ord );
static void eval_pivot_bbk( int n, int d, int ldim, const double *A, 
	int *piv, int *ord );
static void eval_pivot_bp( int n, int d, int ldim, const double *A, 
	int *piv, int *ord );
static void eval_pivot_reduce_bk( int n, int d, int ldim, const double *L, 
	const double *D, const double *A, double *Q, int *piv, int *ord );
static void eval_pivot_reduce_bbk( int n, int d, int ldim, const double *L, 
	const double *D, const double *A, double *Q, int *piv, int *ord );
static void eval_pivot_blas_bk( int n, int d, int ldim, const double *L, 
	const double *M, const double *A, double *Q, int *piv, int *ord );
static void eval_pivot_blas_bbk( int n, int d, int ldim, const double *L, 
	const double *M, const double *A, double *Q, int *piv, int *ord );
static void pivot_sym_reduce( int n, int k, int r, double *vec, int ldim, 
	double *A );
static void pivot_sym_blas( int n, int k, int r, int ldim, double *W,
	double *A );
static void reduce_ldlt_vector( int m, int n, int r, int *ord, int ldim, 
	const double *L, const double *D, double *vec );
static void reduce_ldlt_matrix( int m, int n, int p, int diag, const int *ord, 
	int ldim, const double *L, const double *D, const double *M, 
	const double *T, double *A );
static void reduce_ldlt_matrix_blas( int m, int n, int p, int diag, 
	const int *ord, int ldim, const double *L, const double *D, const double *M, 
	const double *T, double *A );
static void ldlt_factor( char pivot, int m, int *n, int *piv, int *ord, 
	int ldim, double *A, double *W );
static void ldlt_factor_blas( char pivot, int m, int *n, int *piv, int *ord, 
	int ldim, double *A, double *W );
static void ldlt_block_rook_pivot( int blas, char pivot, int n, 
	int *piv, int *ord, int ldim, double *A );
static void ldlt_block_comp_pivot( int blas, char pivot, int n, 
	int *piv, int *ord, int ldim, double *A );

// Parameter for bounding element growth in trailing sub-matrix:
// alpha = (1 + sqrt(17)) / 8.  The value is written as a literal because ISO C
// requires a constant expression to initialize an object with static storage
// duration, and a call to sqrt() is not one; compilers that do not fold the
// call at compile time reject the original initializer.
static const double	alpha = 0.6403882032022075687;

// Profiling state, compiled in only when both LDLTFACT and PROFILE are defined.
// xtra_work is incremented by the eval_pivot_reduce_* and eval_pivot_blas_* 
// routines on some pivot selection outcomes; tm_red_vec accumulates the 
// timespec_diff() results measured around reduce_ldlt_vector().  The other 
// accumulators are updated outside the routines that use those two.
#if defined(LDLTFACT) && defined(PROFILE)
	static int xtra_work = 0;
	static double tm_ldlt = 0.0;
	static double tm_factor = 0.0;
	static double tm_pivot = 0.0;
	static double tm_reduce = 0.0;
	static double tm_fact_piv = 0.0;
	static double tm_red_vec = 0.0;
#endif

/******************************************************************************/

/*
 * Chooses the block dimension used by a blocked factorization routine.
 *
 *   lapack  nonzero: ask ilaenv_() for the block dimension of DSYTRF ("L")
 *   blas    nonzero (and lapack zero): use 64; both zero: use 128
 *   ldim    leading dimension of the matrix
 *
 * When DEBUG is defined the compile-time value BDIM is used instead and the
 * lapack/ blas flags are ignored.  A candidate block dimension that is <= 1 or
 * larger than ldim is replaced by ldim, in which case the caller effectively
 * runs an unblocked algorithm.  Returns the selected block dimension.
 */
int get_block_dim_ldlt( int lapack, int blas, int ldim )
{
	int block;

#if defined(DEBUG)
	block = BDIM;
#else
	if ( lapack ) {
		// ilaenv_() takes every argument by address
		const int	query = 1;			// block dimension query selector
		const int	unused = -1;		// placeholder for unused dimensions
		const char	*routine = "DSYTRF";
		const char	*opts = "L";

		block = ilaenv_( &query, routine, opts, 
			&ldim, &unused, &unused, &unused );
	} else {
		block = blas ? 64 : 128;
	}
#endif
	// Fall back to the leading dimension when the candidate is unusable
	return ( block > 1 && block <= ldim ) ? block : ldim;
}

/*
 * Counts pivots recorded in piv[0:n-1] and ord[0:n-1] after a factorization.
 * piv[k] is the row/ column interchanged with row/ column k, so piv[k] != k
 * denotes an actual interchange.  The selector piv_ord chooses what is counted:
 *
 *   piv_ord == 0   entries with piv[k] != k (ord[] is not read)
 *   piv_ord == 1   entries with ord[k] == 1 and piv[k] != k
 *   otherwise      entries with ord[k] == 2 (piv[] is not read)
 *
 * Returns the count.
 */
int count_pivot( int piv_ord, int n, const int *piv, const int *ord )
{
	int total = 0;

	switch ( piv_ord ) {
	case 0:									// Interchanges of any order
		for ( int i = 0; i < n; i++ ) {
			total += ( piv[i] != i );
		}
		break;
	case 1:									// 1-by-1 interchanges
		for ( int i = 0; i < n; i++ ) {
			total += ( ord[i] == 1 && piv[i] != i );
		}
		break;
	default:								// 2-by-2 pivots
		for ( int i = 0; i < n; i++ ) {
			total += ( ord[i] == 2 );
		}
		break;
	}
	return total;
}

/*
 * Diagonal pivot selection.  diag[0:n-1] holds the diagonal entries of an 
 * n-by-n (trailing) matrix whose first row/ column has global index d.  The
 * first entry of largest magnitude is selected; its global index is stored in
 * piv[d] and ord[d] is set to 1 (1-by-1 pivot).  With n <= 0 the result is
 * piv[d] = d.
 */
void eval_pivot_diag( int n, int d, const double *diag, int *piv, int *ord )
{
	int		best = 0;				// Local index of current maximum
	double	best_mag = -1.0;		// Below any magnitude, so entry 0 wins first

	for ( int idx = 0; idx < n; idx++ ) {
		const double mag = fabs( diag[idx] );
		if ( mag > best_mag ) {		// Strict: ties keep the earlier entry
			best_mag = mag;
			best = idx;
		}
	}
	piv[d] = d + best;
	ord[d] = 1;
}

/*
 * Bunch-Kaufman (partial) pivot selection for an outer product L*D*L' 
 * factorization.  A points to the n-by-n trailing sub-matrix (column-major,
 * leading dimension ldim, already updated), whose first row/ column has global
 * index d.  Only entries on and below the diagonal are read.
 *
 * With lambda the largest off-diagonal magnitude in column 0 (attained in row
 * r) and sigma the largest off-diagonal magnitude in row/ column r, the
 * outcome is one of:
 *
 *   1-by-1, no interchange   piv[d] = d,                    ord[d] = 1
 *   1-by-1, interchange r    piv[d] = d+r,                  ord[d] = 1
 *   2-by-2 with rows 0, r    piv[d] = d, piv[d+1] = d+r,    ord[d] = 2,
 *                                                           ord[d+1] = 0
 *
 * The thresholds use the file-level growth parameter alpha.
 */
void eval_pivot_bk( int n, int d, int ldim, const double *A, 
	int *piv, int *ord )
{
	int		cand = 0;				// Row r of largest entry in column 0
	double	lambda = -1.0;

	// lambda = max |A(i,0)| over i = 1..n-1
	for ( int row = 1; row < n; row++ ) {
		const double mag = fabs( A[row] );
		if ( mag > lambda ) {
			lambda = mag;
			cand = row;
		}
	}

	// use_first becomes nonzero once the leading diagonal entry is accepted
	int		use_first = ( lambda <= 0.0 );
	double	a00 = 0.0;

	if ( !use_first ) {
		a00 = fabs( A[0] );
		use_first = ( a00 >= alpha * lambda );
	}

	if ( !use_first ) {
		const double	arr = fabs( A[cand*ldim + cand] );
		double			sigma = -1.0;

		// Only the lower triangle is valid: walk row r left of the diagonal,
		// then column r below the diagonal
		for ( int col = 0; col < cand; col++ ) {
			const double mag = fabs( A[cand + col*ldim] );
			if ( mag > sigma ) {
				sigma = mag;
			}
		}
		for ( int row = cand + 1; row < n; row++ ) {
			const double mag = fabs( A[cand*ldim + row] );
			if ( mag > sigma ) {
				sigma = mag;
			}
		}

		if ( a00 * sigma >= alpha * lambda * lambda ) {
			use_first = 1;
		} else if ( arr >= alpha * sigma ) {
			// Row/ column r becomes the 1-by-1 pivot
			piv[d] = d + cand;
			ord[d] = 1;
			return;
		} else {
			// Rows/ columns 0 and r form a 2-by-2 pivot
			piv[d] = d;
			piv[d+1] = d + cand;
			ord[d] = 2;
			ord[d+1] = 0;
			return;
		}
	}

	// Leading row/ column is the 1-by-1 pivot
	piv[d] = d;
	ord[d] = 1;
}

/*
 * Bounded Bunch-Kaufman (rook) pivot selection for an outer product L*D*L'
 * factorization.  A points to the n-by-n trailing sub-matrix (column-major,
 * leading dimension ldim, already updated), whose first row/ column has global
 * index d.  Only entries on and below the diagonal are read.
 *
 * Starting from column 0, the search repeatedly moves to the row/ column r
 * holding the largest off-diagonal entry of the current column k until either
 * the diagonal entry of r is large enough (1-by-1 pivot r) or the largest
 * off-diagonal magnitudes of k and r agree to within a relative tolerance of
 * 100*DBL_EPSILON (2-by-2 pivot k, r).  Results:
 *
 *   1-by-1   piv[d] = d or d+r,               ord[d] = 1
 *   2-by-2   piv[d] = d+k, piv[d+1] = d+r,    ord[d] = 2, ord[d+1] = 0
 *
 * The thresholds use the file-level growth parameter alpha.
 */
void eval_pivot_bbk( int n, int d, int ldim, const double *A, 
	int *piv, int *ord )
{
	const double tol = 100.0*DBL_EPSILON;

	int		prev = 0;				// Column k already examined
	int		cand = 0;				// Row/ column r under examination
	int		next = 0;				// Position of sigma within row/ column r
	double	lambda = -1.0;

	// lambda = max |A(i,0)| over i = 1..n-1
	for ( int row = 1; row < n; row++ ) {
		const double mag = fabs( A[row] );
		if ( mag > lambda ) {
			lambda = mag;
			cand = row;
		}
	}

	// Column 0 is all zero below the diagonal, or its diagonal dominates
	if ( lambda <= 0.0 || fabs( A[0] ) >= alpha * lambda ) {
		piv[d] = d;
		ord[d] = 1;
		return;
	}

	for ( ;; ) {
		const double	*row_r = A + cand;			// A(r,0)
		const double	*col_r = A + cand*ldim;		// A(0,r)
		const double	arr = fabs( col_r[cand] );
		double			sigma = -1.0;

		// Only the lower triangle is valid: walk row r left of the diagonal,
		// then column r below the diagonal
		for ( int col = 0; col < cand; col++ ) {
			const double mag = fabs( row_r[col*ldim] );
			if ( mag > sigma ) {
				sigma = mag;
				next = col;
			}
		}
		for ( int row = cand + 1; row < n; row++ ) {
			const double mag = fabs( col_r[row] );
			if ( mag > sigma ) {
				sigma = mag;
				next = row;
			}
		}

		// Relative gap between lambda and sigma; below tol they are treated
		// as equal
		const double gap = fabs( lambda - sigma ) / sigma;

		if ( arr >= alpha * sigma ) {
			// Row/ column r becomes the 1-by-1 pivot
			piv[d] = d + cand;
			ord[d] = 1;
			break;
		}
		if ( gap < tol ) {
			// Rows/ columns k and r form a 2-by-2 pivot
			piv[d] = d + prev;
			piv[d+1] = d + cand;
			ord[d] = 2;
			ord[d+1] = 0;
			break;
		}

		// Neither test passed: follow sigma to the next row/ column
		prev = cand;
		lambda = sigma;
		cand = next;
	}
}

/*
 * Bunch-Parlett (complete) pivot selection for an outer product L*D*L' 
 * factorization.  A points to the n-by-n trailing sub-matrix (column-major,
 * leading dimension ldim, already updated), whose first row/ column has global
 * index d.  The whole lower triangle is scanned for the largest diagonal
 * magnitude nu (at index t) and the largest off-diagonal magnitude mu (at row
 * r, column s, r > s).  Results:
 *
 *   nu >= alpha*mu   1-by-1: piv[d] = d+t,                  ord[d] = 1
 *   otherwise        2-by-2: piv[d] = d+s, piv[d+1] = d+r,  ord[d] = 2,
 *                                                           ord[d+1] = 0
 *   all zero         1-by-1: piv[d] = d,                    ord[d] = 1
 *
 * alpha is the file-level growth parameter.
 */
void eval_pivot_bp( int n, int d, int ldim, const double *A, 
	int *piv, int *ord )
{
	int		diag_at = 0;			// t
	int		off_row = 0;			// r
	int		off_col = 0;			// s
	double	diag_max = -1.0;		// nu
	double	off_max = -1.0;			// mu

	for ( int col = 0; col < n; col++ ) {
		const double *column = A + col*ldim;

		const double dmag = fabs( column[col] );
		if ( dmag > diag_max ) {
			diag_max = dmag;
			diag_at = col;
		}
		// Strictly below the diagonal in this column
		for ( int row = col + 1; row < n; row++ ) {
			const double mag = fabs( column[row] );
			if ( mag > off_max ) {
				off_max = mag;
				off_row = row;
				off_col = col;
			}
		}
	}

	if ( off_max <= 0.0 && diag_max <= 0.0 ) {
		// Nothing nonzero found: keep the leading row/ column
		piv[d] = d;
		ord[d] = 1;
		return;
	}

	if ( diag_max >= alpha * off_max ) {
		// Largest diagonal entry becomes the 1-by-1 pivot
		piv[d] = d + diag_at;
		ord[d] = 1;
	} else {
		// Rows/ columns of the largest off-diagonal entry form a 2-by-2 pivot
		piv[d] = d + off_col;
		piv[d+1] = d + off_row;
		ord[d] = 2;
		ord[d+1] = 0;
	}
}

/*
 * Bunch-Kaufman (partial) pivot selection for a SAXPY style L*D*L' 
 * factorization, where the n-by-n trailing sub-matrix A (column-major, leading
 * dimension ldim, first row/ column at global index d) has NOT yet been
 * updated.  Each row/ column that is examined is first copied into the
 * workspace Q and reduced with reduce_ldlt_vector() using L, D and ord[] for
 * the d columns already factored.
 *
 * Workspace layout: Q[0:n-1] receives reduced column 0; Q[ldim:ldim+n-1] 
 * receives reduced row/ column r (the row of the largest off-diagonal entry of
 * reduced column 0) when it is needed.  On return:
 *
 *   1-by-1, no interchange   piv[d] = d, ord[d] = 1; Q holds reduced column 0
 *   1-by-1, interchange r    piv[d] = d+r, ord[d] = 1; reduced row/ column r
 *                            is copied into Q[0:n-1]
 *   2-by-2 with rows 0, r    piv[d] = d, piv[d+1] = d+r, ord[d] = 2, 
 *                            ord[d+1] = 0; both reduced vectors stay in Q
 *
 * The thresholds use the file-level growth parameter alpha.
 */
void eval_pivot_reduce_bk( int n, int d, int ldim, const double *L, 
	const double *D, const double *A, double *Q, int *piv, int *ord )
{
	int		cand = 0;				// Row r of largest entry in reduced column 0
	double	lambda = -1.0;

	// Reduced column 0 -> Q(:,0)
	for ( int row = 0; row < n; row++ ) {
		Q[row] = A[row];
	}
	reduce_ldlt_vector( n, d, cand, ord, ldim, L, D, Q );

	// lambda = max |Q(i,0)| over i = 1..n-1
	for ( int row = 1; row < n; row++ ) {
		const double mag = fabs( Q[row] );
		if ( mag > lambda ) {
			lambda = mag;
			cand = row;
		}
	}

	if ( lambda <= 0.0 ) {
		// Reduced column 0 is zero below the diagonal
		piv[d] = d;
		ord[d] = 1;
		return;
	}

	const double q00 = fabs( Q[0] );
	if ( q00 >= alpha * lambda ) {
		// Leading diagonal dominates
		piv[d] = d;
		ord[d] = 1;
		return;
	}

	// Assemble row/ column r in Q(:,1).  Only the lower triangle of A is 
	// valid, so the part above the diagonal of column r comes from row r.
	double *Q_r = Q + ldim;
	for ( int col = 0; col < cand; col++ ) {
		Q_r[col] = A[cand + col*ldim];
	}
	memcpy( Q_r + cand, A + cand*ldim + cand, (n-cand)*sizeof(double) );
	reduce_ldlt_vector( n, d, cand, ord, ldim, L, D, Q_r );

	const double qrr = fabs( Q_r[cand] );

	// sigma = largest off-diagonal magnitude in reduced row/ column r
	double sigma = -1.0;
	for ( int idx = 0; idx < n; idx++ ) {
		if ( idx == cand ) {
			continue;
		}
		const double mag = fabs( Q_r[idx] );
		if ( mag > sigma ) {
			sigma = mag;
		}
	}

	if ( q00 * sigma >= alpha * lambda * lambda ) {
		// Leading row/ column is the 1-by-1 pivot
		piv[d] = d;
		ord[d] = 1;
#if defined(LDLTFACT) && defined(PROFILE)
	xtra_work++;
#endif
	} else if ( qrr >= alpha * sigma ) {
		// Row/ column r is the 1-by-1 pivot; hand its reduced form back in
		// the first column of Q
		piv[d] = d + cand;
		ord[d] = 1;
		memcpy( Q, Q_r, n*sizeof(double) );
#if defined(LDLTFACT) && defined(PROFILE)
	xtra_work++;
#endif
	} else {
		// Rows/ columns 0 and r form a 2-by-2 pivot
		piv[d] = d;
		piv[d+1] = d + cand;
		ord[d] = 2;
		ord[d+1] = 0;
	}
}

/*
 * Bounded Bunch-Kaufman (rook) pivot selection for a SAXPY style L*D*L'
 * factorization, where the n-by-n trailing sub-matrix A (column-major, leading
 * dimension ldim, first row/ column at global index d) has NOT yet been
 * updated.  Each row/ column that is examined is first copied into the
 * workspace Q and reduced with reduce_ldlt_vector() using L, D and ord[] for
 * the d columns already factored.
 *
 * Workspace layout: Q[0:n-1] holds the reduced column k examined last (column
 * 0 initially); Q[ldim:ldim+n-1] holds the reduced row/ column r currently
 * under examination.  The search stops when the diagonal of r is large enough
 * (1-by-1 pivot r) or when the largest off-diagonal magnitudes of k and r agree
 * to within a relative tolerance of 100*DBL_EPSILON (2-by-2 pivot k, r).
 * On return:
 *
 *   1-by-1   piv[d] = d (Q holds reduced column 0) or piv[d] = d+r (reduced
 *            row/ column r copied into Q[0:n-1]); ord[d] = 1
 *   2-by-2   piv[d] = d+k, piv[d+1] = d+r, ord[d] = 2, ord[d+1] = 0; the
 *            reduced vectors for k and r are in the two columns of Q
 *
 * The thresholds use the file-level growth parameter alpha.
 */
void eval_pivot_reduce_bbk( int n, int d, int ldim, const double *L, 
	const double *D, const double *A, double *Q, int *piv, int *ord )
{
	const double	tol = 100.0*DBL_EPSILON;

	int		prev = 0;				// Column k whose reduced form is in Q(:,0)
	int		cand = 0;				// Row/ column r under examination
	int		next = 0;				// Position of sigma within row/ column r
	double	lambda = -1.0;

	// Reduced column 0 -> Q(:,0)
	for ( int row = 0; row < n; row++ ) {
		Q[row] = A[row];
	}
	reduce_ldlt_vector( n, d, cand, ord, ldim, L, D, Q );

	// lambda = max |Q(i,0)| over i = 1..n-1
	for ( int row = 1; row < n; row++ ) {
		const double mag = fabs( Q[row] );
		if ( mag > lambda ) {
			lambda = mag;
			cand = row;
		}
	}

	// Reduced column 0 is zero below the diagonal, or its diagonal dominates
	if ( lambda <= 0.0 || fabs( Q[0] ) >= alpha * lambda ) {
		piv[d] = d;
		ord[d] = 1;
		return;
	}

	double *Q_r = Q + ldim;

	for ( ;; ) {
		double sigma = -1.0;

		// Assemble row/ column r in Q(:,1).  Only the lower triangle of A is
		// valid, so the part above the diagonal of column r comes from row r.
		for ( int col = 0; col < cand; col++ ) {
			Q_r[col] = A[cand + col*ldim];
		}
		memcpy( Q_r + cand, A + cand*ldim + cand, (n-cand)*sizeof(double) );
		reduce_ldlt_vector( n, d, cand, ord, ldim, L, D, Q_r );

		const double qrr = fabs( Q_r[cand] );

		// sigma = largest off-diagonal magnitude in reduced row/ column r
		for ( int idx = 0; idx < n; idx++ ) {
			if ( idx == cand ) {
				continue;
			}
			const double mag = fabs( Q_r[idx] );
			if ( mag > sigma ) {
				sigma = mag;
				next = idx;
			}
		}

		// Relative gap between lambda and sigma; below tol they are treated
		// as equal
		const double gap = fabs( lambda - sigma ) / sigma;

		if ( qrr >= alpha * sigma ) {
			// Row/ column r is the 1-by-1 pivot; hand its reduced form back
			// in the first column of Q
			piv[d] = d + cand;
			ord[d] = 1;
			memcpy( Q, Q_r, n*sizeof(double) );
#if defined(LDLTFACT) && defined(PROFILE)
	xtra_work++;
#endif
			break;
		}
		if ( gap < tol ) {
			// Rows/ columns k and r form a 2-by-2 pivot
			piv[d] = d + prev;
			piv[d+1] = d + cand;
			ord[d] = 2;
			ord[d+1] = 0;
			break;
		}

		// Neither test passed: r becomes the new k (its reduced form moves to
		// Q(:,0)) and the search follows sigma
		prev = cand;
		lambda = sigma;
		memcpy( Q, Q_r, n*sizeof(double) );
		cand = next;
#if defined(LDLTFACT) && defined(PROFILE)
	xtra_work++;
#endif
	}
}

/*
 * Bunch-Kaufman (partial) pivot selection for a SAXPY style L*D*L' 
 * factorization that uses BLAS, where the n-by-n trailing sub-matrix A 
 * (column-major, leading dimension ldim, first row/ column at global index d)
 * has NOT yet been updated.  Each row/ column that is examined is first copied
 * into the workspace Q (with dcopy_) and reduced with 
 * reduce_ldlt_vector_blas(), to which L, M and ord[] are passed unchanged.
 *
 * Workspace layout: Q[0:n-1] receives reduced column 0; Q[ldim:ldim+n-1] 
 * receives reduced row/ column r (the row of the largest off-diagonal entry of
 * reduced column 0) when it is needed.  On return:
 *
 *   1-by-1, no interchange   piv[d] = d, ord[d] = 1; Q holds reduced column 0
 *   1-by-1, interchange r    piv[d] = d+r, ord[d] = 1; reduced row/ column r
 *                            is copied into Q[0:n-1]
 *   2-by-2 with rows 0, r    piv[d] = d, piv[d+1] = d+r, ord[d] = 2, 
 *                            ord[d+1] = 0; both reduced vectors stay in Q
 *
 * The thresholds use the file-level growth parameter alpha.
 */
void eval_pivot_blas_bk( int n, int d, int ldim, const double *L, 
	const double *M, const double *A, double *Q, int *piv, int *ord )
{
	const int	unit = 1;			// Unit stride for dcopy_

	int		cand = 0;				// Row r of largest entry in reduced column 0
	int		tail;					// Number of entries on/ below diagonal of r
	double	lambda = -1.0;

	// Reduced column 0 -> Q(:,0)
	for ( int row = 0; row < n; row++ ) {
		Q[row] = A[row];
	}
	reduce_ldlt_vector_blas( n, d, cand, ord, ldim, L, M, Q );

	// lambda = max |Q(i,0)| over i = 1..n-1
	for ( int row = 1; row < n; row++ ) {
		const double mag = fabs( Q[row] );
		if ( mag > lambda ) {
			lambda = mag;
			cand = row;
		}
	}

	if ( lambda <= 0.0 ) {
		// Reduced column 0 is zero below the diagonal
		piv[d] = d;
		ord[d] = 1;
		return;
	}

	const double q00 = fabs( Q[0] );
	if ( q00 >= alpha * lambda ) {
		// Leading diagonal dominates
		piv[d] = d;
		ord[d] = 1;
		return;
	}

	// Assemble row/ column r in Q(:,1).  Only the lower triangle of A is 
	// valid, so the part above the diagonal of column r comes from row r
	// (stride ldim) and the rest from column r (stride 1).
	double *Q_r = Q + ldim;
	tail = n - cand;
	dcopy_( &cand, A + cand, &ldim, Q_r, &unit );
	dcopy_( &tail, A + cand*ldim + cand, &unit, Q_r + cand, &unit );
	reduce_ldlt_vector_blas( n, d, cand, ord, ldim, L, M, Q_r );

	const double qrr = fabs( Q_r[cand] );

	// sigma = largest off-diagonal magnitude in reduced row/ column r
	double sigma = -1.0;
	for ( int idx = 0; idx < n; idx++ ) {
		if ( idx == cand ) {
			continue;
		}
		const double mag = fabs( Q_r[idx] );
		if ( mag > sigma ) {
			sigma = mag;
		}
	}

	if ( q00 * sigma >= alpha * lambda * lambda ) {
		// Leading row/ column is the 1-by-1 pivot
		piv[d] = d;
		ord[d] = 1;
#if defined(LDLTFACT) && defined(PROFILE)
	xtra_work++;
#endif
	} else if ( qrr >= alpha * sigma ) {
		// Row/ column r is the 1-by-1 pivot; hand its reduced form back in
		// the first column of Q
		piv[d] = d + cand;
		ord[d] = 1;
		dcopy_( &n, Q_r, &unit, Q, &unit );
#if defined(LDLTFACT) && defined(PROFILE)
	xtra_work++;
#endif
	} else {
		// Rows/ columns 0 and r form a 2-by-2 pivot
		piv[d] = d;
		piv[d+1] = d + cand;
		ord[d] = 2;
		ord[d+1] = 0;
	}
}

/*
 * Bounded Bunch-Kaufman (rook) pivot selection for a SAXPY style L*D*L'
 * factorization that uses BLAS, where the n-by-n trailing sub-matrix A 
 * (column-major, leading dimension ldim, first row/ column at global index d)
 * has NOT yet been updated.  Each row/ column that is examined is first copied
 * into the workspace Q (with dcopy_) and reduced with 
 * reduce_ldlt_vector_blas(), to which L, M and ord[] are passed unchanged.
 *
 * Workspace layout: Q[0:n-1] holds the reduced column k examined last (column
 * 0 initially); Q[ldim:ldim+n-1] holds the reduced row/ column r currently
 * under examination.  The search stops when the diagonal of r is large enough
 * (1-by-1 pivot r) or when the largest off-diagonal magnitudes of k and r agree
 * to within a relative tolerance of 100*DBL_EPSILON (2-by-2 pivot k, r).
 * On return:
 *
 *   1-by-1   piv[d] = d (Q holds reduced column 0) or piv[d] = d+r (reduced
 *            row/ column r copied into Q[0:n-1]); ord[d] = 1
 *   2-by-2   piv[d] = d+k, piv[d+1] = d+r, ord[d] = 2, ord[d+1] = 0; the
 *            reduced vectors for k and r are in the two columns of Q
 *
 * The thresholds use the file-level growth parameter alpha.
 */
void eval_pivot_blas_bbk( int n, int d, int ldim, const double *L, 
	const double *M, const double *A, double *Q, int *piv, int *ord )
{
	const int		unit = 1;		// Unit stride for dcopy_
	const double	tol = 100.0*DBL_EPSILON;

	int		prev = 0;				// Column k whose reduced form is in Q(:,0)
	int		cand = 0;				// Row/ column r under examination
	int		next = 0;				// Position of sigma within row/ column r
	int		tail;					// Number of entries on/ below diagonal of r
	double	lambda = -1.0;

	// Reduced column 0 -> Q(:,0)
	for ( int row = 0; row < n; row++ ) {
		Q[row] = A[row];
	}
	reduce_ldlt_vector_blas( n, d, cand, ord, ldim, L, M, Q );

	// lambda = max |Q(i,0)| over i = 1..n-1
	for ( int row = 1; row < n; row++ ) {
		const double mag = fabs( Q[row] );
		if ( mag > lambda ) {
			lambda = mag;
			cand = row;
		}
	}

	// Reduced column 0 is zero below the diagonal, or its diagonal dominates
	if ( lambda <= 0.0 || fabs( Q[0] ) >= alpha * lambda ) {
		piv[d] = d;
		ord[d] = 1;
		return;
	}

	double *Q_r = Q + ldim;

	for ( ;; ) {
		double sigma = -1.0;

		// Assemble row/ column r in Q(:,1).  Only the lower triangle of A is
		// valid, so the part above the diagonal of column r comes from row r
		// (stride ldim) and the rest from column r (stride 1).
		tail = n - cand;
		dcopy_( &cand, A + cand, &ldim, Q_r, &unit );
		dcopy_( &tail, A + cand*ldim + cand, &unit, Q_r + cand, &unit );
		reduce_ldlt_vector_blas( n, d, cand, ord, ldim, L, M, Q_r );

		const double qrr = fabs( Q_r[cand] );

		// sigma = largest off-diagonal magnitude in reduced row/ column r
		for ( int idx = 0; idx < n; idx++ ) {
			if ( idx == cand ) {
				continue;
			}
			const double mag = fabs( Q_r[idx] );
			if ( mag > sigma ) {
				sigma = mag;
				next = idx;
			}
		}

		// Relative gap between lambda and sigma; below tol they are treated
		// as equal
		const double gap = fabs( lambda - sigma ) / sigma;

		if ( qrr >= alpha * sigma ) {
			// Row/ column r is the 1-by-1 pivot; hand its reduced form back
			// in the first column of Q
			piv[d] = d + cand;
			ord[d] = 1;
			dcopy_( &n, Q_r, &unit, Q, &unit );
#if defined(LDLTFACT) && defined(PROFILE)
	xtra_work++;
#endif
			break;
		}
		if ( gap < tol ) {
			// Rows/ columns k and r form a 2-by-2 pivot
			piv[d] = d + prev;
			piv[d+1] = d + cand;
			ord[d] = 2;
			ord[d+1] = 0;
			break;
		}

		// Neither test passed: r becomes the new k (its reduced form moves to
		// Q(:,0)) and the search follows sigma
		prev = cand;
		lambda = sigma;
		dcopy_( &n, Q_r, &unit, Q, &unit );
		cand = next;
#if defined(LDLTFACT) && defined(PROFILE)
	xtra_work++;
#endif
	}
}

/*
 * Symmetric interchange of rows/ columns k and r (k <= r expected) of an 
 * n-by-n symmetric matrix A stored in column-major order with leading 
 * dimension ldim, i.e. A <- P*A*P' for the permutation P that swaps k and r.
 * Only entries on and below the diagonal are read or written; entry A(r,k)
 * maps to itself and is left alone.
 */ 
void pivot_sym( int n, int k, int r, int ldim, double *A ) 
{
	double tmp;

	// Rows k and r, columns 0..k-1
	for ( int col = 0; col < k; col++ ) {
		double *in_k = &A[k + col*ldim];
		double *in_r = &A[r + col*ldim];
		tmp = *in_k;
		*in_k = *in_r;
		*in_r = tmp;
	}

	// Diagonal entries A(k,k) and A(r,r)
	tmp = A[k + k*ldim];
	A[k + k*ldim] = A[r + r*ldim];
	A[r + r*ldim] = tmp;

	// Column k between the diagonals <-> row r between the diagonals:
	// A(i,k) with A(r,i) for i = k+1..r-1
	for ( int mid = k + 1; mid < r; mid++ ) {
		double *in_col = &A[mid + k*ldim];
		double *in_row = &A[r + mid*ldim];
		tmp = *in_col;
		*in_col = *in_row;
		*in_row = tmp;
	}

	// Columns k and r, rows r+1..n-1
	for ( int row = r + 1; row < n; row++ ) {
		double *in_k = &A[row + k*ldim];
		double *in_r = &A[row + r*ldim];
		tmp = *in_k;
		*in_k = *in_r;
		*in_r = tmp;
	}
}

/*
 * Symmetric interchange of rows/ columns k and r of an n-by-n symmetric matrix
 * A (column-major, leading dimension ldim) combined with installing an already
 * reduced pivot column.  vec[0:n-1] holds the reduced row/ column that is to
 * become column k.  When k != r:
 *
 *   - rows k and r are swapped in columns 0..k-1
 *   - vec[k] and vec[r] are swapped
 *   - the old column k is moved into the position of row/ column r: 
 *     A(r,r) <- A(k,k), A(r,i) <- A(i,k) for k < i < r, and 
 *     A(i,r) <- A(i,k) for i > r
 *
 * In every case A(k:n-1,k) is finally overwritten with vec[k:n-1].  Only 
 * entries on and below the diagonal of A are touched.
 */
void pivot_sym_reduce( int n, int k, int r, double *vec, int ldim, double *A ) 
{
	double *col_k = A + k*ldim;

	if ( r != k ) {
		double *col_r = A + r*ldim;

		// Rows k and r, columns 0..k-1
		for ( int col = 0; col < k; col++ ) {
			const double tmp = A[k + col*ldim];
			A[k + col*ldim] = A[r + col*ldim];
			A[r + col*ldim] = tmp;
		}

		// Reduced diagonal entries trade places; the unreduced diagonal of k
		// moves to position (r,r)
		const double held = vec[k];
		vec[k] = vec[r];
		vec[r] = held;
		col_r[r] = col_k[k];

		// Column k between the diagonals is transposed into row r
		for ( int mid = k + 1; mid < r; mid++ ) {
			A[r + mid*ldim] = col_k[mid];
		}

		// Column k below row r moves to column r
		memcpy( col_r + r + 1, col_k + r + 1, (n-r-1)*sizeof(double) );
	}

	// Install the reduced vector as column k on and below the diagonal
	memcpy( col_k + k, vec + k, (n-k)*sizeof(double) );
}

/*
 * Symmetric interchange of rows/ columns k and r of an n-by-n symmetric matrix
 * A and of the working array W, using BLAS dswap_/ dcopy_.  Both arrays are
 * column-major with leading dimension ldim.  Nothing is done when k == r.
 * Otherwise:
 *
 *   - rows k and r are swapped in columns 0..k-1 of both A and W
 *   - W(k,k) and W(r,k) are swapped
 *   - the old column k of A is moved into the position of row/ column r:
 *     A(r,r) <- A(k,k), A(r,i) <- A(i,k) for k < i < r, and 
 *     A(i,r) <- A(i,k) for i > r
 *
 * Column k of A itself is not rewritten here.  Only entries on and below the
 * diagonal of A are touched.
 */
void pivot_sym_blas( int n, int k, int r, int ldim, double *W, double *A ) 
{
	const int unit = 1;				// Unit stride for dcopy_

	if ( k == r ) {
		return;
	}

	double	*col_k = A + k*ldim;
	double	*col_r = A + r*ldim;
	double	*wcol_k = W + k*ldim;
	int		count;

	// Rows k and r, columns 0..k-1, in A and then in W (stride ldim walks 
	// along a row)
	dswap_( &k, A + k, &ldim, A + r, &ldim );
	dswap_( &k, W + k, &ldim, W + r, &ldim );

	// Reduced diagonal entries in column k of W trade places; the diagonal 
	// of k in A moves to position (r,r)
	const double held = wcol_k[k];
	wcol_k[k] = wcol_k[r];
	wcol_k[r] = held;
	col_r[r] = col_k[k];

	// Column k between the diagonals is transposed into row r:
	// A(r,k+1:r-1) <- A(k+1:r-1,k)
	count = r - k - 1;
	dcopy_( &count, col_k + k + 1, &unit, A + r + k*ldim + ldim, &ldim );

	// Column k below row r moves to column r: A(r+1:n-1,r) <- A(r+1:n-1,k)
	count = n - r - 1;
	dcopy_( &count, col_k + r + 1, &unit, col_r + r + 1, &unit );
}

/*
 * Applies the accumulated trailing sub-matrix update to one row/ column of a
 * symmetric indefinite matrix during factorization.  With P*A*P' partitioned
 * as [A_00^, A_01^; A_10^, A_11^] =
 * [L_00, 0; L_10, L_11] * [D_00, 0; 0, D_11] * [L_00', L_10'; 0, L_11'],
 * the reduced trailing sub-matrix is A_11^ - L_10*D_00*L_10'.  vec[0:m-1]
 * holds row/ column r of A_11^ on entry and the same row/ column of the 
 * reduced sub-matrix on return:
 *
 *   vec <- vec - L_10 * D_00 * L_10(r,:)'
 *
 *   m, n   L points to the m-by-n block L_10
 *   r      row of L_10 that supplies the right-hand factor
 *   ord    ord[k] == 1 marks a 1-by-1 block of D_00 at column k; any other
 *          value is treated as the start of a 2-by-2 block covering columns
 *          k and k+1
 *   D      pointer to the n-by-n block D_00 (lower triangle is read)
 *
 * L and D are column-major with leading dimension ldim.  When both LDLTFACT
 * and PROFILE are defined the elapsed time is added to tm_red_vec.
 */
void reduce_ldlt_vector( int m, int n, int r, int *ord, int ldim, 
	const double *L, const double *D, double *vec )
{
#if defined(LDLTFACT) && defined(PROFILE)
	struct	timespec sta_red_vec, end_red_vec;

	get_time( &sta_red_vec );
#endif

	int col = 0;

	while ( col < n ) {
		const double *lcol0 = L + col*ldim;			// L(0,col)
		const double *dblk = D + col + col*ldim;	// D(col,col)

		if ( ord[col] == 1 ) {
			// 1-by-1 block: vec -= L(:,col) * D(col,col) * L(r,col)
			const double dkk = dblk[0];
			const double lrk = L[r + col*ldim];
			for ( int row = 0; row < m; row++ ) {
				vec[row] -= lcol0[row] * dkk * lrk;
			}
			col += 1;
		} else {
			// 2-by-2 block: vec -= L(:,col:col+1) * D_blk * L(r,col:col+1)'
			const double *lcol1 = lcol0 + ldim;		// L(0,col+1)
			const double d00 = dblk[0];				// D(col,col)
			const double d10 = dblk[1];				// D(col+1,col)
			const double d11 = dblk[1 + ldim];		// D(col+1,col+1)
			const double lr0 = L[r + col*ldim];		// L(r,col)
			const double lr1 = L[r + col*ldim + ldim];	// L(r,col+1)
			for ( int row = 0; row < m; row++ ) {
				vec[row] -=	lcol0[row] * (d00*lr0 + d10*lr1) +
							lcol1[row] * (d10*lr0 + d11*lr1);
			}
			col += 2;
		}
	}
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_red_vec );
	tm_red_vec += timespec_diff( sta_red_vec, end_red_vec );
#endif
}

/*
 * BLAS counterpart of the cumulative trailing sub-matrix update of a single
 * row/ column of a symmetric indefinite matrix held in vec[].  With the
 * partial factorization P*A*P' = [L_00, 0; L_10, L_11] * [D_00, 0; 0, D_11] *
 * [L_00', L_10'; 0, L_11'], the reduced trailing sub-matrix is
 * A_11^ - L_10*D_00*L_10'.  Here M is the m-by-n working array that already
 * holds the product L_10*D_00, so the update collapses to the matrix-vector
 * product
 *     vec(0:m-1) -= M(0:m-1,0:n-1) * L(r,0:n-1)'
 * which is delegated to BLAS routine DGEMV.  L and M are stored in 
 * column-major order with leading dimension ldim; row r of L is therefore 
 * read with stride ldim.  Argument ord[] is not referenced by this routine.
 */
void reduce_ldlt_vector_blas( int m, int n, int r, int *ord, int ldim, 
	const double *L, const double *M, double *vec )
{
#if defined(LDLTFACT) && defined(PROFILE)
	struct	timespec sta_red_vec, end_red_vec;

	get_time( &sta_red_vec );
#endif

	const char		op = 'N';			// Use M as stored (no transpose)
	const int		row_stride = ldim;	// Stride between entries of row r of L
	const int		vec_stride = 1;		// vec[] is contiguous
	const double	alpha = -1.0;
	const double	beta = 1.0;

	// vec = beta*vec + alpha*M*L(r,:)'
	dgemv_( &op, &m, &n, &alpha, M, &ldim, L + r, &row_stride, &beta, vec, 
		&vec_stride );

#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_red_vec );
	tm_red_vec += timespec_diff( sta_red_vec, end_red_vec );
#endif
}

/*
 * SAXPY-style trailing sub-matrix update A = A - L*D*T' for one block.
 *   m, n  dimensions of sub-matrix A (m-by-n)
 *   p     number of columns of L and T, and the order of D
 *   diag  non-zero when A is a diagonal block: column j is then updated from 
 *         row j downwards only, since by symmetry the entries above the 
 *         diagonal are not needed; zero updates every row of every column
 *   ord   ord[k] == 1 marks a 1-by-1 diagonal block of D starting at column 
 *         k; any other value is treated as a 2-by-2 block spanning k and k+1
 *   ldim  leading dimension of the column-major arrays A, L, D and T
 *   L, T  m-by-p and n-by-p blocks of unit lower triangular matrices
 *   D     p-by-p block of the block diagonal matrix; only the diagonal and
 *         sub-diagonal entries of each 2-by-2 block are read
 *   M     not referenced here; present so the signature matches the BLAS 
 *         variant
 */
void reduce_ldlt_matrix( int m, int n, int p, int diag, const int *ord, 
	int ldim, const double *L, const double *D, const double *M, 
	const double *T, double *A )
{
	for ( int col = 0; col < n; col++ ) {
		double *acol = &A[col*ldim];				// Column col of A
		const int first = diag ? col : 0;			// First row to update

		int piv = 0;
		while ( piv < p ) {
			const double *l0 = &L[piv*ldim];			// Column piv of L
			const double *dblk = &D[piv + piv*ldim];	// Block at D(piv,piv)
			const double *trow = &T[col + piv*ldim];	// Element T(col,piv)

			if ( ord[piv] == 1 ) {					// 1-by-1 pivot
				const double d = dblk[0];
				const double t = trow[0];
				for ( int row = first; row < m; row++ ) {
					acol[row] -= l0[row] * d * t;
				}
				piv += 1;
			} else {								// 2-by-2 pivot
				const double d00 = dblk[0];			// D(piv,piv)
				const double d10 = dblk[1];			// D(piv+1,piv)
				const double d11 = dblk[1 + ldim];	// D(piv+1,piv+1)
				const double t0 = trow[0];			// T(col,piv)
				const double t1 = trow[ldim];		// T(col,piv+1)
				const double *l1 = l0 + ldim;		// Column piv+1 of L
				for ( int row = first; row < m; row++ ) {
					acol[row] -=	l0[row] * (d00*t0 + d10*t1) +
									l1[row] * (d10*t0 + d11*t1);
				}
				piv += 2;
			}
		}
	}
}

/*
 * BLAS-based trailing sub-matrix update A = A - L*D*T' = A - M*T' for one 
 * block, where A is m-by-n, M is the m-by-p array holding the product L*D, and
 * T is an n-by-p block of a unit lower triangular matrix.  A, M and T are 
 * stored in column-major order with leading dimension ldim.
 *   diag != 0  A is a diagonal block; column j is updated from row j 
 *              downwards with one DGEMV call per column, so that only 
 *              elements on or below the diagonal are touched.  The row count 
 *              is derived from n, i.e., the block is taken to be square.
 *   diag == 0  A is a rectangular block updated with a single DGEMM call.
 * Arguments ord, L and D are not referenced here; they keep the signature 
 * identical to the non-BLAS variant.
 */
void reduce_ldlt_matrix_blas( int m, int n, int p, int diag, const int *ord, 
	int ldim, const double *L, const double *D, const double *M, 
	const double *T, double *A )
{
	const char		transpose = 'T';
	const char		as_stored = 'N';
	const int		row_stride = ldim;	// Stride between entries of a row of T
	const int		col_stride = 1;		// Columns of A are contiguous
	const double	alpha = -1.0;
	const double	beta = 1.0;

	if ( !diag ) {
		// Rectangular block: A = A - M*T'
		dgemm_( &as_stored, &transpose, &m, &n, &p, &alpha, M, &ldim, T, &ldim,
			&beta, A, &ldim );
		return;
	}

	// Diagonal block: A(j:n-1,j) -= M(j:n-1,0:p-1) * T(j,0:p-1)'
	for ( int col = 0; col < n; col++ ) {
		int rows = n - col;
		dgemv_( &as_stored, &rows, &p, &alpha, M + col, &ldim, T + col, 
			&row_stride, &beta, A + col + col*ldim, &col_stride );
	}
}

/*
 * Blocked trailing sub-matrix update A = A - L*D*L', where A is m-by-m, L is 
 * an m-by-n column block of a unit lower triangular matrix and D is an n-by-n
 * block of a block diagonal matrix with block order 1 or 2 (given by ord[]).
 * M holds the product L*D and is forwarded to the kernel alongside L and D.
 * All matrices are stored in column-major order with leading dimension ldim.
 *   blas  non-zero selects the BLAS kernel reduce_ldlt_matrix_blas(), zero 
 *         selects the SAXPY kernel reduce_ldlt_matrix()
 *   bdim  blocking parameter: A is tiled into blocks of at most bdim-by-bdim 
 *         and the n columns of L into panels of at most bdim columns
 * Because of symmetry only diagonal blocks and blocks below the diagonal are
 * visited; the kernel is told when a block lies on the diagonal.
 */
void reduce_ldlt_mat_blk( int blas, int m, int n, const int *ord, int bdim, 
	int ldim, const double *L, const double *D, const double *M, double *A )
{
	void 	(*kernel)( int m, int n, int p, int diag, const int *ord, 
				int ldim, const double *L, const double *D, const double *M, 
				const double *T, double *A ) 
				= blas ? reduce_ldlt_matrix_blas : reduce_ldlt_matrix;

	for ( int jb = 0; jb < m; jb += bdim ) {
		// Width of block column jb of A (last block may be narrower)
		const int ncol = (m - jb < bdim) ? (m - jb) : bdim;

		for ( int kb = 0; kb < n; kb += bdim ) {
			// Width of panel kb of L, which is also the order of block Dkk
			const int npiv = (n - kb < bdim) ? (n - kb) : bdim;
			// Block Dkk, and block Ljk which plays the role of the transposed
			// factor L'kj
			const double *d_blk = D + kb + kb*ldim;
			const double *t_blk = L + jb + kb*ldim;

			for ( int ib = jb; ib < m; ib += bdim ) {
				// Height of block row ib of A
				const int nrow = (m - ib < bdim) ? (m - ib) : bdim;
				// The first block visited in each block column is the 
				// diagonal block
				const int on_diag = (ib == jb);

				kernel( nrow, ncol, npiv, on_diag, &ord[kb], ldim, 
					L + ib + kb*ldim, d_blk, M + ib + kb*ldim, t_blk, 
					A + ib + jb*ldim );
			}
		}
	}
}

/*
 * Rectangular SAXPY-style (jki indexing) symmetric indefinite factorization 
 * of a column block.  The leading *n columns of the m-row matrix A (leading 
 * dimension ldim) are factored such that P*A*P' = L*D*L', with L unit lower
 * triangular, D block diagonal with block order 1 or 2, and P a permutation.
 * Row/ column k is interchanged with row/ column piv[k], and ord[k] gives the 
 * order of the diagonal block that starts at k.  L and D overwrite A on and 
 * below the diagonal.
 *   pivot  'B' selects eval_pivot_reduce_bbk(); every other value selects 
 *          eval_pivot_reduce_bk()
 *   n      on entry the number of columns to factor; on exit the number 
 *          actually factored, which is one more than requested when a 2-by-2 
 *          pivot starts at the last requested column
 *   W      working array; its first two columns (stride ldim) receive the 
 *          reduced candidate pivot column(s) from the pivot selection routine
 * A pivot block is used as a divisor without any check for singularity.
 */
void ldlt_factor( char pivot, int m, int *n, int *piv, int *ord, int ldim, 
	double *A, double *W )
{
#if defined(LDLTFACT) && defined(PROFILE)
	struct	timespec sta_fact_piv, end_fact_piv;
#endif

	double	*const D = A;			// D is stored on the diagonal of A
	int		col = 0;

	while ( col < *n ) {
		double *lrow = A + col;				// Points to element L(col,0)
		double *acol = A + col*ldim;		// Points to element A(0,col)
		double *diag = acol + col;			// Points to element A(col,col)

		// Select pivot with the strategy requested by the caller
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &sta_fact_piv );
#endif
		if ( pivot == 'B' ) {
			eval_pivot_reduce_bbk( m-col, col, ldim, lrow, D, diag, W+col, 
				piv, ord );
		} else {
			eval_pivot_reduce_bk( m-col, col, ldim, lrow, D, diag, W+col, 
				piv, ord );
		}
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_fact_piv );
	tm_fact_piv += timespec_diff( sta_fact_piv, end_fact_piv );
#endif

		const int two_by_two = ( ord[col] != 1 );

		// Symmetric pivoting with the reduced row(s)/ column(s) returned in W.
		// The first interchange is common to both pivot orders
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &sta_fact_piv );
#endif
		pivot_sym_reduce( m, col, piv[col], W, ldim, A );
		if ( two_by_two ) {
			// Mirror the first interchange in the second column of W, then
			// apply the second interchange using that column
			W[ldim + piv[col]] = W[ldim + col];
			pivot_sym_reduce( m, col+1, piv[col+1], W+ldim, ldim, A );
		}
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_fact_piv );
	tm_fact_piv += timespec_diff( sta_fact_piv, end_fact_piv );
#endif

		if ( !two_by_two ) {
			// 1-by-1 pivot: scale the sub-diagonal of column col by 1/D(col,col)
			const double d = *diag;
			for ( int row = col+1; row < m; row++ ) {
				acol[row] /= d;
			}
			col += 1;
		} else {
			// 2-by-2 pivot: with A(col:m-1,col:col+1) = [D; C], the two 
			// columns of L below the pivot are C*inv(D), where D is the 
			// symmetric 2-by-2 diagonal block.  The inverse is applied 
			// explicitly through the determinant of D
			const double d00 = diag[0];				// Element D(col,col)
			const double d10 = diag[1];				// Element D(col+1,col)
			const double d11 = diag[1 + ldim];		// Element D(col+1,col+1)
			const double det = d00 * d11 - d10 * d10;
			double *anext = acol + ldim;			// Points to A(0,col+1)
			for ( int row = col+2; row < m; row++ ) {
				const double c0 = acol[row];
				const double c1 = anext[row];
				acol[row] = ( c0 * d11 - c1 * d10 ) / det;
				anext[row] = ( c1 * d00 - c0 * d10 ) / det;
			}
			col += 2;
		}
	}
	*n = col;
}

/*
 * BLAS-oriented rectangular SAXPY-style (jki indexing) symmetric indefinite 
 * factorization of a column block.  The leading *n columns of the m-row 
 * matrix A (leading dimension ldim) are factored such that P*A*P' = L*D*L', 
 * with L unit lower triangular, D block diagonal with block order 1 or 2, and
 * P a permutation.  Row/ column k is interchanged with row/ column piv[k], and
 * ord[k] gives the order of the diagonal block that starts at k.  L and D 
 * overwrite A on and below the diagonal.
 *   pivot  'B' selects eval_pivot_blas_bbk(); every other value selects 
 *          eval_pivot_blas_bk()
 *   n      on entry the number of columns to factor; on exit the number 
 *          actually factored, which is one more than requested when a 2-by-2 
 *          pivot starts at the last requested column
 *   W      working array with leading dimension ldim.  Column j of W receives
 *          the reduced pivot column for column j of A from the pivot 
 *          selection routine, and W is also passed to that routine as the 
 *          array holding the product L*D
 * A pivot block is used as a divisor without any check for singularity.
 */
void ldlt_factor_blas( char pivot, int m, int *n, int *piv, int *ord, int ldim, 
	double *A, double *W )
{
#if defined(LDLTFACT) && defined(PROFILE)
	struct	timespec sta_fact_piv, end_fact_piv;
#endif

	int 	col = 0;
	
	while ( col < *n ) {
		double *lrow = A + col;				// A stores unit lower triangular L
		double *mrow = W + col;				// W stores L*D
		double *wcol = W + col*ldim;		// Points to element W(0,col)
		double *wdiag = wcol + col;			// Points to element W(col,col)
		double *acol = A + col*ldim;		// Points to element A(0,col)
		double *adiag = acol + col;			// Points to element A(col,col)

		// Select pivot with the strategy requested by the caller
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &sta_fact_piv );
#endif
		if ( pivot == 'B' ) {
			eval_pivot_blas_bbk( m-col, col, ldim, lrow, mrow, adiag, wdiag, 
				piv, ord );
		} else {
			eval_pivot_blas_bk( m-col, col, ldim, lrow, mrow, adiag, wdiag, 
				piv, ord );
		}
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_fact_piv );
	tm_fact_piv += timespec_diff( sta_fact_piv, end_fact_piv );
#endif

		const int two_by_two = ( ord[col] != 1 );

		// Symmetric pivoting with the reduced row(s)/ column(s) stored in W.
		// The first interchange is common to both pivot orders
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &sta_fact_piv );
#endif
		pivot_sym_blas( m, col, piv[col], ldim, W, A );
		if ( two_by_two ) {
			// Mirror the first interchange in column col+1 of W, then apply
			// the second interchange
			wcol[ldim + piv[col]] = wcol[ldim + col];
			pivot_sym_blas( m, col+1, piv[col+1], ldim, W, A );
		}
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_fact_piv );
	tm_fact_piv += timespec_diff( sta_fact_piv, end_fact_piv );
#endif

		if ( !two_by_two ) {
			// 1-by-1 pivot: D(col,col) and the scaled column of L are taken
			// from the reduced column in W and written to A
			const double d = *wdiag;
			*adiag = d;
			for ( int row = col+1; row < m; row++ ) {
				acol[row] = wcol[row] / d;
			}
			col += 1;
		} else {
			// 2-by-2 pivot: solve L(col+2:m-1,col:col+1) * D = W(col+2:m-1,
			// col:col+1), where D is the 2-by-2 diagonal block held in W.
			// The inverse of D is applied explicitly through its determinant,
			// and D and L are written to A
			const double d00 = wdiag[0];			// Element D(col,col)
			const double d10 = wdiag[1];			// Element D(col+1,col)
			const double d11 = wdiag[1 + ldim];		// Element D(col+1,col+1)
			const double det = d00 * d11 - d10 * d10;
			double *wnext = wcol + ldim;			// Points to W(0,col+1)
			double *anext = acol + ldim;			// Points to A(0,col+1)

			adiag[0] = d00;
			adiag[1] = d10;
			adiag[1 + ldim] = d11;
			for ( int row = col+2; row < m; row++ ) {
				acol[row] = ( wcol[row] * d11 - wnext[row] * d10 ) / det;
				anext[row] = ( wnext[row] * d00 - wcol[row] * d10 ) / det;
			}
			col += 2;
		}
	}
	*n = col;
}

/*
 * Implements simple blocking with partial or rook pivoting to factorize 
 * symmetric indefinite n-by-n matrix A (with leading dimension ldim) into a 
 * unit lower triangular matrix L and block diagonal matrix D with block order 
 * 1 or 2.  Symmetrically permuted matrix A^ = P*A*P' = L*D*L', where P is the 
 * permutation matrix, and L' and P' are the transpose of L and P, respectively.
 * Permutation matrix P is encoded in vectors piv[] and ord[], such that row/ 
 * column k is interchanged with row/ column piv[k], and ord[k] specifies the
 * diagonal block order.  Suppose that A is decomposed into blocks 
 * [A_00, A_01; A_10, A_11], where A_00 is an r-by-r block matrix.  First, a 
 * rectangular version of the SAXPY operation for symmetric indefinite 
 * factorization computes 
 * P * [A_00, A_01; A_10, A_11] * P' = [A_00^, A_01^; A_10^, A_11^]
 * = [L_00, 0; L_10, L_11] * [D_00, 0; 0, D_11] * [L_00', L_10'; 0, L_11']
 * = [L_00*D_00*L_00', (L_10*D_00*L_00')'; 
 *    L_10*D_00*L_00', L_10*D_00*L_10' + L_11*D_11*L_11'].
 * This computation yields the LDL' factorization for the first n-by-r column
 * block of A.  Then the trailing submatrix is updated to give 
 * A_11^^ = L_11*D_11*L_11' = A_11^ - L_10*D_00*L_10'. 
 * This procedure is repeated iteratively on the trailing sub-matrix until the 
 * last diagonal block (dimension <= r) is reached.  Simple blocking is also
 * used to optimize memory access when updating the trailing sub-matrix.
 *
 *   blas   non-zero selects ldlt_factor_blas() and the BLAS update kernels;
 *          zero selects ldlt_factor() and the SAXPY kernels
 *   pivot  pivoting strategy, forwarded to the column block factorization
 *
 * The block width is obtained from get_block_dim_ldlt().  A temporary 
 * workspace is allocated and released internally; if the allocation fails a
 * message is written to stderr and the process terminates with EXIT_FAILURE.
 */
void ldlt_block_rook_pivot( int blas, char pivot, int n, int *piv, int *ord, 
	int ldim, double *A )
{
	const int lapack = 0;	
	const int bdim = get_block_dim_ldlt( lapack, blas, ldim );

	int		d, j, r, t;
	double	*Ajj, *L, *D, *W;
	size_t	w_cols, w_len;
	void 	(*ldlt)( char pivot, int m, int *n, int *piv, int *ord, int ldim, 
				double *A, double *W );

#if defined(LDLTFACT) && defined(PROFILE)
	struct	timespec sta_ldlt, sta_factor, sta_pivot, sta_reduce, 
			end_ldlt, end_factor, end_pivot, end_reduce;
	xtra_work = 0;
	tm_ldlt = 0.0;
	tm_factor = 0.0;
	tm_pivot = 0.0;
	tm_reduce = 0.0;
	tm_fact_piv = 0.0;
	tm_red_vec = 0.0;

	get_time(&sta_ldlt);
#endif

	// Select the column block factorization routine and size its workspace.
	// The BLAS variant keeps one column of W per factored column; bdim+1
	// columns are reserved because a column block can grow by one column when
	// its final pivot is 2-by-2.  The non-BLAS variant needs two columns
	if ( blas ) {
		ldlt = ldlt_factor_blas;
		w_cols = (size_t) bdim + 1;
	} else {
		ldlt = ldlt_factor;
		w_cols = 2;
	}
	// Size is computed in size_t so that the product cannot overflow int
	w_len = (size_t) ldim * w_cols;
	W = malloc( w_len * sizeof *W );
	if ( W == NULL && w_len > 0 ) {
		fprintf( stderr, 
			"ldlt_block_rook_pivot: unable to allocate workspace\n" );
		exit( EXIT_FAILURE );
	}
	
	j = 0;
	r = (bdim > n) ? n : bdim;
	// Perform rectangular factorization on first column block A(0:n-1,0:r-1);
	// on return r holds the number of columns actually factored
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &sta_factor );
#endif
	ldlt( pivot, n, &r, &piv[j], &ord[j], ldim, A, W );
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_factor );
	tm_factor += timespec_diff( sta_factor, end_factor );
#endif

	// d is the first column of the previously factored block, j the first 
	// column of the current block, r the width of the most recently factored
	// block, and t the order of the trailing sub-matrix
	d = 0;
	j = r;
	t = n - r;
	Ajj = A;
	for ( ; j < n; j += r, t -= r ) {
		// Adjust pivot vector of previous block for diagonal offset: the 
		// block factorization returns indices relative to its own first row
		for (int i = d; i < j; i++) {
			piv[i] += d;
		}
		L = Ajj + r;
		D = Ajj;
		Ajj = A + j + j*ldim;
		// Reduce trailing sub-matrix, P * A(j:n-1,j:n-1) * P'
		// -= L(j:n-1,j-r:j-1) * D(j-r:j-1,j-r:j-1) * L'(j-r:j-1,j:n-1)
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &sta_reduce );
#endif
		reduce_ldlt_mat_blk( blas, t, r, &ord[d], r, ldim, L, D, W+j, Ajj );
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_reduce );
	tm_reduce += timespec_diff( sta_reduce, end_reduce );
#endif
		d += r;
		r = t < bdim ? t : bdim;
		// Perform rectangular factorization on column block A(j:n-1,j:j+r-1)
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &sta_factor );
#endif
		ldlt( pivot, t, &r, &piv[j], &ord[j], ldim, Ajj, W+j );
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_factor );
	tm_factor += timespec_diff( sta_factor, end_factor );
#endif
		// Apply permutation matrix for current block, encoded in piv(j:j+r-1),
		// to columns to the left of current block A(:,0:j-1).  Entries of 
		// piv[] for this block are still relative to row j at this point
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &sta_pivot );
#endif
		for ( int i = j; i < j+r; i++ ) {
			if ( i != piv[i] + j ) {
				if ( blas ) {
					double *Ai_ = A + i;
					double *Ar_ = A + piv[i] + j;
					dswap_( &j, Ai_, &ldim, Ar_, &ldim );
				} else {
					for ( int k = 0; k < j; k++ ) {
						double aik = *(A + i + k*ldim);
						*(A + i + k*ldim) = *(A + piv[i] + j + k*ldim);
						*(A + piv[i] + j + k*ldim) = aik;
					}
				}
			}
		}
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_pivot );
	tm_pivot += timespec_diff( sta_pivot, end_pivot );
#endif
	}
	// Adjust pivot vector of last block for diagonal offset
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &sta_pivot );
#endif
	for ( int i = d; i < n; i++ ) {
		piv[i] += d;
	}
#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_pivot );
	tm_pivot += timespec_diff( sta_pivot, end_pivot );
#endif
	free( W );

#if defined(LDLTFACT) && defined(PROFILE)
	get_time( &end_ldlt );
	tm_ldlt += timespec_diff( sta_ldlt, end_ldlt );

	int num_piv = count_pivot( 0, n, piv, ord );
	double frac_fact = num_piv / (double) (num_piv + xtra_work);
	tm_pivot = tm_pivot + tm_fact_piv - frac_fact * tm_red_vec;
	tm_factor = tm_factor - tm_fact_piv + frac_fact * tm_red_vec;
	fprintf( stdout, "%.3f\t%.3f\t\t%.3f\t\t%.3f\t\t%.1f\t\t%.1f\t\t%.1f\n",
		tm_ldlt, tm_factor, tm_pivot, tm_reduce, 
		tm_factor/tm_ldlt*100, tm_pivot/tm_ldlt*100, tm_reduce/tm_ldlt*100 );
#endif
}

/*
 * Implements simple blocking with complete pivoting to factorize symmetric
 * indefinite n-by-n matrix A (with leading dimension ldim) into a unit lower 
 * triangular matrix L and block diagonal matrix D with block order 1 or 2.
 * Symmetrically permuted matrix A^ = P*A*P' = L*D*L', where P is the 
 * permutation matrix, and L' and P' are the transpose of L and P, respectively.  
 * Permutation matrix P is encoded in vectors piv[] and ord[], such that row/ 
 * column k is interchanged with row/ column piv[k], and ord[k] specifies the 
 * diagonal block order.  Each pass through the k-loop selects a 1-by-1 or 
 * 2-by-2 pivot, computes column(s) of the unit lower triangular matrix below 
 * the pivot, and performs an outer product operation to reduce the trailing 
 * sub-matrix.  In order to implement complete pivoting the entire trailing 
 * sub-matrix must be updated prior to each pivot selection, i.e., prior to each
 * pass through the k-loop.  Hence, the blocking procedure is applied to the 
 * trailing sub-matrix reduction alone.
 *
 *   blas   non-zero selects the BLAS kernel for the trailing update
 *   pivot  pivoting strategy; every value currently maps to eval_pivot_bp()
 *
 * Nothing is done when n <= 0, and for n == 1 the single 1-by-1 pivot is 
 * recorded without touching A.  A two-column workspace is allocated and 
 * released internally; if the allocation fails a message is written to stderr
 * and the process terminates with EXIT_FAILURE.  A pivot block is used as a 
 * divisor without any check for singularity.
 */
void ldlt_block_comp_pivot( int blas, char pivot, int n, int *piv, int *ord, 
	int ldim, double *A )
{	
	const int	lapack = 0;
	const int 	bdim = get_block_dim_ldlt( lapack, blas, ldim );

	double	*A_k, *Akk, *L, *D, *M, *W;

	// Empty matrix: nothing to factor, and ord[n-2] below would be out of 
	// bounds
	if ( n <= 0 ) {
		return;
	}

	// Workspace holds up to two columns of L*D; size is computed in size_t
	W = malloc( (size_t) ldim * 2 * sizeof *W );
	if ( W == NULL ) {
		fprintf( stderr, 
			"ldlt_block_comp_pivot: unable to allocate workspace\n" );
		exit( EXIT_FAILURE );
	}
	A_k = A;			// Points to element A(0,k)
	Akk = A;			// Points to element A(k,k)
	for ( int k = 0; k < n-1; ) {

		// Evaluate pivot using the method specified in argument list; only
		// one strategy is available for complete pivoting
		switch ( pivot ) {
		case 'P':
		default:
			eval_pivot_bp( n-k, k, ldim, Akk, piv, ord );
			break;
		}

		// Perform symmetric pivoting, compute column(s) of unit lower 
		// triangular matrix L, and update trailing sub-matrix -- because of
		// symmetry need only update elements on and below the diagonal.   
		// Details of these computations differ depending on whether the 
		// diagonal block (pivot) is 1-by-1 or 2-by-2
		if ( ord[k] == 1 ) {		// 1-x-1 pivot
			if ( k != piv[k] ) {
				pivot_sym( n, k, piv[k], ldim, A );
			}
			// Store A(k+1:n-1,k), a column vector of LD, in working array W
			memcpy( W+k+1, Akk+1, (n-k-1)*sizeof(double) );
			double akk = *Akk;
			for ( int i = 1; i < n-k; i++ ) {
				*(Akk + i)	/= akk;
			}
		} else { 					// 2-x-2 pivot, ord[k] == 2
			if ( k != piv[k] ) {
				pivot_sym( n, k, piv[k], ldim, A );
			}
			if ( k+1 != piv[k+1] ) {
				pivot_sym( n, k+1, piv[k+1], ldim, A );
			}
			// Store A(k+2:n-1,k:k+1), a column block of LD, in working array W
			memcpy( W+k+2, Akk+2, (n-k-2)*sizeof(double) );
			memcpy( W+k+2+ldim, Akk+2+ldim, (n-k-2)*sizeof(double) );
			// Solve for L(k+2:n-1,k:k+1) * D(k:k+1,k:k+1) = A(k+2:n-1,k:k+1)
			// Once computed, L and D overwrite A
			double d00 = *Akk;					// Element D(k,k)
			double d10 = *(Akk + 1);			// Element D(k+1,k) = D(k,k+1)
			double d11 = *(Akk + 1 + ldim);		// Element D(k+1,k+1)
			double denom = d00 * d11 - d10 * d10;
			for ( int i = k+2; i < n; i++ ) {
				*(A_k+i) = ( *(W+i) * d11 - *(W+i+ldim) * d10 ) / denom;
				*(A_k+i+ldim) = ( *(W+i+ldim) * d00 - *(W+i) * d10 ) / denom;
			}
		}
		// L, D and M refer to the pivot column(s) just computed; A_k and Akk
		// are advanced past the pivot to the trailing sub-matrix
		L = Akk + ord[k];
		D = Akk;
		M = W + k + ord[k];
		A_k = A_k + ord[k]*ldim;
		Akk = A_k + k + ord[k];
		// Reduce trailing sub-matrix, P * A(k+ord[k]:n-1,k+ord[k]:n-1) * P' -=
		// L(k+ord[k]:n-1,k:k+ord[k]-1) * D(k:k+ord[k]-1,k:k+ord[k]-1) 
		// * L'(k:k+ord[k]-1,k+ord[k]:n-1)
		reduce_ldlt_mat_blk( blas, n-k-ord[k], ord[k], &ord[k], bdim, 
			ldim, L, D, M, Akk );
		k += ord[k];
	}
	// Last pivot is 1-by-1 unless a 2-by-2 pivot starts at row n-2.  For 
	// n == 1 the loop never runs and ord[n-2] does not exist, so that case is
	// decided without reading ord[]
	if ( n == 1 || ord[n-2] != 2 ) {
		piv[n-1] = n-1;
		ord[n-1] = 1;
	}
	free( W );
}

/******************************************************************************/

/*
 * Implements the outer product method (kji indexing) to factorize symmetric
 * indefinite n-by-n matrix A into a unit lower triangular matrix L and block 
 * diagonal matrix D with block order 1 or 2.  Symmetrically permuted matrix
 * A^ = P*A*P' = L*D*L', where P is the permutation matrix, and L' and P' are
 * the transpose of L and P, respectively.  The permutation matrix P is encoded
 * in vectors piv[] and ord[], such that row/ column k is interchanged with
 * row/ column piv[k], and ord[k] specifies the diagonal block order.  Entries
 * of L and D are stored on and below the diagonal of matrix A, i.e., L and D
 * overwrite A.  Each pass through the k-loop performs an outer product
 * operation.  Matrix A is stored in column-major order with leading dimension
 * n.
 *
 *   pivot  'K' selects eval_pivot_bk(), 'P' selects eval_pivot_bp(), and 'B'
 *          or any other value selects eval_pivot_bbk()
 *
 * Nothing is done when n <= 0, and for n == 1 the single 1-by-1 pivot is 
 * recorded without touching A.  A two-column workspace is allocated and 
 * released internally; if the allocation fails a message is written to stderr
 * and the process terminates with EXIT_FAILURE.  A pivot block is used as a 
 * divisor without any check for singularity.
 */
void ldlt_outer_product( char pivot, int n, int *piv, int *ord, double *A )
{
	const int ldim = n;

	double *W;

	// Empty matrix: nothing to factor, and ord[n-2] below would be out of 
	// bounds
	if ( n <= 0 ) {
		return;
	}

	// Workspace holds the two columns of L*D of a 2-by-2 pivot; size is 
	// computed in size_t
	W = malloc( (size_t) ldim * 2 * sizeof *W );
	if ( W == NULL ) {
		fprintf( stderr, "ldlt_outer_product: unable to allocate workspace\n" );
		exit( EXIT_FAILURE );
	}

	for (int k = 0; k < n-1; ) {

		double *A_k = A + k*ldim;		// Points to element A(0,k)
		double *Akk = A_k + k;			// Points to element A(k,k)
		// Evaluate pivot using the method specified in argument list
		switch ( pivot ) {
		case 'K':
			eval_pivot_bk( n-k, k, ldim, Akk, piv, ord );
			break;
		case 'P':
			eval_pivot_bp( n-k, k, ldim, Akk, piv, ord );
			break;
		case 'B':
		default:
			eval_pivot_bbk( n-k, k, ldim, Akk, piv, ord );
			break;
		}

		// Perform symmetric pivoting, compute column(s) of unit lower 
		// triangular matrix L, and update trailing sub-matrix -- because of
		// symmetry need only update elements on and below the diagonal.   
		// Details of these computations differ depending on whether the 
		// diagonal block (pivot) is 1-by-1 or 2-by-2
		if ( ord[k] == 1 ) {		// 1-x-1 pivot
			if ( k != piv[k] ) {
				pivot_sym( n, k, piv[k], ldim, A );
			}
			double akk = *Akk;
			for ( int i = k+1; i < n; i++ ) {
				*(A_k + i)	/= akk;
			}
			// Reduce trailing sub-matrix, A(j:n-1,j) -= L(j:n-1,k) * L(j,k)
			// * D(k,k) for each column j > k
			for ( int j = k+1; j < n; j++ ) {
				double *A_j = A + j*ldim;
				double ajk = *(A_k + j);
				for ( int i = j; i < n; i++ ) {  
					*(A_j+i) -= *(A_k+i) * ajk * akk;
				}
			}
			k++;
		
		} else { 					// 2-x-2 pivot, ord[k] == 2
			if ( k != piv[k] ) {
				pivot_sym( n, k, piv[k], ldim, A );
			}
			if ( k+1 != piv[k+1] ) {
				pivot_sym( n, k+1, piv[k+1], ldim, A );
			}
			// Let A(k:n-1,k:n-1) = [D, C'; C, A^] = 
			// [I, 0; C*inv(D), I] * [D, 0; 0, A^ - C*inv(D)*C'] 
			// * [I, (C*inv(D))'; 0, I]
			// where D is 2-by-2 symmetric diagonal block and inv(D) its inverse.
			// First solve for (n-k-2)-by-2 unit lower triangular block, then
			// reduce trailing sub-matrix by computing A^ - C*inv(D)*C'. Once
			// computed, L and D overwrite A.  Store A(k+2:n-1,k:k+1) in W
			memcpy( W+k+2, Akk+2, (n-k-2)*sizeof(double) );
			memcpy( W+k+2+ldim, Akk+2+ldim, (n-k-2)*sizeof(double) );
			// Solve for L(k+2:n-1,k:k+1) * D(k:k+1,k:k+1) = A(k+2:n-1,k:k+1)
			// Once computed, L and D overwrite A
			double d00 = *Akk;					// Element D(k,k)
			double d10 = *(Akk + 1);			// Element D(k+1,k) = D(k,k+1)
			double d11 = *(Akk + 1 + ldim);		// Element D(k+1,k+1)
			double denom = d00 * d11 - d10 * d10;
			for ( int i = k+2; i < n; i++ ) {
				*(A_k+i) = ( *(W+i) * d11 - *(W+i+ldim) * d10 ) / denom;
				*(A_k+i+ldim) = ( *(W+i+ldim) * d00 - *(W+i) * d10 ) / denom;
			}
			// Reduce trailing sub-matrix, A(k+2:n-1,k+2:n-1) =  
			// A(k+2:n-1,k+2:n-1) - L(k+2:n-1,k:k+1) * W(k+2:n-1,k:k+1)'
			double *L;
			double wjk;
			for ( int j = k+2; j < n; j++ ) {
				double *A_j = A + j*ldim;
				L = A + k*ldim;					// Column k of L
				wjk = *(W+j);	
				for (int i = j; i < n; i++) {
					*(A_j + i) -= *(L + i) * wjk;
				}
				L = A + k*ldim + ldim;			// Column k+1 of L
				wjk = *(W+j+ldim);	
				for (int i = j; i < n; i++) {
					*(A_j + i) -= *(L + i) * wjk;
				}
			}
			k += 2;
		}
	}
	// Last pivot is 1-by-1 unless a 2-by-2 pivot starts at row n-2.  For 
	// n == 1 the loop never runs and ord[n-2] does not exist, so that case is
	// decided without reading ord[]
	if ( n == 1 || ord[n-2] != 2 ) {
		piv[n-1] = n-1;
		ord[n-1] = 1;
	}
	free( W );
}

/*
 * Implements the SAXPY operation (jki indexing) to factorize symmetric 
 * indefinite n-by-n matrix A into a unit lower triangular matrix L and block 
 * diagonal matrix D with block order 1 or 2.  Symmetrically permuted matrix 
 * A^ = P*A*P' = L*D*L', where P is the permutation matrix, and L' and P' are 
 * the transpose of L and P, respectively.  Permutation matrix P is encoded in 
 * vectors piv[] and ord[], such that row/ column k is interchanged with row/ 
 * column piv[k], and ord[k] specifies the diagonal block order.  Entries of L 
 * and D are stored on and below the diagonal of matrix A, i.e., L and D 
 * overwrite A.  Matrix A is stored in column-major order with leading 
 * dimension n.
 *
 * The work is delegated to ldlt_factor() applied to all n columns.  Nothing is
 * done when n <= 0.  A two-column workspace is allocated and released 
 * internally; if the allocation fails a message is written to stderr and the 
 * process terminates with EXIT_FAILURE.
 */
void ldlt_saxpy( char pivot, int n, int *piv, int *ord, double *A )
{
	const int ldim = n;

	int		ncol = n;		// In: columns to factor; out: columns factored
	double	*W;

	// ldlt_factor() does no work for an empty matrix; returning here also 
	// avoids a zero-size allocation, for which malloc() may return NULL
	if ( n <= 0 ) {
		return;
	}

	W = malloc( (size_t) ldim * 2 * sizeof *W );
	if ( W == NULL ) {
		fprintf( stderr, "ldlt_saxpy: unable to allocate workspace\n" );
		exit( EXIT_FAILURE );
	}

	ldlt_factor( pivot, n, &ncol, piv, ord, ldim, A, W );

	free( W );
}

/*
 * Blocked factorization of symmetric indefinite n-by-n matrix A (column-major,
 * leading dimension n) into a unit lower triangular matrix L and a block 
 * diagonal matrix D with block order 1 or 2, such that the symmetrically 
 * permuted matrix P*A*P' = L*D*L'.  Permutation matrix P is encoded in piv[] 
 * and ord[]: row/ column k is interchanged with row/ column piv[k], and ord[k]
 * specifies the diagonal block order.  This entry point uses the non-BLAS 
 * kernels.
 *
 * The blocked algorithm is chosen from the pivoting strategy.  Complete 
 * pivoting ('P') requires the entire trailing sub-matrix to be updated before
 * each pivot selection, so it is routed to the outer product based 
 * ldlt_block_comp_pivot().  Every other strategy, including 'B' and 'K', is 
 * routed to the SAXPY based ldlt_block_rook_pivot(), which applies cumulative
 * trailing sub-matrix updates to candidate pivot row(s)/ column(s) as the 
 * factorization proceeds.
 */
void ldlt_block( char pivot, int n, int *piv, int *ord, double *A )
{
	const int	ldim = n;
	const int	blas = 0;

	if ( pivot == 'P' ) {
		// Complete pivoting
		ldlt_block_comp_pivot( blas, pivot, n, piv, ord, ldim, A );
	} else {
		// Partial or rook pivoting, also the fallback for unknown strategies
		ldlt_block_rook_pivot( blas, pivot, n, piv, ord, ldim, A );
	}
}

/*
 * Implements simple blocking to factorize symmetric indefinite n-by-n matrix A
 * into a unit lower triangular matrix L and block diagonal matrix D with block
 * order 1 or 2.  Symmetrically permuted matrix A^ = P*A*P' = L*D*L', where P
 * is the permutation matrix, and L' and P' are the transpose of L and P, 
 * respectively.  Permutation matrix P is encoded in vectors piv[] and ord[],
 * such that row/ column k is interchanged with row/ column piv[k], and ord[k] 
 * specifies the diagonal block order.  To the extent possible, this 
 * implementation of a blocked algorithm for symmetric indefinite factorization
 * uses the BLAS library to perform matrix operations.  In the case where a 
 * partial or rook pivoting strategy is specified the blocked algorithm employs
 * the SAXPY operation to perform symmetric indefinite factorization -- 
 * cumulative trailing sub-matrix updates are applied to candidate pivot row(s)/ 
 * column(s) as the factorization proceeds.  When a complete pivoting strategy 
 * is specified the entire trailing sub-matrix must be updated prior to each 
 * pivot selection, so the blocked algorithm employs the outer product method.
 */
void ldlt_block_blas( char pivot, int n, int *piv, int *ord, double *A )
{
	const int	blas = 1;
	const int	ldim = n;

	// Choose blocked algorithm based on pivot strategy
	switch ( pivot ) {
	case 'B':
		ldlt_block_rook_pivot( blas, pivot, n, piv, ord, ldim, A );
		break;
	case 'K':
		ldlt_block_rook_pivot( blas, pivot, n, piv, ord, ldim, A );
		break;
	case 'P':
		ldlt_block_comp_pivot( blas, pivot, n, piv, ord, ldim, A );
		break;
	default:
		ldlt_block_rook_pivot( blas, pivot, n, piv, ord, ldim, A );
		break;
	}
}

/*
 * Wrapper for calling LAPACK routine DSYTF2, the unblocked LAPACK routine that
 * computes the factorization of a real symmetric indefinite matrix.  The lower 
 * triangle of the n-by-n matrix A (column-major, leading dimension n) is 
 * passed and overwritten with the factorization.
 *   pivot  not referenced; DSYTF2 applies its own pivoting strategy
 *   piv    passed to DSYTF2 as its IPIV argument and filled in LAPACK's own
 *          encoding -- NOTE(review): that encoding is not converted to the 
 *          piv[]/ ord[] convention used by the other routines in this file
 *   ord    not referenced and not written
 * A non-zero INFO status returned by DSYTF2 is reported on stderr.
 */
void ldlt_lapack_unblocked( char pivot, int n, int *piv, int *ord, double *A )
{
	const char	lower = 'L';
	const int	ldim = n;
	int			info = 0;

	// Parameters kept for a uniform factorization signature
	(void) pivot;
	(void) ord;

	dsytf2_( &lower, &n, A, &ldim, piv, &info );
	// Do not silently discard the LAPACK status
	if ( info != 0 ) {
		fprintf( stderr, "ldlt_lapack_unblocked: DSYTF2 returned info = %d\n", 
			info );
	}
}

/*
 * Wrapper for calling LAPACK routine DSYTRF which computes the factorization 
 * of a real symmetric indefinite matrix.  The lower triangle of the n-by-n 
 * matrix A (column-major, leading dimension n) is passed and overwritten with
 * the factorization.
 *   pivot  not referenced; DSYTRF applies its own pivoting strategy
 *   piv    passed to DSYTRF as its IPIV argument and filled in LAPACK's own
 *          encoding -- NOTE(review): that encoding is not converted to the 
 *          piv[]/ ord[] convention used by the other routines in this file
 *   ord    not referenced and not written
 * The workspace length is n times the block dimension returned by 
 * get_block_dim_ldlt(), but never less than 1 because DSYTRF requires 
 * LWORK >= 1.  If the workspace cannot be allocated a message is written to 
 * stderr and the process terminates with EXIT_FAILURE.  A non-zero INFO 
 * status returned by DSYTRF is reported on stderr.
 */
void ldlt_lapack( char pivot, int n, int *piv, int *ord, double *A )
{
	const char	lower = 'L';
	const int	lapack = 1;
	const int	blas = 0;
	const int	ldim = n;
	const int	bdim = get_block_dim_ldlt( lapack, blas, ldim );

	int			lwork = ldim*bdim;
	int			info = 0;
	double 		*work;

	// Parameters kept for a uniform factorization signature
	(void) pivot;
	(void) ord;

	// DSYTRF rejects LWORK < 1, which would otherwise occur for n == 0
	if ( lwork < 1 ) {
		lwork = 1;
	}
	
	work = malloc( (size_t) lwork * sizeof *work );
	if ( work == NULL ) {
		fprintf( stderr, "ldlt_lapack: unable to allocate workspace\n" );
		exit( EXIT_FAILURE );
	}
	dsytrf_( &lower, &n, A, &ldim, piv, work, &lwork, &info );
	// Do not silently discard the LAPACK status
	if ( info != 0 ) {
		fprintf( stderr, "ldlt_lapack: DSYTRF returned info = %d\n", info );
	}
	free( work );
}

