/*
 * Gill-Murray-Wright and Cheng-Higham modified Cholesky algorithms.  Unblocked 
 * versions of these algorithms include the outer product method and SAXPY 
 * operation, while blocked versions include simple blocking and an 
 * implementation that uses tuned BLAS (Basic Linear Algebra Subroutines).
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <float.h>

#include "modchol.h"
#include "ldltfact.h"
#include "lapack.h"
#include "matcom.h"
#include "timing.h"

/*
 * Forward declarations of the file-local helpers.  Each is declared static 
 * here, so the definitions further down have internal linkage even though
 * they do not repeat the keyword.
 */

// Gill-Murray-Wright bound on the off-diagonal elements of L
static double calc_beta_sqr_gmw( int n, int ldim, const double *A );
// Cheng-Higham modification tolerance
static double calc_delta_ch( int n, int ldim, const double *A );
// Symmetric interchange of rows/columns k and r of A (and rows of W) via DSWAP
static void pivot_sym_blas( int n, int k, int r, int ldim, double *W,
	double *A );
// Type-I modification of a single pivot (Gill-Murray-Wright)
static double modify_pivot_gmw( double delta, double beta_sqr, int n, 
	double *vec );
// Eigen-decomposition of a 2-by-2 symmetric block
static void ldlt_spectral_decomp( int ldim, const double *A, double *U, 
	double *lambda );
// Type-I / Type-II modification of the block diagonal factor D
static void modify_blk_diag( int type, double delta, int n, const int *ord, 
	int ldim, double *A );
// Rectangular (column block) factorization, native loops
static void chol_gmw_factor( char pivot, double delta, double beta_sqr, 
	int m, int n, int *piv, int *ord, double *diag, int ldim, double *A,
	double *W );
// Rectangular (column block) factorization, BLAS-assisted
static void chol_gmw_factor_blas( char pivot, double delta, double beta_sqr, 
	int m, int n, int *piv, int *ord, double *diag, int ldim, double *A, 
	double *W );
// Driver of the blocked Gill-Murray-Wright factorization
static void chol_gmw_block_handler( int blas, char pivot, double delta, 
	double beta_sqr, int n, int *piv, int *ord, int ldim, double *A );

// Profiling accumulators, compiled in only when both MODCHOL and PROFILE are
// defined.  tm_mod_chol receives the elapsed time of a whole blocked 
// factorization; tm_mod_fact receives the time spent computing the 
// modification (tolerances, pivot and block-diagonal modification).  The 
// blocked entry points reset both to zero before they start timing.
#if defined(MODCHOL) && defined(PROFILE)
	static double tm_mod_chol = 0.0;
	static double tm_mod_fact = 0.0;
#endif

/*
 * Computes and returns a parameter, beta_sqr, that bounds the magnitude of the
 * off-diagonal elements in the unit lower triangular matrix produced by 
 * symmetric indefinite factorization, P*A*P' = L*D*L'.  The function value is
 * used in the modified Cholesky algorithm proposed by Gill, Murray and Wright 
 * to determine the perturbation applied to symmetric indefinite n-by-n matrix A
 * to make it positive definite.
 */
double calc_beta_sqr_gmw( int n, int ldim, const double *A )
{
	double mu = -1.0;
	double nu = -1.0;
	double beta_sqr;

	for ( int j = 0; j < n; j++ ) {
		const double *A_j = A + j*ldim; 
		// Maximum magnitude of diagonal elements
		double ajj = fabs( *(A_j+j) );
		if ( ajj > nu ) {
			nu = ajj;
		}
		// Maximum magnitude of off-diagonal elements -- exploit symmetry
		for ( int i = j+1; i < n; i++ ) {
			double aij = fabs( *(A_j+i) );
			if ( aij > mu ) {
				mu = aij;
			}
		}
	}
	
	beta_sqr = mu / sqrt( n*n - 1 );
	if ( nu > beta_sqr ) {
		beta_sqr = nu;
	}
	if ( DBL_EPSILON > beta_sqr ) {
		beta_sqr = DBL_EPSILON;
	}

	return beta_sqr;
}

/*
 * Returns the preset modification tolerance of the Cheng-Higham modified
 * Cholesky algorithm,
 *
 *   delta = sqrt(DBL_EPSILON / 2) * sigma,
 *
 * where sigma is the largest absolute column sum of the leading n-by-n part
 * of A (equal to the infinity norm when A is symmetric).  A is stored in 
 * column-major order with leading dimension ldim, and all n entries of every
 * column are read.  The running maximum starts at -1, so for n == 0 the 
 * returned value is negative.
 */
double calc_delta_ch( int n, int ldim, const double *A )
{
	double max_col_sum = -1.0;

	for ( int col = 0; col < n; col++ ) {
		const double *entries = A + col*ldim;
		double acc = 0.0;
		int row = 0;
		while ( row < n ) {
			acc += fabs( entries[row] );
			row++;
		}
		max_col_sum = ( acc > max_col_sum ) ? acc : max_col_sum;
	}

	return sqrt( DBL_EPSILON / 2.0 ) * max_col_sum;
}

/*
 * Symmetric pivoting, A^ = P*A*P', of an n-by-n symmetric matrix A: row and 
 * column k are interchanged with row and column r.  The same row interchange 
 * is applied to the first k columns of working array W, which stores trailing
 * sub-matrix updates applied to columns of A.  A and W are column-major with 
 * leading dimension ldim.  Only elements on and below the diagonal are 
 * touched, and every vector interchange is delegated to BLAS routine DSWAP.
 * Nothing happens when k == r.
 *
 * NOTE(review): the element counts r-k-1 and n-r-1 suggest the caller always
 * passes k < r -- confirm.
 */
void pivot_sym_blas( int n, int k, int r, int ldim, double *W, double *A )
{
	const int unit = 1;

	if ( k == r ) {
		return;
	}

	double *col_k = A + k*ldim;
	double *col_r = A + r*ldim;

	// Rows k and r, columns 0:k-1, of both A and W (stride ldim along a row)
	dswap_( &k, A + k, &ldim, A + r, &ldim );
	dswap_( &k, W + k, &ldim, W + r, &ldim );

	// A(k,k) <-> A(r,r)
	double held = col_k[k];
	col_k[k] = col_r[r];
	col_r[r] = held;

	// Column piece A(k+1:r-1,k) <-> row piece A(r,k+1:r-1)
	int cnt = r - k - 1;
	dswap_( &cnt, col_k + k + 1, &unit, A + r + k*ldim + ldim, &ldim );

	// A(r+1:n-1,k) <-> A(r+1:n-1,r), the parts of both columns below row r
	cnt = n - r - 1;
	dswap_( &cnt, col_k + r + 1, &unit, col_r + r + 1, &unit );
}

/*
 * Returns the modified value of the selected pivot, vec[0] = akk (the kth 
 * diagonal element of some matrix A), chosen so that matrix (A + dA) is 
 * sufficiently positive definite and reasonably well-conditioned.  The 
 * Gill-Murray-Wright algorithm performs a Type-I modification,
 *
 *   akk = max{ |akk|, delta, c_sqr/beta_sqr },
 *
 * where c_sqr is the square of the infinity norm of 
 * vec[1:n-1] = A(k+1:k+n-1,k).  vec itself is not written.  With n == 1 the 
 * c_sqr term stays at its negative start value and has no effect.  When 
 * MODCHOL and PROFILE are both defined, the elapsed time of the call is added
 * to tm_mod_fact.
 */
double modify_pivot_gmw( double delta, double beta_sqr, int n, double *vec )
{
#if defined(MODCHOL) && defined(PROFILE)
	struct	timespec t_begin, t_end;

	get_time( &t_begin );
#endif
	// Largest squared entry below the pivot
	double max_sq = -1.0;
	int idx = 1;
	while ( idx < n ) {
		const double sq = vec[idx] * vec[idx];
		max_sq = ( sq > max_sq ) ? sq : max_sq;
		idx++;
	}

	double result = fabs( vec[0] );
	const double bound = max_sq / beta_sqr;
	if ( bound > result ) {
		result = bound;
	}
	if ( delta > result ) {
		result = delta;
	}
#if defined(MODCHOL) && defined(PROFILE)
	get_time( &t_end );
	tm_mod_fact += timespec_diff( t_begin, t_end );
#endif
	return result;
}

/*
 * Computes the spectral decomposition of 2-by-2 symmetric matrix A,
 * A = U * [lambda[0], 0; 0, lambda[1]] * U', where lambda[0] and lambda[1]
 * are eigenvalues of A, and eigenvectors belonging to lambda[0] and lambda[1]
 * form the columns of orthogonal matrix U.
 */
void ldlt_spectral_decomp( int ldim, const double *A, double *U, double *lambda )
{
	// Trace of symmetric 2-by-2 matrix A
	double trA = *A + *(A + 1 + ldim);	
	// Determinant of symmetric 2-by-2 matrix A
	double detA = *A * *(A + 1 + ldim) - *(A + 1) * *(A + 1);
	// The characteristic polynomial of 2-by-2 matrix A is:
	// p(t) = t*t - trA*t + detA
	// Calculate roots of the characteristic polynomial, i.e., eigenvalues.
	// Note that if A is a real symmetric 2-by-2 matrix, with non-zero 
	// off-diagonal entries, then A has two distinct real eigenvalues
	lambda[0] = ( trA + sqrt(trA*trA - 4.0*detA) ) / 2.0;
	lambda[1] = ( trA - sqrt(trA*trA - 4.0*detA) ) / 2.0;
	
	if ( lambda[0] == lambda[1] ) {
		// A is 2-by-2 diagonal matrix with one distinct diagonal element.
		// Single distinct eigenvalue is equal to diagonal element,
		// and U is the identity matrix
		U[0] = 1;
		U[1] = 0;
		U[2] = 0;
		U[3] = 1;
	} else {		
		// Compute eigenvectors belonging to distinct real eigenvalues.
		// y = a*x --> u = alpha * (1; a), alpha any real number
		double a = -(*A - lambda[0]) / *(A + 1);
		double norm_u = sqrt( 1.0 + a*a );
		U[0] = 1 / norm_u;
		U[1] = a / norm_u;
		// y = b*x --> v = beta * (1; b), beta any real number
		double b = -(*A - lambda[1]) / *(A + 1);
		double norm_v = sqrt( 1.0 + b*b );
		U[2] = 1 / norm_v;
		U[3] = b / norm_v;
	}
}

/*
 * Modifies the diagonal block matrix computed by symmetric indefinite 
 * factorization, P*A*P' = L*D*L', where A is an n-by-n symmetric matrix, P is a
 * permutation matrix, L is unit lower triangular and D is block diagonal with 
 * block order 1 or 2.  Block diagonal matrix D is modified to make matrix 
 * (A + dA) positive definite.  Each 1-by-1 diagonal block is modified so that  
 * either dk = max{delta, |dk|} (Type-I modification) or dk = max{delta, dk} 
 * (Type-II modification), where dk is the kth diagonal element and delta is a 
 * preset modification tolerance.  The modification type is specified in the 
 * argument list.  For each 2-by-2 diagonal block, first compute its spectral 
 * decompostion, D = U * [lambda[0], 0; 0, lambda[1]] * U', where lambda[0] and 
 * lambda[1] are eigenvalues of D and U is orthogonal.  Then apply Type-I or 
 * Type-II modification to the eigenvalues and calculate the modified block 
 * diagonal D. 
 */
void modify_blk_diag( int type, double delta, int n, const int *ord, 
	int ldim, double *A )
{
	double U[2*2];
	double lambda[2*1];

	for ( int k = 0; k < n; ) {
		double *Akk = A + k + k*ldim;

		if ( ord[k] == 1 ) {		// 1-by-1 pivot
			if ( type == 1 ) {		// Type-I modification
				*Akk = fabs( *Akk );
			}
			if ( delta > *Akk ) {
				*Akk = delta;
			}
			k++;
		} else {					// 2-by-2 pivot
			ldlt_spectral_decomp( ldim, Akk, U, lambda );
			if ( type == 1 ) {		// Type-I modification
				lambda[0] = fabs( lambda[0] );
				lambda[1] = fabs( lambda[1] );
			}
			if ( delta > lambda[0] || delta > lambda[1] ) {
				for ( int i = 0; i < 2; i++ ) {
					if ( delta > lambda[i] ) {
						lambda[i] = delta;
					}
				}
				// D^ = U * [lambda[0]^, 0; 0, lambda[1]^] * U'
				*Akk = U[0] * lambda[0] * U[0] + U[1] * lambda[1] * U[1];
				*(Akk + 1) = U[2] * lambda[0] * U[0] + U[3] * lambda[1] * U[1];
				*(Akk + 1 + ldim) = U[2] * lambda[0] * U[2] + 
					U[3] * lambda[1] * U[3];
			}
			k += 2;
		}
	}
}

/*
 * Rectangular SAXPY (jki indexing) kernel of the modified Cholesky algorithm 
 * proposed by Gill, Murray and Wright.  The first n columns of an m-row, 
 * column-major matrix A (leading dimension ldim) are factored.  If the 
 * symmetric principal minor is not positive definite, it is perturbed such 
 * that (A + dA) is sufficiently positive definite and reasonably 
 * well-conditioned, giving P*(A+dA)*P' = L*D*L' with L unit lower triangular 
 * and D diagonal.  L and D overwrite A on and below the diagonal.
 *
 * The permutation is recorded in piv[] and ord[] by eval_pivot_diag().  Every
 * value of pivot currently selects that same diagonal pivoting routine.  
 * diag[] holds the running diagonal of the trailing sub-matrix; it is used for
 * pivot selection and updated after each column.  delta and beta_sqr are 
 * passed on to modify_pivot_gmw().  W is not referenced by this variant; it is 
 * present so that the signature matches chol_gmw_factor_blas().
 */
void chol_gmw_factor( char pivot, double delta, double beta_sqr, int m, int n,
	int *piv, int *ord, double *diag, int ldim, double *A, double *W  )
{
	for ( int col = 0; col < n; col++ ) {
		double *cur = A + col*ldim;			// column col of A
		double *pivot_elem = cur + col;		// A(col,col)

		// Pivot selection: 'D' and every other code share one strategy
		switch ( pivot ) {
		case 'D':
		default:
			eval_pivot_diag( m-col, col, &diag[col], piv, ord );
			break;
		}

		// Symmetric interchange, mirrored in the diagonal vector
		const int chosen = piv[col];
		if ( chosen != col ) {
			pivot_sym( m, col, chosen, ldim, A );
			const double held = diag[col];
			diag[col] = diag[chosen];
			diag[chosen] = held;
		}

		// Bring column col up to date: the diagonal comes from diag[], the
		// entries below it receive the updates of all earlier columns
		*pivot_elem = diag[col];
		for ( int prev = 0; prev < col; prev++ ) {
			const double *done = A + prev*ldim;
			const double d_prev = done[prev];
			const double l_col_prev = done[col];
			for ( int row = col+1; row < m; row++ ) {
				cur[row] -= done[row] * l_col_prev * d_prev;
			}
		}

		// Replace the pivot by its modified value so that (A + dA) is
		// sufficiently positive definite and reasonably well-conditioned
		const double d_col = modify_pivot_gmw( delta, beta_sqr, m-col, 
			pivot_elem );
		*pivot_elem = d_col;

		// Scale the sub-diagonal part of the column by the pivot to obtain L,
		// then downdate the diagonal vector used for pivot selection
		for ( int row = col+1; row < m; row++ ) {
			cur[row] /= d_col;
			diag[row] -= cur[row] * d_col * cur[row];
		}
	}
}

/*
 * BLAS-assisted rectangular SAXPY (jki indexing) kernel of the modified 
 * Cholesky algorithm proposed by Gill, Murray and Wright.  The first n columns
 * of an m-row, column-major matrix A (leading dimension ldim) are factored.  
 * If the symmetric principal minor is not positive definite, it is perturbed 
 * such that (A + dA) is sufficiently positive definite and reasonably 
 * well-conditioned, giving P*(A+dA)*P' = L*D*L' with L unit lower triangular 
 * and D diagonal.  L and D overwrite A on and below the diagonal.
 *
 * The permutation is recorded in piv[] and ord[] by eval_pivot_diag().  Every
 * value of pivot currently selects that same diagonal pivoting routine.  
 * diag[] holds the running diagonal of the trailing sub-matrix.  Working array
 * W (same leading dimension as A) receives, column by column, the unscaled 
 * column of L*D; the column update itself is delegated to 
 * reduce_ldlt_vector_blas().
 */
void chol_gmw_factor_blas( char pivot, double delta, double beta_sqr, 
	int m, int n, int *piv, int *ord, double *diag, int ldim, double *A, 
	double *W )
{
	for ( int col = 0; col < n; col++ ) {
		double *a_col = A + col*ldim;		// column col of A
		double *w_col = W + col*ldim;		// column col of W
		double *a_row = A + col;			// start of row col of A
		double *w_row = W + col;			// start of row col of W
		double *pivot_elem = a_col + col;	// A(col,col)

		// Pivot selection: 'D' and every other code share one strategy
		switch ( pivot ) {
		case 'D':
		default:
			eval_pivot_diag( m-col, col, &diag[col], piv, ord );
			break;
		}

		// Symmetric interchange of A and W, mirrored in the diagonal vector
		const int chosen = piv[col];
		if ( chosen != col ) {
			pivot_sym_blas( m, col, chosen, ldim, W, A );
			const double held = diag[col];
			diag[col] = diag[chosen];
			diag[chosen] = held;
		}

		// Bring column col up to date: the diagonal comes from diag[], the
		// entries below it are reduced by the BLAS helper
		*pivot_elem = diag[col];
		reduce_ldlt_vector_blas( m-col-1, col, 0, ord, ldim, a_row, w_row+1, 
			pivot_elem+1 );

		// Replace the pivot by its modified value so that (A + dA) is
		// sufficiently positive definite and reasonably well-conditioned
		const double d_col = modify_pivot_gmw( delta, beta_sqr, m-col, 
			pivot_elem );
		*pivot_elem = d_col;

		// Below the diagonal the column currently holds L*D.  Save it in W,
		// divide by the pivot to obtain L, then downdate the diagonal vector
		// used for pivot selection
		for ( int row = col+1; row < m; row++ ) {
			w_col[row] = a_col[row];
			a_col[row] /= d_col;
			diag[row] -= a_col[row] * d_col * a_col[row];
		}
	}
}

/*
 * Implements the modified Cholesky algorithm proposed by Gill, Murray and
 * Wright using simple blocking to optimize memory access.  If n-by-n symmetric
 * matrix A with leading dimension ldim is not positive definite, it is 
 * perturbed such that (A + dA) is sufficiently positive definite and reasonably
 * well-conditioned while preserving as much as possible the information 
 * contained in the Hessian.  Factorization yields P*(A+dA)*P' = L*D*L', where L
 * is unit lower triangular and D is diagonal.  The permutation matrix P is 
 * encoded in vectors piv[] and ord[], which contain the pivot and its order.  
 * The pivoting strategy is passed in the argument list.  The Gill-Murray-Wright 
 * algorithm modifies diagonal matrix D, computed by symmetric indefinite 
 * factorization, as the decomposition proceeds to make matrix (A + dA) positive 
 * definite. The blocked algorithm handler determines which implementation of 
 * SAXPY operation -- native (blas == 0) or using BLAS (blas != 0) -- to invoke
 * to factor a column block of matrix A.  Factors L and D overwrite matrix A on
 * and below the diagonal.  
 *
 * Two work arrays are allocated here and released before returning: W 
 * (ldim-by-bdim with BLAS, ldim-by-2 otherwise) and diag (ldim elements).  
 * The function has no way to report an error to its caller, so a failed 
 * allocation prints a message on stderr and terminates the program with 
 * EXIT_FAILURE.  Nothing is done when n <= 0.
 */
void chol_gmw_block_handler( int blas, char pivot, double delta, double beta_sqr,
	int n, int *piv, int *ord, int ldim, double *A )
{
	const int	lapack = 0;
	const int	bdim = get_block_dim_ldlt( lapack, blas, ldim );

	int		d, j, r, t;
	double	*Ajj, *L, *D, *W, *diag;
	void 	(*chol_gmw)( char pivot, double delta, double beta_sqr, 
				int m, int n, int *piv, int *ord, double *diag, int ldim, 
				double *A, double *W );

	// Nothing to factor.  Returning here also keeps a zero-byte malloc(),
	// which may legitimately return NULL, from being reported as a failure.
	if ( n <= 0 ) {
		return;
	}

	// Element counts are formed in size_t so that ldim*bdim cannot overflow int
	if ( blas ) {
		chol_gmw = chol_gmw_factor_blas;
		W = malloc( (size_t)ldim * (size_t)bdim * sizeof *W );
	} else {
		chol_gmw = chol_gmw_factor;
		W = malloc( (size_t)ldim * 2 * sizeof *W );
	}
	diag = malloc( (size_t)ldim * sizeof *diag );
	if ( W == NULL || diag == NULL ) {
		fprintf( stderr, "chol_gmw_block_handler: out of memory\n" );
		free( diag );
		free( W );
		exit( EXIT_FAILURE );
	}

	// For efficient memory access during pivot selection, copy diagonal 
	// elements of matrix A into vector diag[]
	for ( int k = 0; k < n; k++ ) {
		diag[k] = *(A + k + k*ldim);
	}
	
	j = 0;
	r = (bdim > n) ? n : bdim;
	// Perform rectangular factorization on first column block A(0:n-1,0:r-1)
	chol_gmw( pivot, delta, beta_sqr, n, r, &piv[j], &ord[j], &diag[j], 
		ldim, A, W );

	// d is the first index of the previous block, j of the current block and
	// t the order of the trailing sub-matrix that starts at (j,j)
	d = 0;
	j = bdim;
	t = n - bdim;
	Ajj = A;
	for ( ; j < n; j += bdim, d += bdim, t -= bdim ) {

		// Adjust pivot vector of previous block for diagonal offset: the 
		// kernel stores pivots relative to the start of its own block
		for ( int i = d; i < j; i++ ) {
			piv[i] += d;
		}
		L = Ajj + bdim;
		D = Ajj;
		Ajj = A + j + j*ldim;
		// Reduce trailing sub-matrix, P * A(j:n-1,j:n-1) * P' =
		// L(j:n-1,j-BDIM:j-1) * D(j-BDIM:j-1,j-BDIM:j-1) * L'(j-BDIM:j-1,j:n-1)
		// (r still holds the width of the previous block here)
		reduce_ldlt_mat_blk( blas, t, r, &ord[d], bdim, ldim, L, D, W+j, Ajj );

		r = t < bdim ? t : bdim;
		// Perform rectangular factorization on column block A(j:n-1,j:j+r-1)
		chol_gmw( pivot, delta, beta_sqr, t, r, &piv[j], &ord[j], &diag[j], 
			ldim, Ajj, W+j );

		// Apply permutation matrix for current block, encoded in piv(j:j+r-1),
		// to columns to the left of current block A(:,0:j-1).  piv[i] is still
		// relative to j at this point, hence the "+ j".
		for ( int i = j; i < j+r; i++ ) {
			if ( i != piv[i] + j ) {
				if ( blas ) {
					double *Ai_ = A + i;
					double *Ar_ = A + piv[i] + j;
					dswap_( &j, Ai_, &ldim, Ar_, &ldim );
				} else {
					for ( int k = 0; k < j; k++ ) {
						double aik = *(A + i + k*ldim);
						*(A + i + k*ldim) = *(A + piv[i] + j + k*ldim);
						*(A + piv[i] + j + k*ldim) = aik;
					}
				}
			}
		}	
	}
	// Adjust pivot vector of last block for diagonal offset
	for ( int i = d; i < n; i++ ) {
		piv[i] += d;
	}
	free( diag );
	free( W );
}

/******************************************************************************/

/*
 * Implements the modified Cholesky algorithm proposed by Gill, Murray and
 * Wright using the outer product method (kji indexing).  If n-by-n symmetric 
 * matrix A is not positive definite, it is perturbed such that (A + dA) is 
 * sufficiently positive definite and reasonably well-conditioned while 
 * preserving as much as possible the information contained in the Hessian.  
 * Factorization yields P*(A+dA)*P' = L*D*L', where L is unit lower triangular 
 * and D is diagonal.  The permutation matrix P is encoded in vectors piv[] and 
 * ord[], which contain the pivot and its order (=1).  The pivoting strategy is
 * passed in the argument list; every value currently selects diagonal 
 * pivoting.  The Gill-Murray-Wright algorithm modifies diagonal matrix D, 
 * computed by symmetric indefinite factorization, as the decomposition 
 * proceeds to make matrix (A + dA) positive definite.  Factors L and D 
 * overwrite matrix A on and below the diagonal.
 *
 * A is column-major with leading dimension n.  A scratch vector of n doubles
 * is allocated and released before returning.  The function has no way to 
 * report an error to its caller, so a failed allocation prints a message on 
 * stderr and terminates the program with EXIT_FAILURE.  Nothing is done when 
 * n <= 0.
 */
void chol_gmw_outer_product( char pivot, int n, int *piv, int *ord, double *A )
{
	const int ldim = n;

	if ( n <= 0 ) {
		return;
	}

	double delta = DBL_EPSILON;
	double beta_sqr = calc_beta_sqr_gmw( n, ldim, A );

	double *diag = malloc( (size_t)ldim * sizeof *diag );
	if ( diag == NULL ) {
		fprintf( stderr, "chol_gmw_outer_product: out of memory\n" );
		exit( EXIT_FAILURE );
	}

	for ( int k = 0; k < n; k++ ) {

		double *A_k = A + k*ldim;
		double *Akk = A_k + k;
		// For efficient memory access during pivot selection, copy diagonal 
		// elements of trailing sub-matrix A into vector diag[]
		for ( int j = k; j < n; j++ ) {
			diag[j] = *(A + j + j*ldim);
		}
		// Determine pivot using method specified in the argument list
		switch ( pivot ) {
		case 'D':
			eval_pivot_diag( n-k, k, &diag[k], piv, ord );
			break;
		default:
			eval_pivot_diag( n-k, k, &diag[k], piv, ord );
			break;
		}
		// Perform symmetric pivoting
		if ( k != piv[k] ) {
			pivot_sym( n, k, piv[k], ldim, A );
		}
		
		// Modify diagonal element (pivot) so that matrix (A + dA) is
		// sufficiently positive definite and reasonably well-conditioned
		double akk = modify_pivot_gmw( delta, beta_sqr, n-k, Akk );
		*Akk = akk;
		// Divide elements of column k below the diagonal by the diagonal element
		for ( int i = k+1; i < n; i++ ) {
			*(A_k + i)	/= akk;
		}
		// Update trailing sub-matrix by subtracting the outer product.
		// Because of symmetry need only update elements on and below diagonal
		for ( int j = k+1; j < n; j++ ) {
			double *A_j = A + j*ldim;
			double ajk = *(A_k + j);
			for ( int i = j; i < n; i++ ) {  
				*(A_j+i) -= *(A_k+i) * ajk * akk;
			}
		}
	}

	// The scratch vector was previously leaked on every call
	free( diag );
}

/*
 * Implements the modified Cholesky algorithm proposed by Gill, Murray and 
 * Wright using the SAXPY operation (jki indexing).  If n-by-n symmetric 
 * matrix A is not positive definite, it is perturbed such that (A + dA) is 
 * sufficiently positive definite and reasonably well-conditioned while 
 * preserving as much as possible the information contained in the Hessian.  
 * Factorization yields P*(A+dA)*P' = L*D*L', where L is unit lower triangular 
 * and D is diagonal.  The permutation matrix P is encoded in vectors piv[] and 
 * ord[], which contain the pivot and its order (=1).  The pivoting strategy is
 * passed in the argument list.  The Gill-Murray-Wright algorithm modifies 
 * diagonal matrix D, computed by symmetric indefinite factorization, as the 
 * decomposition proceeds to make matrix (A + dA) positive definite.  Factors L
 * and D overwrite matrix A on and below the diagonal.
 *
 * A is column-major with leading dimension n.  The whole matrix is handed to 
 * chol_gmw_factor() as a single column block.  A scratch vector of n doubles 
 * is allocated and released before returning.  The function has no way to 
 * report an error to its caller, so a failed allocation prints a message on 
 * stderr and terminates the program with EXIT_FAILURE.  Nothing is done when 
 * n <= 0.
 */
void chol_gmw_saxpy( const char pivot, const int n, int *piv, int *ord, 
	double *A )
{
	const int ldim = n;

	if ( n <= 0 ) {
		return;
	}

	double delta = DBL_EPSILON;
	double beta_sqr = calc_beta_sqr_gmw( n, ldim, A );

	// For efficient memory access during pivot selection, copy diagonal 
	// elements of matrix A into vector diag[] 
	double *diag = malloc( (size_t)ldim * sizeof *diag );
	if ( diag == NULL ) {
		fprintf( stderr, "chol_gmw_saxpy: out of memory\n" );
		exit( EXIT_FAILURE );
	}
	for ( int k = 0; k < n; k++ ) {
		diag[k] = *(A + k + k*ldim);
	}

	// chol_gmw_factor() never references its work-array argument, so no
	// placeholder buffer is allocated for it
	chol_gmw_factor( pivot, delta, beta_sqr, n, n, piv, ord, diag, ldim, A, 
		NULL );

	free( diag );
}

/*
 * Blocked Gill-Murray-Wright modified Cholesky factorization without BLAS.
 * A is an n-by-n symmetric, possibly indefinite, matrix stored in 
 * column-major order with leading dimension n.  The tolerance delta is 
 * DBL_EPSILON and beta_sqr comes from calc_beta_sqr_gmw(); the factorization 
 * itself is carried out by chol_gmw_block_handler() with its blas flag 
 * cleared.  When MODCHOL and PROFILE are both defined, the total time, the 
 * time spent on the modification and the latter as a percentage of the former
 * are written to stdout.
 */
void chol_gmw_block( char pivot, int n, int *piv, int *ord, double *A )
{
	const int use_blas = 0;
	const int lda = n;
#if defined(MODCHOL) && defined(PROFILE)
	struct	timespec all_begin, all_end, mod_begin, mod_end;

	tm_mod_chol = 0.0;
	tm_mod_fact = 0.0;
	get_time( &all_begin );

	get_time( &mod_begin );
#endif
	const double beta_sqr = calc_beta_sqr_gmw( n, lda, A );
	const double delta = DBL_EPSILON;
#if defined(MODCHOL) && defined(PROFILE)
	get_time( &mod_end );
	tm_mod_fact += timespec_diff( mod_begin, mod_end );
#endif

	chol_gmw_block_handler( use_blas, pivot, delta, beta_sqr, n, piv, ord, 
		lda, A );

#if defined(MODCHOL) && defined(PROFILE)
	get_time( &all_end );
	tm_mod_chol += timespec_diff( all_begin, all_end );
	fprintf( stdout, "%.4f\t\t%.4f\t\t%.2f\n", tm_mod_chol, tm_mod_fact, 
		tm_mod_fact/tm_mod_chol*100 );
#endif
}

/*
 * Blocked Gill-Murray-Wright modified Cholesky factorization using the BLAS
 * library.  A is an n-by-n symmetric, possibly indefinite, matrix stored in 
 * column-major order with leading dimension n.  The tolerance delta is 
 * DBL_EPSILON and beta_sqr comes from calc_beta_sqr_gmw(); the factorization 
 * itself is carried out by chol_gmw_block_handler() with its blas flag set.  
 * When MODCHOL and PROFILE are both defined, the total time, the time spent 
 * on the modification and the latter as a percentage of the former are 
 * written to stdout.
 */
void chol_gmw_block_blas( char pivot, int n, int *piv, int *ord, double *A )
{
	const int use_blas = 1;
	const int lda = n;
#if defined(MODCHOL) && defined(PROFILE)
	struct	timespec all_begin, all_end, mod_begin, mod_end;

	tm_mod_chol = 0.0;
	tm_mod_fact = 0.0;
	get_time( &all_begin );

	get_time( &mod_begin );
#endif
	const double beta_sqr = calc_beta_sqr_gmw( n, lda, A );
	const double delta = DBL_EPSILON;
#if defined(MODCHOL) && defined(PROFILE)
	get_time( &mod_end );
	tm_mod_fact += timespec_diff( mod_begin, mod_end );
#endif

	chol_gmw_block_handler( use_blas, pivot, delta, beta_sqr, n, piv, ord, 
		lda, A );

#if defined(MODCHOL) && defined(PROFILE)
	get_time( &all_end );
	tm_mod_chol += timespec_diff( all_begin, all_end );
	fprintf( stdout, "%.4f\t\t%.4f\t\t%.2f\n", tm_mod_chol, tm_mod_fact, 
		tm_mod_fact/tm_mod_chol*100 );
#endif
}


/*
 * Cheng-Higham modified Cholesky algorithm, outer product variant (kji 
 * indexing).  If n-by-n symmetric matrix A is not positive definite, it is 
 * perturbed such that (A + dA) is sufficiently positive definite and 
 * reasonably well-conditioned while preserving as much as possible the 
 * information contained in the Hessian.  Factorization yields
 * P*(A+dA)*P' = L*D*L', where L is unit lower triangular and D is block 
 * diagonal with block order 1 or 2.  The permutation matrix P is encoded in 
 * vectors piv[] and ord[], which contain the pivot and its order.  The 
 * pivoting strategy (Bunch-Kaufman, bounded Bunch-Kaufman or Bunch-Parlett) 
 * is passed in the argument list.
 *
 * The tolerance delta is computed from A before it is overwritten.  Then the
 * symmetric indefinite factorization is computed by ldlt_outer_product(), and
 * finally block diagonal matrix D receives a Type-II modification.  Factors L
 * and D overwrite matrix A (column-major, leading dimension n) on and below 
 * the diagonal.
 */
void chol_ch_outer_product( const char pivot, const int n, int *piv, int *ord, 
	double *A )
{
	enum { TYPE_II = 2 };

	// Must come first: the factorization below overwrites A
	const double tol = calc_delta_ch( n, n, A );

	ldlt_outer_product( pivot, n, piv, ord, A );
	modify_blk_diag( TYPE_II, tol, n, ord, n, A );
}

/*
 * Cheng-Higham modified Cholesky algorithm, SAXPY variant (jki indexing).  If
 * n-by-n symmetric matrix A is not positive definite, it is perturbed such 
 * that (A + dA) is sufficiently positive definite and reasonably 
 * well-conditioned while preserving as much as possible the information 
 * contained in the Hessian.  Factorization yields P*(A+dA)*P' = L*D*L', where
 * L is unit lower triangular and D is block diagonal with block order 1 or 2.
 * The permutation matrix P is encoded in vectors piv[] and ord[], which 
 * contain the pivot and its order.  The pivoting strategy (Bunch-Kaufman, 
 * bounded Bunch-Kaufman or Bunch-Parlett) is passed in the argument list.
 *
 * The tolerance delta is computed from A before it is overwritten.  Then the
 * symmetric indefinite factorization is computed by ldlt_saxpy(), and finally
 * block diagonal matrix D receives a Type-II modification.  Factors L and D 
 * overwrite matrix A (column-major, leading dimension n) on and below the 
 * diagonal.
 */
void chol_ch_saxpy( const char pivot, const int n, int *piv, int *ord, 
	double *A )
{
	enum { TYPE_II = 2 };

	// Must come first: the factorization below overwrites A
	const double tol = calc_delta_ch( n, n, A );

	ldlt_saxpy( pivot, n, piv, ord, A );
	modify_blk_diag( TYPE_II, tol, n, ord, n, A );
}

/*
 * Cheng-Higham modified Cholesky algorithm using simple blocking to optimize 
 * memory access.  If n-by-n symmetric matrix A is not positive definite, it 
 * is perturbed such that (A + dA) is sufficiently positive definite and 
 * reasonably well-conditioned while preserving as much as possible the 
 * information contained in the Hessian.  Factorization yields 
 * P*(A+dA)*P' = L*D*L', where L is unit lower triangular and D is block 
 * diagonal with block order 1 or 2.  The permutation matrix P is encoded in 
 * vectors piv[] and ord[], which contain the pivot and its order.  The 
 * pivoting strategy (Bunch-Kaufman, bounded Bunch-Kaufman or Bunch-Parlett) 
 * is passed in the argument list.
 *
 * The tolerance delta is computed from A before it is overwritten.  Then the
 * symmetric indefinite factorization is computed by ldlt_block(), and finally
 * block diagonal matrix D receives a Type-II modification.  Factors L and D 
 * overwrite matrix A (column-major, leading dimension n) on and below the 
 * diagonal.  When MODCHOL and PROFILE are both defined, the total time, the 
 * time spent on the modification (tolerance plus block-diagonal update) and 
 * the latter as a percentage of the former are written to stdout.
 */
void chol_ch_block( const char pivot, const int n, int *piv, int *ord, 
	double *A )
{
	enum { TYPE_II = 2 };
	const int	lda = n;
#if defined(MODCHOL) && defined(PROFILE)
	struct	timespec all_begin, all_end, mod_begin, mod_end;

	tm_mod_chol = 0.0;
	tm_mod_fact = 0.0;
	get_time( &all_begin );

	get_time( &mod_begin );
#endif
	// Must come first: the factorization below overwrites A
	const double tol = calc_delta_ch( n, lda, A );
#if defined(MODCHOL) && defined(PROFILE)
	get_time( &mod_end );
	tm_mod_fact += timespec_diff( mod_begin, mod_end );
#endif

	ldlt_block( pivot, n, piv, ord, A );
#if defined(MODCHOL) && defined(PROFILE)
	get_time( &mod_begin );
#endif
	modify_blk_diag( TYPE_II, tol, n, ord, lda, A );
#if defined(MODCHOL) && defined(PROFILE)
	get_time( &mod_end );
	tm_mod_fact += timespec_diff( mod_begin, mod_end );

	get_time( &all_end );
	tm_mod_chol += timespec_diff( all_begin, all_end );
	fprintf( stdout, "%.4f\t\t%.4f\t\t%.2f\n", tm_mod_chol, tm_mod_fact, 
		tm_mod_fact/tm_mod_chol*100 );
#endif
}

/*
 * Cheng-Higham modified Cholesky algorithm using simple blocking and BLAS.  
 * If n-by-n symmetric matrix A is not positive definite, it is perturbed such
 * that (A + dA) is sufficiently positive definite and reasonably 
 * well-conditioned while preserving as much as possible the information 
 * contained in the Hessian.  Factorization yields P*(A+dA)*P' = L*D*L', where
 * L is unit lower triangular and D is block diagonal with block order 1 or 2.
 * The permutation matrix P is encoded in vectors piv[] and ord[], which 
 * contain the pivot and its order.  The pivoting strategy (Bunch-Kaufman, 
 * bounded Bunch-Kaufman or Bunch-Parlett) is passed in the argument list.
 *
 * The tolerance delta is computed from A before it is overwritten.  Then the
 * symmetric indefinite factorization is computed by ldlt_block_blas(), and 
 * finally block diagonal matrix D receives a Type-II modification.  Factors L
 * and D overwrite matrix A (column-major, leading dimension n) on and below 
 * the diagonal.  When MODCHOL and PROFILE are both defined, the total time, 
 * the time spent on the modification (tolerance plus block-diagonal update) 
 * and the latter as a percentage of the former are written to stdout.
 */
void chol_ch_block_blas( const char pivot, const int n, int *piv, int *ord, 
	double *A )
{
	enum { TYPE_II = 2 };
	const int	lda = n;
#if defined(MODCHOL) && defined(PROFILE)
	struct	timespec all_begin, all_end, mod_begin, mod_end;

	tm_mod_chol = 0.0;
	tm_mod_fact = 0.0;
	get_time( &all_begin );

	get_time( &mod_begin );
#endif
	// Must come first: the factorization below overwrites A
	const double tol = calc_delta_ch( n, lda, A );
#if defined(MODCHOL) && defined(PROFILE)
	get_time( &mod_end );
	tm_mod_fact += timespec_diff( mod_begin, mod_end );
#endif

	ldlt_block_blas( pivot, n, piv, ord, A );
#if defined(MODCHOL) && defined(PROFILE)
	get_time( &mod_begin );
#endif
	modify_blk_diag( TYPE_II, tol, n, ord, lda, A );
#if defined(MODCHOL) && defined(PROFILE)
	get_time( &mod_end );
	tm_mod_fact += timespec_diff( mod_begin, mod_end );

	get_time( &all_end );
	tm_mod_chol += timespec_diff( all_begin, all_end );
	fprintf( stdout, "%.4f\t\t%.4f\t\t%.2f\n", tm_mod_chol, tm_mod_fact, 
		tm_mod_fact/tm_mod_chol*100 );
#endif
}
