/*
 * Algorithms implementing unblocked and blocked Cholesky factorization of 
 * symmetric positive definite matrices representing linear systems.  Unblocked
 * algorithms include the outer product method and SAXPY operation, while 
 * blocked algorithms include simple blocking, contiguous blocking and recursive 
 * contiguous blocking.  One implementation of a blocked algorithm uses tuned 
 * BLAS (Basic Linear Algebra Subroutines).  Also, function wrappers facilitate 
 * calling unblocked and blocked LAPACK Cholesky factorization routines DPOTF2 
 * and DPOTRF, respectively.
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#include "cholfact.h"
#include "lapack.h"
#include "matcom.h"
#include "timing.h"

/*
 * Forward declarations of the helper routines defined below.  Each one is
 * declared static here; the later definitions omit the keyword, but because
 * this prior declaration is visible they keep internal linkage.
 */
static void reduce_sym_matrix( int diag, int m, int n, int p, int ldimL, 
	const double *L, int ldimT, const double *T, int ldimA, double *A );
static void reduce_sym_mat_blk( int m, int n, int p, int ldim, int bdim,
	const double *L, const double *T, double *A );
static void reduce_sym_kernel( const int diag, 
	const double *L, const double *T, double *A );
static void reduce_sym_blk_ker( int diag, int m, int n, int p, int ldimL, 
	const double *L, int ldimT, const double *T, int ldimA, double *A );
static void tri_solve_xltb_matrix( int m, int n, int ldimL, const double *L,
	int ldimB, double *B );
static void tri_solve_xltb_mat_blk( int m, int n, int ldim, int bdim, 
	const double *L, double *B );
static void tri_solve_xltb_kernel( const double *L, double *B );
static void tri_solve_xltb_blk_ker( int m, int n, int ldimL, const double *L,
	int ldimB, double *B );
static void chol_kernel( const int n, double *A );
static void chol_blk_ker( int n, int ldim, double *A );
static void chol_factor( int m, int n, int ldim, double *A );

/******************************************************************************/

/*
 * Chooses the block dimension used by the blocked Cholesky routines for a
 * matrix with leading dimension ldim.  When DEBUG is defined, the compile-time
 * constant BDIM is the candidate; otherwise LAPACK's ILAENV is queried
 * (ISPEC = 1, routine name "DPOTRF", option "L", first dimension ldim, the
 * remaining dimensions passed as -1).  A candidate that is less than or equal
 * to 1, or greater than ldim, is replaced by ldim, so that callers process the
 * whole matrix as a single block (i.e., effectively unblocked).
 */
int get_block_dim_chol( int ldim )
{
#if defined(DEBUG)
	int blk = BDIM;
#else
	const int	ispec = 1;			// ILAENV query selector
	const int	unused = -1;		// Placeholder for dimensions not supplied
	const char	*routine = "DPOTRF";
	const char	*uplo = "L";

	int blk = ilaenv_( &ispec, routine, uplo, 
		&ldim, &unused, &unused, &unused );
#endif
	// Keep the candidate only when it is a usable blocking factor
	return ( blk > 1 && blk <= ldim ) ? blk : ldim;
}

/*
 * Trailing sub-matrix update A = A - L*T' performed during factorization.
 * A is m-by-n, L is m-by-p and T is n-by-p; all three are stored in 
 * column-major order with leading dimensions ldimA, ldimL and ldimT, 
 * respectively.  When diag is non-zero, A is treated as a diagonal block of a
 * symmetric matrix and only the elements with row index >= column index are
 * updated; when diag is zero every element of A is updated.  The inner-most
 * loop is a SAXPY: a scalar multiple of a column of L is subtracted from a 
 * column of A.
 */
void reduce_sym_matrix( int diag, int m, int n, int p, int ldimL, 
	const double *L, int ldimT, const double *T, int ldimA, double *A ) 	
{
	for ( int col = 0; col < n; col++ ) {
		double *a_col = &A[col*ldimA];				// Column col of A
		const int first_row = diag ? col : 0;		// Skip rows above the diagonal

		for ( int term = 0; term < p; term++ ) {
			const double *l_col = &L[term*ldimL];	// Column term of L
			const double scale = T[col + term*ldimT];	// T(col,term) = T'(term,col)

			for ( int row = first_row; row < m; row++ ) {
				a_col[row] -= l_col[row] * scale;
			}
		}
	}
}

/*
 * Blocked trailing sub-matrix update A = A - L*T', where A is m-by-n and L and
 * T are m-by-p and n-by-p column blocks of a lower triangular matrix.  A, L 
 * and T are column-major and share the leading dimension ldim.  The update is
 * carried out one bdim-by-bdim tile at a time (edge tiles may be smaller) by
 * calling reduce_sym_matrix.  Only tiles on or below the block diagonal are
 * visited, and the tile on the block diagonal is updated on and below its own
 * diagonal only.
 */
void reduce_sym_mat_blk( int m, int n, int p, int ldim, int bdim,
	const double *L, const double *T, double *A )
{
	for ( int jb = 0; jb < n; jb += bdim ) {
		// Width of the current tile column of A
		const int ncols = (n - jb < bdim) ? (n - jb) : bdim;

		for ( int kb = 0; kb < p; kb += bdim ) {
			// Number of columns shared by the L and T tiles
			const int depth = (p - kb < bdim) ? (p - kb) : bdim;
			const double *t_tile = &T[jb + kb*ldim];

			// Start at the block diagonal; tiles above it are never touched
			for ( int ib = jb; ib < m; ib += bdim ) {
				const int nrows = (m - ib < bdim) ? (m - ib) : bdim;

				reduce_sym_matrix( ib == jb, nrows, ncols, depth,
					ldim, &L[ib + kb*ldim], ldim, t_tile, 
					ldim, &A[ib + jb*ldim] );
			}
		}
	}
}

/*
 * Fixed-size trailing update A = A - L*T' on contiguous KDIM-by-KDIM 
 * sub-blocks (column-major, leading dimension KDIM).  KDIM is a compile-time
 * constant defined outside this file, so all loop bounds are known to the
 * compiler.  When diag is non-zero only elements of A on and below its
 * diagonal are updated; otherwise the whole sub-block is updated.
 */
void reduce_sym_kernel( const int diag, 
	const double *L, const double *T, double *A ) 	
{
	for ( int col = 0; col < KDIM; col++ ) {
		double *a_col = &A[col*KDIM];				// Column col of A
		const int first_row = diag ? col : 0;		// Skip rows above the diagonal

		for ( int term = 0; term < KDIM; term++ ) {
			const double *l_col = &L[term*KDIM];	// Column term of L
			const double scale = T[col + term*KDIM];	// T(col,term) = T'(term,col)

			for ( int row = first_row; row < KDIM; row++ ) {
				a_col[row] -= l_col[row] * scale;
			}
		}
	}
}

/*
 * Trailing update A = A - L*T' on matrix blocks whose KDIM-by-KDIM sub-blocks
 * are stored contiguously.  A is an m-by-n block, and L and T are m-by-p and
 * n-by-p blocks; ldimA, ldimL and ldimT are the leading dimensions of the
 * blocks.  Sub-block (i,j) of a block with leading dimension ld starts at
 * offset i*KDIM + j*ld, where i and j are element indices that are multiples
 * of KDIM.  When diag is non-zero the block lies on the diagonal of the
 * symmetric matrix: sub-blocks above the diagonal are skipped, and the 
 * diagonal sub-block is updated on and below its own diagonal only.
 */
void reduce_sym_blk_ker( int diag, int m, int n, int p, int ldimL, 
	const double *L, int ldimT, const double *T, int ldimA, double *A ) 	
{
	for ( int jb = 0; jb < n; jb += KDIM ) {
		// For a diagonal block the first sub-block visited is the diagonal one
		const int first = diag ? jb : 0;

		for ( int kb = 0; kb < p; kb += KDIM ) {
			const double *t_sub = &T[jb*KDIM + kb*ldimT];

			for ( int ib = first; ib < m; ib += KDIM ) {
				reduce_sym_kernel( diag && ib == first, 
					&L[ib*KDIM + kb*ldimL], t_sub, &A[ib*KDIM + jb*ldimA] );
			}
		}
	}
}

/*
 * Solves X*L' = B for X by forward substitution, where L is an n-by-n lower
 * triangular matrix, L' is its transpose, and X and B are m-by-n.  L and B
 * are column-major with leading dimensions ldimL and ldimB.  B is overwritten
 * by X.  Column k of X is obtained by dividing column k of B by L(k,k); its
 * contribution is then removed from every later column of B.
 */
void tri_solve_xltb_matrix( int m, int n, int ldimL, const double *L,
	int ldimB, double *B )
{
	for ( int piv = 0; piv < n; piv++ ) {
		const double *l_piv = &L[piv*ldimL];		// Column piv of L
		double *x_piv = &B[piv*ldimB];				// Column piv of B, becomes X
		const double pivot = l_piv[piv];			// L(piv,piv)

		for ( int row = 0; row < m; row++ ) {
			x_piv[row] /= pivot;
		}

		for ( int col = piv + 1; col < n; col++ ) {
			const double coeff = l_piv[col];		// L(col,piv) = L'(piv,col)
			double *b_col = &B[col*ldimB];

			for ( int row = 0; row < m; row++ ) {
				b_col[row] -= x_piv[row] * coeff;
			}
		}
	}	
}

/*
 * Row-blocked forward substitution for X*L' = B, where L is an n-by-n lower
 * triangular matrix and X and B are m-by-n.  L and B are column-major with
 * the common leading dimension ldim, and X overwrites B.  The rows of B are
 * processed in strips of at most bdim rows; each strip is solved 
 * independently by tri_solve_xltb_matrix.
 */
void tri_solve_xltb_mat_blk( int m, int n, int ldim, int bdim, 
	const double *L, double *B )
{
	int row = 0;

	while ( row < m ) {
		const int remaining = m - row;
		const int nrows = (remaining < bdim) ? remaining : bdim;

		// Strip of B that starts at row `row`
		tri_solve_xltb_matrix( nrows, n, ldim, L, ldim, &B[row] ); 
		row += bdim;
	}
}

/*
 * Fixed-size forward substitution for X*L' = B on contiguous KDIM-by-KDIM
 * sub-blocks (column-major, leading dimension KDIM), where L is lower
 * triangular and L' is its transpose.  KDIM is a compile-time constant 
 * defined outside this file.  X overwrites B.
 */
void tri_solve_xltb_kernel( const double *L, double *B )
{
	for ( int piv = 0; piv < KDIM; piv++ ) {
		const double *l_piv = &L[piv*KDIM];			// Column piv of L
		double *x_piv = &B[piv*KDIM];				// Column piv of B, becomes X
		const double pivot = l_piv[piv];			// L(piv,piv)

		for ( int row = 0; row < KDIM; row++ ) {
			x_piv[row] /= pivot;
		}

		for ( int col = piv + 1; col < KDIM; col++ ) {
			const double coeff = l_piv[col];		// L(col,piv) = L'(piv,col)
			double *b_col = &B[col*KDIM];

			for ( int row = 0; row < KDIM; row++ ) {
				b_col[row] -= x_piv[row] * coeff;
			}
		}
	}	
}

/*
 * Forward substitution for X*L' = B on matrix blocks whose KDIM-by-KDIM
 * sub-blocks are stored contiguously.  L is an n-by-n lower triangular block
 * with leading dimension ldimL, and X and B are m-by-n blocks with leading
 * dimension ldimB; X overwrites B.  For each sub-block column j of B, the
 * contributions of the already solved columns k < j are subtracted 
 * (B_ij -= X_ik * L_jk'), after which X_ij * L_jj' = B_ij is solved for every
 * sub-block row i.
 */
void tri_solve_xltb_blk_ker( int m, int n, int ldimL, const double *L,
	int ldimB, double *B )
{
	for ( int jb = 0; jb < n; jb += KDIM ) {
		const double *l_row = &L[jb*KDIM];		// Sub-block row jb of L
		double *b_col = &B[jb*ldimB]; 			// Sub-block column jb of B

		// Remove contributions of columns that have already been solved
		for ( int kb = 0; kb < jb; kb += KDIM ) {
			const double *l_jk = &l_row[kb*ldimL];
			const double *x_col = &B[kb*ldimB];

			for ( int ib = 0; ib < m; ib += KDIM ) {
				reduce_sym_kernel( 0, &x_col[ib*KDIM], l_jk, &b_col[ib*KDIM] );
			}
		}

		// Solve against the diagonal sub-block of L
		const double *l_jj = &l_row[jb*ldimL];
		for ( int ib = 0; ib < m; ib += KDIM ) {
			tri_solve_xltb_kernel( l_jj, &b_col[ib*KDIM] );
		}
	}
}

/*
 * Cholesky factorization A = L*L' of the leading n columns of a contiguous
 * KDIM-by-KDIM sub-block (column-major, leading dimension KDIM), using the
 * SAXPY (jki) formulation.  KDIM is a compile-time constant defined outside
 * this file.  For each processed column, all KDIM rows on and below the
 * diagonal are updated and scaled, so rows n..KDIM-1 of the first n columns
 * are processed as well.  Column 0 is always processed, even when n < 1.
 * L overwrites the elements of A on and below the diagonal.
 */
void chol_kernel( const int n, double *A )
{
	int col = 0;

	// do/while: the first column is factorized unconditionally
	do {
		double *a_col = &A[col*KDIM];

		// Subtract the contributions of the columns already factorized
		for ( int prev = 0; prev < col; prev++ ) {
			const double *l_prev = &A[prev*KDIM];	// Column prev of L
			const double coeff = l_prev[col];		// L(col,prev) = L'(prev,col)

			for ( int row = col; row < KDIM; row++ ) {
				a_col[row] -= l_prev[row] * coeff;
			}
		}

		// Scale the column by the square root of its diagonal element
		const double pivot = sqrt( a_col[col] );
		for ( int row = col; row < KDIM; row++ ) {
			a_col[row] /= pivot;
		}

		col++;
	} while ( col < n );
}

/*
 * Cholesky factorization A = L*L' of an n-by-n symmetric matrix block whose
 * KDIM-by-KDIM sub-blocks are stored contiguously; ldim is the leading
 * dimension of the block, and sub-block (i,j) starts at offset 
 * i*KDIM + j*ldim.  For each sub-block column, the diagonal sub-block and the
 * sub-blocks below it are first reduced by all previously factorized columns,
 * then the diagonal sub-block is factorized and the sub-blocks below it are
 * obtained by forward substitution.  Sub-blocks above the diagonal are not
 * referenced.
 */
void chol_blk_ker( int n, int ldim, double *A )
{
	for ( int jb = 0; jb < n; jb += KDIM ) {
		// Number of columns to factorize in the diagonal sub-block
		const int ncols = (n - jb < KDIM) ? (n - jb) : KDIM;
		double *a_col = &A[jb*ldim];			// Sub-block column jb
		double *a_diag = &a_col[jb*KDIM];		// Diagonal sub-block (jb,jb)

		// Cumulative trailing updates from sub-block columns left of jb
		for ( int kb = 0; kb < jb; kb += KDIM ) {
			const double *t_sub = &A[jb*KDIM + kb*ldim];	// Sub-block (jb,kb)

			for ( int ib = jb; ib < n; ib += KDIM ) {
				const double *l_sub = &A[ib*KDIM + kb*ldim];	// Sub-block (ib,kb)

				reduce_sym_kernel( ib == jb, l_sub, t_sub, &a_col[ib*KDIM] );
			}
		}

		// Factorize the diagonal sub-block, then solve X*L' = A below it
		chol_kernel( ncols, a_diag );
		for ( int ib = jb + KDIM; ib < n; ib += KDIM ) {
			tri_solve_xltb_kernel( a_diag, &a_col[ib*KDIM] );
		}
	}
}

/*
 * Implements a rectangular version the SAXPY operation (jki indexing) for 
 * Cholesky factorization.  Symmetric positive definite m-by-n matrix A with 
 * leading dimension ldim is factored into a lower triangular matrix L, such 
 * that A = L*L', where L' is the transpose of L.  Elements of L are stored in 
 * A(k:n-1,k), base 0 indexing i.e., on and below the diagonal.  The inner-most 
 * loop subtracts a scalar multiple of a vector from another vector.
 */
void chol_factor( int m, int n, int ldim, double *A )
{
	for ( int j = 0; j < n; j++ ) {
		// Perform cumulative trailing sub-matrix updates on diagonal element 
		// and elements below the diagonal of column j
		double *A_j = A + j*ldim;
		for ( int k = 0; k < j; k++ ) {	
			double *L_k = A + k*ldim;				// Element L(0,k) = A(0,k)
			double ljk = *(L_k + j);				// Element L(j,k) = L'(k,j)
			for ( int i = j; i < m; i++ ) {
				*(A_j + i) -= *(L_k + i) * ljk;		// A(i,j) -= L(i,k) * L'(k,j)
			}
		}
		
		// Divide elements of column j by square root of the diagonal element
		double ajj = sqrt(*(A_j + j));
		for ( int i = j; i < m; i++ ) {
			*(A_j+i) /= ajj;
		}
	}
}

/******************************************************************************/

/*
 * Implements the outer product method (kji indexing) to factorize symmetric
 * positive definite n-by-n matrix A into a lower triangular matrix L, such that
 * A = L*L', where L' is the transpose of L.  Symmetric postive definite 
 * matrices have weighty diagonals, which precludes the need for pivoting.
 * Elements of L are stored in A(k:n-1,k), base 0 indexing i.e., on and below
 * the diagonal.  Each pass through the k-loop performs an outer product 
 * operation. 
 */
void chol_outer_product( int n, double *A )
{
	const int ldim = n;

	for ( int k = 0; k < n; k++ ) {

		// Divide elements of column k on and below the diagonal by the 
		// square root of the diagonal element
		double *A_k = A + k*ldim;
		double akk = sqrt( *(A_k + k) );
		*(A_k + k) = akk;
		for ( int i = k+1; i < n; i++ ) {
			*(A_k + i)	/= akk;
		}
		// Update trailing sub-matrix by subtracting the outer product
		for ( int j = k+1; j < n; j++ ) {
			double *A_j = A + j*ldim;
			double ajk = *(A_k + j);
			for ( int i = j; i < n; i++ ) {
				*(A_j+i) -= *(A_k+i) * ajk;
			}
		}
	}
}

/*
 * SAXPY (jki indexing) Cholesky factorization of a symmetric positive definite
 * n-by-n matrix A into a lower triangular matrix L with A = L*L', where L' is
 * the transpose of L.  No pivoting is performed.  A is column-major with 
 * leading dimension n, and L overwrites A(k:n-1,k), base 0 indexing, i.e., the
 * elements on and below the diagonal.  This is the square case of the
 * rectangular routine chol_factor.
 */
void chol_saxpy( int n, double *A )
{
	// Square matrix: row count, column count and leading dimension are all n
	chol_factor( n, n, n, A );
}

/*
 * Implements simple blocking to factorize symmetric positive definite n-by-n 
 * matrix A into a lower triangular matrix L, such that A = L*L', where L' is the
 * transpose of L.  Suppose A is decomposed into blocks [A_00, A_01; A_10, A_11], 
 * where A_00 is an r-by-r block matrix.  First, an implementation of the SAXPY
 * operation computes the Cholesky factorization of r-by-r diagonal block,
 * A_00 = L_00*L_00'.  Then, solve for L_10 in the triangular system of linear 
 * equations L_00 * L_10' = A_10', and update the trailing sub-matrix, 
 * A_11 = A_11 - L_10 * L_10'.  This procedure is repeated iteratively on the 
 * trailing sub-matrix until the last diagonal block (dimension less than or 
 * equal to r) is reached.  Simple blocking is also used to optimize memory 
 * access when updating the trailing sub-matrix.
 *
 * A is stored in column-major order with leading dimension n and is 
 * overwritten on and below the diagonal by L.  The block dimension is obtained
 * from get_block_dim_chol.  When both CHOLFACT and PROFILE are defined, the 
 * time spent in each phase (factorize, triangular solve, reduce) is 
 * accumulated and one tab-separated line is written to stdout: the four 
 * elapsed values as returned by timespec_diff, followed by each phase as a 
 * percentage of the total.
 */
void chol_block( int n, double *A )
{
	const int ldim = n;
	const int bdim = get_block_dim_chol( ldim );

	int		r, t;			// r: order of current diagonal block; t: order of trailing sub-matrix
	double	*Ajj, *L;		// Ajj: current diagonal block; L: panel below the previous diagonal block

#if defined(CHOLFACT) && defined(PROFILE)
	struct		timespec sta_chol, sta_factor, sta_tri_solve, sta_reduce,
				end_chol, end_factor, end_tri_solve, end_reduce;
	double tm_chol = 0.0;
	double tm_factor = 0.0;
	double tm_tri_solve = 0.0;
	double tm_reduce = 0.0;

	get_time( &sta_chol );
#endif

	// Factorize the leading r-by-r diagonal block in place
	Ajj = A;
	r = (bdim > n) ? n : bdim;
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &sta_factor );
#endif
	chol_factor( r, r, ldim, Ajj );
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &end_factor );
	tm_factor += timespec_diff( sta_factor, end_factor );
#endif

	for ( int j = bdim; j < n; j += bdim ) {
		t = n - j;				// Rows (and columns) remaining from index j on
		L = Ajj + bdim;			// Panel directly below the previous diagonal block
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &sta_tri_solve );
#endif
		// Solve X*L_jj' = panel; r still holds the order of the previously
		// factorized diagonal block, and X overwrites the panel
		tri_solve_xltb_mat_blk( t, r, ldim, bdim, Ajj, L );
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &end_tri_solve );
	tm_tri_solve += timespec_diff( sta_tri_solve, end_tri_solve );
#endif
		Ajj = A + j*ldim + j;	// Advance to diagonal element A(j,j)
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &sta_reduce );
#endif
		// Trailing update: subtract panel * panel' from the t-by-t sub-matrix
		reduce_sym_mat_blk( t, t, bdim, ldim, bdim, L, L, Ajj );
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &end_reduce );
	tm_reduce += timespec_diff( sta_reduce, end_reduce );
#endif
		r = (j + bdim > n) ? t : bdim;	// Last diagonal block may be smaller than bdim
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &sta_factor );
#endif
		// Factorize the updated diagonal block
		chol_factor( r, r, ldim, Ajj );
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &end_factor );
	tm_factor += timespec_diff( sta_factor, end_factor );
#endif
	}

#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &end_chol );
	tm_chol += timespec_diff( sta_chol, end_chol );
	// Columns: total, factor, solve, reduce, then the three phases as a
	// percentage of the total
	fprintf( stdout, "%.3f\t%.3f\t\t%.3f\t\t%.3f\t\t%.1f\t\t%.1f\t\t%.1f\n",
	tm_chol, tm_factor, tm_tri_solve, tm_reduce,
	tm_factor/tm_chol*100, tm_tri_solve/tm_chol*100, tm_reduce/tm_chol*100 );
#endif
}

/*
 * Simple blocking with a rectangular panel factorization.  Factorizes the
 * symmetric positive definite n-by-n matrix A (column-major, leading 
 * dimension n) into a lower triangular matrix L such that A = L*L', where L'
 * is the transpose of L.  With A decomposed into blocks 
 * [A_00, A_01; A_10, A_11], where A_00 is r-by-r, the rectangular SAXPY 
 * routine chol_factor factorizes the whole column block
 * [A_00; A_10] = [L_00; L_10]*L_00' in one call, after which the trailing
 * sub-matrix is updated, A_11 = A_11 - L_10*L_10'.  The procedure is repeated
 * on the trailing sub-matrix until the last diagonal block (of order at most
 * r) has been factorized.  The trailing update itself is blocked.  The block
 * dimension is obtained from get_block_dim_chol.
 */
void chol_rect_block( int n, double *A )
{
	const int ldim = n;
	const int bdim = get_block_dim_chol( ldim );

	// Leading column block: all n rows of the first min(bdim, n) columns
	double *diag_blk = A;
	chol_factor( n, (bdim > n) ? n : bdim, ldim, diag_blk );

	for ( int j = bdim; j < n; j += bdim ) {
		const int rem = n - j;						// Order of the trailing sub-matrix
		const double *panel = diag_blk + bdim;		// L_10 of the previous step

		diag_blk = &A[j + j*ldim];					// Element A(j,j)
		reduce_sym_mat_blk( rem, rem, bdim, ldim, bdim, panel, panel, diag_blk );

		// Factorize the next column block; the last one may be narrower
		chol_factor( rem, (rem < bdim) ? rem : bdim, ldim, diag_blk );
	}	
}

/*
 * Implements contiguous blocking to factorize symmetric positive definite 
 * n-by-n matrix A into a lower triangular matrix L such that A = L*L', where
 * L' is the transpose of L.  Matrix A, which is stored in column-major order,
 * is first copied to a temporary array AA, which stores contiguous blocks.  
 * Cholesky factorization yields lower triangular matrix L stored in contiguous
 * blocks in array AA, which is then copied back to array A, where matrix 
 * elements are stored in conventional column-major order. 
 *
 * Layout of AA as used here: block column j (element index j, width s) starts
 * at offset j*n, and the block at row index i within that block column starts
 * at offset i*s; each block is column-major with its own row count as the
 * leading dimension.
 *
 * Nothing is done when n <= 0.  If the workspace cannot be allocated, a 
 * message is written to stderr and the program exits with EXIT_FAILURE.
 */
void chol_contig_block( int n, double *A )
{
	const int 	ldim = n;
	const int	bdim = get_block_dim_chol( ldim );

	double 	*AA, *L, *T;

	// Nothing to factorize.  This also keeps a zero-size malloc() that returns
	// NULL from being mistaken for an allocation failure below.
	if ( n <= 0 ) {
		return;
	}

	// Sizes and offsets are computed in size_t so that products such as 
	// ldim*ldim are not evaluated (and possibly overflowed) in int
	const size_t ld = (size_t) ldim;
	const size_t bd = (size_t) bdim;

	AA = malloc( ld * ld * sizeof *AA );
	if ( AA == NULL ) {
		fprintf( stderr, 
			"chol_contig_block: cannot allocate %d-by-%d workspace\n", n, n );
		exit( EXIT_FAILURE );
	}

	form_contig_blocks( n, n, ldim, A, n, n, bdim, ldim, AA );
	for ( int j = 0; j < n; j += bdim ) {
		// Width of block column j (the last one may be narrower than bdim)
		int s = (j + bdim > n) ? (n - j) : bdim;
		double *A_j = AA + (size_t) j * ld;				// Block column j
		double *Ajj = A_j + (size_t) j * (size_t) s;	// Diagonal block (j,j)
		T = AA + (size_t) j * bd;						// Block (j,0)

		// Perform cumulative trailing sub-matrix updates on diagonal block 
		// and matrix blocks below the diagonal
		for ( int k = 0; k < j; k += bdim ) {
			int diag = 1;		// Diagonal block = TRUE
			L = T;				// Block (j,k); stepped down the block column below

			for ( int i = j; i < n; i += bdim ) {
				// Height of block row i
				int r = (i + bdim > n) ? (n - i) : bdim;
				double *Aij = A_j + (size_t) i * (size_t) s;
				reduce_sym_matrix( diag, r, s, bdim, r, L, s, T, r, Aij );
				L = L + bd * bd;	// Next block down in block column k
				diag = 0;		// Diagonal block = FALSE
			}
			T = T + bd * ld;		// Block (j,k+bdim)
		}

		// Factorize diagonal block, and solve X*L' = A using forward
		// substitution on blocks below the diagonal
		chol_factor( s, s, s, Ajj );
		T = Ajj;
		for ( int i = j+bdim; i < n; i += bdim ) {
			int r = (i + bdim > n) ? (n - i) : bdim;
			// This loop only runs for full-width block columns, where the
			// block width equals bdim
			double *Aij = A_j + (size_t) i * bd;
			tri_solve_xltb_matrix( r, bdim, bdim, T, r, Aij );
		}
	}	
	unpack_contig_blocks( n, n, bdim, ldim, AA, n, n, ldim, A );
	free( AA );
}

/*
 * Implements recursive contiguous blocking to factorize symmetric positive 
 * definite n-by-n matrix A into a lower triangular matrix L such that A = L*L', 
 * where L' is the transpose of L.  Matrix A, which is stored in column-major 
 * order, is first copied to a temporary array AA, which stores recursive 
 * contiguous blocks.  That is, matrix blocks are stored contiguously, and 
 * within each block, sub-blocks of size KDIM*KDIM are stored contiguously.  
 * Cholesky factorization yields lower triangular matrix L stored in recursive 
 * contiguous blocks in array AA, which is then copied back to array A, where 
 * matrix elements are stored in conventional column-major order. 
 *
 * The workspace has order nn, which is n rounded up to a multiple of KDIM.
 * The block dimension returned by get_block_dim_chol is rounded to the nearest
 * multiple of KDIM (ties round down), and up to KDIM when it is smaller than
 * KDIM.
 *
 * Nothing is done when n <= 0.  If the workspace cannot be allocated, a 
 * message is written to stderr and the program exits with EXIT_FAILURE.
 */
void chol_recur_block( int n, double *A )
{
	const int	nn = (n / KDIM) * KDIM + ((n % KDIM) ? KDIM : 0);
	const int	ldim = nn;

	int		bdim, bdim_low, bdim_high;
	double 	*AA, *L, *T;

	// Nothing to factorize.  This also keeps a zero-size malloc() that returns
	// NULL from being mistaken for an allocation failure below.
	if ( n <= 0 ) {
		return;
	}

	// Round the block dimension to a multiple of KDIM
	bdim = get_block_dim_chol( ldim );
	bdim_low = (bdim / KDIM) * KDIM;
	bdim_high = (bdim / KDIM) * KDIM + ((bdim % KDIM) ? KDIM : 0);
	if ( bdim_low == 0 ) {
		bdim = bdim_high;
	} else {
		if ( (bdim - bdim_low) > (bdim_high - bdim) ) {
			bdim = bdim_high;
		} else {
			bdim = bdim_low;
		}
	}

	// Sizes and offsets are computed in size_t so that products such as 
	// ldim*ldim are not evaluated (and possibly overflowed) in int
	const size_t ld = (size_t) ldim;
	const size_t bd = (size_t) bdim;

	AA = malloc( ld * ld * sizeof *AA );
	if ( AA == NULL ) {
		fprintf( stderr, 
			"chol_recur_block: cannot allocate %d-by-%d workspace\n", nn, nn );
		exit( EXIT_FAILURE );
	}

	form_recur_blocks( n, n, n, A, nn, nn, KDIM, bdim, ldim, AA);
	for ( int j = 0; j < nn; j += bdim ) {
		// s: columns of A in this block column; q: columns including padding
		int s = (j + bdim > n) ? (n - j) : bdim;
		int q = (j + bdim > nn) ? (nn - j) : bdim;
		double *A_j = AA + (size_t) j * ld;				// Block column j
		double *Ajj = A_j + (size_t) j * (size_t) q;	// Diagonal block (j,j)
		T = AA + (size_t) j * bd;						// Block (j,0)

		// Perform cumulative trailing sub-matrix updates on diagonal block 
		// and matrix blocks below the diagonal
		for ( int k = 0; k < j; k += bdim ) {
			int diag = 1;		// Diagonal block = TRUE
			L = T;				// Block (j,k); stepped down the block column below

			for ( int i = j; i < nn; i += bdim ) {
				// r: rows of A in this block row; p: rows including padding
				int r = (i + bdim > n) ? (n - i) : bdim;
				int p = (i + bdim > nn) ? (nn - i) : bdim;
				double *Aij = A_j + (size_t) i * (size_t) q;
				reduce_sym_blk_ker( diag, r, s, bdim, p, L, q, T, p, Aij );
				L = L + bd * bd;	// Next block down in block column k
				diag = 0;		// Diagonal block = FALSE
			}
			T = T + bd * ld;		// Block (j,k+bdim)
		}

		// Factorize diagonal block, and solve X*L' = A using forward
		// substitution on blocks below the diagonal
		chol_blk_ker( s, q, Ajj );
		T = Ajj;
		for ( int i = j+bdim; i < nn; i += bdim ) {
			int r = (i + bdim > n) ? (n - i) : bdim;
			int p = (i + bdim > nn) ? (nn - i) : bdim;
			// This loop only runs for full-width block columns, where the
			// padded block width equals bdim
			double *Aij = A_j + (size_t) i * bd;
			tri_solve_xltb_blk_ker( r, bdim, bdim, T, p, Aij );
		}
	}
	unpack_recur_blocks( nn, nn, KDIM, bdim, ldim, AA, n, n, n, A );
	free( AA );
}

/*
 * Implements simple blocking to factorize symmetric positive definite n-by-n 
 * matrix A into a lower triangular matrix L, such that A = L*L', where L' is the
 * transpose of L.  Suppose A is decomposed into blocks [A_00, A_01; A_10, A_11], 
 * where A_00 is an r-by-r block matrix.  First, an implementation of the SAXPY
 * operation computes the Cholesky factorization of r-by-r diagonal block,
 * A_00 = L_00*L_00'.  BLAS routine DTRSM solves for L_10 in the triangular 
 * system of linear equations L_00 * L_10' = A_10'.  Then, BLAS routine DSYRK is
 * invoked to reduce the trailing sub-matrix, A_11 = A_11 - L_10 * L_10'.  This
 * procedure is repeated iteratively on the trailing sub-matrix until the last 
 * diagonal block (dimension less than or equal to r) is reached.
 *
 * A is stored in column-major order with leading dimension n and is 
 * overwritten on and below the diagonal by L.  The block dimension is obtained
 * from get_block_dim_chol.  When both CHOLFACT and PROFILE are defined, the 
 * time spent in each phase (factorize, triangular solve, reduce) is 
 * accumulated and one tab-separated line is written to stdout: the four 
 * elapsed values as returned by timespec_diff, followed by each phase as a 
 * percentage of the total.
 */
void chol_block_blas( int n, double *A )
{
	// Character and scalar arguments are passed to the BLAS routines by address
	const char	lower = 'L';
	const char	trans = 'T';
	const char	no_trans = 'N';
	const char	rhs = 'R';
	const char	not_unit = 'N';
	const int 	ldim = n;
	const int	bdim = get_block_dim_chol( ldim );
	const double		_one = -1.0;	// Scale factor -1 for the DSYRK update
	const double		one = 1.0;

	int 		r, t;		// r: order of current diagonal block; t: order of trailing sub-matrix
	double	*Ajj, *L;		// Ajj: current diagonal block; L: panel below the previous diagonal block

#if defined(CHOLFACT) && defined(PROFILE)
	struct		timespec sta_chol, sta_factor, sta_tri_solve, sta_reduce,
				end_chol, end_factor, end_tri_solve, end_reduce;
	double tm_chol = 0.0;
	double tm_factor = 0.0;
	double tm_tri_solve = 0.0;
	double tm_reduce = 0.0;

	get_time( &sta_chol );
#endif

	// Factorize the leading r-by-r diagonal block in place
	Ajj = A;
	r = (bdim > n) ? n : bdim;
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &sta_factor );
#endif
	chol_factor( r, r, ldim, Ajj );
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &end_factor );
	tm_factor += timespec_diff( sta_factor, end_factor );
#endif

	for ( int j = bdim; j < n; j += bdim ) {
		t = n - j;				// Rows (and columns) remaining from index j on
		L = Ajj + bdim;			// Panel directly below the previous diagonal block
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &sta_tri_solve );
#endif
		// Right-side solve X*Ajj' = L with the lower triangle of Ajj (non-unit
		// diagonal); the t-by-bdim solution overwrites the panel L
		dtrsm_( &rhs, &lower, &trans, &not_unit, &t, &bdim, 
			&one, Ajj, &ldim, L, &ldim );
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &end_tri_solve );
	tm_tri_solve += timespec_diff( sta_tri_solve, end_tri_solve );
#endif
		Ajj = A + j*ldim + j;	// Advance to diagonal element A(j,j)
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &sta_reduce );
#endif
		// Rank-bdim update of the lower triangle: Ajj = Ajj - L*L'
		dsyrk_( &lower, &no_trans, &t, &bdim, &_one, L, &ldim, 
			&one, Ajj, &ldim );
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &end_reduce );
	tm_reduce += timespec_diff( sta_reduce, end_reduce );
#endif
		r = (j + bdim > n) ? t : bdim;	// Last diagonal block may be smaller than bdim
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &sta_factor );
#endif
		// Factorize the updated diagonal block
		chol_factor( r, r, ldim, Ajj );
#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &end_factor );
	tm_factor += timespec_diff( sta_factor, end_factor );
#endif
	}

#if defined(CHOLFACT) && defined(PROFILE)
	get_time( &end_chol );
	tm_chol += timespec_diff( sta_chol, end_chol );
	// Columns: total, factor, solve, reduce, then the three phases as a
	// percentage of the total
	fprintf( stdout, "%.3f\t%.3f\t\t%.3f\t\t%.3f\t\t%.1f\t\t%.1f\t\t%.1f\n",
	tm_chol, tm_factor, tm_tri_solve, tm_reduce,
	tm_factor/tm_chol*100, tm_tri_solve/tm_chol*100, tm_reduce/tm_chol*100 );
#endif
}

/*
 * Implements contiguous blocking to factorize symmetric positive definite 
 * n-by-n matrix A into a lower triangular matrix L such that A = L*L', where
 * L' is the transpose of L.  Matrix A, which is stored in column-major order,
 * is first copied to a temporary array AA, which stores contiguous blocks.  
 * Cholesky factorization yields lower triangular matrix L stored in contiguous
 * blocks in array AA, which is then copied back to array A, where matrix 
 * elements are stored in conventional column-major order.  LAPACK unblocked 
 * routine DPOTF2 computes the Cholesky factorization of a diagonal block; BLAS
 * routine DTRSM solves for blocks of the lower triangular matrix; and BLAS 
 * routines DSYRK and DGEMM update the trailing sub-matrix.
 *
 * Layout of AA as used here: block column j (element index j, width s) starts
 * at offset j*n, and the block at row index i within that block column starts
 * at offset i*s; each block is column-major with its own row count as the
 * leading dimension.
 *
 * Nothing is done when n <= 0.  If the workspace cannot be allocated, a 
 * message is written to stderr and the program exits with EXIT_FAILURE.  The
 * status value returned by DPOTF2 is not inspected.
 */
void chol_contig_block_blas( int n, double *A )
{
	const char	lower = 'L';
	const char	trans = 'T';
	const char	no_trans = 'N';
	const char	rhs = 'R';
	const char	not_unit = 'N';
	const int 	ldim = n;
	const int	bdim = get_block_dim_chol( ldim );
	const double		_one = -1.0;	// Scale factor -1 for the updates
	const double		one = 1.0;	

	int		info = 0;
	double 	*AA, *L, *T;

	// Nothing to factorize.  This also keeps a zero-size malloc() that returns
	// NULL from being mistaken for an allocation failure below.
	if ( n <= 0 ) {
		return;
	}

	// Sizes and offsets are computed in size_t so that products such as 
	// ldim*ldim are not evaluated (and possibly overflowed) in int
	const size_t ld = (size_t) ldim;
	const size_t bd = (size_t) bdim;

	AA = malloc( ld * ld * sizeof *AA );
	if ( AA == NULL ) {
		fprintf( stderr, 
			"chol_contig_block_blas: cannot allocate %d-by-%d workspace\n", 
			n, n );
		exit( EXIT_FAILURE );
	}

	form_contig_blocks( n, n, ldim, A, n, n, bdim, ldim, AA );

	for ( int j = 0; j < n; j += bdim ) {
		// Width of block column j (the last one may be narrower than bdim)
		int s = (j + bdim > n) ? (n - j) : bdim;
		double *A_j = AA + (size_t) j * ld;				// Block column j
		double *Ajj = A_j + (size_t) j * (size_t) s;	// Diagonal block (j,j)
		T = AA + (size_t) j * bd;						// Block (j,0)

		// Perform cumulative trailing sub-matrix updates on diagonal block 
		// and matrix blocks below the diagonal
		for ( int k = 0; k < j; k += bdim ) {
			int diag = 1;		// Diagonal block = TRUE
			L = T;				// Block (j,k); stepped down the block column below

			for ( int i = j; i < n; i += bdim ) {
				// Height of block row i
				int r = (i + bdim > n) ? (n - i) : bdim;
				double *Aij = A_j + (size_t) i * (size_t) s;
				if ( diag == 0 ) {
					// Off-diagonal block: general update Aij -= L*T'
					dgemm_( &no_trans, &trans, &r, &s, &bdim, &_one, L, &r,
						T, &s, &one, Aij, &r );
				} else {
					// Diagonal block: symmetric update of the lower triangle
					dsyrk_( &lower, &no_trans, &s, &bdim, &_one, L, &r, 
						&one, Aij, &r );
				}
				L = L + bd * bd;	// Next block down in block column k
				diag = 0;		// Diagonal block = FALSE
			}
			T = T + bd * ld;		// Block (j,k+bdim)
		}

		// Factorize diagonal block, and solve X*L' = A using forward
		// substitution on blocks below the diagonal
		dpotf2_( &lower, &s, Ajj, &s, &info );
		T = Ajj;
		for ( int i = j+bdim; i < n; i += bdim ) {
			int r = (i + bdim > n) ? (n - i) : bdim;
			// This loop only runs for full-width block columns, where the
			// block width equals bdim
			double *Aij = A_j + (size_t) i * bd;
			dtrsm_( &rhs, &lower, &trans, &not_unit, &r, &bdim, 
				&one, T, &bdim, Aij, &r );
		}
	}	
	unpack_contig_blocks( n, n, bdim, ldim, AA, n, n, ldim, A );
	free( AA );
}

/*
 * Thin wrapper around LAPACK routine DPOTF2, the unblocked Cholesky 
 * factorization of a real symmetric positive definite matrix.  The lower
 * triangle of the n-by-n column-major matrix A (leading dimension n) is
 * requested.  The status value reported by DPOTF2 is discarded.
 */
void chol_lapack_unblocked( int n, double *A )
{
	const char	uplo = 'L';		// Operate on the lower triangle
	const int	lda = n;
	int			status = 0;		// Written by DPOTF2; not inspected

	dpotf2_( &uplo, &n, A, &lda, &status );
}

/*
 * Thin wrapper around LAPACK routine DPOTRF, which computes the Cholesky 
 * factorization of a real symmetric positive definite matrix.  The lower
 * triangle of the n-by-n column-major matrix A (leading dimension n) is
 * requested.  The status value reported by DPOTRF is discarded.
 */
void chol_lapack( int n, double *A )
{
	const char	uplo = 'L';		// Operate on the lower triangle
	const int	lda = n;
	int			status = 0;		// Written by DPOTRF; not inspected

	dpotrf_( &uplo, &n, A, &lda, &status );
}
