/*
 * Algorithms implementing unblocked and blocked LU factorization (Gaussian 
 * elimination) of nonsingular matrices representing linear systems.  Unblocked
 * algorithms include the outer product method and SAXPY operation, while 
 * blocked algorithms include simple blocking and recursive contiguous blocking.
 * LU factorization algorithms are implemented with and without partial 
 * pivoting.  (If a nonsingular matrix exhibits certain properties, such a 
 * diagonal dominance, then Gaussian elimination without pivoting is numerically
 * stable.)  Also, a function wrapper facilitates calling LAPACK Gaussian
 * elimination routine DGETRF.
 */
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#include "lufact.h"
#include "lapack.h"
#include "matcom.h"

static void eval_pivot_gauss( int n, int k, const double *vec,
	int *piv, int *ord );
static void tri_solve_l1xb( int m, int n, int ldim, double *L, double *B );
static void tri_solve_l1xb_pivot( int m, int n, int ldim,
	int *piv, double *L, double *B );
static void tri_solve_l1xb_kernel( const double *L, double *B );
static void tri_solve_l1xb_blk_ker( int m, int n, int ldimL, const double *L, 
	int ldimB, double *B );
static void tri_solve_xub_kernel( const double *U, double *B );
static void tri_solve_xub_blk_ker( int m, int n, int ldimU, const double *U, 
	int ldimB, double *B );
static void reduce_matrix( int m, int n, int p, int ldim,
	const double *L, const double *U, double *A );
static void reduce_mat_blk( int m, int n, int p, int ldim, int bdim,
	const double *L, const double *U, double *A);
static void reduce_kernel( const double *L, const double *U, double *A );
static void reduce_blk_ker( int m, int n, int p, int ldimL, const double *L, 
	int ldimU, const double *U, int ldimA, double *A );
static void lu_kernel( const int n, double *A );
static void lu_blk_ker( int n, int ldim, double *A );
static void lu_factor( int m, int n, int ldim, double *A );
static void lu_pivot( char pivot, int m, int n, int ldim,
	int *piv, int *ord, double *A );

/******************************************************************************/

/*
 * Determines optimal block dimension for the local environment given a routine
 * and matrix leading dimension.  The function returns the block dimension 
 * chosen by the LAPACK LU factorization routine, or a block dimension for 
 * testing (debugging).  If the leading dimension is less than the optimal block
 * dimension, the block dimension is set to the leading dimension, and the 
 * matrix computation becomes an unblocked algorithm. 
 */
int get_block_dim_lu( int ldim )
{
	const int	optm_bdim = 1;
	const int	no_dim = -1;
	const char	*parm_str = " ";	
	const char	*func_name = "DGETRF";	

	int bdim;

#if defined(DEBUG)
	bdim = BDIM;
#else	
	bdim = ilaenv_( &optm_bdim, func_name, parm_str, 
		&ldim, &ldim, &no_dim, &no_dim );
#endif
	if ( bdim <= 1 || bdim > ldim ) {
		bdim = ldim;
	}
	return bdim;
}

/*
 * Performs partial pivot selection on an n-by-1 vector representing elements
 * of a column of an n-by-n matrix.  The maximum magnitude element is chosen as 
 * the pivot.  A single pivot adjusted by row offset k, and its order (=1)
 * are stored in vectors piv[] and ord[], respectively.  piv[k] specifies the 
 * row permutation applied to row k when performing Gaussian elimination.  
 */
void eval_pivot_gauss( int n, int k, const double *vec, int *piv, int *ord )
{
	int 	p = k;
	double	lambda = -1.0;

	for (int i = 0; i < n; i++) {
		double x = fabs( vec[i] );
		if ( x > lambda ) {
			lambda = x;
			p = i + k;
		}
	}
	piv[k] = p;
	ord[k] = 1;
}

/*
 * Uses forward substitution to solve the triangular system of linear equations
 * L*X = B, where L is an m-by-m unit lower triangular matrix, and X and B are 
 * m-by-n matrices.  Matrices L, X and B are stored in column-major order with
 * leading dimension ldim.  The solution X overwrites matrix B.
 */
void tri_solve_l1xb( int m, int n, int ldim, double *L, double *B )
{
	for ( int k = 0; k < n; k++ ) {
		double *B_k = B + k*ldim;
		for ( int j = 0; j < m-1; j++ ) {
			double bjk = *(B_k + j);
			double *L_j = L + j*ldim;
			for ( int i = j+1; i < m; i++ ) {
				*(B_k+i) -= *(L_j+i) * bjk;
			}
		}
	}	
}

/*
 * Uses forward substitution to solve the triangular system of linear equations
 * L*X = B, where L is an m-by-m unit lower triangular matrix, and X and B are 
 * m-by-n matrices.  The permutation matrix encoded in the pivot vector piv[] 
 * is first applied to matrix B before solving for X.  Matrices L, X and B are 
 * stored in column-major order with leading dimension ldim.  The solution X 
 * overwrites matrix B. 
 */
void tri_solve_l1xb_pivot( int m, int n, int ldim, 
	int *piv, double *L, double *B )
{	
	for (int k = 0; k < n; k++) {
		double *B_k = B + k*ldim;
		// Apply permutation matrix to column k of matrix B
		for (int i = 0; i < m; i++) {
			double bik = *(B_k + i);
			*(B_k + i) = *(B_k + piv[i]);
			*(B_k + piv[i]) = bik;
		}
		for (int j = 0; j < m-1; j++) {
			double bjk = *(B_k + j);
			double *L_j = L + j*ldim;
			for (int i = j+1; i < m; i++) {
				*(B_k+i) -= *(L_j+i) * bjk;
			}
		}
	}	
}

/*
 * Uses forward substitution to solve the triangular system of linear equations
 * L*X = B, where L, X and B are contiguous KDIM-by-KDIM matrix sub-blocks and 
 * L is unit lower triangular.  Looping is controlled by a symbolic constant 
 * (KDIM), which is evaluated during compilation.  The solution X overwrites B.
 */
void tri_solve_l1xb_kernel( const double *L, double *B )
{
	for ( int j = 0; j < KDIM; j++ ) {
		double *B_j = B + j*KDIM;
		for ( int k = 0; k < KDIM-1; k++ ) {
			double bkj = *(B_j + k);
			const double *L_k = L + k*KDIM;
			for ( int i = k+1; i < KDIM; i++ ) {
				*(B_j + i) -= *(L_k + i) * bkj;
			}
		}
	}	
}

/*
 * Uses forward substitution to solve the triangular system of linear equations
 * L*X = B, where L is an m-by-m unit lower triangular matrix block, and X and B 
 * are m-by-n matrix blocks.  Matrix blocks L and B are stored contiguously
 * with leading dimensions ldimL and ldimB, respectively.  Within blocks of 
 * L and B, sub-blocks of size KDIM*KDIM are stored contiguously.  Suppose that 
 * L is decomposed into sub-blocks [L_00, 0; L_10, L_11].  Then 
 * L_00*X_00 = B_00; L_00*X_01 = B_01; 
 * L_10*X_00 + L_11*X_10 = B_10 --> L_11*X_10 = B_10 - L_10*X_00; and 
 * L_10*X_01 + L_11*X_11 = B_11 --> L_11*X_11 = B_11 - L_10*X_01.  
 */
void tri_solve_l1xb_blk_ker( int m, int n, int ldimL, const double *L, 
	int ldimB, double *B )
{
	for ( int j = 0; j < n; j += KDIM ) {
			double *B_j = B + j*ldimB;

		for ( int i = 0; i < m; i += KDIM ) {
			const double *Li_ = L + i*KDIM;
			const double *Lii = Li_ + i*ldimL;
			double *Bij = B_j + i*KDIM;
			
			for ( int k = 0; k < i; k += KDIM ) {
				const double *Lik = Li_ + k*ldimL;
				double *Bkj = B_j + k*KDIM;
				reduce_kernel( Lik, Bkj, Bij );
			}
			tri_solve_l1xb_kernel( Lii, Bij );
		}
	}
}

/*
 * Uses forward substitution to solve the triangular system of linear equations
 * X*U = B, where X, U and B are contiguous KDIM-by-KDIM matrix sub-blocks and 
 * U is upper triangular.  Looping is controlled by a symbolic constant (KDIM), 
 * which is evaluated during compilation.  The solution X overwrites B.
 */
void tri_solve_xub_kernel( const double *U, double *B )
{
	for ( int k = 0; k < KDIM; k++ ) {
		double ukk = *(U + k + k*KDIM);
		double *B_k = B + k*KDIM;
		for ( int i = 0; i < KDIM; i++ ) {
			*(B_k + i) /= ukk;
		}
		for ( int j = k+1; j < KDIM; j++ ) {
			double ukj = *(U + k + j*KDIM);
			double *B_j = B + j*KDIM;
			for (int i = 0; i < KDIM; i++) {
				*(B_j + i) -= *(B_k + i) * ukj;
			}
		}
	}	
}

/*
 * Uses forward substitution to solve the triangular system of linear equations
 * X*U = B, where U is an n-by-n upper triangular matrix block, and X and B are
 * m-by-n matrix blocks.  Matrix blocks U and B are stored contiguously
 * with leading dimensions ldimU and ldimB, respectively.  Within blocks of 
 * U and B, sub-blocks of size KDIM*KDIM are stored contiguously.  Suppose that 
 * U is decomposed into sub-blocks [U_00, U_01; 0, U_11].  Then 
 * X_00*U_00 = B_00; X_10*U_00 = B_10; 
 * X_00*U_01 + X_01*U_11 = B_01 --> X_01*U_11 = B_01 - X_00*U_01; and 
 * X_10*U_01 + X_11*U_11 = B_11 --> X_11*U_11 = B_11 - X_10*U_01.  
 */
void tri_solve_xub_blk_ker( int m, int n, int ldimU, const double *U, 
	int ldimB, double *B )
{
	for ( int j = 0; j < n; j += KDIM ) {
		const double *U_j = U + j*ldimU;
		const double *Ujj = U_j + j*KDIM;
		double *B_j = B + j*ldimB;

		for ( int k = 0; k < j; k += KDIM ) {
			const double *Ukj = U_j + k*KDIM;
			double *B_k = B + k*ldimB;
			
			for ( int i = 0; i < m; i += KDIM ) {
				double *Bik = B_k + i*KDIM;
				double *Bij = B_j + i*KDIM;
				reduce_kernel( Bik, Ukj, Bij );
			}	
		}

		for ( int i = 0; i < m; i += KDIM ) {
			double *Bij = B_j + i*KDIM;
			tri_solve_xub_kernel( Ujj, Bij );
		}
	}
}

/* 
 * Matrix factorization reduces trailing sub-matrix A by computing A = A - L*U, 
 * where A is an m-by-n sub-matrix, L is an m-by-p block of a unit lower 
 * triangular matrix and U is a p-by-n block of an upper triangular matrix.  
 * Matrices A, L and U are stored in column-major order with leading dimension 
 * ldim.  The trailing sub-matrix update is an implementation of the SAXPY 
 * operation.
 */
void reduce_matrix( int m, int n, int p, int ldim,
	const double *L, const double *U, double *A ) 	
{
	for ( int j = 0; j < n; j++ ) {
		const double *U_j = U + j*ldim;				// Points to element U(0,j)
		double *A_j = A + j*ldim;					// Points to element A(0,j)
		for ( int k = 0; k < p; k++ ) {
			const double *L_k = L + k*ldim;			// Points to element L(0,k)
			double ukj = *(U_j + k);				// Element U(k,j)
			for ( int i = 0; i < m; i++ ) {
				*(A_j + i) -= *(L_k + i) * ukj;
			}										// A(i,j) -= L(i,k) * U(k,j)
		}
	}
}

/*
 * Matrix factorization reduces trailing sub-matrix A by computing A = A - L*U, 
 * where A is an m-by-n sub-matrix, L is an m-by-p column block of a unit lower 
 * triangular matrix and U is a p-by-n row block of an upper triangular matrix.  
 * Matrices A, L and U are stored in column-major order with leading dimension
 * ldim. Blocking is used to optimize memory access for the trailing sub-matrix 
 * reduction, and bdim is the blocking parameter.
 */
void reduce_mat_blk( int m, int n, int p, int ldim, int bdim,
	const double *L, const double *U, double *A )
{
	for ( int j = 0; j < n; j += bdim ) {
		// Determine number of columns in (i,j)th block of A
		const int s = (j + bdim > n) ? (n - j) : bdim;

		for ( int k = 0; k < p; k += bdim ) {
			// Determine number of columns of Lik and rows of Ukj
			const int t = (k + bdim > p) ? (p - k) : bdim;
			// Set pointer to block matrix Ukj
			const double *Ukj = U + k + j*ldim;

			for ( int i = 0; i < m; i += bdim ) {
				// Determine number of rows in (i,j)th block of A
				const int r = (i + bdim > m) ? (m - i) : bdim;
				// Set pointers to block matrices Lik and Aij
				const double *Lik = L + i + k*ldim;
				double *Aij = A + i + j*ldim;
				// Reduce trailing block matrix
				reduce_matrix( r, s, t, ldim, Lik, Ukj, Aij );	
			}								
		}
	}	
}

/*
 * Matrix factorization reduces the trailing sub-matrix by computing A = A - L*U, 
 * where A, L and U are contiguous KDIM-by-KDIM sub-blocks of the trailing
 * sub-matrix, a unit lower triangular matrix and an upper triangular matrix, 
 * respectively.  Looping is controlled by a symbolic constant (KDIM), which is 
 * evaluated during compilation.  The trailing sub-matrix update is an 
 * implementation of the SAXPY operation.
 */
void reduce_kernel( const double *L, const double *U, double *A ) 	
{
	for ( int j = 0; j < KDIM; j++ ) {
		const double *U_j = U + j*KDIM;				// Points to element U(0,j)
		double *A_j = A + j*KDIM;					// Points to element A(0,j)
		for ( int k = 0; k < KDIM; k++ ) {
			const double *L_k = L + k*KDIM;			// Points to element L(0,k)
			double ukj = *(U_j + k);				// Element U(k,j)
			for ( int i = 0; i < KDIM; i++ ) {
				*(A_j + i) -= *(L_k + i) * ukj;		// A(i,j) -= L(i,k) * U(k,j)
			}										
		}
	}
}

/*
 * Matrix factorization reduces the trailing sub-matrix by computing A = A - L*U,  
 * where A is an m-by-n block of the trailing sub-matrix, L is an m-by-p block 
 * of a unit lower triangular matrix and U is a p-by-n block of an upper 
 * triangular matrix.  Matrix blocks A, L and U are stored contiguously with 
 * leading dimension ldimA, ldimL and ldimU, respectively.  Within blocks of 
 * A, L and U, sub-blocks of size KDIM*KDIM are stored contiguously.
 */
void reduce_blk_ker( int m, int n, int p, int ldimL, const double *L, 
	int ldimU, const double *U, int ldimA, double *A ) 	
{
	for ( int j = 0; j < n; j += KDIM ) {

		for ( int k = 0; k < p; k += KDIM ) {
			// Set pointer to sub-block Ukj
			const double *Ukj = U + k*KDIM + j*ldimU;

			for ( int i = 0; i < m; i += KDIM ) {
				// Set pointers to sub-blocks Lik and Aij
				const double *Lik = L + i*KDIM + k*ldimL;
				double *Aij = A + i*KDIM + j*ldimA;
				// Perform matrix reduction on sub-blocks
				reduce_kernel( Lik, Ukj, Aij );
			}													
		}
	}	
}

/*
 * Factorizes an n-by-n matrix sub-block A into a unit lower triangular sub-
 * block L and an upper triangular sub-block U, such that A = L*U.  KDIM-by-KDIM
 * matrix sub-block A is stored contiguously.  The LU factorization algorithm is 
 * an implementation of the SAXPY operation.  Looping is controlled by a 
 * symbolic constant (KDIM), which is evaluated during compilation.  The factors 
 * L and U overwrite A.
 */
void lu_kernel( const int n, double *A )
{
	for (int j= 0; j < n; j++) {

		// Perform cumulative trailing sub-matrix updates on elements of 
		// column j above the diagonal
		double *A_j = A + j*KDIM;
		for (int k = 0; k < j; k++) {
			double *A_k = A + k*KDIM;
			double akj = *(A_j + k);
			for (int i = k+1; i < j; i++) {
				*(A_j+i) -= *(A_k+i) * akj;
			}
		}
		// Perform cumulative trailing sub-matrix updates on diagonal element 
		// and elements below the diagonal of column j
		for (int k = 0; k < j; k++) {
			double *A_k = A + k*KDIM;
			double akj = *(A_j + k);
			for (int i = j; i < KDIM; i++) {
				*(A_j+i) -= *(A_k+i) * akj;
			}
		}
		// Divide elements in column j below the diagonal by the diagonal element
		double ajj = *(A_j + j);
		for (int i = j+1; i < KDIM; i++) {
			*(A_j+i) /= ajj;
		}
	}
}
  
/*
 * Factorizes an n-by-n matrix block A into a unit lower triangular block L and
 * an upper triangular block U, such that A = L*U.  Matrix block A is stored 
 * contiguously with leading dimension ldim, and within the matrix block, sub-
 * blocks of size KDIM*KDIM are stored contiguously.
 */
void lu_blk_ker( int n, int ldim, double *A )
{
	for ( int j = 0; j < n; j += KDIM ) {
		const int s = (j + KDIM > n) ? (n - j) : KDIM;
		double *A_j = A + j*ldim;
		const double *U_j = A_j;

		// Solve for L*X = A using forward substitution, and perform cumulative 
		// trailing sub-matrix updates on matrix sub-blocks above the diagonal
		for ( int k = 0; k < j; k += KDIM ) {
			const double *L_k = A + k*ldim;
			const double *Lkk = L_k + k*KDIM;
			const double *Ukj = U_j + k*KDIM;
			double *Akj = A_j + k*KDIM;
			tri_solve_l1xb_kernel( Lkk, Akj );
			for (int i = k+KDIM; i < j; i += KDIM) {
				const double *Lik = L_k + i*KDIM;
				double *Aij = A_j + i*KDIM;
				reduce_kernel( Lik, Ukj, Aij );
			}
		}

		// Perform cumulative trailing sub-matrix updates on diagonal sub-block 
		// and sub-blocks below the diagonal
		for (int k = 0; k < j; k += KDIM) {
			const double *L_k = A + k*ldim;
			const double *Ukj = U_j + k*KDIM;
			for ( int i = j; i < n; i += KDIM ) {
				const double *Lik = L_k + i*KDIM;
				double *Aij = A_j + i*KDIM;
				reduce_kernel( Lik, Ukj, Aij );
			}
		}

		// Factorize diagonal sub-block, and solve X*U = A using forward 
		// substitution on sub-blocks below the diagonal
		double *Ajj = A_j + j*KDIM;
		const double *Ujj = Ajj;
		lu_kernel( s, Ajj );
		for ( int i = j+KDIM; i < n; i += KDIM ) {
			double *Aij = A_j + i*KDIM;
			tri_solve_xub_kernel( Ujj, Aij );
		}
	}
}

/*
 * Implements a rectangular version of SAXPY operation (jki indexing) for 
 * Gaussian elimination.  Nonsingular m-by-n matrix A with leading dimension 
 * ldim is factored into a unit lower triangular matrix L and upper triangular
 * matrix U, such that A = L*U.  It is assumed that properties of matrix A,
 * e.g., diagonally dominant, obviate the need for pivoting.  Elements of L are
 * stored in A(k+1:n-1,k), while elements of U are stored in A(0:k,k), assuming
 * base 0 indexing.  The inner-most loop subtracts a scalar multiple of a vector
 * from another vector. 
 */
void lu_factor( const int m, const int n, int ldim, double *A )
{
	for ( int j= 0; j < n; j++ ) {

		// Perform cumulative trailing sub-matrix updates on elements of 
		// column j above the diagonal
		double *A_j = A + j*ldim;
		for ( int k = 0; k < j; k++ ) {
			double *A_k = A + k*ldim;
			double akj = *(A_j + k);
			for (int i = k+1; i < j; i++) {
				*(A_j+i) -= *(A_k+i) * akj;
			}
		}
		// Perform cumulative trailing sub-matrix updates on diagonal element 
		// and elements below the diagonal of column j
		for (int k = 0; k < j; k++) {
			double *A_k = A + k*ldim;
			double akj = *(A_j + k);
			for (int i = j; i < m; i++) {
				*(A_j+i) -= *(A_k+i) * akj;
			}
		}
		// Divide elements in column j below the diagonal by the diagonal element
		double ajj = *(A_j + j);
		for (int i = j+1; i < m; i++) {
			*(A_j+i) /= ajj;
		}
	}
}

/*
 * Implements a rectangular version of the SAXPY operation (jki indexing) for
 * Guassian elimination with partial pivoting.  Nonsingular m-by-n matrix A
 * with leading dimension ldim is factored into a unit lower triangular matrix L 
 * and upper triangular matrix U.  Row permuted matrix A^ = P*A = L*U.  
 * Permutation matrix P is encoded in vectors piv[] and ord[], such that 
 * row k is interchanged with row piv[k], and ord[k] = 1 is the diagonal block 
 * order.  Elements of L are stored in A(k+1:n-1,k), while elements of U are 
 * stored in A(0:k,k), assuming base 0 indexing.  The inner-most loop subtracts 
 * a scalar multiple of a vector from another vector.
 */
void lu_pivot( char pivot, int m, int n, int ldim, 
	int *piv, int *ord, double *A )
{
	for ( int j= 0; j < n; j++ ) {

		// Apply permutation matrix encoded in pivot vector to column j
		double *A_j = A + j*ldim;
		double *Ajj = A_j +j;
		for ( int k = 0; k < j; k++ ) {
			double akj = *(A_j + k);
			*(A_j + k) = *(A_j + piv[k]);
			*(A_j + piv[k]) = akj;
		}
		// Perform cumulative trailing sub-matrix updates on elements of 
		// column j above the diagonal
		for ( int k = 0; k < j; k++ ) {
			double *A_k = A + k*ldim;
			double akj = *(A_j + k);
			for ( int i = k+1; i < j; i++ ) {
				*(A_j+i) -= *(A_k+i) * akj;
			}
		}
		// Perform cumulative trailing sub-matrix updates on diagonal element 
		// and elements below the diagonal of column j
		for ( int k = 0; k < j; k++ ) {
			double *A_k = A + k*ldim;
			double akj = *(A_j + k);
			for ( int i = j; i < m; i++ ) {
				*(A_j+i) -= *(A_k+i) * akj;
			}
		}
		// Determine pivot for column j and interchange elements in the pivot
		// row from columns 0 to j with elements in row j
		switch ( pivot ) {
		case 'G':
			eval_pivot_gauss( m-j, j, Ajj, piv, ord );
			break;
		default:
			eval_pivot_gauss( m-j, j, Ajj, piv, ord );
			break;
		}
		if ( j != piv[j] ) {
			for ( int k = 0; k <= j; k++ ) {
				double ajk = *(A + j + k*ldim);
				*(A + j + k*ldim) = *(A + piv[j] + k*ldim);
				*(A + piv[j] + k*ldim) = ajk;
			}
		}
		// Divide elements in column j below the diagonal by the diagonal element
		double ajj = *Ajj;
		for ( int i = j+1; i < m; i++ ) {
			*(A_j+i) /= ajj;
		}
	}
}

/******************************************************************************/

/*
 * If a nonsingular matrix exhibits certain properties, such a diagonal
 * dominance, then Gaussian elimination without pivoting is numerically stable.
 */

/*
 * Implements the outer product method (kji indexing) to factorize nonsingular 
 * n-by-n matrix A into a unit lower triangular matrix L and upper triangular 
 * matrix U, such that A = L*U.  Elements of L are stored in A(k+1:n-1,k), while
 * elements of U are stored in A(0:k,k), assuming base 0 indexing.  Each pass 
 * through the k-loop performs an outer product operation. 
 */
void lu_outer_product( int n, double *A )
{
	const int ldim = n;

	for ( int k = 0; k < n-1; k++ ) {

		// Divide elements of column k below the diagonal by the diagonal element
		double *A_k = A + k*ldim;
		double akk = *(A_k + k);
		for ( int i = k+1; i < n; i++ ) {
			*(A_k + i)	/= akk;
		}
		// Update trailing sub-matrix by subtracting the outer product
		for ( int j = k+1; j < n; j++ ) {
			double *A_j = A + j*ldim;
			double akj = *(A_j + k);
			for ( int i = k+1; i < n; i++ ) {
				*(A_j+i) -= *(A_k+i) * akj;
			}
		}
	}
}

/*
 * Implements the SAXPY operation using jki indexing to factorize nonsingular 
 * n-by-n matrix A into a unit lower triangular matrix L and upper triangular 
 * matrix U, such that A = L*U.  Elements of L are stored in A(k+1:n-1,k), while
 * elements of U are stored in A(0:k,k), assuming base 0 indexing.  The inner-
 * most loop subtracts a scalar multiple of a vector from another vector.  
 */
void lu_saxpy( int n, double *A )
{
	const int ldim = n;

	lu_factor( n, n, ldim, A );
}

/*
 * Implements simple blocking to factorize nonsingular n-by-n matrix A into
 * a unit lower triangular matrix L and an upper triangular matrix U, such that
 * A = L*U.  Suppose A is decomposed into blocks [A_00, A_01; A_10, A_11], 
 * where A_00 is an r-by-r matrix block.  First, a rectangular unblocked version
 * of the SAXPY operation for LU factorization computes [L_00; L_10] and U_00.  
 * Given that A_01 = L_00 * U_01, we can solve for U_01 using forward 
 * substitution.  Then the trailing sub-matrix is updated, 
 * A_11 = A_11 - L_10 * U_01.  This procedure is repeated iteratively on the 
 * trailing sub-matrix until the last diagonal block (dimension <= r) is reached. 
 * Simple blocking is also used to optimize memory access when updating the 
 * trailing sub-matrix.
 */
void lu_block( int n, double *A )
{
	const int ldim = n;
	const int bdim = get_block_dim_lu( ldim );

	int 		r, t;
	double	*Ajj, *L, *U;

	Ajj = A;
	r = (bdim > n) ? n : bdim;
	lu_factor(n, r, ldim, Ajj);

	t = n - bdim;
	for ( int j = bdim; j < n; j += bdim, t -= bdim ) {
		U = Ajj + bdim*ldim;
		tri_solve_l1xb( bdim, t, ldim, Ajj, U );
		L = Ajj + bdim;
		Ajj = A + j*ldim + j;
		reduce_mat_blk( t, t, bdim, ldim, bdim, L, U, Ajj );
		r = (t < bdim) ? t : bdim;
		lu_factor( t, r, ldim, Ajj );
	}	
}

/*
 * Implements recursive contiguous blocking to factorize nonsingular n-by-n 
 * matrix A into a unit lower triangular matrix L and an upper triangular 
 * matrix U, such that A = L*U.  Matrix A, which is stored in column-major 
 * order is first copied to array AA, which stores recursive contiguous blocks.  
 * That is, matrix blocks are stored contiguously, and within each block, sub-
 * blocks of size KDIM*KDIM are stored contiguously.  Gaussian elimination 
 * yields factors L and U stored in recursive contiguous blocks in array AA, 
 * which is then copied to array A, where matrix elements are stored in
 * conventional column-major order. 
 */
void lu_recur_block( int n, double *A )
{
	const int	nn = (n / KDIM) * KDIM + ((n % KDIM) ? KDIM : 0);
	const int	ldim = nn;
	const int	bdim = get_block_dim_lu( ldim );

	double 	*AA;

	AA = (double *) malloc( ldim*ldim*sizeof(double) );
	form_recur_blocks( n, n, n, A, nn, nn, KDIM, bdim, ldim, AA );

	for ( int j = 0; j < nn; j += bdim ) {
		int s = (j + bdim > n) ? (n - j) : bdim;
		int q = (j + bdim > nn) ? (nn - j) : bdim;
		double *A_j = AA + j*ldim;
		const double *U_j = A_j;

		// Solve for L*X = A using forward substitution, and perform cumulative 
		// trailing sub-matrix updates on matrix blocks above the diagonal
		for ( int k = 0; k < j; k += bdim ) {
			const double *L_k = AA + k*ldim;
			const double *Lkk = L_k + k*bdim;
			const double *Ukj = U_j + k*q;
			double *Akj = A_j + k*q;
			tri_solve_l1xb_blk_ker(bdim, s, bdim, Lkk, bdim, Akj);
			for ( int i = k+bdim; i < j; i += bdim ) {
				int r = (i + bdim > n) ? (n - i) : bdim;
				int p = (i + bdim > nn) ? (nn - i) : bdim;
				const double *Lik = L_k + i*bdim;
				double *Aij = A_j + i*q;
				reduce_blk_ker( r, s, bdim, p, Lik, bdim, Ukj, p, Aij );
			}
		}

		// Perform cumulative trailing sub-matrix updates on diagonal block 
		// and matrix blocks below the diagonal
		for ( int k = 0; k < j; k += bdim ) {
			const double *L_k = AA + k*ldim;
			const double *Ukj = U_j + k*q;
			for ( int i = j; i < nn; i += bdim ) {
				int r = (i + bdim > n) ? (n - i) : bdim;
				int p = (i + bdim > nn) ? (nn - i) : bdim;
				const double *Lik = L_k + i*bdim;
				double *Aij = A_j + i*q;
				reduce_blk_ker(r, s, bdim, p, Lik, bdim, Ukj, p, Aij);
			}
		}

		// Factorize diagonal block, and solve X*U = A using forward
		// substitution on blocks below the diagonal
		double *Ajj = A_j + j*q;
		const double *Ujj = Ajj;
		lu_blk_ker(s, q, Ajj);
		for ( int i = j+BDIM; i < nn; i += bdim ) {
			int r = (i + bdim > n) ? (n - i) : bdim;
			int p = (i + bdim > nn) ? (nn - i) : bdim;
			double *Aij = A_j + i*bdim;
			tri_solve_xub_blk_ker(r, bdim, bdim, Ujj, p, Aij);
		}
	}	
	unpack_recur_blocks( nn, nn, KDIM, bdim, ldim, AA, n, n, n, A );
	free( AA );
}

/******************************************************************************/

/*
 * In general, Gaussian elimination requires pivoting to ensure numerical 
 * stability.  We implement partial pivoting to compute the LU factorization of 
 * row permuted matrix P*A = L*U, where the permutation matrix is encoded in a
 * pivot vector.  The factorization overwrites matrix A with unit lower 
 * triangular matrix L and upper triangular matrix U.  Although, one only needs
 * the pivot vector and the unit lower and upper triangular factors to solve the
 * the corresponding linear system, the function prototypes have additional 
 * arguments to be consistent with matrix factorizations that implement a
 * variety of more complicated pivoting strategies.  This enables the use of 
 * function pointers to invoke matrix factorizations, which specify a pivoting
 * strategy in the argument list and return all neccessary information to solve
 * the corresponding linear system.
 */

/*
 * Employs the outer product method (kji indexing) with partial pivoting to 
 * factorize nonsingular n-by-n matrix A into a unit lower triangular matrix L 
 * and upper triangular matrix U.  Row permuted matrix A^ = P*A = L*U.
 * Permutation matrix P is encoded in vectors piv[] and ord[], such that row k
 * is interchanged with row piv[k] and ord[k] = 1 is the diagonal block order.
 * Elements of L are stored in A(k+1:n-1,k), while elements of U are stored in 
 * A(0:k,k), assuming base 0 indexing.  Each pass through the k-loop performs 
 * an outer product operation. 
 */
void lu_pivot_outer_product( char pivot, int n, int *piv, int *ord, double *A )
{
	const int ldim = n;

	for ( int k = 0; k < n-1; k++ ) {

		double *A_k = A + k*ldim;
		double *Akk = A_k + k;
		// Determine pivot for column k and interchange elements in the pivot
		// row with elements in row k
		switch ( pivot ) {
		case 'G':
			eval_pivot_gauss( n-k, k, Akk, piv, ord );
			break;
		default:
			eval_pivot_gauss( n-k, k, Akk, piv, ord );
			break;
		}
		if ( k != piv[k] ) {
			for ( int j = 0; j < n; j++ ) {
				double akj = *(A + k + j*ldim);
				*(A + k + j*ldim) = *(A + piv[k] + j*ldim);
				*(A + piv[k] + j*ldim) = akj;
			}
		}
		// Divide elements of column k below the diagonal by the diagonal element
		double akk = *Akk;
		for ( int i = k+1; i < n; i++ ) {
			*(A_k + i)	/= akk;
		}
		// Update trailing sub-matrix by subtracting the outer product
		for ( int j = k+1; j < n; j++ ) {
			double *A_j = A + j*ldim;
			double akj = *(A_j + k);
			for ( int i = k+1; i < n; i++ ) {
				*(A_j+i) -= *(A_k+i) * akj;
			}
		}
	}
	piv[n-1] = n-1;
}

/*
 * Employs the SAXPY operation (jki indexing) with partial pivoting to factorize 
 * nonsingular n-by-n matrix A into a unit lower triangular matrix L and upper
 * triangular matrix U.  Row permuted matrix A^ = P*A = L*U.  Permutation matrix
 * P is encoded in vectors piv[] and ord[], such that row k is interchanged with
 * row piv[k] and ord[k] = 1 is the diagonal block order.  Elements of L are 
 * stored in A(k+1:n-1,k), while elements of U are stored in A(0:k,k), assuming 
 * base 0 indexing.  The inner-most loop subtracts a scalar multiple of vector 
 * from another vector. 
 */
void lu_pivot_saxpy( char pivot, int n, int *piv, int *ord, double *A )
{
	const int ldim = n;
	
	lu_pivot( pivot, n, n, ldim, piv, ord, A );
}

/*
 * Implements simple blocking with partial pivoting to factorize nonsingular 
 * n-by-n matrix A into a unit lower triangular matrix L and an upper triangular
 * matrix U.  Row permuted matrix A^ = P*A = L*U.  Permutation matrix P is 
 * encoded in vectors piv[] and ord[], such that row k is interchanged with 
 * row piv[k], and ord[k] = 1 is the diagonal block order. Suppose A is 
 * decomposed into blocks [A_00, A_01; A_10, A_11], where A_00 is an r-by-r 
 * matrix block.  First, a rectangular version of the SAXPY operation for LU 
 * factorization with partial pivoting computes 
 * P*[A_00; A_10] = [L_00; L_10]*U_00.  Let [A_01^; A_11^] = P*[A_01; A_11].  
 * Given that A_01^ = L_00*U_01, solve for U_01.  Then the trailing sub-matrix 
 * is updated, A_11^ = A_11^ - L_10*U_01. This procedure is repeated iteratively
 * on the trailing sub-matrix until the last diagonal block (dimension <= r) is 
 * reached. Simple blocking is also used to optimize memory access when updating
 * the trailing sub-matrix.
 */
void lu_pivot_block( char pivot, int n, int *piv, int *ord, double *A )
{
	const int ldim = n;
	const int bdim = get_block_dim_lu( ldim );

	int 		d, j, r, t;
	double	*Ajj, *L, *U;

	j = 0;
	Ajj = A;
	r = (bdim > n) ? n : bdim;
	// Perform rectangular factorization on first column block A(0:n-1,0:r)
	lu_pivot( pivot, n, r, ldim, &piv[j], &ord[j], Ajj );

	d = 0;
	j = bdim;
	t = n - bdim;
	for ( ; j < n; j += bdim, d += bdim, t -= bdim ) {
		U = Ajj + bdim*ldim;
		// Solve for U(j-BDIM:j,j:n-1) where
 		// P * A(j-BDIM:j,j:n-1) = L(j-BDIM:j,j-BDIM:j) * U(j-BDIM:j,j:n-1)
		tri_solve_l1xb_pivot( bdim, t, ldim, &piv[d], Ajj, U );

		// Adjust pivot vector of previous block for diagonal offset
		for ( int i = d; i < j; i++ ) {
			piv[i] += d;
		}
		L = Ajj + bdim;
		Ajj = A + j + j*ldim;
		// Reduce trailing sub-matrix
		// P * A(j:n-1,j:n-1) = L(j:n-1,j-BDIM:j-1) * U(j-BDIM:j-1,j:n-1)
		reduce_mat_blk( t, t, bdim, ldim, bdim, L, U, Ajj );
		r = t < bdim ? t : bdim;
		// Perform rectangular factorization on column block A(j:n-1,j:j+r-1)
		lu_pivot( pivot, t, r, ldim, &piv[j], &ord[j], Ajj );

		// Apply permutation matrix for current block, encoded in piv(j:j+r-1),
		// to columns to the left of current block A(:,0:j-1)
		for ( int i = j; i < j+r; i++ ) {
			if ( i != piv[i] + j ) {
				for ( int k = 0; k < j; k++ ) {
					double aik = *(A + i + k*ldim);
					*(A + i + k*ldim) = *(A + piv[i] + j + k*ldim);
					*(A + piv[i] + j + k*ldim) = aik;
				}
			}
		}		
	}
	// Adjust pivot vector of last block for diagonal offset
	for ( int i = d; i < n; i++ ) {
		piv[i] += d;
	}
}

/*
 * Wrapper for calling LAPACK routine DGETRF which computes an LU factorization 
 * of a nonsingular matrix using partial pivoting with row interchanges.
 */
void lu_pivot_lapack( char pivot, int n, int *piv, int *ord, double *A )
{
	const int	ldim = n;
	int			info = 0;

    dgetrf_(&n, &n, A, &ldim, piv, &info);
}
