/*
 * Timing harness for measuring the performance of basic and "optimized"
 * algorithms implementing matrix factorizations on square matrices over a range
 * of dimensions.  Matrix factorizations include LU (Gaussian elimination), 
 * standard Cholesky, symmetric indefinite (LDL'), and modified Cholesky 
 * (Gill-Murray-Wright and Cheng-Higham algorithms).  Performance data are
 * written to an output file destination.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "lufact.h"
#include "cholfact.h"
#include "ldltfact.h"
#include "modchol.h"
#include "matcom.h"
#include "timing.h"

// Build-configuration strings.  Each macro may already be defined when this
// point is reached (for example on the compiler command line); any macro that 
// is not defined receives the fallback value below.  PROC, CORES, CLKSPEED, 
// CACHE, COMPILER, LANGUAGE and OPTM are echoed into the header of every 
// output file by write_data_file(); DATADIR is the directory prefix that 
// main() stores in file_path.
#if !defined(PROC)
#	define PROC "unknown"
#endif
#if !defined(CORES)
#	define CORES "unknown"
#endif
#if !defined(CLKSPEED)
#	define CLKSPEED "unknown"
#endif
#if !defined(CACHE)
#	define CACHE "unknown"
#endif
#if !defined(COMPILER)
#	define COMPILER "unknown"
#endif
#if !defined(LANGUAGE)
#	define LANGUAGE "default"
#endif
#if !defined(OPTM)
#	define OPTM "default"
#endif
#if !defined(DATADIR)
#	define DATADIR "."		// Current directory ./
#endif

// Timing parameters and matrix dimensions.  A DEBUG build uses fewer 
// iterations, a shorter minimum run time and only four matrix sizes; the 
// default build uses the full list of 32 sizes.
#if defined(DEBUG)
#	define MIN_ITER 4		// Minimum number of iterations of algorithm
#	define MIN_SECS 1.0		// Minimum elapsed time for execution of algorithm
	// Define sizes (dimensions) of square matrices used to measure performance
	const int mat_size[] = { 65, 130, 195, 254 };
#else
#	define MIN_ITER 8
#	define MIN_SECS 2.0
	const int mat_size[] = { 65, 130, 195, 254, 258, 321, 387, 450, 508, 516, 
		579, 642, 707, 764, 772, 833, 899, 963, 1021, 1027, 1155, 1278, 1282, 
		1411, 1532, 1540, 1666, 1789, 1795, 1921, 2046, 2050 };
#endif
// Number of entries in mat_size[]; the value has type size_t
#define SIZES (sizeof(mat_size) / sizeof(int))

// File input/output helpers
static void read_matrix( const char *file, int m, int n, double *A );
static void write_data_file( const char *file, const char *hdr_text,
	const int rows, const int cols, const double *data );
// Timing kernels: return the average number of seconds per factorization
static double time_mfact( void (*mfact)(int n, double *A), int n, double *A );
static double time_mfact_pivot( void (*mfact_piv)(char pivot, int n, int *piv, 
	int *ord, double *A), char pivot, int n, int *piv, int *ord, double *A );
// Drivers selected at compile time in main() (LUFACT, LUPIVOT, CHOLFACT,
// LDLTFACT, MODCHOL; PROFILE selects the profile_*() variants)
static void time_lu( void );
static void time_lu_pivot( void );
static void time_chol( void );
static void profile_chol( void );
static void time_ldlt( void );
static void time_ldlt_indef( void );
static void profile_ldlt( void );
static void time_chol_gmw( void );
static void time_chol_ch( void );
static void time_mod_chol_indef( void );
static void profile_mod_chol( void );

// Directory prefix (DATADIR followed by "/") for input and output files; 
// allocated and filled in by main()
static char		*file_path;

/*
 * Program entry point.  Stores the directory prefix DATADIR followed by "/" 
 * in file_path, then runs the timing or profiling drivers that were selected
 * at compile time through the LUFACT, LUPIVOT, CHOLFACT, LDLTFACT, MODCHOL and
 * PROFILE macros.  Returns 0 on completion; exits with status -1 if the path 
 * buffer cannot be allocated.
 */
int main( void )
{
	// Specify file path for output data files.  The two extra characters hold
	// the trailing '/' and the terminating null character.
	file_path = calloc( strlen(DATADIR) + 2, sizeof *file_path );
	if ( file_path == NULL ) {
		fprintf( stderr, "Error allocating memory for data file path.\n" );
		exit( -1 );
	}
	strcpy( file_path, DATADIR );
	strcat( file_path, "/" );

#if defined(LUFACT)
	time_lu();
#endif

#if defined(LUPIVOT)
	time_lu_pivot();
#endif

#if defined(CHOLFACT)
#if defined(PROFILE)
	profile_chol();
#else
	time_chol();
#endif
#endif

#if defined(LDLTFACT)
#if defined(PROFILE)
	profile_ldlt();
#else
	time_ldlt();
	time_ldlt_indef();
#endif
#endif

#if defined(MODCHOL)
#if defined(PROFILE)
	profile_mod_chol();
#else
	time_chol_gmw();
	time_chol_ch();
	time_mod_chol_indef();
#endif
#endif

	// All drivers have finished; release the path prefix
	free( file_path );
	file_path = NULL;

	return 0;
}

/******************************************************************************/

/*
 * Reads matrix data in specified file into array A passed in argument list.  
 * The file is read as m rows of n whitespace-separated numbers (row by row),
 * and element (i,j) is stored at A[i + j*m], i.e. matrix A is stored in 
 * column-major order with leading dimension m.
 *
 * file:	name of the text file holding the matrix
 * m, n:	number of rows and columns to read
 * A:		output array with room for at least m*n doubles
 *
 * Prints a message to stderr and exits with status -1 if the file cannot be
 * opened, or if it does not supply m*n readable numbers.
 */
void read_matrix( const char *file, int m, int n, double *A )
{
	const int ldim = m;	

	FILE *fp;

	if ( (fp = fopen(file, "r")) == NULL ) {
		fprintf( stderr, "Error opening file %s.\n", file );
		exit(-1);
	}
	// Read matrix data from file
	for ( int i = 0; i < m; i++ ) {
		for ( int j = 0; j < n; j++ ) {
			// fscanf returns the number of items converted; anything other
			// than 1 means the element was missing or malformed, and A would
			// otherwise be left partly uninitialized
			if ( fscanf( fp, "%lg", &A[i + (size_t) j*ldim] ) != 1 ) {
				fprintf( stderr, "Error reading element (%d, %d) of %d-by-%d "
					"matrix from file %s.\n", i, j, m, n, file );
				fclose( fp );
				exit(-1);
			}
		}
	}
	fclose( fp );	
}

/*
 * Writes header text and experimental data to the file specified in the 
 * argument list.  Experimental data is enumerated in a matrix stored in 
 * column-major order. 
 *
 * file:		name of the output file (created or truncated)
 * hdr_text:	caller-supplied header lines, written after the build
 *				configuration lines (PROC, CORES, CLKSPEED, CACHE, COMPILER,
 *				LANGUAGE, OPTM) and the clock resolution
 * rows, cols:	dimensions of the data matrix
 * data:		rows*cols values; element (i,j) is data[j*rows + i]
 *
 * Each matrix row becomes one output line of tab-terminated "%g" fields.
 * Prints a message to stderr and exits with status -1 if the file cannot be
 * opened or if writing it fails.
 */
void write_data_file( const char *file, const char *hdr_text, 
	int rows, int cols, const double *data )
{
	FILE	*fp;
	int		write_err;

	if ( (fp = fopen( file, "w" )) == NULL ) {
		fprintf( stderr, "Error opening file %s.\n", file );
		exit( -1 );
	}
	// Write header text
	fprintf( fp, "# Processor:\t%s\n", PROC);
	fprintf( fp, "# Cores:\t%s\n", CORES);
	fprintf( fp, "# Clock speed:\t%s\n", CLKSPEED);
	fprintf( fp, "# Cache:\t%s\n", CACHE);
	fprintf( fp, "# \n" );
	fprintf( fp, "# C compiler:\t%s\n", COMPILER );
	fprintf( fp, "# C language standard:\t%s\n", LANGUAGE );
	fprintf( fp, "# Optimization level and options:\t%s\n", OPTM );
	// NOTE(review): %Lg requires timer_resolution() to return long double;
	// confirm against its declaration in timing.h
	fprintf( fp, "# Clock resolution:\t%Lg\n", timer_resolution() );
	fprintf( fp, "# \n" );
	fprintf( fp, "%s\n", hdr_text );
	// Write experimental data
	for ( int i = 0; i < rows; i++ ) {
		for ( int j = 0; j < cols; j++ ) {
			fprintf( fp, "%g\t", data[(size_t) j*rows + i] );
		}
		fprintf( fp, "\n" );
	}
	// Output is buffered, so a failed write may only be reported by the stream
	// error indicator or by fclose(); check both so data is not lost silently
	write_err = ferror( fp );
	if ( fclose( fp ) != 0 || write_err ) {
		fprintf( stderr, "Error writing file %s.\n", file );
		exit( -1 );
	}
}

/*
 * Measures the average time (number of seconds) to factor an n-by-n matrix.
 * Assumes that pivoting is not required to ensure numerical stability of the
 * factorization procedure.  Matrix factorization is performed iteratively for
 * at least the minimum number of iterations (MIN_ITER), and until the minimum
 * time (MIN_SECS, in seconds) has elapsed: the iteration count is doubled
 * after every timed batch that runs for less than MIN_SECS.
 *
 * mfact:	factorization routine, called as mfact( n, A )
 * n:		matrix dimension
 * A:		n-by-n matrix; handed to mfact and restored from a saved copy 
 *			after every call
 *
 * Returns the elapsed time of the last batch divided by its iteration count.
 * Restoring A after each factorization happens inside the timed region, so 
 * its cost is part of the reported time.  Exits with status -1 if the work
 * copy of A cannot be allocated.
 */
double time_mfact( void (*mfact)(int n, double *A), int n, double *A )
{
	struct		timespec sta, end;
	long int	num_iter = MIN_ITER;
	double		secs = -1.0;
	double		*M;	

	// Save copy of matrix A before performing matrix factorization.  The size
	// is computed in size_t so that n*n cannot overflow int.
	M = malloc( (size_t) n * n * sizeof *M );
	if ( M == NULL ) {
		fprintf( stderr, "Error allocating memory for %d-by-%d matrix.\n", 
			n, n );
		exit( -1 );
	}
	copy_matrix( n, n, A, M );

	while ( secs < MIN_SECS ) {
		get_time( &sta );
		// Loop index has the same type as num_iter, which doubles each pass
		for ( long int i = 0; i < num_iter; i++ ) {
			mfact( n, A );
			copy_matrix( n, n, M, A );		// Reset matrix A to initial value
		}
		get_time( &end );
		secs = timespec_diff( sta, end );
		num_iter *= 2;
	}
	free( M );	

	// On exiting the while loop, the number of iterations (num_iter) has been
	// doubled once more than the batch that was just timed, so num_iter must 
	// be halved
	return secs / (num_iter/2.0);
}

/*
 * Measures the average time (number of seconds) to factor an n-by-n matrix.
 * Assumes that pivoting is required to ensure numerical stability of the
 * factorization procedure.   Matrix factorization is performed iteratively for
 * at least the minimum number of iterations (MIN_ITER), and until the minimum
 * time (MIN_SECS, in seconds) has elapsed: the iteration count is doubled
 * after every timed batch that runs for less than MIN_SECS.
 *
 * mfact_piv:	factorization routine, called as 
 *				mfact_piv( pivot, n, piv, ord, A )
 * pivot:		pivoting strategy code, passed through to mfact_piv unchanged
 * n:			matrix dimension
 * piv, ord:	pivot and pivot order vectors, passed through to mfact_piv; 
 *				on return they hold whatever the last call stored in them
 * A:			n-by-n matrix; handed to mfact_piv and restored from a saved 
 *				copy after every call
 *
 * Returns the elapsed time of the last batch divided by its iteration count.
 * Restoring A after each factorization happens inside the timed region, so 
 * its cost is part of the reported time.  Exits with status -1 if the work
 * copy of A cannot be allocated.
 */
double time_mfact_pivot( void (*mfact_piv)(char pivot, int n, int *piv, 
	int *ord, double *A), char pivot, int n, int *piv, int *ord, double *A )
{
	struct		timespec sta, end;
	long int	num_iter = MIN_ITER;
	double		secs = -1.0;
	double		*M;

	// Save copy of matrix A before performing matrix factorization.  The size
	// is computed in size_t so that n*n cannot overflow int.
	M = malloc( (size_t) n * n * sizeof *M );
	if ( M == NULL ) {
		fprintf( stderr, "Error allocating memory for %d-by-%d matrix.\n", 
			n, n );
		exit( -1 );
	}
	copy_matrix( n, n, A, M );

	while ( secs < MIN_SECS ) {
		get_time( &sta );
		// Loop index has the same type as num_iter, which doubles each pass
		for ( long int i = 0; i < num_iter; i++ ) {
			mfact_piv( pivot, n, piv, ord, A );
			copy_matrix( n, n, M, A );		// Reset matrix A to initial value
		}
		get_time( &end );
		secs = timespec_diff( sta, end );
		num_iter *= 2;
	}
	free( M );
	
	// On exiting the while loop, the number of iterations (num_iter) has been
	// doubled once more than the batch that was just timed, so num_iter must 
	// be halved
	return secs / (num_iter/2.0);
}

/*
 * Measures the performance (Mflops/sec) of basic and optimized algorithms
 * implementing LU factorization on nonsingular matrices over a range of 
 * dimensions.  The algorithms employ performance optimization techniques 
 * including loop reordering and blocking.  It is assumed that LU factorization
 * is performed on matrices with properties -- for example, diagonally dominant 
 * (alpha = 1.0) -- that do not require pivoting.
 *
 * One row of results is produced for every dimension in mat_size[], and the
 * table is written to the file "lu.dat" under file_path.  Progress messages
 * are printed to stdout.  Exits with status -1 if memory cannot be allocated.
 */
void time_lu( void )
{
#define FIELDS 6			// Number of output data fields
	const char		*data_file_name = "lu.dat";
	const char		*hdr_text = 
"# N:		Matrix dimension, N-by-N\n"
"# BDIM:	Block dimension for blocked algorithms\n"
"#			Mflop/sec for Guassian elimination (LU factorization) algorithms\n"
"# OUTPROD:	Outer product method, kji indexing\n"
"# SAXPY:	SAXPY operation, jki indexing\n"
"# BLKSIMP:	Simple blocking\n"
"# BLKRCR:	Recursive contiguous blocking\n"
"# \n"
"# N\tBDIM\tOUTPROD\tSAXPY\tBLKSIMP\tBLKRCR";
	const int		col_n = 0,
					col_bdim = 1,
					col_outprod = 2,
					col_saxpy = 3,
					col_blksimp = 4,
					col_blkrcr = 5;
	const double	alpha = 1.0;	// Scaling factor for random matrix
	
	char	*data_file;
	int		n, bdim;
	double	mflops;
	double	perf_data[FIELDS*SIZES];	
	double	*A;
	void 	(*mfact)( int n, double *A );

	// Concatenate file path and name
	data_file = calloc( strlen(file_path) + strlen(data_file_name) + 1,
		sizeof *data_file );
	if ( data_file == NULL ) {
		fprintf( stderr, "Error allocating memory for data file name.\n" );
		exit( -1 );
	}
	strcpy( data_file, file_path );
	strcat( data_file, data_file_name );

	for ( int i = 0; i < SIZES; i++ ) {
		n = mat_size[i];
		bdim = get_block_dim_lu( n );
		fprintf( stdout, "n = %d, bdim = %d\n", n, bdim );
		// LU factorization takes (2/3)*n^3 floating point operations
		mflops = 1.0e-06 * (2.0/3.0) * n * n * n;
		// Create random n-by-n nonsingular matrix that is diagonally dominant
		A = malloc( (size_t) n * n * sizeof *A );
		if ( A == NULL ) {
			fprintf( stderr, "Error allocating memory for %d-by-%d matrix.\n",
				n, n );
			exit( -1 );
		}
		create_random_nonsingular( alpha, n, A );
		
		// Performance data is stored in perf_data[] array in column-major order
		perf_data[i+col_n*SIZES] = (double) n;
		perf_data[i+col_bdim*SIZES] = (double) bdim;

		// Measure performance of LU factorization algorithms:
		printf("lu_outer_product\n");		
		// Outer product method (kji indexing)
		mfact = lu_outer_product;
		perf_data[i+col_outprod*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("lu_saxpy\n");	
		// SAXPY operation (jki indexing)
		mfact = lu_saxpy;
		perf_data[i+col_saxpy*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("lu_block\n");	
		// Simple blocking
		mfact = lu_block;
		perf_data[i+col_blksimp*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("lu_recur_block\n");	
		// Recursive contiguous blocking
		mfact = lu_recur_block;
		perf_data[i+col_blkrcr*SIZES] = mflops / time_mfact( mfact, n, A );

		free( A );
	}
	write_data_file( data_file, hdr_text, SIZES, FIELDS, perf_data ); 
	// The file name is not needed once the data has been written
	free( data_file );

#undef FIELDS
}

/*
 * Measures the performance (Mflops/sec) of basic and optimized algorithms
 * implementing LU factorization with partial pivoting on nonsingular matrices 
 * over a range of dimensions.  The algorithms employ performance optimization
 * techniques including loop reordering, blocking and the use of the LAPACK
 * library.
 *
 * One row of results is produced for every dimension in mat_size[], and the
 * table is written to the file "lu_pivot.dat" under file_path.  Progress 
 * messages are printed to stdout.  Exits with status -1 if memory cannot be 
 * allocated.
 */
void time_lu_pivot( void )
{
#define FIELDS 6			// Number of output data fields
	const char		*data_file_name = "lu_pivot.dat";
	const char		*hdr_text = 
"# N:		Matrix dimension, N-by-N\n"
"# BDIM:	Block dimension for blocked algorithms\n"
"#			Mflop/sec for LU factorization with partial pivoting algorithms\n"
"# OUTPROD:	Outer product method, kji indexing\n"
"# SAXPY:	SAXPY operation, jki indexing\n"
"# BLOCK:	Simple blocking\n"
"# LAPACK:	LAPACK routine DGETRF\n"
"# \n"
"# N\tBDIM\tOUTPROD\tSAXPY\tBLOCK\tLAPACK";
	const int		col_n = 0,
					col_bdim = 1,
					col_outprod = 2,
					col_saxpy = 3,
					col_block = 4,
					col_lapack = 5;
	const double	alpha = 10.0;	// Scaling factor for random matrix
	
	char	*data_file;
	int		n, bdim;
	int		*piv, *ord;
	double	mflops;
	double	perf_data[FIELDS*SIZES];	
	double	*A;
	void 	(*mfact_piv)( char pivot, int n, int *piv, int *ord, double *A );

	// Concatenate file path and name
	data_file = calloc( strlen(file_path) + strlen(data_file_name) + 1,
		sizeof *data_file );
	if ( data_file == NULL ) {
		fprintf( stderr, "Error allocating memory for data file name.\n" );
		exit( -1 );
	}
	strcpy( data_file, file_path );
	strcat( data_file, data_file_name );

	for (int i = 0; i < SIZES; i++) {
		n = mat_size[i];
		bdim = get_block_dim_lu( n );
		fprintf( stdout, "n = %d, bdim = %d\n", n, bdim );
		// LU factorization takes (2/3)*n^3 floating point operations
		mflops = 1.0e-06 * (2.0/3.0) * n * n * n;
		// Allocate matrix, and pivot and pivot order vectors
		A = malloc( (size_t) n * n * sizeof *A );
		piv = malloc( (size_t) n * sizeof *piv );
		ord = malloc( (size_t) n * sizeof *ord );
		if ( A == NULL || piv == NULL || ord == NULL ) {
			fprintf( stderr, "Error allocating memory for %d-by-%d matrix.\n",
				n, n );
			exit( -1 );
		}
		// Create random n-by-n nonsingular matrix that is diagonally dominant
		create_random_nonsingular( alpha, n, A );

		// Performance data is stored in perf_data[] array in column-major order
		perf_data[i+col_n*SIZES] = (double) n;
		perf_data[i+col_bdim*SIZES] = (double) bdim;

		// Measure performance of LU factorization with partial pivoting algorithms:
		printf("lu_pivot_outer_product\n");		
		// Outer product method (kji indexing)
		mfact_piv = lu_pivot_outer_product;
		perf_data[i+col_outprod*SIZES] = 
			mflops / time_mfact_pivot( mfact_piv, 'G', n, piv, ord, A );

		printf("lu_pivot_saxpy\n");	
		// SAXPY operation (jki indexing)
		mfact_piv = lu_pivot_saxpy;
		perf_data[i+col_saxpy*SIZES] = 
			mflops / time_mfact_pivot( mfact_piv, 'G', n, piv, ord, A );

		printf("lu_pivot_block\n");	
		// Simple blocking
		mfact_piv = lu_pivot_block;
		perf_data[i+col_block*SIZES] =
			mflops / time_mfact_pivot( mfact_piv, 'G', n, piv, ord, A );

		printf("lu_pivot_lapack\n");	
		// LAPACK routine DGETRF
		mfact_piv = lu_pivot_lapack;
		perf_data[i+col_lapack*SIZES] =
			mflops / time_mfact_pivot( mfact_piv, 'G', n, piv, ord, A );

		free( A );
		free( piv );
		free( ord );
	}
	write_data_file( data_file, hdr_text, SIZES, FIELDS, perf_data ); 
	// The file name is not needed once the data has been written
	free( data_file );

#undef FIELDS
}

/*
 * Measures the performance (Mflops/sec) of basic and optimized algorithms
 * implementing Cholesky factorization on symmetric positive definite matrices 
 * over a range of dimensions.  The algorithms employ performance optimization
 * techniques including loop reordering, blocking and the use of BLAS and
 * LAPACK libraries.
 *
 * One row of results is produced for every dimension in mat_size[], and the
 * table is written to the file "chol.dat" under file_path.  Progress messages
 * are printed to stdout.  Exits with status -1 if memory cannot be allocated.
 */
void time_chol( void )
{
#define FIELDS 12			// Number of output data fields
	const char		*data_file_name = "chol.dat";
	const char		*hdr_text = 
"# N:		Matrix dimension, N-by-N\n"
"# BDIM:	Block dimension for blocked algorithms\n"
"#			Mflop/sec for Cholesky factorization algorithms\n"
"# OUTPROD:	Outer product method, kji indexing\n"
"# SAXPY:	SAXPY operation, jki indexing\n"
"# LAPUNBK:	LAPACK routine DPOTF2, unblocked version\n"
"# BLKSIMP:	Simple blocking\n"
"# BLKRECT:	Simple blocking, rectangular version of Cholesky factorization\n"
"# BLKCTG:	Contiguous blocking\n"
"# BLKRCR:	Recursive contiguous blocking\n"
"# BLAS:	Simple blocking using BLAS\n"
"# CTGBLAS:	Contiguous blocking using BLAS\n"
"# LAPACK:	LAPACK routine DPOTRF\n"
"# \n"
"# N\tBDIM\tOUTPROD\tSAXPY\tLAPUNBK\tBLKSIMP\tBLKRECT\tBLKCTG\tBLKRCR"
"\tBLAS\tCTGBLAS\tLAPACK";
	const int		col_n = 0,
					col_bdim = 1,
					col_outprod = 2,
					col_saxpy = 3,
					col_lapunbk = 4,
					col_blksimp = 5,
					col_blkrect = 6,
					col_blkctg = 7,
					col_blkrcr = 8,
					col_blas = 9,
					col_ctgblas = 10,
					col_lapack = 11;
	const double	alpha = 1.0;	// Scaling factor for random matrix
	
	char	*data_file;
	int		n, bdim;
	double	mflops;
	double	perf_data[FIELDS*SIZES];	
	double	*A;
	void 	(*mfact)( int n, double *A );

	// Concatenate file path and name
	data_file = calloc( strlen(file_path) + strlen(data_file_name) + 1,
		sizeof *data_file );
	if ( data_file == NULL ) {
		fprintf( stderr, "Error allocating memory for data file name.\n" );
		exit( -1 );
	}
	strcpy( data_file, file_path );
	strcat( data_file, data_file_name );
	
	for ( int i = 0; i < SIZES; i++ ) {
		n = mat_size[i];
		bdim = get_block_dim_chol( n );
		fprintf( stdout, "n = %d, bdim = %d\n", n, bdim );
		// Cholesky factorization takes (1/3)*n^3 floating point operations
		mflops = (1.0e-06 / 3.0) * n * n * n;
		// Create random n-by-n symmetric positive definite matrix
		A = malloc( (size_t) n * n * sizeof *A );
		if ( A == NULL ) {
			fprintf( stderr, "Error allocating memory for %d-by-%d matrix.\n",
				n, n );
			exit( -1 );
		}
		create_random_spd( alpha, n, A );
		
		// Performance data is stored in perf_data[] array in column-major order
		perf_data[i+col_n*SIZES] = (double) n;
		perf_data[i+col_bdim*SIZES] = (double) bdim;

		// Measure performance of standard Cholesky algorithms:
		printf("chol_outer_product\n");		
		// Outer product method (kji indexing)
		mfact = chol_outer_product;
		perf_data[i+col_outprod*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("chol_saxpy\n");	
		// SAXPY operation (jki indexing)
		mfact = chol_saxpy;
		perf_data[i+col_saxpy*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("chol_lapack_unblocked\n");	
		// LAPACK routine DPOTF2, unblocked version
		mfact = chol_lapack_unblocked;
		perf_data[i+col_lapunbk*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("chol_block\n");	
		// Simple blocking
		mfact = chol_block;
		perf_data[i+col_blksimp*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("chol_rect_block\n");	
		// Simple blocking, rectangular version of Cholesky factorization
		mfact = chol_rect_block;
		perf_data[i+col_blkrect*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("chol_contig_block\n");	
		// Contiguous blocking
		mfact = chol_contig_block;
		perf_data[i+col_blkctg*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("chol_recur_block\n");	
		// Recursive contiguous blocking
		mfact = chol_recur_block;
		perf_data[i+col_blkrcr*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("chol_block_blas\n");	
		// Simple blocking using the BLAS library
		mfact = chol_block_blas;
		perf_data[i+col_blas*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("chol_contig_block_blas\n");	
		// Contiguous blocking using the BLAS library
		mfact = chol_contig_block_blas;
		perf_data[i+col_ctgblas*SIZES] = mflops / time_mfact( mfact, n, A );

		printf("chol_lapack\n");	
		// LAPACK routine DPOTRF
		mfact = chol_lapack;
		perf_data[i+col_lapack*SIZES] = mflops / time_mfact( mfact, n, A );

		free( A );
	}
	write_data_file( data_file, hdr_text, SIZES, FIELDS, perf_data ); 
	// The file name is not needed once the data has been written
	free( data_file );

#undef FIELDS
}

/*
 * Profiles blocked algorithms implementing Cholesky factorization on symmetric
 * positive definite matrices.  Profile data estimate the time and proportion of
 * time spent factoring diagonal blocks, solving for lower triangular column
 * blocks and updating the trailing sub-matrix.
 *
 * Reads the 2000-by-2000 matrix "mat_2000_spd.dat" under file_path, then runs
 * chol_block() and chol_block_blas() MIN_ITER times each, restoring the matrix
 * after every run.  This function prints only the column headings to stdout.
 * NOTE(review): the profile rows themselves are presumably printed by the
 * factorization routines when built with PROFILE -- confirm in cholfact.
 * Exits with status -1 if memory cannot be allocated.
 */
void profile_chol( void )
{
	const int	n = 2000;
	const char	*mat_file_name = "mat_2000_spd.dat";

	char 	*mat_file;
	double	*A, *W;

	// A is factored in place; W keeps the original matrix for restoring A
	A = malloc( (size_t) n * n * sizeof *A );
	W = malloc( (size_t) n * n * sizeof *W );
	// Concatenate file path and name
	mat_file = calloc( strlen(file_path) + strlen(mat_file_name) + 1,
		sizeof *mat_file );
	if ( A == NULL || W == NULL || mat_file == NULL ) {
		fprintf( stderr, "Error allocating memory for %d-by-%d matrix.\n", 
			n, n );
		exit( -1 );
	}
	strcpy( mat_file, file_path );
	strcat( mat_file, mat_file_name );
		
	read_matrix( mat_file, n, n, A );
	free( mat_file );		// File name is not needed after the matrix is read
	copy_matrix( n, n, A, W );

	fprintf( stdout, "Profile of Cholesky factorization (seconds)\n" );
	fprintf( stdout, "Simple blocking\n" );
	fprintf( stdout, "%d-by-%d symmetric positive definite matrix\n", n, n );
	fprintf( stdout, "tm_chol\ttm_factor\ttm_tri_solve\ttm_reduce\t" );
	fprintf( stdout, "pct_factor\tpct_tri_solve\tpct_reduce\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		chol_block( n, A );
		copy_matrix( n, n, W, A );		// Reset matrix A to initial value
	}

	fprintf( stdout, "\nProfile of Cholesky factorization (seconds)\n" );
	fprintf( stdout, "Blocked algorithm using BLAS\n" );
	fprintf( stdout, "%d-by-%d symmetric positive definite matrix\n", n, n );
	fprintf( stdout, "tm_chol\ttm_factor\ttm_tri_solve\ttm_reduce\t" );
	fprintf( stdout, "pct_factor\tpct_tri_solve\tpct_reduce\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		chol_block_blas( n, A );
		copy_matrix( n, n, W, A );		// Reset matrix A to initial value
	}

	free( A );
	free( W );
}

/*
 * Measures the performance (Mflops/sec) of basic and optimized algorithms
 * implementing symmetric indefinite factorization (LDL') on matrices over a
 * range of dimensions.  Our implementation of symmetric indefinite 
 * factorization uses Bunch-Kaufman (partial), bounded Bunch-Kaufman (rook) or
 * Bunch-Parlett (complete) pivoting.  The algorithms employ performance
 * optimization techniques including loop reordering, blocking and the use of
 * BLAS and LAPACK libraries.
 *
 * One row of results is produced for every dimension in mat_size[], and the
 * table is written to the file "ldlt.dat" under file_path.  The pivot count 
 * for each pivoting strategy is taken from the piv and ord vectors left by the
 * ldlt_block_blas() timing run for that strategy.  Progress messages are 
 * printed to stdout.  Exits with status -1 if memory cannot be allocated.
 */
void time_ldlt( void )
{
#define FIELDS 20			// Number of output data fields
	const char		*data_file_name = "ldlt.dat";
	const char		*hdr_text = 
"# N:		Matrix dimension, N-by-N\n"
"# BDIM:	Block dimension for simple blocking algorithm\n"
"# BDIMBLA:	Block dimension for blocked algorithm using BLAS\n"
"# BDIMLAP:	Block dimension for LAPACK routine DSYTRF\n"
"# NUMPIVK:	Pivot count, Bunch-Kaufman pivoting\n"
"# NUMPIVB:	Pivot count, bounded Bunch-Kaufman pivoting\n"
"# NUMPIVP:	Pivot count, Bunch-Parlett pivoting\n"
"#			Mflop/sec for symmetric indefinite factorization algorithms\n"
"# OUTPRDK:	Outer product method, Bunch-Kaufman pivoting\n"
"# SAXPYK:	SAXPY operation, Bunch-Kaufman pivoting\n"
"# LAPUNBK:	LAPACK routine DSYTF2, unblocked version\n"
"# BLOCKK:	Simple blocking, Bunch-Kaufman pivoting\n"
"# BLASK:	Simple blocking, Bunch-Kaufman pivoting, BLAS routines \n"
"# LAPACK:	LAPACK routine DSYTRF, Bunch-Kaufman pivoting\n"
"# OUTPRDB:	Outer product method, bounded Bunch-Kaufman pivoting\n"
"# SAXPYB:	SAXPY operation, bounded Bunch-Kaufman pivoting\n"
"# BLOCKB:	Simple blocking, bounded Bunch-Kaufman pivoting\n"
"# BLASB:	Simple blocking, bounded Bunch-Kaufman pivoting, BLAS routines \n"
"# OUTPRDP:	Outer product method, Bunch-Parlett pivoting\n"
"# BLOCKP:	Simple blocking, Bunch-Parlett pivoting\n"
"# BLASP:	Simple blocking, Bunch-Parlett pivoting, BLAS routines \n"
"# \n"
"# N\tBDIM\tBDIMBLA\tBDIMLAP\tNUMPIVK\tNUMPIVB\tNUMPIVP\tOUTPRDK\tSAXPYK\tLAPUNBK"
"\tBLOCKK\tBLASK\tLAPACK\tOUTPRDB\tSAXPYB\tBLOCKB\tBLASB\tOUTPRDP\tBLOCKP\tBLASP";
	const int		col_n = 0,
					col_bdim = 1,
					col_bdimbla = 2,
					col_bdimlap = 3,
					col_numpivk = 4,
					col_numpivb = 5,
					col_numpivp = 6,
					col_outprdk = 7,
					col_saxpyk = 8,
					col_lapunbk = 9,
					col_blockk = 10,
					col_blask = 11,
					col_lapack = 12,
					col_outprdb = 13,
					col_saxpyb = 14,
					col_blockb = 15,
					col_blasb = 16,
					col_outprdp = 17,
					col_blockp = 18,
					col_blasp = 19;
	const double	alpha = 10.0;	// Scaling factor for random matrix
	
	char	*data_file;
	int		n, bdim, bdim_blas, bdim_lapack;
	int		*piv, *ord;
	double	mflops;
	double	perf_data[FIELDS*SIZES];	
	double	*A;
	void 	(*mfact_piv)( char pivot, int n, int *piv, int *ord, double *A );

	// Concatenate file path and name
	data_file = calloc( strlen(file_path) + strlen(data_file_name) + 1,
		sizeof *data_file );
	if ( data_file == NULL ) {
		fprintf( stderr, "Error allocating memory for data file name.\n" );
		exit( -1 );
	}
	strcpy( data_file, file_path );
	strcat( data_file, data_file_name );
	
	for ( int i = 0; i < SIZES; i++ ) {
		n = mat_size[i];
		bdim = get_block_dim_ldlt( 0, 0, n );
		bdim_blas = get_block_dim_ldlt( 0, 1, n );
		bdim_lapack = get_block_dim_ldlt( 1, 0, n );
		fprintf( stdout, "n = %d\n", n );
		// (1/3)*n^3 floating point operations is a lower bound on symmetric 
		// indefinite factorization
		mflops = 1.0e-06 * (1.0/3.0) * n * n * n;
		// Allocate matrix, and pivot and pivot order vectors
		A = malloc( (size_t) n * n * sizeof *A );
		piv = malloc( (size_t) n * sizeof *piv );
		ord = malloc( (size_t) n * sizeof *ord );
		if ( A == NULL || piv == NULL || ord == NULL ) {
			fprintf( stderr, "Error allocating memory for %d-by-%d matrix.\n",
				n, n );
			exit( -1 );
		}
		// Create random n-by-n symmetric matrix
		create_random_symmetric( alpha, n, A );

		// Performance data is stored in perf_data[] array in column-major order
		perf_data[i+col_n*SIZES] = (double) n;
		perf_data[i+col_bdim*SIZES] = (double) bdim;
		perf_data[i+col_bdimbla*SIZES] = (double) bdim_blas;
		perf_data[i+col_bdimlap*SIZES] = (double) bdim_lapack;
		
		// Measure performance of LDL' algorithms
		printf("ldlt_outer_product(Bunch-Kaufman)\n");
		// Outer product method (kji indexing), Bunch-Kaufman pivoting
		mfact_piv = ldlt_outer_product;
		perf_data[i+col_outprdk*SIZES] = 
			mflops / time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );

		printf("ldlt_saxpy(Bunch-Kaufman)\n");	
		// SAXPY operation (jki indexing), Bunch-Kaufman pivoting
		mfact_piv = ldlt_saxpy;
		perf_data[i+col_saxpyk*SIZES] = 
			mflops / time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );

		printf("ldlt_lapack_unblocked(Bunch-Kaufman)\n");	
		// LAPACK routine DSYTF2, unblocked version
		mfact_piv = ldlt_lapack_unblocked;
		perf_data[i+col_lapunbk*SIZES] = 
			mflops / time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );

		printf("ldlt_block(Bunch-Kaufman)\n");
		// Simple blocking, Bunch-Kaufman pivoting
		mfact_piv = ldlt_block;
		perf_data[i+col_blockk*SIZES] = 
			mflops / time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );

		printf("ldlt_block_blas(Bunch-Kaufman)\n");
		// Simple blocking, Bunch-Kaufman pivoting, BLAS routines
		mfact_piv = ldlt_block_blas;
		perf_data[i+col_blask*SIZES] = 
			mflops / time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );
		// Pivot count, Bunch-Kaufman
		perf_data[i+col_numpivk*SIZES] = (double) count_pivot( 0, n, piv, ord );

		printf("ldlt_lapack\n");
		// LAPACK routine DSYTRF, Bunch-Kaufman pivoting
		mfact_piv = ldlt_lapack;
		perf_data[i+col_lapack*SIZES] = 
			mflops / time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );

		printf("ldlt_outer_product(bounded Bunch-Kaufman)\n");
		// Outer product method (kji indexing), bounded Bunch-Kaufman pivoting
		mfact_piv = ldlt_outer_product;
		perf_data[i+col_outprdb*SIZES] =
			mflops / time_mfact_pivot( mfact_piv, 'B', n, piv, ord, A );
	
		printf("ldlt_saxpy(bounded Bunch-Kaufman)\n");
		// SAXPY operation (jki indexing), bounded Bunch-Kaufman pivoting
		mfact_piv = ldlt_saxpy;
		perf_data[i+col_saxpyb*SIZES] =
			mflops / time_mfact_pivot( mfact_piv, 'B', n, piv, ord, A );
		
		printf("ldlt_block(bounded Bunch-Kaufman)\n");
		// Simple blocking, bounded Bunch-Kaufman pivoting
		mfact_piv = ldlt_block;
		perf_data[i+col_blockb*SIZES] =
			mflops / time_mfact_pivot( mfact_piv, 'B', n, piv, ord, A );

		printf("ldlt_block_blas(bounded Bunch-Kaufman)\n");
		// Simple blocking, bounded Bunch-Kaufman pivoting, BLAS routines
		mfact_piv = ldlt_block_blas;
		perf_data[i+col_blasb*SIZES] = 
			mflops / time_mfact_pivot( mfact_piv, 'B', n, piv, ord, A );
		// Pivot count, bounded Bunch-Kaufman
		perf_data[i+col_numpivb*SIZES] = (double) count_pivot( 0, n, piv, ord );
	
		printf("ldlt_outer_product(Bunch-Parlett)\n");
		// Outer product method (kji indexing), Bunch-Parlett pivoting
		mfact_piv = ldlt_outer_product;
		perf_data[i+col_outprdp*SIZES] =
			mflops / time_mfact_pivot( mfact_piv, 'P', n, piv, ord, A );

		printf("ldlt_block(Bunch-Parlett)\n");
		// Simple blocking, Bunch-Parlett pivoting
		mfact_piv = ldlt_block;
		perf_data[i+col_blockp*SIZES] =
			mflops / time_mfact_pivot( mfact_piv, 'P', n, piv, ord, A );

		printf("ldlt_block_blas(Bunch-Parlett)\n");
		// Simple blocking, Bunch-Parlett pivoting, BLAS routines
		mfact_piv = ldlt_block_blas;
		perf_data[i+col_blasp*SIZES] =
			mflops / time_mfact_pivot( mfact_piv, 'P', n, piv, ord, A );
		// Pivot count, Bunch-Parlett
		perf_data[i+col_numpivp*SIZES] = (double) count_pivot( 0, n, piv, ord );

		free( A );
		free( piv );
		free( ord );
	}
	write_data_file( data_file, hdr_text, SIZES, FIELDS, perf_data ); 
	// The file name is not needed once the data has been written
	free( data_file );

#undef FIELDS
}

/*
 * Measures the time taken to perform symmetric indefinite (LDL') factorization 
 * on matrices of varying degrees of indefiniteness as proxied by the number of
 * Bunch-Kaufman pivots.  Time measurements are made for blocked algorithms 
 * employing Bunch-Kaufman (partial), bounded Bunch-Kaufman (rook) or 
 * Bunch-Parlett (complete) pivoting.
 *
 * The 2000-by-2000 test matrices are read from the mat_2000_*.dat files under
 * file_path (4 files in a DEBUG build, 14 otherwise).  One row of results is 
 * produced per matrix, and the table is written to the file "ldlt_indef.dat"
 * under file_path.  The Cholesky (LAPCHOL) column is measured only for the 
 * first matrix, mat_2000_spd.dat; it is set to -1.0 for all other matrices.
 * Progress messages are printed to stdout.  Exits with status -1 if memory 
 * cannot be allocated.
 */
void time_ldlt_indef( void )
{
#if defined(DEBUG)
	#define MATS 4
#else 
	#define MATS 14
#endif
#define FIELDS 13			// Number of output data fields
	const char		*data_file_name = "ldlt_indef.dat";
	const char		*hdr_text = 
"# N:		Matrix dimension, N-by-N\n"
"# BDIM:	Block dimension for simple blocking algorithm\n"
"# BDIMBLA:	Block dimension for blocked algorithm using BLAS\n"
"# BDIMLAP:	Block dimension for LAPACK routine DSYTRF\n"
"# NUMPIVK:	Pivot count, Bunch-Kaufman pivoting\n"
"# NUMPIVB:	Pivot count, bounded Bunch-Kaufman pivoting\n"
"# NUMPIVP:	Pivot count, Bunch-Parlett pivoting\n"
"#			Time (seconds) taken for LDL' factorization of symmetric matrices\n"
"# BLOCKK:	Simple blocking, Bunch-Kaufman pivoting\n"
"# BLASK:	Simple blocking, Bunch-Kaufman pivoting, BLAS routines \n"
"# LAPACK:	LAPACK routine DSYTRF, Bunch-Kaufman pivoting\n"
"# BLASB:	Simple blocking, bounded Bunch-Kaufman pivoting, BLAS routines\n"
"# BLASP:	Simple blocking, Bunch-Parlett pivoting, BLAS routines\n"
"# LAPCHOL:	LAPACK routine DPOTRF, Cholesky factorization\n"
"# \n"
"# N\tBDIM\tBDIMBLA\tBDIMLAP\tNUMPIVK\tNUMPIVB\tNUMPIVP\tBLOCKK\tBLASK\tLAPACK"
"\tBLASB\tBLASP\tLAPCHOL";
	const int		col_n = 0,
					col_bdim = 1,
					col_bdimbla = 2,
					col_bdimlap = 3,
					col_numpivk = 4,
					col_numpivb = 5,
					col_numpivp = 6,
					col_blockk = 7,
					col_blask = 8,
					col_lapack = 9,
					col_blasb = 10,
					col_blasp = 11,
					col_lapchol = 12;
	const int		n = 2000;		// Matrix dimension
	
	char		*data_file, *mat_file;
	const char	*mat_file_name[MATS];
	int			bdim, bdim_blas, bdim_lapack;
	int			*piv, *ord;
	// One row per matrix file, so the table holds FIELDS*MATS entries
	double		time_data[FIELDS*MATS];	
	double		*A;
	void 		(*mfact_piv)( char pivot, int n, int *piv, int *ord, double *A );
	void 		(*mfact)( int n, double *A );

	// Concatenate output data file path and name
	data_file = calloc( strlen(file_path) + strlen(data_file_name) + 1,
		sizeof *data_file );
	if ( data_file == NULL ) {
		fprintf( stderr, "Error allocating memory for data file name.\n" );
		exit( -1 );
	}
	strcpy( data_file, file_path );
	strcat( data_file, data_file_name );
	// Matrices of varying degrees of indefiniteness are stored in files
	mat_file_name[0] = "mat_2000_spd.dat";
	mat_file_name[1] = "mat_2000_bk24.dat";
	mat_file_name[2] = "mat_2000_bk49.dat";
	mat_file_name[3] = "mat_2000_bk98.dat";
#if !defined(DEBUG)
	mat_file_name[4] = "mat_2000_bk148.dat";
	mat_file_name[5] = "mat_2000_bk202.dat";
	mat_file_name[6] = "mat_2000_bk302.dat";
	mat_file_name[7] = "mat_2000_bk400.dat";
	mat_file_name[8] = "mat_2000_bk500.dat";
	mat_file_name[9] = "mat_2000_bk597.dat";
	mat_file_name[10] = "mat_2000_bk703.dat";
	mat_file_name[11] = "mat_2000_bk797.dat";
	mat_file_name[12] = "mat_2000_bk880.dat";
	mat_file_name[13] = "mat_2000_sym.dat";
#endif
	bdim = get_block_dim_ldlt( 0, 0, n );
	bdim_blas = get_block_dim_ldlt( 0, 1, n );
	bdim_lapack = get_block_dim_ldlt( 1, 0, n );
	// Allocate matrix A, and pivot and pivot order vectors
	A = malloc( (size_t) n * n * sizeof *A );
	piv = malloc( (size_t) n * sizeof *piv );
	ord = malloc( (size_t) n * sizeof *ord );
	if ( A == NULL || piv == NULL || ord == NULL ) {
		fprintf( stderr, "Error allocating memory for %d-by-%d matrix.\n", 
			n, n );
		exit( -1 );
	}
	
	for ( int i = 0; i < MATS; i++ ) {
		// Concatenate matrix file path and name
		mat_file = calloc( strlen(file_path) + strlen(mat_file_name[i]) + 1, 
			sizeof *mat_file );
		if ( mat_file == NULL ) {
			fprintf( stderr, "Error allocating memory for matrix file name.\n" );
			exit( -1 );
		}
		strcpy( mat_file, file_path );
		strcat( mat_file, mat_file_name[i] );
		// Read n-by-n symmetric matrix from file
		read_matrix( mat_file, n, n, A );
		// The name is rebuilt on every iteration, so release it once read
		free( mat_file );
		// Time measurements are stored in time_data[] array in column-major order
		time_data[i+col_n*MATS] = (double) n;
		time_data[i+col_bdim*MATS] = (double) bdim;
		time_data[i+col_bdimbla*MATS] = (double) bdim_blas;
		time_data[i+col_bdimlap*MATS] = (double) bdim_lapack;
		
		// Time LDL' factorization
		printf("ldlt_block(Bunch-Kaufman)\n");
		// Simple blocking, Bunch-Kaufman pivoting
		mfact_piv = ldlt_block;
		time_data[i+col_blockk*MATS] = 
			time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );
		// Pivot count, Bunch-Kaufman
		time_data[i+col_numpivk*MATS] = (double) count_pivot( 0, n, piv, ord );
		fprintf( stdout, "n = %d, number of (Bunch-Kaufman) pivots = %.0f\n",
			n, time_data[i+col_numpivk*MATS] );

		printf("ldlt_block_blas(Bunch-Kaufman)\n");
		// Simple blocking, Bunch-Kaufman pivoting, BLAS routines
		mfact_piv = ldlt_block_blas;
		time_data[i+col_blask*MATS] = 
			time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );

		printf("ldlt_lapack\n");
		// LAPACK routine DSYTRF, Bunch-Kaufman pivoting
		mfact_piv = ldlt_lapack;
		time_data[i+col_lapack*MATS] = 
			time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );

		// The progress label names the routine that is actually timed
		// (ldlt_block_blas), matching the BLASB output column
		printf("ldlt_block_blas(bounded Bunch-Kaufman)\n");
		// Simple blocking, bounded Bunch-Kaufman pivoting, BLAS routines
		mfact_piv = ldlt_block_blas;
		time_data[i+col_blasb*MATS] = 
			time_mfact_pivot( mfact_piv, 'B', n, piv, ord, A );
		// Pivot count, bounded Bunch-Kaufman
		time_data[i+col_numpivb*MATS] = (double) count_pivot( 0, n, piv, ord );	
		
		// As above: ldlt_block_blas is the routine timed for the BLASP column
		printf("ldlt_block_blas(Bunch-Parlett)\n");
		// Simple blocking, Bunch-Parlett pivoting, BLAS routines
		mfact_piv = ldlt_block_blas;
		time_data[i+col_blasp*MATS] =
			time_mfact_pivot( mfact_piv, 'P', n, piv, ord, A );
		// Pivot count, Bunch-Parlett
		time_data[i+col_numpivp*MATS] = (double) count_pivot( 0, n, piv, ord );

		if ( i == 0 ) {		// Symmetric positive definite 
			printf("chol_lapack\n");	
			// LAPACK routine DPOTRF
			mfact = chol_lapack;
			time_data[i+col_lapchol*MATS] = time_mfact( mfact, n, A );
		} else {
			// Cholesky timing is not taken for the remaining matrices
			time_data[i+col_lapchol*MATS] = -1.0;
		}
	}
	free( A );
	free( piv );
	free( ord );
	write_data_file( data_file, hdr_text, MATS, FIELDS, time_data ); 
	// The file name is not needed once the data has been written
	free( data_file );

#undef FIELDS
#undef MATS
}

/*
 * Profiles blocked algorithms implementing symmetric indefinite factorization 
 * (LDL').  Profile data estimate the time and proportion of time spent 
 * factoring column blocks, performing symmetric pivoting and updating the 
 * trailing sub-matrix.
 *
 * A fixed 2000-by-2000 test matrix is read from file once and kept in a 
 * pristine copy W.  Each algorithm/pivoting variant is then run MIN_ITER 
 * times, with A restored from W after every run.  This function only prints 
 * the table headings; the rows are presumably printed by the factorization 
 * routines themselves when built with profiling enabled (TODO confirm).
 *
 * All working storage is owned by this function and is released before 
 * returning.
 */
void profile_ldlt( void )
{
	const int	n = 2000;
	const char	*mat_file_name = "mat_2000_bk500.dat";
	// Element count is computed in size_t to keep the byte count out of int
	const size_t	elems = (size_t) n * (size_t) n;

	char 	*mat_file;
	int 	*piv, *ord;
	double	*A, *W;

	A = (double *) malloc( elems*sizeof(double) );
	W = (double *) malloc( elems*sizeof(double) );
	piv = (int *) malloc( n*sizeof(int) );
	ord = (int *) malloc( n*sizeof(int) );
	// Buffer for concatenated file path and name (+1 for terminating NUL)
	mat_file = (char *) calloc( strlen(file_path) + strlen(mat_file_name) + 1,
		sizeof(char) );
	if ( A == NULL || W == NULL || piv == NULL || ord == NULL 
		|| mat_file == NULL ) {
		fprintf( stderr, "profile_ldlt: memory allocation failed\n" );
		goto cleanup;
	}
	
	// Concatenate file path and name
	strcpy( mat_file, file_path );
	strcat( mat_file, mat_file_name );
		
	// Read the test matrix and keep an unfactored copy in W
	read_matrix( mat_file, n, n, A );
	copy_matrix( n, n, A, W );

	fprintf( stdout, "Profile of symmetric indefinite factorization (seconds)\n" );
	fprintf( stdout, "Simple blocking, Bunch-Kaufman pivoting\n" );
	fprintf( stdout, "%d-by-%d symmetric matrix: %s\n", n, n, mat_file_name );
	fprintf( stdout, "tm_ldlt\ttm_factor\ttm_pivot\ttm_reduce\t" );
	fprintf( stdout, "pct_factor\tpct_pivot\tpct_reduce\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		ldlt_block( 'K', n, piv, ord, A );
		// Factorization overwrites A; restore it for the next run
		copy_matrix( n, n, W, A );
	}

	fprintf( stdout, "\nProfile of symmetric indefinite factorization (seconds)\n" );
	fprintf( stdout, "Blocked algorithm using BLAS, Bunch-Kaufman pivoting\n" );
	fprintf( stdout, "%d-by-%d symmetric matrix: %s\n", n, n, mat_file_name );
	fprintf( stdout, "tm_ldlt\ttm_factor\ttm_pivot\ttm_reduce\t" );
	fprintf( stdout, "pct_factor\tpct_pivot\tpct_reduce\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		ldlt_block_blas( 'K', n, piv, ord, A );
		copy_matrix( n, n, W, A );
	}

	fprintf( stdout, "\nProfile of symmetric indefinite factorization (seconds)\n" );
	fprintf( stdout, "Simple blocking, bounded Bunch-Kaufman pivoting\n" );
	fprintf( stdout, "%d-by-%d symmetric matrix: %s\n", n, n, mat_file_name );
	fprintf( stdout, "tm_ldlt\ttm_factor\ttm_pivot\ttm_reduce\t" );
	fprintf( stdout, "pct_factor\tpct_pivot\tpct_reduce\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		ldlt_block( 'B', n, piv, ord, A );
		copy_matrix( n, n, W, A );
	}

	fprintf( stdout, "\nProfile of symmetric indefinite factorization (seconds)\n" );
	fprintf( stdout, "Blocked algorithm using BLAS, bounded Bunch-Kaufman pivoting\n" );
	fprintf( stdout, "%d-by-%d symmetric matrix: %s\n", n, n, mat_file_name );
	fprintf( stdout, "tm_ldlt\ttm_factor\ttm_pivot\ttm_reduce\t" );
	fprintf( stdout, "pct_factor\tpct_pivot\tpct_reduce\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		ldlt_block_blas( 'B', n, piv, ord, A );
		copy_matrix( n, n, W, A );
	}

cleanup:
	// free(NULL) is a no-op, so this is safe after a partial allocation
	free( mat_file );
	free( ord );
	free( piv );
	free( W );
	free( A );
}

/*
 * Measures the performance (Mflops/sec) of basic and optimized algorithms
 * implementing the modified Cholesky factorization proposed by Gill, Murray & 
 * Wright (with partial pivoting) on symmetric matrices over a range of 
 * dimensions.  The algorithms employ performance optimization techniques 
 * including loop reordering, blocking and the use of the BLAS library.
 *
 * For every dimension in mat_size[] a random symmetric matrix is created and
 * each algorithm variant is timed with time_mfact_pivot().  Results are 
 * collected in a SIZES-by-FIELDS table (column-major) and written to 
 * "chol_gmw.dat" under file_path.  If memory cannot be allocated, a message 
 * is printed to stderr and the function returns without writing the file.
 */
void time_chol_gmw( void )
{
#define FIELDS 8		// Number of output data fields
	const char		*data_file_name = "chol_gmw.dat";
	const char		*hdr_text = 
"# N:		Matrix dimension, N-by-N\n"
"# BDIM:	Block dimension for simple blocking algorithm\n"
"# BDIMBLA:	Block dimension for blocked algorithm using BLAS\n"
"#			Mflop/sec for modified Cholesky algorithms (Gill, Murray & Wright)\n"
"# NUMPIV:	Pivot count\n"
"# OUTPROD:	Outer product method, kji indexing\n"
"# SAXPY:	SAXPY operation, jki indexing\n"
"# BLOCK:	Simple blocking\n"
"# BLAS:	Simple blocking, BLAS routines\n"
"# \n"
"# N\tBDIM\tBDIMBLA\tNUMPIV\tOUTPROD\tSAXPY\tBLOCK\tBLAS";
	// Column positions within the output table
	const int		col_n = 0,
					col_bdim = 1,
					col_bdimbla = 2,
					col_numpiv = 3,
					col_outprod = 4,
					col_saxpy = 5,
					col_block = 6,
					col_blas = 7;
	const double	alpha = 10.0;	// Scaling factor for random matrix
	
	char	*data_file;
	int		n, bdim, bdim_blas;
	int		*piv, *ord;
	double	mflop;					// Operation count in millions of flops
	double	perf_data[FIELDS*SIZES];	
	double	*A;
	void 	(*mfact_piv)( char pivot, int n, int *piv, int *ord, double *A );

	// Concatenate file path and name
	data_file = (char *) calloc( strlen(file_path) + strlen(data_file_name) + 1,
		sizeof(char) );
	if ( data_file == NULL ) {
		fprintf( stderr, "time_chol_gmw: memory allocation failed\n" );
		return;
	}
	strcpy( data_file, file_path );
	strcat( data_file, data_file_name );
	
	// SIZES is an unsigned (sizeof-based) count, so index with size_t
	for ( size_t i = 0; i < SIZES; i++ ) {
		n = mat_size[i];
		bdim = get_block_dim_ldlt( 0, 0, n );
		bdim_blas = get_block_dim_ldlt( 0, 1, n );
		fprintf( stdout, "n = %d\n", n );
		// (1/3)*n^3 floating point operations is a lower bound on modified 
		// Cholesky factorization
		mflop = 1.0e-06 * (1.0/3.0) * n * n * n;
		// Allocate n-by-n matrix, and pivot and pivot order vectors
		A = (double *) malloc( (size_t) n * (size_t) n * sizeof(double) );
		piv = (int *) malloc( n*sizeof(int) );
		ord = (int *) malloc( n*sizeof(int) );
		if ( A == NULL || piv == NULL || ord == NULL ) {
			fprintf( stderr, "time_chol_gmw: memory allocation failed\n" );
			free( A );
			free( piv );
			free( ord );
			free( data_file );
			return;
		}
		// Create random n-by-n symmetric matrix
		create_random_symmetric( alpha, n, A );

		// Performance data is stored in perf_data[] array in column-major order
		perf_data[i+col_n*SIZES] = (double) n;
		perf_data[i+col_bdim*SIZES] = (double) bdim;
		perf_data[i+col_bdimbla*SIZES] = (double) bdim_blas;

		// Measure performance of modified Cholesky algorithms
		// Gill, Murray & Wright algorithm with Type-1 modification
		printf("chol_gmw_outer_product\n");	
		// Outer product method (kji indexing)
		mfact_piv = chol_gmw_outer_product;
		perf_data[i+col_outprod*SIZES] = 
			mflop / time_mfact_pivot( mfact_piv, 'D', n, piv, ord, A );
	
		printf("chol_gmw_saxpy\n");	
		// SAXPY operation (jki indexing), diagonal pivoting
		mfact_piv = chol_gmw_saxpy;
		perf_data[i+col_saxpy*SIZES] = 
			mflop / time_mfact_pivot( mfact_piv, 'D', n, piv, ord, A );
		
		printf("chol_gmw_block\n");	
		// Simple blocking, diagonal pivoting
		mfact_piv = chol_gmw_block;
		perf_data[i+col_block*SIZES] = 
			mflop / time_mfact_pivot( mfact_piv, 'D', n, piv, ord, A );
		// Pivot count, diagonal (Gill, Murray & Wright); taken from the 
		// piv/ord vectors left by the simple blocking run above
		perf_data[i+col_numpiv*SIZES] = (double) count_pivot( 0, n, piv, ord );

		printf("chol_gmw_block_blas\n");	
		// Simple blocking, diagonal pivoting, BLAS routines
		mfact_piv = chol_gmw_block_blas;
		perf_data[i+col_blas*SIZES] = 
			mflop / time_mfact_pivot( mfact_piv, 'D', n, piv, ord, A );

		free( A );
		free( piv );
		free( ord );
	}
	write_data_file( data_file, hdr_text, SIZES, FIELDS, perf_data );
	free( data_file );

#undef FIELDS
}

/*
 * Measures the performance (Mflops/sec) of basic and optimized algorithms
 * implementing the modified Cholesky factorization proposed by Cheng & Higham
 * on symmetric matrices over a range of dimensions.  Our implementation of the
 * modified Cholesky factorization proposed by Cheng & Higham uses either 
 * Bunch-Kaufman (partial) or bounded Bunch-Kaufman (rook) pivoting.  The
 * algorithms employ performance optimization techniques including loop 
 * reordering, blocking and the use of the BLAS library.
 *
 * For every dimension in mat_size[] a random symmetric matrix is created and
 * each algorithm/pivoting variant is timed with time_mfact_pivot().  Results
 * are collected in a SIZES-by-FIELDS table (column-major) and written to 
 * "chol_ch.dat" under file_path.  If memory cannot be allocated, a message 
 * is printed to stderr and the function returns without writing the file.
 */
void time_chol_ch( void )
{
#define FIELDS 9			// Number of output data fields
	const char		*data_file_name = "chol_ch.dat";
	const char		*hdr_text = 
"# N:		Matrix dimension, N-by-N\n"
"# BDIM:	Block dimension for simple blocking algorithm\n"
"# BDIMBLA:	Block dimension for blocked algorithm using BLAS\n"
"# NUMPIVK:	Pivot count, Bunch-Kaufman pivoting\n"
"# NUMPIVB:	Pivot count, bounded Bunch-Kaufman pivoting\n"
"#			Mflop/sec for modified Cholesky algorithms (Cheng & Higham)\n"
"# BLOCKK:	Simple blocking, Bunch-Kaufman pivoting\n"
"# BLASK:	Simple blocking, Bunch-Kaufman pivoting, BLAS routines \n"
"# BLOCKB:	Simple blocking, bounded Bunch-Kaufman pivoting\n"
"# BLASB:	Simple blocking, bounded Bunch-Kaufman pivoting, BLAS routines \n"
"# \n"
"# N\tBDIM\tBDIMBLA\tNUMPIVK\tNUMPIVB\tBLOCKK\tBLASK\tBLOCKB\tBLASB";
	// Column positions within the output table
	const int		col_n = 0,
					col_bdim = 1,
					col_bdimbla = 2,
					col_numpivk = 3,
					col_numpivb = 4,
					col_blockk = 5,
					col_blask = 6,
					col_blockb = 7,
					col_blasb = 8;
	const double	alpha = 10.0;	// Scaling factor for random matrix
	
	char	*data_file;
	int		n, bdim, bdim_blas;
	int		*piv, *ord;
	double	mflop;					// Operation count in millions of flops
	double	perf_data[FIELDS*SIZES];	
	double	*A;
	void 	(*mfact_piv)( char pivot, int n, int *piv, int *ord, double *A );

	// Concatenate file path and name
	data_file = (char *) calloc( strlen(file_path) + strlen(data_file_name) + 1,
		sizeof(char) );
	if ( data_file == NULL ) {
		fprintf( stderr, "time_chol_ch: memory allocation failed\n" );
		return;
	}
	strcpy( data_file, file_path );
	strcat( data_file, data_file_name );
	
	// SIZES is an unsigned (sizeof-based) count, so index with size_t
	for ( size_t i = 0; i < SIZES; i++ ) {
		n = mat_size[i];
		bdim = get_block_dim_ldlt( 0, 0, n );
		bdim_blas = get_block_dim_ldlt( 0, 1, n );
		fprintf( stdout, "n = %d\n", n );
		// (1/3)*n^3 floating point operations is a lower bound on modified 
		// Cholesky factorization
		mflop = 1.0e-06 * (1.0/3.0) * n * n * n;
		// Allocate n-by-n matrix, and pivot and pivot order vectors
		A = (double *) malloc( (size_t) n * (size_t) n * sizeof(double) );
		piv = (int *) malloc( n*sizeof(int) );
		ord = (int *) malloc( n*sizeof(int) );
		if ( A == NULL || piv == NULL || ord == NULL ) {
			fprintf( stderr, "time_chol_ch: memory allocation failed\n" );
			free( A );
			free( piv );
			free( ord );
			free( data_file );
			return;
		}
		// Create random n-by-n symmetric matrix
		create_random_symmetric( alpha, n, A );

		// Performance data is stored in perf_data[] array in column-major order
		perf_data[i+col_n*SIZES] = (double) n;
		perf_data[i+col_bdim*SIZES] = (double) bdim;
		perf_data[i+col_bdimbla*SIZES] = (double) bdim_blas;

		// Measure performance of modified Cholesky algorithms
		// Cheng & Higham algorithm with Type-II modification		
		printf("chol_ch_block(Bunch-Kaufman)\n");
		// Simple blocking, Bunch-Kaufman pivoting
		mfact_piv = chol_ch_block;
		perf_data[i+col_blockk*SIZES] = 
			mflop / time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );
		// Pivot count, Bunch-Kaufman
		perf_data[i+col_numpivk*SIZES] = (double) count_pivot( 0, n, piv, ord );

		printf("chol_ch_block_blas(Bunch-Kaufman)\n");
		// Simple blocking, Bunch-Kaufman pivoting, BLAS routines
		mfact_piv = chol_ch_block_blas;
		perf_data[i+col_blask*SIZES] = 
			mflop / time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );

		printf("chol_ch_block(bounded Bunch-Kaufman)\n");
		// Simple blocking, bounded Bunch-Kaufman pivoting
		mfact_piv = chol_ch_block;
		perf_data[i+col_blockb*SIZES] = 
			mflop / time_mfact_pivot( mfact_piv, 'B', n, piv, ord, A );
		// Pivot count, bounded Bunch-Kaufman
		perf_data[i+col_numpivb*SIZES] = (double) count_pivot( 0, n, piv, ord );

		printf("chol_ch_block_blas(bounded Bunch-Kaufman)\n");
		// Simple blocking, bounded Bunch-Kaufman pivoting, BLAS routines
		mfact_piv = chol_ch_block_blas;
		perf_data[i+col_blasb*SIZES] = 
			mflop / time_mfact_pivot( mfact_piv, 'B', n, piv, ord, A );
		
		free( A );
		free( piv );
		free( ord );
	}
	write_data_file( data_file, hdr_text, SIZES, FIELDS, perf_data ); 
	free( data_file );

#undef FIELDS
}

/*
 * Measures the time taken to perform modified Cholesky factorization 
 * (Gill-Murray-Wright and Cheng-Higham) on matrices of varying degrees of 
 * indefiniteness as proxied by the number of Bunch-Kaufman pivots.  Time
 * measurements are made for blocked algorithms employing partial and rook
 * pivoting strategies.  
 *
 * MATS pre-generated 2000-by-2000 matrices are read from files under 
 * file_path.  Every algorithm/pivoting variant is timed on each matrix with
 * time_mfact_pivot(), and the resulting MATS-by-FIELDS table (column-major) 
 * is written to "mod_chol_indef.dat" under file_path.  If memory cannot be 
 * allocated, a message is printed to stderr and the function returns without
 * writing the file.
 */
void time_mod_chol_indef( void )
{
#if defined(DEBUG)
	#define MATS 4
#else 
	#define MATS 14
#endif
#define FIELDS 12			// Number of output data fields
	const char		*data_file_name = "mod_chol_indef.dat";
	const char		*hdr_text = 
"# N:		Matrix dimension, N-by-N\n"
"# BDIM:	Block dimension for simple blocking algorithm\n"
"# BDIMBLA:	Block dimension for blocked algorithm using BLAS\n"
"# NUMPIVK:	Pivot count, Bunch-Kaufman pivoting\n"
"# NUMPIVB:	Pivot count, bounded Bunch-Kaufman pivoting\n"
"# NUMPIVD:	Pivot count, Gill-Murray-Wright diagonal pivoting\n"
"#			Time (seconds) taken for modified Cholesky factorization\n"
"# CHBLKK:	Cheng-Higham, simple blocking, Bunch-Kaufman pivoting\n"
"# CHBLKB:	Cheng-Higham, simple blocking, bounded Bunch-Kaufman pivoting\n"
"# CHBLASK:	Cheng-Higham, BLAS routines, Bunch-Kaufman pivoting \n"
"# CHBLASB:	Cheng-Higham, BLAS routines, bounded Bunch-Kaufman pivoting\n"
"# GMWBLK:	Gill-Murray-Wright, simple blocking, partial pivoting\n"
"# GMWBLAS:	Gill-Murray-Wright, BLAS routines, partial pivoting\n"
"# \n"
"# N\tBDIM\tBDIMBLA\tNUMPIVK\tNUMPIVB\tNUMPIVD\tCHBLKK\tCHBLKB\tCHBLASK"
"\tCHBLASB\tGMWBLK\tGMWBLAS";
	// Column positions within the output table
	const int		col_n = 0,
					col_bdim = 1,
					col_bdimbla = 2,
					col_numpivk = 3,
					col_numpivb = 4,
					col_numpivd = 5,
					col_chblkk = 6,
					col_chblkb = 7,
					col_chblask = 8,
					col_chblasb = 9,
					col_gmwblk = 10,
					col_gmwblas = 11;
	const int		n = 2000;		// Matrix dimension
	
	char		*data_file, *mat_file;
	const char	*mat_file_name[MATS];
	int			bdim, bdim_blas;
	int			*piv, *ord;
	// One row per matrix file, so the table is sized by MATS (not SIZES)
	double		time_data[FIELDS*MATS];	
	double		*A;
	void 		(*mfact_piv)( char pivot, int n, int *piv, int *ord, double *A );

	// Output file name buffer, matrix A, and pivot and pivot order vectors
	data_file = (char *) calloc( strlen(file_path) + strlen(data_file_name) + 1,
		sizeof(char) );
	A = (double *) malloc( (size_t) n * (size_t) n * sizeof(double) );
	piv = (int *) malloc( n*sizeof(int) );
	ord = (int *) malloc( n*sizeof(int) );
	if ( data_file == NULL || A == NULL || piv == NULL || ord == NULL ) {
		fprintf( stderr, "time_mod_chol_indef: memory allocation failed\n" );
		goto cleanup;
	}

	// Concatenate output data file path and name
	strcpy( data_file, file_path );
	strcat( data_file, data_file_name );
	// Matrices of varying degrees of indefiniteness are stored in files
	mat_file_name[0] = "mat_2000_spd.dat";
	mat_file_name[1] = "mat_2000_bk24.dat";
	mat_file_name[2] = "mat_2000_bk49.dat";
	mat_file_name[3] = "mat_2000_bk98.dat";
#if !defined(DEBUG)
	mat_file_name[4] = "mat_2000_bk148.dat";
	mat_file_name[5] = "mat_2000_bk202.dat";
	mat_file_name[6] = "mat_2000_bk302.dat";
	mat_file_name[7] = "mat_2000_bk400.dat";
	mat_file_name[8] = "mat_2000_bk500.dat";
	mat_file_name[9] = "mat_2000_bk597.dat";
	mat_file_name[10] = "mat_2000_bk703.dat";
	mat_file_name[11] = "mat_2000_bk797.dat";
	mat_file_name[12] = "mat_2000_bk880.dat";
	mat_file_name[13] = "mat_2000_sym.dat";
#endif
	bdim = get_block_dim_ldlt( 0, 0, n );
	bdim_blas = get_block_dim_ldlt( 0, 1, n );
	
	for ( int i = 0; i < MATS; i++ ) {
		// Concatenate matrix file path and name
		mat_file = (char *) calloc( strlen(file_path) + 
			strlen(mat_file_name[i]) + 1, sizeof(char) );
		if ( mat_file == NULL ) {
			fprintf( stderr, 
				"time_mod_chol_indef: memory allocation failed\n" );
			goto cleanup;
		}
		strcpy( mat_file, file_path );
		strcat( mat_file, mat_file_name[i] );
		// Read n-by-n symmetric matrix from file
		read_matrix( mat_file, n, n, A );
		// The path is only needed for reading; release it every iteration
		free( mat_file );
		// Time measurements are stored in time_data[] array in column-major 
		// order
		time_data[i+col_n*MATS] = (double) n;
		time_data[i+col_bdim*MATS] = (double) bdim;
		time_data[i+col_bdimbla*MATS] = (double) bdim_blas;
		
		// Time modified Cholesky factorization
		printf("chol_ch_block(Bunch-Kaufman)\n");
		// Cheng-Higham, simple blocking, Bunch-Kaufman pivoting
		mfact_piv = chol_ch_block;
		time_data[i+col_chblkk*MATS] = 
			time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );
		// Pivot count, Bunch-Kaufman
		time_data[i+col_numpivk*MATS] = (double) count_pivot( 0, n, piv, ord );
		fprintf( stdout, "n = %d, number of (Bunch-Kaufman) pivots = %.0f\n",
			n, time_data[i+col_numpivk*MATS] );

		printf("chol_ch_block(bounded Bunch-Kaufman)\n");
		// Cheng-Higham, simple blocking, bounded Bunch-Kaufman pivoting
		mfact_piv = chol_ch_block;
		time_data[i+col_chblkb*MATS] = 
			time_mfact_pivot( mfact_piv, 'B', n, piv, ord, A );
		// Pivot count, bounded Bunch-Kaufman
		time_data[i+col_numpivb*MATS] = (double) count_pivot( 0, n, piv, ord );

		printf("chol_ch_block_blas(Bunch-Kaufman)\n");
		// Cheng-Higham, BLAS routines, Bunch-Kaufman pivoting
		mfact_piv = chol_ch_block_blas;
		time_data[i+col_chblask*MATS] = 
			time_mfact_pivot( mfact_piv, 'K', n, piv, ord, A );

		printf("chol_ch_block_blas(bounded Bunch-Kaufman)\n");
		// Cheng-Higham, BLAS routines, bounded Bunch-Kaufman pivoting
		mfact_piv = chol_ch_block_blas;
		time_data[i+col_chblasb*MATS] = 
			time_mfact_pivot( mfact_piv, 'B', n, piv, ord, A );

		printf("chol_gmw_block\n");
		// Gill-Murray-Wright, simple blocking, partial pivoting
		mfact_piv = chol_gmw_block;
		time_data[i+col_gmwblk*MATS] =
			time_mfact_pivot( mfact_piv, 'D', n, piv, ord, A );
		// Pivot count, Gill-Murray-Wright diagonal pivoting
		time_data[i+col_numpivd*MATS] = (double) count_pivot( 0, n, piv, ord );

		printf("chol_gmw_block_blas\n");
		// Gill-Murray-Wright, BLAS routines, partial pivoting
		mfact_piv = chol_gmw_block_blas;
		time_data[i+col_gmwblas*MATS] =
			time_mfact_pivot( mfact_piv, 'D', n, piv, ord, A );
	}
	write_data_file( data_file, hdr_text, MATS, FIELDS, time_data ); 

cleanup:
	// free(NULL) is a no-op, so this is safe after a partial allocation
	free( A );
	free( piv );
	free( ord );
	free( data_file );

#undef FIELDS
#undef MATS
}

/*
 * Profiles blocked algorithms implementing modified Cholesky factorization  
 * (Gill-Murray-Wright and Cheng-Higham) on symmetric matrices.  Profile data 
 * estimate the time and proportion of time spent modifying the symmetric
 * indefinite factorization.
 *
 * A fixed 2000-by-2000 test matrix is read from file once and kept in a 
 * pristine copy W.  Each algorithm/pivoting variant is then run MIN_ITER 
 * times, with A restored from W after every run.  This function only prints 
 * the table headings; the rows are presumably printed by the factorization 
 * routines themselves when built with profiling enabled (TODO confirm).
 *
 * All working storage is owned by this function and is released before 
 * returning.
 */
void profile_mod_chol( void )
{
	const int	n = 2000;
	const char	*mat_file_name = "mat_2000_bk500.dat";
	// Element count is computed in size_t to keep the byte count out of int
	const size_t	elems = (size_t) n * (size_t) n;

	char 	*mat_file;
	int 	*piv, *ord;
	double	*A, *W;

	A = (double *) malloc( elems*sizeof(double) );
	W = (double *) malloc( elems*sizeof(double) );
	piv = (int *) malloc( n*sizeof(int) );
	ord = (int *) malloc( n*sizeof(int) );
	// Buffer for concatenated file path and name (+1 for terminating NUL)
	mat_file = (char *) calloc( strlen(file_path) + strlen(mat_file_name) + 1,
		sizeof(char) );
	if ( A == NULL || W == NULL || piv == NULL || ord == NULL 
		|| mat_file == NULL ) {
		fprintf( stderr, "profile_mod_chol: memory allocation failed\n" );
		goto cleanup;
	}
	
	// Concatenate file path and name
	strcpy( mat_file, file_path );
	strcat( mat_file, mat_file_name );
		
	// Read the test matrix and keep an unfactored copy in W
	read_matrix( mat_file, n, n, A );
	copy_matrix( n, n, A, W );

	fprintf( stdout, "Time profile of modified Cholesky (seconds)\n" );
	fprintf( stdout, "Gill-Murray-Wright, simple blocking, partial pivoting\n" );
	fprintf( stdout, "%d-by-%d symmetric matrix: %s\n", n, n, mat_file_name );
	fprintf( stdout, "tm_mod_chol\ttm_mod_fact\tpct_mod_fact\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		// Pivot code 'D' (diagonal pivoting) matches how the 
		// Gill-Murray-Wright routines are invoked by the timing functions 
		// in this file
		chol_gmw_block( 'D', n, piv, ord, A );
		// Factorization overwrites A; restore it for the next run
		copy_matrix( n, n, W, A );
	}

	fprintf( stdout, "\nTime profile of modified Cholesky (seconds)\n" );
	fprintf( stdout, "Gill-Murray-Wright, BLAS routines, partial pivoting\n" );
	fprintf( stdout, "%d-by-%d symmetric matrix: %s\n", n, n, mat_file_name );
	fprintf( stdout, "tm_mod_chol\ttm_mod_fact\tpct_mod_fact\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		chol_gmw_block_blas( 'D', n, piv, ord, A );
		copy_matrix( n, n, W, A );
	}

	fprintf( stdout, "\nTime profile of modified Cholesky (seconds)\n" );
	fprintf( stdout, "Cheng-Higham, simple blocking, Bunch-Kaufman pivoting\n" );
	fprintf( stdout, "%d-by-%d symmetric matrix: %s\n", n, n, mat_file_name );
	fprintf( stdout, "tm_mod_chol\ttm_mod_fact\tpct_mod_fact\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		chol_ch_block( 'K', n, piv, ord, A );
		copy_matrix( n, n, W, A );
	}

	fprintf( stdout, "\nTime profile of modified Cholesky (seconds)\n" );
	fprintf( stdout, "Cheng-Higham, BLAS routines, Bunch-Kaufman pivoting\n" );
	fprintf( stdout, "%d-by-%d symmetric matrix: %s\n", n, n, mat_file_name );
	fprintf( stdout, "tm_mod_chol\ttm_mod_fact\tpct_mod_fact\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		chol_ch_block_blas( 'K', n, piv, ord, A );
		copy_matrix( n, n, W, A );
	}

	fprintf( stdout, "\nTime profile of modified Cholesky (seconds)\n" );
	fprintf( stdout, "Cheng-Higham, simple blocking, bounded Bunch-Kaufman pivoting\n" );
	fprintf( stdout, "%d-by-%d symmetric matrix: %s\n", n, n, mat_file_name );
	fprintf( stdout, "tm_mod_chol\ttm_mod_fact\tpct_mod_fact\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		chol_ch_block( 'B', n, piv, ord, A );
		copy_matrix( n, n, W, A );
	}

	fprintf( stdout, "\nTime profile of modified Cholesky (seconds)\n" );
	fprintf( stdout, "Cheng-Higham, BLAS routines, bounded Bunch-Kaufman pivoting\n" );
	fprintf( stdout, "%d-by-%d symmetric matrix: %s\n", n, n, mat_file_name );
	fprintf( stdout, "tm_mod_chol\ttm_mod_fact\tpct_mod_fact\n" );
	for ( int i = 0; i < MIN_ITER; i++ ) {
		chol_ch_block_blas( 'B', n, piv, ord, A );
		copy_matrix( n, n, W, A );
	}

cleanup:
	// free(NULL) is a no-op, so this is safe after a partial allocation
	free( mat_file );
	free( ord );
	free( piv );
	free( W );
	free( A );
}
