/*
 * Timing harness for measuring the performance of basic and "optimized" 
 * algorithms implementing matrix multiplication (and addition), C = C + A*B, 
 * on square matrices over a range of dimensions.  Performance of matrix
 * multiplication algorithms is also measured for different compiler
 * optimization levels and options.  Performance data are written to an output
 * file destination.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

//#include <sys/types.h>
//#include <sys/resource.h>
//#include <unistd.h>

#include "matmult.h"
#include "matcom.h"
#include "timing.h"

// Default descriptions of the host platform, toolchain and output directory.
// Each macro may be predefined by the build (e.g. on the compiler command
// line); the fallbacks below apply only when it is not.  All but DATADIR are
// echoed into the header of every output data file.
#ifndef PROC
#	define PROC "unknown"
#endif
#ifndef CORES
#	define CORES "unknown"
#endif
#ifndef CLKSPEED
#	define CLKSPEED "unknown"
#endif
#ifndef CACHE
#	define CACHE "unknown"
#endif
#ifndef COMPILER
#	define COMPILER "unknown"
#endif
#ifndef LANGUAGE
#	define LANGUAGE "default"
#endif
#ifndef OPTM
#	define OPTM "default"
#endif
#ifndef DATADIR
#	define DATADIR "."	// Current directory ./
#endif

// Timing thresholds and the list of matrix dimensions to benchmark.  A DEBUG
// build uses lower thresholds and only the four smallest dimensions.
//   MIN_ITER  Minimum number of iterations of an algorithm per measurement
//   MIN_SECS  Minimum elapsed time (seconds) per measurement
//   mat_size  Dimensions n of the square n-by-n matrices that are timed
#ifdef DEBUG
#	define MIN_ITER 4
#	define MIN_SECS 1.0
	const int mat_size[] = { 65, 130, 195, 254 };
#else
#	define MIN_ITER 8
#	define MIN_SECS 2.0
	const int mat_size[] = {
		  65,  130,  195,  254,  258,
		 321,  387,  450,  508,  516,
		 579,  642,  707,  764,  772,
		 833,  899,  963, 1021, 1027
	};
#endif
// Number of entries in mat_size[] (type size_t)
#define SIZES (sizeof(mat_size) / sizeof(mat_size[0]))

// Forward declarations of the file-local routines defined further down.

// Writes the file header and a column-major data matrix to an output file
static void write_data_file(
	const char *file,
	const char *hdr_text,
	int rows,
	int cols,
	const double *data );

// Returns the average time in seconds of one call to mmult on n-by-n matrices
static double time_mmult(
	void (*mmult)(int n, const double *A, const double *B, double *C),
	int n,
	const double *A,
	const double *B,
	double *C );

// Experiment drivers; main() selects them at compile time
static void time_mmult_algo( void );
static void time_compiler_optm( void );

// Directory prefix (DATADIR followed by '/') for output data files; built by
// main() before any experiment runs
static char		*file_path;

/*
 * Program entry point.  Builds the output directory prefix (DATADIR followed
 * by '/') in file_path, runs the experiments selected at compile time
 * (MULTALGO, and CCOPTMDP or CCOPTMSA), then releases the prefix.
 *
 * Returns 0 on success, or EXIT_FAILURE if the prefix cannot be allocated.
 */
int main( void )
{
	// Specify file path for output data files; the two extra characters hold
	// the '/' separator and the string terminator
	file_path = calloc( strlen(DATADIR) + 2, sizeof(char) );
	if ( file_path == NULL ) {
		fprintf( stderr, "Error allocating memory for output file path.\n" );
		return EXIT_FAILURE;
	}
	strcpy( file_path, DATADIR );
	strcat( file_path, "/" );

#if defined(MULTALGO)
	time_mmult_algo();
#endif

#if defined(CCOPTMDP) || defined(CCOPTMSA)
	time_compiler_optm();
#endif

	// The experiments are finished, so the prefix is no longer needed
	free( file_path );
	file_path = NULL;

	return 0;
}

/******************************************************************************/

/*
 * Writes header text and experimental data to the file specified in the 
 * argument list.
 *
 *   file      Path of the output file; it is created, or truncated if it
 *             already exists.
 *   hdr_text  Caller-supplied header text, written after the platform and
 *             build description lines and followed by a newline.
 *   rows      Number of rows of the data matrix (one output line per row).
 *   cols      Number of columns of the data matrix.
 *   data      rows-by-cols matrix of experimental data stored in column-major
 *             order.
 *
 * Every value is written with "%g" and followed by a tab, so each output line
 * ends with a tab before the newline.
 *
 * Terminates the program with exit( -1 ) if the file cannot be opened.  A
 * failure while writing or closing the file is reported on stderr and the
 * function returns normally.
 */
void write_data_file( const char *file, const char *hdr_text, 
	int rows, int cols, const double *data )
{
	FILE *fp;

	if ( (fp = fopen( file, "w" )) == NULL ) {
		fprintf( stderr, "Error opening file %s.\n", file );
		exit( -1 );
	}
	// Write header text: platform description
	fprintf( fp, "# Processor:\t%s\n", PROC);
	fprintf( fp, "# Cores:\t%s\n", CORES);
	fprintf( fp, "# Clock speed:\t%s\n", CLKSPEED);
	fprintf( fp, "# Cache:\t%s\n", CACHE);
	fprintf( fp, "# \n" );
	// Toolchain description
	fprintf( fp, "# C compiler:\t%s\n", COMPILER );
	fprintf( fp, "# C language standard:\t%s\n", LANGUAGE );
	fprintf( fp, "# Optimization level and options:\t%s\n", OPTM );
	// NOTE(review): "%Lg" requires a long double argument -- confirm that
	// timer_resolution() (declared outside this file) returns long double
	fprintf( fp, "# Clock resolution:\t%Lg\n", timer_resolution() );
	fprintf( fp, "# \n" );
	// Experiment-specific description, selected at compile time
#if defined(MULTALGO)
	fprintf( fp, "# Sub-block dimension (kernel multiplication):\t%d\n", KDIM );
	fprintf( fp, "# Depth of loop unrolling:\t%d\n", UNROLL_DEPTH );
	fprintf( fp, "# Depth of software pipelining:\t%d\n", PIPE_DEPTH );
#elif defined(CCOPTMDP)
	fprintf( fp, "# Dot product (ijk indexing) algorithm\n" );
#elif defined(CCOPTMSA)
	fprintf( fp, "# Scalar alpha x plus y (jki indexing) algorithm\n" );
#endif
	fprintf( fp, "# \n" );
	fprintf( fp, "%s\n", hdr_text );
	// Write experimental data, one matrix row per output line
	for ( int i = 0; i < rows; i++ ) {
		for ( int j = 0; j < cols; j++ ) {
			// Element (i,j) of a column-major matrix is at offset j*rows+i
			fprintf( fp, "%g\t", data[(size_t) j * rows + i] );
		}
		fprintf( fp, "\n" );
	}
	// Output is buffered, so a write error may only surface when the stream
	// is flushed: check both the error indicator and the result of fclose()
	int write_failed = ferror( fp );
	if ( fclose( fp ) != 0 || write_failed ) {
		fprintf( stderr, "Error writing file %s.\n", file );
	}
}

/* 
 * Measures the average time (number of seconds) to perform matrix 
 * multiplication (and addition), C = C + A*B, on n-by-n matrices. Matrix  
 * multiplication is performed iteratively for at least the minimum number of 
 * iterations (MIN_ITER), and until the minimum time in seconds (MIN_SECS) has
 * elapsed.
 *
 *   mmult    Matrix multiplication routine to be timed.
 *   n        Dimension of the square matrices.
 *   A, B     Input matrices, passed through to mmult unchanged.
 *   C        Accumulator matrix; it is restored to its initial contents after
 *            every call to mmult, so it holds its original values on return.
 *
 * Returns the elapsed time of the last batch of iterations divided by the 
 * number of iterations in that batch.  The timed interval includes the copy
 * that resets C after each multiplication.
 *
 * Terminates the program if the scratch copy of C cannot be allocated.
 */
double time_mmult( void (*mmult)(int n, const double *A, const double *B,
	double *C), int n, const double *A, const double *B, double *C )
{
	struct		timespec sta, end;
	long int	num_iter = MIN_ITER;
	double		secs = -1.0;	// Negative so the loop runs at least once
	double		*M;
	// Element count computed in size_t to avoid int overflow for large n
	const size_t	elems = (size_t) n * (size_t) n;

	// Save copy of matrix C before performing matrix multiplication
	M = malloc( elems * sizeof *M );
	if ( M == NULL ) {
		fprintf( stderr, "Error allocating %d-by-%d scratch matrix.\n", n, n );
		exit( EXIT_FAILURE );
	}
	copy_matrix( n, n, C, M );

	// Time batches of increasing size until one batch lasts long enough
	while ( secs < MIN_SECS ) {
		get_time( &sta );
		for ( long int i = 0; i < num_iter; i++ ) {
			mmult( n, A, B, C );
			copy_matrix( n, n, M, C );	// Reset matrix C to initial value
		}
		get_time( &end );
		secs = timespec_diff( sta, end );
		num_iter *= 2;
	}
	free( M );
	// On exiting the while loop, the number of iterations (num_iter) has been
	// doubled after the final batch was timed, so num_iter must be halved to
	// recover the size of that batch
	return secs / (num_iter/2.0);
}

/*
 * Measures the performance (Mflops/sec) of unblocked and blocked algorithms 
 * performing matrix multiplication (and addition), C = C + A*B, on square
 * matrices over a range of dimensions.
 */
void time_mmult_algo( void )
{
#define FIELDS 11	// Number of output data fields
	const char	*data_file_name = "mmult.dat";
	const char	*hdr_text =
"# N:		Matrix dimension, n-by-n\n"
"# BDIM:	Block dimension used by blocking algorithms\n"
"#			Mflop/sec for matrix multiplication algorithms\n"
"# DOTPROD:	Dot product, ijk indexing\n"
"# SAXPY:	Scalar alpha x plus y, jki indexing\n"
"# UNROLL:	Loop unrolling, dot product\n"
"# PIPELN:	Software pipelining, SAXPY\n"
"# BLKSIMP:	Simple blocking\n"
"# BLKCTG:	Contiguous blocking\n"
"# BLKRCR:	Recursive contiguous blocking\n"
"# RCRRECT:	Recursive contiguous blocking, variable sub-block sizes\n"
"# BLAS:	BLAS routine DGEMM\n"
"# \n"
"# N\tBDIM\tDOTPROD\tSAXPY\tUNROLL\tPIPELN\tBLKSIMP\tBLKCTG\tBLKRCR"
"\tRCRRECT\tBLAS";
	const int		col_n = 0,
					col_bdim = 1,
					col_dotprod = 2,
					col_saxpy = 3,
					col_unroll = 4,
					col_pipeln = 5,
					col_blksimp = 6,
					col_blkctg = 7,
					col_blkrcr = 8,
					col_rcrrect = 9,
					col_blas = 10;
	const double 	alpha = 10.0;	// Scaling factor for random matrix
	
	char	*data_file;
	int		n, bdim;
	double	mflops;
	double	perf_data[FIELDS*SIZES];	
	double	*A, *B, *C;
	void 	(*mmult)( int n, const double *A, const double *B, double *C );

	// Concatenate file path and name
	data_file = (char *) calloc( strlen(file_path) + strlen(data_file_name) + 1,
		sizeof(char) );
	strcpy( data_file, file_path );
	strcat( data_file, data_file_name );

	for ( int i = 0; i < SIZES; i++ ) {
		n = mat_size[i];
		bdim = get_block_dim_mmult( n );
		fprintf( stdout, "n = %d, bdim = %d\n", n, bdim );
		// Matrix multiplication takes 2*n^3 floating point operations
		mflops = 1.0e-06 * 2.0 * n * n * n;
		// Create random n-by-n matrices
		A = (double *) malloc( n*n*sizeof(double) );
		B = (double *) malloc( n*n*sizeof(double) );
		C = (double *) malloc( n*n*sizeof(double) );
		create_random_matrix( alpha, n, n, A );
		create_random_matrix( alpha, n, n, B );
		create_random_matrix( alpha, n, n, C );
		
		// Performance data is stored in perf_data[] array in column-major order
		perf_data[i+col_n*SIZES] = (double) n;
		perf_data[i+col_bdim*SIZES] = (double) bdim;
printf("mmult_outer_product\n");	
		// Measure performance of matrix multiplication algorithms:
		// Dot product, ijk indexing
		mmult = mmult_dot_product;
		perf_data[i+col_dotprod*SIZES] = 0.0;//mflops / time_mmult( mmult, n, A, B, C );
printf("mmult_saxpy\n");	
		// Scalar alpha x plus y, jki indexing
		mmult = mmult_saxpy;
		perf_data[i+col_saxpy*SIZES] = 0.0;//mflops / time_mmult( mmult, n, A, B, C );
printf("mmult_unroll\n");			
		// Loop unrolling, dot product
		mmult = mmult_unroll;
		perf_data[i+col_unroll*SIZES] = 0.0;//mflops / time_mmult( mmult, n, A, B, C );
printf("mmult_pipeline\n");	
		// Software pipelining, SAXPY
		mmult = mmult_pipeline;
		perf_data[i+col_pipeln*SIZES] = 0.0;//mflops / time_mmult( mmult, n, A, B, C );
printf("mmult_block\n");	
		// Simple blocking
		mmult = mmult_block;
		perf_data[i+col_blksimp*SIZES] = mflops / time_mmult( mmult, n, A, B, C );
printf("mmult_contig_block\n");	
		// Blocking, contiguous block storage
		mmult = mmult_contig_block;
		perf_data[i+col_blkctg*SIZES] = 0.0;//mflops / time_mmult( mmult, n, A, B, C );
printf("mmult_recur_block\n");	
		// Blocking, recursive contiguous blocking
		mmult = mmult_recur_block;
		perf_data[i+col_blkrcr*SIZES] = mflops / time_mmult( mmult, n, A, B, C );
printf("mmult_rect_recur_block\n");	
		// Blocking, recursive contiguous blocking, variable looping
		mmult = mmult_rect_recur_block;
		perf_data[i+col_rcrrect*SIZES] = 0.0;//mflops / time_mmult( mmult, n, A, B, C );
printf("mmult_blas\n");	
		// BLAS routine DGEMM
		mmult = mmult_blas;
		perf_data[i+col_blas*SIZES] = mflops / time_mmult( mmult, n, A, B, C );

		free( A );
		free( B );
		free( C );
	}

	write_data_file( data_file, hdr_text, SIZES, FIELDS, perf_data ); 

#undef FIELDS
}

/*
 * Measures the performance (Mflops/sec) of matrix multiplication (and addition), 
 * C = C + A*B, on square matrices over the range of dimensions listed in 
 * mat_size[].  Performance is measured for different compiler optimization 
 * levels and options: the build defines OPTM, and this routine times a single
 * algorithm (dot product when CCOPTMDP is defined, SAXPY otherwise).
 *
 * The output file name is file_path, followed by an algorithm-specific prefix,
 * the OPTM string with all spaces and hyphens removed, and ".dat".
 *
 * Progress is reported on stdout.  Terminates the program if memory cannot
 * be allocated.
 */
void time_compiler_optm( void )
{
	// Output data fields; perf_data[] holds one column per field
	enum {
		COL_N,
		COL_PERF,
		NUM_FIELDS	// Number of output data fields
	};
	const char		*data_file_suffix = ".dat",
					*delim = " -";
	const char		*hdr_text =
"# N:		Matrix dimension, N-by-N\n"
"# PERF:	Mflop/sec for matrix multiplication algorithm\n"
"# \n"
"# N\tPERF";
	const double 	alpha = 10.0;	// Scaling factor for random matrix

	char	*data_file, *optm_lvl, *optm_str, *token;
	double	perf_data[NUM_FIELDS*SIZES];
	void 	(*mmult)( int n, const double *A, const double *B, double *C );

#if defined(CCOPTMDP)
	const char *data_file_prefix = "mmult_dot_ccoptm_";
	mmult = mmult_dot_product;
#else 
	// CCOPTMSA, and the fallback when neither macro is defined
	const char *data_file_prefix = "mmult_saxpy_ccoptm_";
	mmult = mmult_saxpy;
#endif

	// Format optimization level and options as a string.  strtok() modifies
	// its argument, so it works on a private copy of OPTM; the tokens are
	// joined without separators and can never be longer than OPTM itself.
	optm_lvl = calloc( strlen(OPTM) + 1, sizeof(char) );
	optm_str = calloc( strlen(OPTM) + 1, sizeof(char) );
	if ( optm_lvl == NULL || optm_str == NULL ) {
		fprintf( stderr, "Error allocating memory for optimization string.\n" );
		exit( EXIT_FAILURE );
	}
	strcpy( optm_lvl, OPTM );
	if ( (token = strtok( optm_lvl, delim )) != NULL ) {
		strcpy( optm_str, token );
		while ( (token = strtok( NULL, delim )) != NULL ) {		
			strcat( optm_str, token );
		}
	}
	free( optm_lvl );

	// Concatenate file path and names
	data_file = calloc( strlen(file_path) + strlen(data_file_prefix) +
		strlen(optm_str) + strlen(data_file_suffix) + 1, sizeof(char) );
	if ( data_file == NULL ) {
		fprintf( stderr, "Error allocating memory for data file name.\n" );
		exit( EXIT_FAILURE );
	}
	strcpy( data_file, file_path );
	strcat( data_file, data_file_prefix );
	strcat( data_file, optm_str );
	strcat( data_file, data_file_suffix );
	free( optm_str );
	
	for ( size_t i = 0; i < SIZES; i++ ) {
		const int		n = mat_size[i];
		// Element count computed in size_t to avoid int overflow for large n
		const size_t	elems = (size_t) n * (size_t) n;
		// Matrix multiplication takes 2*n^3 floating point operations; this
		// is the operation count in Mflop, divided by seconds further down
		const double	mflop = 1.0e-06 * 2.0 * n * n * n;

		fprintf( stdout, "n = %d\n", n );

		// Create random n-by-n matrices
		double *A = malloc( elems * sizeof *A );
		double *B = malloc( elems * sizeof *B );
		double *C = malloc( elems * sizeof *C );
		if ( A == NULL || B == NULL || C == NULL ) {
			fprintf( stderr, "Error allocating %d-by-%d matrices.\n", n, n );
			exit( EXIT_FAILURE );
		}
		create_random_matrix( alpha, n, n, A );
		create_random_matrix( alpha, n, n, B );
		create_random_matrix( alpha, n, n, C );

		// Performance data is stored in perf_data[] array in column-major order
		perf_data[i + COL_N*SIZES] = n;
		perf_data[i + COL_PERF*SIZES] = mflop / time_mmult( mmult, n, A, B, C );

		free( A );
		free( B );
		free( C );
	}

	write_data_file( data_file, hdr_text, SIZES, NUM_FIELDS, perf_data ); 

	free( data_file );
}
