/*
 * Timing harness for measuring the performance of parallel algorithms 
 * implementing matrix multiplication (and addition), C = C + A*B, on square 
 * matrices over a range of dimensions.  Performance data are written to an
 * output file destination.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <mpi.h>

#include "timing.h"
#include "matmultp.h"

/*
 * Build-time descriptive strings.  Each macro may be supplied by the build
 * (e.g. on the compiler command line); when it is not, the fallback below is
 * used.  PROC, CORES, CLKSPEED, CACHE, COMPILER, LANGUAGE and OPTM are only
 * echoed into the header of the output data file.  DATADIR is the directory
 * that holds the input and output data files.
 */
#if !defined(PROC)
#	define PROC "unknown"
#endif
#if !defined(CORES)
#	define CORES "unknown"
#endif
#if !defined(CLKSPEED)
#	define CLKSPEED "unknown"
#endif
#if !defined(CACHE)
#	define CACHE "unknown"
#endif
#if !defined(COMPILER)
#	define COMPILER "unknown"
#endif
#if !defined(LANGUAGE)
#	define LANGUAGE "default"
#endif
#if !defined(OPTM)
#	define OPTM "default"
#endif
#if !defined(DATADIR)
#	define DATADIR "."	// Current directory ./
#endif

// Timing loop controls: the iteration count starts at MIN_ITER and is doubled
// until one timed batch takes at least MIN_SECS seconds.
#define MIN_ITER 8		// Minimum number of iterations of algorithm
#define MIN_SECS 2.0	// Minimum elapsed time (seconds) for execution of algorithm

// Forward declarations of the file-local helpers defined below.  They are
// declared static here, so the later definitions have internal linkage.
static void write_data_file( const char *file, const char *hdr_text, 
	int rows, int cols, const double *data, struct mpi_grid *grid );
static void read_data_file( const char *file, int rows, int cols, double *data );
static double time_matmultp( void (*matmultp)(int n, const double *A, const double *B,
	double *C, struct mpi_grid *grid), int n, const double *A, const double *B, 
	double *C, struct mpi_grid *grid );
static void time_parallel_matrix_multiply( struct mpi_grid *grid  );

// Directory prefix (DATADIR followed by '/') built in main() and prepended to
// the input and output data file names.
static char				*file_path;
// Process grid description filled in by setup_mpi_grid() in main().
static struct mpi_grid	grid;

/*
 * Program entry point.  Initializes MPI, sets up the process grid, builds the
 * directory prefix used for the input and output data files, runs the timing
 * experiments and finalizes MPI.
 *
 * Returns 0 on success.  Exits with status -1 if the path buffer cannot be
 * allocated.
 */
int main( int argc, char **argv )
{
	size_t	path_len;

	MPI_Init( &argc, &argv );
	//Establish Cartesian topology for collective communication
	setup_mpi_grid( &grid );

	// Specify file path for input and output data files: DATADIR + "/"
	path_len = strlen( DATADIR ) + 2;	// '/' and terminating NUL
	file_path = calloc( path_len, sizeof(char) );
	if ( file_path == NULL ) {
		fprintf( stderr, "Error allocating memory for data file path.\n" );
		exit( -1 );
	}
	snprintf( file_path, path_len, "%s/", DATADIR );

	time_parallel_matrix_multiply( &grid );

	// The prefix is only needed while the experiments run
	free( file_path );
	file_path = NULL;

	MPI_Finalize();
	return 0;
}

/******************************************************************************/

/*
 * Writes header text and experimental data to the file specified in the 
 * argument list.
 *
 *   file      Path of the output file; it is created or truncated.
 *   hdr_text  Caller-supplied header lines, written after the build and
 *             platform description lines.
 *   rows      Number of rows of the data matrix.
 *   cols      Number of columns of the data matrix.
 *   data      rows-by-cols matrix stored in column-major order.
 *   grid      Process grid; only grid->p is read, for the header.
 *
 * Each matrix row is written on one line, with a tab after every value.
 * Exits with status -1 if the file cannot be opened or if writing fails.
 */
void write_data_file( const char *file, const char *hdr_text, 
	int rows, int cols, const double *data, struct mpi_grid *grid )
{
	FILE	*fp;
	int		write_failed;

	if ( (fp = fopen( file, "w" )) == NULL ) {
		fprintf( stderr, "Error opening file %s.\n", file );
		exit( -1 );
	}
	// Write header text
	fprintf( fp, "# Processor:\t%s\n", PROC);
	fprintf( fp, "# Cores:\t%s\n", CORES);
	fprintf( fp, "# Clock speed:\t%s\n", CLKSPEED);
	fprintf( fp, "# Cache:\t%s\n", CACHE);
	fprintf( fp, "# \n" );
	fprintf( fp, "# C compiler:\t%s\n", COMPILER );
	fprintf( fp, "# C language standard:\t%s\n", LANGUAGE );
	fprintf( fp, "# Optimization level and options:\t%s\n", OPTM );
	// NOTE(review): %Lg requires timer_resolution() to return long double --
	// confirm against timing.h
	fprintf( fp, "# Clock resolution:\t%Lg\n", timer_resolution() );
	fprintf( fp, "# Number of processors:\t%d\n", grid->p);
	fprintf( fp, "# \n" );
	fprintf( fp, "%s\n", hdr_text );
	// Write experimental data: element (i,j) is stored at data[j*rows+i]
	for ( int i = 0; i < rows; i++ ) {
		for ( int j = 0; j < cols; j++ ) {
			fprintf( fp, "%g\t", *(data+j*rows+i) );
		}
		fprintf( fp, "\n" );
	}
	// Buffered output is flushed by fclose, so a failure may only surface
	// there; check both the stream error flag and the close result.
	write_failed = ferror( fp );
	if ( fclose( fp ) != 0 ) {
		write_failed = 1;
	}
	if ( write_failed ) {
		fprintf( stderr, "Error writing file %s.\n", file );
		exit( -1 );
	}
}

/*
 * Advances the stream past white space and '#' comments (from '#' to the end
 * of the line), leaving it positioned at the next data token or at EOF.
 */
static void skip_comment_lines( FILE *fp )
{
	int ch;

	while ( (ch = fgetc( fp )) != EOF ) {
		if ( ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
				|| ch == '\v' || ch == '\f' ) {
			continue;
		}
		if ( ch != '#' ) {
			ungetc( ch, fp );	// First character of a data token
			break;
		}
		// Comment: discard the remainder of the line
		while ( (ch = fgetc( fp )) != EOF && ch != '\n' ) {
			;
		}
	}
}

/*
 * Reads matrix data in specified file into an array passed in argument list.
 *
 *   file  Path of the text file to read.
 *   rows  Number of rows to read.
 *   cols  Number of columns to read.
 *   data  Destination array of at least rows*cols doubles; element (i,j) is
 *         stored at data[j*rows+i] (column-major order).
 *
 * The file holds white-space separated numbers, one matrix row after the
 * other.  Text from '#' to the end of a line is ignored, so files carrying
 * the same style of '#' header that write_data_file() emits can be read.
 * Exits with status -1 if the file cannot be opened or does not contain
 * rows*cols readable values.
 */
void read_data_file( const char *file, int rows, int cols, double *data )
{
	FILE *fp;

	if ( (fp = fopen(file, "r")) == NULL ) {
		fprintf( stderr, "Error opening file %s.\n", file );
		exit(-1);
	}
	for ( int i = 0; i < rows; i++ ) {
		for ( int j = 0; j < cols; j++ ) {
			skip_comment_lines( fp );
			// A failed conversion would leave the element uninitialized
			if ( fscanf( fp, "%lg", (data+j*rows+i) ) != 1 ) {
				fprintf( stderr, "Error reading file %s: no value for "
					"row %d, column %d.\n", file, i, j );
				fclose( fp );
				exit(-1);
			}
		}
	}
	fclose(fp);	
}

/* 
 * Measures the average time (number of seconds) to perform parallel matrix 
 * multiplication (and addition), C = C + A*B, on n-by-n matrices.
 *
 *   matmultp  Multiplication routine to time; it is called as
 *             matmultp( n, A, B, C, grid ).
 *   n         Matrix dimension.
 *   A, B, C   Matrices handed to matmultp.  On rank 0, C is saved before
 *             timing and restored after every timed batch.
 *   grid      Process grid; grid->rank selects the rank that saves and
 *             restores C.
 *
 * A batch of num_iter calls is timed; the batch size starts at MIN_ITER and
 * is doubled until one batch takes at least MIN_SECS seconds.  Returns the
 * elapsed time of the last batch divided by its number of calls, as measured
 * on the calling rank.  Exits with status -1 if the backup copy of C cannot
 * be allocated.
 *
 * NOTE(review): every rank evaluates the loop condition with its own clock.
 * If matmultp communicates collectively and ranks disagree on whether
 * MIN_SECS was reached, they would run different numbers of batches --
 * confirm whether the stop decision needs to be shared between ranks.
 */
double time_matmultp( void (*matmultp)(int n, const double *A, const double *B,
	double *C, struct mpi_grid *grid), int n, const double *A, const double *B, 
	double *C, struct mpi_grid *grid )
{
	struct		timespec sta, end;
	long int	num_iter = MIN_ITER;
	double		secs = -1.0;
	double		*M = NULL;		// Backup of C; only allocated on rank 0

	// Save copy of matrix C before performing matrix multiplication
	if ( grid->rank == 0 ) {
		// Size computed in size_t so that n*n cannot overflow int
		M = malloc( (size_t) n * n * sizeof(double) );
		if ( M == NULL ) {
			fprintf( stderr, "Error allocating memory for %d-by-%d matrix.\n",
				n, n );
			exit( -1 );
		}
		copy_matrix( n, n, C, M );
	}

	while ( secs < MIN_SECS ) {
		get_time( &sta );
		for ( long int i = 0; i < num_iter; i++ ) {
			// Time the routine that was passed in, not a fixed one
			matmultp( n, A, B, C, grid );
		}
		get_time( &end );
		secs = timespec_diff( sta, end );
		num_iter *= 2;
		if ( grid->rank == 0 ) {
			copy_matrix( n, n, M, C );		// Reset matrix C to initial value
		}
	}
	free( M );		// No-op on ranks that did not allocate
	// On exiting the while loop, the number of iterations (num_iter) has been
	// doubled once more than the batch that was timed, so it must be halved
	return secs / (num_iter/2.0);
}

/*
 * Measures the performance of parallel matrix multiplication (and addition), 
 * C = C + A*B, on square matrices over a range of dimensions.
 *
 *   grid  Process grid; grid->rank selects the rank that owns the matrices
 *         and writes the results, grid->p is the number of processes.
 *
 * Every rank reads the serial results file (file_path + "mmult.dat"), a
 * SIZES-by-IN_FIELDS table whose column 0 holds the matrix dimensions and
 * whose column COLINSER holds the serial Mflop/sec figures.  For each
 * dimension, rank 0 creates random matrices, all ranks take part in the timed
 * multiplication, and rank 0 records dimension, serial rate, parallel rate,
 * speed-up (parallel / serial) and efficiency (speed-up / grid->p).  Rank 0
 * finally writes the SIZES-by-OUT_FIELDS table to
 * file_path + "mmult_np<p>.dat".
 *
 * Exits with status -1 if memory cannot be allocated.
 */
void time_parallel_matrix_multiply( struct mpi_grid *grid  )
{
#define IN_FIELDS 11		// Number of input data fields
#define OUT_FIELDS 5		// Number of output data fields
	const char	*out_file_prefix = "mmult_",
				*out_file_ext = ".dat",
				*in_file_name = "mmult.dat";
	const char	*hdr_text =
"# N:		Matrix dimension, n-by-n\n"
"# SERIAL:	Mflop/sec for serial matrix multiplication, simple blocking\n"
"# PARA:	Mflop/sec for parallel matrix multiplication, simple blocking\n"
"# SPEEDUP:	Speed-up\n"
"# EFFNCY:	Efficiency\n"
"# \n"
"# N\tSERIAL\tPARA\tSPEEDUP\tEFFNCY";
	const int		col_n = 0,
					col_serial = 1,
					col_para = 2,
					col_speedup = 3,
					col_effncy = 4,
					col_in_serial = COLINSER;
	const double	alpha = 10.0;	// Scaling factor for random matrix 

	char 	procs[16];		// "np" + any int value + terminating NUL
	char	*out_data_file, *in_data_file;
	size_t	out_len, in_len;
	int		n;
	double	mflops, mflop_sec;
	double	perf_data[OUT_FIELDS*SIZES],
			perf_serial[IN_FIELDS*SIZES];
	double	*A = NULL, *B = NULL, *C = NULL;	// Only allocated on rank 0
	void 	(*matmultp)( int n, const double *A, const double *B, double *C,
				struct mpi_grid *grid );

	// Concatenate file path and names
	snprintf( procs, sizeof procs, "np%d", grid->p );
	out_len = strlen(file_path) + strlen(out_file_prefix)
		+ strlen(procs) + strlen(out_file_ext) + 1;
	in_len = strlen(file_path) + strlen(in_file_name) + 1;
	out_data_file = calloc( out_len, sizeof(char) );
	in_data_file = calloc( in_len, sizeof(char) );
	if ( out_data_file == NULL || in_data_file == NULL ) {
		fprintf( stderr, "Error allocating memory for data file names.\n" );
		exit( -1 );
	}
	snprintf( out_data_file, out_len, "%s%s%s%s",
		file_path, out_file_prefix, procs, out_file_ext );
	snprintf( in_data_file, in_len, "%s%s", file_path, in_file_name );
	// Read input data file (performance of serial matrix multiplication)
	read_data_file( in_data_file, SIZES, IN_FIELDS, perf_serial );

	for ( int i = 0; i < SIZES; i++ ) {
		n = (int) perf_serial[i+col_n*SIZES];
		// Matrix multiplication takes 2*n^3 floating point operations
		mflops = 1.0e-06 * 2.0 * n * n * n;
		if ( grid->rank == 0 ) {
			// Size computed in size_t so that n*n cannot overflow int
			size_t bytes = (size_t) n * n * sizeof(double);

			fprintf( stdout, "n = %d\n", n );
			// Allocate memory for matrices
			A = malloc( bytes );
			B = malloc( bytes );
			C = malloc( bytes );
			if ( A == NULL || B == NULL || C == NULL ) {
				fprintf( stderr, 
					"Error allocating memory for %d-by-%d matrices.\n", n, n );
				exit( -1 );
			}
			// Create random matrices A, B and C
			create_random_matrix( alpha, n, n, A );
			create_random_matrix( alpha, n, n, B );
			create_random_matrix( alpha, n, n, C );
		}
		matmultp = parallel_matrix_multiply;
		mflop_sec = mflops / time_matmultp( matmultp, n, A, B, C, grid );
		if ( grid->rank == 0 ) {
			perf_data[i+col_n*SIZES] = perf_serial[i+col_n*SIZES];
			perf_data[i+col_serial*SIZES] = perf_serial[i+col_in_serial*SIZES];
			perf_data[i+col_para*SIZES] = mflop_sec;
			perf_data[i+col_speedup*SIZES] = 
				mflop_sec / perf_data[i+col_serial*SIZES];
			perf_data[i+col_effncy*SIZES] = 
				perf_data[i+col_speedup*SIZES] / grid->p;
			free( A );
			free( B );
			free( C );
			A = B = C = NULL;	// Do not carry stale pointers into next size
		}
	}

	if ( grid->rank == 0 ) {
		write_data_file( out_data_file, hdr_text, 
				SIZES, OUT_FIELDS, perf_data, grid );
	}
	free( out_data_file );
	free( in_data_file );
#undef IN_FIELDS
#undef OUT_FIELDS
}


