/*
 * Testing harness for parallel algorithms implementing matrix multiplication 
 * (and addition), C = C + A*B.   The number of tests and error count are
 * accumulated through a single execution of the mmultstp program, and all test 
 * results are written to an output file destination (terminal).
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <mpi.h>

#include "matmultp.h"

// Test bookkeeping and error-measurement helpers
static void test_assert( double eps, double tol, const char *test_name );
static void error_matrix_comp_frob( double *eps, double *err, int m, int n, 
	const double *E, const double *F );
static void error_matrix_comp_l1( double *eps, double *err, int m, int n, 
	const double *E, const double *F );
// Test matrix initialization
static void init_test_matrices_6( 
	double *A, double *B, double *C, double *C_ans );
static void init_test_matrices( int n, 
	double *A, double *B, double *C, double *C_ans );
// Test drivers for the serial and parallel multiplication routines
static void test_serial_matrix_multiply( int n, 
	const double *A, const double *B, double *C, const double *C_ans,
	double tol, const char *test_name );
static void test_parallel_matrix_multiply( int n, 
	const double *A, const double *B, double *C, const double *C_ans, 
	struct mpi_grid *grid, double tol, const char *test_name );

// Harness state shared by every test in this file
static int				tests = 0,			// Test count
						errs = 0;			// Error count
static FILE				*fp;				// Stream that receives test results (main sets it to stdout)
static struct mpi_grid	grid;				// Process grid; filled in by setup_mpi_grid() in main

int main( int argc, char **argv )
{
	const double	tol = 1e-12;	// Error tolerance

	char	test_name[80];
	int 	n;
	double	*A, *B, *C, *C_ans;

	MPI_Init( &argc, &argv );
	//Establish Cartesian topology for collective communication
	setup_mpi_grid( &grid );

	fp = stdout;

	n = 6;
	if ( grid.rank == 0 ) {
		// Allocate memory for matrices
		A = (double *) malloc( n*n*sizeof(double) );
		B = (double *) malloc( n*n*sizeof(double) );
		C = (double *) malloc( n*n*sizeof(double) );
		C_ans = (double *) malloc( n*n*sizeof(double) );		
		// Test serial matrix multiply algorithm on 6x6 test matrices
		sprintf( test_name, 
			"Serial matrix multiply algorithm, %dx%d matrix", n, n );
		init_test_matrices_6( A, B, C, C_ans );
		test_serial_matrix_multiply( n, A, B, C, C_ans, tol, test_name );

		// Re-initialize matrices for parallel matrix multiply test
		sprintf( test_name, 
			"Fox algorithm on %d parallel processors, %dx%d matrix", 
			grid.p, n, n );
		init_test_matrices_6( A, B, C, C_ans );
	}
	// Matrices are partitioned into p full blocks for parallel processing on 
	// p processors 
	test_parallel_matrix_multiply( n, A, B, C, C_ans, &grid, tol, test_name );
	if ( grid.rank == 0 ) {	
		free( A );
		free( B );
		free( C );
		free( C_ans );
	}

	// Matrices are partitioned into (q-1)^2 full blocks and (2q-1) fringe 
	// blocks for parallel processing on p processors, p = q*q  
	n = 2000;
	if ( grid.rank == 0 ) {
		// Allocate memory for matrices
		A = (double *) malloc( n*n*sizeof(double) );
		B = (double *) malloc( n*n*sizeof(double) );
		C = (double *) malloc( n*n*sizeof(double) );
		C_ans = (double *) malloc( n*n*sizeof(double) );
		init_test_matrices( n, A, B, C, C_ans );
		sprintf( test_name, 
			"Fox algorithm on %d parallel processors, %dx%d matrix", 
			grid.p, n, n );
	}
	test_parallel_matrix_multiply(n, A, B, C, C_ans, &grid, tol, test_name);
	if ( grid.rank == 0 ) {	
		free( A );
		free( B );
		free( C );
		free( C_ans );
	}

	if ( grid.rank == 0 ) {
		if ( errs == 0 ) {
			fprintf( fp, "\nPassed all %d tests.\n", tests );
		} else {
			fprintf( fp, "\nTotal of %d error(s) encountered in %d tests.\n", 
				errs, tests );
		}
	}
	MPI_Finalize();
	return 0;
}

/******************************************************************************/

/*
 * Records the outcome of one test and reports it on the results stream.
 * The test passes when the measured relative error eps does not exceed the
 * tolerance tol; otherwise it fails and the error count is incremented.  
 * The test count is incremented in either case.
 *
 *   eps        measured relative error
 *   tol        largest acceptable relative error
 *   test_name  description printed alongside the verdict
 */
void test_assert( double eps, double tol, const char *test_name )
{
	const int	within_tol = ( eps <= tol );

	tests++;
	if ( !within_tol ) {
		errs++;
		fprintf( fp, "FAILED: %s\n(eps=%e > tol=%e)\n", test_name, eps, tol );
		return;
	}
	fprintf( fp, "PASSED: %s\n(eps=%e <= tol=%e)\n", test_name, eps, tol );
}

/*
 * Computes the relative and absolute errors in a matrix computation using the 
 * Frobenius norm ||F - E||, where F is the result of the floating point matrix
 * computation and E is the exact solution.  Both matrices are stored in 
 * column-major order with leading dimension m.
 */
void error_matrix_comp_frob( double *eps, double *err, int m, int n, 
	const double *E, const double *F )
{
	int		ldim = m;
	double	ssq_delta = 0.0;
	double	ssq_eij = 0.0;

	for ( int j = 0; j < n; j++ ) {
		const double *E_j = E + j*ldim;
		const double *F_j = F + j*ldim;
		for ( int i = 0; i < m; i++ ) {
			double delta = *(E_j + i) - *(F_j + i);
			ssq_delta +=  delta * delta;
			ssq_eij += *(E_j + i) * *(E_j + i);
		}
	}
	*err = sqrt( ssq_delta );
	*eps = *err / sqrt( ssq_eij );
}

/*
 * Computes the absolute and relative errors in a matrix computation using the 
 * matrix 1-norm (largest column sum of absolute values), where F is the 
 * result of the floating point matrix computation and E is the exact 
 * solution.  Both m-by-n matrices are stored in column-major order with 
 * leading dimension m.
 *
 *   *err  receives the absolute error ||E - F||_1
 *   *eps  receives the relative error ||E - F||_1 / ||E||_1
 *
 * If ||E||_1 is zero (E is the zero matrix, or m or n is 0) the relative 
 * error is undefined; *eps then receives the absolute error.
 */
void error_matrix_comp_l1( double *eps, double *err, int m, int n, 
	const double *E, const double *F )
{
	int		ldim = m;
	double	max_col_delta = 0.0;	// Largest column sum of |E - F| so far
	double	max_col_eij = 0.0;		// Largest column sum of |E| so far

	for ( int j = 0; j < n; j++ ) {
		// Start of column j in each matrix
		const double *E_j = E + j*ldim;
		const double *F_j = F + j*ldim;
		// Column sums restart for every column; carrying them over would
		// turn the 1-norm into a sum over the whole matrix
		double	sum_abs_delta = 0.0;
		double	sum_abs_eij = 0.0;

		for ( int i = 0; i < m; i++ ) {
			sum_abs_delta += fabs( E_j[i] - F_j[i] );
			sum_abs_eij += fabs( E_j[i] );
		}
		// The two norms are maximized independently: the largest column of
		// E - F need not be the largest column of E
		if ( sum_abs_delta > max_col_delta ) {
			max_col_delta = sum_abs_delta;
		}
		if ( sum_abs_eij > max_col_eij ) {
			max_col_eij = sum_abs_eij;
		}
	}
	*err = max_col_delta;
	// Guard the division: a zero-norm reference has no relative error
	*eps = ( max_col_eij > 0.0 ) ? *err / max_col_eij : *err;
}

/*
 * Loads the fixed 6x6 test problem used by the serial and parallel matrix
 * multiplication tests.  A, B and C receive preset values and C_ans receives
 * the expected result of the multiply-and-add, C_ans = C + A*B.  All four
 * caller-supplied arrays must hold at least 36 doubles; the data are laid 
 * out in column-major order.
 */
void init_test_matrices_6( double *A, double *B, double *C, double *C_ans )
{
	const int	dim = 6;			// Matrices are dim-by-dim

	double	A_vals[] = {
		-2,  1, -8, -8,  1,  5,
		 8, -7,  1, -1, -9, -3,
		-6,  6, -4,  1, -6,  2,
		-5,  3,  4,  5, -1, -8,
		-5,  8, -7,  6,  1,  9,
		-8, -1, -7,  9, -9,  5
	};
	double	B_vals[] = {
		 6,  7, -8, -2, -5,  6,
		-1,  8, -6, -4, -7, -7,
		 7,  2,  1, -7,  7,  2,
		-3,  0, -2, -8, -5, -7,
		-6, -5, -2, -9,  8,  8,
		 0,  0, -3,  8, -2, -7
	};
	double	C_vals[] = {
		 5, -2, -5, -2, -8, -7,
		 8,  9,  1, -8, -5, -3,
		 6, -9, -9, -6,  3,  4,
		 3, -1,  1, -4,  5, -6,
		 4, -6, -2,  2,  5, -8,
		 8,  5,  0, -1, -1, -4
	};
	double	C_ans_vals[] = {
		  84, -145,  -29,  -51,  -74,  -13,
		 221, -145,  123, -139,   18, -110,
		 -14,   23, -158,  -38,  -18,  164,
		 142,  -73,   85, -115,   80,  -41,
		 -71,   40,  -99,  128,    1,  157,
		  52,    2,  107,  -39,   70, -127
	};

	// Pair each preset table with the caller's array it is copied into
	double	*preset[] = { A_vals, B_vals, C_vals, C_ans_vals };
	double	*target[] = { A, B, C, C_ans };

	for ( int k = 0; k < 4; k++ ) {
		copy_matrix( dim, dim, preset[k], target[k] );
	}
}

/*
 * Builds an n-by-n test problem for the parallel matrix multiplication test.
 * A, B and C are filled by create_random_matrix, and C_ans receives the 
 * reference result C_ans = C + A*B as computed by the serial multiply.  The 
 * matrices are stored in column-major order with leading dimension n.
 */
void init_test_matrices( const int n, double *A, double *B, double *C, 
	double *C_ans )
{
	const double	scale = 10.0;	// Scaling factor for random matrix
	double			*inputs[] = { A, B, C };

	// Generate the inputs in the order A, B, C
	for ( int k = 0; k < 3; k++ ) {
		create_random_matrix( scale, n, n, inputs[k] );
	}
	// Seed C_ans with C so that the serial multiply accumulates A*B onto it
	copy_matrix( n, n, C, C_ans );
	serial_matrix_multiply( n, A, B, C_ans );
}

/******************************************************************************/

/*
 * Tests the serial matrix multiply routine, C = C + A*B.  The routine is run 
 * on the n-by-n matrices supplied by the caller (overwriting C), and the 
 * relative error of C against the expected result C_ans, measured in the 
 * Frobenius norm, is checked against tol.  The outcome is recorded under 
 * test_name.
 */
void test_serial_matrix_multiply( int n, 
	const double *A, const double *B, double *C, const double *C_ans,
	double tol, const char *test_name )
{
	double	rel_err, abs_err;	// abs_err is computed but not checked

	// Run the routine under test; C now holds C + A*B
	serial_matrix_multiply( n, A, B, C );

	// Measure the deviation from the expected answer and record the verdict
	error_matrix_comp_frob( &rel_err, &abs_err, n, n, C_ans, C );
	test_assert( rel_err, tol, test_name );
}

/*
 * Tests the parallel matrix multiply routine, C = C + A*B.  Every process
 * calls the routine with the n-by-n matrices and process grid supplied by 
 * the caller; afterwards rank 0 alone measures the relative error of C 
 * against the expected result C_ans in the Frobenius norm and records the 
 * outcome under test_name.
 */
void test_parallel_matrix_multiply( int n, 
	const double *A, const double *B, double *C, const double *C_ans, 
	struct mpi_grid *grid, double tol, const char *test_name )
{
	double	rel_err, abs_err;	// abs_err is computed but not checked

	// All ranks take part in the multiplication
	parallel_matrix_multiply( n, A, B, C, grid );

	// Only rank 0 evaluates and reports the result
	if ( grid->rank != 0 ) {
		return;
	}
	error_matrix_comp_frob( &rel_err, &abs_err, n, n, C_ans, C );
	test_assert( rel_err, tol, test_name );
}
