/*
 * Testing harness for unblocked and blocked algorithms implementing matrix 
 * multiplication (and addition), C = C + A*B, on square matrices.  The number 
 * of tests and the error count are accumulated over a single execution of the
 * mmultest program, and all test results are written to an output stream
 * (currently the terminal, via stdout).
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include "matmult.h"
#include "matcom.h"

// Forward declarations.  Everything below is private to this translation
// unit: test_assert() is the shared pass/fail reporter, and each
// test_mmult_*() routine exercises one matrix multiplication variant.
static void test_assert( double eps, double tol, const char *test_name );
static void test_mmult_dot_product( void );
static void test_mmult_saxpy( void );
static void test_mmult_unroll( void );
static void test_mmult_pipeline( void );
static void test_mmult_block( void );
static void test_mmult_contig_block( void );
static void test_mmult_recur_block( void );
static void test_mmult_rect_recur_block( void );
static void test_mmult_blas( void );

// Running totals for the whole program run; test_assert() increments them
// and main() reads them to print the final summary.
static int		tests = 0,			// Test count
				errs = 0;			// Error count
// Stream that receives every test report; main() points it at stdout.
static FILE		*fp;

/*
 * Runs every matrix multiplication test in sequence, then writes a summary of
 * the accumulated test and error counts to the output stream.
 *
 * Returns EXIT_SUCCESS when no test failed and EXIT_FAILURE otherwise, so that
 * scripts and build systems can detect a failing run from the exit status.
 */
int main( void )
{
	fp = stdout;

	test_mmult_dot_product();
	test_mmult_saxpy();
	test_mmult_unroll();
	test_mmult_pipeline();
	test_mmult_block();
	test_mmult_contig_block();
	test_mmult_recur_block();
	test_mmult_rect_recur_block();
	test_mmult_blas();

	if ( errs == 0 ) {
		fprintf( fp, "Passed all %d tests.\n", tests );
	} else {
		fprintf( fp, "Total of %d error(s) encountered in %d tests.\n", 
			errs, tests );
	}
	// Propagate failures to the caller instead of always reporting success
	return ( errs == 0 ) ? EXIT_SUCCESS : EXIT_FAILURE;
}

/*
 * Records the outcome of a single test.  The test passes when the measured
 * error eps does not exceed the tolerance tol.  In either case the test
 * counter is incremented and a PASSED/FAILED report naming the test is
 * written to the output stream; a failure also increments the error counter.
 */
void test_assert( double eps, double tol, const char *test_name )
{
	const int	within_tol = ( eps <= tol );

	++tests;
	if ( !within_tol ) {
		errs += 1;
		fprintf( fp, "FAILED: %s\n(eps=%e > tol=%e)\n", test_name, eps, tol );
		return;
	}
	fprintf( fp, "PASSED: %s\n(eps=%e <= tol=%e)\n", test_name, eps, tol );
}

/******************************************************************************/

/*
 * Checks whether the dot (inner) product method with ijk indexing performs
 * matrix multiplication (and addition), C = C + A*B, correctly.  Verification
 * is done on pre-specified n-by-n matrices with leading dimension n stored in
 * column-major order: A, B and C are fixed inputs and C_ans holds the
 * precomputed value of C + A*B.
 *
 * The discrepancy between C_ans and the computed C is obtained from
 * error_matrix_comp_frob() (presumably a Frobenius-norm based measure, judging
 * by its name) and the test passes when the returned eps is within tol.
 */
void test_mmult_dot_product( void )
{
	const int		n = 4;			// n-by-n matrix
	const double	tol = 1e-12;	// Error tolerance
	
	char	test_name[80];
	double	eps, err;				// err is filled in by the comparison but not used here
	double	A[] = {	4.2, 9.2, 7.9, 9.6, 6.6, 0.4, 8.5, 9.3,
					6.8, 7.6, 7.4, 3.9, 6.6, 1.7, 7.1, 0.3 },
			B[] = {	2.8, 0.5, 1.0, 8.2, 6.9, 3.2, 9.5, 0.3,
					4.4, 3.8, 7.7, 8.0, 1.9, 4.9, 4.5, 6.5 },
			C[] = {	8.9, 9.6, 5.5, 1.4, 1.5, 2.6, 8.4, 2.5, 
					8.1, 2.4, 9.3, 3.5, 2.0, 2.5, 6.2, 4.7 },			
			C_ans[] = {	 84.88,  57.10,  97.49,  39.29, 
						118.18, 140.07, 162.54, 135.64, 
						156.82, 116.52, 190.14, 113.51, 
						115.82,  67.19, 142.31,  88.01 };

	// snprintf bounds the write to the buffer, so a longer name can never
	// overflow test_name
	snprintf( test_name, sizeof test_name,
		"Matrix multiplication, dot product (ijk indexing), %dx%d matrix", n, n );
	// Perform matrix multiplication and compare result with correct answer
	mmult_dot_product( n, A, B, C );
	error_matrix_comp_frob( &eps, &err, n, n, C_ans, C );
	test_assert( eps, tol, test_name );
}

/*
 * Verifies that the SAXPY-based routine mmult_saxpy() performs matrix
 * multiplication (and addition), C = C + A*B, correctly.  Random n-by-n
 * matrices A, B and C are generated, and the result from the SAXPY operation
 * is compared with that produced by the dot product method on a copy of C.
 *
 * NOTE(review): this comment previously described the loop order as "jki"
 * while the reported test name says "kji"; confirm which one matches
 * mmult_saxpy() and make the two agree.
 *
 * If the matrices cannot be allocated, the test is reported as failed (with an
 * infinite error) instead of passing NULL pointers to the matrix routines.
 */
void test_mmult_saxpy( void )
{
	const int		n = 72;			// n-by-n matrix
	const double	tol = 1e-12,	// Error tolerance
					alpha = 10.0;	// Scaling factor for random matrix
	// Widen before multiplying so the byte count is computed in size_t
	const size_t	bytes = (size_t) n * (size_t) n * sizeof(double);

	char	test_name[80];
	double	eps, err;
	double	*A, *B, *C, *C_ans;

	snprintf( test_name, sizeof test_name,
		"Matrix multiplication, SAXPY (kji indexing), %dx%d matrix", n, n );
	A = malloc( bytes );
	B = malloc( bytes );
	C = malloc( bytes );
	C_ans = malloc( bytes );

	if ( A == NULL || B == NULL || C == NULL || C_ans == NULL ) {
		// Out of memory: count the test as failed rather than crashing
		fprintf( stderr, "%s: unable to allocate %dx%d matrices\n", 
			__func__, n, n );
		test_assert( INFINITY, tol, test_name );
	} else {
		create_random_matrix( alpha, n, n, A );
		create_random_matrix( alpha, n, n, B );
		create_random_matrix( alpha, n, n, C );
		copy_matrix( n, n, C, C_ans );

		// Compute C_ans = C + A*B using the dot product method
		mmult_dot_product( n, A, B, C_ans );
		// Compute C = C + A*B using the SAXPY operation, compare with C_ans
		mmult_saxpy( n, A, B, C );
		error_matrix_comp_frob( &eps, &err, n, n, C_ans, C );
		test_assert( eps, tol, test_name );
	}
	// free( NULL ) is a no-op, so this is safe on both paths
	free( A );
	free( B );
	free( C );
	free( C_ans );
}

/*
 * Verifies that the dot product method with loop unrolling performs matrix
 * multiplication (and addition), C = C + A*B, correctly.  For each matrix
 * size, random matrices A, B and C are generated and the result of the
 * optimized algorithm with loop unrolling is compared with that produced by
 * the basic dot product method on a copy of C.
 *
 * Per the test names, the three sizes are meant to cover a dimension below
 * UNROLL_DEPTH, a multiple of it, and a non-multiple of it.
 *
 * If the matrices for a size cannot be allocated, that case is reported as
 * failed (with an infinite error) and the remaining sizes are still run.
 */
void test_mmult_unroll( void )
{	
	const int		mat_size[] = { 6, 48, 66 };
	// One name per entry of mat_size, in the same order
	const char		*test_name[] = {
		"Matrix multiplication, dot product with loop unrolling --\n"
		"matrix dimension less than UNROLL_DEPTH",
		"Matrix multiplication, dot product with loop unrolling --\n"
		"matrix dimension a multiple of UNROLL_DEPTH",
		"Matrix multiplication, dot product with loop unrolling --\n"
		"matrix dimension not a multiple of UNROLL_DEPTH"
	};
	const size_t	num_sizes = sizeof mat_size / sizeof mat_size[0];
	const double	tol = 1e-12,	// Error tolerance
					alpha = 10.0;	// Scaling factor for random matrix
	
	double	eps, err;

	for ( size_t i = 0; i < num_sizes; i++ ) {
		const int		n = mat_size[i];
		// Widen before multiplying so the byte count is computed in size_t
		const size_t	bytes = (size_t) n * (size_t) n * sizeof(double);
		double	*A = malloc( bytes );
		double	*B = malloc( bytes );
		double	*C = malloc( bytes );
		double	*C_ans = malloc( bytes );

		if ( A == NULL || B == NULL || C == NULL || C_ans == NULL ) {
			// Out of memory: count the test as failed rather than crashing
			fprintf( stderr, "%s: unable to allocate %dx%d matrices\n", 
				__func__, n, n );
			test_assert( INFINITY, tol, test_name[i] );
		} else {
			create_random_matrix( alpha, n, n, A );
			create_random_matrix( alpha, n, n, B );
			create_random_matrix( alpha, n, n, C );
			copy_matrix( n, n, C, C_ans );

			// Compute C_ans = C + A*B using the dot product method
			mmult_dot_product( n, A, B, C_ans );
			// Compute C = C + A*B using loop unrolling, compare with C_ans
			mmult_unroll( n, A, B, C );
			error_matrix_comp_frob( &eps, &err, n, n, C_ans, C );
			test_assert( eps, tol, test_name[i] );
		}
		// free( NULL ) is a no-op, so this is safe on both paths
		free( A );
		free( B );
		free( C );
		free( C_ans );
	}
}

/*
 * Verifies that the SAXPY method with software pipelining performs matrix
 * multiplication (and addition), C = C + A*B, correctly.  For each matrix
 * size, random matrices A, B and C are generated and the result of the
 * optimized algorithm with software pipelining is compared with that produced
 * by the basic dot product method on a copy of C.
 *
 * Per the test names, the three sizes are meant to cover a dimension below
 * PIPE_DEPTH, a multiple of it, and a non-multiple of it.
 *
 * If the matrices for a size cannot be allocated, that case is reported as
 * failed (with an infinite error) and the remaining sizes are still run.
 */
void test_mmult_pipeline( void )
{	
	const int		mat_size[] = { 3, 64, 77 };
	// One name per entry of mat_size, in the same order
	const char		*test_name[] = {
		"Matrix multiplication, SAXPY with software pipelining --\n"
		"matrix dimension less than PIPE_DEPTH",
		"Matrix multiplication, SAXPY with software pipelining --\n"
		"matrix dimension a multiple of PIPE_DEPTH",
		"Matrix multiplication, SAXPY with software pipelining --\n"
		"matrix dimension not a multiple of PIPE_DEPTH"
	};
	const size_t	num_sizes = sizeof mat_size / sizeof mat_size[0];
	const double	tol = 1e-12,	// Error tolerance
					alpha = 10.0;	// Scaling factor for random matrix
	
	double	eps, err;

	for ( size_t i = 0; i < num_sizes; i++ ) {
		const int		n = mat_size[i];
		// Widen before multiplying so the byte count is computed in size_t
		const size_t	bytes = (size_t) n * (size_t) n * sizeof(double);
		double	*A = malloc( bytes );
		double	*B = malloc( bytes );
		double	*C = malloc( bytes );
		double	*C_ans = malloc( bytes );

		if ( A == NULL || B == NULL || C == NULL || C_ans == NULL ) {
			// Out of memory: count the test as failed rather than crashing
			fprintf( stderr, "%s: unable to allocate %dx%d matrices\n", 
				__func__, n, n );
			test_assert( INFINITY, tol, test_name[i] );
		} else {
			create_random_matrix( alpha, n, n, A );
			create_random_matrix( alpha, n, n, B );
			create_random_matrix( alpha, n, n, C );
			copy_matrix( n, n, C, C_ans );

			// Compute C_ans = C + A*B using the dot product method
			mmult_dot_product( n, A, B, C_ans );
			// Compute C = C + A*B using software pipelining, compare with C_ans
			mmult_pipeline( n, A, B, C );
			error_matrix_comp_frob( &eps, &err, n, n, C_ans, C );
			test_assert( eps, tol, test_name[i] );
		}
		// free( NULL ) is a no-op, so this is safe on both paths
		free( A );
		free( B );
		free( C );
		free( C_ans );
	}
}

/*
 * Verifies that the simple blocking algorithm performs matrix multiplication
 * (and addition), C = C + A*B, correctly.  For each matrix size, random
 * matrices A, B and C are generated and the result of the blocked algorithm is
 * compared with that produced by the dot product method on a copy of C.
 *
 * Per the test names, the three sizes are meant to cover a dimension below the
 * block dimension, a multiple of it, and a non-multiple of it.
 *
 * If the matrices for a size cannot be allocated, that case is reported as
 * failed (with an infinite error) and the remaining sizes are still run.
 */
void test_mmult_block( void )
{	
	const int		mat_size[] = { 21, 96, 111 };
	// One name per entry of mat_size, in the same order
	const char		*test_name[] = {
		"Matrix multiplication, simple blocking --\n"
		"matrix dimension less than block dimension",
		"Matrix multiplication, simple blocking --\n"
		"matrix dimension a multiple of block dimension",
		"Matrix multiplication, simple blocking --\n"
		"matrix dimension not a multiple of block dimension"
	};
	const size_t	num_sizes = sizeof mat_size / sizeof mat_size[0];
	const double	tol = 1e-12,	// Error tolerance
					alpha = 10.0;	// Scaling factor for random matrix
	
	double	eps, err;

	for ( size_t i = 0; i < num_sizes; i++ ) {
		const int		n = mat_size[i];
		// Widen before multiplying so the byte count is computed in size_t
		const size_t	bytes = (size_t) n * (size_t) n * sizeof(double);
		double	*A = malloc( bytes );
		double	*B = malloc( bytes );
		double	*C = malloc( bytes );
		double	*C_ans = malloc( bytes );

		if ( A == NULL || B == NULL || C == NULL || C_ans == NULL ) {
			// Out of memory: count the test as failed rather than crashing
			fprintf( stderr, "%s: unable to allocate %dx%d matrices\n", 
				__func__, n, n );
			test_assert( INFINITY, tol, test_name[i] );
		} else {
			create_random_matrix( alpha, n, n, A );
			create_random_matrix( alpha, n, n, B );
			create_random_matrix( alpha, n, n, C );
			copy_matrix( n, n, C, C_ans );

			// Compute C_ans = C + A*B using the dot product method
			mmult_dot_product( n, A, B, C_ans );
			// Compute C = C + A*B using the blocked algorithm, compare with C_ans
			mmult_block( n, A, B, C );
			error_matrix_comp_frob( &eps, &err, n, n, C_ans, C );
			test_assert( eps, tol, test_name[i] );
		}
		// free( NULL ) is a no-op, so this is safe on both paths
		free( A );
		free( B );
		free( C );
		free( C_ans );
	}
}

/*
 * Verifies that the contiguous blocking algorithm performs matrix
 * multiplication (and addition), C = C + A*B, correctly.  For each matrix
 * size, random matrices A, B and C are generated and the result of the blocked
 * algorithm is compared with that produced by the dot product method on a copy
 * of C.
 *
 * Per the test names, the three sizes are meant to cover a dimension below the
 * block dimension, a multiple of it, and a non-multiple of it.
 *
 * If the matrices for a size cannot be allocated, that case is reported as
 * failed (with an infinite error) and the remaining sizes are still run.
 */
void test_mmult_contig_block( void )
{	
	const int		mat_size[] = { 13, 96, 122 };
	// One name per entry of mat_size, in the same order
	const char		*test_name[] = {
		"Matrix multiplication, contiguous block storage --\n"
		"matrix dimension less than block dimension",
		"Matrix multiplication, contiguous block storage --\n"
		"matrix dimension a multiple of block dimension",
		"Matrix multiplication, contiguous block storage --\n"
		"matrix dimension not a multiple of block dimension"
	};
	const size_t	num_sizes = sizeof mat_size / sizeof mat_size[0];
	const double	tol = 1e-12,	// Error tolerance
					alpha = 10.0;	// Scaling factor for random matrix
	
	double	eps, err;

	for ( size_t i = 0; i < num_sizes; i++ ) {
		const int		n = mat_size[i];
		// Widen before multiplying so the byte count is computed in size_t
		const size_t	bytes = (size_t) n * (size_t) n * sizeof(double);
		double	*A = malloc( bytes );
		double	*B = malloc( bytes );
		double	*C = malloc( bytes );
		double	*C_ans = malloc( bytes );

		if ( A == NULL || B == NULL || C == NULL || C_ans == NULL ) {
			// Out of memory: count the test as failed rather than crashing
			fprintf( stderr, "%s: unable to allocate %dx%d matrices\n", 
				__func__, n, n );
			test_assert( INFINITY, tol, test_name[i] );
		} else {
			create_random_matrix( alpha, n, n, A );
			create_random_matrix( alpha, n, n, B );
			create_random_matrix( alpha, n, n, C );
			copy_matrix( n, n, C, C_ans );

			// Compute C_ans = C + A*B using the dot product method
			mmult_dot_product( n, A, B, C_ans );
			// Compute C = C + A*B using the blocked algorithm, compare with C_ans
			mmult_contig_block( n, A, B, C );
			error_matrix_comp_frob( &eps, &err, n, n, C_ans, C );
			test_assert( eps, tol, test_name[i] );
		}
		// free( NULL ) is a no-op, so this is safe on both paths
		free( A );
		free( B );
		free( C );
		free( C_ans );
	}
}

/*
 * Verifies that the recursive contiguous blocking algorithm performs matrix
 * multiplication (and addition), C = C + A*B, correctly.  The matrix
 * multiplication kernel uses a symbolic constant to control looping.  For each
 * matrix size, random matrices A, B and C are generated and the result of the
 * blocked algorithm is compared with that produced by the dot product method
 * on a copy of C.
 *
 * Per the test names, the three sizes are meant to cover a dimension below the
 * block dimension, a multiple of it, and a non-multiple of it.
 *
 * If the matrices for a size cannot be allocated, that case is reported as
 * failed (with an infinite error) and the remaining sizes are still run.
 */
void test_mmult_recur_block( void )
{	
	const int		mat_size[] = { 13, 96, 122 };
	// One name per entry of mat_size, in the same order
	const char		*test_name[] = {
		"Matrix multiplication, recursive contiguous blocking --\n"
		"matrix dimension less than block dimension",
		"Matrix multiplication, recursive contiguous blocking --\n"
		"matrix dimension a multiple of block dimension",
		"Matrix multiplication, recursive contiguous blocking --\n"
		"matrix dimension not a multiple of block dimension"
	};
	const size_t	num_sizes = sizeof mat_size / sizeof mat_size[0];
	const double	tol = 1e-12,	// Error tolerance
					alpha = 10.0;	// Scaling factor for random matrix
	
	double	eps, err;

	for ( size_t i = 0; i < num_sizes; i++ ) {
		const int		n = mat_size[i];
		// Widen before multiplying so the byte count is computed in size_t
		const size_t	bytes = (size_t) n * (size_t) n * sizeof(double);
		double	*A = malloc( bytes );
		double	*B = malloc( bytes );
		double	*C = malloc( bytes );
		double	*C_ans = malloc( bytes );

		if ( A == NULL || B == NULL || C == NULL || C_ans == NULL ) {
			// Out of memory: count the test as failed rather than crashing
			fprintf( stderr, "%s: unable to allocate %dx%d matrices\n", 
				__func__, n, n );
			test_assert( INFINITY, tol, test_name[i] );
		} else {
			create_random_matrix( alpha, n, n, A );
			create_random_matrix( alpha, n, n, B );
			create_random_matrix( alpha, n, n, C );
			copy_matrix( n, n, C, C_ans );

			// Compute C_ans = C + A*B using the basic dot product algorithm
			mmult_dot_product( n, A, B, C_ans );
			// Compute C = C + A*B using the optimized algorithm, compare with C_ans
			mmult_recur_block( n, A, B, C );
			error_matrix_comp_frob( &eps, &err, n, n, C_ans, C );
			test_assert( eps, tol, test_name[i] );
		}
		// free( NULL ) is a no-op, so this is safe on both paths
		free( A );
		free( B );
		free( C );
		free( C_ans );
	}
}

/*
 * Verifies that the recursive contiguous blocking algorithm performs matrix
 * multiplication (and addition), C = C + A*B, correctly.  The matrix
 * multiplication kernel uses variables to control looping.  For each matrix
 * size, random matrices A, B and C are generated and the result of the blocked
 * algorithm is compared with that produced by the dot product method on a copy
 * of C.
 *
 * Per the test names, the three sizes are meant to cover a dimension below the
 * block dimension, a multiple of it, and a non-multiple of it.
 *
 * If the matrices for a size cannot be allocated, that case is reported as
 * failed (with an infinite error) and the remaining sizes are still run.
 */
void test_mmult_rect_recur_block( void )
{	
	const int		mat_size[] = { 25, 128, 141 };
	// One name per entry of mat_size, in the same order
	const char		*test_name[] = {
		"Multiplication, recursive contiguous blocking, variable looping --\n"
		"matrix dimension less than block dimension",
		"Multiplication, recursive contiguous blocking, variable looping --\n"
		"matrix dimension a multiple of block dimension",
		"Multiplication, recursive contiguous blocking, variable looping --\n"
		"matrix dimension not a multiple of block dimension"
	};
	const size_t	num_sizes = sizeof mat_size / sizeof mat_size[0];
	const double	tol = 1e-12,	// Error tolerance
					alpha = 10.0;	// Scaling factor for random matrix
	
	double	eps, err;

	for ( size_t i = 0; i < num_sizes; i++ ) {
		const int		n = mat_size[i];
		// Widen before multiplying so the byte count is computed in size_t
		const size_t	bytes = (size_t) n * (size_t) n * sizeof(double);
		double	*A = malloc( bytes );
		double	*B = malloc( bytes );
		double	*C = malloc( bytes );
		double	*C_ans = malloc( bytes );

		if ( A == NULL || B == NULL || C == NULL || C_ans == NULL ) {
			// Out of memory: count the test as failed rather than crashing
			fprintf( stderr, "%s: unable to allocate %dx%d matrices\n", 
				__func__, n, n );
			test_assert( INFINITY, tol, test_name[i] );
		} else {
			create_random_matrix( alpha, n, n, A );
			create_random_matrix( alpha, n, n, B );
			create_random_matrix( alpha, n, n, C );
			copy_matrix( n, n, C, C_ans );

			// Compute C_ans = C + A*B using the dot product method
			mmult_dot_product( n, A, B, C_ans );
			// Compute C = C + A*B using the blocked algorithm, compare with C_ans
			mmult_rect_recur_block( n, A, B, C );
			error_matrix_comp_frob( &eps, &err, n, n, C_ans, C );
			test_assert( eps, tol, test_name[i] );
		}
		// free( NULL ) is a no-op, so this is safe on both paths
		free( A );
		free( B );
		free( C );
		free( C_ans );
	}
}

/*
 * Checks whether the wrapper function mmult_blas() properly invokes BLAS
 * routine DGEMM, which performs matrix multiplication.  Random n-by-n matrices
 * A, B and C are generated, and the wrapper's result for C = C + A*B is
 * compared with that produced by the dot product method on a copy of C.
 *
 * If the matrices cannot be allocated, the test is reported as failed (with an
 * infinite error) instead of passing NULL pointers to the matrix routines.
 */
void test_mmult_blas( void )
{
	const int		n = 48;			// n-by-n matrix
	const double	tol = 1e-12,	// Error tolerance
					alpha = 10.0;	// Scaling factor for random matrix
	// Widen before multiplying so the byte count is computed in size_t
	const size_t	bytes = (size_t) n * (size_t) n * sizeof(double);
	
	char	test_name[80];
	double	eps, err;
	double	*A, *B, *C, *C_ans;

	snprintf( test_name, sizeof test_name,
		"Matrix multiplication, BLAS routine DGEMM, %dx%d matrix", n, n );
	A = malloc( bytes );
	B = malloc( bytes );
	C = malloc( bytes );
	C_ans = malloc( bytes );

	if ( A == NULL || B == NULL || C == NULL || C_ans == NULL ) {
		// Out of memory: count the test as failed rather than crashing
		fprintf( stderr, "%s: unable to allocate %dx%d matrices\n", 
			__func__, n, n );
		test_assert( INFINITY, tol, test_name );
	} else {
		create_random_matrix( alpha, n, n, A );
		create_random_matrix( alpha, n, n, B );
		create_random_matrix( alpha, n, n, C );
		copy_matrix( n, n, C, C_ans );

		// Compute C_ans = C + A*B using the dot product method
		mmult_dot_product( n, A, B, C_ans );
		// Compute C = C + A*B using DGEMM, compare with C_ans
		mmult_blas( n, A, B, C );
		error_matrix_comp_frob( &eps, &err, n, n, C_ans, C );
		test_assert( eps, tol, test_name );
	}
	// free( NULL ) is a no-op, so this is safe on both paths
	free( A );
	free( B );
	free( C );
	free( C_ans );
}

