/*
 * Algorithms implementing unblocked and blocked matrix multiplication 
 * (and addition), C = C + A*B.  Unblocked algorithms include: dot (inner) 
 * product method, SAXPY operation, loop unrolling and software pipelining.  
 * Blocked algorithms include: simple blocking, contiguous blocking, and 
 * recursive contiguous blocking.  Also, a function wrapper facilitates calling 
 * BLAS matrix multiplication routine DGEMM.
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#include "matmult.h"
#include "lapack.h"
#include "matcom.h"

/*
 * Forward declarations of the file-local multiplication kernels defined
 * below.  Declaring them static here gives them internal linkage, so they
 * are callable only from within this translation unit.
 *
 *   multiply_kernel       - KDIM-by-KDIM sub-blocks, compile-time loop bounds
 *   multiply_rect_kernel  - m-by-p times p-by-n sub-blocks, run-time bounds
 *   multiply_blk_ker      - walks a block and calls multiply_kernel
 *   multiply_rect_blk_ker - walks a block and calls multiply_rect_kernel
 */
static void multiply_kernel( const double *A, const double *B, double *C );
static void multiply_rect_kernel( int m, int n, int p,
	const double *A, const double *B, double *C );
static void multiply_blk_ker( int m, int n, int p, int ldimA, const double *A, 
	int ldimB, const double *B, int ldimC, double *C );
static void multiply_rect_blk_ker( int m, int n, int p, int ldimA, const double *A, 
	int ldimB, const double *B, int ldimC, double *C );

/******************************************************************************/

/*
 * Selects the block dimension used by the blocked multiplication routines
 * for a matrix whose leading dimension is ldim.
 *
 * The compile-time constant BDIM is the preferred value.  The DEBUG and
 * non-DEBUG builds currently pick the same constant; the conditional is kept
 * as the hook for substituting a different block dimension while testing.
 *
 * Returns BDIM when 1 < BDIM <= ldim.  Otherwise returns ldim, in which case
 * a caller that steps through the matrix by the returned value visits a
 * single block, i.e. the computation is effectively unblocked.
 */
int get_block_dim_mmult( int ldim )
{
#if defined(DEBUG)
	const int preferred = BDIM;
#else
	const int preferred = BDIM;
#endif

	// Use the preferred dimension only when it is a usable block size
	// that fits inside the matrix.
	if ( preferred > 1 && preferred <= ldim ) {
		return preferred;
	}
	return ldim;
}

/*
 * Fixed-size multiplication kernel: C = C + A*B for three contiguous
 * KDIM-by-KDIM sub-blocks held in column-major order (leading dimension
 * KDIM).  Loops run in jki order, so the inner-most loop adds a scalar
 * multiple of one column of A to one column of C (a SAXPY operation).
 * All loop bounds are the symbolic constant KDIM, known at compile time.
 *
 *   A - left operand sub-block, read only
 *   B - right operand sub-block, read only
 *   C - accumulator sub-block, updated in place
 */
void multiply_kernel( const double *A, const double *B, double *C )
{
	for ( int col = 0; col < KDIM; col++ ) {
		const double *b_col = &B[col*KDIM];		// Column col of B
		double *c_col = &C[col*KDIM];			// Column col of C

		for ( int mid = 0; mid < KDIM; mid++ ) {
			const double *a_col = &A[mid*KDIM];	// Column mid of A
			const double scale = b_col[mid];	// B(mid,col)

			// C(:,col) += A(:,mid) * B(mid,col)
			for ( int row = 0; row < KDIM; row++ ) {
				c_col[row] += a_col[row] * scale;
			}
		}
	}
}

/*
 * Variable-size multiplication kernel: C = C + A*B where A is m-by-p, B is
 * p-by-n and C is m-by-n.  Each operand is a contiguous sub-block in
 * column-major order whose columns are KDIM elements apart (leading
 * dimension KDIM), regardless of m, n and p.  Loops run in jki order (SAXPY
 * inner loop) with bounds taken from the run-time arguments.
 *
 *   m, n, p - rows of C/A, columns of C/B, and the shared inner dimension
 *   A       - left operand sub-block, read only
 *   B       - right operand sub-block, read only
 *   C       - accumulator sub-block, updated in place
 */
void multiply_rect_kernel( int m, int n, int p,
	const double *A, const double *B, double *C )
{
	for ( int col = 0; col < n; col++ ) {
		const double *b_col = &B[col*KDIM];		// Column col of B
		double *c_col = &C[col*KDIM];			// Column col of C

		for ( int mid = 0; mid < p; mid++ ) {
			const double *a_col = &A[mid*KDIM];	// Column mid of A
			const double scale = b_col[mid];	// B(mid,col)

			// C(0:m,col) += A(0:m,mid) * B(mid,col)
			for ( int row = 0; row < m; row++ ) {
				c_col[row] += a_col[row] * scale;
			}
		}
	}
}

/*
 * Block-level driver for the fixed-size kernel: C = C + A*B, where A
 * (m-by-p), B (p-by-n) and C (m-by-n) are contiguous matrix blocks with
 * leading dimensions ldimA, ldimB and ldimC.  Inside each block the
 * KDIM*KDIM sub-blocks are themselves stored contiguously, so the sub-block
 * whose first element is (row,col) begins at offset row*KDIM + col*ldim.
 *
 * The sub-blocks are visited in jki order and every visit calls
 * multiply_kernel, which always processes a full KDIM-by-KDIM sub-block.
 * The loops advance by KDIM, so when m, n or p is not a multiple of KDIM the
 * last sub-block along that dimension is still processed in full.
 */
void multiply_blk_ker( int m, int n, int p, int ldimA, const double *A, 
	int ldimB, const double *B, int ldimC, double *C )
{
	for ( int col = 0; col < n; col += KDIM ) {
		for ( int mid = 0; mid < p; mid += KDIM ) {
			// Sub-block of B shared by every row step below
			const double *b_tile = B + mid*KDIM + col*ldimB;

			for ( int row = 0; row < m; row += KDIM ) {
				const double *a_tile = A + row*KDIM + mid*ldimA;
				double *c_tile = C + row*KDIM + col*ldimC;

				// c_tile += a_tile * b_tile
				multiply_kernel( a_tile, b_tile, c_tile );
			}
		}
	}
}

/*
 * Block-level driver for the variable-size kernel: C = C + A*B, where A
 * (m-by-p), B (p-by-n) and C (m-by-n) are contiguous matrix blocks with
 * leading dimensions ldimA, ldimB and ldimC.  Inside each block the
 * KDIM*KDIM sub-blocks are themselves stored contiguously, so the sub-block
 * whose first element is (row,col) begins at offset row*KDIM + col*ldim.
 *
 * The sub-blocks are visited in jki order.  Along each dimension the last
 * sub-block is clipped to the elements that remain, and the clipped extents
 * are handed to multiply_rect_kernel, so only m-by-n elements of C are
 * updated.
 */
void multiply_rect_blk_ker( int m, int n, int p, int ldimA, const double *A, 
	int ldimB, const double *B, int ldimC, double *C )
{
	for ( int col = 0; col < n; col += KDIM ) {
		// Columns of C (and B) covered by this step
		const int ncols = (n - col < KDIM) ? (n - col) : KDIM;

		for ( int mid = 0; mid < p; mid += KDIM ) {
			// Inner dimension covered by this step
			const int depth = (p - mid < KDIM) ? (p - mid) : KDIM;
			const double *b_tile = B + mid*KDIM + col*ldimB;

			for ( int row = 0; row < m; row += KDIM ) {
				// Rows of C (and A) covered by this step
				const int nrows = (m - row < KDIM) ? (m - row) : KDIM;
				const double *a_tile = A + row*KDIM + mid*ldimA;
				double *c_tile = C + row*KDIM + col*ldimC;

				// c_tile += a_tile * b_tile on the clipped extents
				multiply_rect_kernel( nrows, ncols, depth, a_tile, b_tile, c_tile );
			}
		}
	}
}

/******************************************************************************/

/*
 * Computes C = C + A*B with the dot (inner) product formulation -- ijk loop
 * order.  Every element C(i,j) is read once, incremented by the dot product
 * of row i of A with column j of B, and written back once.
 *
 *   n - order of the square matrices
 *   A - left operand, n-by-n, column-major, leading dimension n, read only
 *   B - right operand, same layout, read only
 *   C - accumulator, same layout, updated in place
 */
void mmult_dot_product( int n, const double *A, const double *B, double *C )
{
	const int stride = n;	// Distance between consecutive columns

	for ( int row = 0; row < n; row++ ) {
		for ( int col = 0; col < n; col++ ) {
			const double *b_col = &B[col*stride];	// Column col of B
			double *c_elem = &C[col*stride + row];	// Element C(row,col)
			double acc = *c_elem;

			// acc += A(row,idx) * B(idx,col); row `row` of A is strided
			for ( int idx = 0; idx < n; idx++ ) {
				acc += A[idx*stride + row] * b_col[idx];
			}
			*c_elem = acc;
		}
	}
}

/*
 * Computes C = C + A*B for n-by-n column-major matrices by delegating the
 * whole product to multiply_matrix (declared elsewhere), passing n for all
 * three extents and all three leading dimensions.  This entry point is the
 * SAXPY (jki) variant of the unblocked algorithms; the loop ordering itself
 * lives in multiply_matrix.
 *
 *   n - order of the square matrices
 *   A - left operand, read only
 *   B - right operand, read only
 *   C - accumulator, updated in place
 */
void mmult_saxpy( int n, const double *A, const double *B, double *C )
{
	// Densely stored square operands: every leading dimension equals n
	multiply_matrix( n, n, n, n, A, n, B, n, C );
}

/*
 * Computes C = C + A*B with the dot (inner) product formulation -- ijk loop
 * order -- with the inner-most loop unrolled to a depth of 8.  Eight
 * independent partial sums are accumulated, folded into C(i,j) in a fixed
 * left-to-right order, and any inner indices left over when n is not a
 * multiple of 8 are then added one at a time.
 *
 *   n - order of the square matrices
 *   A - left operand, n-by-n, column-major, leading dimension n, read only
 *   B - right operand, same layout, read only
 *   C - accumulator, same layout, updated in place
 */
void mmult_unroll( int n, const double *A, const double *B, double *C )
{
	const int stride = n;	// Distance between consecutive columns

	for ( int row = 0; row < n; row++ ) {
		const double *a_row = &A[row];					// Element A(row,0)

		for ( int col = 0; col < n; col++ ) {
			const double *b_col = &B[col*stride];		// Element B(0,col)
			double *c_elem = &C[col*stride + row];		// Element C(row,col)
			double acc = *c_elem;
			double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
			double s4 = 0, s5 = 0, s6 = 0, s7 = 0;
			int idx;

			// Unrolled part: eight products per trip, one per partial sum
			for ( idx = 0; idx + 8 <= n; idx += 8 ) {
				s0 += a_row[(idx + 0)*stride] * b_col[idx + 0];
				s1 += a_row[(idx + 1)*stride] * b_col[idx + 1];
				s2 += a_row[(idx + 2)*stride] * b_col[idx + 2];
				s3 += a_row[(idx + 3)*stride] * b_col[idx + 3];
				s4 += a_row[(idx + 4)*stride] * b_col[idx + 4];
				s5 += a_row[(idx + 5)*stride] * b_col[idx + 5];
				s6 += a_row[(idx + 6)*stride] * b_col[idx + 6];
				s7 += a_row[(idx + 7)*stride] * b_col[idx + 7];
			}
			// Fold the partial sums in a fixed order before adding to C(row,col)
			acc += s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;

			// Remainder when n is not a multiple of the unrolling depth
			for ( ; idx < n; idx++ ) {
				acc += a_row[idx*stride] * b_col[idx];
			}
			*c_elem = acc;
		}
	}
}

/*
 * Performs matrix multiplication and addition, C = C + A*B, using the SAXPY
 * operation -- jki indexing.  Optimizes floating point operations through
 * software pipelining with the inner-most loop unrolled to a depth of 8.
 * A, B and C are n-by-n matrices stored in column-major order with leading
 * dimension n.
 *
 * For each (j,k) pair the column update C(:,j) += A(:,k) * B(k,j) is carried
 * out in three phases when n > 15:
 *   - a prologue that loads rows 0..7 of A(:,k) and C(:,j) and completes the
 *     multiply-add for rows 0..3 only;
 *   - a steady-state loop in which the stores of finished rows, the pending
 *     multiply-adds, and the loads for rows up to 15 ahead are interleaved;
 *   - an epilogue that stores the 8 rows still held in local variables.
 * Rows not covered by those phases (all rows when n <= 15) are handled by a
 * plain scalar loop at the end.
 *
 *   n - order of the square matrices
 *   A - left operand, read only
 *   B - right operand, read only
 *   C - accumulator, updated in place
 */
void mmult_pipeline( int n, const double *A, const double *B, double *C ) 	
{
	const int ldim = n;	

	for ( int j = 0; j < n; j++ ) {
		const double *B_j = B + j*ldim;			// Points to element B(0,j)
		double *C_j = C + j*ldim;				// Points to element C(0,j)
		for ( int k = 0; k < n; k++ ) {
			const double *A_k = A + k*ldim;		// Points to element A(0,k)
			// a0..a7 and c0..c7 are assigned and used only inside the
			// pipelined branch below
			double	bkj,
					a0, a1, a2, a3, a4, a5, a6, a7,
					c0, c1, c2, c3, c4, c5, c6, c7;
			int i = 0;
			bkj = *(B_j + k);					// Element B(k,j)
			if ( n > 7+8 ) {					// Proceed with software pipelining
				// Prologue: load rows 0..7 of A(:,k)
				a0 = *(A_k + 0);
				a1 = *(A_k + 1);
				a2 = *(A_k + 2);
				a3 = *(A_k + 3);
				a4 = *(A_k + 4);
				a5 = *(A_k + 5);
				a6 = *(A_k + 6);
				a7 = *(A_k + 7);
				// Rows 0..3 are fully updated here ...
				c0 = *(C_j + 0) + a0 * bkj;
 				c1 = *(C_j + 1) + a1 * bkj;
				c2 = *(C_j + 2) + a2 * bkj;
				c3 = *(C_j + 3) + a3 * bkj;
				// ... while rows 4..7 are only loaded; their multiply-add is
				// deferred to the first loop trip (or to the epilogue)
				c4 = *(C_j + 4);
				c5 = *(C_j + 5);
				c6 = *(C_j + 6);
				c7 = *(C_j + 7);

				// Steady state.  On entry c0..c3 hold finished rows i..i+3
				// and a4..a7 / c4..c7 hold the loaded but not yet combined
				// rows i+4..i+7.  The bound i < n-7-8 keeps the highest index
				// read in the body, i+15, below n.
				for (; i < n-7-8; i += 8) {
					// First half: store finished rows i..i+3, finish rows
					// i+4..i+7, and load rows i+8..i+11 into a0..a3 / c0..c3
					*(C_j + i + 0) = c0;
					a4 *= bkj;
					a0 = *(A_k + i + 8);
					c4 += a4;
					c0 = *(C_j + i + 8);

					*(C_j + i + 1) = c1;
					a5 *= bkj;
					a1 = *(A_k + i + 9);
					c5 += a5;
					c1 = *(C_j + i + 9);

					*(C_j + i + 2) = c2;
					a6 *= bkj;
					a2 = *(A_k + i + 10);
					c6 += a6;
					c2 = *(C_j + i + 10);

					*(C_j + i + 3) = c3;
					a7 *= bkj;
					a3 = *(A_k + i + 11);
					c7 += a7;
					c3 = *(C_j + i + 11);

					// Second half: store finished rows i+4..i+7, finish rows
					// i+8..i+11, and load rows i+12..i+15 into a4..a7 / c4..c7
					// (their multiply-add happens on the next trip or in the
					// epilogue)
					*(C_j + i + 4) = c4;
					a0 *= bkj;
					a4 = *(A_k + i + 12);
					c0 += a0;
					c4 = *(C_j + i + 12);

					*(C_j + i + 5) = c5;
					a1 *= bkj;
					a5 = *(A_k + i + 13);
					c1 += a1;
					c5 = *(C_j + i + 13);

					*(C_j + i + 6) = c6;
					a2 *= bkj;
					a6 = *(A_k + i + 14);
					c2 += a2;
					c6 = *(C_j + i + 14);

					*(C_j + i + 7) = c7;
					a3 *= bkj;
					a7 = *(A_k + i + 15);
					c3 += a3;
					c7 = *(C_j + i + 15);
				}
				// Epilogue: rows i..i+3 are already complete; rows i+4..i+7
				// still need their multiply-add before being stored
				*(C_j + i + 0) = c0;
				*(C_j + i + 1) = c1;
				*(C_j + i + 2) = c2;
				*(C_j + i + 3) = c3;
				*(C_j + i + 4) = c4 + a4 * bkj;
				*(C_j + i + 5) = c5 + a5 * bkj;
				*(C_j + i + 6) = c6 + a6 * bkj;
				*(C_j + i + 7) = c7 + a7 * bkj;
				i += 8;
			}
			// Finish up combined scalar multiplication and vector addition for  
			// cases where matrix dimension n is not a multiple of the depth of
			// loop unrolling (this loop also does all the work when n <= 15,
			// because i is still 0 in that case)
			for ( ; i < n; i++ ) {
				*(C_j + i) += *(A_k + i) * bkj;
			}									
		}
	}
}

/*
 * Computes C = C + A*B using simple blocking.  The n-by-n column-major
 * operands (leading dimension n) are left in place and partitioned into
 * blocks of dimension bdim, as chosen by get_block_dim_mmult; the last block
 * along each dimension is clipped to the elements that remain.  Blocks are
 * visited in jki order and each block product is delegated to
 * multiply_matrix (declared elsewhere), with n as every leading dimension
 * because the blocks are addressed inside the original arrays.
 *
 *   n - order of the square matrices
 *   A - left operand, read only
 *   B - right operand, read only
 *   C - accumulator, updated in place
 */
void mmult_block( int n, const double *A, const double *B, double *C )
{
	const int stride = n;
	const int blk = get_block_dim_mmult( stride );

	for ( int col = 0; col < n; col += blk ) {
		// Columns in this block column of C and B
		const int ncols = (n - col < blk) ? (n - col) : blk;

		for ( int mid = 0; mid < n; mid += blk ) {
			// Columns of the A block, rows of the B block
			const int depth = (n - mid < blk) ? (n - mid) : blk;
			// First element of block B(mid,col)
			const double *b_blk = B + mid + col*stride;

			for ( int row = 0; row < n; row += blk ) {
				// Rows in this block row of C and A
				const int nrows = (n - row < blk) ? (n - row) : blk;
				// First elements of blocks A(row,mid) and C(row,col)
				const double *a_blk = A + row + mid*stride;
				double *c_blk = C + row + col*stride;

				// c_blk += a_blk * b_blk
				multiply_matrix( nrows, ncols, depth,
					stride, a_blk, stride, b_blk, stride, c_blk );
			}
		}
	}
}

/*
 * Performs matrix multiplication and addition, C = C + A*B, using contiguous
 * blocking to optimize memory access.  A, B and C are n-by-n matrices stored
 * in column-major order with leading dimension n.
 *
 * A, B and C are first copied by form_contig_blocks into workspaces AA, BB
 * and CC, in which each bdim-by-bdim block occupies a contiguous run of
 * memory.  The offsets used below place block column j at element j*ldim of
 * the workspace and, inside a block column of width s, the block starting at
 * row k at a further k*s elements.  Each block product is delegated to
 * multiply_matrix with the block's own row count as leading dimension, and
 * the result is copied back to C by unpack_contig_blocks.
 *
 *   n - order of the square matrices; nothing is done when n <= 0
 *   A - left operand, read only
 *   B - right operand, read only
 *   C - accumulator, updated in place
 *
 * The three workspaces are owned by this function and released before it
 * returns.  If they cannot be allocated, a message is written to stderr and
 * the program exits with EXIT_FAILURE, since the void interface has no way
 * to report the failure and continuing would dereference a null pointer.
 */
void mmult_contig_block( int n, const double *A, const double *B, double *C )
{
	// Nothing to compute.  Returning here also keeps a zero-byte malloc,
	// which may legitimately return NULL, from being mistaken for a failure.
	if ( n <= 0 ) {
		return;
	}

	const int		ldim = n;
	const int		bdim = get_block_dim_mmult( ldim );
	const size_t	dim = (size_t) ldim;

	// Size the workspaces in size_t arithmetic: ldim*ldim evaluated as int
	// overflows (undefined behavior) once n exceeds 46340 with 32-bit int.
	if ( dim > ((size_t) -1 / sizeof(double)) / dim ) {
		fprintf( stderr,
			"mmult_contig_block: workspace size overflows for n = %d\n", n );
		exit( EXIT_FAILURE );
	}
	const size_t	bytes = dim * dim * sizeof(double);

	double *AA = malloc( bytes );
	double *BB = malloc( bytes );
	double *CC = malloc( bytes );
	if ( AA == NULL || BB == NULL || CC == NULL ) {
		fprintf( stderr,
			"mmult_contig_block: cannot allocate workspace for n = %d\n", n );
		free( AA );
		free( BB );
		free( CC );
		exit( EXIT_FAILURE );
	}

	// Repack the operands so that every block is contiguous in memory
	form_contig_blocks( n, n, ldim, A, n, n, bdim, ldim, AA );
	form_contig_blocks( n, n, ldim, B, n, n, bdim, ldim, BB );
	form_contig_blocks( n, n, ldim, C, n, n, bdim, ldim, CC );

	for ( int j = 0; j < n; j += bdim ) {
		// Determine number of columns in (i,j)th block of CC
		int s = (j + bdim > n) ? (n - j) : bdim;

		for ( int k = 0; k < n; k += bdim ) {
			// Determine number of columns of AAik and rows of BBkj
			int t = (k + bdim > n) ? (n - k) : bdim;
			// Set pointer to block matrix BBkj
			double *BBkj = BB + k*s + j*ldim;

			for ( int i = 0; i < n; i += bdim ) {
				// Determine number of rows in (i,j)th block of CC
				int r = (i + bdim > n) ? (n - i) : bdim;
				// Set pointers to block matrices AAik and CCij
				double *AAik = AA + i*t + k*ldim;
				double *CCij = CC + i*s + j*ldim;
				// Perform multiplication on block matrices; each contiguous
				// block uses its own row count as leading dimension
				multiply_matrix( r, s, t, r, AAik, t, BBkj, r, CCij );
			}
		}
	}
	// Extract matrix C from contiguous blocks
	unpack_contig_blocks( n, n, bdim, ldim, CC, n, n, ldim, C );
	free( AA );
	free( BB );
	free( CC );
}

/*
 * Performs matrix multiplication and addition, C = C + A*B, using recursive
 * contiguous blocking to optimize memory access.  A, B and C are n-by-n
 * matrices stored in column-major order with leading dimension n.
 *
 * The working dimension nn is n rounded up to a multiple of KDIM.  A, B and
 * C are first copied by form_recur_blocks into nn-by-nn workspaces AA, BB
 * and CC, in which bdim-by-bdim blocks are stored contiguously and, within
 * each block, KDIM*KDIM sub-blocks are stored contiguously.  Block products
 * are delegated to multiply_blk_ker, whose inner kernel has loop bounds
 * fixed by the symbolic constant KDIM, and the result is copied back to C by
 * unpack_recur_blocks.
 *
 * In the loops below r, s and t are block extents clipped to the logical
 * dimension n, while u, v and w are the same extents clipped to the padded
 * dimension nn; the padded extents serve as block leading dimensions and in
 * the block offsets.  Because the fixed-size kernel always processes whole
 * KDIM-by-KDIM sub-blocks, it also reads the padding rows and columns --
 * presumably form_recur_blocks zero-fills them; verify against that routine.
 *
 *   n - order of the square matrices; nothing is done when n <= 0
 *   A - left operand, read only
 *   B - right operand, read only
 *   C - accumulator, updated in place
 *
 * The three workspaces are owned by this function and released before it
 * returns.  If they cannot be allocated, a message is written to stderr and
 * the program exits with EXIT_FAILURE, since the void interface has no way
 * to report the failure and continuing would dereference a null pointer.
 */
void mmult_recur_block( int n, const double *A, const double *B, double *C )
{
	// Nothing to compute.  Returning here also keeps a zero-byte malloc,
	// which may legitimately return NULL, from being mistaken for a failure.
	if ( n <= 0 ) {
		return;
	}

	const int		nn = (n / KDIM) * KDIM + ((n % KDIM) ? KDIM : 0);
	const int		ldim = nn;
	const int		bdim = get_block_dim_mmult( ldim );
	const size_t	dim = (size_t) ldim;

	// Size the workspaces in size_t arithmetic: ldim*ldim evaluated as int
	// overflows (undefined behavior) once ldim exceeds 46340 with 32-bit int.
	if ( dim > ((size_t) -1 / sizeof(double)) / dim ) {
		fprintf( stderr,
			"mmult_recur_block: workspace size overflows for n = %d\n", n );
		exit( EXIT_FAILURE );
	}
	const size_t	bytes = dim * dim * sizeof(double);

	double *AA = malloc( bytes );
	double *BB = malloc( bytes );
	double *CC = malloc( bytes );
	if ( AA == NULL || BB == NULL || CC == NULL ) {
		fprintf( stderr,
			"mmult_recur_block: cannot allocate workspace for n = %d\n", n );
		free( AA );
		free( BB );
		free( CC );
		exit( EXIT_FAILURE );
	}

	// Repack the operands into recursive contiguous block storage
	form_recur_blocks( n, n, n, A, nn, nn, KDIM, bdim, ldim, AA );
	form_recur_blocks( n, n, n, B, nn, nn, KDIM, bdim, ldim, BB );
	form_recur_blocks( n, n, n, C, nn, nn, KDIM, bdim, ldim, CC );

	for ( int j = 0; j < nn; j += bdim ) {
		// Determine number of columns in (i,j)th block of CC:
		// s is clipped to n, v to the padded dimension nn
		int s = (j + bdim > n) ? (n - j) : bdim;
		int v = (j + bdim > nn) ? (nn - j) : bdim;

		for ( int k = 0; k < nn; k += bdim ) {
			// Determine number of columns of AAik and rows of BBkj
			int t = (k + bdim > n) ? (n - k) : bdim;
			int w = (k + bdim > nn) ? (nn - k) : bdim;
			// Set pointer to block matrix BBkj
			double *BBkj = BB + k*v + j*ldim;

			for ( int i = 0; i < nn; i += bdim ) {
				// Determine number of rows in (i,j)th block of CC
				int r = (i + bdim > n) ? (n - i) : bdim;
				int u = (i + bdim > nn) ? (nn - i) : bdim;
				// Set pointers to block matrices AAik and CCij
				double *AAik = AA + i*w + k*ldim;
				double *CCij = CC + i*v + j*ldim;
				// Perform multiplication on recursive block matrices
				multiply_blk_ker( r, s, t, u, AAik, w, BBkj, u, CCij );
			}
		}
	}
	// Extract matrix C from recursive contiguous blocks
	unpack_recur_blocks( nn, nn, KDIM, bdim, ldim, CC, n, n, n, C );
	free( AA );
	free( BB );
	free( CC );
}

/*
 * Performs matrix multiplication and addition, C = C + A*B, using recursive
 * contiguous blocking to optimize memory access.  A, B and C are n-by-n
 * matrices stored in column-major order with leading dimension n.
 *
 * The working dimension nn is n rounded up to a multiple of KDIM.  A, B and
 * C are first copied by form_recur_blocks into nn-by-nn workspaces AA, BB
 * and CC, in which bdim-by-bdim blocks are stored contiguously and, within
 * each block, KDIM*KDIM sub-blocks are stored contiguously.  Block products
 * are delegated to multiply_rect_blk_ker, whose inner kernel takes its loop
 * bounds from run-time variables, and the result is copied back to C by
 * unpack_recur_blocks.
 *
 * In the loops below r, s and t are block extents clipped to the logical
 * dimension n and are the extents actually multiplied, while u, v and w are
 * the same extents clipped to the padded dimension nn; the padded extents
 * serve as block leading dimensions and in the block offsets.
 *
 *   n - order of the square matrices; nothing is done when n <= 0
 *   A - left operand, read only
 *   B - right operand, read only
 *   C - accumulator, updated in place
 *
 * The three workspaces are owned by this function and released before it
 * returns.  If they cannot be allocated, a message is written to stderr and
 * the program exits with EXIT_FAILURE, since the void interface has no way
 * to report the failure and continuing would dereference a null pointer.
 */
void mmult_rect_recur_block( int n, const double *A, const double *B, double *C )
{
	// Nothing to compute.  Returning here also keeps a zero-byte malloc,
	// which may legitimately return NULL, from being mistaken for a failure.
	if ( n <= 0 ) {
		return;
	}

	const int		nn = (n / KDIM) * KDIM + ((n % KDIM) ? KDIM : 0);
	const int		ldim = nn;
	const int		bdim = get_block_dim_mmult( ldim );
	const size_t	dim = (size_t) ldim;

	// Size the workspaces in size_t arithmetic: ldim*ldim evaluated as int
	// overflows (undefined behavior) once ldim exceeds 46340 with 32-bit int.
	if ( dim > ((size_t) -1 / sizeof(double)) / dim ) {
		fprintf( stderr,
			"mmult_rect_recur_block: workspace size overflows for n = %d\n", n );
		exit( EXIT_FAILURE );
	}
	const size_t	bytes = dim * dim * sizeof(double);

	double *AA = malloc( bytes );
	double *BB = malloc( bytes );
	double *CC = malloc( bytes );
	if ( AA == NULL || BB == NULL || CC == NULL ) {
		fprintf( stderr,
			"mmult_rect_recur_block: cannot allocate workspace for n = %d\n", n );
		free( AA );
		free( BB );
		free( CC );
		exit( EXIT_FAILURE );
	}

	// Repack the operands into recursive contiguous block storage
	form_recur_blocks( n, n, n, A, nn, nn, KDIM, bdim, ldim, AA );
	form_recur_blocks( n, n, n, B, nn, nn, KDIM, bdim, ldim, BB );
	form_recur_blocks( n, n, n, C, nn, nn, KDIM, bdim, ldim, CC );

	for ( int j = 0; j < nn; j += bdim ) {
		// Determine number of columns in (i,j)th block of CC:
		// s is clipped to n, v to the padded dimension nn
		int s = (j + bdim > n) ? (n - j) : bdim;
		int v = (j + bdim > nn) ? (nn - j) : bdim;

		for ( int k = 0; k < nn; k += bdim ) {
			// Determine number of columns of AAik and rows of BBkj
			int t = (k + bdim > n) ? (n - k) : bdim;
			int w = (k + bdim > nn) ? (nn - k) : bdim;
			// Set pointer to block matrix BBkj
			double *BBkj = BB + k*v + j*ldim;

			for ( int i = 0; i < nn; i += bdim ) {
				// Determine number of rows in (i,j)th block of CC
				int r = (i + bdim > n) ? (n - i) : bdim;
				int u = (i + bdim > nn) ? (nn - i) : bdim;
				// Set pointers to block matrices AAik and CCij
				double *AAik = AA + i*w + k*ldim;
				double *CCij = CC + i*v + j*ldim;
				// Perform multiplication on recursive block matrices
				multiply_rect_blk_ker( r, s, t, u, AAik, w, BBkj, u, CCij );
			}
		}
	}
	// Extract matrix C from recursive contiguous blocks
	unpack_recur_blocks( nn, nn, KDIM, bdim, ldim, CC, n, n, n, C );
	free( AA );
	free( BB );
	free( CC );
}

/*
 * Wrapper around the BLAS routine DGEMM for the n-by-n case.  Both operands
 * are passed untransposed ('N'), n is used for every extent and leading
 * dimension, and the same scalar 1.0 is passed for both the alpha and beta
 * arguments, which per the BLAS definition of DGEMM yields C = A*B + C.
 *
 *   n - order of the square matrices
 *   A - left operand, read only
 *   B - right operand, read only
 *   C - accumulator, updated in place
 */
void mmult_blas( int n, const double *A, const double *B, double *C )
{
	const double	unit = 1.0;		// Passed as both alpha and beta
	const char		op = 'N';		// Neither operand is transposed

	// Arguments are passed by address, as the Fortran interface requires
	dgemm_( &op, &op, &n, &n, &n, &unit, A, &n, B, &n, &unit, C, &n );
}

