/*
 * Algorithms implementing parallel matrix multiplication (and addition), 
 * C = C + A*B, using the MPI (Message-Passing Interface) library.  Includes 
 * functions that facilitate interprocess communication, and Fox's algorithm 
 * for parallel matrix multiplication.
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <mpi.h>

#include "matmultp.h"

// Helpers that convert between conventional column-major storage and the
// block-contiguous layout used for the scatter/gather transfers.  Both are
// defined later in this file.
static void form_contig_blocks( int m, int n, int ldimE, const double *E, 
	int mm, int nn, int bdim, int ldimF, double *F );
static void unpack_contig_blocks( int mm, int nn, int bdim, int ldimE,
	const double *E, int m, int n, int ldimF, double *F  );

// Derived datatype made of bdim contiguous doubles.  It is created and
// committed in parallel_matrix_multiply and used as the transfer unit in
// every scatter, gather, broadcast and send/receive in this file.
static MPI_Datatype contig;
	
// Block-contiguous copies of the full matrices A, B and C.  Allocated in
// scatter_blocks and released in gather_blocks, in both cases only on the
// process whose grid rank is 0.
static double	*AA, *BB, *CC;
// Per-process bdim-by-bdim blocks of A, B and C.  Allocated on every process
// in scatter_blocks and released in gather_blocks.
static double	*X, *Y, *Z;

/******************************************************************************/

/*
 * Packs the m-by-n matrix E into the mm-by-nn block-contiguous array F.
 *
 * E is stored in column-major order with leading dimension ldimE (rows of E).
 * F is partitioned into bdim-by-bdim blocks; blocks are laid out one after
 * another in row-major block order, and the elements inside each block are
 * stored in column-major order.  ldimF is the number of columns of F.
 *
 * Blocks that extend past the edge of E (partially, or entirely when
 * mm > m + bdim or nn > n + bdim) are zero-filled first, so every element of
 * F that has no counterpart in E ends up as 0.0.
 */
void form_contig_blocks( int m, int n, int ldimE, const double *E, 
	int mm, int nn, int bdim, int ldimF, double *F )
{
	for ( int i = 0; i < mm; i += bdim ) {
		// r: rows of E covered by this block row, p: rows of the block in F.
		// A block row lying completely below E would give m - i < 0, which
		// must not reach memcpy as a (huge, unsigned) byte count.
		int r = (i + bdim > m) ? (m - i) : bdim;
		if ( r < 0 ) {
			r = 0;
		}
		const int p = (i + bdim > mm) ? (mm - i) : bdim;

		for ( int j = 0; j < nn; j += bdim ) {
			// s: columns of E covered by this block, q: columns of the block
			int s = (j + bdim > n) ? (n - j) : bdim;
			if ( s < 0 ) {
				s = 0;
			}
			const int q = (j + bdim > nn) ? (nn - j) : bdim;

			// Start of block (i,j) inside F
			double *Fij = F + (size_t)i*ldimF + (size_t)j*p;

			// Clear fringe blocks by setting elements to zero
			if ( s != q || r != p ) {
				memset( Fij, 0, (size_t)p*q*sizeof(double) );
			}
			// Pure padding block: nothing to copy from E
			if ( r == 0 || s == 0 ) {
				continue;
			}
			// Copy the block one column at a time
			const double *Eij = E + (size_t)j*ldimE + i;
			for ( int k = 0; k < s; k++ ) {
				memcpy( Fij + (size_t)k*p, Eij + (size_t)k*ldimE,
					(size_t)r*sizeof(double) );
			}
		}
	}
}

/*
 * Unpacks the mm-by-nn block-contiguous array E into the m-by-n matrix F.
 *
 * E holds bdim-by-bdim blocks one after another in row-major block order,
 * with the elements of each block stored in column-major order; ldimE is the
 * number of columns of E.  F is written in conventional column-major order
 * with leading dimension ldimF (rows of F).
 *
 * Only the elements that exist in F are copied.  Blocks of E lying partly or
 * entirely outside the m-by-n extent (padding) are ignored.
 */
void unpack_contig_blocks( int mm, int nn, int bdim, int ldimE,
	const double *E, int m, int n, int ldimF, double *F  )
{	
	for ( int i = 0; i < mm; i += bdim ) {
		// r: rows of F covered by this block row, p: rows of the block in E.
		// A block row lying completely below F would give m - i < 0, which
		// must not reach memcpy as a (huge, unsigned) byte count.
		int r = (i + bdim > m) ? (m - i) : bdim;
		if ( r <= 0 ) {
			continue;
		}
		const int p = (i + bdim > mm) ? (mm - i) : bdim;

		for ( int j = 0; j < nn; j += bdim ) {
			// s: columns of F covered by this block
			const int s = (j + bdim > n) ? (n - j) : bdim;
			if ( s <= 0 ) {
				continue;
			}
			// Copy the block one column at a time
			const double *Eij = E + (size_t)i*ldimE + (size_t)j*p;
			double *Fij = F + (size_t)j*ldimF + i;
			for ( int k = 0; k < s; k++ ) {
				memcpy( Fij + (size_t)k*ldimF, Eij + (size_t)k*p,
					(size_t)r*sizeof(double) );
			}
		}
	}
}

/******************************************************************************/

/*
 * Fills the m-by-n matrix E (column-major, leading dimension m) with
 * pseudo-random values.  Each element is alpha * drand48() - alpha/2, so
 * values produced by drand48() are scaled by alpha and shifted down by half
 * of alpha.  Elements are generated column by column, top to bottom.
 */
void create_random_matrix( double alpha, int m, int n, double *E )
{
	for ( int col = 0; col < n; col++ ) {
		const int first = col*m;		// Index of element E(0,col)
		int row = 0;
		while ( row < m ) {
			E[first + row] = alpha * drand48() - (0.50 * alpha);
			row++;
		}
	}
}

/*
 * Zeroes every element of the m-by-n matrix E, which is stored in
 * column-major order with leading dimension m.  The matrix is cleared one
 * column at a time.
 */
void clear_matrix( int m, int n, double *E )
{
	const size_t col_bytes = m*sizeof(double);	// Bytes in one column

	int col = 0;
	while ( col < n ) {
		memset( &E[col*m], 0, col_bytes );
		col++;
	}
}

/*
 * Copies the m-by-n matrix E into F.  Both matrices are stored in
 * column-major order with leading dimension m, and the copy proceeds one
 * column at a time.  E and F must not overlap, since memcpy is used.
 */
void copy_matrix( int m, int n, const double *E, double *F )
{
	const size_t col_bytes = m*sizeof(double);	// Bytes in one column

	for ( int col = 0; col < n; col++ ) {
		const int first = col*m;				// Offset of element (0,col)
		memcpy( F + first, E + first, col_bytes );
	}
}

/*
 * Builds the square q-by-q Cartesian process grid used for collective
 * communication and records it in *grid:
 *
 *   p         number of processes in MPI_COMM_WORLD
 *   q         (int) sqrt(p), the number of block rows and block columns
 *   comm      Cartesian communicator over the q-by-q grid
 *   rank      rank of the calling process in comm
 *   row, col  Cartesian coordinates of the calling process in comm
 *   row_comm  sub-communicator in which the column coordinate varies
 *   col_comm  sub-communicator in which the row coordinate varies
 *
 * The number of processes is assumed to be a perfect square; no check is
 * made here.
 */
void setup_mpi_grid( struct mpi_grid *grid )
{
	enum { NDIMS = 2 };				// Number of grid dimensions

	int	world_rank;
	int	coords[NDIMS];

	MPI_Comm_size( MPI_COMM_WORLD, &grid->p );
	MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );

	// Assume p is a perfect square
	grid->q = (int) sqrt( grid->p );

	int	extent[NDIMS] = { grid->q, grid->q };	// Row blocks, column blocks
	int	periodic[NDIMS] = { 0, 1 };		// Only the second dimension wraps
	const int	allow_reorder = 1;		// MPI may renumber the processes

	// Create the Cartesian communicator, then look up where this process
	// sits inside it
	MPI_Cart_create( MPI_COMM_WORLD, NDIMS, extent, periodic, allow_reorder,
		&grid->comm );
	MPI_Comm_rank( grid->comm, &grid->rank );
	MPI_Cart_coords( grid->comm, grid->rank, NDIMS, coords );
	grid->row = coords[0];
	grid->col = coords[1];

	// Row communicator: row coordinate fixed, column coordinate varies
	int	keep_column_dim[NDIMS] = { 0, 1 };
	MPI_Cart_sub( grid->comm, keep_column_dim, &grid->row_comm );

	// Column communicator: row coordinate varies, column coordinate fixed
	int	keep_row_dim[NDIMS] = { 1, 0 };
	MPI_Cart_sub( grid->comm, keep_row_dim, &grid->col_comm );
}

/*
 * The full matrices stored on the root processor are partitioned into square
 * blocks corresponding to the processes in the Cartesian grid.  The root 
 * processor sends (scatters) the blocks to their respective processors 
 * where (block) matrix multiplication is performed. 
 */
void scatter_blocks( int bdim, int n, 
		const double *A, const double *B, double *C, struct mpi_grid *grid )	
{
	const int	nn = bdim * grid->q; 
	const int	ldim = nn;
	const long	blk_sz = bdim * bdim;

	// On root processor copy matrix blocks into array so that elements are
	// stored contiguously to facilitate collective communication
	if ( grid->rank == 0 ) {
		AA = (double *) malloc( grid->p*blk_sz*sizeof(double) );
		form_contig_blocks( n, n, n, A, nn, nn, bdim, ldim, AA );
		BB = (double *) malloc( grid->p*blk_sz*sizeof(double) );
		form_contig_blocks( n, n, n, B, nn, nn, bdim, ldim, BB );
		CC = (double *) malloc( grid->p*blk_sz*sizeof(double) );
		form_contig_blocks( n, n, n, C, nn, nn, bdim, ldim, CC );
	}

	// Allocate memory for matrix blocks on processors
	X = (double *) malloc( blk_sz*sizeof(double) ); 
	Y = (double *) malloc( blk_sz*sizeof(double) );
	Z = (double *) malloc( blk_sz*sizeof(double) );

	// Distribute matrix blocks across processors in communicator
	MPI_Scatter( AA, bdim, contig, X, bdim, contig, 0, grid->comm );
	MPI_Scatter( BB, bdim, contig, Y, bdim, contig, 0, grid->comm );
	MPI_Scatter( CC, bdim, contig, Z, bdim, contig, 0, grid->comm );
}

/*
 * Collects the bdim-by-bdim result blocks Z from every process of the
 * Cartesian grid onto the process with grid rank 0, and there unpacks them
 * into the n-by-n column-major matrix C, which then holds the result of
 * C = C + A*B.
 *
 * All buffers allocated by scatter_blocks are released here: X, Y and Z on
 * every process, and AA, BB and CC on grid rank 0.  The file-scope pointers
 * are reset to NULL after being freed so that a stale pointer can never be
 * freed or dereferenced a second time.
 */
void gather_blocks( int bdim, int n, double *C, struct mpi_grid *grid )	
{
	const int	nn = bdim * grid->q; 
	const int	ldim = nn;

	// Collect matrix blocks from processors in communicator
	MPI_Gather( Z, bdim, contig, CC, bdim, contig, 0, grid->comm );

	// The local blocks are no longer needed once Z has been sent
	free( X );
	free( Y );
	free( Z );
	X = NULL;
	Y = NULL;
	Z = NULL;

	// On root processor copy contiguous matrix blocks into array where elements 
	// are stored in column-major order
	if ( grid->rank == 0 ) {	
		unpack_contig_blocks( nn, nn, bdim, ldim, CC, n, n, n, C );
		free( AA );
		free( BB );
		free( CC );
		AA = NULL;
		BB = NULL;
		CC = NULL;
	}
}

/*
 * Computes C = C + A*B with the column-oriented SAXPY formulation (jki loop
 * order).  A is m-by-p, B is p-by-n and C is m-by-n; all three are stored in
 * column-major order with leading dimensions ldimA, ldimB and ldimC.
 *
 * For every column j of C and every index k, column k of A scaled by B(k,j)
 * is added to column j of C.
 */
void multiply_matrix( int m, int n, int p, int ldimA, const double *A, 
	int ldimB, const double *B, int ldimC, double *C ) 	
{
	for ( int col = 0; col < n; col++ ) {
		const int b_col = col*ldimB;			// Offset of element B(0,col)
		const int c_col = col*ldimC;			// Offset of element C(0,col)

		for ( int inner = 0; inner < p; inner++ ) {
			const int a_col = inner*ldimA;		// Offset of element A(0,inner)
			const double scale = B[b_col + inner];	// Element B(inner,col)

			// C(row,col) += A(row,inner) * B(inner,col)
			for ( int row = 0; row < m; row++ ) {
				C[c_col + row] += A[a_col + row] * scale;
			}
		}
	}
}

/*
 * Computes C = C + A*B by tiling the operands into square blocks of
 * dimension BDIM and handing each triple of blocks to multiply_matrix (the
 * unblocked SAXPY kernel).  A is m-by-p, B is p-by-n and C is m-by-n; all
 * are stored in column-major order with leading dimensions ldimA, ldimB and
 * ldimC.  Blocks on the right and bottom edges are narrower when a dimension
 * is not a multiple of BDIM.
 *
 * BDIM is not defined in this file (presumably it comes from matmultp.h).
 */
void blocked_matrix_multiply ( int m, int n, int p, int ldimA, const double *A, 
	int ldimB, const double *B, int ldimC, double *C ) 	
{
	const int tile = BDIM;

	for ( int col0 = 0; col0 < n; col0 += tile ) {
		// Columns in the current block column of B and C
		int ncols = n - col0;
		if ( ncols > tile ) {
			ncols = tile;
		}

		for ( int mid0 = 0; mid0 < p; mid0 += tile ) {
			// Columns of the A block, which equals rows of the B block
			int nmid = p - mid0;
			if ( nmid > tile ) {
				nmid = tile;
			}
			// Block of B starting at B(mid0,col0)
			const double *b_blk = B + mid0 + col0*ldimB;

			for ( int row0 = 0; row0 < m; row0 += tile ) {
				// Rows in the current block row of A and C
				int nrows = m - row0;
				if ( nrows > tile ) {
					nrows = tile;
				}
				// Blocks starting at A(row0,mid0) and C(row0,col0)
				const double *a_blk = A + row0 + mid0*ldimA;
				double *c_blk = C + row0 + col0*ldimC;

				// c_blk += a_blk * b_blk
				multiply_matrix( nrows, ncols, nmid,
					ldimA, a_blk, ldimB, b_blk, ldimC, c_blk );
			}
		}
	}
}

/*
 * Serial reference implementation of C = C + A*B for n-by-n matrices stored
 * in column-major order with leading dimension n.  The work is delegated to
 * blocked_matrix_multiply.
 */
void serial_matrix_multiply( int n, const double *A, const double *B, double *C ) 	
{
	// Square, densely stored operands: every extent and every leading
	// dimension equals n
	blocked_matrix_multiply( n, n, n, n, A, n, B, n, C );
}

/*
 * Implements the memory efficient Fox algorithm for parallel matrix 
 * multiplication.  The number of stages is equal to the square root of the 
 * number of processes i.e., the number of row/ column blocks.  For each stage
 * a block matrix Aik is broadcast across processes in each row communicator,
 * block matrix multiplication, Cij = Cij + Aik*Bkj, is performed, and block
 * matrices Bkj are rolled between processes within each column communicator.
 *
 * n is the dimension of the local blocks.  A, B and C are the n-by-n blocks
 * held by the calling process, stored in column-major order with leading
 * dimension n.  A is only read, B is shifted through the column communicator
 * once per stage, and C accumulates the result.  The derived datatype contig
 * is used as the transfer unit, with n items per block.
 *
 * A temporary n-by-n block is allocated for received copies of Aik and is
 * released before returning.  The program is aborted through MPI_Abort if
 * that allocation fails.
 */
void fox_matrix_multiply( const int n, 
		double *A, double *B, double *C, struct mpi_grid *grid )	
{
	const int		ldim = n;
	// Widen before multiplying so the element count cannot overflow int
	const size_t	blk_sz = (size_t)n * (size_t)n;

	MPI_Status		stat;

	// Allocate memory for temporary matrix block
	double *T = malloc( blk_sz * sizeof *T );
	if ( T == NULL && blk_sz > 0 ) {
		fprintf( stderr, 
			"fox_matrix_multiply: failed to allocate %zu doubles\n", blk_sz );
		MPI_Abort( grid->comm, EXIT_FAILURE );
	}

	// Determine row index of source processor from which block matrix is 
	// received, and row index of destination processor to which block matrix
	// is sent, for next block matrix multiplication operation
	const int src = (grid->row + 1) % grid->q;
	const int dest = (grid->row + grid->q - 1) % grid->q;

	// Number of stages = number row blocks = number of column blocks.
	// For each stage broadcast block matrix Aik across processors in row 
	// communicator, perform block matrix multiplication, Cij = Cij + Aik*Bkj,
	// and then roll block matrices Bkj between processors within column
	// communicator.
	for ( int stage = 0; stage < grid->q; stage++ ) {
		const int bcast_root = (grid->row + stage) % grid->q;
		// The broadcasting process sends its own block A; every other
		// process in the row receives the block into T
		double *Aik = ( bcast_root == grid->col ) ? A : T;

		MPI_Bcast( Aik, n, contig, bcast_root, grid->row_comm );
		blocked_matrix_multiply( n, n, n, ldim, Aik, ldim, B, ldim, C );

		MPI_Sendrecv_replace( B, n, contig, dest, 0, src, 0, 
			grid->col_comm, &stat );
	}

	// Release the temporary block (previously leaked on every call)
	free( T );
}

/*
 * Perform matrix multiplication (and addition), C = C + A*B, using parallel
 * programming with MPI.  First, matrices on the root processor are partitioned
 * into blocks and sent to processes in the Cartesian grid topology.  Then, the
 * Fox algorithm performs (block) matrix multiplication.  Finally, the root
 * processor receives the resulting block matrices from the processes and 
 * assembles the resulting matrix C.
 *
 * A, B and C are n-by-n matrices in column-major order; they are only
 * accessed on the process whose grid rank is 0.  The block dimension is n/q
 * rounded up, so the matrices are padded with zeros when n is not a
 * multiple of q.
 *
 * The derived datatype used for the transfers is created at the start of the
 * call and freed again before returning, so repeated calls do not accumulate
 * committed datatypes.
 */
void parallel_matrix_multiply( int n, 
		const double *A, const double *B, double *C, struct mpi_grid *grid )	
{
	// Dimension of matrix blocks: n/q rounded up
	const int	bdim = (n / grid->q) + ((n % grid->q ? 1 : 0));

	// Create and commit MPI derived datatype describing bdim doubles
	MPI_Type_contiguous( bdim, MPI_DOUBLE, &contig );
	MPI_Type_commit( &contig );

	scatter_blocks( bdim, n, A, B, C, grid );

	fox_matrix_multiply( bdim, X, Y, Z, grid );

	gather_blocks( bdim, n, C, grid );

	// Release the derived datatype now that all communication is complete
	MPI_Type_free( &contig );
}

