#if !defined(MATMULTP_H_)
	#define MATMULTP_H_ 1

#if defined(__cplusplus)
	extern "C" {
#endif

// Number of matrices of varying sizes for which performance is measured.
#define SIZES 11
// Column of the input data file that contains the performance data for the
// serial algorithm.
// NOTE(review): whether this column index is 0- or 1-based is not visible
// from this header — confirm against the code that reads the file.
#define COLINSER 6
// Blocking parameter, i.e., block size = BDIM-by-BDIM.
#define BDIM 96

	// Bundle of MPI communicators and integer coordinates that is passed by
	// pointer to the grid-aware routines declared in this header.
	//
	// NOTE(review): MPI_Comm is used here but no #include is visible in this
	// header, so <mpi.h> presumably has to be included before it — confirm.
	//
	// The field meanings below are inferred from the names only — TODO confirm
	// against the implementation of setup_mpi_grid:
	//   comm               - presumably the communicator spanning the whole grid
	//   row_comm, col_comm - presumably communicators for this process's grid
	//                        row and grid column
	//   p, q               - presumably the dimensions of the process grid
	//   row, col           - presumably this process's coordinates in the grid
	//   rank               - presumably this process's rank
	struct mpi_grid {
		MPI_Comm	comm;
		MPI_Comm	row_comm;
		MPI_Comm	col_comm;
		int			p;
		int			q;
		int			row;
		int			col;
		int			rank;
	};

	// Takes a pointer to a writable struct mpi_grid and returns nothing.
	// Presumably initializes the communicators and coordinates stored in *grid;
	// the implementation is not part of this header — TODO confirm.
	void setup_mpi_grid( struct mpi_grid *grid );

	// A and B are read-only inputs (const-qualified); C and *grid are writable.
	// Presumably distributes blocks of dimension bdim taken from n-by-n
	// matrices among the processes of the grid — TODO confirm the exact layout
	// and which process owns the full matrices against the implementation.
	void scatter_blocks( int bdim, int n, 
		const double *A, const double *B, double *C, struct mpi_grid *grid );

	// C and *grid are writable; nothing is returned.
	// Presumably the counterpart of scatter_blocks: collects blocks of
	// dimension bdim from the grid's processes into an n-by-n matrix C —
	// TODO confirm against the implementation.
	void gather_blocks( int bdim, int n, double *C, struct mpi_grid *grid );

	// Writes through E, the only pointer argument, and returns nothing.
	// Presumably fills an m-by-n matrix E with random values.
	// NOTE(review): the role of alpha (e.g. scale or range of the values) is
	// not visible from this header — confirm against the implementation.
	void create_random_matrix( double alpha, int m, int n, double *E );

	// Writes through E and returns nothing.
	// Presumably sets every entry of the m-by-n matrix E to zero — TODO confirm.
	void clear_matrix( int m, int n, double *E );

	// E is a read-only input (const-qualified); F is writable.
	// Presumably copies the m-by-n matrix E into F — TODO confirm whether the
	// two buffers are allowed to overlap.
	void copy_matrix( int m, int n, const double *E, double *F );

	// A and B are read-only inputs (const-qualified); C is writable.
	// ldimA, ldimB and ldimC are presumably the leading dimensions (storage
	// strides) of A, B and C, which would allow operating on sub-blocks of
	// larger matrices — TODO confirm.
	// NOTE(review): which of m, n, p is the shared inner dimension, and whether
	// C is overwritten or accumulated into, is not visible from this header.
	void multiply_matrix( int m, int n, int p, int ldimA, const double *A, 
		int ldimB, const double *B, int ldimC, double *C );

	// Same parameter list as multiply_matrix: A and B are read-only inputs
	// (const-qualified); C is writable.
	// Presumably a cache-blocked variant that works on BDIM-by-BDIM blocks —
	// TODO confirm against the implementation.
	// NOTE(review): as with multiply_matrix, the roles of m, n, p and whether
	// C is overwritten or accumulated into are not visible from this header.
	void blocked_matrix_multiply ( int m, int n, int p, int ldimA, 
		const double *A, int ldimB, const double *B, int ldimC, double *C );

	// A and B are read-only inputs (const-qualified); C is writable.
	// Presumably the single-process multiplication of n-by-n matrices A and B
	// with the result stored in C — TODO confirm.
	void serial_matrix_multiply( int n, 
		const double *A, const double *B, double *C );

	// Presumably multiplies matrices using Fox's algorithm on the process grid
	// described by *grid — TODO confirm.
	// Unlike parallel_matrix_multiply, A and B are not const-qualified here, so
	// callers should assume the implementation may modify them.
	// NOTE(review): whether n is the global matrix order or the order of the
	// local block held by each process is not visible from this header.
	void fox_matrix_multiply( int n, 
		double *A, double *B, double *C, struct mpi_grid *grid );

	// A and B are read-only inputs (const-qualified); C and *grid are writable.
	// Presumably the top-level parallel multiplication of n-by-n matrices over
	// the process grid described by *grid, with the result stored in C —
	// TODO confirm against the implementation.
	void parallel_matrix_multiply( int n, 
		const double *A, const double *B, double *C, struct mpi_grid *grid );

#if defined(__cplusplus)
	}
#endif

#endif
