#ifndef MATRIX_H
#define MATRIX_H

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>
#include <valarray>
using namespace std;
#include <omp.h>

// Select at most one BLAS back-end below (or on the compiler command line).
// CBLAS and GSL CBLAS must not be used together: both declare constants
// such as CblasNoTrans at global scope (C has no namespaces), so they conflict.

//#define CBLAS
// or use
// #define GSL_CBLAS
// or
//#define MKL

#ifdef CBLAS
#include <cblas.h>
#endif

#ifdef GSL_CBLAS
#include <gsl/gsl_blas.h>
#endif

#ifdef MKL
#include <mkl.h>
#endif

// define your own namespace
namespace mylib {

// short alias for the unsigned type used for matrix dimensions
typedef unsigned int u32;

/**
 * Generic dense matrix implementation based on valarray.
 * Coefficients are stored row by row: element (r, c) is found at
 * index r * cols + c of the underlying valarray.
 */
template<class T>
class Matrix {
protected:
	/**
	 * coefficients of the matrix, stored row by row
	 */
	std::valarray<T> data;
	/**
	 * number of rows of the matrix
	 */
	u32 rows;
	/**
	 * number of columns of the matrix
	 */
	u32 cols;

private:
	/**
	 * number of coefficients of an r x c matrix, computed in size_t
	 * so that the product does not wrap around in 32-bit arithmetic
	 */
	static std::size_t element_count(u32 r, u32 c) {
		return static_cast<std::size_t>(r) * c;
	}

	/**
	 * index inside data of the coefficient at row r and column c
	 */
	std::size_t offset(u32 r, u32 c) const {
		return static_cast<std::size_t>(r) * cols + c;
	}

	/**
	 * give the matrix the dimensions r x c. Storage is only resized
	 * (which resets every coefficient) when the number of
	 * coefficients changes.
	 */
	void reshape(u32 r, u32 c) {
		const std::size_t size = element_count(r, c);
		if (data.size() != size) {
			data.resize(size);
		}
		rows = r;
		cols = c;
	}

	/**
	 * tell whether the result matrix c is the same object as one of
	 * the operands a or b
	 */
	static bool shares_storage(const Matrix<T>& c, const Matrix<T>& a,
			const Matrix<T>& b) {
		return (&c == &a) || (&c == &b);
	}

	/**
	 * throw std::logic_error if a and b do not have the same dimensions
	 * @param caller name of the calling function, used as message prefix
	 */
	static void require_same_dims(const char* caller, const Matrix<T>& a,
			const Matrix<T>& b) {
		if ((a.rows != b.rows) || (a.cols != b.cols)) {
			throw std::logic_error(std::string(caller)
				+ ": first and second operand have different dimensions");
		}
	}

	/**
	 * throw std::logic_error if the product a * b is not defined
	 * @param caller name of the calling function, used as message prefix
	 */
	static void require_product_dims(const char* caller, const Matrix<T>& a,
			const Matrix<T>& b) {
		if (a.cols != b.rows) {
			throw std::logic_error(std::string(caller)
				+ ": first and second operand don't have dimensions that matches");
		}
	}

	/**
	 * scalar product of row i of a with column j of b
	 */
	static T row_times_column(const Matrix<T>& a, const Matrix<T>& b,
			u32 i, u32 j) {
		T sum = static_cast<T>(0);
		for (u32 k = 0; k < a.cols; ++k) {
			sum += a.data[a.offset(i, k)] * b.data[b.offset(k, j)];
		}
		return sum;
	}

	/**
	 * add to the tile [i0, i1) x [j0, j1) of the result the
	 * contribution of the whole inner dimension
	 * @param cd coefficients of the result (row length b_cols)
	 * @param ad coefficients of the first operand (row length inner)
	 * @param bd coefficients of the second operand (row length b_cols)
	 */
	static void accumulate_tile(T* cd, const T* ad, const T* bd,
			std::size_t i0, std::size_t i1, std::size_t j0, std::size_t j1,
			std::size_t inner, std::size_t b_cols) {
		for (std::size_t k = 0; k < inner; ++k) {
			for (std::size_t ii = i0; ii < i1; ++ii) {
				const T a_ik = ad[ii * inner + k];
				for (std::size_t jj = j0; jj < j1; ++jj) {
					cd[ii * b_cols + jj] += a_ik * bd[k * b_cols + jj];
				}
			}
		}
	}

public:
	/**
	 * constructor given number of rows and columns, coefficients
	 * are value-initialized
	 * @param r number of rows
	 * @param c number of columns
	 */
	Matrix(u32 r, u32 c) : data(element_count(r, c)), rows(r), cols(c) {
	}

	/**
	 * constructor for square matrix given dimension. Left
	 * non-explicit so that existing callers keep compiling.
	 * @param r dimension of the square matrix
	 */
	Matrix(u32 r) : Matrix(r, r) {
	}

	/**
	 * copy constructor
	 * @param obj existing matrix
	 */
	Matrix(const Matrix<T>& obj)
		: data(obj.data), rows(obj.rows), cols(obj.cols) {
	}

	/**
	 * move constructor, takes over the coefficients of obj and
	 * leaves obj with dimensions 0 x 0
	 * @param obj matrix to move from
	 */
	Matrix(Matrix<T>&& obj) noexcept
		: data(std::move(obj.data)), rows(obj.rows), cols(obj.cols) {
		obj.rows = 0;
		obj.cols = 0;
	}

	/**
	 * redefinition of assignment operator
	 * @param obj existing matrix
	 * @return reference to this matrix
	 */
	Matrix<T>& operator=(const Matrix<T>& obj) {
		if (&obj != this) {
			rows = obj.rows;
			cols = obj.cols;
			data = obj.data;
		}
		return *this;
	}

	/**
	 * move assignment, takes over the coefficients of obj and
	 * leaves obj with dimensions 0 x 0
	 * @param obj matrix to move from
	 * @return reference to this matrix
	 */
	Matrix<T>& operator=(Matrix<T>&& obj) noexcept {
		if (&obj != this) {
			data = std::move(obj.data);
			rows = obj.rows;
			cols = obj.cols;
			obj.rows = 0;
			obj.cols = 0;
		}
		return *this;
	}

	/**
	 * default destructor
	 */
	~Matrix() = default;

	/**
	 * @return number of rows of the matrix
	 */
	u32 get_rows() const {
		return rows;
	}

	/**
	 * @return number of columns of the matrix
	 */
	u32 get_cols() const {
		return cols;
	}

	/**
	 * fill matrix with a unique value
	 * @param v value that is used to fill matrix
	 */
	void fill(T v) {
		// valarray assignment from a scalar sets every coefficient
		data = v;
	}

	/**
	 * return reference to data
	 * @return a reference to the valarray that stores the data
	 */
	std::valarray<T>& get_data() {
		return data;
	}

	/**
	 * return read-only reference to data
	 * @return a const reference to the valarray that stores the data
	 */
	const std::valarray<T>& get_data() const {
		return data;
	}

	/**
	 * get access to nth element of the valarray, no bounds check
	 * is performed
	 * @param n index in row-major order
	 */
	T& operator[](int n) {
		return data[n];
	}

	/**
	 * read-only access to nth element of the valarray, no bounds
	 * check is performed
	 * @param n index in row-major order
	 */
	const T& operator[](int n) const {
		return data[n];
	}

	/**
	 * function used to print matrix in a visual format: one line per
	 * row, each coefficient in a field of width 10 followed by a space.
	 * this function is used by the operator<<
	 * @param out output stream
	 * @return output stream
	 */
	std::ostream& print(std::ostream& out) const {
		for (u32 y = 0; y < rows; ++y) {
			for (u32 x = 0; x < cols; ++x) {
				out << std::setw(10) << data[offset(y, x)] << " ";
			}
			out << '\n';
		}
		return out;
	}

	/**
	 * friend function used to print matrix
	 * @param out output stream
	 * @param obj matrix to print
	 * @return output stream
	 */
	friend std::ostream& operator<<(std::ostream& out, const Matrix<T>& obj) {
		return obj.print(out);
	}

	/**
	 * friend function to compute c = a + b using the transform
	 * algorithm. c takes the dimensions of a and may be the same
	 * object as a or b.
	 * @param c matrix that results from c = a + b
	 * @param a first operand
	 * @param b second operand
	 * @throws std::logic_error if a and b have different dimensions
	 */
	friend void sum_impl1(Matrix<T>& c, const Matrix<T>& a, const Matrix<T>& b) {
		Matrix<T>::require_same_dims("sum_impl1", a, b);
		c.reshape(a.rows, a.cols);

		const std::size_t count = c.data.size();
		if (count == 0) {
			return;
		}

		// read both operands directly so that the result stays correct
		// when c is the same object as a or b
		const T* lhs = &a.data[0];
		const T* rhs = &b.data[0];
		std::transform(lhs, lhs + count, rhs, &c.data[0], std::plus<T>());
	}

	/**
	 * friend function to compute c = a + b using the valarray +
	 * operator. c takes the dimensions of a.
	 * @param c matrix that results from c = a + b
	 * @param a first operand
	 * @param b second operand
	 * @throws std::logic_error if a and b have different dimensions
	 */
	friend void sum_impl2(Matrix<T>& c, const Matrix<T>& a, const Matrix<T>& b) {
		Matrix<T>::require_same_dims("sum_impl2", a, b);
		c.reshape(a.rows, a.cols);
		c.data = a.data + b.data;
	}

	/**
	 * friend function to compute c = a * b using the classic
	 * product implementation
	 * @param c matrix that results from c = a * b
	 * @param a first operand
	 * @param b second operand
	 * @throws std::logic_error if a.cols differs from b.rows
	 */
	friend void mul_impl1(Matrix<T>& c, const Matrix<T>& a, const Matrix<T>& b) {
		Matrix<T>::require_product_dims("mul_impl1", a, b);

		if (Matrix<T>::shares_storage(c, a, b)) {
			// writing into an operand would corrupt coefficients that
			// are still needed, so compute into a temporary
			Matrix<T> result(a.rows, b.cols);
			mul_impl1(result, a, b);
			c = std::move(result);
			return;
		}

		c.reshape(a.rows, b.cols);

		for (u32 i = 0; i < a.rows; ++i) {
			for (u32 j = 0; j < b.cols; ++j) {
				c.data[c.offset(i, j)] = Matrix<T>::row_times_column(a, b, i, j);
			}
		}
	}

	/**
	 * friend function to compute c = a * b using the classic
	 * product implementation with parallelization of first
	 * loop by openmp
	 * @param c matrix that results from c = a * b
	 * @param a first operand
	 * @param b second operand
	 * @throws std::logic_error if a.cols differs from b.rows
	 */
	friend void mul_impl2(Matrix<T>& c, const Matrix<T>& a, const Matrix<T>& b) {
		Matrix<T>::require_product_dims("mul_impl2", a, b);

		if (Matrix<T>::shares_storage(c, a, b)) {
			// see mul_impl1: never overwrite an operand in place
			Matrix<T> result(a.rows, b.cols);
			mul_impl2(result, a, b);
			c = std::move(result);
			return;
		}

		c.reshape(a.rows, b.cols);

		// each iteration of the outer loop writes a distinct row of c
		#pragma omp parallel for
		for (u32 i = 0; i < a.rows; ++i) {
			for (u32 j = 0; j < b.cols; ++j) {
				c.data[c.offset(i, j)] = Matrix<T>::row_times_column(a, b, i, j);
			}
		}
	}

	/**
	 * friend function to compute c = a * b using the tiling
	 * technique with tiles of 32 x 32 coefficients of c. Dimensions
	 * do not need to be a multiple of the tile size: tiles on the
	 * border are simply smaller.
	 * @param c matrix that results from c = a * b
	 * @param a first operand
	 * @param b second operand
	 * @throws std::logic_error if a.cols differs from b.rows
	 */
	friend void mul_impl3(Matrix<T>& c, const Matrix<T>& a, const Matrix<T>& b) {
		Matrix<T>::require_product_dims("mul_impl3", a, b);

		if (Matrix<T>::shares_storage(c, a, b)) {
			// see mul_impl1: never overwrite an operand in place
			Matrix<T> result(a.rows, b.cols);
			mul_impl3(result, a, b);
			c = std::move(result);
			return;
		}

		c.reshape(a.rows, b.cols);
		// tiles are accumulated, so start from zero
		c.data = static_cast<T>(0);
		if ((c.data.size() == 0) || (a.cols == 0)) {
			return;
		}

		const u32 BLOCK_SIZE = 32;
		T* cd = &c.data[0];
		const T* ad = &a.data[0];
		const T* bd = &b.data[0];

		for (u32 i = 0; i < a.rows; i += BLOCK_SIZE) {
			const u32 i_end = std::min(i + BLOCK_SIZE, a.rows);
			for (u32 j = 0; j < b.cols; j += BLOCK_SIZE) {
				const u32 j_end = std::min(j + BLOCK_SIZE, b.cols);
				Matrix<T>::accumulate_tile(cd, ad, bd, i, i_end, j, j_end,
					a.cols, b.cols);
			}
		}
	}

	/**
	 * friend function to compute c = a * b using the tiling
	 * technique and parallelization of the first loop
	 * by openmp. Dimensions do not need to be a multiple of the
	 * tile size (32): tiles on the border are simply smaller.
	 * @param c matrix that results from c = a * b
	 * @param a first operand
	 * @param b second operand
	 * @throws std::logic_error if a.cols differs from b.rows
	 */
	friend void mul_impl4(Matrix<T>& c, const Matrix<T>& a, const Matrix<T>& b) {
		Matrix<T>::require_product_dims("mul_impl4", a, b);

		if (Matrix<T>::shares_storage(c, a, b)) {
			// see mul_impl1: never overwrite an operand in place
			Matrix<T> result(a.rows, b.cols);
			mul_impl4(result, a, b);
			c = std::move(result);
			return;
		}

		c.reshape(a.rows, b.cols);
		// tiles are accumulated, so start from zero
		c.data = static_cast<T>(0);
		if ((c.data.size() == 0) || (a.cols == 0)) {
			return;
		}

		const u32 BLOCK_SIZE = 32;
		T* cd = &c.data[0];
		const T* ad = &a.data[0];
		const T* bd = &b.data[0];

		// each iteration of the outer loop writes a distinct band of
		// rows of c
		#pragma omp parallel for
		for (u32 i = 0; i < a.rows; i += BLOCK_SIZE) {
			const u32 i_end = std::min(i + BLOCK_SIZE, a.rows);
			for (u32 j = 0; j < b.cols; j += BLOCK_SIZE) {
				const u32 j_end = std::min(j + BLOCK_SIZE, b.cols);
				Matrix<T>::accumulate_tile(cd, ad, bd, i, i_end, j, j_end,
					a.cols, b.cols);
			}
		}
	}

	/**
	 * friend function to compute c = a * b using one of the
	 * libraries blas, gsl_cblas or MKL, selected by the macros
	 * CBLAS, GSL_CBLAS or MKL (the first one defined, in that order,
	 * is used). The library calls are only suitable for T=double.
	 * When none of the macros is defined the classic product of
	 * mul_impl1 is used instead.
	 * @param c matrix that results from c = a * b
	 * @param a first operand
	 * @param b second operand
	 * @throws std::logic_error if a.cols differs from b.rows
	 */
	friend void mul_impl5(Matrix<T>& c, const Matrix<T>& a, const Matrix<T>& b) {
		Matrix<T>::require_product_dims("mul_impl5", a, b);

		#if defined(CBLAS) || defined(GSL_CBLAS) || defined(MKL)
		if (Matrix<T>::shares_storage(c, a, b)) {
			// the library must not write into one of its inputs
			Matrix<T> result(a.rows, b.cols);
			mul_impl5(result, a, b);
			c = std::move(result);
			return;
		}

		c.reshape(a.rows, b.cols);
		if ((c.data.size() == 0) || (a.cols == 0)) {
			// no coefficient to hand to the library: with an empty
			// inner dimension the product is all zeros
			c.data = static_cast<T>(0);
			return;
		}

		#if defined(CBLAS)
		std::cout << "use cblas" << std::endl;
		{
			// row-major GEMM: M = rows of a, N = columns of b,
			// K = shared dimension, leading dimensions = row lengths
			const int m = static_cast<int>(a.rows);
			const int n = static_cast<int>(b.cols);
			const int k = static_cast<int>(a.cols);
			cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k,
				1.0, &a.data[0], k, &b.data[0], n, 0.0, &c.data[0], n);
		}
		#elif defined(GSL_CBLAS)
		std::cout << "use gsl_cblas" << std::endl;
		{
			gsl_matrix_const_view gA = gsl_matrix_const_view_array(&a.data[0], a.rows, a.cols);
			gsl_matrix_const_view gB = gsl_matrix_const_view_array(&b.data[0], b.rows, b.cols);
			gsl_matrix_view gC = gsl_matrix_view_array(&c.data[0], c.rows, c.cols);

			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans,
				1.0, &gA.matrix, &gB.matrix, 0.0, &gC.matrix);
		}
		#else
		std::cout << "use MKL" << std::endl;
		{
			// same argument layout as the CBLAS call above
			const int m = static_cast<int>(a.rows);
			const int n = static_cast<int>(b.cols);
			const int k = static_cast<int>(a.cols);
			cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k,
				1.0, &a.data[0], k, &b.data[0], n, 0.0, &c.data[0], n);
		}
		#endif
		#else
		// no BLAS library selected: still produce the product
		mul_impl1(c, a, b);
		#endif
	}

	/**
	 * redefinition of operator == to compare two matrices
	 * coefficient by coefficient
	 * @param lhs first operand
	 * @param rhs second operand
	 * @return return true if matrices have the same dimensions and the
	 * same values, otherwise return false. Two matrices without
	 * coefficients and with the same dimensions are equal.
	 */
	friend bool operator==(const Matrix<T>& lhs, const Matrix<T>& rhs) {
		if ((lhs.rows != rhs.rows) || (lhs.cols != rhs.cols)) return false;
		if (lhs.data.size() != rhs.data.size()) return false;

		const std::size_t count = lhs.data.size();
		for (std::size_t i = 0; i < count; ++i) {
			if (!(lhs.data[i] == rhs.data[i])) return false;
		}
		return true;
	}

	/**
	 * redefinition of operator * for two matrices, based on mul_impl5
	 * @param b second operand
	 * @return the product (*this) * b
	 * @throws std::logic_error if cols differs from b.rows
	 */
	Matrix<T> operator*(const Matrix<T>& b) const {
		Matrix<T> tmp(rows, b.cols);
		mul_impl5(tmp, *this, b);
		return tmp;
	}

	/**
	 * redefinition of operator + for two matrices, based on sum_impl2
	 * @param b second operand
	 * @return the sum (*this) + b
	 * @throws std::logic_error if dimensions differ
	 */
	Matrix<T> operator+(const Matrix<T>& b) const {
		Matrix<T> tmp(rows, cols);
		sum_impl2(tmp, *this, b);
		return tmp;
	}

};

} // end of namespace

#endif

