#include "parsimony_cuda.h"

// Sequence store defined in another translation unit; cuda_parsimony_init()
// copies each taxon's residues from it to the device.
extern StaticSequences sseq;

/*
 * Checks the status returned by a CUDA runtime call.
 *
 * The argument is evaluated exactly once (the status is stored in a local),
 * so a failing call is never re-executed while building the error message.
 * On any status other than cudaSuccess the error string, file and line are
 * written to cerr and the process exits with status 1.
 */
#define CUDA_ASSERT( condition ) { \
		cudaError_t cuda_assert_status_ = (condition); \
		if( cuda_assert_status_ != cudaSuccess ) { \
			cerr <<  "\n FAILURE " <<  cudaGetErrorString(cuda_assert_status_) \
			<< " in " << __FILE__ << " at line " <<  __LINE__ ; \
			exit( 1 ); \
		} \
}

// Tree node table in constant memory; the *_score_on_gpu() functions upload
// the nodes of the tree being scored here before launching a kernel.
__device__ __constant__ StaticTree::Node gpu_nodes[Parameters::MAX_STATIC_TREE_NODES];

// Device buffer of (2 * nbr_taxa - 1) rows, each gpu_taxa_length_aligned
// residues wide; rows 0..nbr_taxa-1 receive the input sequences at init.
ResidueType *gpu_data;
// Host copy of the per-column mutation counts (allocated with new[] at init).
int *cpu_mutations;
// Device buffer of per-column mutation counts written by the kernels.
int *gpu_mutations;
// Row pitch of gpu_data in elements: taxa length rounded up to a multiple
// of GPU_MEMORY_ALIGNMENT.
int gpu_taxa_length_aligned;
// Number of taxa and residues per taxon, read from Parameters at init.
int NBR_TAXA = 0;
int TAXA_LENGTH = 0;

// Launch configuration; only the x dimensions are set at init.
dim3 block(1,1,1);
dim3 grid(1,1,1);

// Stream created at init and used by iterative_score_on_gpu().
cudaStream_t myStream;

/*
 * Allocates and fills all GPU-side state used by the scoring functions.
 *
 * Reads nbr_taxa, taxa_length and threads_per_block from Parameters, then:
 *  - allocates gpu_data as (2 * nbr_taxa - 1) rows of aligned length and
 *    zero-fills it so padding columns and internal rows are defined,
 *  - uploads every input sequence from sseq into the first nbr_taxa rows,
 *  - derives the 1D launch configuration (block.x, grid.x),
 *  - allocates the host and device mutation buffers with one int per
 *    launched thread,
 *  - creates myStream.
 *
 * Any CUDA failure, or a non-positive taxa count / taxa length /
 * threads-per-block value, terminates the process with status 1.
 */
void cuda_parsimony_init() {
	Parameters& params = Parameters::get_instance();
	NBR_TAXA = params.nbr_taxa;
	TAXA_LENGTH = params.taxa_length;

	// A zero block size would divide by zero below and a non-positive
	// taxa count would yield a negative allocation size.
	if (NBR_TAXA < 1 || TAXA_LENGTH < 1 || params.threads_per_block < 1) {
		cerr << "cuda_parsimony_init: nbr_taxa, taxa_length and "
			 << "threads_per_block must all be positive" << endl;
		exit(1);
	}

	// Round the row length up to the next multiple of the alignment.
	const int tla = ((TAXA_LENGTH + GPU_MEMORY_ALIGNMENT - 1) / GPU_MEMORY_ALIGNMENT)
							* GPU_MEMORY_ALIGNMENT;
	// Computed in size_t so large data sets do not overflow int.
	const size_t full_size = static_cast<size_t>(NBR_TAXA * 2 - 1)
							* tla * sizeof(ResidueType);

	gpu_taxa_length_aligned = tla;

	cout << endl;
	cout << "=== cuda parsimony init ===" << endl;
	cout << "gpu taxa_length_aligned=" << tla << endl;
	cout << "gpu full_size=" << full_size << endl;

	CUDA_ASSERT(cudaMalloc( (void **) &gpu_data, full_size));
	CUDA_ASSERT(cudaMemset(gpu_data, 0, full_size));

	for (int i=0; i<NBR_TAXA; ++i) {
		CUDA_ASSERT(cudaMemcpy(gpu_data + static_cast<size_t>(i) * tla,
				sseq.get_sequence(i),
				TAXA_LENGTH * sizeof(ResidueType), cudaMemcpyHostToDevice));
	}

	// Never launch more threads per block than there are columns.
	int tpb = params.threads_per_block;
	if (TAXA_LENGTH <= tpb) {
		tpb = TAXA_LENGTH;
	}
	block.x = tpb;
	grid.x = ((TAXA_LENGTH + tpb - 1)/tpb) ;
	cout << "threads per block=" << block.x << endl;
	cout << "blocks per grid=" << grid.x << endl;

	// One slot per launched thread, since kernels index mutations by
	// global thread id.
	const size_t size = static_cast<size_t>(tpb) * grid.x;
	cpu_mutations = new int[size];
	CUDA_ASSERT(cudaMalloc( (void **) &gpu_mutations, sizeof(int) * size));
	CUDA_ASSERT(cudaMemset(gpu_mutations, 0, sizeof(int) * size));

	//cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferL1);
	CUDA_ASSERT(cudaStreamCreate(&myStream));
}

/*
 * Releases the buffers and the stream created by cuda_parsimony_init().
 *
 * The global pointers are reset to null after being released so that no
 * dangling pointer survives; cudaFree() and delete[] both accept null, so a
 * repeated call does not free the same memory twice.
 */
void cuda_parsimony_clear() {
	cudaFree(gpu_data);
	gpu_data = 0;

	delete [] cpu_mutations;
	cpu_mutations = 0;

	cudaFree(gpu_mutations);
	gpu_mutations = 0;

	cudaStreamDestroy(myStream);
}

/**
 * Scores one pair of sequences: combines row 0 and row 1 of data into row 2.
 *
 * One thread per column. For column tid the two residues are ANDed; if the
 * result is 0 the column counts one mutation and the OR of the two residues
 * is stored instead.
 *
 * K         number of valid columns
 * tla       row pitch of data, in elements
 * data      device buffer holding at least 3 rows of tla elements
 * mutations device buffer with at least gridDim.x * blockDim.x ints
 *
 * Threads with tid >= K only zero their mutations slot; they never touch
 * data, so a launch wider than a row cannot spill into the next row or past
 * the end of the buffer.
 */
__global__ void simple_method(int K, const int tla, 
		ResidueType *data, int *mutations) {
	const int tid = blockDim.x * blockIdx.x + threadIdx.x;

	if (tid >= K) {
		mutations[tid] = 0;
		return;
	}

	int res = 0;
	const ResidueType x_i = data[tid];
	const ResidueType y_i = data[1 * tla + tid];
	ResidueType z_i = x_i & y_i;
	if (z_i == 0) {
		res = 1;
		z_i = x_i | y_i;
	}
	data[2 * tla + tid] = z_i;

	mutations[tid] = res;
}

/*
 * Runs simple_method on the default stream and returns the sum of the
 * per-column mutation counts over the first TAXA_LENGTH columns.
 *
 * A launch error is reported on cerr. The device-to-host copy is a blocking
 * cudaMemcpy so the host buffer is guaranteed to be complete before it is
 * summed; a failing copy terminates the process through CUDA_ASSERT.
 */
int score_simple_method() {

	simple_method<<<grid,block>>>(TAXA_LENGTH, 
			gpu_taxa_length_aligned,
			gpu_data, gpu_mutations);

	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		cerr << "kernel error: " << cudaGetErrorString(err) << endl;
	}

	CUDA_ASSERT(
			cudaMemcpy(cpu_mutations, gpu_mutations,
					sizeof(int) * TAXA_LENGTH,
					cudaMemcpyDeviceToHost)
	);

	return accumulate(cpu_mutations, cpu_mutations + TAXA_LENGTH, 0);
}

/**
 * Scores the tree stored in gpu_nodes, one thread per column.
 *
 * N         number of leaves/taxa; internal nodes are visited starting at
 *           index N in increasing order
 * K         length of taxa (number of valid columns)
 * tla       row pitch of data, in elements
 * data      device buffer of (2 * N - 1) rows; row i holds node i
 * mutations device buffer with at least gridDim.x * blockDim.x ints
 *
 * For every visited node the residues of its children l and r are ANDed;
 * an empty intersection counts one mutation and stores the OR instead.
 * The walk stops at the first node whose p field is 0 (presumably the end
 * marker of the node table -- confirm against StaticTree) or after node
 * 2 * N - 2, the last row that exists in data.
 *
 * Threads with tid >= K only zero their mutations slot and never touch
 * data, so a launch wider than a row cannot corrupt neighbouring rows.
 */
__global__ void iterative_kernel(const int N, const int K, const int tla, 
		ResidueType *data, int *mutations) {

	const int tid = blockDim.x * blockIdx.x + threadIdx.x;

	if (tid >= K) {
		mutations[tid] = 0;
		return;
	}

	// data only has rows 0 .. 2N-2; never walk past them.
	const int node_end = 2 * N - 1;
	int res = 0;

	for (int node_id = N;
			node_id < node_end && gpu_nodes[node_id].p != 0;
			++node_id) {
		const int l = gpu_nodes[node_id].l;
		const int r = gpu_nodes[node_id].r;
		const ResidueType x_i = data[l * tla + tid];
		const ResidueType y_i = data[r * tla + tid];

		ResidueType z_i = x_i & y_i;
		if (z_i == 0) {
			++res;
			z_i = x_i | y_i;
		}
		data[node_id * tla + tid] = z_i;
	}

	mutations[tid] = res;
}

/*
 * Scores tree t with iterative_kernel on myStream.
 *
 * Uploads t.nodes into the gpu_nodes constant table, launches the kernel,
 * copies the per-column mutation counts back and returns their sum over the
 * first TAXA_LENGTH columns.
 *
 * A launch error is reported on cerr. The final synchronization is checked
 * so that faults raised while the kernel or the copies execute terminate
 * the process through CUDA_ASSERT instead of yielding a garbage score.
 */
int iterative_score_on_gpu(StaticTree& t) {
	CUDA_ASSERT(
			cudaMemcpyToSymbolAsync(gpu_nodes, t.nodes,
					Parameters::MAX_STATIC_TREE_NODES * sizeof(StaticTree::Node),
					0, cudaMemcpyHostToDevice, myStream)
	);

	iterative_kernel<<<grid, block, 0, myStream>>>(NBR_TAXA, TAXA_LENGTH,
			gpu_taxa_length_aligned,
			gpu_data, gpu_mutations);

	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		cerr << "kernel error: " << cudaGetErrorString(err) << endl;
	}

	CUDA_ASSERT(
			cudaMemcpyAsync(cpu_mutations, gpu_mutations,
					sizeof(int) * TAXA_LENGTH,
					cudaMemcpyDeviceToHost, myStream)
	);
	// The host buffer is only valid once the stream has drained.
	CUDA_ASSERT(cudaDeviceSynchronize());

	return accumulate(cpu_mutations, cpu_mutations + TAXA_LENGTH, 0);
}



/*
 * Post-order evaluation of the subtree rooted at node k for column tid.
 *
 * A node whose l and r fields are both -1 contributes 0. Otherwise both
 * children are evaluated first (left, then right), their residues are
 * combined into row k of data, and the number of empty intersections found
 * in the whole subtree is returned.
 */
__device__ int recursive_device_call(int k, int tid, const int tla, 
		ResidueType *data) {
	const int left = gpu_nodes[k].l;
	const int right = gpu_nodes[k].r;

	// Leaf: nothing to combine.
	if (left == -1 && right == -1) {
		return 0;
	}

	const int left_cost = recursive_device_call(left, tid, tla, data);
	const int right_cost = recursive_device_call(right, tid, tla, data);

	const ResidueType a = data[left * tla + tid];
	const ResidueType b = data[right * tla + tid];

	int own_cost = 0;
	ResidueType merged = a & b;
	if (merged == 0) {
		// Empty intersection: fall back to the union and count a mutation.
		merged = a | b;
		own_cost = 1;
	}
	data[k * tla + tid] = merged;

	return own_cost + left_cost + right_cost;
}


/**
 * Scores the tree stored in gpu_nodes recursively, one thread per column.
 *
 * N         number of leaves/taxa; the walk starts at node 2 * N - 2
 * K         length of taxa (number of valid columns)
 * tla       row pitch of data, in elements
 * data      device buffer indexed as data[node * tla + column]
 * mutations device buffer with at least gridDim.x * blockDim.x ints
 *
 * Threads with tid >= K only zero their mutations slot; they skip the
 * recursive walk entirely so they never read or write data outside the
 * valid columns.
 */
__global__ void recursive_kernel(const int N, const int K, const int tla, 
		ResidueType *data, int *mutations) {

	const int tid = blockDim.x * blockIdx.x + threadIdx.x;

	if (tid >= K) {
		mutations[tid] = 0;
		return;
	}

	mutations[tid] = recursive_device_call(2*N-2, tid, tla, data);
}

/*
 * Scores tree t with recursive_kernel on the default stream.
 *
 * Uploads t.nodes into the gpu_nodes constant table, launches the kernel
 * over the full grid so every one of the TAXA_LENGTH columns is scored,
 * copies the per-column mutation counts back and returns their sum.
 *
 * The constant table is addressed through the symbol itself; symbol lookup
 * by string name is rejected by current CUDA runtimes. A launch error is
 * reported on cerr; failing copies terminate the process via CUDA_ASSERT.
 */
int recursive_score_on_gpu(StaticTree& t) {

	CUDA_ASSERT(
			cudaMemcpyToSymbol(gpu_nodes, t.nodes,
					Parameters::MAX_STATIC_TREE_NODES * sizeof(StaticTree::Node),
					0, cudaMemcpyHostToDevice)
	);

	// grid x block covers all columns; a single block would leave every
	// column past block.x unscored while still being summed below.
	recursive_kernel<<<grid,block>>>(NBR_TAXA, TAXA_LENGTH, 
			gpu_taxa_length_aligned,
			gpu_data, gpu_mutations);

	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		cerr << "kernel error: " << cudaGetErrorString(err) << endl;
	}

	// Blocking copy: the host buffer is complete when this returns.
	CUDA_ASSERT(
			cudaMemcpy(cpu_mutations, gpu_mutations,
					sizeof(int) * TAXA_LENGTH,
					cudaMemcpyDeviceToHost)
	);

	return accumulate(cpu_mutations, cpu_mutations + TAXA_LENGTH, 0);
}


/**
 * Scores the tree stored in gpu_nodes; each thread handles columns
 * tid, tid + S, tid + 2S, ... where tid is its global thread index and
 * S = gridDim.x * blockDim.x is the total number of launched threads.
 *
 * N         number of leaves/taxa; internal nodes are visited starting at
 *           index N in increasing order
 * K         length of taxa (number of valid columns)
 * tla       row pitch of data, in elements
 * data      device buffer of (2 * N - 1) rows; row i holds node i
 * mutations device buffer with at least K ints
 *
 * Every column in [0, K) is processed by exactly one thread for any launch
 * configuration, so blocks never duplicate each other's work or write the
 * same element concurrently. The node walk stops at the first node whose p
 * field is 0 or after node 2 * N - 2, the last row that exists in data.
 */
__global__ void iterative_stride_kernel(const int N, const int K, const int tla, 
		ResidueType *data, int *mutations) {

	const int stride = gridDim.x * blockDim.x;
	const int node_end = 2 * N - 1;

	for (int tid = blockDim.x * blockIdx.x + threadIdx.x; tid < K; tid += stride) {
		int res = 0;
		for (int node_id = N;
				node_id < node_end && gpu_nodes[node_id].p != 0;
				++node_id) {
			const int l = gpu_nodes[node_id].l;
			const int r = gpu_nodes[node_id].r;
			const ResidueType x_i = data[l * tla + tid];
			const ResidueType y_i = data[r * tla + tid];
			ResidueType z_i = x_i & y_i;
			if (z_i == 0) {
				++res;
				z_i = x_i | y_i;
			}
			data[node_id * tla + tid] = z_i;
		}

		mutations[tid] = res;
	}
}

/*
 * Scores tree t with iterative_stride_kernel on the default stream.
 *
 * Uploads t.nodes into the gpu_nodes constant table, launches the kernel,
 * copies the per-column mutation counts back and returns their sum over the
 * first TAXA_LENGTH columns.
 *
 * The constant table is addressed through the symbol itself; symbol lookup
 * by string name is rejected by current CUDA runtimes. A launch error is
 * reported on cerr; failing copies terminate the process via CUDA_ASSERT.
 */
int iterative_stride_score_on_gpu(StaticTree& t) {

	CUDA_ASSERT(
			cudaMemcpyToSymbol(gpu_nodes, t.nodes,
					Parameters::MAX_STATIC_TREE_NODES * sizeof(StaticTree::Node),
					0, cudaMemcpyHostToDevice)
	);

	iterative_stride_kernel<<<grid,block>>>(NBR_TAXA, TAXA_LENGTH, 
			gpu_taxa_length_aligned,
			gpu_data, gpu_mutations);

	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		cerr << "kernel error: " << cudaGetErrorString(err) << endl;
	}

	// Blocking copy: the host buffer is complete when this returns.
	CUDA_ASSERT(
			cudaMemcpy(cpu_mutations, gpu_mutations,
					sizeof(int) * TAXA_LENGTH,
					cudaMemcpyDeviceToHost)
	);

	return accumulate(cpu_mutations, cpu_mutations + TAXA_LENGTH, 0);
}
