/*
 * main.cpp
 *
 *  Created on: Jan 28, 2014
 *      Author: richer
 */

#include <vector>
#include <iostream>
#include <cassert>
#include <typeinfo>
using namespace std;
#include <getopt.h>
#include <unistd.h>
#include <cassert>
#include <cstdlib>
#include "cpu_chrono.h"
#include "parameters.h"
#include "parsimony.h"
#include "sequence.h"
#include "static_sequences.h"
#include "dynamic_tree.h"
#include "static_tree.h"
#include "gpu_chrono.h"
#include "cuda_part.h"
#include "parsimony_cuda.h"

/**
 * Long command-line options passed to getopt_long() in main().
 * Every entry maps a long option name to the short option character
 * that main() switches on; the table ends with an all-zero sentinel.
 */
static struct option long_options[] = {
	// test selection, seed and scoring method
	{ "test",                required_argument, NULL, 't' },
	{ "seed",                required_argument, NULL, 's' },
	{ "method",              required_argument, NULL, 'm' },
	// taxa dimensions
	{ "nbr-taxa",            required_argument, NULL, 'n' },
	{ "taxa-length",         required_argument, NULL, 'k' },
	// tree and taxa setup
	{ "tree-implementation", required_argument, NULL, 'i' },
	{ "tree-topology",       required_argument, NULL, 'y' },
	{ "taxa-initialization", required_argument, NULL, 'z' },
	// run configuration
	{ "threads-per-block",   required_argument, NULL, 'b' },
	{ "max-iterations",      required_argument, NULL, 'c' },
	// GPU detection / selection
	{ "gpu-detect",          no_argument,       NULL, 'g' },
	{ "gpu-select",          required_argument, NULL, 'a' },
	// print the name of a method and exit
	{ "get-method-name",     required_argument, NULL, 'x' },
	// sentinel
	{ NULL, 0, NULL, 0 }
};

// Global sequence container: filled by main() through setup() and read
// by test_all_methods() through get_sequence().
StaticSequences sseq;

/**
 * test all method: CPU and GPU
 */
void test_all_methods() {
	int score = 0;

	cout << "===========================" << endl;
	cout << "=== test for all method ===" << endl;
	cout << "===========================" << endl;

	if (cuda_init() == false) {
		cout << "warning: no CUDA device found\n";
		return ;
	}

	cuda_parsimony_init();

	cout << endl;
	cout << "methods" << endl;
	cout << "--------------------" << endl;
	for (int i=1; i<Parsimony::get_nbr_methods(); ++i) {
		cout << "method:" << ":" << Parsimony::get_method_name(i);
		score = Parsimony::execute(i, sseq.get_sequence(0),
				sseq.get_sequence(1), sseq.get_sequence(2),
				Parameters::get_instance().taxa_length);
		cout << ":" << score << endl;
	}

	score = score_simple_method();
	cout << "method:cuda:" << score << endl;

}

/**
 * Time the CPU scoring of every tree in `trees`, repeated
 * Parameters::max_iterations times, and print the method name, the
 * elapsed CPU time, the number of iterations, the number of scored
 * trees and the last score obtained (-1 when nothing was scored).
 *
 * @param trees trees to score; each one is expected to be a DynamicTree
 *              when the configured tree implementation is dynamic and a
 *              StaticTree otherwise
 */
void test_for_cpu_method(vector<Tree *>& trees) {
	int score=-1;
	const int impl = Parameters::get_instance().tree_implementation;
	const int max_iterations = Parameters::get_instance().max_iterations;

	CPUChrono c;
	c.start();

	cout << "=======================" << endl;
	cout << "=== test for method ===" << endl;
	cout << "=======================" << endl;
	cout << "method_name=" << Parsimony::get_method_name(Parameters::get_instance().method) << endl;
	int l=0;
	int total=0;
	for (l=0; l<max_iterations; ++l) {
		vector<Tree *>::iterator i_trees;
		for (i_trees = trees.begin(); i_trees != trees.end(); ++ i_trees) {

			++total;
			// static_cast (not reinterpret_cast) is the correct downcast from
			// the Tree base: it adjusts the pointer when the base sub-object
			// is not located at offset 0
			if (impl == Parameters::TREE_IMPLEMENTATION_DYNAMIC) {
				score = Parsimony::score( *static_cast<DynamicTree *>(*i_trees));
			} else {
				score = Parsimony::score( *static_cast<StaticTree *>(*i_trees));
			}
		}
	}
	c.stop();
	cout << "cpu elapsed=" << c.elapsed() << endl;
	cout << "iterations=" << l << endl;
	cout << "total=" << total << endl;
	cout << "score=" << score << endl;
}


/**
 * Time iterative_score_on_gpu() on every tree in `trees`, repeated
 * Parameters::max_iterations times, and print the CPU and GPU elapsed
 * times, the number of iterations, the number of scored trees and the
 * last score obtained (-1 when nothing was scored).
 *
 * Exits with EXIT_FAILURE when the configured tree implementation is
 * dynamic or when cuda_init() reports no device.
 *
 * @param trees trees to score; they are treated as StaticTree objects
 */
void test_for_iterative_score_on_gpu(vector<Tree *>& trees) {
	int score=-1;
	int impl = Parameters::get_instance().tree_implementation;

	if (impl == Parameters::TREE_IMPLEMENTATION_DYNAMIC) {
		cerr << "error: tree implementation must be static for GPU" << endl;
		exit(EXIT_FAILURE);
	}

	if (cuda_init() == false) {
		cerr << "error: no CUDA device found\n";
		exit(EXIT_FAILURE);
	}
	cuda_parsimony_init();

	CPUChrono c;
	GPUChrono g;

	c.start();
	g.start();
	cout << "========================================" << endl;
	cout << "=== test for iterative method on GPU ===" << endl;
	cout << "========================================" << endl;
	int l=0;
	int total=0;
	for (l=0; l<Parameters::get_instance().max_iterations; ++l) {
		vector<Tree *>::iterator i_trees;
		for (i_trees = trees.begin(); i_trees != trees.end(); ++ i_trees) {
			++total;
			// static_cast is the correct downcast from the Tree base class
			score = iterative_score_on_gpu( *static_cast<StaticTree *>(*i_trees));
		}
	}
	c.stop();
	g.stop();

	cout << "cpu elapsed=" << c.elapsed() << endl;
	cout << "gpu elapsed=" << g.elapsed() << endl;
	cout << "iterations=" << l << endl;
	cout << "total=" << total << endl;
	// NOTE(review): the score goes to cerr while the other results go to
	// cout -- confirm this is intended
	cerr << "score=" << score << endl;

	cuda_parsimony_clear();
}

/**
 * Time recursive_score_on_gpu() on every tree in `trees`, repeated
 * Parameters::max_iterations times, and print the CPU and GPU elapsed
 * times, the number of iterations, the number of scored trees and the
 * last score obtained (-1 when nothing was scored).
 *
 * Exits with EXIT_FAILURE when the configured tree implementation is
 * dynamic or when cuda_init() reports no device.
 *
 * @param trees trees to score; they are treated as StaticTree objects
 */
void test_for_recursive_score_on_gpu(vector<Tree *>& trees) {
	int score=-1;
	int impl = Parameters::get_instance().tree_implementation;

	if (impl == Parameters::TREE_IMPLEMENTATION_DYNAMIC) {
		cerr << "error: tree implementation must be static for GPU" << endl;
		exit(EXIT_FAILURE);
	}

	if (cuda_init() == false) {
		cerr << "error: no CUDA device found\n";
		exit(EXIT_FAILURE);
	}
	cuda_parsimony_init();

	CPUChrono c;
	GPUChrono g;

	c.start();
	g.start();

	cout << "========================================" << endl;
	cout << "=== test for recursive method on GPU ===" << endl;
	cout << "========================================" << endl;
	int l=0;
	int total=0;
	for (l=0; l<Parameters::get_instance().max_iterations; ++l) {
		vector<Tree *>::iterator i_trees;
		for (i_trees = trees.begin(); i_trees != trees.end(); ++ i_trees) {
			++total;
			// static_cast is the correct downcast from the Tree base class
			score = recursive_score_on_gpu( *static_cast<StaticTree *>(*i_trees));
		}
	}
	c.stop();
	g.stop();

	cout << "cpu elapsed=" << c.elapsed() << endl;
	cout << "gpu elapsed=" << g.elapsed() << endl;
	cout << "iterations=" << l << endl;
	cout << "total=" << total << endl;
	// NOTE(review): the score goes to cerr while the other results go to
	// cout -- confirm this is intended
	cerr << "score=" << score << endl;

	cuda_parsimony_clear();
}

/**
 * Time iterative_stride_score_on_gpu() on every tree in `trees`,
 * repeated Parameters::max_iterations times, and print the CPU and GPU
 * elapsed times, the number of iterations, the number of scored trees
 * and the last score obtained (-1 when nothing was scored).
 *
 * Exits with EXIT_FAILURE when the configured tree implementation is
 * dynamic or when cuda_init() reports no device.
 *
 * @param trees trees to score; they are treated as StaticTree objects
 */
void test_for_iterative_stride_score_on_gpu(vector<Tree *>& trees) {
	int score=-1;
	int impl = Parameters::get_instance().tree_implementation;

	if (impl == Parameters::TREE_IMPLEMENTATION_DYNAMIC) {
		cerr << "error: tree implementation must be static for GPU" << endl;
		exit(EXIT_FAILURE);
	}

	if (cuda_init() == false) {
		cerr << "error: no CUDA device found\n";
		exit(EXIT_FAILURE);
	}
	cuda_parsimony_init();

	CPUChrono c;
	GPUChrono g;

	c.start();
	g.start();

	cout << "====================================================" << endl;
	cout << "=== test for iterative with stride method on GPU ===" << endl;
	cout << "====================================================" << endl;
	int l=0;
	int total=0;
	for (l=0; l<Parameters::get_instance().max_iterations; ++l) {
		vector<Tree *>::iterator i_trees;
		for (i_trees = trees.begin(); i_trees != trees.end(); ++ i_trees) {
			++total;
			// static_cast is the correct downcast from the Tree base class
			score = iterative_stride_score_on_gpu( *static_cast<StaticTree *>(*i_trees));
		}
	}
	c.stop();
	g.stop();

	cout << "cpu elapsed=" << c.elapsed() << endl;
	cout << "gpu elapsed=" << g.elapsed() << endl;
	cout << "iterations=" << l << endl;
	cout << "total=" << total << endl;
	cout << "score=" << score << endl;

	cuda_parsimony_clear();
}

void test_dynamic_tree() {
	Parameters::get_instance().nbr_taxa = 8;
	DynamicTree factory;
	Tree *t1 = factory.generate_balanced();
	Tree *t2 = factory.generate_comb();
	//Tree *t3 = factory.generate_random();
	Tree *t3 = factory.generate_comb();
	t1->to_newick(cerr);
	cerr << endl;
	t2->to_newick(cerr);
	cerr << endl;
	t3->to_newick(cerr);
	cerr << endl;
	//cerr << t1->get_depth() << " " << t2->get_depth() << endl;
	assert(t1->get_depth() == 4);
	assert(t2->get_depth() == 8);
	//exit(1);
	int expected = 2 * Parameters::get_instance().nbr_taxa - 1;
	assert(t1->count_nodes() == expected);
	assert(t2->count_nodes() == expected);
	assert(t3->count_nodes() == expected);
	delete t1;
	delete t2;
	delete t3;
}

void test_static_tree() {
	Parameters::get_instance().nbr_taxa = 8;
	StaticTree factory;
	Tree *t1 = factory.generate_balanced();
	Tree *t2 = factory.generate_comb();
	Tree *t3 = factory.generate_random();
	t1->to_newick(cerr);
	cerr << endl;
	t2->to_newick(cerr);
	cerr << endl;
	t3->to_newick(cerr);
	cerr << endl;
	//cerr << t1->get_depth() << " " << t2->get_depth() << endl;
	assert(t1->get_depth() == 4);
	assert(t2->get_depth() == 8);
	//exit(1);
	int expected = 2 * Parameters::get_instance().nbr_taxa - 1;
	assert(t1->count_nodes() == expected);
	assert(t2->count_nodes() == expected);
	assert(t3->count_nodes() == expected);
	delete t1;
	delete t2;
	delete t3;
}

void test_dynamic_trees_management() {
	CPUChrono c;
	Parameters& params = Parameters::get_instance();
	DynamicTree factory;

	cout << "run dynamic trees management test" << endl;

	c.start();
	Tree **tab = new Tree * [ params.nbr_trees ];
	for (int iter = 0; iter < params.max_iterations; ++ iter) {
		for (int i = 0; i < params.nbr_trees; ++i) {
			tab[i] = factory.generate_random();
		}
		for (int i = 1; i < params.nbr_trees; ++i) {
			(*tab[i]) = (*tab[0]);
		}
		for (int i = 0; i < params.nbr_trees; ++i) {
			delete tab[i];
		}
	}
	c.stop();
	cout << "elapsed=" << c.elapsed() << endl;
}

void test_static_trees_management() {
	CPUChrono c;
	Parameters& params = Parameters::get_instance();
	StaticTree factory;

	cout << "run static trees management test" << endl;

	c.start();
	Tree **tab = new Tree * [ params.nbr_trees ];
	for (int iter = 0; iter < params.max_iterations; ++ iter) {
		for (int i = 0; i < params.nbr_trees; ++i) {
			tab[i] = factory.generate_random();
		}
		for (int i = 1; i < params.nbr_trees; ++i) {
			(*tab[i]) = (*tab[0]);
		}
		for (int i = 0; i < params.nbr_trees; ++i) {
			delete tab[i];
		}
	}
	c.stop();
	cout << "elapsed=" << c.elapsed() << endl;
}



// ====================================================
// main
// ====================================================
/**
 * Program entry point.
 *
 * Steps:
 *  1. parse the command line into the Parameters singleton
 *     (-g only probes for a CUDA device and exits, -x prints the name
 *     of a method and exits);
 *  2. seed rand() with the given seed, or with the process id when the
 *     seed is <= 0;
 *  3. adjust the parameters for the selected test:
 *       1        all methods (2 taxa of length 1025, constant init)
 *       2        CPU method
 *       3, 4, 5  GPU iterative / recursive / iterative with stride
 *                (static tree implementation is forced)
 *       -2, -3   dynamic / static tree management test, run right away;
 *                execution then continues with the steps below but no
 *                further test is started
 *     any other value is an error;
 *  4. print the parameters, generate the sequences and the trees;
 *  5. run the selected test and delete the trees.
 *
 * @return 0 on normal completion; the process exits with EXIT_FAILURE
 *         on the configuration errors detected here
 */
int main(int argc, char *argv[]) {

	Parameters& params = Parameters::get_instance();

	int option_index;

	while (1) {
		option_index = 0;
		int c = getopt_long (argc, argv, "t:s:m:n:k:i:y:z:b:c:gx:a:", long_options, &option_index);
		if (c == -1) break;

		switch(c) {
		case 't':
			params.test = atoi(optarg);
			break;
		case 's':
			params.seed = atoi(optarg);
			break;
		case 'm':
			params.method = atoi(optarg);
			break;
		case 'n':
			params.nbr_taxa = atoi(optarg);
			break;
		case 'k':
			params.taxa_length = atoi(optarg);
			break;
		case 'i':
			params.tree_implementation = atoi(optarg);
			break;
		case 'y':
			params.tree_topology = atoi(optarg);
			break;
		case 'z':
			params.taxa_initialization = atoi(optarg);
			break;
		case 'b':
			params.threads_per_block = atoi(optarg);
			break;
		case 'c':
			params.max_iterations = atoi(optarg);
			break;
		case 'g':
			// detection only: report through the exit status and stop
			if (cuda_init() == false) {
				cerr << "error: no CUDA device found\n";
				exit(EXIT_FAILURE);
			}
			exit(EXIT_SUCCESS);
			break;
		case 'x':
			params.method = atoi(optarg);
			// reject negative indices as well as indices past the last method
			if (params.method < 0 || params.method >= Parsimony::get_nbr_methods()) {
				cerr << "error: unknown method index" << endl;
				exit(EXIT_FAILURE);
			}
			cerr << "method_name=" << Parsimony::get_method_name(params.method) << endl;
			exit(EXIT_SUCCESS);
			break;
		case 'a':
			params.gpu_selected = atoi(optarg);
			break;
		}
	}

	if (params.seed <= 0) {
		srand(getpid());
	} else {
		srand(params.seed);
	}

	//test_dynamic_tree();
	//test_static_tree();

	switch(params.test) {
	case 1:
		// NOTE(review): test_all_methods() reads sequences 0, 1 and 2 while
		// only 2 taxa are requested here -- confirm sseq provides index 2
		params.nbr_taxa = 2;
		params.taxa_length = 1025;
		params.taxa_initialization = Parameters::TAXA_INITIALIZATION_CONSTANT;
		break;
	case 2:
		break;

	case 3: // GPU
	case 4:
	case 5:
		params.tree_implementation = Parameters::TREE_IMPLEMENTATION_STATIC;
		break;

	case -2: // test dynamic tree generation copy and deletion
		params.nbr_trees = 1000;
		params.nbr_taxa = 512;
		params.max_iterations = 100;
		test_dynamic_trees_management();
		break;

	case -3: // test static tree generation copy and deletion
		params.nbr_trees = 1000;
		params.nbr_taxa = 512;
		params.max_iterations = 100;
		test_static_trees_management();
		break;

	default:
		cerr << "error: test " << params.test << " not defined" << endl;
		exit(EXIT_FAILURE);
		break;
	}

	cout << params;

#ifdef CPU_ARCHITECTURE_32_BITS
	cout << "architecture=32 bits" << endl;
#endif
#ifdef CPU_ARCHITECTURE_64_BITS
	cout << "architecture=64 bits" << endl;
#endif

#ifdef CPU_DATA_SIZE_8_BITS
	cout << "data size=8 bits" << endl;
#endif
#ifdef CPU_DATA_SIZE_32_BITS
	cout << "data size=32 bits" << endl;
#endif


	// --------------------------------------------------
	// generate sequences
	// --------------------------------------------------
	// NOTE(review): the Sequence objects allocated below are never deleted
	// in this function -- confirm whether sseq.setup() takes ownership
	vector<Sequence *> sequences;
	int tab_init1[4] = { 1, 2, 4, 8 };
	int tab_init2[4] = { 8, 2, 4, 1 };
	for (int i=0; i<params.nbr_taxa; ++i) {
		Sequence *seq;
		if (params.taxa_initialization == Parameters::TAXA_INITIALIZATION_CONSTANT) {
			// constant initialization: even and odd taxa use a different pattern
			if ((i % 2)== 0) {
				seq = new Sequence(tab_init1, 4);
			} else {
				seq = new Sequence(tab_init2, 4);
			}
		} else {
			seq = new Sequence;
		}
		//cerr << "seq " << i << ": " << *seq << endl;
		sequences.push_back(seq);
	}

	sseq.setup(sequences);
	cout << sseq << endl;

	// --------------------------------------------------
	// generate trees
	// --------------------------------------------------
	vector<Tree *> trees;
	Tree *factory;
	if (params.tree_implementation == Parameters::TREE_IMPLEMENTATION_DYNAMIC) {
		factory = new DynamicTree;
	} else {
		factory = new StaticTree;
	}
	for (int i=0; i<params.nbr_trees; ++i) {
		Tree *t = 0;

		switch(params.tree_topology) {
		case Parameters::TREE_TOPOLOGY_RANDOM:
			t = factory->generate_random(); break;
		case Parameters::TREE_TOPOLOGY_COMB:
			t = factory->generate_comb(); break;
		case Parameters::TREE_TOPOLOGY_BALANCED:
			t = factory->generate_balanced(); break;
		default:
			// an unknown topology used to leave t uninitialized; the garbage
			// pointer was then stored, used by the tests and deleted
			cerr << "error: tree topology " << params.tree_topology << " not defined" << endl;
			exit(EXIT_FAILURE);
			break;
		}
		trees.push_back(t);
	}
	delete factory;

	// --------------------------------------------------
	// tests
	// --------------------------------------------------
	switch(params.test) {
	case 1:
		test_all_methods();
		break;

	case 2:
		test_for_cpu_method(trees);
		break;

	case 3:
		test_for_iterative_score_on_gpu(trees);
		break;

	case 4:
		test_for_recursive_score_on_gpu(trees);
		break;

	case 5:
		test_for_iterative_stride_score_on_gpu(trees);
		break;

	}

	// --------------------------------------------------
	// remove trees
	// --------------------------------------------------
	vector<Tree *>::iterator i_trees;
	for (i_trees = trees.begin(); i_trees != trees.end(); ++ i_trees) {
		delete (*i_trees);
	}

	return 0;
}
