#include <iostream>
#include <iomanip>
#include <cassert>
#include <cerrno>
#include <climits>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <stdexcept>
#include <string>
#include <vector>
#include <algorithm>
#include <numeric>

#include <getopt.h>
#include "timer.h"

using namespace std;

#include "ez_cuda.h"


// Verbosity of status messages: main() prints its progress line only when
// this is greater than 0. Set by -v (parsed value) or -q (forced to 0).
int verbose_level = 1;
// Number of elements in each of the vectors x, y and z. Set by -s and
// rejected by parse_arguments() when smaller than 1.
int vector_size = 100;


// Element-wise sum of two integer vectors: z[i] = x[i] + y[i].
//
// Each thread handles at most one element, selected by its global index
// (blockDim.x * blockIdx.x + threadIdx.x). Threads whose index is not
// below `size` do nothing, so the launch may cover more threads than
// there are elements.
//
//   x, y : input vectors, at least `size` elements each
//   z    : output vector, at least `size` elements
//   size : number of elements to process
__global__
void kernel_sum(int *x, int *y, int *z, int size) {

	const int element = blockDim.x * blockIdx.x + threadIdx.x;

	// surplus threads of the last block have no element to work on
	if (element >= size) {
		return;
	}

	const int lhs = x[ element ];
	const int rhs = y[ element ];
	z[ element ] = lhs + rhs;

}



// Print the command-line help on standard output and terminate the
// program with EXIT_SUCCESS. This function does not return.
//
//   argc : unused, kept so the signature mirrors main()
//   argv : argv[0] is printed as the program name
void usage(int argc, char *argv[])
{
    (void) argc;

    std::cout << std::endl;
    std::cout << argv[0] << " [options]" << std::endl;
    std::cout << std::endl;

    // long options take two dashes (the original text advertised "-size")
    std::cout << "\t--size=int or -s int" << std::endl;
    std::cout << "\t\tsize of vector" << std::endl;

    std::cout << "\t--verbose=int or -v int" << std::endl;
    std::cout << "\t\tverbose level: 0 is equivalent to quiet" << std::endl;

    std::cout << "\t--quiet or -q" << std::endl;
    std::cout << "\t\tno output" << std::endl;

    std::cout << "\t--help or -h" << std::endl;
    std::cout << "\t\tthis message" << std::endl;

    std::cout << std::endl << std::endl;

    exit(EXIT_SUCCESS);
}


// Convert the argument of option `name` to an int.
// Throws std::runtime_error unless `text` is a complete base-10 integer
// that fits in an int (atoi would silently return 0 or overflow).
static int parse_int_option(const char *name, const char *text) {

    errno = 0;
    char *end = nullptr;
    const long value = std::strtol(text, &end, 10);

    const bool no_digits = (end == text);
    const bool trailing_garbage = (*end != '\0');
    const bool out_of_range = (errno == ERANGE) || value < INT_MIN || value > INT_MAX;

    if (no_digits || trailing_garbage || out_of_range) {
        throw std::runtime_error(std::string("invalid integer value for ")
                                 + name + ": '" + text + "'");
    }

    return static_cast<int>(value);
}


// Parse the command line and store the results in the globals
// `verbose_level` and `vector_size`.
//
// Recognised options (long and short forms are equivalent):
//   --verbose=int / -v int   set verbose_level
//   --quiet       / -q       set verbose_level to 0
//   --size=int    / -s int   set vector_size
//   --help        / -h       print the help via usage(), which exits
//
// Throws std::runtime_error on an unknown option, on a non-numeric or
// out-of-range integer argument, or when vector_size ends up below 1.
void parse_arguments(int argc, char *argv[]) {

    // The last field is the value getopt_long returns for the long option.
    // It is set to the matching short option character so both spellings
    // reach the same case below (with 0 here every long option was
    // reported as "unknown option").
    static struct option long_options[] = {

        {"verbose", required_argument, 0, 'v'},
        {"quiet", no_argument, 0, 'q'},
        {"help", no_argument, 0, 'h'},
        {"size", required_argument, 0, 's'},
        {0, 0, 0, 0}};

    while (true)
    {
        int option_index = 0;
        int c = getopt_long(argc, argv, "v:qhs:", long_options, &option_index);
        if (c == -1)
            break;

        switch (c)
        {

        case 's':
            vector_size = parse_int_option("size", optarg);
            break;

        case 'v':
            verbose_level = parse_int_option("verbose", optarg);
            break;

        case 'q':
            verbose_level = 0;
            break;

        case 'h':
            usage(argc, argv);
            break;

        default:
            // includes '?', which getopt_long returns for unrecognised
            // options or a missing required argument
            throw std::runtime_error("unknown option, use -h or --help for more information");
        }
    }

    // now check parameters
    if (vector_size < 1) {
        throw std::runtime_error("vector size must be greater than 0");
    }

}


// Program entry point.
//
// Parses the command line, builds two host vectors (x filled with 1,
// y filled with 2), copies them to the GPU, launches kernel_sum to
// compute z = x + y, copies z back and prints it on standard output.
//
// Returns EXIT_FAILURE when the command line is invalid, EXIT_SUCCESS
// otherwise. CUDA call results are passed to ezc_valid / ezc_valid_kernel
// (declared in ez_cuda.h; their failure behaviour is defined there).
int main( int argc, char *argv[] ) {

	try {

		parse_arguments(argc, argv);

	} catch(const std::exception& e) {

		std::cerr << "error: " << e.what() << std::endl;
		return EXIT_FAILURE;

	}

	if (verbose_level > 0) {
		std::cout << "- create vectors with values" << std::endl;
	}

	// parse_arguments guarantees vector_size >= 1
	const size_t bytes = static_cast<size_t>(vector_size) * sizeof(int);

	// create vectors on cpu: x and y hold respectively 1 and 2, z is 0.
	// std::vector releases the storage on every exit path.
	std::vector<int> x_on_cpu( vector_size, 1 );
	std::vector<int> y_on_cpu( vector_size, 2 );
	std::vector<int> z_on_cpu( vector_size, 0 );

	// create vectors on gpu
	int *x_on_gpu = nullptr;
	int *y_on_gpu = nullptr;
	int *z_on_gpu = nullptr;

	ezc_valid( cudaMalloc( (void **) &x_on_gpu, bytes ) );
	ezc_valid( cudaMalloc( (void **) &y_on_gpu, bytes ) );
	ezc_valid( cudaMalloc( (void **) &z_on_gpu, bytes ) );

	// or you could use:
	// ezc_malloc( x_on_gpu, int, vector_size);
	// ezc_malloc( y_on_gpu, int, vector_size);
	// ezc_malloc( z_on_gpu, int, vector_size);

	// copy inputs to the gpu
	ezc_valid( cudaMemcpy( x_on_gpu, x_on_cpu.data(), bytes, CPU_to_GPU ) );
	ezc_valid( cudaMemcpy( y_on_gpu, y_on_cpu.data(), bytes, CPU_to_GPU ) );

	// or you could use (check ez_cuda.h for the exact macro contract):
	// ezc_memcpy_to_gpu( x_on_gpu, x_on_cpu.data(), int, vector_size );
	// ezc_memcpy_to_gpu( y_on_gpu, y_on_cpu.data(), int, vector_size );

	// one thread per element; the block count is the ceiling of
	// vector_size / block.x, computed in unsigned arithmetic so the
	// rounding term cannot overflow int and always matches the block size
	dim3 block(64);
	dim3 grid( (static_cast<unsigned int>(vector_size) + block.x - 1) / block.x );

	kernel_sum<<< grid, block>>>( x_on_gpu, y_on_gpu, z_on_gpu, vector_size );
	ezc_valid_kernel();

	// copy the result back to the cpu
	ezc_valid( cudaMemcpy( z_on_cpu.data(), z_on_gpu, bytes, GPU_to_CPU ) );

	// or you could use (check ez_cuda.h for the exact macro contract):
	// ezc_memcpy_to_cpu( z_on_cpu.data(), z_on_gpu, int, vector_size );

	// NOTE(review): the result is printed whatever verbose_level is, although
	// the help text describes --quiet as "no output" -- confirm the intent.
	std::cout << "- z=";
	for (int i = 0; i < vector_size; ++i) {
		std::cout << z_on_cpu[i] << " ";
	}
	std::cout << std::endl;

	// free resources on gpu, checking the result like every other CUDA call
	// (host vectors are released automatically)
	ezc_valid( cudaFree( x_on_gpu ) );
	ezc_valid( cudaFree( y_on_gpu ) );
	ezc_valid( cudaFree( z_on_gpu ) );

	return EXIT_SUCCESS;

}
