#!/bin/sh
# To record occupancy, uncomment the two exports below; note that profiling slows down execution.
#export CUDA_PROFILE=1
#export CUDA_PROFILE_LOG=tmp/cuda.log

# number of taxa
N="64 128 256 512"
N="512"
# sizes
K="1024 2048 4096 8192 16384 32768 131072 262144"
K="262144"
echo "============"
echo "TESTS ON GPU"
echo "============"

# architecture
arch=`bin/main.exe --tree-implementation=2 | grep "^architecture=" | cut -d '=' -f2 | cut -d ' ' -f1`
# data size
dasz=`bin/main.exe --tree-implementation=2 | grep "^data size=" | cut -d '=' -f2 | cut -d ' ' -f1`

gpu_name=`bin/main.exe --gpu-detect 2>&1 | grep "^gpu_name=" | cut -d'=' -f2 | tr ' ' '-' | head -1`
threads_per_block=256
if test $# -eq 1 ; then
	threads_per_block=$1
fi

# add --gpu-select=0 (or 1,2,3} to specify GPU to use
params="--tree-implementation=2 --test=3 --threads-per-block=${threads_per_block}"
results="results/gpu_${gpu_name}_arch_${arch}_data_${dasz}_threads_${threads_per_block}.txt"
rm -rf $results

echo "generate results in $results"

stdout="tmp/stdout.txt"
stderr="tmp/stderr.txt"
cudalog="tmp/cuda.log"
for n in $N ; do
	for k in $K ; do
		echo "time ./bin/main.exe --nbr-taxa=$n --taxa-length=$k $params >$stdout 2>$stderr" 
		time ./bin/main.exe --nbr-taxa=$n --taxa-length=$k $params >$stdout 2>$stderr
		t=`cat $stdout | grep "gpu elapsed=" | cut -d'=' -f2`
		tpb=`cat $stdout | grep "^threads per block=" | cut -d'=' -f2`
		if test -f $cudalog ; then
			occ=`cat $cudalog | grep "occupancy=" | head -1 | sed -e "s/.\+occupancy.\[ \([0-9.]\+\).\+/\1/g"`
		fi
		echo "$n; $k; $t; $tpb; $occ"		
		echo "$n; $k; $t; $tpb; $occ"	>> $results
	done
done
