global sum_asm
global sum_asm_vec_sse
global sum_asm_vec_avx


; ===================================================================
; DATA
; ===================================================================
section .data


; ===================================================================
; CODE
; ===================================================================
section .text

; -------------------------------------------------------------------
; int sum_asm(int *t, int size)
;
;    Compute the sum of the first 'size' elements of array 't'.
;    Returns 0 when size <= 0. The sum is accumulated in 32 bits and
;    wraps on overflow.
;
; -------------------------------------------------------------------
; 64 bits code and registers mapping (System V AMD64 argument order)
; -------------------------------------------------------------------
; In:     rdi = t
;         esi = size (32-bit int: the upper half of rsi is NOT
;                     guaranteed to be zero and is never read)
; Out:    eax = sum
; Clobb:  rcx (ecx = i), flags
; -------------------------------------------------------------------
sum_asm:
    xor     eax, eax                ; int sum = 0
    test    esi, esi                ; signed 32-bit test of size
    jle     .end                    ; if size <= 0 goto .end

    xor     ecx, ecx                ; i = 0 (also zeroes upper rcx, so
                                    ; rcx is a valid 64-bit index)
.for:
    add     eax, [rdi + rcx * 4]    ; sum += t[i]
    add     ecx, 1                  ; ++i
    cmp     ecx, esi                ; size > 0 here, so i reaches size
    jne     .for                    ; if i != size goto .for

.end:
    ret                             ; return sum
    
; -------------------------------------------------------------------
; int sum_asm_vec_sse(int *t, int size)
;
;    Compute the sum of the first 'size' elements of array 't' using
;    SSE2 vectorization: 4 ints are accumulated per iteration, then
;    the remaining (size & 3) elements are added one by one.
;    Returns 0 when size <= 0. The sum wraps on 32-bit overflow.
;    't' only needs the natural 4-byte alignment of an int: the
;    vector load is unaligned.
;
; -------------------------------------------------------------------
; 64 bits code and registers mapping (System V AMD64 argument order)
; -------------------------------------------------------------------
; In:     rdi = t
;         esi = size (32-bit int: the upper half of rsi is NOT
;                     guaranteed to be zero and is never read)
; Out:    eax = sum
; Clobb:  rcx (ecx = i), rdx (edx = remaining loop iterations),
;         xmm0 (4 partial sums), xmm1 (scratch), flags
; -------------------------------------------------------------------
sum_asm_vec_sse:
    xor     eax, eax                ; int sum = 0
    test    esi, esi                ; signed 32-bit test of size
    jle     .end                    ; if size <= 0 goto .end

    xor     ecx, ecx                ; int i = 0 (zeroes upper rcx too)

    mov     edx, esi                ; int nbr_loops = size / 4
    shr     edx, 2                  ; (size > 0, so a logical shift is fine)
    jz      .last                   ; if nbr_loops == 0 goto .last

    pxor    xmm0, xmm0              ; xmm0 = [0,0,0,0]
.for_vec:
    movdqu  xmm1, [rdi + rcx * 4]   ; unaligned load of t[i..i+3]; a memory
                                    ; operand on paddd would fault unless
                                    ; t is 16-byte aligned
    paddd   xmm0, xmm1              ; xmm0[k] += t[i+k], k = 0..3
    add     ecx, 4                  ; i += 4
    dec     edx                     ; --nbr_loops
    jnz     .for_vec                ; if nbr_loops != 0 goto .for_vec

    ; Horizontal reduction with SSE2 only (no SSSE3 phaddd needed).
    pshufd  xmm1, xmm0, 0x4E        ; xmm1 = [x2, x3, x0, x1]
    paddd   xmm0, xmm1              ; xmm0 = [x0+x2, x1+x3, ...]
    pshufd  xmm1, xmm0, 0xB1        ; swap adjacent dwords
    paddd   xmm0, xmm1              ; xmm0[0] = x0 + x1 + x2 + x3
    movd    eax, xmm0               ; sum = xmm0[0]

.last:
    mov     edx, esi                ; int last_iterations = size & 3
    and     edx, 3
    jz      .end                    ; if last_iterations == 0 goto .end

.for:
    add     eax, [rdi + rcx * 4]    ; sum += t[i]
    add     ecx, 1                  ; ++i
    dec     edx                     ; --last_iterations
    jnz     .for                    ; if last_iterations != 0 goto .for

.end:
    ret                             ; return sum
    
; -------------------------------------------------------------------
; int sum_asm_vec_avx(int *t, int size)
;
;    Compute the sum of the first 'size' elements of array 't' using
;    256-bit vectorization: 8 ints are accumulated per iteration, then
;    the remaining (size & 7) elements are added one by one.
;    Returns 0 when size <= 0. The sum wraps on 32-bit overflow.
;
;    NOTE: the 256-bit integer instructions used here (vpaddd /
;    vphaddd on ymm registers) require AVX2, not just AVX.
;
; -------------------------------------------------------------------
; 64 bits code and registers mapping (System V AMD64 argument order)
; -------------------------------------------------------------------
; In:     rdi = t
;         esi = size (32-bit int: the upper half of rsi is NOT
;                     guaranteed to be zero and is never read)
; Out:    eax = sum
; Clobb:  rcx (ecx = i), rdx (edx = remaining loop iterations),
;         ymm0 (8 partial sums), xmm1 (scratch), flags
; -------------------------------------------------------------------
sum_asm_vec_avx:
    xor     eax, eax                ; int sum = 0
    test    esi, esi                ; signed 32-bit test of size
    jle     .end                    ; if size <= 0 goto .end

    xor     ecx, ecx                ; int i = 0 (zeroes upper rcx too)

    mov     edx, esi                ; int nbr_loops = size / 8
    shr     edx, 3                  ; (size > 0, so a logical shift is fine)
    jz      .last                   ; if nbr_loops == 0 goto .last

    vpxor   ymm0, ymm0, ymm0        ; ymm0 = [0,0,0,0,0,0,0,0]
.for_vec:
    vpaddd  ymm0, ymm0, [rdi + rcx * 4] ; ymm0[k] += t[i+k], k = 0..7
                                    ; (VEX memory operand: no alignment
                                    ; requirement)
    add     ecx, 8                  ; i += 8
    dec     edx                     ; --nbr_loops
    jnz     .for_vec                ; if nbr_loops != 0 goto .for_vec

    vphaddd ymm0, ymm0, ymm0        ; after two horizontal adds every dword
    vphaddd ymm0, ymm0, ymm0        ; of a 128-bit lane holds that lane's sum
    vextracti128 xmm1, ymm0, 1      ; xmm1 = upper-lane sum
    vpaddd  xmm0, xmm0, xmm1        ; INTEGER add of both lane sums (a float
                                    ; add would corrupt the integer bits)
    vmovd   eax, xmm0               ; sum = ymm0[0] + ... + ymm0[7]
    vzeroupper                      ; clear upper YMM state before returning
                                    ; to code that may use legacy SSE

.last:
    mov     edx, esi                ; int last_iterations = size & 7
    and     edx, 7
    jz      .end                    ; if last_iterations == 0 goto .end

.for:
    add     eax, [rdi + rcx * 4]    ; sum += t[i]
    add     ecx, 1                  ; ++i
    dec     edx                     ; --last_iterations
    jnz     .for                    ; if last_iterations != 0 goto .for

.end:
    ret                             ; return sum
    
        
