global pars_avx2_64bits_data_8bits


; This file follows the calling convention of the System V AMD64
; application binary interface, as used on Linux.
; The registers RDI, RSI, RDX, RCX, R8 and R9 carry integer and pointer arguments,
; while XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6 and XMM7 carry floating point
; arguments.
; RBX, RBP and R12 to R15 must be preserved by the callee.
; RAX, RCX, RDX, RSI, RDI and R8 to R11 may be modified freely.



section .text

; ===============================================================
; pars_avx2_64bits_data_8bits(x, y, z, length)
;
; Byte-wise combination of two arrays, using 128-bit VEX-encoded
; SIMD instructions. For every index i in [0, length):
;
;     t = x[i] & y[i]
;     if (t == 0) { z[i] = x[i] | y[i]; changes++ }
;     else        { z[i] = t }
;
; and the function returns `changes`.
;
; ABI:    System V AMD64, leaf function (no calls).
; In:     rdi = x       (source bytes, read only)
;         rsi = y       (source bytes, read only)
;         rdx = z       (destination bytes)
;         rcx = length  (number of bytes, any value; treated as unsigned)
; Out:    rax = number of positions where x[i] & y[i] was zero
; Clobb:  rax, rcx, r8, r9, r10, r11, xmm0-xmm7, flags
;         (rbx is saved/restored on the stack in the table fallback build)
;
; Alignment: unaligned loads/stores (vmovdqu) are used, so x, y and z
; do not have to be 16-byte aligned.
;
; NOTE(review): length is consumed as the full 64-bit rcx. If the C
; prototype declares it as a 32-bit unsigned int, the caller must pass it
; zero-extended -- confirm against the C declaration.
;
; Build switch:
;   ASM_POPCNT_COMPLIANT defined     -> bit counting uses POPCNT
;   ASM_POPCNT_COMPLIANT not defined -> bit counting uses the external
;       256-entry byte table parsimony_translation_table (indexed by a
;       byte value; presumably holds the bit count of that value --
;       verify against its definition)
;
; Register roles:
;   r8  = changes
;   r9  = table base (fallback build, SIMD part) / scratch (scalar tail)
;   r10 = i
;   r11 = upper bound of i for the current phase
;   xmm7 = all-zero vector used by the byte compares
; ===============================================================

%ifndef ASM_POPCNT_COMPLIANT
extern parsimony_translation_table
%endif

; ---------------------------------------------------------------
; PARS8_ADD_MASK_BITS
; In:    eax = 16-bit mask produced by vpmovmskb on an xmm register
; Out:   r8 += number of bits set in that mask
; Clobb: rax, flags (and rbx in the table fallback build)
; ---------------------------------------------------------------
%macro PARS8_ADD_MASK_BITS 0
%ifdef ASM_POPCNT_COMPLIANT
	popcnt	eax, eax					; 32-bit write also clears the upper half of rax
%else
	movzx	ebx, ah						; high byte of the mask
	movzx	eax, al						; low byte of the mask
	movzx	ebx, byte [r9 + rbx]		; 64-bit base + index: no truncated addresses
	movzx	eax, byte [r9 + rax]
	add		eax, ebx
%endif
	add		r8, rax
%endmacro

pars_avx2_64bits_data_8bits:
%ifndef ASM_POPCNT_COMPLIANT
	push	rbx										; callee-saved scratch for the table lookups
	lea		r9, [rel parsimony_translation_table]	; RIP-relative, PIE friendly
%endif

	xor		r8d, r8d					; changes <- 0
	xor		r10d, r10d					; i <- 0
	vpxor	xmm7, xmm7, xmm7			; constant zero vector (loop invariant)

	mov		r11, rcx
	and		r11, ~31					; r11 = length rounded down to a multiple of 32
	jz		.cpt_next16					; fewer than 32 bytes: skip the unrolled loop

	; ---- 32 bytes per iteration (two independent 16-byte lanes) ----
.cpt_loop32:
	vmovdqu	xmm0, [rdi + r10]			; x, low lane
	vmovdqu	xmm4, [rdi + r10 + 16]		; x, high lane
	vmovdqu	xmm1, [rsi + r10]			; y, low lane
	vmovdqu	xmm5, [rsi + r10 + 16]		; y, high lane

	vpand	xmm2, xmm0, xmm1			; intersection x & y
	vpand	xmm6, xmm4, xmm5
	vpor	xmm0, xmm0, xmm1			; union x | y
	vpor	xmm4, xmm4, xmm5

	vpcmpeqb	xmm3, xmm2, xmm7		; 0xFF in every byte whose intersection is empty
	vpcmpeqb	xmm5, xmm6, xmm7

	vpmovmskb	eax, xmm3
	PARS8_ADD_MASK_BITS
	vpmovmskb	eax, xmm5
	PARS8_ADD_MASK_BITS

	; The intersection is already zero wherever the mask is set, so
	; (union & mask) | intersection selects the right byte without an
	; additional and-not step.
	vpand	xmm0, xmm0, xmm3
	vpand	xmm4, xmm4, xmm5
	vpor	xmm0, xmm0, xmm2
	vpor	xmm4, xmm4, xmm6

	vmovdqu	[rdx + r10], xmm0
	vmovdqu	[rdx + r10 + 16], xmm4

	add		r10, 32
	cmp		r10, r11
	jb		.cpt_loop32					; unsigned: i and length are sizes

	; ---- at most one remaining 16-byte block ----
.cpt_next16:
	; Here i == length & ~31, so a full 16-byte block remains exactly
	; when bit 4 of length is set.
	test	cl, 16
	jz		.cpt_loop1

	vmovdqu	xmm0, [rdi + r10]
	vmovdqu	xmm1, [rsi + r10]

	vpand	xmm2, xmm0, xmm1			; intersection
	vpor	xmm0, xmm0, xmm1			; union
	vpcmpeqb	xmm3, xmm2, xmm7		; mask of empty intersections

	vpmovmskb	eax, xmm3
	PARS8_ADD_MASK_BITS

	vpand	xmm0, xmm0, xmm3
	vpor	xmm0, xmm0, xmm2
	vmovdqu	[rdx + r10], xmm0

	add		r10, 16

	; ---- scalar tail: remaining length % 16 bytes ----
.cpt_loop1:
	mov		r11, rcx					; bound = length; rcx becomes scratch below

.cpt_loop1_for:
	cmp		r10, r11
	jae		.cpt_loop1_for_end

	movzx	eax, byte [rdi + r10]		; a = x[i]
	movzx	ecx, byte [rsi + r10]		; b = y[i]
	mov		r9d, eax
	or		r9d, ecx					; r9d = a | b
	and		eax, ecx					; eax = a & b, ZF set when it is empty
	cmovz	eax, r9d					; empty intersection -> take the union (flags untouched)
	setz	cl							; rcx = 0 or 1: upper bits are zero from the movzx
	mov		[rdx + r10], al
	add		r8, rcx
	inc		r10
	jmp		.cpt_loop1_for

.cpt_loop1_for_end:
	mov		rax, r8						; return changes
%ifndef ASM_POPCNT_COMPLIANT
	pop		rbx
%endif
	ret


