global pars_sse42_64bits_data_8bits

; This file follows the System V AMD64 ABI calling convention
; (the one used on Linux).
; The registers RDI, RSI, RDX, RCX, R8 and R9 are used for integer and pointer arguments
; while XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6 and XMM7 are used for floating point
; arguments
; RBX, RBP, R12 to R15 must be preserved
; RAX, RCX, RDX, RSI, RDI, R8 to R11 can be modified

section .text

;-----------------------------------------------------------------------
; pars_sse42_64bits_data_8bits
;
; Byte-wise pass over two input arrays, equivalent to:
;
;     changes = 0
;     for (i = 0; i < size; ++i) {
;         z[i] = x[i] & y[i];
;         if (z[i] == 0) {
;             z[i] = x[i] | y[i];
;             ++changes;
;         }
;     }
;     return changes;
;
; ABI:   System V AMD64
; In:    rdi = x    (source bytes)
;        rsi = y    (source bytes)
;        rdx = z    (destination bytes)
;        rcx = size (all 64 bits are read; compared as a signed value,
;                    so a zero or negative size processes nothing)
; Out:   rax = number of positions where (x[i] & y[i]) == 0
; Clobb: rcx, r8, r9, r10, r11, xmm0-xmm7, flags
;
; Alignment: blocks of 16 bytes are accessed with movdqa, so x, y and z
; must be 16-byte aligned as soon as size >= 16.
;
; Build switch: when ASM_POPCNT_COMPLIANT is defined the mask bits are
; counted with POPCNT.  Otherwise every mask byte is looked up in the
; 256-entry byte table parsimony_translation_table (presumably the bit
; count of its index - confirm against its definition).  That symbol is
; not declared in the visible part of this file: it must be defined or
; declared EXTERN elsewhere.  In that build rbx holds the table base and
; is saved/restored here.
;
; Register roles: r8 = changes, r10 = i, r11 = current loop bound,
;                 rax / r9 = scratch.
;-----------------------------------------------------------------------
pars_sse42_64bits_data_8bits:
%ifndef ASM_POPCNT_COMPLIANT
	push	rbx							; callee-saved, used as table base
	lea		rbx, [rel parsimony_translation_table]
%endif

	xor		r8d, r8d					; changes <- 0
	xor		r10d, r10d					; i <- 0

	mov		r11, rcx
	and		r11, ~31					; bytes covered by the 32-byte loop
	cmp		r11, 32
	jl		.cpt_next16

	; ---- main loop, unrolled twice: 2 * 16 bytes per iteration --------
.cpt_loop32:
	movdqa	xmm0, [rdi + r10]			; x, low 16 bytes
	movdqa	xmm4, [rdi + r10 + 16]		; x, high 16 bytes
	movdqa	xmm1, [rsi + r10]			; y, low 16 bytes
	movdqa	xmm5, [rsi + r10 + 16]		; y, high 16 bytes

	pxor	xmm3, xmm3
	pxor	xmm7, xmm7

	movdqa	xmm2, xmm0					; keep a copy of x for the union
	movdqa	xmm6, xmm4

	pand	xmm0, xmm1					; xmm0 / xmm4 = x & y
	pand	xmm4, xmm5
	por		xmm1, xmm2					; xmm1 / xmm5 = x | y
	por		xmm5, xmm6

	pcmpeqb	xmm3, xmm0					; 0xFF in every byte where (x & y) == 0
	pcmpeqb	xmm7, xmm4

	pmovmskb	eax, xmm3				; one bit per empty byte, rax upper bits = 0
	movdqa	xmm2, xmm3					; second copy of the mask for the blend

%ifdef ASM_POPCNT_COMPLIANT
	popcnt	eax, eax
	add		r8, rax
%else
	movzx	r9d, al						; low mask byte
	movzx	eax, ah						; high mask byte
	movzx	r9d, byte [rbx + r9]
	movzx	eax, byte [rbx + rax]
	add		r8, r9
	add		r8, rax
%endif

	pmovmskb	eax, xmm7
	movdqa	xmm6, xmm7

%ifdef ASM_POPCNT_COMPLIANT
	popcnt	eax, eax
	add		r8, rax
%else
	movzx	r9d, al
	movzx	eax, ah
	movzx	r9d, byte [rbx + r9]
	movzx	eax, byte [rbx + rax]
	add		r8, r9
	add		r8, rax
%endif

	pand	xmm2, xmm1					;  mask & (x | y)
	pand	xmm6, xmm5
	pandn	xmm3, xmm0					; ~mask & (x & y)
	pandn	xmm7, xmm4
	por		xmm2, xmm3					; z = union where empty, else intersection
	por		xmm6, xmm7

	movdqa	[rdx + r10], xmm2
	movdqa	[rdx + r10 + 16], xmm6

	add		r10, 32
	cmp		r10, r11
	jl		.cpt_loop32

	; ---- at most one remaining block of 16 bytes ----------------------
	; After the loop above fewer than 32 bytes are left, so this part
	; runs zero or one time (it is not a loop).
.cpt_next16:
	mov		r11, rcx					; length
	and		r11, ~15					; bytes reachable with 16-byte blocks
	mov		r9, r11
	sub		r9, r10						; 16-byte-aligned bytes still pending
	cmp		r9, 16
	jl		.cpt_loop1

.cpt_loop16:
	movdqa	xmm0, [rdi + r10]			; x
	movdqa	xmm1, [rsi + r10]			; y

	pxor	xmm4, xmm4

	movdqa	xmm2, xmm0

	pand	xmm2, xmm1					; xmm2 = x & y
	por		xmm0, xmm1					; xmm0 = x | y

	pcmpeqb	xmm4, xmm2					; 0xFF where (x & y) == 0

	pmovmskb	eax, xmm4

	movdqa	xmm3, xmm4

	; Same counting strategy as the 32-byte loop, so that a build without
	; ASM_POPCNT_COMPLIANT never executes POPCNT.
%ifdef ASM_POPCNT_COMPLIANT
	popcnt	eax, eax
	add		r8, rax
%else
	movzx	r9d, al
	movzx	eax, ah
	movzx	r9d, byte [rbx + r9]
	movzx	eax, byte [rbx + rax]
	add		r8, r9
	add		r8, rax
%endif

	pandn	xmm4, xmm2					; ~mask & (x & y)
	pand	xmm3, xmm0					;  mask & (x | y)
	por		xmm3, xmm4
	movdqa	[rdx + r10], xmm3

	add		r10, 16

	; ---- scalar tail: remaining bytes one at a time -------------------
.cpt_loop1:
	mov		r11, rcx					; bound = length; rcx becomes scratch below

.cpt_loop1_for:
	cmp		r10, r11
	jge		.cpt_loop1_for_end

	movzx	eax, byte [rdi + r10]		; x[i]
	movzx	r9d, byte [rsi + r10]		; y[i]
	mov		ecx, r9d
	or		r9d, eax					; r9 = x[i] | y[i]
	and		ecx, eax					; ecx = x[i] & y[i], ZF set when empty
	cmovz	ecx, r9d					; empty intersection -> take the union
	setz	al							; eax = 1 when empty, else 0
										; (bits 8..31 are already 0 from movzx)

	mov		[rdx + r10], cl
	add		r8, rax
	inc		r10

	jmp		.cpt_loop1_for

.cpt_loop1_for_end:
	mov		rax, r8						; return the number of changes

%ifndef ASM_POPCNT_COMPLIANT
	pop		rbx
%endif
	ret
