global pars_sse2_64bits_data_32bits

section .text

;-----------------------------------------------------------------------
; pars_sse2_64bits_data_32bits
;
; For every 32-bit element i of the two input arrays:
;     t = x[i] & y[i]
;     if t == 0 :  z[i] = x[i] | y[i]   and the change counter is bumped
;     else      :  z[i] = t
;
; Argument registers follow the System V AMD64 order.
; In:    rdi = x, first input array   (32-bit elements)
;        rsi = y, second input array  (32-bit elements)
;        rdx = z, output array        (32-bit elements)
;        rcx = number of elements (compared as a signed 64-bit value)
; Out:   rax = number of elements for which (x[i] & y[i]) == 0
; Clobb: rcx, r8, r9, r10, r11, xmm0-xmm4, flags
;        (rbx is used as scratch but saved and restored)
; Align: the vector loop uses movdqa, so x, y and z must be 16-byte
;        aligned whenever at least 4 elements are processed.
;-----------------------------------------------------------------------
pars_sse2_64bits_data_32bits:
	push	rbp
	mov		rbp, rsp
	push	rbx				; callee-saved, used as scratch below

	xor		r8d, r8d		; changes <- 0
	xor		r9d, r9d		; byte offset into x, y and z
	xor		r10d, r10d		; i <- 0 (element index)

	mov		r11, rcx		; length
	and		r11, ~3			; elements handled by the vector loop
	jz		.cpt_loop1		; fewer than 4 elements: scalar loop only

; treat by 4 x 32 bits
.cpt_loop16:

	movdqa	xmm0, [rdi + r9]	; xmm0 = x[i..i+3]
	movdqa	xmm1, [rsi + r9]	; xmm1 = y[i..i+3]

	pxor	xmm4, xmm4		; must be re-zeroed: pcmpeqd overwrites it

	movdqa	xmm2, xmm0

	pand	xmm2, xmm1		; xmm2 = x & y
	por		xmm0, xmm1		; xmm0 = x | y

	pcmpeqd	xmm4, xmm2		; xmm4 = all-ones in each lane where (x & y) == 0

	pmovmskb	eax, xmm4	; 16-bit byte mask, zero-extended into rax
	and		eax, 0x1111		; keep one mask bit per 32-bit lane
	movdqa	xmm3, xmm4		; keep the lane mask for the blend below

	; r8 += number of lanes where (x & y) == 0
%ifdef ASM_POPCNT_COMPLIANT
	popcnt	eax, eax
	add		r8, rax
%else
	; Table-driven count, one lookup per mask byte.
	; NOTE(review): parsimony_translation_table is defined outside this view;
	; presumably it maps a byte to its number of set bits - confirm.
	; NOTE(review): the table is addressed with a 32-bit absolute effective
	; address, which is not position independent - confirm the link model.
	movzx	ebx, al
	movzx	ebx, byte [parsimony_translation_table + ebx]
	add		r8, rbx
	movzx	ebx, ah
	movzx	ebx, byte [parsimony_translation_table + ebx]
	add		r8, rbx
%endif

	; z = mask ? (x | y) : (x & y)
	pandn	xmm4, xmm2		; xmm4 = ~mask & (x & y)
	pand	xmm3, xmm0		; xmm3 =  mask & (x | y)
	por		xmm3, xmm4
	movdqa	[rdx + r9], xmm3

	add		r9, 16
	add		r10, 4
	cmp		r10, r11
	jl		.cpt_loop16

; scalar loop for the remaining (length mod 4) elements
.cpt_loop1:
	mov		r11, rcx		; full length; copied because ecx is reused as scratch

.cpt_loop1_for:
	cmp		r10, r11
	jge		.cpt_loop1_for_end

	mov		eax, [rdi + r9]	; eax = x[i]
	mov		ebx, [rsi + r9]	; ebx = y[i]
	mov		ecx, ebx
	or		ebx, eax		; ebx = x[i] | y[i]
	and		ecx, eax		; ecx = x[i] & y[i]
	xor		eax, eax		; rax <- 0 (flags are set again by the test below)
	test	ecx, ecx
	cmovz	ecx, ebx		; intersection empty: take the union
	setz	al				; rax = 1 when the union was taken

	mov		[rdx + r9], ecx	; store the whole 32-bit element, not just its low byte
	add		r8, rax
	inc		r10
	add		r9, 4

	jmp		.cpt_loop1_for

.cpt_loop1_for_end:

	mov		rax, r8			; return the number of changes
	pop		rbx				; restore rbx
	pop		rbp
	ret
