global pars_sse42_32bits_data_8bits

section .data

section .text

;-----------------------------------------------------------------------
; uint32_t pars_sse42_32bits_data_8bits(uint8_t *x, uint8_t *y,
;                                       uint8_t *z, uint32_t size);
;
; Syntax: NASM, 32-bit x86.
; ABI:    arguments are read from the stack ([ebp+8] .. [ebp+20]) and the
;         routine returns with a plain `ret`, i.e. caller cleans up.
;
; For every i in [0, size):
;       t    = x[i] & y[i]
;       z[i] = (t != 0) ? t : (x[i] | y[i])
; The return value (eax) is the number of positions where t == 0.
;
; The first size/16 blocks are processed 16 bytes at a time, the
; remaining size%16 bytes one at a time.
;
; Requirements:
;   - x, y and z must be 16-byte aligned whenever size >= 16, because the
;     vector loop uses movdqa for every load and store.
;   - SSE4.1 (pblendvb); POPCNT as well when ASM_POPCNT_COMPLIANT is
;     defined at assembly time.
;   - Without ASM_POPCNT_COMPLIANT the per-byte bit counts are fetched
;     from the external byte table parsimony_translation_table.
;     NOTE(review): it is indexed with values 0..255 and presumably holds
;     the number of set bits of its index -- confirm against its
;     definition.
;
; Register roles:
;   esi = x cursor      edi = y cursor      ebx = z cursor
;   ecx = iterations left in the current loop
;   edx = 16-bit mask of empty intersections (vector loop) / scratch
;   eax = scratch
;   xmm0 = byte mask, 0xFF where (x & y) == 0 (implicit pblendvb selector)
;   xmm1 = x[i..i+15]
;   xmm2 = y[i..i+15], then x | y
;   xmm3 = x & y, then the blended result written to z
;
; Clobbers:  eax, ecx, edx, xmm0-xmm3, EFLAGS.
; Preserves: ebx, esi, edi, ebp.
;-----------------------------------------------------------------------
%ifndef ASM_POPCNT_COMPLIANT
extern parsimony_translation_table
%endif

pars_sse42_32bits_data_8bits:
.arg_x          equ     8               ; [ebp + .arg_x]    = x
.arg_y          equ     12              ; [ebp + .arg_y]    = y
.arg_z          equ     16              ; [ebp + .arg_z]    = z
.arg_size       equ     20              ; [ebp + .arg_size] = size
.diff           equ     -4              ; [ebp + .diff]     = running count
.vec_bytes      equ     16              ; bytes handled per vector iteration
.vec_shift      equ     4               ; log2(.vec_bytes)

        push    ebp
        mov     ebp, esp
        sub     esp, 4                  ; room for the running count

        push    esi
        push    edi
        push    ebx

        mov     dword [ebp + .diff], 0

        mov     esi, [ebp + .arg_x]
        mov     edi, [ebp + .arg_y]
        mov     ebx, [ebp + .arg_z]

        mov     ecx, [ebp + .arg_size]
        shr     ecx, .vec_shift         ; ecx = number of 16-byte blocks; sets ZF
        jz      .tail                   ; fewer than 16 bytes: scalar only

.vec_loop:
        movdqa  xmm1, [esi]             ; xmm1 = x block
        movdqa  xmm2, [edi]             ; xmm2 = y block
        movdqa  xmm3, xmm1
        pand    xmm3, xmm2              ; xmm3 = x & y
        por     xmm2, xmm1              ; xmm2 = x | y
        pxor    xmm0, xmm0
        pcmpeqb xmm0, xmm3              ; xmm0 = 0xFF where (x & y) == 0
        pmovmskb edx, xmm0              ; edx = one bit per empty intersection

%ifdef ASM_POPCNT_COMPLIANT
        popcnt  edx, edx
        add     [ebp + .diff], edx
%else
        ; Count the bits of the 16-bit mask with two byte-table lookups.
        movzx   eax, dl
        movzx   edx, dh
        movzx   eax, byte [parsimony_translation_table + eax]
        movzx   edx, byte [parsimony_translation_table + edx]
        add     eax, edx
        add     [ebp + .diff], eax
%endif

        pblendvb xmm3, xmm2, xmm0       ; take x | y where the intersection is empty
        movdqa  [ebx], xmm3

        add     esi, .vec_bytes
        add     edi, .vec_bytes
        add     ebx, .vec_bytes

        dec     ecx
        jnz     .vec_loop

.tail:
        mov     ecx, [ebp + .arg_size]
        and     ecx, .vec_bytes - 1     ; ecx = leftover bytes; sets ZF
        jz      .done

.byte_loop:
        movzx   eax, byte [esi]         ; eax = x[i]
        movzx   edx, byte [edi]         ; edx = y[i]
        or      edx, eax                ; edx = x[i] | y[i]
        and     al, [edi]               ; eax = x[i] & y[i]; ZF = empty intersection
        cmovz   eax, edx                ; empty -> fall back to the union
        mov     [ebx], al               ; z[i] (mov/cmovz leave ZF untouched)
        setz    al
        movzx   eax, al                 ; eax = 1 if the intersection was empty
        add     [ebp + .diff], eax

        inc     esi
        inc     edi
        inc     ebx

        dec     ecx
        jnz     .byte_loop

.done:
        mov     eax, [ebp + .diff]      ; return value

        pop     ebx
        pop     edi
        pop     esi

        leave
        ret
