; private: No


default rel

;void
;printtime(uint64_t start, uint64_t end, uint64_t iter) {
;	printf("cycles: %lu, per loop: %f\n", end - start, (end - start) / (double)iter);
;}

extern printtime

section .text

; Number of passes through the timed loop; also passed to printtime as `iter`.
%define BENCH_ITERATIONS 0x10000000

; READ_TSC dst
; Reads the time-stamp counter and leaves the combined 64-bit value in `dst`.
; Clobbers rax and rdx.
%macro READ_TSC 1
	rdtsc				; edx:eax = TSC (upper halves of rax/rdx are cleared)
	shl	rdx, 32
	or	rax, rdx		; rax = (high << 32) | low
	mov	%1, rax
%endmacro

;-----------------------------------------------------------------------
; bench
; Times BENCH_ITERATIONS passes over a run of eight legacy-SSE
; `pxor xmm0, xmm0` instructions with rdtsc and reports the result via
;     printtime(start, end, iter)
; ABI:   SysV AMD64 (rdi, rsi, rdx carry the three arguments)
; In:    nothing
; Out:   nothing
; Clobb: rax, rdx, rsi, rdi, xmm0, flags, plus whatever printtime
;        clobbers; r8 holds the iteration count again on return
; Stack: one 8-byte push around the call; with the ABI-standard entry
;        alignment (rsp % 16 == 8) this leaves rsp 16-byte aligned at
;        the call site.
;-----------------------------------------------------------------------
bench:
	READ_TSC rdi			; rdi = start timestamp

	mov	r8, BENCH_ITERATIONS	; r8 keeps the count for the report
	mov	rax, r8			; rax = passes still to run
.loop:
	times 8 pxor xmm0, xmm0		; the instruction sequence under test
	sub	rax, 1
	jnz	.loop

	READ_TSC rsi			; rsi = end timestamp

	mov	rdx, r8			; rdx = iteration count
	push	r8			; 8-byte push: keeps the stack aligned for the call
	call	printtime
	pop	r8
	ret

;-----------------------------------------------------------------------
; int main(void)
; Runs bench three times to compare the legacy-SSE loop under different
; upper-YMM conditions:
;   1. upper halves clean (after vzeroupper),
;   2. upper halves dirtied by a 256-bit AVX instruction,
;   3. after XRSTOR of the state that was saved while clean.
; ABI:   SysV AMD64
; Out:   eax = 0
; Stack: allocates a 64-byte aligned XSAVE area below the frame; `leave`
;        releases it.
; Note:  assumes the CPU and OS support XSAVE and AVX; no feature check
;        is performed here.
;-----------------------------------------------------------------------
global main
main:
	push	rbp
	mov	rbp, rsp		; frame pointer so `leave` can undo the variable-size allocation

	; Get size of XSAVE area: CPUID leaf 0Dh, sub-leaf 0 returns it in ecx
	push	rbx			; cpuid overwrites rbx, which is callee-saved
	mov	eax, 0xd
	xor	ecx, ecx
	cpuid
	pop	rbx

	sub	rsp, rcx		; reserve rcx bytes for the XSAVE area
	and	rsp, ~63		; XSAVE/XRSTOR require a 64-byte aligned operand

	; Clear the area to save the registers in.
	; Zeroes exactly the rcx bytes [rsp, rsp+rcx). The previous byte loop
	; stored to [rsp+rcx] .. [rsp+1], skipping the first byte and writing
	; one byte past the end of the area (which can be the saved rbp).
	; The header must start out zero: XSAVE only updates XSTATE_BV, and
	; XRSTOR rejects non-zero reserved header bytes.
	mov	rdi, rsp		; rdi = start of area (argc in edi is not used)
	xor	eax, eax		; al = fill byte 0
	rep stosb			; rcx = byte count from cpuid; DF is clear per the ABI

	; Save the current SSE & AVX register state
	vzeroupper			; make sure the saved image has clean upper halves
	mov	eax, -1			; edx:eax = all ones: request every enabled component
	mov	edx, -1
	xsave	[rsp]

	; Initial test. Upper registers are clean
	call	bench

	; Make the upper state dirty and test
	vpxor	ymm1, ymm1, ymm1
	call	bench

	; Restore saved SSE and AVX registers
	mov	eax, -1
	mov	edx, -1
	xrstor	[rsp]

	; Test again
	call	bench

	xor	eax, eax		; return 0
	leave				; rsp = rbp, pop rbp: drops the XSAVE area
	ret