default rel
;void
;printtime(uint64_t start, uint64_t end, uint64_t iter) {
; printf("cycles: %lu, per loop: %f\n", end - start, (end - start) / (double)iter);
;}
extern printtime
section .text
;-----------------------------------------------------------------------
; bench -- time a fixed-count loop of legacy-SSE pxor instructions and
;          report the result through printtime(start, end, iter).
; In:    nothing
; Out:   nothing
; Clobb: rax, rdx, rsi, rdi, r8, xmm0, flags, plus anything printtime
;        itself clobbers
; Note:  start/end/iter are passed in rdi/rsi/rdx.
;-----------------------------------------------------------------------
BENCH_ITERATIONS equ 0x10000000             ; trip count of the timed loop

bench:
        ; rdi = starting timestamp (edx:eax merged into one 64-bit value)
        rdtsc
        shl     rdx, 32
        or      rdx, rax
        mov     rdi, rdx

        mov     r8, BENCH_ITERATIONS        ; r8 keeps the count for the report
        mov     rax, r8                     ; rax = iterations remaining

.loop:
        ; eight back-to-back pxor on the same register per iteration
%rep 8
        pxor    xmm0, xmm0
%endrep
        sub     rax, 1
        jnz     .loop

        ; rsi = ending timestamp
        rdtsc
        shl     rdx, 32
        or      rdx, rax
        mov     rsi, rdx

        mov     rdx, r8                     ; third argument: iteration count

        ; The push moves rsp by 8 on top of the 8-byte return address, so
        ; rsp is 16-byte aligned at the call whenever it was aligned at
        ; the call into bench.
        push    r8
        call    printtime
        pop     r8
        ret
;-----------------------------------------------------------------------
; int main(void)
; Runs bench three times:
;   1. after vzeroupper (upper halves of the YMM registers clean),
;   2. after a 256-bit instruction has been executed,
;   3. after the state saved in step 1 has been restored with xrstor.
; Out:   eax = 0
; Stack: an XSAVE area is carved out below the frame, 64-byte aligned;
;        `leave` releases it.
;-----------------------------------------------------------------------
global main
main:
        push    rbp
        mov     rbp, rsp

        ; Get size of XSAVE area: CPUID leaf 0xD, sub-leaf 0, size in ecx
        push    rbx                         ; cpuid overwrites ebx; rbx is callee-saved
        mov     eax, 0xd
        xor     ecx, ecx
        cpuid
        pop     rbx
        sub     rsp, rcx                    ; reserve the area
        and     rsp, ~63                    ; align to 64 bytes for xsave/xrstor

        ; Clear the area to save the registers in.
        ; rax counts down size..1, so index rax-1 touches exactly bytes
        ; size-1..0 of the area. Indexing with rax alone would skip byte 0
        ; and write one byte past the end, which is the saved rbp at [rbp]
        ; when the alignment step above did not move rsp.
        mov     rax, rcx
        xor     edx, edx
.zeroloop:
        mov     [rsp+rax-1], dl
        sub     rax, 1
        jnz     .zeroloop

        ; Save the current SSE & AVX register state
        vzeroupper
        mov     eax, -1                     ; edx:eax = all ones: request every
        mov     edx, -1                     ; state component
        xsave   [rsp]

        ; Initial test. Upper registers are clean
        call    bench

        ; Make the upper state dirty and test
        vpxor   ymm1, ymm1, ymm1
        call    bench

        ; Restore saved SSE and AVX registers
        mov     eax, -1
        mov     edx, -1
        xrstor  [rsp]

        ; Test again
        call    bench

        xor     eax, eax                    ; return 0
        leave                               ; also drops the XSAVE area
        ret