; Syntax: NASM (Intel operand order), x86-64.
; ABI:    System V AMD64 argument registers (rdi, rsi, rdx) are used for the
;         call to printtime.
;
; Micro-benchmark: times a fixed loop of legacy-SSE PXOR instructions three
; times -- with clean upper YMM state, with dirty upper YMM state, and again
; after the saved register state has been restored with XRSTOR.

default rel

; The C side is expected to provide:
;void
;printtime(uint64_t start, uint64_t end, uint64_t iter) {
;    printf("cycles: %lu, per loop: %f\n", end - start, (end - start) / (double)iter);
;}
extern  printtime
global  main

ITERATIONS      equ 0x10000000          ; loop trips per bench run
UNROLL          equ 8                   ; PXORs per loop trip
CPUID_XSTATE    equ 0xd                 ; CPUID leaf: extended state enumeration
XSAVE_ALIGN     equ 64                  ; XSAVE/XRSTOR area alignment in bytes

section .text

;-----------------------------------------------------------------------
; bench -- time ITERATIONS trips of an UNROLL-way PXOR loop and report it.
; In:    nothing
; Out:   nothing (result is printed by printtime)
; Uses:  rdi = TSC at start, rsi = TSC at end, r8/rdx = iteration count,
;        rax = loop counter, xmm0 = scratch.
; Stack: expects rsp % 16 == 8 on entry; the single push below brings it
;        back to 16-byte alignment for the call.
;-----------------------------------------------------------------------
bench:
        rdtsc                           ; edx:eax = TSC (upper halves zeroed)
        shl     rdx, 32
        or      rdx, rax
        mov     rdi, rdx                ; arg0: start timestamp

        mov     r8, ITERATIONS
        mov     rax, r8                 ; rax counts down to zero
.loop:
        times UNROLL pxor xmm0, xmm0    ; legacy-SSE op under test
        sub     rax, 1
        jnz     .loop

        rdtsc
        shl     rdx, 32
        or      rdx, rax
        mov     rsi, rdx                ; arg1: end timestamp
        mov     rdx, r8                 ; arg2: iteration count

        push    r8                      ; one 8-byte slot: realigns rsp to 16
        call    printtime
        pop     r8
        ret

;-----------------------------------------------------------------------
; int main(void)
; Saves the extended register state with XSAVE, runs bench with clean and
; then dirty upper YMM state, restores the state with XRSTOR, runs bench
; once more, and returns 0.
; Stack: rbp frame; the XSAVE area lives below it, 64-byte aligned, and is
;        released by `leave`.
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp, rsp

        ; Get size of XSAVE area (CPUID leaf 0xD, sub-leaf 0 -> ecx).
        push    rbx                     ; cpuid clobbers rbx (callee-saved)
        mov     eax, CPUID_XSTATE
        xor     ecx, ecx                ; sub-leaf 0
        cpuid
        pop     rbx

        sub     rsp, rcx                ; reserve the XSAVE area
        and     rsp, ~(XSAVE_ALIGN - 1) ; align to 64 bytes

        ; Clear the area to save the registers in: offsets rcx-1 down to 0.
        ; The -1 keeps the stores inside [rsp, rsp+rcx); without it byte 0
        ; is skipped and one byte past the area (possibly the saved rbp)
        ; gets overwritten.
        mov     rax, rcx
        xor     edx, edx
.zeroloop:
        mov     byte [rsp + rax - 1], dl
        sub     rax, 1
        jnz     .zeroloop

        ; Save the current SSE & AVX register state.
        vzeroupper
        mov     eax, -1                 ; edx:eax = all-ones component mask
        mov     edx, -1
        xsave   [rsp]

        ; Initial test. Upper registers are clean.
        call    bench

        ; Make the upper state dirty and test.
        vpxor   ymm1, ymm1, ymm1
        call    bench

        ; Restore saved SSE and AVX registers.
        mov     eax, -1
        mov     edx, -1
        xrstor  [rsp]

        ; Test again.
        call    bench

        xor     eax, eax                ; return 0
        leave                           ; drops the XSAVE area, restores rbp
        ret