%include "reg_sizes.asm"
; Prefetch look-ahead, in bytes: added to the buffer pointer in the
; prefetchnta operands below.
%define fetch_dist 1024

[bits 64]
default rel
	; update the buffer pointer
	add	arg2, 64			; buf += 64;

	; Non-temporal prefetch hint for the address fetch_dist bytes past the
	; just-advanced buffer pointer.
	prefetchnta [arg2+fetch_dist+0]
	movdqu	xmm4, xmm0			; xmm4 = copy of xmm0
	movdqu	xmm5, xmm1			; xmm5 = copy of xmm1
	; NOTE(review): as written, xmm4/xmm5 are unmodified copies of xmm0/xmm1
	; at this point, so these two pxor instructions leave xmm0 and xmm1 zero.
	; Confirm no instructions between the copies and the xors were dropped
	; from this sequence.
	pxor	xmm0, xmm4
	pxor	xmm1, xmm5

	; Second prefetch hint, 32 bytes beyond the first prefetched address.
	prefetchnta [arg2+fetch_dist+32]
	movdqu	xmm4, xmm2			; xmm4 = copy of xmm2
	movdqu	xmm5, xmm3			; xmm5 = copy of xmm3