%include "reg_sizes.asm"
; fetch_dist: byte displacement added to arg2 when forming the prefetchnta
; addresses below, i.e. how far ahead of the current buffer pointer the
; prefetch hints are issued.
; NOTE(review): the leading '+' on this and some later lines looks like
; diff-marker residue rather than NASM syntax -- confirm against the real file.
+%define fetch_dist 1024
+
; Assemble as 64-bit code.
[bits 64]
; Make memory operands without an explicit base register RIP-relative by default.
default rel
; Update the buffer pointer: advance arg2 by 64 bytes.
; NOTE(review): arg2 is not defined in the visible lines; presumably a
; register alias supplied by an include or an earlier %define -- confirm.
add arg2, 64
; Non-temporal prefetch hint for the address fetch_dist bytes past the
; just-advanced arg2.
+ prefetchnta [arg2+fetch_dist+0]
; Copy xmm0 / xmm1 into the scratch registers xmm4 / xmm5.
movdqa xmm4, xmm0
movdqa xmm5, xmm1
; NOTE(review): exactly as shown here, xmm4 == xmm0 and xmm5 == xmm1 at this
; point, so these XORs would clear xmm0 and xmm1. Instructions between the
; copies and the XORs appear to be missing from this excerpt -- verify against
; the complete source before relying on this sequence.
pxor xmm0, xmm4
pxor xmm1, xmm5
; Second non-temporal prefetch hint, 32 bytes beyond the first one.
+ prefetchnta [arg2+fetch_dist+32]
; Copy xmm2 / xmm3 into xmm4 / xmm5 (the previous scratch contents are
; overwritten).
movdqa xmm4, xmm2
movdqa xmm5, xmm3