#
#------------------------------------------------------------------------------
-
#------------------------------------------------------------------------------
# VOID *
# EFIAPI
# InternalMemCopyMem (
#   IN VOID   *Destination,
#   IN VOID   *Source,
#   IN UINTN  Count
#   )
-#------------------------------------------------------------------------------
-.intel_syntax noprefix
-.globl ASM_PFX(InternalMemCopyMem)
-ASM_PFX(InternalMemCopyMem):
- push rsi
- push rdi
- mov rsi, rdx # rsi <- Source
- mov rdi, rcx # rdi <- Destination
- lea r9, [rsi + r8 - 1] # r9 <- End of Source
- cmp rsi, rdi
- mov rax, rdi # rax <- Destination as return value
- jae L0
- cmp r9, rdi
- jae L_CopyBackward # Copy backward if overlapped
-L0:
- mov rcx, r8
- and r8, 7
- shr rcx, 3 # rcx <- # of Qwords to copy
- jz L_CopyBytes
- movd r10, mm0 # (Save mm0 in r10)
-L1:
- movq mm0, [rsi]
- movntq [rdi], mm0
- add rsi, 8
- add rdi, 8
- loop L1
- mfence
- movd mm0, r10 # (Restore mm0)
- jmp L_CopyBytes
-L_CopyBackward:
- mov rsi, r9 # rsi <- End of Source
- lea rdi, [rdi + r8 - 1] # rdi <- End of Destination
- std # set direction flag
-L_CopyBytes:
- mov rcx, r8
- rep movsb # Copy bytes backward
- cld
- pop rdi
- pop rsi
- ret
+#------------------------------------------------------------------------------
+.intel_syntax noprefix
+.globl ASM_PFX(InternalMemCopyMem)
+ASM_PFX(InternalMemCopyMem):
+ push rsi
+ push rdi
+ mov rsi, rdx # rsi <- Source
+ mov rdi, rcx # rdi <- Destination
+ lea r9, [rsi + r8 - 1] # r9 <- Last byte of Source
+ cmp rsi, rdi
+ mov rax, rdi # rax <- Destination as return value
+ jae L0 # Copy forward if Source >= Destination
+ cmp r9, rdi # Overlapped?
+ jae L_CopyBackward # Copy backward if overlapped
+L0:
+ xor rcx, rcx
+ sub rcx, rdi # rcx <- -rdi
+ and rcx, 15 # rcx <- bytes to copy until rdi is 16-byte aligned
+ jz L1 # skip if rdi is already aligned
+ cmp rcx, r8
+ cmova rcx, r8 # rcx <- min (rcx, Count)
+ sub r8, rcx
+ rep movsb # copy head bytes; rdi is now 16-byte aligned
+L1:
+ mov rcx, r8
+ and r8, 15 # r8 <- tail bytes left after DQword copy
+ shr rcx, 4 # rcx <- # of DQwords to copy
+ jz L_CopyBytes
+ movdqa [rsp + 0x18], xmm0 # save xmm0 in caller-allocated home space (16-byte aligned here)
+L2:
+ movdqu xmm0, [rsi] # rsi may not be 16-byte aligned
+ movntdq [rdi], xmm0 # rdi is 16-byte aligned; non-temporal store
+ add rsi, 16
+ add rdi, 16
+ loop L2
+ mfence # make non-temporal stores globally visible
+ movdqa xmm0, [rsp + 0x18] # restore xmm0
+ jmp L_CopyBytes # copy remaining bytes
+L_CopyBackward:
+ mov rsi, r9 # rsi <- Last byte of Source
+ lea rdi, [rdi + r8 - 1] # rdi <- Last byte of Destination
+ std # copy backward so overlap is safe
+L_CopyBytes:
+ mov rcx, r8
+ rep movsb # copy remaining (or, backward path, all) bytes
+ cld # restore direction flag for caller
+ pop rdi
+ pop rsi
+ ret