\r
ASM_PFX(InternalMemCopyMem):\r
stmfd sp!, {r4-r11, lr}\r
- tst r0, #3\r
+ // Save the input parameters in extra registers (r11 = destination, kept for the return value; r10 = working destination pointer; r14 = source; r12 = length)\r
mov r11, r0\r
mov r10, r0\r
- mov ip, r2\r
- mov lr, r1\r
+ mov r12, r2\r
+ mov r14, r1\r
+ \r
+memcopy_check_overlapped:\r
+ cmp r11, r1\r
+ // If (dest < source), unsigned compare: take the forward-copy path\r
+ bcc memcopy_check_optim_default\r
+ // If (dest <= source). But with the previous branch not taken -> If (dest == source): nothing to copy\r
+ bls memcopy_end\r
+\r
+ // r3 = dest - source. If (length < dest - source), i.e. (source + length < dest): take the forward-copy path\r
+ rsb r3, r1, r11\r
+ cmp r12, r3\r
+ bcc memcopy_check_optim_default\r
+\r
+ // If (length == 0) \r
+ cmp r12, #0\r
+ beq memcopy_end\r
+ \r
+ b memcopy_check_optim_overlap\r
+\r
+memcopy_check_optim_default:\r
+ // Check if we can use an optimized path for the memcopy: (length >= 32) && destination 16-byte aligned && source 16-byte aligned (both tested with mask 0xF). Optimized path if r0 == 1\r
+ tst r0, #0xF\r
movne r0, #0\r
- bne L4\r
- tst r1, #3\r
+ bne memcopy_default\r
+ tst r1, #0xF\r
movne r3, #0\r
moveq r3, #1\r
cmp r2, #31\r
movls r0, #0\r
andhi r0, r3, #1\r
-L4:\r
- cmp r11, r1\r
- bcc L26\r
- bls L7\r
- rsb r3, r1, r11\r
- cmp ip, r3\r
- bcc L26\r
- cmp ip, #0\r
- beq L7\r
- add r10, r11, ip\r
- add lr, ip, r1\r
- b L16\r
-L29:\r
- sub ip, ip, #8\r
- cmp ip, #7\r
- ldrd r2, [lr, #-8]!\r
+ b memcopy_default\r
+ \r
+memcopy_check_optim_overlap:\r
+ // Reached when dest > source and length >= dest - source: the copy runs backwards from the end. r10 = dest_end, r14 = source_end\r
+ add r10, r11, r12\r
+ add r14, r12, r1\r
+\r
+ // Are we in the optimized case ((length >= 32) && dest_end 16-byte aligned && source_end 16-byte aligned, both tested with mask 0xF)\r
+ cmp r2, #31\r
movls r0, #0\r
- cmp ip, #0\r
- strd r2, [r10, #-8]!\r
- beq L7\r
-L16:\r
- cmp r0, #0\r
- bne L29\r
- sub r3, lr, #1\r
- sub ip, ip, #1\r
+ movhi r0, #1\r
+ tst r10, #0xF\r
+ movne r0, #0\r
+ tst r14, #0xF\r
+ movne r0, #0\r
+ b memcopy_overlapped\r
+ \r
+memcopy_overlapped_non_optim:\r
+ // We read 1 byte from the end of the source buffer\r
+ sub r3, r14, #1\r
+ sub r12, r12, #1\r
ldrb r3, [r3, #0] \r
sub r2, r10, #1\r
- cmp ip, #0\r
+ cmp r12, #0\r
+ // We write 1 byte at the end of the dest buffer (the flags from the cmp above are consumed by the bne below)\r
sub r10, r10, #1\r
- sub lr, lr, #1\r
+ sub r14, r14, #1\r
strb r3, [r2, #0]\r
- bne L16\r
- b L7\r
-L11:\r
- ldrb r3, [lr], #1 \r
- sub ip, ip, #1\r
+ bne memcopy_overlapped_non_optim\r
+ b memcopy_end\r
+\r
+// r10 = dest_end, r14 = source_end\r
+memcopy_overlapped:\r
+ // Are we in the optimized case ? (r0 == 0 selects the byte-by-byte backward copy)\r
+ cmp r0, #0\r
+ beq memcopy_overlapped_non_optim\r
+ \r
+ // Optimized Overlapped - Step back 32 bytes and read them into r2-r9\r
+ sub r14, r14, #32\r
+ sub r12, r12, #32\r
+ cmp r12, #31\r
+ ldmia r14, {r2-r9}\r
+ \r
+ // If the remaining length is less than 32 then disable optim (uses the flags from 'cmp r12, #31' above)\r
+ movls r0, #0\r
+ \r
+ cmp r12, #0\r
+ \r
+ // Optimized Overlapped - Write 32 bytes \r
+ sub r10, r10, #32\r
+ stmia r10, {r2-r9}\r
+ \r
+ // while (length != 0), using the flags from 'cmp r12, #0' above\r
+ bne memcopy_overlapped\r
+ b memcopy_end\r
+ \r
+memcopy_default_non_optim:\r
+ // Byte copy, then fall through to the length check in memcopy_default\r
+ ldrb r3, [r14], #1 \r
+ sub r12, r12, #1\r
strb r3, [r10], #1\r
-L26:\r
- cmp ip, #0\r
- beq L7\r
-L30:\r
+ \r
+memcopy_default:\r
+ cmp r12, #0\r
+ beq memcopy_end\r
+ \r
+// r10 = dest, r14 = source\r
+memcopy_default_loop:\r
cmp r0, #0\r
- beq L11\r
- sub ip, ip, #32\r
- cmp ip, #31\r
- ldmia lr!, {r2-r9}\r
+ beq memcopy_default_non_optim\r
+ \r
+ // Optimized memcopy - Read 32 Bytes into r2-r9 (r14 is advanced by the load)\r
+ sub r12, r12, #32\r
+ cmp r12, #31\r
+ ldmia r14!, {r2-r9}\r
+ \r
+ // If the remaining length is less than 32 then disable optim (uses the flags from 'cmp r12, #31' above)\r
movls r0, #0\r
- cmp ip, #0\r
+ \r
+ cmp r12, #0\r
+ \r
+ // Optimized memcopy - Write 32 Bytes (r10 is advanced by the store)\r
stmia r10!, {r2-r9}\r
- bne L30\r
-L7:\r
+\r
+ // while (length != 0), using the flags from 'cmp r12, #0' above\r
+ bne memcopy_default_loop\r
+ \r
+memcopy_end:\r
mov r0, r11\r
ldmfd sp!, {r4-r11, pc}\r
\r
\r
InternalMemCopyMem\r
stmfd sp!, {r4-r11, lr}
- tst r0, #3
+ // Save the input parameters in extra registers (r11 = destination, kept for the return value; r10 = working destination pointer; r14 = source; r12 = length)
mov r11, r0
mov r10, r0
- mov ip, r2
- mov lr, r1
+ mov r12, r2
+ mov r14, r1
+ \r
+memcopy_check_overlapped\r
+ cmp r11, r1\r
+ // If (dest < source), unsigned compare: take the forward-copy path\r
+ bcc memcopy_check_optim_default\r
+ // If (dest <= source). But with the previous branch not taken -> If (dest == source): nothing to copy\r
+ bls memcopy_end\r
+\r
+ // r3 = dest - source. If (length < dest - source), i.e. (source + length < dest): take the forward-copy path\r
+ rsb r3, r1, r11\r
+ cmp r12, r3\r
+ bcc memcopy_check_optim_default\r
+\r
+ // If (length == 0) \r
+ cmp r12, #0\r
+ beq memcopy_end\r
+ \r
+ b memcopy_check_optim_overlap\r
+\r
+memcopy_check_optim_default\r
+ // Check if we can use an optimized path for the memcopy: (length >= 32) && destination 16-byte aligned && source 16-byte aligned (both tested with mask 0xF). Optimized path if r0 == 1\r
+ tst r0, #0xF\r
movne r0, #0
- bne L4
- tst r1, #3
+ bne memcopy_default
+ tst r1, #0xF
movne r3, #0
moveq r3, #1
cmp r2, #31
movls r0, #0
andhi r0, r3, #1
-L4
- cmp r11, r1
- bcc L26
- bls L7
- rsb r3, r1, r11
- cmp ip, r3
- bcc L26
- cmp ip, #0
- beq L7
- add r10, r11, ip
- add lr, ip, r1
- b L16
-L29
- sub ip, ip, #8
- cmp ip, #7
- ldrd r2, [lr, #-8]!
+ b memcopy_default
+
+memcopy_check_optim_overlap
+ // Reached when dest > source and length >= dest - source: the copy runs backwards from the end. r10 = dest_end, r14 = source_end
+ add r10, r11, r12
+ add r14, r12, r1
+
+ // Are we in the optimized case ((length >= 32) && dest_end 16-byte aligned && source_end 16-byte aligned, both tested with mask 0xF)
+ cmp r2, #31
movls r0, #0
- cmp ip, #0
- strd r2, [r10, #-8]!
- beq L7
-L16
- cmp r0, #0
- bne L29
- sub r3, lr, #1
- sub ip, ip, #1
+ movhi r0, #1
+ tst r10, #0xF
+ movne r0, #0
+ tst r14, #0xF
+ movne r0, #0
+ b memcopy_overlapped\r
+
+memcopy_overlapped_non_optim\r
+ // We read 1 byte from the end of the source buffer\r
+ sub r3, r14, #1
+ sub r12, r12, #1
ldrb r3, [r3, #0]
sub r2, r10, #1
- cmp ip, #0
+ cmp r12, #0
+ // We write 1 byte at the end of the dest buffer (the flags from the cmp above are consumed by the bne below)\r
sub r10, r10, #1
- sub lr, lr, #1
+ sub r14, r14, #1
strb r3, [r2, #0]
- bne L16
- b L7
-L11
- ldrb r3, [lr], #1
- sub ip, ip, #1
+ bne memcopy_overlapped_non_optim\r
+ b memcopy_end\r
+\r
+// r10 = dest_end, r14 = source_end\r
+memcopy_overlapped\r
+ // Are we in the optimized case ? (r0 == 0 selects the byte-by-byte backward copy)\r
+ cmp r0, #0\r
+ beq memcopy_overlapped_non_optim\r
+
+ // Optimized Overlapped - Step back 32 bytes and read them into r2-r9\r
+ sub r14, r14, #32\r
+ sub r12, r12, #32\r
+ cmp r12, #31\r
+ ldmia r14, {r2-r9}\r
+
+ // If the remaining length is less than 32 then disable optim (uses the flags from 'cmp r12, #31' above)\r
+ movls r0, #0\r
+
+ cmp r12, #0\r
+ \r
+ // Optimized Overlapped - Write 32 bytes \r
+ sub r10, r10, #32\r
+ stmia r10, {r2-r9}\r
+ \r
+ // while (length != 0), using the flags from 'cmp r12, #0' above\r
+ bne memcopy_overlapped\r
+ b memcopy_end\r
+ \r
+memcopy_default_non_optim\r
+ // Byte copy, then fall through to the length check in memcopy_default\r
+ ldrb r3, [r14], #1
+ sub r12, r12, #1
strb r3, [r10], #1
-L26
- cmp ip, #0
- beq L7
-L30
+
+memcopy_default\r
+ cmp r12, #0
+ beq memcopy_end
+
+// r10 = dest, r14 = source\r
+memcopy_default_loop\r
cmp r0, #0
- beq L11
- sub ip, ip, #32
- cmp ip, #31
- ldmia lr!, {r2-r9}
+ beq memcopy_default_non_optim\r
+
+ // Optimized memcopy - Read 32 Bytes into r2-r9 (r14 is advanced by the load)\r
+ sub r12, r12, #32
+ cmp r12, #31
+ ldmia r14!, {r2-r9}
+ \r
+ // If the remaining length is less than 32 then disable optim (uses the flags from 'cmp r12, #31' above)\r
movls r0, #0
- cmp ip, #0
+ \r
+ cmp r12, #0
+ \r
+ // Optimized memcopy - Write 32 Bytes (r10 is advanced by the store)\r
stmia r10!, {r2-r9}
- bne L30
-L7
+
+ // while (length != 0), using the flags from 'cmp r12, #0' above\r
+ bne memcopy_default_loop\r
+
+memcopy_end\r
mov r0, r11
ldmfd sp!, {r4-r11, pc}\r
\r