[mirror_edk2.git] / MdePkg / Library / BaseMemoryLibOptDxe / Arm / CopyMem.S

#------------------------------------------------------------------------------\r
#\r
# CopyMem() worker for ARM\r
#\r
# This file started out as C code that did 64 bit moves if the buffer was\r
# 32-bit aligned, else it does a byte copy. It also does a byte copy for\r
# any trailing bytes. It was updated to do 32-byte copies using stm/ldm.\r
#\r
# Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.<BR>\r
# Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>\r
# This program and the accompanying materials\r
# are licensed and made available under the terms and conditions of the BSD License\r
# which accompanies this distribution.  The full text of the license may be found at\r
# http://opensource.org/licenses/bsd-license.php\r
#\r
# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,\r
# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.\r
#\r
#------------------------------------------------------------------------------\r
\r
    .text\r
    .thumb\r
    .syntax unified\r
\r
/**
  Copy Length bytes from Source to Destination. Overlap is OK.

  This implementation chooses the copy direction from the relative position
  of the two buffers so that overlapping buffers are copied correctly:

    - Destination below Source, or Length smaller than the distance
      (Destination - Source): copy forwards, starting at the first byte.
    - otherwise: copy backwards, starting at the last byte.

  In either direction, data is moved in 32-byte blocks with ldm/stm while at
  least 32 bytes remain, provided that both addresses the copy starts from
  (buffer starts when copying forwards, buffer ends when copying backwards)
  are 16-byte aligned. Everything else is copied one byte at a time.

  A request with Length == 0 or Destination == Source returns immediately
  without touching memory.

  Register usage after the prologue:
    r0       1 if the 32-byte block path may be used, 0 for byte copy
    r2-r9    data of the 32-byte block in flight (r2, r3 also scratch)
    r10      destination cursor
    r11      original Destination (return value)
    r12      number of bytes left to copy
    r14      source cursor

  @param  Destination Target of copy
  @param  Source      Place to copy from
  @param  Length      Number of bytes to copy

  @return Destination


VOID *
EFIAPI
InternalMemCopyMem (
  OUT     VOID                      *DestinationBuffer,
  IN      CONST VOID                *SourceBuffer,
  IN      UINTN                     Length
  )
**/
    .type   ASM_PFX(InternalMemCopyMem), %function
ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    // Nothing to do when Length == 0 or when both buffers are the same.
    // This must be caught here: with Length == 0 and Destination == Source
    // neither of the "forward copy" tests below is taken, and the backward
    // byte loop decrements the count before testing it, so a count of zero
    // would wrap around to 0xFFFFFFFF.
    // r0 still holds Destination, which is the value to return.
    cmp     r2, #0
    it      ne
    cmpne   r0, r1
    it      eq
    bxeq    lr

    push    {r4-r11, lr}
    // Save the input parameters in extra registers (r11 = destination, r14 = source, r12 = length)
    // r10 is the working destination pointer; r11 is kept intact for the return value
    mov     r11, r0
    mov     r10, r0
    mov     r12, r2
    mov     r14, r1

    cmp     r11, r1
    // If (dest < source) a forward copy never overwrites unread source bytes
    bcc     memcopy_check_optim_default

    // Here dest >= source, so r3 = dest - source cannot wrap.
    // If (length < dest - source) the buffers do not overlap: copy forwards
    rsb     r3, r1, r11
    cmp     r12, r3
    bcc     memcopy_check_optim_default
    // Otherwise copy backwards, from the end of the buffers
    b       memcopy_check_optim_overlap

memcopy_check_optim_default:
    // Check if we can use the optimized path for the forward copy:
    // (length >= 32) && destination 16-byte aligned && source 16-byte aligned.
    // The result is left in r0 (optimized path if r0 == 1).
    // Note: the 16-bit mov inside an IT block does not update the flags, so
    // the conditional branches below still see the result of the tst/cmp.
    tst     r0, #0xF
    it      ne
    movne.n r0, #0
    bne     memcopy_default
    // r3 = 1 if the source is 16-byte aligned, else 0
    tst     r1, #0xF
    it      ne
    movne.n r3, #0
    it      eq
    moveq.n r3, #1
    // Fewer than 32 bytes: byte copy only
    cmp     r2, #31
    it      ls
    movls.n r0, #0
    bls     memcopy_default
    // Destination is aligned and length >= 32: the source alignment decides
    and     r0, r3, #1
    b       memcopy_default

memcopy_check_optim_overlap:
    // r10 = dest_end, r14 = source_end (one past the last byte of each buffer)
    add     r10, r11, r12
    add     r14, r12, r1

    // Are we in the optimized case:
    // (length >= 32) && dest_end 16-byte aligned && source_end 16-byte aligned
    cmp     r2, #31
    it      ls
    movls.n r0, #0
    it      hi
    movhi.n r0, #1
    tst     r10, #0xF
    it      ne
    movne.n r0, #0
    tst     r14, #0xF
    it      ne
    movne.n r0, #0
    b       memcopy_overlapped

    // Backward byte copy. Must only be entered with r12 != 0, because the
    // count is decremented before it is compared with zero.
memcopy_overlapped_non_optim:
    // We read 1 byte from the end of the source buffer
    sub     r3, r14, #1
    sub     r12, r12, #1
    ldrb    r3, [r3, #0]
    sub     r2, r10, #1
    // The flags set here are consumed by the bne at the end of the loop;
    // none of the instructions in between modifies them
    cmp     r12, #0
    // We write 1 byte at the end of the dest buffer
    sub     r10, r10, #1
    sub     r14, r14, #1
    strb    r3, [r2, #0]
    bne     memcopy_overlapped_non_optim
    b       memcopy_end

// r10 = dest_end, r14 = source_end
memcopy_overlapped:
    // Are we in the optimized case ?
    cmp     r0, #0
    beq     memcopy_overlapped_non_optim

    // Optimized Overlapped - Read 32 bytes
    // (r12 >= 32 is guaranteed here, so the subtraction cannot wrap)
    sub     r14, r14, #32
    sub     r12, r12, #32
    cmp     r12, #31
    ldmia   r14, {r2-r9}

    // If the remaining length is less than 32 then disable optim
    it      ls
    movls.n r0, #0

    cmp     r12, #0

    // Optimized Overlapped - Write 32 bytes
    sub     r10, r10, #32
    stmia   r10, {r2-r9}

    // while (length != 0)
    bne     memcopy_overlapped
    b       memcopy_end

memcopy_default_non_optim:
    // Byte copy (forward), then fall through to the remaining-length test
    ldrb    r3, [r14], #1
    sub     r12, r12, #1
    strb    r3, [r10], #1

memcopy_default:
    cmp     r12, #0
    beq     memcopy_end

// r10 = dest, r14 = source
memcopy_default_loop:
    cmp     r0, #0
    beq     memcopy_default_non_optim

    // Optimized memcopy - Read 32 Bytes
    // (r12 >= 32 is guaranteed here, so the subtraction cannot wrap)
    sub     r12, r12, #32
    cmp     r12, #31
    ldmia   r14!, {r2-r9}

    // If the remaining length is less than 32 then disable optim
    it      ls
    movls.n r0, #0

    cmp     r12, #0

    // Optimized memcopy - Write 32 Bytes
    stmia   r10!, {r2-r9}

    // while (length != 0)
    bne     memcopy_default_loop

memcopy_end:
    // Return the original Destination pointer and restore the saved registers
    mov     r0, r11
    pop     {r4-r11, pc}
Commit	Line	Data
a37f6605 AB	1	#------------------------------------------------------------------------------\r
	2	#\r
	3	# CopyMem() worker for ARM\r
	4	#\r
	5	# This file started out as C code that did 64 bit moves if the buffer was\r
	6	# 32-bit aligned, else it does a byte copy. It also does a byte copy for\r
	7	# any trailing bytes. It was updated to do 32-byte copies using stm/ldm.\r
	8	#\r
	9	# Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.<BR>\r
	10	# Copyright (c) 2016, Linaro Ltd. All rights reserved.<BR>\r
	11	# This program and the accompanying materials\r
	12	# are licensed and made available under the terms and conditions of the BSD License\r
	13	# which accompanies this distribution. The full text of the license may be found at\r
	14	# http://opensource.org/licenses/bsd-license.php\r
	15	#\r
	16	# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,\r
	17	# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.\r
	18	#\r
	19	#------------------------------------------------------------------------------\r
	20	\r
	21	.text\r
	22	.thumb\r
	23	.syntax unified\r
	24	\r
	25	/**\r
	26	Copy Length bytes from Source to Destination. Overlap is OK.\r
	27	\r
	28	This implementation\r
	29	\r
	30	@param Destination Target of copy\r
	31	@param Source Place to copy from\r
	32	@param Length Number of bytes to copy\r
	33	\r
	34	@return Destination\r
	35	\r
	36	\r
	37	VOID *\r
	38	EFIAPI\r
	39	InternalMemCopyMem (\r
	40	OUT VOID *DestinationBuffer,\r
	41	IN CONST VOID *SourceBuffer,\r
	42	IN UINTN Length\r
	43	)\r
	44	**/\r
decaac5d	45	.type ASM_PFX(InternalMemCopyMem), %function\r
a37f6605 AB	46	ASM_GLOBAL ASM_PFX(InternalMemCopyMem)\r
	47	ASM_PFX(InternalMemCopyMem):\r
	48	push {r4-r11, lr}\r
	49	// Save the input parameters in extra registers (r11 = destination, r14 = source, r12 = length)\r
	50	mov r11, r0\r
	51	mov r10, r0\r
	52	mov r12, r2\r
	53	mov r14, r1\r
	54	\r
	55	cmp r11, r1\r
	56	// If (dest < source)\r
	57	bcc memcopy_check_optim_default\r
	58	\r
	59	// If (source + length < dest)\r
	60	rsb r3, r1, r11\r
	61	cmp r12, r3\r
	62	bcc memcopy_check_optim_default\r
	63	b memcopy_check_optim_overlap\r
	64	\r
	65	memcopy_check_optim_default:\r
	66	// Check if we can use an optimized path ((length >= 32) && destination word-aligned && source word-aligned) for the memcopy (optimized path if r0 == 1)\r
	67	tst r0, #0xF\r
	68	it ne\r
eab26788	69	movne.n r0, #0\r
a37f6605 AB	70	bne memcopy_default\r
a37f6605 AB	71	tst r1, #0xF\r
eab26788 AB	72	it ne\r
	73	movne.n r3, #0\r
	74	it eq\r
	75	moveq.n r3, #1\r
a37f6605	76	cmp r2, #31\r
eab26788 AB	77	it ls\r
	78	movls.n r0, #0\r
	79	bls memcopy_default\r
	80	and r0, r3, #1\r
a37f6605 AB	81	b memcopy_default\r
	82	\r
	83	memcopy_check_optim_overlap:\r
	84	// r10 = dest_end, r14 = source_end\r
	85	add r10, r11, r12\r
	86	add r14, r12, r1\r
	87	\r
	88	// Are we in the optimized case ((length >= 32) && dest_end word-aligned && source_end word-aligned)\r
	89	cmp r2, #31\r
eab26788 AB	90	it ls\r
	91	movls.n r0, #0\r
	92	it hi\r
	93	movhi.n r0, #1\r
a37f6605 AB	94	tst r10, #0xF\r
a37f6605 AB	95	it ne\r
eab26788	96	movne.n r0, #0\r
a37f6605 AB	97	tst r14, #0xF\r
a37f6605 AB	98	it ne\r
eab26788	99	movne.n r0, #0\r
a37f6605 AB	100	b memcopy_overlapped\r
	101	\r
	102	memcopy_overlapped_non_optim:\r
	103	// We read 1 byte from the end of the source buffer\r
	104	sub r3, r14, #1\r
	105	sub r12, r12, #1\r
	106	ldrb r3, [r3, #0]\r
	107	sub r2, r10, #1\r
	108	cmp r12, #0\r
	109	// We write 1 byte at the end of the dest buffer\r
	110	sub r10, r10, #1\r
	111	sub r14, r14, #1\r
	112	strb r3, [r2, #0]\r
	113	bne memcopy_overlapped_non_optim\r
	114	b memcopy_end\r
	115	\r
	116	// r10 = dest_end, r14 = source_end\r
	117	memcopy_overlapped:\r
	118	// Are we in the optimized case ?\r
	119	cmp r0, #0\r
	120	beq memcopy_overlapped_non_optim\r
	121	\r
	122	// Optimized Overlapped - Read 32 bytes\r
	123	sub r14, r14, #32\r
	124	sub r12, r12, #32\r
	125	cmp r12, #31\r
	126	ldmia r14, {r2-r9}\r
	127	\r
	128	// If length is less than 32 then disable optim\r
	129	it ls\r
eab26788	130	movls.n r0, #0\r
a37f6605 AB	131	\r
	132	cmp r12, #0\r
	133	\r
	134	// Optimized Overlapped - Write 32 bytes\r
	135	sub r10, r10, #32\r
	136	stmia r10, {r2-r9}\r
	137	\r
	138	// while (length != 0)\r
	139	bne memcopy_overlapped\r
	140	b memcopy_end\r
	141	\r
	142	memcopy_default_non_optim:\r
	143	// Byte copy\r
	144	ldrb r3, [r14], #1\r
	145	sub r12, r12, #1\r
	146	strb r3, [r10], #1\r
	147	\r
	148	memcopy_default:\r
	149	cmp r12, #0\r
	150	beq memcopy_end\r
	151	\r
	152	// r10 = dest, r14 = source\r
	153	memcopy_default_loop:\r
	154	cmp r0, #0\r
	155	beq memcopy_default_non_optim\r
	156	\r
	157	// Optimized memcopy - Read 32 Bytes\r
	158	sub r12, r12, #32\r
	159	cmp r12, #31\r
	160	ldmia r14!, {r2-r9}\r
	161	\r
	162	// If length is less than 32 then disable optim\r
	163	it ls\r
eab26788	164	movls.n r0, #0\r
a37f6605 AB	165	\r
	166	cmp r12, #0\r
	167	\r
	168	// Optimized memcopy - Write 32 Bytes\r
	169	stmia r10!, {r2-r9}\r
	170	\r
	171	// while (length != 0)\r
	172	bne memcopy_default_loop\r
	173	\r
	174	memcopy_end:\r
	175	mov r0, r11\r
	176	pop {r4-r11, pc}\r