// MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S

//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3
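
// Note: F_l/F_h and tmp1/tmp2 alias registers (srcend, dst and B_h) that
// are dead at every point where those names are used, so the routine runs
// entirely in x0-x15 without touching the stack.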

#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large
// copies of more than 96 bytes, which align the destination and use an
// unrolled loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as non-overlapping copies.
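//
// In C terms, the size dispatch performed by __memcpy below is roughly:
//
//     if (count <= 16)       goto L(copy16);       // small
//     else if (count <= 96)  { /* fully unrolled */ }
//     else                   goto L(copy_long);    // 64 bytes/iteration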

__memcpy:
    prfm PLDL1KEEP, [src]
    add srcend, src, count
    add dstend, dstin, count
    cmp count, 16
    b.ls L(copy16)
    cmp count, 96
    b.hi L(copy_long)

    // Medium copies: 17..96 bytes.
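    // Bits of count-1 select how much of the middle must be copied:
    //   bit 6 set -> count is 65..96, handled by L(copy96);
    //   bit 5 set -> count is 33..64, copy 32 bytes from each end;
    //   otherwise -> count is 17..32, the first and last 16 bytes cover it.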
    sub tmp1, count, 1
    ldp A_l, A_h, [src]
    tbnz tmp1, 6, L(copy96)
    ldp D_l, D_h, [srcend, -16]
    tbz tmp1, 5, 1f
    ldp B_l, B_h, [src, 16]
    ldp C_l, C_h, [srcend, -32]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstend, -32]
1:
    stp A_l, A_h, [dstin]
    stp D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp count, 8
    b.lo 1f
    ldr A_l, [src]
    ldr A_h, [srcend, -8]
    str A_l, [dstin]
    str A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz count, 2, 1f
    ldr A_lw, [src]
    ldr A_hw, [srcend, -4]
    str A_lw, [dstin]
    str A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
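    // In effect: dst[0] = src[0], dst[count/2] = src[count/2] and
    // dst[count-1] = src[count-1]; for count of 1 or 2 some of these
    // indices coincide, so no further branching is required.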
1:
    cbz count, 2f
    lsr tmp1, count, 1
    ldrb A_lw, [src]
    ldrb A_hw, [srcend, -1]
    ldrb B_lw, [src, tmp1]
    strb A_lw, [dstin]
    strb B_lw, [dstin, tmp1]
    strb A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
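    // For counts below 96 the two regions overlap, but all loads (including
    // A, loaded before the branch here) are issued before any store, so
    // overlapping src/dst buffers are still handled correctly.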
L(copy96):
    ldp B_l, B_h, [src, 16]
    ldp C_l, C_h, [src, 32]
    ldp D_l, D_h, [src, 48]
    ldp E_l, E_h, [srcend, -32]
    ldp F_l, F_h, [srcend, -16]
    stp A_l, A_h, [dstin]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstin, 32]
    stp D_l, D_h, [dstin, 48]
    stp E_l, E_h, [dstend, -32]
    stp F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
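    // dst is rounded down to a 16-byte boundary and src/count are biased
    // by the same amount (dstin & 15); the first 16 bytes are stored
    // unaligned at dstin, after which every store in the loop is aligned.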

    .p2align 4
L(copy_long):
    and tmp1, dstin, 15
    bic dst, dstin, 15
    ldp D_l, D_h, [src]
    sub src, src, tmp1
    add count, count, tmp1   // Count is now 16 too large.
    ldp A_l, A_h, [src, 16]
    stp D_l, D_h, [dstin]
    ldp B_l, B_h, [src, 32]
    ldp C_l, C_h, [src, 48]
    ldp D_l, D_h, [src, 64]!
    subs count, count, 128 + 16   // Test and readjust count.
    b.ls 2f
1:
    stp A_l, A_h, [dst, 16]
    ldp A_l, A_h, [src, 16]
    stp B_l, B_h, [dst, 32]
    ldp B_l, B_h, [src, 32]
    stp C_l, C_h, [dst, 48]
    ldp C_l, C_h, [src, 48]
    stp D_l, D_h, [dst, 64]!
    ldp D_l, D_h, [src, 64]!
    subs count, count, 64
    b.hi 1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
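    // On entry here A_l..D_h still hold 64 bytes that have been loaded but
    // not yet stored; flush them to [dst, 16]..[dst, 64], then copy the
    // final 64 bytes of the buffer relative to dstend.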
2:
    ldp E_l, E_h, [srcend, -64]
    stp A_l, A_h, [dst, 16]
    ldp A_l, A_h, [srcend, -48]
    stp B_l, B_h, [dst, 32]
    ldp B_l, B_h, [srcend, -32]
    stp C_l, C_h, [dst, 48]
    ldp C_l, C_h, [srcend, -16]
    stp D_l, D_h, [dst, 64]
    stp E_l, E_h, [dstend, -64]
    stp A_l, A_h, [dstend, -48]
    stp B_l, B_h, [dstend, -32]
    stp C_l, C_h, [dstend, -16]
    ret


//
// All memmoves up to 96 bytes are done by memcpy, as it supports overlaps.
// Larger backwards copies (dst below src) are also handled by memcpy. The
// only remaining case is large forward-overlapping copies (dst above src),
// which are copied backwards, starting from the end of the buffer. The
// destination end is aligned, and an unrolled loop processes 64 bytes per
// iteration.
//
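//
// The cmp/ccmp/b.hs sequence below tail-calls __memcpy whenever
//
//     count <= 96  ||  (UINT64)(dstin - src) >= count
//
// i.e. whenever a plain forward copy cannot overwrite source bytes before
// they have been read. Equal src and dst return immediately via the cbz;
// the remaining case, 0 < dstin - src < count, falls through to the
// backward copy.
//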

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub tmp2, dstin, src
    cmp count, 96
    ccmp tmp2, count, 2, hi
    b.hs __memcpy

    cbz tmp2, 3f
    add dstend, dstin, count
    add srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
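    // This mirrors L(copy_long): the last 16 bytes are stored unaligned at
    // dstend - 16, then srcend/dstend are rounded down by (dstend & 15) so
    // that the loop below walks both pointers downwards with aligned stores.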

    and tmp2, dstend, 15
    ldp D_l, D_h, [srcend, -16]
    sub srcend, srcend, tmp2
    sub count, count, tmp2
    ldp A_l, A_h, [srcend, -16]
    stp D_l, D_h, [dstend, -16]
    ldp B_l, B_h, [srcend, -32]
    ldp C_l, C_h, [srcend, -48]
    ldp D_l, D_h, [srcend, -64]!
    sub dstend, dstend, tmp2
    subs count, count, 128
    b.ls 2f
    nop
1:
    stp A_l, A_h, [dstend, -16]
    ldp A_l, A_h, [srcend, -16]
    stp B_l, B_h, [dstend, -32]
    ldp B_l, B_h, [srcend, -32]
    stp C_l, C_h, [dstend, -48]
    ldp C_l, C_h, [srcend, -48]
    stp D_l, D_h, [dstend, -64]!
    ldp D_l, D_h, [srcend, -64]!
    subs count, count, 64
    b.hi 1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
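    // As in the forward path, A_l..D_h hold 64 loaded but unstored bytes;
    // flush them just below the current dstend, then copy the first 64
    // bytes of the buffer to dstin.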
2:
    ldp E_l, E_h, [src, 48]
    stp A_l, A_h, [dstend, -16]
    ldp A_l, A_h, [src, 32]
    stp B_l, B_h, [dstend, -32]
    ldp B_l, B_h, [src, 16]
    stp C_l, C_h, [dstend, -48]
    ldp C_l, C_h, [src]
    stp D_l, D_h, [dstend, -64]
    stp E_l, E_h, [dstin, 48]
    stp A_l, A_h, [dstin, 32]
    stp B_l, B_h, [dstin, 16]
    stp C_l, C_h, [dstin]
3: ret