//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//
//
// Register aliases (AAPCS64: x0-x2 carry the incoming arguments;
// x3-x15 are caller-saved scratch, so no stack frame is needed).
//
#define dstin   x0          // destination argument; never modified, so it
                            // doubles as the return value
#define src     x1          // source argument; advanced during large copies
#define count   x2          // byte count argument
#define dst     x3          // aligned destination cursor (large copies)
#define srcend  x4          // src + count
#define dstend  x5          // dstin + count
#define A_l     x6          // A..F: data pairs held across load/store bursts
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend      // F_* reuse srcend/dst, which are dead by the
#define F_h     dst         // time F is loaded (copy96 path only)
#define tmp1    x9          // aliases B_h; the two are never live together
#define tmp2    x3          // aliases dst; the two are never live together

// Expand L(name) to the non-exported GNU-as local label .Lname.
#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes which are fully unrolled. Large copies
// of more than 96 bytes align the destination and use an unrolled loop
// processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as non-overlapping copies.
//
// In:      dstin (x0) = destination, src (x1) = source, count (x2) = bytes
// Out:     x0 unchanged (destination pointer)
// Clobbers: x3-x15, flags. Leaf function; no stack use.

__memcpy:
    prfm    PLDL1KEEP, [src]            // hint: the source is about to be read
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes. All loads issue before any store,
    // so arbitrary overlap is safe.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)          // bit 6 of count-1 set => 65..96 bytes
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f                 // bit 5 clear => count <= 32, skip middle
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes. Head/tail accesses may overlap when
    // count is below the access size; that is harmless.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]                  // 8..16 bytes: two possibly-overlapping
    ldr     A_h, [srcend, -8]           // 8-byte moves cover the range
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f
    ldr     A_lw, [src]                 // 4..7 bytes: two overlapping 4-byte moves
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
    cbz     count, 2f
    lsr     tmp1, count, 1              // tmp1 = middle-byte offset (0, 1, or 1)
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
    ldp     B_l, B_h, [src, 16]         // A was already loaded by the caller path
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]     // F_* overwrite srcend/dst, both dead now
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration with loads issued one store-group
    // ahead of the matching stores.

    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15             // tmp1 = misalignment of destination
    bic     dst, dstin, 15              // dst  = destination rounded down to 16
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1              // bias src by the same misalignment
    add     count, count, tmp1          // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]           // unaligned 16-byte head
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16      // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret

//
// InternalMemCopyMem — the memmove entry point.
//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is forward large copies (dst > src, regions overlapping). The
// destination end is aligned, and an unrolled loop processes 64 bytes per
// iteration, walking backwards so later source bytes are read before
// earlier stores can clobber them.
//
// In:      dstin (x0) = destination, src (x1) = source, count (x2) = bytes
// Out:     x0 unchanged (destination pointer)
// Clobbers: x3-x15, flags. Leaf function; no stack use.
//

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src            // tmp2 = dst - src (mod 2^64)
    cmp     count, 96
    ccmp    tmp2, count, 2, hi          // if count > 96: compare dst-src vs count;
                                        // else force C set so the branch is taken
    b.hs    __memcpy                    // small copy, or dst-src >= count
                                        // (no forward overlap): memcpy handles it

    cbz     tmp2, 3f                    // dst == src: nothing to copy
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration, working backwards from the end.

    and     tmp2, dstend, 15            // tmp2 = misalignment of destination end
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]     // unaligned 16-byte tail
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop                                 // pad so the loop head lands well-aligned
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret