//
// MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
// Accelerated AARCH64 SetMem routines for MdePkg/BaseMemoryLibOptDxe.
//
//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

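// Register aliases: x0 and x1 hold the incoming Buffer pointer and Length
// arguments, x2/w2 the fill value, and the remaining registers are
// temporaries used by the fill code below.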
#define dstin     x0
#define count     x1
#define val       x2
#define valw      w2
#define dst       x3
#define dstend    x4
#define tmp1      x5
#define tmp1w     w5
#define tmp2      x6
#define tmp2w     w6
#define zva_len   x7
#define zva_lenw  w7

#define L(l) .L ## l

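// Each entry point below broadcasts its fill pattern across all 16 bytes of
// v0 and then joins the common code at label 0:, which uses v0 (and its low
// 64 bits in val) for every store.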
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
    dup     v0.8H, valw
    lsl     count, count, #1        // Length is given in 16-bit elements; scale to bytes
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
    dup     v0.4S, valw
    lsl     count, count, #2        // Length is given in 32-bit elements; scale to bytes
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
    dup     v0.2D, val
    lsl     count, count, #3        // Length is given in 64-bit elements; scale to bytes
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
    movi    v0.16B, #0
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
    dup     v0.16B, valw
0:  add     dstend, dstin, count
    mov     val, v0.D[0]

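    // Dispatch on the byte count: more than 96 bytes takes the bulk path
    // (which may use DC ZVA when storing zeroes), 16..96 bytes uses
    // overlapping 16-byte stores, and smaller sizes fall through to the
    // small-size code below.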
    cmp     count, 96
    b.hi    L(set_long)
    cmp     count, 16
    b.hs    L(set_medium)

    // Set 0..15 bytes.
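    // Test the individual bits of count and use potentially overlapping
    // stores from both ends of the buffer instead of a byte loop.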
    tbz     count, 3, 1f
    str     val, [dstin]
    str     val, [dstend, -8]
    ret
    nop
1:  tbz     count, 2, 2f
    str     valw, [dstin]
    str     valw, [dstend, -4]
    ret
2:  cbz     count, 3f
    strb    valw, [dstin]
    tbz     count, 1, 3f
    strh    valw, [dstend, -2]
3:  ret

    // Set 16..96 bytes.
L(set_medium):
    str     q0, [dstin]
    tbnz    count, 6, L(set96)
    str     q0, [dstend, -16]
    tbz     count, 5, 1f
    str     q0, [dstin, 16]
    str     q0, [dstend, -32]
1:  ret

    .p2align 4
    // Set 64..96 bytes.  Write 64 bytes from the start and
    // 32 bytes from the end.
L(set96):
    str     q0, [dstin, 16]
    stp     q0, q0, [dstin, 32]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
    nop
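    // Set more than 96 bytes.  The first 16 bytes are stored unaligned from
    // dstin and dst is rounded down to a 16-byte boundary; runs of at least
    // 256 bytes with a zero fill value try the DC ZVA cache-zeroing path.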
L(set_long):
    bic     dst, dstin, 15
    str     q0, [dstin]
    cmp     count, 256
    ccmp    val, 0, 0, cs
    b.eq    L(try_zva)
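    // Plain store loop: write 64 bytes per iteration with two STP Q pairs,
    // then finish the last (possibly partial) 64 bytes with overlapping
    // stores relative to dstend.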
L(no_zva):
    sub     count, dstend, dst      // Count is 16 too large.
    add     dst, dst, 16
    sub     count, count, 64 + 16   // Adjust count and bias for loop.
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
L(tail64):
    subs    count, count, 64
    b.hi    1b
2:  stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
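    // DCZID_EL0 bit 4 (DZP) set means DC ZVA is prohibited; bits 3:0 give
    // log2 of the block size in words, so 4 means 64 bytes and 5 means
    // 128 bytes.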
L(try_zva):
    mrs     tmp1, dczid_el0
    tbnz    tmp1w, 4, L(no_zva)
    and     tmp1w, tmp1w, 15
    cmp     tmp1w, 4                // ZVA size is 64 bytes.
    b.ne    L(zva_128)

    // Write the first and last 64 byte aligned block using stp rather
    // than using DC ZVA.  This is faster on some cores.
L(zva_64):
    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    bic     dst, dst, 63
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    sub     count, dstend, dst      // Count is now 128 too large.
    sub     count, count, 128+64+64 // Adjust count and bias for loop.
    add     dst, dst, 128
    nop
1:  dc      zva, dst
    add     dst, dst, 64
    subs    count, count, 64
    b.hi    1b
    stp     q0, q0, [dst, 0]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
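    // 128-byte DC ZVA blocks: fill up to the first 128-byte boundary with
    // STP stores, zero whole blocks with DC ZVA, then finish with
    // overlapping stores relative to dstend.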
L(zva_128):
    cmp     tmp1w, 5                // ZVA size is 128 bytes.
    b.ne    L(zva_other)

    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    bic     dst, dst, 127
    sub     count, dstend, dst      // Count is now 128 too large.
    sub     count, count, 128+128   // Adjust count and bias for loop.
    add     dst, dst, 128
1:  dc      zva, dst
    add     dst, dst, 128
    subs    count, count, 128
    b.hi    1b
    stp     q0, q0, [dstend, -128]
    stp     q0, q0, [dstend, -96]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

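    // Any other DC ZVA block size: compute the size in bytes (4 << BS),
    // fall back to the plain store loop if the buffer is too small to be
    // worth aligning, otherwise store up to the first block boundary, zero
    // whole blocks with DC ZVA, and let L(tail64) handle the remainder.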
L(zva_other):
    mov     tmp2w, 4
    lsl     zva_lenw, tmp2w, tmp1w
    add     tmp1, zva_len, 64       // Max alignment bytes written.
    cmp     count, tmp1
    blo     L(no_zva)

    sub     tmp2, zva_len, 1
    add     tmp1, dst, zva_len
    add     dst, dst, 16
    subs    count, tmp1, dst        // Actual alignment bytes to write.
    bic     tmp1, tmp1, tmp2        // Aligned dc zva start address.
    beq     2f
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
    subs    count, count, 64
    b.hi    1b
2:  mov     dst, tmp1
    sub     count, dstend, tmp1     // Remaining bytes to write.
    subs    count, count, zva_len
    b.lo    4f
3:  dc      zva, dst
    add     dst, dst, zva_len
    subs    count, count, zva_len
    b.hs    3b
4:  add     count, count, zva_len
    b       L(tail64)