//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

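// Overview:
//
// Every entry point broadcasts its fill pattern into v0 and branches to
// the shared tail at local label 0. Fills of up to 96 bytes are done with
// a few possibly overlapping stores; longer fills use an unrolled stp
// loop, and zero fills of 256 bytes or more switch to the DC ZVA
// cache-zeroing instruction when the CPU permits it.
//
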
#define dstin     x0
#define count     x1
#define val       x2
#define valw      w2
#define dst       x3
#define dstend    x4
#define tmp1      x5
#define tmp1w     w5
#define tmp2      x6
#define tmp2w     w6
#define zva_len   x7
#define zva_lenw  w7

#define L(l) .L ## l

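// Per the AAPCS64 parameter registers and the BaseMemoryLib internal
// prototypes, x0 (dstin) is the destination Buffer, x1 (count) the
// Length, and x2/w2 (val/valw) the fill Value. The sized variants
// receive Length in elements, so each scales it to a byte count with
// lsl before joining the common path.
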
ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
    dup     v0.8H, valw
    lsl     count, count, #1
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
    dup     v0.4S, valw
    lsl     count, count, #2
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
    dup     v0.2D, val
    lsl     count, count, #3
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
    movi    v0.16B, #0
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
    dup     v0.16B, valw
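    // Shared tail: compute the end pointer and keep a scalar copy of
    // the 64-bit fill pattern in val for the small stores below.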
0:  add     dstend, dstin, count
    mov     val, v0.D[0]

    cmp     count, 96
    b.hi    L(set_long)
    cmp     count, 16
    b.hs    L(set_medium)

    // Set 0..15 bytes.
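    // Each set bit of count selects one store size; storing from both
    // dstin and dstend lets the stores overlap rather than branching
    // per byte.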
    tbz     count, 3, 1f
    str     val, [dstin]
    str     val, [dstend, -8]
    ret
    nop
1:  tbz     count, 2, 2f
    str     valw, [dstin]
    str     valw, [dstend, -4]
    ret
2:  cbz     count, 3f
    strb    valw, [dstin]
    tbz     count, 1, 3f
    strh    valw, [dstend, -2]
3:  ret

    // Set 16..96 bytes (b.hs above takes count == 16 here as well).
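    // 16..31 bytes need only one 16-byte store from each end (they may
    // overlap); 32..63 bytes add a second pair; 64..96 use L(set96).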
L(set_medium):
    str     q0, [dstin]
    tbnz    count, 6, L(set96)
    str     q0, [dstend, -16]
    tbz     count, 5, 1f
    str     q0, [dstin, 16]
    str     q0, [dstend, -32]
1:  ret

    .p2align 4
    // Set 64..96 bytes. Write 64 bytes from the start and
    // 32 bytes from the end.
L(set96):
    str     q0, [dstin, 16]
    stp     q0, q0, [dstin, 32]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
    nop
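    // Align dst down to 16 bytes; the store from dstin covers the
    // unaligned head. The cmp/ccmp pair folds "count >= 256 && val == 0"
    // into one branch, so DC ZVA is tried only for large zero fills.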
L(set_long):
    bic     dst, dstin, 15
    str     q0, [dstin]
    cmp     count, 256
    ccmp    val, 0, 0, cs
    b.eq    L(try_zva)
L(no_zva):
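    // The loop stores 64 bytes per iteration from the aligned base;
    // count is biased so the final partial block is finished by the
    // overlapping stp stores at 2:.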
    sub     count, dstend, dst        // Count is 16 too large.
    add     dst, dst, 16
    sub     count, count, 64 + 16     // Adjust count and bias for loop.
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
L(tail64):
    subs    count, count, 64
    b.hi    1b
2:  stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
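    // DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited; bits 3:0
    // hold log2 of the block size in 4-byte words, so 4 means 64 bytes
    // and 5 means 128 bytes.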
L(try_zva):
    mrs     tmp1, dczid_el0
    tbnz    tmp1w, 4, L(no_zva)
    and     tmp1w, tmp1w, 15
    cmp     tmp1w, 4                  // ZVA size is 64 bytes.
    b.ne    L(zva_128)

    // Write the first and last 64 byte aligned block using stp rather
    // than using DC ZVA. This is faster on some cores.
L(zva_64):
    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    bic     dst, dst, 63
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    sub     count, dstend, dst        // Count is now 128 too large.
    sub     count, count, 128+64+64   // Adjust count and bias for loop.
    add     dst, dst, 128
    nop
1:  dc      zva, dst
    add     dst, dst, 64
    subs    count, count, 64
    b.hi    1b
    stp     q0, q0, [dst, 0]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
L(zva_128):
    cmp     tmp1w, 5                  // ZVA size is 128 bytes.
    b.ne    L(zva_other)

    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    bic     dst, dst, 127
    sub     count, dstend, dst        // Count is now 128 too large.
    sub     count, count, 128+128     // Adjust count and bias for loop.
    add     dst, dst, 128
1:  dc      zva, dst
    add     dst, dst, 128
    subs    count, count, 128
    b.hi    1b
    stp     q0, q0, [dstend, -128]
    stp     q0, q0, [dstend, -96]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

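    // Other ZVA sizes: reconstruct the block size in bytes (4 << BS)
    // and fall back to plain stores when count < zva_len + 64, the most
    // the alignment prologue may need to write before the first block.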
L(zva_other):
    mov     tmp2w, 4
    lsl     zva_lenw, tmp2w, tmp1w
    add     tmp1, zva_len, 64         // Max alignment bytes written.
    cmp     count, tmp1
    blo     L(no_zva)

    sub     tmp2, zva_len, 1
    add     tmp1, dst, zva_len
    add     dst, dst, 16
    subs    count, tmp1, dst          // Actual alignment bytes to write.
    bic     tmp1, tmp1, tmp2          // Aligned dc zva start address.
    beq     2f
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
    subs    count, count, 64
    b.hi    1b
2:  mov     dst, tmp1
    sub     count, dstend, tmp1       // Remaining bytes to write.
    subs    count, count, zva_len
    b.lo    4f
3:  dc      zva, dst
    add     dst, dst, zva_len
    subs    count, count, zva_len
    b.hs    3b
4:  add     count, count, zva_len
    b       L(tail64)