//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//
// Arguments and results (AAPCS64).
#define srcin     x0      // in:  pointer to the buffer to scan
#define cntin     x1      // in:  number of bytes to scan
#define chrin     w2      // in:  byte value to search for

#define result    x0      // out: pointer to first match, or NULL

// Scratch registers (all caller-saved under AAPCS64; no stack use).
#define src       x3      // current 32-byte-aligned read pointer
#define tmp       x4      // general scratch
#define wtmp2     w5      // holds the 0x40100401 syndrome constant
#define synd      x6      // 64-bit syndrome value (2 bits per input byte)
#define soff      x9      // srcin's byte offset within its 32-byte block
#define cntrem    x10     // cntin % 32 (bytes in the final partial block)

// NEON registers.
#define vrepchr   v0      // chrin replicated into all 16 byte lanes
#define vdata1    v1      // first 16 bytes of the current 32-byte chunk
#define vdata2    v2      // second 16 bytes of the current 32-byte chunk
#define vhas_chr1 v3      // per-byte match mask (0xFF/0x00) for vdata1
#define vhas_chr2 v4      // per-byte match mask (0xFF/0x00) for vdata2
#define vrepmask  v5      // 0x40100401 replicated per 32-bit lane
#define vend      v6      // pairwise-reduction scratch
//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (faster than using a 32-bit
// syndrome). Since the bits in the syndrome reflect exactly the order in which
// things occur in the original string, counting the trailing zeros allows us
// to identify exactly which byte has matched.
//
//
// CONST VOID *
// InternalMemScanMem8 (
//   IN CONST VOID  *Buffer,   // srcin / x0
//   IN UINTN       Length,    // cntin / x1
//   IN UINT8       Value      // chrin / w2
//   )
//
// Returns a pointer to the first occurrence of Value within the first Length
// bytes of Buffer, or NULL if it is not found. Length == 0 returns NULL
// without touching Buffer. Leaf function: no stack usage, only caller-saved
// registers are clobbered.
//
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if there are no bytes to compare.
    cbz     cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks.
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31            // soff = misalignment; Z set if aligned
    and     cntrem, cntin, #31
    b.eq    .Lloop

    //
    // Input buffer is not 32-byte aligned. We calculate the syndrome value
    // for the whole aligned 32-byte block containing the first bytes and
    // then mask off the irrelevant leading part.
    //

    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp           // cntin -= (32 - soff); C/Z drive b.ls below
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b   // 256->128
    addp    vend.16b, vend.16b, vend.16b             // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits (bytes that precede srcin).
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
    // The first block can also be the last.
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we are out of data we finish regardless of the result.
    b.ls    .Lend
    // Use a fast check for the termination condition.
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We are not out of data: loop if we have not found the character.
    cbz     synd, .Lloop

.Lend:
    // Termination condition found: compute the full syndrome value.
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b   // 256->128
    addp    vend.16b, vend.16b, vend.16b             // 128->64
    mov     synd, vend.d[0]
    // Only do the upper-bit clear for the last possible block.
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits (bytes beyond
    // the end of the scan region). Note: a shift amount of 64 acts as 0 on
    // AArch64, which is correct when the region end is 32-byte aligned.
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp

.Ltail:
    // Count the trailing zeros using bit reversal.
    rbit    synd, synd
    // Compensate for the last post-increment of src.
    sub     src, src, #32
    // Check that we have actually found a character.
    cmp     synd, #0
    // Count the leading zeros (= trailing zeros of the original syndrome).
    clz     synd, synd
    // Compute the potential result: two syndrome bits per byte, hence lsr #1.
    add     result, src, synd, lsr #1
    // Select the result or NULL.
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov     result, #0
    ret