[mirror_edk2.git] / MdePkg / Library / BaseMemoryLibOptDxe / AArch64 / ScanMem.S

//\r
// Copyright (c) 2014, ARM Limited\r
// All rights Reserved.\r
// SPDX-License-Identifier: BSD-2-Clause-Patent\r
//\r
\r
// Assumptions:\r
//\r
// ARMv8-a, AArch64\r
// Neon Available.\r
//\r
\r
// Arguments and results.\r
#define srcin     x0\r
#define cntin     x1\r
#define chrin     w2\r
\r
#define result    x0\r
\r
#define src       x3\r
#define  tmp       x4\r
#define wtmp2     w5\r
#define synd      x6\r
#define soff      x9\r
#define cntrem    x10\r
\r
#define vrepchr   v0\r
#define vdata1    v1\r
#define vdata2    v2\r
#define vhas_chr1 v3\r
#define vhas_chr2 v4\r
#define vrepmask  v5\r
#define vend      v6\r
\r
//\r
// Core algorithm:\r
//\r
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits\r
// per byte. For each tuple, bit 0 is set if the relevant byte matched the\r
// requested character and bit 1 is not used (faster than using a 32bit\r
// syndrome). Since the bits in the syndrome reflect exactly the order in which\r
// things occur in the original string, counting trailing zeros allows to\r
// identify exactly which byte has matched.\r
//\r
\r
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)\r
ASM_PFX(InternalMemScanMem8):\r
    // Do not dereference srcin if no bytes to compare.\r
    cbz  cntin, .Lzero_length\r
    //\r
    // Magic constant 0x40100401 allows us to identify which lane matches\r
    // the requested byte.\r
    //\r
    mov     wtmp2, #0x0401\r
    movk    wtmp2, #0x4010, lsl #16\r
    dup     vrepchr.16b, chrin\r
    // Work with aligned 32-byte chunks\r
    bic     src, srcin, #31\r
    dup     vrepmask.4s, wtmp2\r
    ands    soff, srcin, #31\r
    and     cntrem, cntin, #31\r
    b.eq    .Lloop\r
\r
    //\r
    // Input string is not 32-byte aligned. We calculate the syndrome\r
    // value for the aligned 32 bytes block containing the first bytes\r
    // and mask the irrelevant part.\r
    //\r
\r
    ld1     {vdata1.16b, vdata2.16b}, [src], #32\r
    sub     tmp, soff, #32\r
    adds    cntin, cntin, tmp\r
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b\r
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b\r
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b\r
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b\r
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b        // 256->128\r
    addp    vend.16b, vend.16b, vend.16b                  // 128->64\r
    mov     synd, vend.d[0]\r
    // Clear the soff*2 lower bits\r
    lsl     tmp, soff, #1\r
    lsr     synd, synd, tmp\r
    lsl     synd, synd, tmp\r
    // The first block can also be the last\r
    b.ls    .Lmasklast\r
    // Have we found something already?\r
    cbnz    synd, .Ltail\r
\r
.Lloop:\r
    ld1     {vdata1.16b, vdata2.16b}, [src], #32\r
    subs    cntin, cntin, #32\r
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b\r
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b\r
    // If we're out of data we finish regardless of the result\r
    b.ls    .Lend\r
    // Use a fast check for the termination condition\r
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b\r
    addp    vend.2d, vend.2d, vend.2d\r
    mov     synd, vend.d[0]\r
    // We're not out of data, loop if we haven't found the character\r
    cbz     synd, .Lloop\r
\r
.Lend:\r
    // Termination condition found, let's calculate the syndrome value\r
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b\r
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b\r
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128\r
    addp    vend.16b, vend.16b, vend.16b                // 128->64\r
    mov     synd, vend.d[0]\r
    // Only do the clear for the last possible block\r
    b.hi    .Ltail\r
\r
.Lmasklast:\r
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits\r
    add     tmp, cntrem, soff\r
    and     tmp, tmp, #31\r
    sub     tmp, tmp, #32\r
    neg     tmp, tmp, lsl #1\r
    lsl     synd, synd, tmp\r
    lsr     synd, synd, tmp\r
\r
.Ltail:\r
    // Count the trailing zeros using bit reversing\r
    rbit    synd, synd\r
    // Compensate the last post-increment\r
    sub     src, src, #32\r
    // Check that we have found a character\r
    cmp     synd, #0\r
    // And count the leading zeros\r
    clz     synd, synd\r
    // Compute the potential result\r
    add     result, src, synd, lsr #1\r
    // Select result or NULL\r
    csel    result, xzr, result, eq\r
    ret\r
\r
.Lzero_length:\r
    mov   result, #0\r
    ret\r
Commit	Line	Data
c86cd1e1 AB	1	//\r
	2	// Copyright (c) 2014, ARM Limited\r
	3	// All rights Reserved.\r
aa1b377e	4	// SPDX-License-Identifier: BSD-2-Clause-Patent\r
c86cd1e1 AB	5	//\r
	6	\r
	7	// Assumptions:\r
	8	//\r
	9	// ARMv8-a, AArch64\r
	10	// Neon Available.\r
	11	//\r
	12	\r
	13	// Arguments and results.\r
	14	#define srcin x0\r
	15	#define cntin x1\r
	16	#define chrin w2\r
	17	\r
	18	#define result x0\r
	19	\r
	20	#define src x3\r
9095d37b	21	#define tmp x4\r
c86cd1e1 AB	22	#define wtmp2 w5\r
	23	#define synd x6\r
	24	#define soff x9\r
	25	#define cntrem x10\r
	26	\r
	27	#define vrepchr v0\r
	28	#define vdata1 v1\r
	29	#define vdata2 v2\r
	30	#define vhas_chr1 v3\r
	31	#define vhas_chr2 v4\r
	32	#define vrepmask v5\r
	33	#define vend v6\r
	34	\r
	35	//\r
	36	// Core algorithm:\r
	37	//\r
	38	// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits\r
	39	// per byte. For each tuple, bit 0 is set if the relevant byte matched the\r
	40	// requested character and bit 1 is not used (faster than using a 32bit\r
	41	// syndrome). Since the bits in the syndrome reflect exactly the order in which\r
	42	// things occur in the original string, counting trailing zeros allows to\r
	43	// identify exactly which byte has matched.\r
	44	//\r
	45	\r
	46	ASM_GLOBAL ASM_PFX(InternalMemScanMem8)\r
	47	ASM_PFX(InternalMemScanMem8):\r
	48	// Do not dereference srcin if no bytes to compare.\r
9095d37b	49	cbz cntin, .Lzero_length\r
c86cd1e1 AB	50	//\r
	51	// Magic constant 0x40100401 allows us to identify which lane matches\r
	52	// the requested byte.\r
	53	//\r
	54	mov wtmp2, #0x0401\r
	55	movk wtmp2, #0x4010, lsl #16\r
	56	dup vrepchr.16b, chrin\r
	57	// Work with aligned 32-byte chunks\r
	58	bic src, srcin, #31\r
	59	dup vrepmask.4s, wtmp2\r
	60	ands soff, srcin, #31\r
	61	and cntrem, cntin, #31\r
	62	b.eq .Lloop\r
	63	\r
	64	//\r
	65	// Input string is not 32-byte aligned. We calculate the syndrome\r
	66	// value for the aligned 32 bytes block containing the first bytes\r
	67	// and mask the irrelevant part.\r
	68	//\r
	69	\r
	70	ld1 {vdata1.16b, vdata2.16b}, [src], #32\r
	71	sub tmp, soff, #32\r
	72	adds cntin, cntin, tmp\r
	73	cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b\r
	74	cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b\r
	75	and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b\r
	76	and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b\r
	77	addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128\r
	78	addp vend.16b, vend.16b, vend.16b // 128->64\r
	79	mov synd, vend.d[0]\r
	80	// Clear the soff*2 lower bits\r
	81	lsl tmp, soff, #1\r
	82	lsr synd, synd, tmp\r
	83	lsl synd, synd, tmp\r
	84	// The first block can also be the last\r
	85	b.ls .Lmasklast\r
	86	// Have we found something already?\r
	87	cbnz synd, .Ltail\r
	88	\r
	89	.Lloop:\r
	90	ld1 {vdata1.16b, vdata2.16b}, [src], #32\r
	91	subs cntin, cntin, #32\r
	92	cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b\r
	93	cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b\r
	94	// If we're out of data we finish regardless of the result\r
	95	b.ls .Lend\r
	96	// Use a fast check for the termination condition\r
	97	orr vend.16b, vhas_chr1.16b, vhas_chr2.16b\r
	98	addp vend.2d, vend.2d, vend.2d\r
	99	mov synd, vend.d[0]\r
	100	// We're not out of data, loop if we haven't found the character\r
	101	cbz synd, .Lloop\r
	102	\r
	103	.Lend:\r
	104	// Termination condition found, let's calculate the syndrome value\r
	105	and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b\r
	106	and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b\r
	107	addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128\r
	108	addp vend.16b, vend.16b, vend.16b // 128->64\r
	109	mov synd, vend.d[0]\r
	110	// Only do the clear for the last possible block\r
	111	b.hi .Ltail\r
	112	\r
	113	.Lmasklast:\r
114	// Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits\r
115	add tmp, cntrem, soff\r
116	and tmp, tmp, #31\r
117	sub tmp, tmp, #32\r
118	neg tmp, tmp, lsl #1\r
119	lsl synd, synd, tmp\r
120	lsr synd, synd, tmp\r
121	\r
122	.Ltail:\r
123	// Count the trailing zeros using bit reversing\r
124	rbit synd, synd\r
125	// Compensate the last post-increment\r
126	sub src, src, #32\r
127	// Check that we have found a character\r
128	cmp synd, #0\r
129	// And count the leading zeros\r
130	clz synd, synd\r
131	// Compute the potential result\r
132	add result, src, synd, lsr #1\r
133	// Select result or NULL\r
134	csel result, xzr, result, eq\r
135	ret\r
136	\r
137	.Lzero_length:\r
138	mov result, #0\r
139	ret\r