[mirror_qemu.git] / target / arm / vec_internal.h

/*
 * ARM AdvSIMD / SVE Vector Helpers
 *
 * Copyright (c) 2020 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#ifndef TARGET_ARM_VEC_INTERNALS_H
#define TARGET_ARM_VEC_INTERNALS_H

/*
 * Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that needs a host-endian fixup.
 *
 * The H<N> macros are used when indexing an array of elements of size N.
 *
 * The H1_<N> macros are used when performing byte arithmetic and then
 * casting the final pointer to a type of size N.
 */
#if HOST_BIG_ENDIAN
/*
 * Big-endian host: flip the index bits that select a position inside
 * an 8-byte chunk.
 *   H1   - XOR 7 reverses a byte index within each group of 8 bytes.
 *   H1_2 - XOR 6 flips bits 1 and 2 of a byte offset, leaving bit 0 alone.
 *   H1_4 - XOR 4 flips bit 2 of a byte offset, leaving bits 0-1 alone.
 *   H2   - XOR 3 reverses a 16-bit element index within each group of 4.
 *   H4   - XOR 1 swaps the two 32-bit element indices of each pair.
 */
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
/* Otherwise no fixup is applied: every macro is the identity. */
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif
/*
 * Access to 64-bit elements isn't host-endian dependent; we provide H8
 * and H1_8 so that when a function is being generated from a macro we
 * can pass these rather than an empty macro argument, for clarity.
 */
#define H8(x)   (x)
#define H1_8(x) (x)

/* Data for expanding active predicate bits to bytes, for byte elements. */
extern const uint64_t expand_pred_b_data[256];

/*
 * Zero the tail of the buffer at @vd: 64-bit words are written as 0
 * starting at byte offset @opr_sz and stepping by 8 for as long as the
 * offset stays below @max_sz.  Nothing is written when
 * @opr_sz >= @max_sz.
 */
static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uintptr_t off = opr_sz;

    while (off < max_sz) {
        uint64_t *word = vd + off;

        *word = 0;
        off += 8;
    }
}

/*
 * Signed shift of @src by the signed amount @shift for an element that
 * is @bits wide: a non-negative @shift shifts left, a negative @shift
 * shifts right.
 *
 * @round: for right shifts, add the last bit shifted out to the result.
 * @sat:   if NULL, a left shift is never checked for overflow: the value
 *         is returned as-is for @bits == 32, reduced with sextract32()
 *         for narrower widths, and is 0 once @shift >= @bits.
 *         If non-NULL, an overflowing left shift stores 1 to *@sat and
 *         returns 2^(@bits-1) - 1 for non-negative @src, or 2^(@bits-1)
 *         (the bit pattern of the most negative value) for negative @src.
 *
 * NOTE(review): the "src >> 31" path presumes @src is already
 * sign-extended from @bits to 32 bits -- confirm against the callers.
 */
static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits,
                                    bool round, uint32_t *sat)
{
    if (shift <= -bits) {
        /* Rounding the sign bit always produces 0. */
        if (round) {
            return 0;
        }
        return src >> 31;
    } else if (shift < 0) {
        if (round) {
            /* Stop one bit short so the rounding bit is still in bit 0. */
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < bits) {
        /*
         * Shift in the unsigned domain: left-shifting a negative value,
         * or shifting a set bit into or past the sign bit, is undefined
         * for signed types in ISO C.  The unsigned shift wraps instead.
         */
        int32_t val = (int32_t)((uint32_t)src << shift);
        if (bits == 32) {
            /* Overflow-free iff shifting back recovers the input. */
            if (!sat || val >> shift == src) {
                return val;
            }
        } else {
            int32_t extval = sextract32(val, 0, bits);
            if (!sat || val == extval) {
                return extval;
            }
        }
    } else if (!sat || src == 0) {
        /* Everything is shifted out; only a zero input does not overflow. */
        return 0;
    }

    *sat = 1;
    return (1u << (bits - 1)) - (src >= 0);
}

/*
 * Unsigned shift of @src by the signed amount @shift for an element that
 * is @bits wide: a non-negative @shift shifts left, a negative @shift
 * shifts right.
 *
 * @round: for right shifts, add the last bit shifted out to the result.
 * @sat:   if NULL, left shifts are not checked for overflow; otherwise an
 *         overflowing left shift stores 1 to *@sat and the function
 *         returns MAKE_64BIT_MASK(0, @bits).
 */
static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits,
                                     bool round, uint32_t *sat)
{
    const int all_gone = bits + round;

    /* Shifted far enough right that not even a rounding bit survives. */
    if (shift <= -all_gone) {
        return 0;
    }

    if (shift < 0) {
        const int down = -shift;
        uint32_t pre;

        if (!round) {
            return src >> down;
        }
        /* Stop one bit short so the rounding bit is still in bit 0. */
        pre = src >> (down - 1);
        return (pre >> 1) + (pre & 1);
    }

    if (shift >= bits) {
        /* Everything is shifted out; only a zero input does not overflow. */
        if (!sat || src == 0) {
            return 0;
        }
    } else {
        const uint32_t shifted = src << shift;

        if (bits == 32) {
            /* Overflow-free iff shifting back recovers the input. */
            if (!sat || shifted >> shift == src) {
                return shifted;
            }
        } else {
            const uint32_t kept = extract32(shifted, 0, bits);

            if (!sat || shifted == kept) {
                return kept;
            }
        }
    }

    *sat = 1;
    return MAKE_64BIT_MASK(0, bits);
}

/*
 * Shift a signed input with do_uqrshl_bhs().  When saturation tracking
 * is requested (@sat non-NULL) and @src is negative, store 1 to *@sat
 * and return 0; in every other case the work is delegated unchanged to
 * do_uqrshl_bhs().
 */
static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits,
                                     bool round, uint32_t *sat)
{
    if (!sat || src >= 0) {
        return do_uqrshl_bhs(src, shift, bits, round, sat);
    }

    *sat = 1;
    return 0;
}

/*
 * Signed 64-bit shift of @src by the signed amount @shift: a
 * non-negative @shift shifts left, a negative @shift shifts right.
 *
 * @round: for right shifts, add the last bit shifted out to the result.
 * @sat:   if NULL, an overflowing left shift simply wraps (and yields 0
 *         once @shift >= 64); otherwise overflow stores 1 to *@sat and
 *         returns INT64_MAX for non-negative @src or INT64_MIN for
 *         negative @src.
 */
static inline int64_t do_sqrshl_d(int64_t src, int64_t shift,
                                  bool round, uint32_t *sat)
{
    if (shift <= -64) {
        /* Rounding the sign bit always produces 0. */
        if (round) {
            return 0;
        }
        return src >> 63;
    } else if (shift < 0) {
        if (round) {
            /* Stop one bit short so the rounding bit is still in bit 0. */
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < 64) {
        /*
         * Shift in the unsigned domain: left-shifting a negative value,
         * or shifting a set bit into or past the sign bit, is undefined
         * for signed types in ISO C.  The unsigned shift wraps instead.
         */
        int64_t val = (int64_t)((uint64_t)src << shift);
        /* Overflow-free iff shifting back recovers the input. */
        if (!sat || val >> shift == src) {
            return val;
        }
    } else if (!sat || src == 0) {
        /* Everything is shifted out; only a zero input does not overflow. */
        return 0;
    }

    *sat = 1;
    return src < 0 ? INT64_MIN : INT64_MAX;
}

/*
 * Unsigned 64-bit shift of @src by the signed amount @shift: a
 * non-negative @shift shifts left, a negative @shift shifts right.
 *
 * @round: for right shifts, add the last bit shifted out to the result.
 * @sat:   if NULL, an overflowing left shift simply wraps (and yields 0
 *         once @shift >= 64); otherwise overflow stores 1 to *@sat and
 *         returns UINT64_MAX.
 */
static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift,
                                   bool round, uint32_t *sat)
{
    const int all_gone = 64 + round;

    /* Shifted far enough right that not even a rounding bit survives. */
    if (shift <= -all_gone) {
        return 0;
    }

    if (shift < 0) {
        const int64_t down = -shift;
        uint64_t pre;

        if (!round) {
            return src >> down;
        }
        /* Stop one bit short so the rounding bit is still in bit 0. */
        pre = src >> (down - 1);
        return (pre >> 1) + (pre & 1);
    }

    if (shift >= 64) {
        /* Everything is shifted out; only a zero input does not overflow. */
        if (!sat || src == 0) {
            return 0;
        }
    } else {
        const uint64_t shifted = src << shift;

        /* Overflow-free iff shifting back recovers the input. */
        if (!sat || shifted >> shift == src) {
            return shifted;
        }
    }

    *sat = 1;
    return UINT64_MAX;
}

/*
 * Shift a signed 64-bit input with do_uqrshl_d().  When saturation
 * tracking is requested (@sat non-NULL) and @src is negative, store 1
 * to *@sat and return 0; in every other case the work is delegated
 * unchanged to do_uqrshl_d().
 */
static inline int64_t do_suqrshl_d(int64_t src, int64_t shift,
                                   bool round, uint32_t *sat)
{
    if (!sat || src >= 0) {
        return do_uqrshl_d(src, shift, round, sat);
    }

    *sat = 1;
    return 0;
}

/*
 * Helpers defined outside this header; only the prototypes are visible
 * here, one per element width (8, 16, 32 and 64 bits).
 * NOTE(review): judging by the names these implement the SQRDMLAH family
 * (signed saturating rounding doubling multiply-accumulate, high half)
 * -- confirm against the definitions.  The meaning of the two bool
 * arguments, and of the trailing uint32_t * that only the _h and _s
 * variants take, is not shown in this file.
 */
int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool);
int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);

/*
 * 8 x 8 -> 16 vector polynomial multiply where the inputs are
 * in the low 8 bits of each 16-bit element
 */
uint64_t pmull_h(uint64_t op1, uint64_t op2);
/*
 * 16 x 16 -> 32 vector polynomial multiply where the inputs are
 * in the low 16 bits of each 32-bit element
 */
uint64_t pmull_w(uint64_t op1, uint64_t op2);

#endif /* TARGET_ARM_VEC_INTERNALS_H */
Commit	Line	Data
a04b68e1 RH	1	/*
	2	* ARM AdvSIMD / SVE Vector Helpers
	3	*
	4	* Copyright (c) 2020 Linaro
	5	*
	6	* This library is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
50f57e09	9	* version 2.1 of the License, or (at your option) any later version.
a04b68e1 RH	10	*
	11	* This library is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with this library; if not, see <http://www.gnu.org/licenses/>.
	18	*/
	19
	20	#ifndef TARGET_ARM_VEC_INTERNALS_H
	21	#define TARGET_ARM_VEC_INTERNALS_H
	22
93966af1 RH	23	/*
	24	* Note that vector data is stored in host-endian 64-bit chunks,
	25	* so addressing units smaller than that needs a host-endian fixup.
	26	*
	27	* The H<N> macros are used when indexing an array of elements of size N.
	28	*
	29	* The H1_<N> macros are used when performing byte arithmetic and then
	30	* casting the final pointer to a type of size N.
	31	*/
e03b5686	32	#if HOST_BIG_ENDIAN
93966af1 RH	33	#define H1(x) ((x) ^ 7)
	34	#define H1_2(x) ((x) ^ 6)
	35	#define H1_4(x) ((x) ^ 4)
	36	#define H2(x) ((x) ^ 3)
	37	#define H4(x) ((x) ^ 1)
	38	#else
	39	#define H1(x) (x)
	40	#define H1_2(x) (x)
	41	#define H1_4(x) (x)
	42	#define H2(x) (x)
	43	#define H4(x) (x)
	44	#endif
6e802db3 PM	45	/*
	46	* Access to 64-bit elements isn't host-endian dependent; we provide H8
	47	* and H1_8 so that when a function is being generated from a macro we
	48	* can pass these rather than an empty macro argument, for clarity.
	49	*/
	50	#define H8(x) (x)
	51	#define H1_8(x) (x)
93966af1	52
77f96148 PM	53	/* Data for expanding active predicate bits to bytes, for byte elements. */
	54	extern const uint64_t expand_pred_b_data[256];
	55
a04b68e1 RH	56	static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
	57	{
	58	uint64_t *d = vd + opr_sz;
	59	uintptr_t i;
	60
	61	for (i = opr_sz; i < max_sz; i += 8) {
	62	*d++ = 0;
	63	}
	64	}
	65
8b3f15b0 RH	66	static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits,
	67	bool round, uint32_t *sat)
	68	{
	69	if (shift <= -bits) {
	70	/* Rounding the sign bit always produces 0. */
	71	if (round) {
	72	return 0;
	73	}
	74	return src >> 31;
	75	} else if (shift < 0) {
	76	if (round) {
	77	src >>= -shift - 1;
	78	return (src >> 1) + (src & 1);
	79	}
	80	return src >> -shift;
	81	} else if (shift < bits) {
	82	int32_t val = src << shift;
	83	if (bits == 32) {
	84	if (!sat \|\| val >> shift == src) {
	85	return val;
	86	}
	87	} else {
	88	int32_t extval = sextract32(val, 0, bits);
	89	if (!sat \|\| val == extval) {
	90	return extval;
	91	}
	92	}
	93	} else if (!sat \|\| src == 0) {
	94	return 0;
	95	}
	96
	97	*sat = 1;
	98	return (1u << (bits - 1)) - (src >= 0);
	99	}
	100
	101	static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits,
	102	bool round, uint32_t *sat)
	103	{
	104	if (shift <= -(bits + round)) {
	105	return 0;
	106	} else if (shift < 0) {
	107	if (round) {
	108	src >>= -shift - 1;
	109	return (src >> 1) + (src & 1);
	110	}
	111	return src >> -shift;
	112	} else if (shift < bits) {
	113	uint32_t val = src << shift;
	114	if (bits == 32) {
	115	if (!sat \|\| val >> shift == src) {
	116	return val;
	117	}
	118	} else {
	119	uint32_t extval = extract32(val, 0, bits);
	120	if (!sat \|\| val == extval) {
	121	return extval;
	122	}
	123	}
	124	} else if (!sat \|\| src == 0) {
	125	return 0;
	126	}
	127
	128	*sat = 1;
	129	return MAKE_64BIT_MASK(0, bits);
130	}
131
132	static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits,
133	bool round, uint32_t *sat)
134	{
135	if (sat && src < 0) {
136	*sat = 1;
137	return 0;
138	}
139	return do_uqrshl_bhs(src, shift, bits, round, sat);
140	}
141
142	static inline int64_t do_sqrshl_d(int64_t src, int64_t shift,
143	bool round, uint32_t *sat)
144	{
145	if (shift <= -64) {
146	/* Rounding the sign bit always produces 0. */
147	if (round) {
148	return 0;
149	}
150	return src >> 63;
151	} else if (shift < 0) {
152	if (round) {
153	src >>= -shift - 1;
154	return (src >> 1) + (src & 1);
155	}
156	return src >> -shift;
157	} else if (shift < 64) {
158	int64_t val = src << shift;
159	if (!sat \|\| val >> shift == src) {
160	return val;
161	}
162	} else if (!sat \|\| src == 0) {
163	return 0;
164	}
165
166	*sat = 1;
167	return src < 0 ? INT64_MIN : INT64_MAX;
168	}
169
170	static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift,
171	bool round, uint32_t *sat)
172	{
173	if (shift <= -(64 + round)) {
174	return 0;
175	} else if (shift < 0) {
176	if (round) {
177	src >>= -shift - 1;
178	return (src >> 1) + (src & 1);
179	}
180	return src >> -shift;
181	} else if (shift < 64) {
182	uint64_t val = src << shift;
183	if (!sat \|\| val >> shift == src) {
184	return val;
185	}
186	} else if (!sat \|\| src == 0) {
187	return 0;
188	}
189
190	*sat = 1;
191	return UINT64_MAX;
192	}
193
194	static inline int64_t do_suqrshl_d(int64_t src, int64_t shift,
195	bool round, uint32_t *sat)
196	{
197	if (sat && src < 0) {
198	*sat = 1;
199	return 0;
200	}
201	return do_uqrshl_d(src, shift, round, sat);
202	}
203
d782d3ca RH	204	int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool);
	205	int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
	206	int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
	207	int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);
	208
c1bd78cb PM	209	/*
	210	* 8 x 8 -> 16 vector polynomial multiply where the inputs are
	211	* in the low 8 bits of each 16-bit element
	212	*/
	213	uint64_t pmull_h(uint64_t op1, uint64_t op2);
	214	/*
	215	* 16 x 16 -> 32 vector polynomial multiply where the inputs are
	216	* in the low 16 bits of each 32-bit element
	217	*/
	218	uint64_t pmull_w(uint64_t op1, uint64_t op2);
	219
a04b68e1	220	#endif /* TARGET_ARM_VEC_INTERNALS_H */