Diffstat (limited to 'src/include/port')
 -rw-r--r--   src/include/port/simd.h | 211
 1 file changed, 211 insertions, 0 deletions
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 5f5737707a8..b0165b45861 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -128,6 +128,21 @@ vector32_load(Vector32 *v, const uint32 *s)
 #endif /* ! USE_NO_SIMD */
 
 /*
+ * Store a vector into the given memory address.
+ */
+#ifndef USE_NO_SIMD
+static inline void
+vector8_store(uint8 *s, Vector8 v)
+{
+#ifdef USE_SSE2
+    _mm_storeu_si128((Vector8 *) s, v);
+#elif defined(USE_NEON)
+    vst1q_u8(s, v);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
  * Create a vector with all elements set to the same value.
  */
 static inline Vector8
@@ -266,6 +281,25 @@ vector8_has_le(const Vector8 v, const uint8 c)
 }
 
 /*
+ * Returns true if any elements in the vector are greater than or equal to the
+ * given scalar.
+ */
+#ifndef USE_NO_SIMD
+static inline bool
+vector8_has_ge(const Vector8 v, const uint8 c)
+{
+#ifdef USE_SSE2
+    Vector8     umax = _mm_max_epu8(v, vector8_broadcast(c));
+    Vector8     cmpe = vector8_eq(umax, v);
+
+    return vector8_is_highbit_set(cmpe);
+#elif defined(USE_NEON)
+    return vmaxvq_u8(v) >= c;
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
  * Return true if the high bit of any element is set
  */
 static inline bool
@@ -360,6 +394,55 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 #endif /* ! USE_NO_SIMD */
 
 /*
+ * Return the bitwise AND of the inputs.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_and(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_and_si128(v1, v2);
+#elif defined(USE_NEON)
+    return vandq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return the result of adding the respective elements of the input vectors.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_add(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_add_epi8(v1, v2);
+#elif defined(USE_NEON)
+    return vaddq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return the result of subtracting the respective elements of the input
+ * vectors using signed saturation (i.e., if the operation would yield a value
+ * less than -128, -128 is returned instead). For more information on
+ * saturation arithmetic, see
+ * https://en.wikipedia.org/wiki/Saturation_arithmetic
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_issub(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_subs_epi8(v1, v2);
+#elif defined(USE_NEON)
+    return (Vector8) vqsubq_s8((int8x16_t) v1, (int8x16_t) v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
  * Return a vector with all bits set in each lane where the corresponding
  * lanes in the inputs are equal.
  */
@@ -388,6 +471,23 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 #endif /* ! USE_NO_SIMD */
 
 /*
+ * Return a vector with all bits set for each lane of v1 that is greater than
+ * the corresponding lane of v2. NB: The comparison treats the elements as
+ * signed.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_gt(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_cmpgt_epi8(v1, v2);
+#elif defined(USE_NEON)
+    return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
  * Given two vectors, return a vector with the minimum element of each.
  */
 #ifndef USE_NO_SIMD
@@ -402,4 +502,115 @@ vector8_min(const Vector8 v1, const Vector8 v2)
 }
 #endif /* ! USE_NO_SIMD */
 
+/*
+ * Interleave elements of low halves (e.g., for SSE2, bits 0-63) of given
+ * vectors. Bytes 0, 2, 4, etc. use v1, and bytes 1, 3, 5, etc. use v2.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_interleave_low(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_unpacklo_epi8(v1, v2);
+#elif defined(USE_NEON)
+    return vzip1q_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Interleave elements of high halves (e.g., for SSE2, bits 64-127) of given
+ * vectors. Bytes 0, 2, 4, etc. use v1, and bytes 1, 3, 5, etc. use v2.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_interleave_high(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_unpackhi_epi8(v1, v2);
+#elif defined(USE_NEON)
+    return vzip2q_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Pack 16-bit elements in the given vectors into a single vector of 8-bit
+ * elements. The first half of the return vector (e.g., for SSE2, bits 0-63)
+ * uses v1, and the second half (e.g., for SSE2, bits 64-127) uses v2.
+ *
+ * NB: The upper 8 bits of each 16-bit element must be zeros, else this will
+ * produce different results on different architectures.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_pack_16(const Vector8 v1, const Vector8 v2)
+{
+    Vector8     mask PG_USED_FOR_ASSERTS_ONLY;
+
+    mask = vector8_interleave_low(vector8_broadcast(0), vector8_broadcast(0xff));
+    Assert(!vector8_has_ge(vector8_and(v1, mask), 1));
+    Assert(!vector8_has_ge(vector8_and(v2, mask), 1));
+#ifdef USE_SSE2
+    return _mm_packus_epi16(v1, v2);
+#elif defined(USE_NEON)
+    return vuzp1q_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Unsigned shift left of each 32-bit element in the vector by "i" bits.
+ *
+ * XXX AArch64 requires an integer literal, so we have to list all expected
+ * values of "i" from all callers in a switch statement. If you add a new
+ * caller, be sure your expected values of "i" are handled.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_shift_left(const Vector8 v1, int i)
+{
+#ifdef USE_SSE2
+    return _mm_slli_epi32(v1, i);
+#elif defined(USE_NEON)
+    switch (i)
+    {
+        case 4:
+            return (Vector8) vshlq_n_u32((Vector32) v1, 4);
+        default:
+            Assert(false);
+            return vector8_broadcast(0);
+    }
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Unsigned shift right of each 32-bit element in the vector by "i" bits.
+ *
+ * XXX AArch64 requires an integer literal, so we have to list all expected
+ * values of "i" from all callers in a switch statement. If you add a new
+ * caller, be sure your expected values of "i" are handled.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_shift_right(const Vector8 v1, int i)
+{
+#ifdef USE_SSE2
+    return _mm_srli_epi32(v1, i);
+#elif defined(USE_NEON)
+    switch (i)
+    {
+        case 4:
+            return (Vector8) vshrq_n_u32((Vector32) v1, 4);
+        case 8:
+            return (Vector8) vshrq_n_u32((Vector32) v1, 8);
+        default:
+            Assert(false);
+            return vector8_broadcast(0);
+    }
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
 #endif /* SIMD_H */
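As a reading aid, and not part of the patch itself, here is a minimal standalone sketch of the unsigned-max trick that vector8_has_ge relies on for SSE2, which has no unsigned byte comparison: _mm_max_epu8(v, broadcast(c)) equals v in exactly those lanes where v >= c. The helper name has_ge_sse2 and the driver code are hypothetical.

    /* Hypothetical standalone demo of the vector8_has_ge SSE2 technique. */
    #include <emmintrin.h>          /* SSE2 intrinsics */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static bool
    has_ge_sse2(const uint8_t *buf, uint8_t c)
    {
        __m128i     v = _mm_loadu_si128((const __m128i *) buf);
        /* lane-wise unsigned max against the broadcast scalar */
        __m128i     umax = _mm_max_epu8(v, _mm_set1_epi8((char) c));
        /* the max equals v in exactly the lanes where v >= c */
        __m128i     cmpe = _mm_cmpeq_epi8(umax, v);

        /* movemask gathers each lane's high bit; nonzero means a lane matched */
        return _mm_movemask_epi8(cmpe) != 0;
    }

    int
    main(void)
    {
        uint8_t     buf[16];

        memset(buf, 7, sizeof(buf));
        printf("%d\n", has_ge_sse2(buf, 8));    /* 0: every lane is 7 < 8 */
        buf[5] = 200;
        printf("%d\n", has_ge_sse2(buf, 8));    /* 1: lane 5 is >= 8 */
        return 0;
    }

The NEON side needs no such trick because vmaxvq_u8 reduces the whole vector to its maximum lane in one instruction, after which a plain scalar comparison suffices.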
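A second illustrative sketch, again not from the patch: the zero-high-byte requirement on vector8_pack_16 exists because the two back ends disagree when it is violated. SSE2's _mm_packus_epi16 saturates each 16-bit lane to 0..255, while NEON's vuzp1q_u8 simply keeps the even-numbered bytes, so only inputs with zeroed high bytes yield identical results on both. The SSE2 half under that precondition:

    /* Hypothetical demo of the vector8_pack_16 SSE2 half, with the patch's
     * precondition (high byte of every 16-bit lane is zero) satisfied. */
    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* two vectors of eight 16-bit lanes each, all values <= 0xff */
        __m128i     v1 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i     v2 = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        /* packus saturates to 0..255; with zeroed high bytes it simply
         * concatenates the low bytes of v1, then those of v2 */
        __m128i     packed = _mm_packus_epi16(v1, v2);
        uint8_t     out[16];

        _mm_storeu_si128((__m128i *) out, packed);
        for (int i = 0; i < 16; i++)
            printf("%d ", out[i]);
        printf("\n");                           /* prints 0 1 2 ... 15 */
        return 0;
    }

This is why vector8_pack_16 asserts, via the interleaved 0x00/0xff mask and vector8_has_ge, that every odd (high) byte of both inputs is zero before packing.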