#ifndef _AK_SIMD_AVX_H_
#define _AK_SIMD_AVX_H_

#if defined(AKSIMD_AVX_SUPPORTED)

#include <immintrin.h>
// Vector of 8 single-precision floats, and the condition types built on it.
typedef __m256 AKSIMD_V8F32;
typedef AKSIMD_V8F32 AKSIMD_V8COND;
typedef AKSIMD_V8F32 AKSIMD_V8FCOND;
// Loads eight single-precision floats from (possibly unaligned) memory.
#define AKSIMD_LOAD_V8F32( __addr__ ) _mm256_loadu_ps( (AkReal32*)(__addr__) )

// Broadcasts a single float, loaded from memory, to all eight elements.
#define AKSIMD_LOAD1_V8F32( __scalar__ ) _mm256_broadcast_ss( &(__scalar__) )

// Sets all eight elements to the given scalar value.
#define AKSIMD_SET_V8F32( __scalar__ ) _mm256_set1_ps( (__scalar__) )

// Sets the eight elements; note that _a is the lowest (first) element.
#define AKSIMD_SETV_V8F32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_ps( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )

// Sets all eight elements to zero.
#define AKSIMD_SETZERO_V8F32() _mm256_setzero_ps()

// Loads a single float into the lowest element; the other seven are zeroed.
#define AKSIMD_LOAD_SS_V8F32( __addr__ ) _mm256_zextps128_ps256( _mm_load_ss( (__addr__) ) )

// Builds a 256-bit vector from two 128-bit halves (m1 = low, m2 = high).
#define AKSIMD_SET_V2F128( m1, m2 ) _mm256_setr_m128( m1, m2 )

// Inserts a 128-bit vector into half idx (0 = low, 1 = high) of a.
#define AKSIMD_INSERT_V2F128( a, m128, idx ) _mm256_insertf128_ps( a, m128, idx )
// Stores eight single-precision floats to (possibly unaligned) memory.
#define AKSIMD_STORE_V8F32( __addr__, __vec__ ) _mm256_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

// Stores the lowest element only.
#define AKSIMD_STORE1_V8F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), _mm256_castps256_ps128( (__vec__) ) )
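
// Illustrative usage (a minimal sketch, not part of the API): round-tripping
// eight samples through a vector register.
//   AkReal32 fSamples[8] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
//   AKSIMD_V8F32 vIn = AKSIMD_LOAD_V8F32( fSamples );
//   AKSIMD_STORE_V8F32( fSamples, vIn );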
// Shuffles a and b with the AKSIMD_SHUFFLE control; like _mm_shuffle_ps,
// the shuffle is applied independently to each 128-bit lane.
#define AKSIMD_SHUFFLE_V8F32( a, b, i ) _mm256_shuffle_ps( a, b, i )

// Swaps adjacent pairs within each 128-bit lane: (A,B,C,D) -> (B,A,D,C).
#define AKSIMD_SHUFFLE_V8_BADC( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1) )

// Swaps the halves of each 128-bit lane: (A,B,C,D) -> (C,D,A,B).
#define AKSIMD_SHUFFLE_V8_CDAB( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2) )

// Rotates each 128-bit lane left by one element: (A,B,C,D) -> (B,C,D,A).
#define AKSIMD_SHUFFLE_V8_BCDA( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1) )

// Duplicates the odd elements of each 128-bit lane: (A,B,C,D) -> (B,B,D,D).
#define AKSIMD_DUP_V8_ODD( __vv ) AKSIMD_SHUFFLE_V8F32( __vv, __vv, AKSIMD_SHUFFLE(3,3,1,1) )

// Duplicates the even elements of each 128-bit lane: (A,B,C,D) -> (A,A,C,C).
#define AKSIMD_DUP_V8_EVEN( __vv ) AKSIMD_SHUFFLE_V8F32( __vv, __vv, AKSIMD_SHUFFLE(2,2,0,0) )

// Builds a control word for _mm256_permute2f128_ps: l0 selects the low
// 128-bit lane of the result, l1 the high lane (0/1 = lanes of a, 2/3 = lanes of b).
#define AKSIMD_PERMUTE128( l1, l0 ) (((l1) << 4) | (l0))

// Selects two 128-bit lanes out of a and b according to the control word i.
#define AKSIMD_PERMUTE_2X128_V8F32( a, b, i ) _mm256_permute2f128_ps( a, b, i )

// Gathers the low 128-bit lanes of a and b: result = (a.low, b.low).
#define AKSIMD_DEINTERLEAVELANES_LO_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32( a, b, AKSIMD_PERMUTE128(2, 0) )

// Gathers the high 128-bit lanes of a and b: result = (a.high, b.high).
#define AKSIMD_DEINTERLEAVELANES_HI_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32( a, b, AKSIMD_PERMUTE128(3, 1) )
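
// Illustrative usage: swapping the two 128-bit lanes of a vector with the
// permute control encoding above (result low lane = a.high, high lane = a.low).
//   AKSIMD_V8F32 vSwapped = AKSIMD_PERMUTE_2X128_V8F32( v, v, AKSIMD_PERMUTE128( 0, 1 ) );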
// Element-wise subtraction: a - b.
#define AKSIMD_SUB_V8F32( a, b ) _mm256_sub_ps( a, b )

// Subtracts only the lowest element of b from a; the other seven elements of
// the result are a's. The integer mask must be cast to float for _mm256_and_ps.
#define AKSIMD_SUB_SS_V8F32( a, b ) _mm256_sub_ps( a, _mm256_and_ps( b, _mm256_castsi256_ps( _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) )

// Element-wise addition: a + b.
#define AKSIMD_ADD_V8F32( a, b ) _mm256_add_ps( a, b )

// Adds only the lowest element of b to a.
#define AKSIMD_ADD_SS_V8F32( a, b ) _mm256_add_ps( a, _mm256_and_ps( b, _mm256_castsi256_ps( _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) )

// Element-wise multiplication and division.
#define AKSIMD_MUL_V8F32( a, b ) _mm256_mul_ps( a, b )
#define AKSIMD_DIV_V8F32( a, b ) _mm256_div_ps( a, b )

// Multiplies only the lowest elements; the upper seven lanes of b are
// replaced with 1.0f so they pass a through unchanged.
#define AKSIMD_MUL_SS_V8F32( a, b ) _mm256_mul_ps( a, _mm256_blend_ps( b, _mm256_set1_ps( 1.0f ), 0xfe ) )
// Multiply-add and multiply-subtract: (a*b)+c and (a*b)-c. These are a
// separate mul and add (two roundings), not a fused FMA.
#define AKSIMD_MADD_V8F32( __a__, __b__, __c__ ) _mm256_add_ps( _mm256_mul_ps( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V8F32( __a__, __b__, __c__ ) _mm256_sub_ps( _mm256_mul_ps( (__a__), (__b__) ), (__c__) )

// Multiply-add restricted to the lowest element.
#define AKSIMD_MADD_SS_V8F32( __a__, __b__, __c__ ) AKSIMD_ADD_SS_V8F32( AKSIMD_MUL_SS_V8F32( (__a__), (__b__) ), (__c__) )
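
// Illustrative sketch (hypothetical helper, not part of the SDK): applying a
// gain and offset with the multiply-add above. Assumes uNumSamples is a
// multiple of 8.
//   void ApplyGain( AkReal32* pBuf, AkUInt32 uNumSamples, AkReal32 fGain, AkReal32 fOffset )
//   {
//       AKSIMD_V8F32 vGain = AKSIMD_SET_V8F32( fGain );
//       AKSIMD_V8F32 vOffset = AKSIMD_SET_V8F32( fOffset );
//       for ( AkUInt32 i = 0; i < uNumSamples; i += 8 )
//           AKSIMD_STORE_V8F32( pBuf + i, AKSIMD_MADD_V8F32( AKSIMD_LOAD_V8F32( pBuf + i ), vGain, vOffset ) );
//   }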
// Element-wise minimum and maximum.
#define AKSIMD_MIN_V8F32( a, b ) _mm256_min_ps( a, b )
#define AKSIMD_MAX_V8F32( a, b ) _mm256_max_ps( a, b )

// Absolute value: clears the sign bit by and-not with -0.f (0x80000000).
#define AKSIMD_ABS_V8F32( a ) _mm256_andnot_ps( _mm256_set1_ps( -0.f ), a )

// Negation: flips the sign bit by xor with -0.f.
#define AKSIMD_NEG_V8F32( __a__ ) _mm256_xor_ps( _mm256_set1_ps( -0.f ), __a__ )

// Element-wise square root.
#define AKSIMD_SQRT_V8F32( __a__ ) _mm256_sqrt_ps( (__a__) )

// Approximate reciprocal square root and reciprocal (~12-bit precision);
// refine with a Newton-Raphson step where full precision is needed.
#define AKSIMD_RSQRT_V8F32( __a__ ) _mm256_rsqrt_ps( (__a__) )
#define AKSIMD_RECIP_V8F32( __a__ ) _mm256_rcp_ps( (__a__) )

// Rounds each element up to the nearest integer.
#define AKSIMD_CEIL_V8F32( __a__ ) _mm256_ceil_ps( (__a__) )
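
// Illustrative refinement (a standard Newton-Raphson step, not part of the
// SDK): improving the ~12-bit _mm256_rcp_ps estimate to near full precision.
//   AKSIMD_V8F32 vX = AKSIMD_RECIP_V8F32( vA );
//   vX = AKSIMD_MUL_V8F32( vX, AKSIMD_SUB_V8F32( AKSIMD_SET_V8F32( 2.0f ), AKSIMD_MUL_V8F32( vA, vX ) ) );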
// Bitwise logic on the float bit patterns.
#define AKSIMD_XOR_V8F32( a, b ) _mm256_xor_ps( a, b )
#define AKSIMD_OR_V8F32( a, b ) _mm256_or_ps( a, b )
#define AKSIMD_AND_V8F32( a, b ) _mm256_and_ps( a, b )
#define AKSIMD_NOT_V8F32( a ) _mm256_xor_ps( a, _mm256_castsi256_ps( _mm256_set1_epi32( ~0 ) ) )
// Sums the eight elements of vVec; the total is broadcast to every element
// of the result.
static AkForceInline AKSIMD_V8F32 AKSIMD_HORIZONTALADD_V8F32( AKSIMD_V8F32 vVec )
{
	__m256 vHaddAb = _mm256_hadd_ps( vVec, vVec );                           // (a+b, c+d, a+b, c+d | e+f, g+h, e+f, g+h)
	__m256 vHaddAbcd = _mm256_hadd_ps( vHaddAb, vHaddAb );                   // (a+b+c+d x4 | e+f+g+h x4)
	__m256 vHaddEfgh = _mm256_permute2f128_ps( vHaddAbcd, vHaddAbcd, 0x01 ); // swap the 128-bit lanes
	__m256 vHaddAll = _mm256_add_ps( vHaddAbcd, vHaddEfgh );                 // full sum in every lane
	return vHaddAll;
}
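
// Illustrative usage: dot product of two 8-float vectors.
//   AKSIMD_V8F32 vProd = AKSIMD_MUL_V8F32( vX, vY );
//   AkReal32 fDot;
//   AKSIMD_STORE1_V8F32( &fDot, AKSIMD_HORIZONTALADD_V8F32( vProd ) );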
// Multiplies four pairs of complex numbers stored interleaved as
// (re0, im0, re1, im1, ...); returns the complex product of each pair.
static AkForceInline AKSIMD_V8F32 AKSIMD_COMPLEXMUL_V8F32(
	const AKSIMD_V8F32 cIn1,
	const AKSIMD_V8F32 cIn2 )
{
	__m256 real1Ext = _mm256_moveldup_ps( cIn1 );            // (r0, r0, r1, r1, ...)
	__m256 in2Shuf = _mm256_shuffle_ps( cIn2, cIn2, 0xB1 );  // (im, re) swapped within each pair
	__m256 imag1Ext = _mm256_movehdup_ps( cIn1 );            // (i0, i0, i1, i1, ...)
	__m256 temp = _mm256_mul_ps( imag1Ext, in2Shuf );        // (i*i', i*r', ...)
	__m256 mul = _mm256_mul_ps( real1Ext, cIn2 );            // (r*r', r*i', ...)
	__m256 out = _mm256_addsub_ps( mul, temp );              // (r*r' - i*i', r*i' + i*r', ...)
	return out;
}
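
// Illustrative usage: (1+2i) * (3+4i) = -5+10i in each of the four pairs.
//   AKSIMD_V8F32 vA = AKSIMD_SETV_V8F32( 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f );
//   AKSIMD_V8F32 vB = AKSIMD_SETV_V8F32( 4.f, 3.f, 4.f, 3.f, 4.f, 3.f, 4.f, 3.f );
//   AKSIMD_V8F32 vC = AKSIMD_COMPLEXMUL_V8F32( vA, vB ); // (-5, 10) x4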
// Interleaves the lower/upper halves of each 128-bit lane of a and b.
// Note: like _mm_unpacklo_ps, these operate per 128-bit lane, not across
// the full 256-bit vector.
#define AKSIMD_UNPACKLO_V8F32( a, b ) _mm256_unpacklo_ps( a, b )
#define AKSIMD_UNPACKHI_V8F32( a, b ) _mm256_unpackhi_ps( a, b )
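
// Illustrative usage: fully interleaving two vectors (e.g. left/right
// channels) needs a lane fixup because the unpacks work per 128-bit lane.
//   AKSIMD_V8F32 vLoMix = AKSIMD_UNPACKLO_V8F32( vL, vR ); // L0 R0 L1 R1 | L4 R4 L5 R5
//   AKSIMD_V8F32 vHiMix = AKSIMD_UNPACKHI_V8F32( vL, vR ); // L2 R2 L3 R3 | L6 R6 L7 R7
//   AKSIMD_V8F32 vOut0 = AKSIMD_DEINTERLEAVELANES_LO_V8F32( vLoMix, vHiMix ); // L0 R0 ... L3 R3
//   AKSIMD_V8F32 vOut1 = AKSIMD_DEINTERLEAVELANES_HI_V8F32( vLoMix, vHiMix ); // L4 R4 ... L7 R7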
// Type of the mask returned by the vector comparisons below.
#define AKSIMD_CMP_CTRLMASKV8 __m256

// Element-wise comparisons (ordered, signaling). Each element of the result
// is all ones where the comparison holds, all zeros otherwise.
#define AKSIMD_LTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LE_OS )
#define AKSIMD_LT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LT_OS )
#define AKSIMD_GTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GE_OS )
#define AKSIMD_GT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GT_OS )
#define AKSIMD_EQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_EQ_OS )
// Per-element select: returns vB where vMask is set, vA elsewhere.
static AkForceInline AKSIMD_V8F32 AKSIMD_VSEL_V8F32( AKSIMD_V8F32 vA, AKSIMD_V8F32 vB, AKSIMD_V8F32 vMask )
{
	return _mm256_blendv_ps( vA, vB, vMask );
}
// Per element: (__cond1__ >= __cond2__) ? __b__ : __a__.
#define AKSIMD_SEL_GTEQ_V8F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V8F32( __a__, __b__, AKSIMD_GTEQ_V8F32( __cond1__, __cond2__ ) )

// Per element: (__a__ >= 0) ? __b__ : __c__.
#define AKSIMD_SEL_GTEZ_V8F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V8F32( (__c__), (__b__), AKSIMD_GTEQ_V8F32( __a__, _mm256_set1_ps(0) ) )

// Broadcasts element idx (0-3) of each 128-bit lane across that lane; built
// on _mm256_shuffle_ps, so it does not splat across the full vector.
#define AKSIMD_SPLAT_V8F32( var, idx ) AKSIMD_SHUFFLE_V8F32( var, var, AKSIMD_SHUFFLE( idx, idx, idx, idx ) )

// Packs the sign bit of each element into the low 8 bits of an int.
#define AKSIMD_MASK_V8F32( __a__ ) _mm256_movemask_ps( __a__ )
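
// Illustrative usage: clamping negative samples to zero with a comparison
// mask and select (equivalent to AKSIMD_SEL_GTEZ_V8F32( v, v, vZero )).
//   AKSIMD_V8F32 vZero = AKSIMD_SETZERO_V8F32();
//   AKSIMD_V8F32 vMask = AKSIMD_GTEQ_V8F32( v, vZero );        // all-ones where v >= 0
//   AKSIMD_V8F32 vOut  = AKSIMD_VSEL_V8F32( vZero, v, vMask ); // v >= 0 ? v : 0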
// Returns a non-zero value if every bit of __a__ is zero.
#define AKSIMD_TESTZERO_V8I32( __a__ ) (_mm256_testz_si256( __a__, __a__ ) != 0)
#define AKSIMD_TESTZERO_V8F32( __a__ ) AKSIMD_TESTZERO_V8I32( _mm256_castps_si256( __a__ ) )

// Returns a non-zero value if every bit of __a__ is one.
#define AKSIMD_TESTONES_V8I32( __a__ ) (_mm256_testc_si256( __a__, _mm256_set1_epi32(~0) ) != 0)
#define AKSIMD_TESTONES_V8F32( __a__ ) AKSIMD_TESTONES_V8I32( _mm256_castps_si256( __a__ ) )
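
// Illustrative usage: early-out when no element exceeds a threshold.
//   AKSIMD_V8COND vOver = AKSIMD_GT_V8F32( vIn, vThreshold );
//   if ( AKSIMD_TESTZERO_V8F32( vOver ) )
//       return; // every comparison failed: nothing to process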
// Vector of 8 signed 32-bit integers, and a condition type built on it.
typedef __m256i AKSIMD_V8I32;
typedef AKSIMD_V8I32 AKSIMD_V8ICOND;

// Loads eight 32-bit integers from (possibly unaligned) memory.
#define AKSIMD_LOAD_V8I32( __addr__ ) _mm256_loadu_si256( (const AKSIMD_V8I32*)(__addr__) )

// Sets all eight elements to zero.
#define AKSIMD_SETZERO_V8I32() _mm256_setzero_si256()

// Sets all eight elements to the given scalar value.
#define AKSIMD_SET_V8I32( __scalar__ ) _mm256_set1_epi32( (__scalar__) )

// Sets the eight elements; note that _a is the lowest (first) element.
#define AKSIMD_SETV_V8I32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_epi32( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )

// Builds a 256-bit integer vector from two 128-bit halves (m1 = low, m2 = high).
#define AKSIMD_SET_V2I128( m1, m2 ) _mm256_setr_m128i( m1, m2 )

// Stores eight 32-bit integers to (possibly unaligned) memory.
#define AKSIMD_STORE_V8I32( __addr__, __vec__ ) _mm256_storeu_si256( (AKSIMD_V8I32*)(__addr__), (__vec__) )
// Converts eight 32-bit integers to single-precision floats.
#define AKSIMD_CONVERT_V8I32_TO_V8F32( __vec__ ) _mm256_cvtepi32_ps( (__vec__) )

// Converts eight floats to 32-bit integers, rounding to nearest.
#define AKSIMD_CONVERT_V8F32_TO_V8I32( __vec__ ) _mm256_cvtps_epi32( (__vec__) )

// Converts eight floats to 32-bit integers, truncating toward zero.
#define AKSIMD_TRUNCATE_V8F32_TO_V8I32( __vec__ ) _mm256_cvttps_epi32( (__vec__) )
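
// Illustrative usage: rounding vs. truncation on 2.7f.
//   AKSIMD_V8F32 v = AKSIMD_SET_V8F32( 2.7f );
//   AKSIMD_V8I32 vRound = AKSIMD_CONVERT_V8F32_TO_V8I32( v );  // 3 in every lane
//   AKSIMD_V8I32 vTrunc = AKSIMD_TRUNCATE_V8F32_TO_V8I32( v ); // 2 in every lane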
// Converts eight half-precision floats (packed in a __m128i) to single
// precision, and back. These use F16C instructions in addition to AVX.
#define AKSIMD_CONVERT_V8F16_TO_V8F32( __vec__ ) _mm256_cvtph_ps( (__vec__) )
#define AKSIMD_CONVERT_V8F32_TO_V8F16( __vec__ ) _mm256_cvtps_ph( (__vec__), _MM_FROUND_TO_NEAREST_INT )
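
// Illustrative usage: a float -> half -> float round trip.
//   AKSIMD_V8F32 v = AKSIMD_SET_V8F32( 1.5f );
//   __m128i vHalf = AKSIMD_CONVERT_V8F32_TO_V8F16( v );          // 8 x 16-bit halves
//   AKSIMD_V8F32 vBack = AKSIMD_CONVERT_V8F16_TO_V8F32( vHalf ); // 1.5f is exactly representable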
#endif // AKSIMD_AVX_SUPPORTED

#endif //_AK_SIMD_AVX_H_