PAPI 7.1.0.0
Loading...
Searching...
No Matches
cat_arch.h
Go to the documentation of this file.
#include <inttypes.h>
#include <stdio.h>
2
/* Project-wide unsigned counter type; an alias for unsigned long long. */
typedef unsigned long long int uint64;
4
5#if defined(X86)
6void test_hp_x86_128B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
7void test_sp_x86_128B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
8void test_dp_x86_128B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
9
10void test_hp_x86_256B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
11void test_sp_x86_256B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
12void test_dp_x86_256B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
13
14void test_hp_x86_512B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
15void test_sp_x86_512B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
16void test_dp_x86_512B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
17
18void test_hp_x86_128B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
19void test_sp_x86_128B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
20void test_dp_x86_128B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
21
22void test_hp_x86_256B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
23void test_sp_x86_256B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
24void test_dp_x86_256B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
25
26void test_hp_x86_512B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
27void test_sp_x86_512B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
28void test_dp_x86_512B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
29
30#include <immintrin.h>
31
32typedef __m128 SP_SCALAR_TYPE;
33typedef __m128d DP_SCALAR_TYPE;
34
35#define SET_VEC_SS(_I_) _mm_set_ss( _I_ );
36#define ADD_VEC_SS(_I_,_J_) _mm_add_ss( _I_ , _J_ );
37#define MUL_VEC_SS(_I_,_J_) _mm_mul_ss( _I_ , _J_ );
38#define FMA_VEC_SS(_out_,_I_,_J_,_K_) { _out_ = _mm_fmadd_ss( _I_ , _J_ , _K_ ); }
39
40#define SET_VEC_SD(_I_) _mm_set_sd( _I_ );
41#define ADD_VEC_SD(_I_,_J_) _mm_add_sd( _I_ , _J_ );
42#define MUL_VEC_SD(_I_,_J_) _mm_mul_sd( _I_ , _J_ );
43#define FMA_VEC_SD(_out_,_I_,_J_,_K_) { _out_ = _mm_fmadd_sd( _I_ , _J_ , _K_ ); }
44
45#if defined(X86_VEC_WIDTH_128B)
46typedef __m128 SP_VEC_TYPE;
47typedef __m128d DP_VEC_TYPE;
48
49#define SET_VEC_PS(_I_) _mm_set1_ps( _I_ );
50#define ADD_VEC_PS(_I_,_J_) _mm_add_ps( _I_ , _J_ );
51#define MUL_VEC_PS(_I_,_J_) _mm_mul_ps( _I_ , _J_ );
52#define FMA_VEC_PS(_I_,_J_,_K_) _mm_fmadd_ps( _I_ , _J_ , _K_ );
53
54#define SET_VEC_PD(_I_) _mm_set1_pd( _I_ );
55#define ADD_VEC_PD(_I_,_J_) _mm_add_pd( _I_ , _J_ );
56#define MUL_VEC_PD(_I_,_J_) _mm_mul_pd( _I_ , _J_ );
57#define FMA_VEC_PD(_I_,_J_,_K_) _mm_fmadd_pd( _I_ , _J_ , _K_ );
58
59#elif defined(X86_VEC_WIDTH_512B)
60typedef __m512 SP_VEC_TYPE;
61typedef __m512d DP_VEC_TYPE;
62
63#define SET_VEC_PS(_I_) _mm512_set1_ps( _I_ );
64#define ADD_VEC_PS(_I_,_J_) _mm512_add_ps( _I_ , _J_ );
65#define MUL_VEC_PS(_I_,_J_) _mm512_mul_ps( _I_ , _J_ );
66#define FMA_VEC_PS(_I_,_J_,_K_) _mm512_fmadd_ps( _I_ , _J_ , _K_ );
67
68#define SET_VEC_PD(_I_) _mm512_set1_pd( _I_ );
69#define ADD_VEC_PD(_I_,_J_) _mm512_add_pd( _I_ , _J_ );
70#define MUL_VEC_PD(_I_,_J_) _mm512_mul_pd( _I_ , _J_ );
71#define FMA_VEC_PD(_I_,_J_,_K_) _mm512_fmadd_pd( _I_ , _J_ , _K_ );
72
73#else
74typedef __m256 SP_VEC_TYPE;
75typedef __m256d DP_VEC_TYPE;
76
77#define SET_VEC_PS(_I_) _mm256_set1_ps( _I_ );
78#define ADD_VEC_PS(_I_,_J_) _mm256_add_ps( _I_ , _J_ );
79#define MUL_VEC_PS(_I_,_J_) _mm256_mul_ps( _I_ , _J_ );
80#define FMA_VEC_PS(_I_,_J_,_K_) _mm256_fmadd_ps( _I_ , _J_ , _K_ );
81
82#define SET_VEC_PD(_I_) _mm256_set1_pd( _I_ );
83#define ADD_VEC_PD(_I_,_J_) _mm256_add_pd( _I_ , _J_ );
84#define MUL_VEC_PD(_I_,_J_) _mm256_mul_pd( _I_ , _J_ );
85#define FMA_VEC_PD(_I_,_J_,_K_) _mm256_fmadd_pd( _I_ , _J_ , _K_ );
86#endif
87
88#elif defined(ARM)
89void test_hp_arm_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
90void test_sp_arm_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
91void test_dp_arm_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
92void test_hp_arm_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
93void test_sp_arm_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
94void test_dp_arm_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
95
96#include <arm_neon.h>
97
98typedef __fp16 half;
99typedef float SP_SCALAR_TYPE;
100typedef double DP_SCALAR_TYPE;
101typedef float16x8_t HP_VEC_TYPE;
102typedef float32x4_t SP_VEC_TYPE;
103typedef float64x2_t DP_VEC_TYPE;
104
105#define SET_VEC_PH(_I_) (HP_VEC_TYPE)vdupq_n_f16( _I_ );
106#define SET_VEC_PS(_I_) (SP_VEC_TYPE)vdupq_n_f32( _I_ );
107#define SET_VEC_PD(_I_) (DP_VEC_TYPE)vdupq_n_f64( _I_ );
108
109#define ADD_VEC_PH(_I_,_J_) (HP_VEC_TYPE)vaddq_f16( _I_ , _J_ );
110#define ADD_VEC_PS(_I_,_J_) (SP_VEC_TYPE)vaddq_f32( _I_ , _J_ );
111#define ADD_VEC_PD(_I_,_J_) (DP_VEC_TYPE)vaddq_f64( _I_ , _J_ );
112
113#define MUL_VEC_PH(_I_,_J_) (HP_VEC_TYPE)vmulq_f16( _I_ , _J_ );
114#define MUL_VEC_PS(_I_,_J_) (SP_VEC_TYPE)vmulq_f32( _I_ , _J_ );
115#define MUL_VEC_PD(_I_,_J_) (DP_VEC_TYPE)vmulq_f64( _I_ , _J_ );
116
117#define FMA_VEC_PH(_I_,_J_,_K_) (HP_VEC_TYPE)vfmaq_f16( _K_ , _J_ , _I_ );
118#define FMA_VEC_PS(_I_,_J_,_K_) (SP_VEC_TYPE)vfmaq_f32( _K_ , _J_ , _I_ );
119#define FMA_VEC_PD(_I_,_J_,_K_) (DP_VEC_TYPE)vfmaq_f64( _K_ , _J_ , _I_ );
120
/*
 * Half-precision scalar helpers.
 *
 * There is no scalar FMA intrinsic available on this architecture, so
 * FMA_VEC_SH broadcasts its three operands into vectors, runs the vector
 * FMA, and stores element 0 of the result into its first argument.
 * The expression macros carry their own trailing ';'.
 */
#define SET_VEC_SH(x)     x ;
#define ADD_VEC_SH(x,y)   vaddh_f16( x , y );
#define MUL_VEC_SH(x,y)   vmulh_f16( x , y );
#define SQRT_VEC_SH(x)    vsqrth_f16( x );
#define FMA_VEC_SH(dst,x,y,z) {\
    HP_VEC_TYPE fma_sh_x_ = SET_VEC_PH(x);\
    HP_VEC_TYPE fma_sh_y_ = SET_VEC_PH(y);\
    HP_VEC_TYPE fma_sh_z_ = SET_VEC_PH(z);\
    HP_VEC_TYPE fma_sh_r_;\
    fma_sh_r_ = FMA_VEC_PH( fma_sh_x_ , fma_sh_y_ , fma_sh_z_ );\
    dst = ((half*)&(fma_sh_r_))[0];\
}
134
/*
 * Single-precision scalar helpers.
 *
 * SET/ADD/MUL expand to plain C expressions.  The trailing ';' inside
 * each expansion is kept so existing call sites of the form
 * `r = ADD_VEC_SS( a , b );` are unaffected.  Arguments and the whole
 * expression are parenthesized so that compound operands keep their
 * grouping: MUL_VEC_SS( a + b , c ) is (a + b) * c, not a + b * c.
 */
#define SET_VEC_SS(x)     ( x );
#define ADD_VEC_SS(x,y)   ( ( x ) + ( y ) );
#define MUL_VEC_SS(x,y)   ( ( x ) * ( y ) );

/*
 * Scalar FMA expressed through the vector path: broadcast the three
 * operands with SET_VEC_PS, combine them with FMA_VEC_PS, then store
 * element 0 of the result into `dst`.
 *
 * This is a braced statement, not an expression.  The temporaries use
 * fma_ss_*_ names so they cannot capture a caller variable that happens
 * to be passed as an argument (the previous arg1/arg2/arg3/argTmp names
 * could).
 */
#define FMA_VEC_SS(dst,x,y,z) {\
    SP_VEC_TYPE fma_ss_x_ = SET_VEC_PS(x);\
    SP_VEC_TYPE fma_ss_y_ = SET_VEC_PS(y);\
    SP_VEC_TYPE fma_ss_z_ = SET_VEC_PS(z);\
    SP_VEC_TYPE fma_ss_r_;\
    fma_ss_r_ = FMA_VEC_PS( fma_ss_x_ , fma_ss_y_ , fma_ss_z_ );\
    ( dst ) = ((SP_SCALAR_TYPE*)&(fma_ss_r_))[0];\
}
146
/*
 * Double-precision scalar helpers.
 *
 * SET/ADD/MUL expand to plain C expressions.  The trailing ';' inside
 * each expansion is kept so existing call sites of the form
 * `r = ADD_VEC_SD( a , b );` are unaffected.  Arguments and the whole
 * expression are parenthesized so that compound operands keep their
 * grouping: MUL_VEC_SD( a + b , c ) is (a + b) * c, not a + b * c.
 */
#define SET_VEC_SD(x)     ( x );
#define ADD_VEC_SD(x,y)   ( ( x ) + ( y ) );
#define MUL_VEC_SD(x,y)   ( ( x ) * ( y ) );

/*
 * Scalar FMA expressed through the vector path: broadcast the three
 * operands with SET_VEC_PD, combine them with FMA_VEC_PD, then store
 * element 0 of the result into `dst`.
 *
 * This is a braced statement, not an expression.  The temporaries use
 * fma_sd_*_ names so they cannot capture a caller variable that happens
 * to be passed as an argument (the previous arg1/arg2/arg3/argTmp names
 * could).
 */
#define FMA_VEC_SD(dst,x,y,z) {\
    DP_VEC_TYPE fma_sd_x_ = SET_VEC_PD(x);\
    DP_VEC_TYPE fma_sd_y_ = SET_VEC_PD(y);\
    DP_VEC_TYPE fma_sd_z_ = SET_VEC_PD(z);\
    DP_VEC_TYPE fma_sd_r_;\
    fma_sd_r_ = FMA_VEC_PD( fma_sd_x_ , fma_sd_y_ , fma_sd_z_ );\
    ( dst ) = ((DP_SCALAR_TYPE*)&(fma_sd_r_))[0];\
}
158
159#elif defined(POWER)
160void test_hp_power_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
161void test_sp_power_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
162void test_dp_power_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
163void test_hp_power_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
164void test_sp_power_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
165void test_dp_power_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
166
167#include <altivec.h>
168
169typedef float SP_SCALAR_TYPE;
170typedef double DP_SCALAR_TYPE;
171typedef __vector float SP_VEC_TYPE;
172typedef __vector double DP_VEC_TYPE;
173
174#define SET_VEC_PS(_I_) (SP_VEC_TYPE){ _I_ , _I_ , _I_ , _I_ };
175#define SET_VEC_PD(_I_) (DP_VEC_TYPE){ _I_ , _I_ };
176
177#define ADD_VEC_PS(_I_,_J_) (SP_VEC_TYPE)vec_add( _I_ , _J_ );
178#define ADD_VEC_PD(_I_,_J_) (DP_VEC_TYPE)vec_add( _I_ , _J_ );
179
180#define MUL_VEC_PS(_I_,_J_) (SP_VEC_TYPE)vec_mul( _I_ , _J_ );
181#define MUL_VEC_PD(_I_,_J_) (DP_VEC_TYPE)vec_mul( _I_ , _J_ );
182
183#define FMA_VEC_PS(_I_,_J_,_K_) (SP_VEC_TYPE)vec_madd( _I_ , _J_ , _K_ );
184#define FMA_VEC_PD(_I_,_J_,_K_) (DP_VEC_TYPE)vec_madd( _I_ , _J_ , _K_ );
185
/*
 * Single-precision scalar helpers.
 *
 * There is no scalar FMA intrinsic available on this architecture, so
 * FMA_VEC_SS below goes through the vector FMA instead.
 *
 * SET/ADD/MUL expand to plain C expressions.  The trailing ';' inside
 * each expansion is kept so existing call sites of the form
 * `r = ADD_VEC_SS( a , b );` are unaffected.  Arguments and the whole
 * expression are parenthesized so that compound operands keep their
 * grouping: MUL_VEC_SS( a + b , c ) is (a + b) * c, not a + b * c.
 */
#define SET_VEC_SS(x)     ( x );
#define ADD_VEC_SS(x,y)   ( ( x ) + ( y ) );
#define MUL_VEC_SS(x,y)   ( ( x ) * ( y ) );

/*
 * Scalar FMA expressed through the vector path: broadcast the three
 * operands with SET_VEC_PS, combine them with FMA_VEC_PS, then store
 * element 0 of the result into `dst`.
 *
 * This is a braced statement, not an expression.  The temporaries use
 * fma_ss_*_ names so they cannot capture a caller variable that happens
 * to be passed as an argument (the previous arg1/arg2/arg3/argTmp names
 * could).
 */
#define FMA_VEC_SS(dst,x,y,z) {\
    SP_VEC_TYPE fma_ss_x_ = SET_VEC_PS(x);\
    SP_VEC_TYPE fma_ss_y_ = SET_VEC_PS(y);\
    SP_VEC_TYPE fma_ss_z_ = SET_VEC_PS(z);\
    SP_VEC_TYPE fma_ss_r_;\
    fma_ss_r_ = FMA_VEC_PS( fma_ss_x_ , fma_ss_y_ , fma_ss_z_ );\
    ( dst ) = ((SP_SCALAR_TYPE*)&(fma_ss_r_))[0];\
}
198
/*
 * Double-precision scalar helpers.
 *
 * SET/ADD/MUL expand to plain C expressions.  The trailing ';' inside
 * each expansion is kept so existing call sites of the form
 * `r = ADD_VEC_SD( a , b );` are unaffected.  Arguments and the whole
 * expression are parenthesized so that compound operands keep their
 * grouping: MUL_VEC_SD( a + b , c ) is (a + b) * c, not a + b * c.
 */
#define SET_VEC_SD(x)     ( x );
#define ADD_VEC_SD(x,y)   ( ( x ) + ( y ) );
#define MUL_VEC_SD(x,y)   ( ( x ) * ( y ) );

/*
 * Scalar FMA expressed through the vector path: broadcast the three
 * operands with SET_VEC_PD, combine them with FMA_VEC_PD, then store
 * element 0 of the result into `dst`.
 *
 * This is a braced statement, not an expression.  The temporaries use
 * fma_sd_*_ names so they cannot capture a caller variable that happens
 * to be passed as an argument (the previous arg1/arg2/arg3/argTmp names
 * could).
 */
#define FMA_VEC_SD(dst,x,y,z) {\
    DP_VEC_TYPE fma_sd_x_ = SET_VEC_PD(x);\
    DP_VEC_TYPE fma_sd_y_ = SET_VEC_PD(y);\
    DP_VEC_TYPE fma_sd_z_ = SET_VEC_PD(z);\
    DP_VEC_TYPE fma_sd_r_;\
    fma_sd_r_ = FMA_VEC_PD( fma_sd_x_ , fma_sd_y_ , fma_sd_z_ );\
    ( dst ) = ((DP_SCALAR_TYPE*)&(fma_sd_r_))[0];\
}
210
211#endif
unsigned long long uint64
Definition: cat_arch.h:3
static int EventSet
Definition: init_fini.c:8
static FILE * fp