/*
 * Architecture-specific entry points for the single-precision vector FMA
 * test.  Each variant has its own name but the same parameter list:
 * instr_per_loop, iterations, EventSet and an output FILE pointer fp.
 *
 * The three x86 variants are selected by the X86_VEC_WIDTH_128B,
 * X86_VEC_WIDTH_512B and X86_VEC_WIDTH_256B macros.
 *
 * NOTE(review): only the opening lines of each definition are visible in
 * this excerpt.  The function bodies, the conditionals guarding the ARM and
 * POWER variants, and the closing #endif are not shown -- confirm against
 * the full file.
 */
9#if defined(X86_VEC_WIDTH_128B)
10void test_sp_x86_128B_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
13#elif defined(X86_VEC_WIDTH_512B)
14void test_sp_x86_512B_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
17#elif defined(X86_VEC_WIDTH_256B)
18void test_sp_x86_256B_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
/* ARM variant; its guarding preprocessor line is not visible here. */
22void test_sp_arm_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
/* POWER variant; its guarding preprocessor line is not visible here. */
26void test_sp_power_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
/*
 * Kernel fragment issuing 12 FMA_VEC_PS calls per loop pass.
 * NOTE(review): presumably the body of test_sp_mac_VEC_FMA_12 -- the
 * function header, the declarations of c and out, the loop's closing brace
 * and the return are not visible in this excerpt.
 */
/* Sixteen vector working values, declared with the `register` hint. */
36 register SP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Give every vector a distinct starting constant, 0.01 through 0.16.
 * NOTE(review): SET_VEC_PS presumably broadcasts the value to every
 * element -- confirm against the macro definition. */
39 r0 = SET_VEC_PS(0.01);
40 r1 = SET_VEC_PS(0.02);
41 r2 = SET_VEC_PS(0.03);
42 r3 = SET_VEC_PS(0.04);
43 r4 = SET_VEC_PS(0.05);
44 r5 = SET_VEC_PS(0.06);
45 r6 = SET_VEC_PS(0.07);
46 r7 = SET_VEC_PS(0.08);
47 r8 = SET_VEC_PS(0.09);
48 r9 = SET_VEC_PS(0.10);
49 rA = SET_VEC_PS(0.11);
50 rB = SET_VEC_PS(0.12);
51 rC = SET_VEC_PS(0.13);
52 rD = SET_VEC_PS(0.14);
53 rE = SET_VEC_PS(0.15);
54 rF = SET_VEC_PS(0.16);
/* Repeat while the counter c is below the requested iteration count. */
62 while (
c < iterations){
/* Two groups of six FMA_VEC_PS calls.  Each call overwrites one of the
 * accumulators r0..r5 and only reads r7..rF, so the six calls within a
 * group do not depend on one another. */
67 r0 = FMA_VEC_PS(r0,r7,r9);
68 r1 = FMA_VEC_PS(r1,r8,rA);
69 r2 = FMA_VEC_PS(r2,r9,rB);
70 r3 = FMA_VEC_PS(r3,rA,rC);
71 r4 = FMA_VEC_PS(r4,rB,rD);
72 r5 = FMA_VEC_PS(r5,rC,rE);
74 r0 = FMA_VEC_PS(r0,rD,rF);
75 r1 = FMA_VEC_PS(r1,rC,rE);
76 r2 = FMA_VEC_PS(r2,rB,rD);
77 r3 = FMA_VEC_PS(r3,rA,rC);
78 r4 = FMA_VEC_PS(r4,r9,rB);
79 r5 = FMA_VEC_PS(r5,r8,rA);
/* Fold r0..r6 into r0 so that every accumulator feeds the final value.
 * NOTE(review): presumably this keeps the compiler from discarding the
 * FMA work as dead code -- confirm. */
90 r0 = ADD_VEC_PS(r0,r1);
91 r2 = ADD_VEC_PS(r2,r3);
92 r4 = ADD_VEC_PS(r4,r5);
94 r0 = ADD_VEC_PS(r0,r6);
95 r2 = ADD_VEC_PS(r2,r4);
97 r0 = ADD_VEC_PS(r0,r2);
/* r0 is declared `register`, so its address cannot be taken; copy it into
 * an addressable temporary before reading individual floats. */
100 SP_VEC_TYPE temp = r0;
/* Add the first four float elements of temp to out.  Elements past index 3
 * (if SP_VEC_TYPE holds more) are not read. */
101 out += ((
float*)&temp)[0];
102 out += ((
float*)&temp)[1];
103 out += ((
float*)&temp)[2];
104 out += ((
float*)&temp)[3];
/*
 * Kernel fragment issuing 24 FMA_VEC_PS calls per loop pass.
 * NOTE(review): presumably the body of test_sp_mac_VEC_FMA_24 -- the
 * function header, the declarations of c and out, the loop's closing brace
 * and the return are not visible in this excerpt.
 */
/* Sixteen vector working values, declared with the `register` hint. */
114 register SP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Give every vector a distinct starting constant, 0.01 through 0.16.
 * NOTE(review): SET_VEC_PS presumably broadcasts the value to every
 * element -- confirm against the macro definition. */
117 r0 = SET_VEC_PS(0.01);
118 r1 = SET_VEC_PS(0.02);
119 r2 = SET_VEC_PS(0.03);
120 r3 = SET_VEC_PS(0.04);
121 r4 = SET_VEC_PS(0.05);
122 r5 = SET_VEC_PS(0.06);
123 r6 = SET_VEC_PS(0.07);
124 r7 = SET_VEC_PS(0.08);
125 r8 = SET_VEC_PS(0.09);
126 r9 = SET_VEC_PS(0.10);
127 rA = SET_VEC_PS(0.11);
128 rB = SET_VEC_PS(0.12);
129 rC = SET_VEC_PS(0.13);
130 rD = SET_VEC_PS(0.14);
131 rE = SET_VEC_PS(0.15);
132 rF = SET_VEC_PS(0.16);
/* Repeat while the counter c is below the requested iteration count. */
140 while (
c < iterations){
/* Four groups of six FMA_VEC_PS calls (the same two operand patterns,
 * repeated twice).  Each call overwrites one of the accumulators r0..r5
 * and only reads r7..rF, so the six calls within a group do not depend on
 * one another. */
145 r0 = FMA_VEC_PS(r0,r7,r9);
146 r1 = FMA_VEC_PS(r1,r8,rA);
147 r2 = FMA_VEC_PS(r2,r9,rB);
148 r3 = FMA_VEC_PS(r3,rA,rC);
149 r4 = FMA_VEC_PS(r4,rB,rD);
150 r5 = FMA_VEC_PS(r5,rC,rE);
152 r0 = FMA_VEC_PS(r0,rD,rF);
153 r1 = FMA_VEC_PS(r1,rC,rE);
154 r2 = FMA_VEC_PS(r2,rB,rD);
155 r3 = FMA_VEC_PS(r3,rA,rC);
156 r4 = FMA_VEC_PS(r4,r9,rB);
157 r5 = FMA_VEC_PS(r5,r8,rA);
159 r0 = FMA_VEC_PS(r0,r7,r9);
160 r1 = FMA_VEC_PS(r1,r8,rA);
161 r2 = FMA_VEC_PS(r2,r9,rB);
162 r3 = FMA_VEC_PS(r3,rA,rC);
163 r4 = FMA_VEC_PS(r4,rB,rD);
164 r5 = FMA_VEC_PS(r5,rC,rE);
166 r0 = FMA_VEC_PS(r0,rD,rF);
167 r1 = FMA_VEC_PS(r1,rC,rE);
168 r2 = FMA_VEC_PS(r2,rB,rD);
169 r3 = FMA_VEC_PS(r3,rA,rC);
170 r4 = FMA_VEC_PS(r4,r9,rB);
171 r5 = FMA_VEC_PS(r5,r8,rA);
/* Fold r0..r6 into r0 so that every accumulator feeds the final value.
 * NOTE(review): presumably this keeps the compiler from discarding the
 * FMA work as dead code -- confirm. */
182 r0 = ADD_VEC_PS(r0,r1);
183 r2 = ADD_VEC_PS(r2,r3);
184 r4 = ADD_VEC_PS(r4,r5);
186 r0 = ADD_VEC_PS(r0,r6);
187 r2 = ADD_VEC_PS(r2,r4);
189 r0 = ADD_VEC_PS(r0,r2);
/* r0 is declared `register`, so its address cannot be taken; copy it into
 * an addressable temporary before reading individual floats. */
192 SP_VEC_TYPE temp = r0;
/* Add the first four float elements of temp to out.  Elements past index 3
 * (if SP_VEC_TYPE holds more) are not read. */
193 out += ((
float*)&temp)[0];
194 out += ((
float*)&temp)[1];
195 out += ((
float*)&temp)[2];
196 out += ((
float*)&temp)[3];
/*
 * Kernel fragment issuing 48 FMA_VEC_PS calls per loop pass.
 * NOTE(review): presumably the body of test_sp_mac_VEC_FMA_48 -- the
 * function header, the declarations of c and out, the loop's closing brace
 * and the return are not visible in this excerpt.
 */
/* Sixteen vector working values, declared with the `register` hint. */
206 register SP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Give every vector a distinct starting constant, 0.01 through 0.16.
 * NOTE(review): SET_VEC_PS presumably broadcasts the value to every
 * element -- confirm against the macro definition. */
209 r0 = SET_VEC_PS(0.01);
210 r1 = SET_VEC_PS(0.02);
211 r2 = SET_VEC_PS(0.03);
212 r3 = SET_VEC_PS(0.04);
213 r4 = SET_VEC_PS(0.05);
214 r5 = SET_VEC_PS(0.06);
215 r6 = SET_VEC_PS(0.07);
216 r7 = SET_VEC_PS(0.08);
217 r8 = SET_VEC_PS(0.09);
218 r9 = SET_VEC_PS(0.10);
219 rA = SET_VEC_PS(0.11);
220 rB = SET_VEC_PS(0.12);
221 rC = SET_VEC_PS(0.13);
222 rD = SET_VEC_PS(0.14);
223 rE = SET_VEC_PS(0.15);
224 rF = SET_VEC_PS(0.16);
/* Repeat while the counter c is below the requested iteration count. */
232 while (
c < iterations){
/* Eight groups of six FMA_VEC_PS calls (the same two operand patterns,
 * repeated four times).  Each call overwrites one of the accumulators
 * r0..r5 and only reads r7..rF, so the six calls within a group do not
 * depend on one another. */
237 r0 = FMA_VEC_PS(r0,r7,r9);
238 r1 = FMA_VEC_PS(r1,r8,rA);
239 r2 = FMA_VEC_PS(r2,r9,rB);
240 r3 = FMA_VEC_PS(r3,rA,rC);
241 r4 = FMA_VEC_PS(r4,rB,rD);
242 r5 = FMA_VEC_PS(r5,rC,rE);
244 r0 = FMA_VEC_PS(r0,rD,rF);
245 r1 = FMA_VEC_PS(r1,rC,rE);
246 r2 = FMA_VEC_PS(r2,rB,rD);
247 r3 = FMA_VEC_PS(r3,rA,rC);
248 r4 = FMA_VEC_PS(r4,r9,rB);
249 r5 = FMA_VEC_PS(r5,r8,rA);
251 r0 = FMA_VEC_PS(r0,r7,r9);
252 r1 = FMA_VEC_PS(r1,r8,rA);
253 r2 = FMA_VEC_PS(r2,r9,rB);
254 r3 = FMA_VEC_PS(r3,rA,rC);
255 r4 = FMA_VEC_PS(r4,rB,rD);
256 r5 = FMA_VEC_PS(r5,rC,rE);
258 r0 = FMA_VEC_PS(r0,rD,rF);
259 r1 = FMA_VEC_PS(r1,rC,rE);
260 r2 = FMA_VEC_PS(r2,rB,rD);
261 r3 = FMA_VEC_PS(r3,rA,rC);
262 r4 = FMA_VEC_PS(r4,r9,rB);
263 r5 = FMA_VEC_PS(r5,r8,rA);
265 r0 = FMA_VEC_PS(r0,r7,r9);
266 r1 = FMA_VEC_PS(r1,r8,rA);
267 r2 = FMA_VEC_PS(r2,r9,rB);
268 r3 = FMA_VEC_PS(r3,rA,rC);
269 r4 = FMA_VEC_PS(r4,rB,rD);
270 r5 = FMA_VEC_PS(r5,rC,rE);
272 r0 = FMA_VEC_PS(r0,rD,rF);
273 r1 = FMA_VEC_PS(r1,rC,rE);
274 r2 = FMA_VEC_PS(r2,rB,rD);
275 r3 = FMA_VEC_PS(r3,rA,rC);
276 r4 = FMA_VEC_PS(r4,r9,rB);
277 r5 = FMA_VEC_PS(r5,r8,rA);
279 r0 = FMA_VEC_PS(r0,r7,r9);
280 r1 = FMA_VEC_PS(r1,r8,rA);
281 r2 = FMA_VEC_PS(r2,r9,rB);
282 r3 = FMA_VEC_PS(r3,rA,rC);
283 r4 = FMA_VEC_PS(r4,rB,rD);
284 r5 = FMA_VEC_PS(r5,rC,rE);
286 r0 = FMA_VEC_PS(r0,rD,rF);
287 r1 = FMA_VEC_PS(r1,rC,rE);
288 r2 = FMA_VEC_PS(r2,rB,rD);
289 r3 = FMA_VEC_PS(r3,rA,rC);
290 r4 = FMA_VEC_PS(r4,r9,rB);
291 r5 = FMA_VEC_PS(r5,r8,rA);
/* Fold r0..r6 into r0 so that every accumulator feeds the final value.
 * NOTE(review): presumably this keeps the compiler from discarding the
 * FMA work as dead code -- confirm. */
302 r0 = ADD_VEC_PS(r0,r1);
303 r2 = ADD_VEC_PS(r2,r3);
304 r4 = ADD_VEC_PS(r4,r5);
306 r0 = ADD_VEC_PS(r0,r6);
307 r2 = ADD_VEC_PS(r2,r4);
309 r0 = ADD_VEC_PS(r0,r2);
/* r0 is declared `register`, so its address cannot be taken; copy it into
 * an addressable temporary before reading individual floats. */
312 SP_VEC_TYPE temp = r0;
/* Add the first four float elements of temp to out.  Elements past index 3
 * (if SP_VEC_TYPE holds more) are not read. */
313 out += ((
float*)&temp)[0];
314 out += ((
float*)&temp)[1];
315 out += ((
float*)&temp)[2];
316 out += ((
float*)&temp)[3];
/*
 * Result-check fragment.
 * NOTE(review): presumably part of test_sp_VEC_FMA -- the function header,
 * the declaration of sum, the bodies of the three branches and all closing
 * braces are not visible in this excerpt.
 */
/* Reference value that the vector result is compared against below. */
325 float scalar_sum = 0.0;
/* Branch on the requested number of instructions per loop: 12, 24 or 48. */
327 if ( instr_per_loop == 12 ) {
331 else if ( instr_per_loop == 24 ) {
335 else if ( instr_per_loop == 48 ) {
/* Compare a quarter of sum with scalar_sum using exact floating-point
 * inequality and warn on stderr when they differ.
 * NOTE(review): the division by 4.0 presumably corresponds to the four
 * float elements each vector kernel adds into its result -- confirm. */
340 if( sum/4.0 != scalar_sum ) {
341 fprintf(
stderr,
"FMA: Inconsistent FLOP results detected!\n");
unsigned long long uint64
Start counting hardware events in an event set.
static double c[MATRIX_SIZE][MATRIX_SIZE]
static float test_sp_mac_VEC_FMA_24(uint64 iterations, int EventSet, FILE *fp)
static float test_sp_mac_VEC_FMA_48(uint64 iterations, int EventSet, FILE *fp)
static void test_sp_VEC_FMA(int instr_per_loop, uint64 iterations, int EventSet, FILE *fp)
static float test_sp_mac_VEC_FMA_12(uint64 iterations, int EventSet, FILE *fp)
float test_sp_scalar_VEC_FMA_12(uint64 iterations)
void papi_stop_and_print(long long theory, int EventSet, FILE *fp)
float test_sp_scalar_VEC_FMA_48(uint64 iterations)
float test_sp_scalar_VEC_FMA_24(uint64 iterations)