/*
 * Entry-point signatures for the single-precision vector test.
 *
 * The preprocessor picks the x86 function name from X86_VEC_WIDTH_128B,
 * X86_VEC_WIDTH_512B or X86_VEC_WIDTH_256B.  Every variant takes the same
 * parameters: instr_per_loop, iterations, EventSet and an output FILE *fp.
 *
 * NOTE(review): the function bodies and the closing #else/#endif lines are
 * not visible in this listing, so what each entry point does with its
 * arguments cannot be confirmed from here.
 */
9#if defined(X86_VEC_WIDTH_128B)
10void test_sp_x86_128B_VEC(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
13#elif defined(X86_VEC_WIDTH_512B)
14void test_sp_x86_512B_VEC(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
13#elif defined(X86_VEC_WIDTH_256B)
14void test_sp_x86_256B_VEC(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
/*
 * ARM and POWER variants with the same parameter list.
 * NOTE(review): the preprocessor conditions that select these two are not
 * visible in this listing.
 */
22void test_sp_arm_VEC(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
26void test_sp_power_VEC(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
/*
 * Vector kernel with 24 MUL/ADD statements in the visible loop body.
 *
 * NOTE(review): the function header, the declarations of `c` and `out`,
 * the update of `c`, and the closing braces are not visible in this
 * listing.  This is presumably the body of test_sp_mac_VEC_24 -- confirm.
 */
/* Sixteen vector variables, all declared with the `register` storage class. */
36 register SP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Give each variable its own starting constant, 0.01 through 0.16. */
39 r0 = SET_VEC_PS(0.01);
40 r1 = SET_VEC_PS(0.02);
41 r2 = SET_VEC_PS(0.03);
42 r3 = SET_VEC_PS(0.04);
43 r4 = SET_VEC_PS(0.05);
44 r5 = SET_VEC_PS(0.06);
45 r6 = SET_VEC_PS(0.07);
46 r7 = SET_VEC_PS(0.08);
47 r8 = SET_VEC_PS(0.09);
48 r9 = SET_VEC_PS(0.10);
49 rA = SET_VEC_PS(0.11);
50 rB = SET_VEC_PS(0.12);
51 rC = SET_VEC_PS(0.13);
52 rD = SET_VEC_PS(0.14);
53 rE = SET_VEC_PS(0.15);
54 rF = SET_VEC_PS(0.16);
/* Repeat the kernel while the counter `c` is below `iterations`. */
62 while (
c < iterations){
/*
 * Group 1 (12 statements): r0..rB are each updated in place, alternating
 * MUL and ADD.  The second operand cycles through rC, rD, rE, rF, which
 * are only read, never written, in the visible code.
 */
67 r0 = MUL_VEC_PS(r0,rC);
68 r1 = ADD_VEC_PS(r1,rD);
69 r2 = MUL_VEC_PS(r2,rE);
70 r3 = ADD_VEC_PS(r3,rF);
71 r4 = MUL_VEC_PS(r4,rC);
72 r5 = ADD_VEC_PS(r5,rD);
73 r6 = MUL_VEC_PS(r6,rE);
74 r7 = ADD_VEC_PS(r7,rF);
75 r8 = MUL_VEC_PS(r8,rC);
76 r9 = ADD_VEC_PS(r9,rD);
77 rA = MUL_VEC_PS(rA,rE);
78 rB = ADD_VEC_PS(rB,rF);
/*
 * Group 2 (12 statements): same twelve destinations, with ADD where
 * group 1 used MUL and vice versa, and the second operand cycling
 * rF, rE, rD, rC.
 */
80 r0 = ADD_VEC_PS(r0,rF);
81 r1 = MUL_VEC_PS(r1,rE);
82 r2 = ADD_VEC_PS(r2,rD);
83 r3 = MUL_VEC_PS(r3,rC);
84 r4 = ADD_VEC_PS(r4,rF);
85 r5 = MUL_VEC_PS(r5,rE);
86 r6 = ADD_VEC_PS(r6,rD);
87 r7 = MUL_VEC_PS(r7,rC);
88 r8 = ADD_VEC_PS(r8,rF);
89 r9 = MUL_VEC_PS(r9,rE);
90 rA = ADD_VEC_PS(rA,rD);
91 rB = MUL_VEC_PS(rB,rC);
/*
 * Fold r0..rB into r0 with pairwise adds (12 -> 6 -> 3 -> 1 values).
 * NOTE(review): the loop's closing brace is not visible, so it cannot be
 * told from here whether this reduction runs inside or after the loop.
 * Presumably it exists so the computed values are used -- confirm.
 */
102 r0 = ADD_VEC_PS(r0,r1);
103 r2 = ADD_VEC_PS(r2,r3);
104 r4 = ADD_VEC_PS(r4,r5);
105 r6 = ADD_VEC_PS(r6,r7);
106 r8 = ADD_VEC_PS(r8,r9);
107 rA = ADD_VEC_PS(rA,rB);
109 r0 = ADD_VEC_PS(r0,r2);
110 r4 = ADD_VEC_PS(r4,r6);
111 r8 = ADD_VEC_PS(r8,rA);
113 r0 = ADD_VEC_PS(r0,r4);
114 r0 = ADD_VEC_PS(r0,r8);
/*
 * Copy r0 to an addressable temporary and add its first four float
 * elements to `out`.
 * NOTE(review): only indices 0..3 are read.  If SP_VEC_TYPE holds more
 * than four floats on some targets, the remaining elements are not
 * included -- confirm this is intended.
 */
117 SP_VEC_TYPE temp = r0;
118 out += ((
float*)&temp)[0];
119 out += ((
float*)&temp)[1];
120 out += ((
float*)&temp)[2];
121 out += ((
float*)&temp)[3];
/*
 * Vector kernel with 48 MUL/ADD statements in the visible loop body
 * (four groups of twelve).
 *
 * NOTE(review): the function header, the declarations of `c` and `out`,
 * the update of `c`, and the closing braces are not visible in this
 * listing.  This is presumably the body of test_sp_mac_VEC_48 -- confirm.
 */
/* Sixteen vector variables, all declared with the `register` storage class. */
131 register SP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Give each variable its own starting constant, 0.01 through 0.16. */
134 r0 = SET_VEC_PS(0.01);
135 r1 = SET_VEC_PS(0.02);
136 r2 = SET_VEC_PS(0.03);
137 r3 = SET_VEC_PS(0.04);
138 r4 = SET_VEC_PS(0.05);
139 r5 = SET_VEC_PS(0.06);
140 r6 = SET_VEC_PS(0.07);
141 r7 = SET_VEC_PS(0.08);
142 r8 = SET_VEC_PS(0.09);
143 r9 = SET_VEC_PS(0.10);
144 rA = SET_VEC_PS(0.11);
145 rB = SET_VEC_PS(0.12);
146 rC = SET_VEC_PS(0.13);
147 rD = SET_VEC_PS(0.14);
148 rE = SET_VEC_PS(0.15);
149 rF = SET_VEC_PS(0.16);
/* Repeat the kernel while the counter `c` is below `iterations`. */
157 while (
c < iterations){
/*
 * Group 1 (12 statements): r0..rB are each updated in place, alternating
 * MUL and ADD.  The second operand cycles through rC, rD, rE, rF, which
 * are only read, never written, in the visible code.
 */
162 r0 = MUL_VEC_PS(r0,rC);
163 r1 = ADD_VEC_PS(r1,rD);
164 r2 = MUL_VEC_PS(r2,rE);
165 r3 = ADD_VEC_PS(r3,rF);
166 r4 = MUL_VEC_PS(r4,rC);
167 r5 = ADD_VEC_PS(r5,rD);
168 r6 = MUL_VEC_PS(r6,rE);
169 r7 = ADD_VEC_PS(r7,rF);
170 r8 = MUL_VEC_PS(r8,rC);
171 r9 = ADD_VEC_PS(r9,rD);
172 rA = MUL_VEC_PS(rA,rE);
173 rB = ADD_VEC_PS(rB,rF);
/*
 * Group 2 (12 statements): same destinations, with ADD where group 1
 * used MUL and vice versa, and the second operand cycling rF, rE, rD, rC.
 */
175 r0 = ADD_VEC_PS(r0,rF);
176 r1 = MUL_VEC_PS(r1,rE);
177 r2 = ADD_VEC_PS(r2,rD);
178 r3 = MUL_VEC_PS(r3,rC);
179 r4 = ADD_VEC_PS(r4,rF);
180 r5 = MUL_VEC_PS(r5,rE);
181 r6 = ADD_VEC_PS(r6,rD);
182 r7 = MUL_VEC_PS(r7,rC);
183 r8 = ADD_VEC_PS(r8,rF);
184 r9 = MUL_VEC_PS(r9,rE);
185 rA = ADD_VEC_PS(rA,rD);
186 rB = MUL_VEC_PS(rB,rC);
/* Group 3: statement-for-statement repeat of group 1. */
188 r0 = MUL_VEC_PS(r0,rC);
189 r1 = ADD_VEC_PS(r1,rD);
190 r2 = MUL_VEC_PS(r2,rE);
191 r3 = ADD_VEC_PS(r3,rF);
192 r4 = MUL_VEC_PS(r4,rC);
193 r5 = ADD_VEC_PS(r5,rD);
194 r6 = MUL_VEC_PS(r6,rE);
195 r7 = ADD_VEC_PS(r7,rF);
196 r8 = MUL_VEC_PS(r8,rC);
197 r9 = ADD_VEC_PS(r9,rD);
198 rA = MUL_VEC_PS(rA,rE);
199 rB = ADD_VEC_PS(rB,rF);
/* Group 4: statement-for-statement repeat of group 2. */
201 r0 = ADD_VEC_PS(r0,rF);
202 r1 = MUL_VEC_PS(r1,rE);
203 r2 = ADD_VEC_PS(r2,rD);
204 r3 = MUL_VEC_PS(r3,rC);
205 r4 = ADD_VEC_PS(r4,rF);
206 r5 = MUL_VEC_PS(r5,rE);
207 r6 = ADD_VEC_PS(r6,rD);
208 r7 = MUL_VEC_PS(r7,rC);
209 r8 = ADD_VEC_PS(r8,rF);
210 r9 = MUL_VEC_PS(r9,rE);
211 rA = ADD_VEC_PS(rA,rD);
212 rB = MUL_VEC_PS(rB,rC);
/*
 * Fold r0..rB into r0 with pairwise adds (12 -> 6 -> 3 -> 1 values).
 * NOTE(review): the loop's closing brace is not visible, so it cannot be
 * told from here whether this reduction runs inside or after the loop.
 * Presumably it exists so the computed values are used -- confirm.
 */
223 r0 = ADD_VEC_PS(r0,r1);
224 r2 = ADD_VEC_PS(r2,r3);
225 r4 = ADD_VEC_PS(r4,r5);
226 r6 = ADD_VEC_PS(r6,r7);
227 r8 = ADD_VEC_PS(r8,r9);
228 rA = ADD_VEC_PS(rA,rB);
230 r0 = ADD_VEC_PS(r0,r2);
231 r4 = ADD_VEC_PS(r4,r6);
232 r8 = ADD_VEC_PS(r8,rA);
234 r0 = ADD_VEC_PS(r0,r4);
235 r0 = ADD_VEC_PS(r0,r8);
/*
 * Copy r0 to an addressable temporary and add its first four float
 * elements to `out`.
 * NOTE(review): only indices 0..3 are read.  If SP_VEC_TYPE holds more
 * than four floats on some targets, the remaining elements are not
 * included -- confirm this is intended.
 */
238 SP_VEC_TYPE temp = r0;
239 out += ((
float*)&temp)[0];
240 out += ((
float*)&temp)[1];
241 out += ((
float*)&temp)[2];
242 out += ((
float*)&temp)[3];
/*
 * Vector kernel with 96 MUL/ADD statements in the visible loop body
 * (eight groups of twelve).
 *
 * NOTE(review): the function header, the declarations of `c` and `out`,
 * the update of `c`, and the closing braces are not visible in this
 * listing.  This is presumably the body of test_sp_mac_VEC_96 -- confirm.
 */
/* Sixteen vector variables, all declared with the `register` storage class. */
252 register SP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Give each variable its own starting constant, 0.01 through 0.16. */
255 r0 = SET_VEC_PS(0.01);
256 r1 = SET_VEC_PS(0.02);
257 r2 = SET_VEC_PS(0.03);
258 r3 = SET_VEC_PS(0.04);
259 r4 = SET_VEC_PS(0.05);
260 r5 = SET_VEC_PS(0.06);
261 r6 = SET_VEC_PS(0.07);
262 r7 = SET_VEC_PS(0.08);
263 r8 = SET_VEC_PS(0.09);
264 r9 = SET_VEC_PS(0.10);
265 rA = SET_VEC_PS(0.11);
266 rB = SET_VEC_PS(0.12);
267 rC = SET_VEC_PS(0.13);
268 rD = SET_VEC_PS(0.14);
269 rE = SET_VEC_PS(0.15);
270 rF = SET_VEC_PS(0.16);
/* Repeat the kernel while the counter `c` is below `iterations`. */
278 while (
c < iterations){
/*
 * Group 1 (12 statements): r0..rB are each updated in place, alternating
 * MUL and ADD.  The second operand cycles through rC, rD, rE, rF, which
 * are only read, never written, in the visible code.
 */
283 r0 = MUL_VEC_PS(r0,rC);
284 r1 = ADD_VEC_PS(r1,rD);
285 r2 = MUL_VEC_PS(r2,rE);
286 r3 = ADD_VEC_PS(r3,rF);
287 r4 = MUL_VEC_PS(r4,rC);
288 r5 = ADD_VEC_PS(r5,rD);
289 r6 = MUL_VEC_PS(r6,rE);
290 r7 = ADD_VEC_PS(r7,rF);
291 r8 = MUL_VEC_PS(r8,rC);
292 r9 = ADD_VEC_PS(r9,rD);
293 rA = MUL_VEC_PS(rA,rE);
294 rB = ADD_VEC_PS(rB,rF);
/*
 * Group 2 (12 statements): same destinations, with ADD where group 1
 * used MUL and vice versa, and the second operand cycling rF, rE, rD, rC.
 */
296 r0 = ADD_VEC_PS(r0,rF);
297 r1 = MUL_VEC_PS(r1,rE);
298 r2 = ADD_VEC_PS(r2,rD);
299 r3 = MUL_VEC_PS(r3,rC);
300 r4 = ADD_VEC_PS(r4,rF);
301 r5 = MUL_VEC_PS(r5,rE);
302 r6 = ADD_VEC_PS(r6,rD);
303 r7 = MUL_VEC_PS(r7,rC);
304 r8 = ADD_VEC_PS(r8,rF);
305 r9 = MUL_VEC_PS(r9,rE);
306 rA = ADD_VEC_PS(rA,rD);
307 rB = MUL_VEC_PS(rB,rC);
/* Group 3: statement-for-statement repeat of group 1. */
309 r0 = MUL_VEC_PS(r0,rC);
310 r1 = ADD_VEC_PS(r1,rD);
311 r2 = MUL_VEC_PS(r2,rE);
312 r3 = ADD_VEC_PS(r3,rF);
313 r4 = MUL_VEC_PS(r4,rC);
314 r5 = ADD_VEC_PS(r5,rD);
315 r6 = MUL_VEC_PS(r6,rE);
316 r7 = ADD_VEC_PS(r7,rF);
317 r8 = MUL_VEC_PS(r8,rC);
318 r9 = ADD_VEC_PS(r9,rD);
319 rA = MUL_VEC_PS(rA,rE);
320 rB = ADD_VEC_PS(rB,rF);
/* Group 4: statement-for-statement repeat of group 2. */
322 r0 = ADD_VEC_PS(r0,rF);
323 r1 = MUL_VEC_PS(r1,rE);
324 r2 = ADD_VEC_PS(r2,rD);
325 r3 = MUL_VEC_PS(r3,rC);
326 r4 = ADD_VEC_PS(r4,rF);
327 r5 = MUL_VEC_PS(r5,rE);
328 r6 = ADD_VEC_PS(r6,rD);
329 r7 = MUL_VEC_PS(r7,rC);
330 r8 = ADD_VEC_PS(r8,rF);
331 r9 = MUL_VEC_PS(r9,rE);
332 rA = ADD_VEC_PS(rA,rD);
333 rB = MUL_VEC_PS(rB,rC);
/* Group 5: statement-for-statement repeat of group 1. */
335 r0 = MUL_VEC_PS(r0,rC);
336 r1 = ADD_VEC_PS(r1,rD);
337 r2 = MUL_VEC_PS(r2,rE);
338 r3 = ADD_VEC_PS(r3,rF);
339 r4 = MUL_VEC_PS(r4,rC);
340 r5 = ADD_VEC_PS(r5,rD);
341 r6 = MUL_VEC_PS(r6,rE);
342 r7 = ADD_VEC_PS(r7,rF);
343 r8 = MUL_VEC_PS(r8,rC);
344 r9 = ADD_VEC_PS(r9,rD);
345 rA = MUL_VEC_PS(rA,rE);
346 rB = ADD_VEC_PS(rB,rF);
/* Group 6: statement-for-statement repeat of group 2. */
348 r0 = ADD_VEC_PS(r0,rF);
349 r1 = MUL_VEC_PS(r1,rE);
350 r2 = ADD_VEC_PS(r2,rD);
351 r3 = MUL_VEC_PS(r3,rC);
352 r4 = ADD_VEC_PS(r4,rF);
353 r5 = MUL_VEC_PS(r5,rE);
354 r6 = ADD_VEC_PS(r6,rD);
355 r7 = MUL_VEC_PS(r7,rC);
356 r8 = ADD_VEC_PS(r8,rF);
357 r9 = MUL_VEC_PS(r9,rE);
358 rA = ADD_VEC_PS(rA,rD);
359 rB = MUL_VEC_PS(rB,rC);
/* Group 7: statement-for-statement repeat of group 1. */
361 r0 = MUL_VEC_PS(r0,rC);
362 r1 = ADD_VEC_PS(r1,rD);
363 r2 = MUL_VEC_PS(r2,rE);
364 r3 = ADD_VEC_PS(r3,rF);
365 r4 = MUL_VEC_PS(r4,rC);
366 r5 = ADD_VEC_PS(r5,rD);
367 r6 = MUL_VEC_PS(r6,rE);
368 r7 = ADD_VEC_PS(r7,rF);
369 r8 = MUL_VEC_PS(r8,rC);
370 r9 = ADD_VEC_PS(r9,rD);
371 rA = MUL_VEC_PS(rA,rE);
372 rB = ADD_VEC_PS(rB,rF);
/* Group 8: statement-for-statement repeat of group 2. */
374 r0 = ADD_VEC_PS(r0,rF);
375 r1 = MUL_VEC_PS(r1,rE);
376 r2 = ADD_VEC_PS(r2,rD);
377 r3 = MUL_VEC_PS(r3,rC);
378 r4 = ADD_VEC_PS(r4,rF);
379 r5 = MUL_VEC_PS(r5,rE);
380 r6 = ADD_VEC_PS(r6,rD);
381 r7 = MUL_VEC_PS(r7,rC);
382 r8 = ADD_VEC_PS(r8,rF);
383 r9 = MUL_VEC_PS(r9,rE);
384 rA = ADD_VEC_PS(rA,rD);
385 rB = MUL_VEC_PS(rB,rC);
/*
 * Fold r0..rB into r0 with pairwise adds (12 -> 6 -> 3 -> 1 values).
 * NOTE(review): the loop's closing brace is not visible, so it cannot be
 * told from here whether this reduction runs inside or after the loop.
 * Presumably it exists so the computed values are used -- confirm.
 */
396 r0 = ADD_VEC_PS(r0,r1);
397 r2 = ADD_VEC_PS(r2,r3);
398 r4 = ADD_VEC_PS(r4,r5);
399 r6 = ADD_VEC_PS(r6,r7);
400 r8 = ADD_VEC_PS(r8,r9);
401 rA = ADD_VEC_PS(rA,rB);
403 r0 = ADD_VEC_PS(r0,r2);
404 r4 = ADD_VEC_PS(r4,r6);
405 r8 = ADD_VEC_PS(r8,rA);
407 r0 = ADD_VEC_PS(r0,r4);
408 r0 = ADD_VEC_PS(r0,r8);
/*
 * Copy r0 to an addressable temporary and add its first four float
 * elements to `out`.
 * NOTE(review): only indices 0..3 are read.  If SP_VEC_TYPE holds more
 * than four floats on some targets, the remaining elements are not
 * included -- confirm this is intended.
 */
411 SP_VEC_TYPE temp = r0;
412 out += ((
float*)&temp)[0];
413 out += ((
float*)&temp)[1];
414 out += ((
float*)&temp)[2];
415 out += ((
float*)&temp)[3];
/*
 * Dispatch on instr_per_loop and check the vector result against a scalar
 * reference.
 *
 * NOTE(review): the function header, the declaration of `sum`, the bodies
 * of the three branches and all closing braces are not visible in this
 * listing.  This is presumably part of test_sp_VEC -- confirm.
 */
/* Scalar reference value; starts at zero and is presumably set by a branch below. */
424 float scalar_sum = 0.0;
/* Only the instruction counts 24, 48 and 96 are recognised here. */
426 if ( instr_per_loop == 24 ) {
430 else if ( instr_per_loop == 48 ) {
434 else if ( instr_per_loop == 96 ) {
/*
 * Report a mismatch on stderr when sum/4.0 differs from scalar_sum.
 * NOTE(review): this is an exact floating-point inequality test, so any
 * rounding difference between the two computations triggers the message.
 * The divisor 4.0 presumably matches the four vector elements summed by
 * the kernels -- TODO confirm.
 */
439 if( sum/4.0 != scalar_sum ) {
440 fprintf(
stderr,
"Inconsistent FLOP results detected!\n");
unsigned long long uint64
Start counting hardware events in an event set.
static double c[MATRIX_SIZE][MATRIX_SIZE]
static void test_sp_VEC(int instr_per_loop, uint64 iterations, int EventSet, FILE *fp)
static float test_sp_mac_VEC_96(uint64 iterations, int EventSet, FILE *fp)
static float test_sp_mac_VEC_48(uint64 iterations, int EventSet, FILE *fp)
static float test_sp_mac_VEC_24(uint64 iterations, int EventSet, FILE *fp)
float test_sp_scalar_VEC_96(uint64 iterations)
float test_sp_scalar_VEC_24(uint64 iterations)
void papi_stop_and_print(long long theory, int EventSet, FILE *fp)
float test_sp_scalar_VEC_48(uint64 iterations)