15#if defined(X86_VEC_WIDTH_128B)
16void test_hp_x86_128B_VEC(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
19#elif defined(X86_VEC_WIDTH_512B)
20void test_hp_x86_512B_VEC(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
23#elif defined(X86_VEC_WIDTH_256B)
24void test_hp_x86_256B_VEC(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
28void test_hp_arm_VEC(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
32void test_hp_power_VEC(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
43 register HP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
46 r0 = SET_VEC_PH(0.01);
47 r1 = SET_VEC_PH(0.02);
48 r2 = SET_VEC_PH(0.03);
49 r3 = SET_VEC_PH(0.04);
50 r4 = SET_VEC_PH(0.05);
51 r5 = SET_VEC_PH(0.06);
52 r6 = SET_VEC_PH(0.07);
53 r7 = SET_VEC_PH(0.08);
54 r8 = SET_VEC_PH(0.09);
55 r9 = SET_VEC_PH(0.10);
56 rA = SET_VEC_PH(0.11);
57 rB = SET_VEC_PH(0.12);
58 rC = SET_VEC_PH(0.13);
59 rD = SET_VEC_PH(0.14);
60 rE = SET_VEC_PH(0.15);
61 rF = SET_VEC_PH(0.16);
69 while (
c < iterations){
74 r0 = MUL_VEC_PH(r0,rC);
75 r1 = ADD_VEC_PH(r1,rD);
76 r2 = MUL_VEC_PH(r2,rE);
77 r3 = ADD_VEC_PH(r3,rF);
78 r4 = MUL_VEC_PH(r4,rC);
79 r5 = ADD_VEC_PH(r5,rD);
80 r6 = MUL_VEC_PH(r6,rE);
81 r7 = ADD_VEC_PH(r7,rF);
82 r8 = MUL_VEC_PH(r8,rC);
83 r9 = ADD_VEC_PH(r9,rD);
84 rA = MUL_VEC_PH(rA,rE);
85 rB = ADD_VEC_PH(rB,rF);
87 r0 = ADD_VEC_PH(r0,rF);
88 r1 = MUL_VEC_PH(r1,rE);
89 r2 = ADD_VEC_PH(r2,rD);
90 r3 = MUL_VEC_PH(r3,rC);
91 r4 = ADD_VEC_PH(r4,rF);
92 r5 = MUL_VEC_PH(r5,rE);
93 r6 = ADD_VEC_PH(r6,rD);
94 r7 = MUL_VEC_PH(r7,rC);
95 r8 = ADD_VEC_PH(r8,rF);
96 r9 = MUL_VEC_PH(r9,rE);
97 rA = ADD_VEC_PH(rA,rD);
98 rB = MUL_VEC_PH(rB,rC);
109 r0 = ADD_VEC_PH(r0,r1);
110 r2 = ADD_VEC_PH(r2,r3);
111 r4 = ADD_VEC_PH(r4,r5);
112 r6 = ADD_VEC_PH(r6,r7);
113 r8 = ADD_VEC_PH(r8,r9);
114 rA = ADD_VEC_PH(rA,rB);
116 r0 = ADD_VEC_PH(r0,r2);
117 r4 = ADD_VEC_PH(r4,r6);
118 r8 = ADD_VEC_PH(r8,rA);
120 r0 = ADD_VEC_PH(r0,r4);
121 r0 = ADD_VEC_PH(r0,r8);
124 HP_VEC_TYPE temp = r0;
125 out = vaddh_f16(out,((half*)&temp)[0]);
126 out = vaddh_f16(out,((half*)&temp)[1]);
127 out = vaddh_f16(out,((half*)&temp)[2]);
128 out = vaddh_f16(out,((half*)&temp)[3]);
138 register HP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
141 r0 = SET_VEC_PH(0.01);
142 r1 = SET_VEC_PH(0.02);
143 r2 = SET_VEC_PH(0.03);
144 r3 = SET_VEC_PH(0.04);
145 r4 = SET_VEC_PH(0.05);
146 r5 = SET_VEC_PH(0.06);
147 r6 = SET_VEC_PH(0.07);
148 r7 = SET_VEC_PH(0.08);
149 r8 = SET_VEC_PH(0.09);
150 r9 = SET_VEC_PH(0.10);
151 rA = SET_VEC_PH(0.11);
152 rB = SET_VEC_PH(0.12);
153 rC = SET_VEC_PH(0.13);
154 rD = SET_VEC_PH(0.14);
155 rE = SET_VEC_PH(0.15);
156 rF = SET_VEC_PH(0.16);
164 while (
c < iterations){
169 r0 = MUL_VEC_PH(r0,rC);
170 r1 = ADD_VEC_PH(r1,rD);
171 r2 = MUL_VEC_PH(r2,rE);
172 r3 = ADD_VEC_PH(r3,rF);
173 r4 = MUL_VEC_PH(r4,rC);
174 r5 = ADD_VEC_PH(r5,rD);
175 r6 = MUL_VEC_PH(r6,rE);
176 r7 = ADD_VEC_PH(r7,rF);
177 r8 = MUL_VEC_PH(r8,rC);
178 r9 = ADD_VEC_PH(r9,rD);
179 rA = MUL_VEC_PH(rA,rE);
180 rB = ADD_VEC_PH(rB,rF);
182 r0 = ADD_VEC_PH(r0,rF);
183 r1 = MUL_VEC_PH(r1,rE);
184 r2 = ADD_VEC_PH(r2,rD);
185 r3 = MUL_VEC_PH(r3,rC);
186 r4 = ADD_VEC_PH(r4,rF);
187 r5 = MUL_VEC_PH(r5,rE);
188 r6 = ADD_VEC_PH(r6,rD);
189 r7 = MUL_VEC_PH(r7,rC);
190 r8 = ADD_VEC_PH(r8,rF);
191 r9 = MUL_VEC_PH(r9,rE);
192 rA = ADD_VEC_PH(rA,rD);
193 rB = MUL_VEC_PH(rB,rC);
195 r0 = MUL_VEC_PH(r0,rC);
196 r1 = ADD_VEC_PH(r1,rD);
197 r2 = MUL_VEC_PH(r2,rE);
198 r3 = ADD_VEC_PH(r3,rF);
199 r4 = MUL_VEC_PH(r4,rC);
200 r5 = ADD_VEC_PH(r5,rD);
201 r6 = MUL_VEC_PH(r6,rE);
202 r7 = ADD_VEC_PH(r7,rF);
203 r8 = MUL_VEC_PH(r8,rC);
204 r9 = ADD_VEC_PH(r9,rD);
205 rA = MUL_VEC_PH(rA,rE);
206 rB = ADD_VEC_PH(rB,rF);
208 r0 = ADD_VEC_PH(r0,rF);
209 r1 = MUL_VEC_PH(r1,rE);
210 r2 = ADD_VEC_PH(r2,rD);
211 r3 = MUL_VEC_PH(r3,rC);
212 r4 = ADD_VEC_PH(r4,rF);
213 r5 = MUL_VEC_PH(r5,rE);
214 r6 = ADD_VEC_PH(r6,rD);
215 r7 = MUL_VEC_PH(r7,rC);
216 r8 = ADD_VEC_PH(r8,rF);
217 r9 = MUL_VEC_PH(r9,rE);
218 rA = ADD_VEC_PH(rA,rD);
219 rB = MUL_VEC_PH(rB,rC);
230 r0 = ADD_VEC_PH(r0,r1);
231 r2 = ADD_VEC_PH(r2,r3);
232 r4 = ADD_VEC_PH(r4,r5);
233 r6 = ADD_VEC_PH(r6,r7);
234 r8 = ADD_VEC_PH(r8,r9);
235 rA = ADD_VEC_PH(rA,rB);
237 r0 = ADD_VEC_PH(r0,r2);
238 r4 = ADD_VEC_PH(r4,r6);
239 r8 = ADD_VEC_PH(r8,rA);
241 r0 = ADD_VEC_PH(r0,r4);
242 r0 = ADD_VEC_PH(r0,r8);
245 HP_VEC_TYPE temp = r0;
246 out = vaddh_f16(out,((half*)&temp)[0]);
247 out = vaddh_f16(out,((half*)&temp)[1]);
248 out = vaddh_f16(out,((half*)&temp)[2]);
249 out = vaddh_f16(out,((half*)&temp)[3]);
259 register HP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
262 r0 = SET_VEC_PH(0.01);
263 r1 = SET_VEC_PH(0.02);
264 r2 = SET_VEC_PH(0.03);
265 r3 = SET_VEC_PH(0.04);
266 r4 = SET_VEC_PH(0.05);
267 r5 = SET_VEC_PH(0.06);
268 r6 = SET_VEC_PH(0.07);
269 r7 = SET_VEC_PH(0.08);
270 r8 = SET_VEC_PH(0.09);
271 r9 = SET_VEC_PH(0.10);
272 rA = SET_VEC_PH(0.11);
273 rB = SET_VEC_PH(0.12);
274 rC = SET_VEC_PH(0.13);
275 rD = SET_VEC_PH(0.14);
276 rE = SET_VEC_PH(0.15);
277 rF = SET_VEC_PH(0.16);
285 while (
c < iterations){
290 r0 = MUL_VEC_PH(r0,rC);
291 r1 = ADD_VEC_PH(r1,rD);
292 r2 = MUL_VEC_PH(r2,rE);
293 r3 = ADD_VEC_PH(r3,rF);
294 r4 = MUL_VEC_PH(r4,rC);
295 r5 = ADD_VEC_PH(r5,rD);
296 r6 = MUL_VEC_PH(r6,rE);
297 r7 = ADD_VEC_PH(r7,rF);
298 r8 = MUL_VEC_PH(r8,rC);
299 r9 = ADD_VEC_PH(r9,rD);
300 rA = MUL_VEC_PH(rA,rE);
301 rB = ADD_VEC_PH(rB,rF);
303 r0 = ADD_VEC_PH(r0,rF);
304 r1 = MUL_VEC_PH(r1,rE);
305 r2 = ADD_VEC_PH(r2,rD);
306 r3 = MUL_VEC_PH(r3,rC);
307 r4 = ADD_VEC_PH(r4,rF);
308 r5 = MUL_VEC_PH(r5,rE);
309 r6 = ADD_VEC_PH(r6,rD);
310 r7 = MUL_VEC_PH(r7,rC);
311 r8 = ADD_VEC_PH(r8,rF);
312 r9 = MUL_VEC_PH(r9,rE);
313 rA = ADD_VEC_PH(rA,rD);
314 rB = MUL_VEC_PH(rB,rC);
316 r0 = MUL_VEC_PH(r0,rC);
317 r1 = ADD_VEC_PH(r1,rD);
318 r2 = MUL_VEC_PH(r2,rE);
319 r3 = ADD_VEC_PH(r3,rF);
320 r4 = MUL_VEC_PH(r4,rC);
321 r5 = ADD_VEC_PH(r5,rD);
322 r6 = MUL_VEC_PH(r6,rE);
323 r7 = ADD_VEC_PH(r7,rF);
324 r8 = MUL_VEC_PH(r8,rC);
325 r9 = ADD_VEC_PH(r9,rD);
326 rA = MUL_VEC_PH(rA,rE);
327 rB = ADD_VEC_PH(rB,rF);
329 r0 = ADD_VEC_PH(r0,rF);
330 r1 = MUL_VEC_PH(r1,rE);
331 r2 = ADD_VEC_PH(r2,rD);
332 r3 = MUL_VEC_PH(r3,rC);
333 r4 = ADD_VEC_PH(r4,rF);
334 r5 = MUL_VEC_PH(r5,rE);
335 r6 = ADD_VEC_PH(r6,rD);
336 r7 = MUL_VEC_PH(r7,rC);
337 r8 = ADD_VEC_PH(r8,rF);
338 r9 = MUL_VEC_PH(r9,rE);
339 rA = ADD_VEC_PH(rA,rD);
340 rB = MUL_VEC_PH(rB,rC);
342 r0 = MUL_VEC_PH(r0,rC);
343 r1 = ADD_VEC_PH(r1,rD);
344 r2 = MUL_VEC_PH(r2,rE);
345 r3 = ADD_VEC_PH(r3,rF);
346 r4 = MUL_VEC_PH(r4,rC);
347 r5 = ADD_VEC_PH(r5,rD);
348 r6 = MUL_VEC_PH(r6,rE);
349 r7 = ADD_VEC_PH(r7,rF);
350 r8 = MUL_VEC_PH(r8,rC);
351 r9 = ADD_VEC_PH(r9,rD);
352 rA = MUL_VEC_PH(rA,rE);
353 rB = ADD_VEC_PH(rB,rF);
355 r0 = ADD_VEC_PH(r0,rF);
356 r1 = MUL_VEC_PH(r1,rE);
357 r2 = ADD_VEC_PH(r2,rD);
358 r3 = MUL_VEC_PH(r3,rC);
359 r4 = ADD_VEC_PH(r4,rF);
360 r5 = MUL_VEC_PH(r5,rE);
361 r6 = ADD_VEC_PH(r6,rD);
362 r7 = MUL_VEC_PH(r7,rC);
363 r8 = ADD_VEC_PH(r8,rF);
364 r9 = MUL_VEC_PH(r9,rE);
365 rA = ADD_VEC_PH(rA,rD);
366 rB = MUL_VEC_PH(rB,rC);
368 r0 = MUL_VEC_PH(r0,rC);
369 r1 = ADD_VEC_PH(r1,rD);
370 r2 = MUL_VEC_PH(r2,rE);
371 r3 = ADD_VEC_PH(r3,rF);
372 r4 = MUL_VEC_PH(r4,rC);
373 r5 = ADD_VEC_PH(r5,rD);
374 r6 = MUL_VEC_PH(r6,rE);
375 r7 = ADD_VEC_PH(r7,rF);
376 r8 = MUL_VEC_PH(r8,rC);
377 r9 = ADD_VEC_PH(r9,rD);
378 rA = MUL_VEC_PH(rA,rE);
379 rB = ADD_VEC_PH(rB,rF);
381 r0 = ADD_VEC_PH(r0,rF);
382 r1 = MUL_VEC_PH(r1,rE);
383 r2 = ADD_VEC_PH(r2,rD);
384 r3 = MUL_VEC_PH(r3,rC);
385 r4 = ADD_VEC_PH(r4,rF);
386 r5 = MUL_VEC_PH(r5,rE);
387 r6 = ADD_VEC_PH(r6,rD);
388 r7 = MUL_VEC_PH(r7,rC);
389 r8 = ADD_VEC_PH(r8,rF);
390 r9 = MUL_VEC_PH(r9,rE);
391 rA = ADD_VEC_PH(rA,rD);
392 rB = MUL_VEC_PH(rB,rC);
403 r0 = ADD_VEC_PH(r0,r1);
404 r2 = ADD_VEC_PH(r2,r3);
405 r4 = ADD_VEC_PH(r4,r5);
406 r6 = ADD_VEC_PH(r6,r7);
407 r8 = ADD_VEC_PH(r8,r9);
408 rA = ADD_VEC_PH(rA,rB);
410 r0 = ADD_VEC_PH(r0,r2);
411 r4 = ADD_VEC_PH(r4,r6);
412 r8 = ADD_VEC_PH(r8,rA);
414 r0 = ADD_VEC_PH(r0,r4);
415 r0 = ADD_VEC_PH(r0,r8);
418 HP_VEC_TYPE temp = r0;
419 out = vaddh_f16(out,((half*)&temp)[0]);
420 out = vaddh_f16(out,((half*)&temp)[1]);
421 out = vaddh_f16(out,((half*)&temp)[2]);
422 out = vaddh_f16(out,((half*)&temp)[3]);
431 half scalar_sum = 0.0;
433 if ( instr_per_loop == 24 ) {
437 else if ( instr_per_loop == 48 ) {
441 else if ( instr_per_loop == 96 ) {
446 if( vdivh_f16(sum,4.0) != scalar_sum ) {
447 fprintf(
stderr,
"Inconsistent FLOP results detected!\n");
486 float scalar_sum = 0.0;
488 if ( instr_per_loop == 24 ) {
492 else if ( instr_per_loop == 48 ) {
496 else if ( instr_per_loop == 96 ) {
501 if( sum/4.0 != scalar_sum ) {
502 fprintf(
stderr,
"Inconsistent FLOP results detected!\n");
unsigned long long uint64
Start counting hardware events in an event set.
static double c[MATRIX_SIZE][MATRIX_SIZE]
static float test_hp_mac_VEC_96(uint64 iterations, int EventSet, FILE *fp)
static float test_hp_mac_VEC_48(uint64 iterations, int EventSet, FILE *fp)
static float test_hp_mac_VEC_24(uint64 iterations, int EventSet, FILE *fp)
static void test_hp_VEC(int instr_per_loop, uint64 iterations, int EventSet, FILE *fp)
float test_hp_scalar_VEC_24(uint64 iterations)
void papi_stop_and_print_placeholder(long long theory, FILE *fp)
void papi_stop_and_print(long long theory, int EventSet, FILE *fp)
float test_hp_scalar_VEC_48(uint64 iterations)
float test_hp_scalar_VEC_96(uint64 iterations)