/*
 * Entry-point header for the half-precision vector FMA test on x86.
 * The preprocessor chain picks one of three function names depending on
 * which X86_VEC_WIDTH_* macro is defined (128B, 512B or 256B); all three
 * variants take the same parameter list:
 *   instr_per_loop - compared against 12, 24 and 48 further down in this listing
 *   iterations     - loop bound used by the kernels (`c < iterations`)
 *   EventSet       - int handle; presumably a PAPI event set -- TODO confirm
 *   fp             - FILE* output stream; its use is not visible in this listing
 * NOTE(review): the function bodies and the closing #endif are not part of
 * this listing.
 */
15#if defined(X86_VEC_WIDTH_128B)
16void test_hp_x86_128B_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
19#elif defined(X86_VEC_WIDTH_512B)
20void test_hp_x86_512B_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
23#elif defined(X86_VEC_WIDTH_256B)
24void test_hp_x86_256B_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
/*
 * ARM and POWER variants of the same entry point. Both use the parameter
 * list of the x86 variants (instr_per_loop, iterations, EventSet, fp).
 * NOTE(review): the preprocessor guards that select between these headers
 * are not visible in this listing -- presumably architecture macros; confirm
 * against the full file.
 */
28void test_hp_arm_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
32void test_hp_power_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
/*
 * Kernel fragment issuing 12 FMA_VEC_PH calls per pass of the while loop.
 * NOTE(review): the enclosing function header is not visible here; it is
 * presumably test_hp_mac_VEC_FMA_12 -- confirm against the full file.
 */
/* Sixteen vector operands, all requested with the `register` storage class. */
43 register HP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Seed every operand with a distinct constant, 0.01 through 0.16. */
46 r0 = SET_VEC_PH(0.01);
47 r1 = SET_VEC_PH(0.02);
48 r2 = SET_VEC_PH(0.03);
49 r3 = SET_VEC_PH(0.04);
50 r4 = SET_VEC_PH(0.05);
51 r5 = SET_VEC_PH(0.06);
52 r6 = SET_VEC_PH(0.07);
53 r7 = SET_VEC_PH(0.08);
54 r8 = SET_VEC_PH(0.09);
55 r9 = SET_VEC_PH(0.10);
56 rA = SET_VEC_PH(0.11);
57 rB = SET_VEC_PH(0.12);
58 rC = SET_VEC_PH(0.13);
59 rD = SET_VEC_PH(0.14);
60 rE = SET_VEC_PH(0.15);
61 rF = SET_VEC_PH(0.16);
/*
 * Main loop, bounded by `iterations`. The declaration and update of the
 * counter `c` are not visible in this listing.
 */
69 while (
c < iterations){
/*
 * Two groups of six FMAs. Each call writes its result back into its first
 * argument (r0..r5); r7..rF are only read in the visible lines.
 */
74 r0 = FMA_VEC_PH(r0,r7,r9);
75 r1 = FMA_VEC_PH(r1,r8,rA);
76 r2 = FMA_VEC_PH(r2,r9,rB);
77 r3 = FMA_VEC_PH(r3,rA,rC);
78 r4 = FMA_VEC_PH(r4,rB,rD);
79 r5 = FMA_VEC_PH(r5,rC,rE);
81 r0 = FMA_VEC_PH(r0,rD,rF);
82 r1 = FMA_VEC_PH(r1,rC,rE);
83 r2 = FMA_VEC_PH(r2,rB,rD);
84 r3 = FMA_VEC_PH(r3,rA,rC);
85 r4 = FMA_VEC_PH(r4,r9,rB);
86 r5 = FMA_VEC_PH(r5,r8,rA);
/*
 * Fold r0..r6 into r0 with ADD_VEC_PH so every FMA result feeds the output.
 * r6 still holds its initial value in the visible lines.
 * NOTE(review): whether this reduction sits inside or after the loop cannot
 * be determined from this listing (the closing brace is not shown).
 */
97 r0 = ADD_VEC_PH(r0,r1);
98 r2 = ADD_VEC_PH(r2,r3);
99 r4 = ADD_VEC_PH(r4,r5);
101 r0 = ADD_VEC_PH(r0,r6);
102 r2 = ADD_VEC_PH(r2,r4);
104 r0 = ADD_VEC_PH(r0,r2);
/*
 * Accumulate the first four half elements of the result into `out`
 * (declared outside the visible lines).
 * NOTE(review): only elements 0..3 are summed regardless of vector width,
 * and they are read through a (half*) cast of &temp -- confirm both are
 * intended.
 */
107 HP_VEC_TYPE temp = r0;
108 out = vaddh_f16(out,((half*)&temp)[0]);
109 out = vaddh_f16(out,((half*)&temp)[1]);
110 out = vaddh_f16(out,((half*)&temp)[2]);
111 out = vaddh_f16(out,((half*)&temp)[3]);
/*
 * Kernel fragment issuing 24 FMA_VEC_PH calls per pass of the while loop.
 * NOTE(review): the enclosing function header is not visible here; it is
 * presumably test_hp_mac_VEC_FMA_24 -- confirm against the full file.
 */
/* Sixteen vector operands, all requested with the `register` storage class. */
121 register HP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Seed every operand with a distinct constant, 0.01 through 0.16. */
124 r0 = SET_VEC_PH(0.01);
125 r1 = SET_VEC_PH(0.02);
126 r2 = SET_VEC_PH(0.03);
127 r3 = SET_VEC_PH(0.04);
128 r4 = SET_VEC_PH(0.05);
129 r5 = SET_VEC_PH(0.06);
130 r6 = SET_VEC_PH(0.07);
131 r7 = SET_VEC_PH(0.08);
132 r8 = SET_VEC_PH(0.09);
133 r9 = SET_VEC_PH(0.10);
134 rA = SET_VEC_PH(0.11);
135 rB = SET_VEC_PH(0.12);
136 rC = SET_VEC_PH(0.13);
137 rD = SET_VEC_PH(0.14);
138 rE = SET_VEC_PH(0.15);
139 rF = SET_VEC_PH(0.16);
/*
 * Main loop, bounded by `iterations`. The declaration and update of the
 * counter `c` are not visible in this listing.
 */
147 while (
c < iterations){
/*
 * Four groups of six FMAs; the third and fourth groups repeat the operand
 * pattern of the first and second. Each call writes its result back into
 * its first argument (r0..r5); r7..rF are only read in the visible lines.
 */
152 r0 = FMA_VEC_PH(r0,r7,r9);
153 r1 = FMA_VEC_PH(r1,r8,rA);
154 r2 = FMA_VEC_PH(r2,r9,rB);
155 r3 = FMA_VEC_PH(r3,rA,rC);
156 r4 = FMA_VEC_PH(r4,rB,rD);
157 r5 = FMA_VEC_PH(r5,rC,rE);
159 r0 = FMA_VEC_PH(r0,rD,rF);
160 r1 = FMA_VEC_PH(r1,rC,rE);
161 r2 = FMA_VEC_PH(r2,rB,rD);
162 r3 = FMA_VEC_PH(r3,rA,rC);
163 r4 = FMA_VEC_PH(r4,r9,rB);
164 r5 = FMA_VEC_PH(r5,r8,rA);
166 r0 = FMA_VEC_PH(r0,r7,r9);
167 r1 = FMA_VEC_PH(r1,r8,rA);
168 r2 = FMA_VEC_PH(r2,r9,rB);
169 r3 = FMA_VEC_PH(r3,rA,rC);
170 r4 = FMA_VEC_PH(r4,rB,rD);
171 r5 = FMA_VEC_PH(r5,rC,rE);
173 r0 = FMA_VEC_PH(r0,rD,rF);
174 r1 = FMA_VEC_PH(r1,rC,rE);
175 r2 = FMA_VEC_PH(r2,rB,rD);
176 r3 = FMA_VEC_PH(r3,rA,rC);
177 r4 = FMA_VEC_PH(r4,r9,rB);
178 r5 = FMA_VEC_PH(r5,r8,rA);
/*
 * Fold r0..r6 into r0 with ADD_VEC_PH so every FMA result feeds the output.
 * r6 still holds its initial value in the visible lines.
 * NOTE(review): whether this reduction sits inside or after the loop cannot
 * be determined from this listing (the closing brace is not shown).
 */
189 r0 = ADD_VEC_PH(r0,r1);
190 r2 = ADD_VEC_PH(r2,r3);
191 r4 = ADD_VEC_PH(r4,r5);
193 r0 = ADD_VEC_PH(r0,r6);
194 r2 = ADD_VEC_PH(r2,r4);
196 r0 = ADD_VEC_PH(r0,r2);
/*
 * Accumulate the first four half elements of the result into `out`
 * (declared outside the visible lines).
 * NOTE(review): only elements 0..3 are summed regardless of vector width,
 * and they are read through a (half*) cast of &temp -- confirm both are
 * intended.
 */
199 HP_VEC_TYPE temp = r0;
200 out = vaddh_f16(out,((half*)&temp)[0]);
201 out = vaddh_f16(out,((half*)&temp)[1]);
202 out = vaddh_f16(out,((half*)&temp)[2]);
203 out = vaddh_f16(out,((half*)&temp)[3]);
/*
 * Kernel fragment issuing 48 FMA_VEC_PH calls per pass of the while loop.
 * NOTE(review): the enclosing function header is not visible here; it is
 * presumably test_hp_mac_VEC_FMA_48 -- confirm against the full file.
 */
/* Sixteen vector operands, all requested with the `register` storage class. */
213 register HP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Seed every operand with a distinct constant, 0.01 through 0.16. */
216 r0 = SET_VEC_PH(0.01);
217 r1 = SET_VEC_PH(0.02);
218 r2 = SET_VEC_PH(0.03);
219 r3 = SET_VEC_PH(0.04);
220 r4 = SET_VEC_PH(0.05);
221 r5 = SET_VEC_PH(0.06);
222 r6 = SET_VEC_PH(0.07);
223 r7 = SET_VEC_PH(0.08);
224 r8 = SET_VEC_PH(0.09);
225 r9 = SET_VEC_PH(0.10);
226 rA = SET_VEC_PH(0.11);
227 rB = SET_VEC_PH(0.12);
228 rC = SET_VEC_PH(0.13);
229 rD = SET_VEC_PH(0.14);
230 rE = SET_VEC_PH(0.15);
231 rF = SET_VEC_PH(0.16);
/*
 * Main loop, bounded by `iterations`. The declaration and update of the
 * counter `c` are not visible in this listing.
 */
239 while (
c < iterations){
/*
 * Eight groups of six FMAs, alternating between two operand patterns.
 * Each call writes its result back into its first argument (r0..r5);
 * r7..rF are only read in the visible lines.
 */
244 r0 = FMA_VEC_PH(r0,r7,r9);
245 r1 = FMA_VEC_PH(r1,r8,rA);
246 r2 = FMA_VEC_PH(r2,r9,rB);
247 r3 = FMA_VEC_PH(r3,rA,rC);
248 r4 = FMA_VEC_PH(r4,rB,rD);
249 r5 = FMA_VEC_PH(r5,rC,rE);
251 r0 = FMA_VEC_PH(r0,rD,rF);
252 r1 = FMA_VEC_PH(r1,rC,rE);
253 r2 = FMA_VEC_PH(r2,rB,rD);
254 r3 = FMA_VEC_PH(r3,rA,rC);
255 r4 = FMA_VEC_PH(r4,r9,rB);
256 r5 = FMA_VEC_PH(r5,r8,rA);
258 r0 = FMA_VEC_PH(r0,r7,r9);
259 r1 = FMA_VEC_PH(r1,r8,rA);
260 r2 = FMA_VEC_PH(r2,r9,rB);
261 r3 = FMA_VEC_PH(r3,rA,rC);
262 r4 = FMA_VEC_PH(r4,rB,rD);
263 r5 = FMA_VEC_PH(r5,rC,rE);
265 r0 = FMA_VEC_PH(r0,rD,rF);
266 r1 = FMA_VEC_PH(r1,rC,rE);
267 r2 = FMA_VEC_PH(r2,rB,rD);
268 r3 = FMA_VEC_PH(r3,rA,rC);
269 r4 = FMA_VEC_PH(r4,r9,rB);
270 r5 = FMA_VEC_PH(r5,r8,rA);
272 r0 = FMA_VEC_PH(r0,r7,r9);
273 r1 = FMA_VEC_PH(r1,r8,rA);
274 r2 = FMA_VEC_PH(r2,r9,rB);
275 r3 = FMA_VEC_PH(r3,rA,rC);
276 r4 = FMA_VEC_PH(r4,rB,rD);
277 r5 = FMA_VEC_PH(r5,rC,rE);
279 r0 = FMA_VEC_PH(r0,rD,rF);
280 r1 = FMA_VEC_PH(r1,rC,rE);
281 r2 = FMA_VEC_PH(r2,rB,rD);
282 r3 = FMA_VEC_PH(r3,rA,rC);
283 r4 = FMA_VEC_PH(r4,r9,rB);
284 r5 = FMA_VEC_PH(r5,r8,rA);
286 r0 = FMA_VEC_PH(r0,r7,r9);
287 r1 = FMA_VEC_PH(r1,r8,rA);
288 r2 = FMA_VEC_PH(r2,r9,rB);
289 r3 = FMA_VEC_PH(r3,rA,rC);
290 r4 = FMA_VEC_PH(r4,rB,rD);
291 r5 = FMA_VEC_PH(r5,rC,rE);
293 r0 = FMA_VEC_PH(r0,rD,rF);
294 r1 = FMA_VEC_PH(r1,rC,rE);
295 r2 = FMA_VEC_PH(r2,rB,rD);
296 r3 = FMA_VEC_PH(r3,rA,rC);
297 r4 = FMA_VEC_PH(r4,r9,rB);
298 r5 = FMA_VEC_PH(r5,r8,rA);
/*
 * Fold r0..r6 into r0 with ADD_VEC_PH so every FMA result feeds the output.
 * r6 still holds its initial value in the visible lines.
 * NOTE(review): whether this reduction sits inside or after the loop cannot
 * be determined from this listing (the closing brace is not shown).
 */
309 r0 = ADD_VEC_PH(r0,r1);
310 r2 = ADD_VEC_PH(r2,r3);
311 r4 = ADD_VEC_PH(r4,r5);
313 r0 = ADD_VEC_PH(r0,r6);
314 r2 = ADD_VEC_PH(r2,r4);
316 r0 = ADD_VEC_PH(r0,r2);
/*
 * Accumulate the first four half elements of the result into `out`
 * (declared outside the visible lines).
 * NOTE(review): only elements 0..3 are summed regardless of vector width,
 * and they are read through a (half*) cast of &temp -- confirm both are
 * intended.
 */
319 HP_VEC_TYPE temp = r0;
320 out = vaddh_f16(out,((half*)&temp)[0]);
321 out = vaddh_f16(out,((half*)&temp)[1]);
322 out = vaddh_f16(out,((half*)&temp)[2]);
323 out = vaddh_f16(out,((half*)&temp)[3]);
/*
 * Consistency check, half-precision variant: a reference value is kept in
 * `scalar_sum` (type half) and compared against `sum` divided by 4.0 via
 * vdivh_f16; any mismatch prints a warning to stderr.
 * NOTE(review): the bodies of the three instr_per_loop branches (12, 24, 48)
 * and the declaration of `sum` are not visible in this listing; presumably
 * each branch runs the matching vector and scalar kernels -- confirm.
 */
332 half scalar_sum = 0.0;
/* Dispatch on the requested number of FMA instructions per loop pass. */
334 if ( instr_per_loop == 12 ) {
338 else if ( instr_per_loop == 24 ) {
342 else if ( instr_per_loop == 48 ) {
/* Exact (bitwise-value) comparison; no tolerance is applied. */
347 if( vdivh_f16(sum,4.0) != scalar_sum ) {
348 fprintf(
stderr,
"FMA: Inconsistent FLOP results detected!\n");
/*
 * Consistency check, float variant: a reference value is kept in
 * `scalar_sum` (type float) and compared against `sum/4.0`; any mismatch
 * prints a warning to stderr.
 * NOTE(review): the bodies of the three instr_per_loop branches (12, 24, 48)
 * and the declaration of `sum` are not visible in this listing; presumably
 * each branch runs the matching vector and scalar kernels -- confirm.
 */
387 float scalar_sum = 0.0;
/* Dispatch on the requested number of FMA instructions per loop pass. */
389 if ( instr_per_loop == 12 ) {
393 else if ( instr_per_loop == 24 ) {
397 else if ( instr_per_loop == 48 ) {
/* Exact comparison; no tolerance is applied. */
402 if( sum/4.0 != scalar_sum ) {
403 fprintf(
stderr,
"FMA: Inconsistent FLOP results detected!\n");
unsigned long long uint64
Start counting hardware events in an event set.
static double c[MATRIX_SIZE][MATRIX_SIZE]
static void test_hp_VEC_FMA(int instr_per_loop, uint64 iterations, int EventSet, FILE *fp)
static float test_hp_mac_VEC_FMA_48(uint64 iterations, int EventSet, FILE *fp)
static float test_hp_mac_VEC_FMA_24(uint64 iterations, int EventSet, FILE *fp)
static float test_hp_mac_VEC_FMA_12(uint64 iterations, int EventSet, FILE *fp)
float test_hp_scalar_VEC_FMA_48(uint64 iterations)
float test_hp_scalar_VEC_FMA_24(uint64 iterations)
void papi_stop_and_print_placeholder(long long theory, FILE *fp)
float test_hp_scalar_VEC_FMA_12(uint64 iterations)
void papi_stop_and_print(long long theory, int EventSet, FILE *fp)