PAPI 7.1.0.0
Loading...
Searching...
No Matches
vec_fma_hp.c
Go to the documentation of this file.
1#include "vec_scalar_verify.h"
2
3#if defined(ARM)
4static half test_hp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp );
5static half test_hp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp );
6static half test_hp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp );
7#else
8static float test_hp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp );
9static float test_hp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp );
10static float test_hp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp );
11#endif
12static void test_hp_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
13
14/* Wrapper functions of different vector widths. */
15#if defined(X86_VEC_WIDTH_128B)
16void test_hp_x86_128B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) {
17 return test_hp_VEC_FMA( instr_per_loop, iterations, EventSet, fp );
18}
19#elif defined(X86_VEC_WIDTH_512B)
20void test_hp_x86_512B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) {
21 return test_hp_VEC_FMA( instr_per_loop, iterations, EventSet, fp );
22}
23#elif defined(X86_VEC_WIDTH_256B)
24void test_hp_x86_256B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) {
25 return test_hp_VEC_FMA( instr_per_loop, iterations, EventSet, fp );
26}
27#elif defined(ARM)
28void test_hp_arm_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) {
29 return test_hp_VEC_FMA( instr_per_loop, iterations, EventSet, fp );
30}
31#elif defined(POWER)
32void test_hp_power_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) {
33 return test_hp_VEC_FMA( instr_per_loop, iterations, EventSet, fp );
34}
35#endif
36
37#if defined(ARM)
38/************************************/
39/* Loop unrolling: 12 instructions */
40/************************************/
41static
42half test_hp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){
43 register HP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
44
45 /* Generate starting data */
46 r0 = SET_VEC_PH(0.01);
47 r1 = SET_VEC_PH(0.02);
48 r2 = SET_VEC_PH(0.03);
49 r3 = SET_VEC_PH(0.04);
50 r4 = SET_VEC_PH(0.05);
51 r5 = SET_VEC_PH(0.06);
52 r6 = SET_VEC_PH(0.07);
53 r7 = SET_VEC_PH(0.08);
54 r8 = SET_VEC_PH(0.09);
55 r9 = SET_VEC_PH(0.10);
56 rA = SET_VEC_PH(0.11);
57 rB = SET_VEC_PH(0.12);
58 rC = SET_VEC_PH(0.13);
59 rD = SET_VEC_PH(0.14);
60 rE = SET_VEC_PH(0.15);
61 rF = SET_VEC_PH(0.16);
62
63 /* Start PAPI counters */
64 if ( PAPI_start( EventSet ) != PAPI_OK ) {
65 return -1;
66 }
67
68 uint64 c = 0;
69 while (c < iterations){
70 size_t i = 0;
71 while (i < 1000){
72 /* The performance critical part */
73
74 r0 = FMA_VEC_PH(r0,r7,r9);
75 r1 = FMA_VEC_PH(r1,r8,rA);
76 r2 = FMA_VEC_PH(r2,r9,rB);
77 r3 = FMA_VEC_PH(r3,rA,rC);
78 r4 = FMA_VEC_PH(r4,rB,rD);
79 r5 = FMA_VEC_PH(r5,rC,rE);
80
81 r0 = FMA_VEC_PH(r0,rD,rF);
82 r1 = FMA_VEC_PH(r1,rC,rE);
83 r2 = FMA_VEC_PH(r2,rB,rD);
84 r3 = FMA_VEC_PH(r3,rA,rC);
85 r4 = FMA_VEC_PH(r4,r9,rB);
86 r5 = FMA_VEC_PH(r5,r8,rA);
87
88 i++;
89 }
90 c++;
91 }
92
93 /* Stop PAPI counters */
95
96 /* Use data so that compiler does not eliminate it when using -O2 */
97 r0 = ADD_VEC_PH(r0,r1);
98 r2 = ADD_VEC_PH(r2,r3);
99 r4 = ADD_VEC_PH(r4,r5);
100
101 r0 = ADD_VEC_PH(r0,r6);
102 r2 = ADD_VEC_PH(r2,r4);
103
104 r0 = ADD_VEC_PH(r0,r2);
105
106 half out = 0;
107 HP_VEC_TYPE temp = r0;
108 out = vaddh_f16(out,((half*)&temp)[0]);
109 out = vaddh_f16(out,((half*)&temp)[1]);
110 out = vaddh_f16(out,((half*)&temp)[2]);
111 out = vaddh_f16(out,((half*)&temp)[3]);
112
113 return out;
114}
115
116/************************************/
117/* Loop unrolling: 24 instructions */
118/************************************/
119static
120half test_hp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){
121 register HP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
122
123 /* Generate starting data */
124 r0 = SET_VEC_PH(0.01);
125 r1 = SET_VEC_PH(0.02);
126 r2 = SET_VEC_PH(0.03);
127 r3 = SET_VEC_PH(0.04);
128 r4 = SET_VEC_PH(0.05);
129 r5 = SET_VEC_PH(0.06);
130 r6 = SET_VEC_PH(0.07);
131 r7 = SET_VEC_PH(0.08);
132 r8 = SET_VEC_PH(0.09);
133 r9 = SET_VEC_PH(0.10);
134 rA = SET_VEC_PH(0.11);
135 rB = SET_VEC_PH(0.12);
136 rC = SET_VEC_PH(0.13);
137 rD = SET_VEC_PH(0.14);
138 rE = SET_VEC_PH(0.15);
139 rF = SET_VEC_PH(0.16);
140
141 /* Start PAPI counters */
142 if ( PAPI_start( EventSet ) != PAPI_OK ) {
143 return -1;
144 }
145
146 uint64 c = 0;
147 while (c < iterations){
148 size_t i = 0;
149 while (i < 1000){
150 /* The performance critical part */
151
152 r0 = FMA_VEC_PH(r0,r7,r9);
153 r1 = FMA_VEC_PH(r1,r8,rA);
154 r2 = FMA_VEC_PH(r2,r9,rB);
155 r3 = FMA_VEC_PH(r3,rA,rC);
156 r4 = FMA_VEC_PH(r4,rB,rD);
157 r5 = FMA_VEC_PH(r5,rC,rE);
158
159 r0 = FMA_VEC_PH(r0,rD,rF);
160 r1 = FMA_VEC_PH(r1,rC,rE);
161 r2 = FMA_VEC_PH(r2,rB,rD);
162 r3 = FMA_VEC_PH(r3,rA,rC);
163 r4 = FMA_VEC_PH(r4,r9,rB);
164 r5 = FMA_VEC_PH(r5,r8,rA);
165
166 r0 = FMA_VEC_PH(r0,r7,r9);
167 r1 = FMA_VEC_PH(r1,r8,rA);
168 r2 = FMA_VEC_PH(r2,r9,rB);
169 r3 = FMA_VEC_PH(r3,rA,rC);
170 r4 = FMA_VEC_PH(r4,rB,rD);
171 r5 = FMA_VEC_PH(r5,rC,rE);
172
173 r0 = FMA_VEC_PH(r0,rD,rF);
174 r1 = FMA_VEC_PH(r1,rC,rE);
175 r2 = FMA_VEC_PH(r2,rB,rD);
176 r3 = FMA_VEC_PH(r3,rA,rC);
177 r4 = FMA_VEC_PH(r4,r9,rB);
178 r5 = FMA_VEC_PH(r5,r8,rA);
179
180 i++;
181 }
182 c++;
183 }
184
185 /* Stop PAPI counters */
187
188 /* Use data so that compiler does not eliminate it when using -O2 */
189 r0 = ADD_VEC_PH(r0,r1);
190 r2 = ADD_VEC_PH(r2,r3);
191 r4 = ADD_VEC_PH(r4,r5);
192
193 r0 = ADD_VEC_PH(r0,r6);
194 r2 = ADD_VEC_PH(r2,r4);
195
196 r0 = ADD_VEC_PH(r0,r2);
197
198 half out = 0;
199 HP_VEC_TYPE temp = r0;
200 out = vaddh_f16(out,((half*)&temp)[0]);
201 out = vaddh_f16(out,((half*)&temp)[1]);
202 out = vaddh_f16(out,((half*)&temp)[2]);
203 out = vaddh_f16(out,((half*)&temp)[3]);
204
205 return out;
206}
207
208/************************************/
209/* Loop unrolling: 48 instructions */
210/************************************/
211static
212half test_hp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){
213 register HP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
214
215 /* Generate starting data */
216 r0 = SET_VEC_PH(0.01);
217 r1 = SET_VEC_PH(0.02);
218 r2 = SET_VEC_PH(0.03);
219 r3 = SET_VEC_PH(0.04);
220 r4 = SET_VEC_PH(0.05);
221 r5 = SET_VEC_PH(0.06);
222 r6 = SET_VEC_PH(0.07);
223 r7 = SET_VEC_PH(0.08);
224 r8 = SET_VEC_PH(0.09);
225 r9 = SET_VEC_PH(0.10);
226 rA = SET_VEC_PH(0.11);
227 rB = SET_VEC_PH(0.12);
228 rC = SET_VEC_PH(0.13);
229 rD = SET_VEC_PH(0.14);
230 rE = SET_VEC_PH(0.15);
231 rF = SET_VEC_PH(0.16);
232
233 /* Start PAPI counters */
234 if ( PAPI_start( EventSet ) != PAPI_OK ) {
235 return -1;
236 }
237
238 uint64 c = 0;
239 while (c < iterations){
240 size_t i = 0;
241 while (i < 1000){
242 /* The performance critical part */
243
244 r0 = FMA_VEC_PH(r0,r7,r9);
245 r1 = FMA_VEC_PH(r1,r8,rA);
246 r2 = FMA_VEC_PH(r2,r9,rB);
247 r3 = FMA_VEC_PH(r3,rA,rC);
248 r4 = FMA_VEC_PH(r4,rB,rD);
249 r5 = FMA_VEC_PH(r5,rC,rE);
250
251 r0 = FMA_VEC_PH(r0,rD,rF);
252 r1 = FMA_VEC_PH(r1,rC,rE);
253 r2 = FMA_VEC_PH(r2,rB,rD);
254 r3 = FMA_VEC_PH(r3,rA,rC);
255 r4 = FMA_VEC_PH(r4,r9,rB);
256 r5 = FMA_VEC_PH(r5,r8,rA);
257
258 r0 = FMA_VEC_PH(r0,r7,r9);
259 r1 = FMA_VEC_PH(r1,r8,rA);
260 r2 = FMA_VEC_PH(r2,r9,rB);
261 r3 = FMA_VEC_PH(r3,rA,rC);
262 r4 = FMA_VEC_PH(r4,rB,rD);
263 r5 = FMA_VEC_PH(r5,rC,rE);
264
265 r0 = FMA_VEC_PH(r0,rD,rF);
266 r1 = FMA_VEC_PH(r1,rC,rE);
267 r2 = FMA_VEC_PH(r2,rB,rD);
268 r3 = FMA_VEC_PH(r3,rA,rC);
269 r4 = FMA_VEC_PH(r4,r9,rB);
270 r5 = FMA_VEC_PH(r5,r8,rA);
271
272 r0 = FMA_VEC_PH(r0,r7,r9);
273 r1 = FMA_VEC_PH(r1,r8,rA);
274 r2 = FMA_VEC_PH(r2,r9,rB);
275 r3 = FMA_VEC_PH(r3,rA,rC);
276 r4 = FMA_VEC_PH(r4,rB,rD);
277 r5 = FMA_VEC_PH(r5,rC,rE);
278
279 r0 = FMA_VEC_PH(r0,rD,rF);
280 r1 = FMA_VEC_PH(r1,rC,rE);
281 r2 = FMA_VEC_PH(r2,rB,rD);
282 r3 = FMA_VEC_PH(r3,rA,rC);
283 r4 = FMA_VEC_PH(r4,r9,rB);
284 r5 = FMA_VEC_PH(r5,r8,rA);
285
286 r0 = FMA_VEC_PH(r0,r7,r9);
287 r1 = FMA_VEC_PH(r1,r8,rA);
288 r2 = FMA_VEC_PH(r2,r9,rB);
289 r3 = FMA_VEC_PH(r3,rA,rC);
290 r4 = FMA_VEC_PH(r4,rB,rD);
291 r5 = FMA_VEC_PH(r5,rC,rE);
292
293 r0 = FMA_VEC_PH(r0,rD,rF);
294 r1 = FMA_VEC_PH(r1,rC,rE);
295 r2 = FMA_VEC_PH(r2,rB,rD);
296 r3 = FMA_VEC_PH(r3,rA,rC);
297 r4 = FMA_VEC_PH(r4,r9,rB);
298 r5 = FMA_VEC_PH(r5,r8,rA);
299
300 i++;
301 }
302 c++;
303 }
304
305 /* Stop PAPI counters */
307
308 /* Use data so that compiler does not eliminate it when using -O2 */
309 r0 = ADD_VEC_PH(r0,r1);
310 r2 = ADD_VEC_PH(r2,r3);
311 r4 = ADD_VEC_PH(r4,r5);
312
313 r0 = ADD_VEC_PH(r0,r6);
314 r2 = ADD_VEC_PH(r2,r4);
315
316 r0 = ADD_VEC_PH(r0,r2);
317
318 half out = 0;
319 HP_VEC_TYPE temp = r0;
320 out = vaddh_f16(out,((half*)&temp)[0]);
321 out = vaddh_f16(out,((half*)&temp)[1]);
322 out = vaddh_f16(out,((half*)&temp)[2]);
323 out = vaddh_f16(out,((half*)&temp)[3]);
324
325 return out;
326}
327
328static
329void test_hp_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp )
330{
331 half sum = 0.0;
332 half scalar_sum = 0.0;
333
334 if ( instr_per_loop == 12 ) {
335 sum = vaddh_f16(sum,test_hp_mac_VEC_FMA_12( iterations, EventSet, fp ));
336 scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_FMA_12( iterations ));
337 }
338 else if ( instr_per_loop == 24 ) {
339 sum = vaddh_f16(sum,test_hp_mac_VEC_FMA_24( iterations, EventSet, fp ));
340 scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_FMA_24( iterations ));
341 }
342 else if ( instr_per_loop == 48 ) {
343 sum = vaddh_f16(sum,test_hp_mac_VEC_FMA_48( iterations, EventSet, fp ));
344 scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_FMA_48( iterations ));
345 }
346
347 if( vdivh_f16(sum,4.0) != scalar_sum ) {
348 fprintf(stderr, "FMA: Inconsistent FLOP results detected!\n");
349 }
350}
351
352#else
353static
354float test_hp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){
355
356 (void)iterations;
357 (void)EventSet;
359
360 return 0.0;
361}
362
363static
364float test_hp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){
365
366 (void)iterations;
367 (void)EventSet;
369
370 return 0.0;
371}
372
373static
374float test_hp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){
375
376 (void)iterations;
377 (void)EventSet;
379
380 return 0.0;
381}
382
383static
384void test_hp_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp )
385{
386 float sum = 0.0;
387 float scalar_sum = 0.0;
388
389 if ( instr_per_loop == 12 ) {
390 sum += test_hp_mac_VEC_FMA_12( iterations, EventSet, fp );
391 scalar_sum += test_hp_scalar_VEC_FMA_12( iterations );
392 }
393 else if ( instr_per_loop == 24 ) {
394 sum += test_hp_mac_VEC_FMA_24( iterations, EventSet, fp );
395 scalar_sum += test_hp_scalar_VEC_FMA_24( iterations );
396 }
397 else if ( instr_per_loop == 48 ) {
398 sum += test_hp_mac_VEC_FMA_48( iterations, EventSet, fp );
399 scalar_sum += test_hp_scalar_VEC_FMA_48( iterations );
400 }
401
402 if( sum/4.0 != scalar_sum ) {
403 fprintf(stderr, "FMA: Inconsistent FLOP results detected!\n");
404 }
405}
406#endif
int i
unsigned long long uint64
Definition: cat_arch.h:3
Start counting hardware events in an event set.
#define PAPI_OK
Definition: f90papi.h:73
static int EventSet
Definition: init_fini.c:8
static double c[MATRIX_SIZE][MATRIX_SIZE]
Definition: libmsr_basic.c:40
FILE * stderr
static FILE * fp
static void test_hp_VEC_FMA(int instr_per_loop, uint64 iterations, int EventSet, FILE *fp)
Definition: vec_fma_hp.c:384
static float test_hp_mac_VEC_FMA_48(uint64 iterations, int EventSet, FILE *fp)
Definition: vec_fma_hp.c:374
static float test_hp_mac_VEC_FMA_24(uint64 iterations, int EventSet, FILE *fp)
Definition: vec_fma_hp.c:364
static float test_hp_mac_VEC_FMA_12(uint64 iterations, int EventSet, FILE *fp)
Definition: vec_fma_hp.c:354
float test_hp_scalar_VEC_FMA_48(uint64 iterations)
float test_hp_scalar_VEC_FMA_24(uint64 iterations)
void papi_stop_and_print_placeholder(long long theory, FILE *fp)
float test_hp_scalar_VEC_FMA_12(uint64 iterations)
void papi_stop_and_print(long long theory, int EventSet, FILE *fp)