PAPI 7.1.0.0
Loading...
Searching...
No Matches
vec_fma_dp.c
Go to the documentation of this file.
1#include "vec_scalar_verify.h"
2
3static double test_dp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp );
4static double test_dp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp );
5static double test_dp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp );
6static void test_dp_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
7
/*
 * Public entry points, one per build target.
 *
 * At most one wrapper is compiled: the first of X86_VEC_WIDTH_128B,
 * X86_VEC_WIDTH_512B, X86_VEC_WIDTH_256B, ARM or POWER that is defined
 * selects it.  Every wrapper forwards its arguments unchanged to the
 * file-local dispatcher test_dp_VEC_FMA().
 *
 * The dispatcher returns void, so it is invoked as a plain statement:
 * ISO C does not allow "return <expression>;" in a function whose return
 * type is void.
 */
#if defined(X86_VEC_WIDTH_128B)
void test_dp_x86_128B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) {
    test_dp_VEC_FMA( instr_per_loop, iterations, EventSet, fp );
}
#elif defined(X86_VEC_WIDTH_512B)
void test_dp_x86_512B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) {
    test_dp_VEC_FMA( instr_per_loop, iterations, EventSet, fp );
}
#elif defined(X86_VEC_WIDTH_256B)
void test_dp_x86_256B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) {
    test_dp_VEC_FMA( instr_per_loop, iterations, EventSet, fp );
}
#elif defined(ARM)
void test_dp_arm_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) {
    test_dp_VEC_FMA( instr_per_loop, iterations, EventSet, fp );
}
#elif defined(POWER)
void test_dp_power_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) {
    test_dp_VEC_FMA( instr_per_loop, iterations, EventSet, fp );
}
#endif
30
31/************************************/
32/* Loop unrolling: 12 instructions */
33/************************************/
34static
35double test_dp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){
36 register DP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
37
38 /* Generate starting data */
39 r0 = SET_VEC_PD(0.01);
40 r1 = SET_VEC_PD(0.02);
41 r2 = SET_VEC_PD(0.03);
42 r3 = SET_VEC_PD(0.04);
43 r4 = SET_VEC_PD(0.05);
44 r5 = SET_VEC_PD(0.06);
45 r6 = SET_VEC_PD(0.07);
46 r7 = SET_VEC_PD(0.08);
47 r8 = SET_VEC_PD(0.09);
48 r9 = SET_VEC_PD(0.10);
49 rA = SET_VEC_PD(0.11);
50 rB = SET_VEC_PD(0.12);
51 rC = SET_VEC_PD(0.13);
52 rD = SET_VEC_PD(0.14);
53 rE = SET_VEC_PD(0.15);
54 rF = SET_VEC_PD(0.16);
55
56 /* Start PAPI counters */
57 if ( PAPI_start( EventSet ) != PAPI_OK ) {
58 return -1;
59 }
60
61 uint64 c = 0;
62 while (c < iterations){
63 size_t i = 0;
64 while (i < 1000){
65
66 /* The performance critical part */
67 r0 = FMA_VEC_PD(r0,r7,r9);
68 r1 = FMA_VEC_PD(r1,r8,rA);
69 r2 = FMA_VEC_PD(r2,r9,rB);
70 r3 = FMA_VEC_PD(r3,rA,rC);
71 r4 = FMA_VEC_PD(r4,rB,rD);
72 r5 = FMA_VEC_PD(r5,rC,rE);
73
74 r0 = FMA_VEC_PD(r0,rD,rF);
75 r1 = FMA_VEC_PD(r1,rC,rE);
76 r2 = FMA_VEC_PD(r2,rB,rD);
77 r3 = FMA_VEC_PD(r3,rA,rC);
78 r4 = FMA_VEC_PD(r4,r9,rB);
79 r5 = FMA_VEC_PD(r5,r8,rA);
80
81 i++;
82 }
83 c++;
84 }
85
86 /* Stop PAPI counters */
88
89 /* Use data so that compiler does not eliminate it when using -O2 */
90 r0 = ADD_VEC_PD(r0,r1);
91 r2 = ADD_VEC_PD(r2,r3);
92 r4 = ADD_VEC_PD(r4,r5);
93
94 r0 = ADD_VEC_PD(r0,r6);
95 r2 = ADD_VEC_PD(r2,r4);
96
97 r0 = ADD_VEC_PD(r0,r2);
98
99 double out = 0;
100 DP_VEC_TYPE temp = r0;
101 out += ((double*)&temp)[0];
102 out += ((double*)&temp)[1];
103
104 return out;
105}
106
107/************************************/
108/* Loop unrolling: 24 instructions */
109/************************************/
110static
111double test_dp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){
112 register DP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
113
114 /* Generate starting data */
115 r0 = SET_VEC_PD(0.01);
116 r1 = SET_VEC_PD(0.02);
117 r2 = SET_VEC_PD(0.03);
118 r3 = SET_VEC_PD(0.04);
119 r4 = SET_VEC_PD(0.05);
120 r5 = SET_VEC_PD(0.06);
121 r6 = SET_VEC_PD(0.07);
122 r7 = SET_VEC_PD(0.08);
123 r8 = SET_VEC_PD(0.09);
124 r9 = SET_VEC_PD(0.10);
125 rA = SET_VEC_PD(0.11);
126 rB = SET_VEC_PD(0.12);
127 rC = SET_VEC_PD(0.13);
128 rD = SET_VEC_PD(0.14);
129 rE = SET_VEC_PD(0.15);
130 rF = SET_VEC_PD(0.16);
131
132 /* Start PAPI counters */
133 if ( PAPI_start( EventSet ) != PAPI_OK ) {
134 return -1;
135 }
136
137 uint64 c = 0;
138 while (c < iterations){
139 size_t i = 0;
140 while (i < 1000){
141
142 /* The performance critical part */
143 r0 = FMA_VEC_PD(r0,r7,r9);
144 r1 = FMA_VEC_PD(r1,r8,rA);
145 r2 = FMA_VEC_PD(r2,r9,rB);
146 r3 = FMA_VEC_PD(r3,rA,rC);
147 r4 = FMA_VEC_PD(r4,rB,rD);
148 r5 = FMA_VEC_PD(r5,rC,rE);
149
150 r0 = FMA_VEC_PD(r0,rD,rF);
151 r1 = FMA_VEC_PD(r1,rC,rE);
152 r2 = FMA_VEC_PD(r2,rB,rD);
153 r3 = FMA_VEC_PD(r3,rA,rC);
154 r4 = FMA_VEC_PD(r4,r9,rB);
155 r5 = FMA_VEC_PD(r5,r8,rA);
156
157 r0 = FMA_VEC_PD(r0,r7,r9);
158 r1 = FMA_VEC_PD(r1,r8,rA);
159 r2 = FMA_VEC_PD(r2,r9,rB);
160 r3 = FMA_VEC_PD(r3,rA,rC);
161 r4 = FMA_VEC_PD(r4,rB,rD);
162 r5 = FMA_VEC_PD(r5,rC,rE);
163
164 r0 = FMA_VEC_PD(r0,rD,rF);
165 r1 = FMA_VEC_PD(r1,rC,rE);
166 r2 = FMA_VEC_PD(r2,rB,rD);
167 r3 = FMA_VEC_PD(r3,rA,rC);
168 r4 = FMA_VEC_PD(r4,r9,rB);
169 r5 = FMA_VEC_PD(r5,r8,rA);
170
171 i++;
172 }
173 c++;
174 }
175
176 /* Stop PAPI counters */
178
179 /* Use data so that compiler does not eliminate it when using -O2 */
180 r0 = ADD_VEC_PD(r0,r1);
181 r2 = ADD_VEC_PD(r2,r3);
182 r4 = ADD_VEC_PD(r4,r5);
183
184 r0 = ADD_VEC_PD(r0,r6);
185 r2 = ADD_VEC_PD(r2,r4);
186
187 r0 = ADD_VEC_PD(r0,r2);
188
189 double out = 0;
190 DP_VEC_TYPE temp = r0;
191 out += ((double*)&temp)[0];
192 out += ((double*)&temp)[1];
193
194 return out;
195}
196
197/************************************/
198/* Loop unrolling: 48 instructions */
199/************************************/
200static
201double test_dp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){
202 register DP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
203
204 /* Generate starting data */
205 r0 = SET_VEC_PD(0.01);
206 r1 = SET_VEC_PD(0.02);
207 r2 = SET_VEC_PD(0.03);
208 r3 = SET_VEC_PD(0.04);
209 r4 = SET_VEC_PD(0.05);
210 r5 = SET_VEC_PD(0.06);
211 r6 = SET_VEC_PD(0.07);
212 r7 = SET_VEC_PD(0.08);
213 r8 = SET_VEC_PD(0.09);
214 r9 = SET_VEC_PD(0.10);
215 rA = SET_VEC_PD(0.11);
216 rB = SET_VEC_PD(0.12);
217 rC = SET_VEC_PD(0.13);
218 rD = SET_VEC_PD(0.14);
219 rE = SET_VEC_PD(0.15);
220 rF = SET_VEC_PD(0.16);
221
222 /* Start PAPI counters */
223 if ( PAPI_start( EventSet ) != PAPI_OK ) {
224 return -1;
225 }
226
227 uint64 c = 0;
228 while (c < iterations){
229 size_t i = 0;
230 while (i < 1000){
231
232 /* The performance critical part */
233 r0 = FMA_VEC_PD(r0,r7,r9);
234 r1 = FMA_VEC_PD(r1,r8,rA);
235 r2 = FMA_VEC_PD(r2,r9,rB);
236 r3 = FMA_VEC_PD(r3,rA,rC);
237 r4 = FMA_VEC_PD(r4,rB,rD);
238 r5 = FMA_VEC_PD(r5,rC,rE);
239
240 r0 = FMA_VEC_PD(r0,rD,rF);
241 r1 = FMA_VEC_PD(r1,rC,rE);
242 r2 = FMA_VEC_PD(r2,rB,rD);
243 r3 = FMA_VEC_PD(r3,rA,rC);
244 r4 = FMA_VEC_PD(r4,r9,rB);
245 r5 = FMA_VEC_PD(r5,r8,rA);
246
247 r0 = FMA_VEC_PD(r0,r7,r9);
248 r1 = FMA_VEC_PD(r1,r8,rA);
249 r2 = FMA_VEC_PD(r2,r9,rB);
250 r3 = FMA_VEC_PD(r3,rA,rC);
251 r4 = FMA_VEC_PD(r4,rB,rD);
252 r5 = FMA_VEC_PD(r5,rC,rE);
253
254 r0 = FMA_VEC_PD(r0,rD,rF);
255 r1 = FMA_VEC_PD(r1,rC,rE);
256 r2 = FMA_VEC_PD(r2,rB,rD);
257 r3 = FMA_VEC_PD(r3,rA,rC);
258 r4 = FMA_VEC_PD(r4,r9,rB);
259 r5 = FMA_VEC_PD(r5,r8,rA);
260
261 r0 = FMA_VEC_PD(r0,r7,r9);
262 r1 = FMA_VEC_PD(r1,r8,rA);
263 r2 = FMA_VEC_PD(r2,r9,rB);
264 r3 = FMA_VEC_PD(r3,rA,rC);
265 r4 = FMA_VEC_PD(r4,rB,rD);
266 r5 = FMA_VEC_PD(r5,rC,rE);
267
268 r0 = FMA_VEC_PD(r0,rD,rF);
269 r1 = FMA_VEC_PD(r1,rC,rE);
270 r2 = FMA_VEC_PD(r2,rB,rD);
271 r3 = FMA_VEC_PD(r3,rA,rC);
272 r4 = FMA_VEC_PD(r4,r9,rB);
273 r5 = FMA_VEC_PD(r5,r8,rA);
274
275 r0 = FMA_VEC_PD(r0,r7,r9);
276 r1 = FMA_VEC_PD(r1,r8,rA);
277 r2 = FMA_VEC_PD(r2,r9,rB);
278 r3 = FMA_VEC_PD(r3,rA,rC);
279 r4 = FMA_VEC_PD(r4,rB,rD);
280 r5 = FMA_VEC_PD(r5,rC,rE);
281
282 r0 = FMA_VEC_PD(r0,rD,rF);
283 r1 = FMA_VEC_PD(r1,rC,rE);
284 r2 = FMA_VEC_PD(r2,rB,rD);
285 r3 = FMA_VEC_PD(r3,rA,rC);
286 r4 = FMA_VEC_PD(r4,r9,rB);
287 r5 = FMA_VEC_PD(r5,r8,rA);
288
289 i++;
290 }
291 c++;
292 }
293
294 /* Stop PAPI counters */
296
297 /* Use data so that compiler does not eliminate it when using -O2 */
298 r0 = ADD_VEC_PD(r0,r1);
299 r2 = ADD_VEC_PD(r2,r3);
300 r4 = ADD_VEC_PD(r4,r5);
301
302 r0 = ADD_VEC_PD(r0,r6);
303 r2 = ADD_VEC_PD(r2,r4);
304
305 r0 = ADD_VEC_PD(r0,r2);
306
307 double out = 0;
308 DP_VEC_TYPE temp = r0;
309 out += ((double*)&temp)[0];
310 out += ((double*)&temp)[1];
311
312 return out;
313}
314
315static
316void test_dp_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp )
317{
318 double sum = 0.0;
319 double scalar_sum = 0.0;
320
321 if ( instr_per_loop == 12 ) {
322 sum += test_dp_mac_VEC_FMA_12( iterations, EventSet, fp );
323 scalar_sum += test_dp_scalar_VEC_FMA_12( iterations );
324 }
325 else if ( instr_per_loop == 24 ) {
326 sum += test_dp_mac_VEC_FMA_24( iterations, EventSet, fp );
327 scalar_sum += test_dp_scalar_VEC_FMA_24( iterations );
328 }
329 else if ( instr_per_loop == 48 ) {
330 sum += test_dp_mac_VEC_FMA_48( iterations, EventSet, fp );
331 scalar_sum += test_dp_scalar_VEC_FMA_48( iterations );
332 }
333
334 if( sum/2.0 != scalar_sum ) {
335 fprintf(stderr, "FMA: Inconsistent FLOP results detected!\n");
336 }
337}
int i
unsigned long long uint64
Definition: cat_arch.h:3
PAPI_start: Start counting hardware events in an event set.
#define PAPI_OK
Definition: f90papi.h:73
static int EventSet
Definition: init_fini.c:8
static double c[MATRIX_SIZE][MATRIX_SIZE]
Definition: libmsr_basic.c:40
FILE * stderr
static FILE * fp
static double test_dp_mac_VEC_FMA_48(uint64 iterations, int EventSet, FILE *fp)
Definition: vec_fma_dp.c:201
static double test_dp_mac_VEC_FMA_12(uint64 iterations, int EventSet, FILE *fp)
Definition: vec_fma_dp.c:35
static void test_dp_VEC_FMA(int instr_per_loop, uint64 iterations, int EventSet, FILE *fp)
Definition: vec_fma_dp.c:316
static double test_dp_mac_VEC_FMA_24(uint64 iterations, int EventSet, FILE *fp)
Definition: vec_fma_dp.c:111
double test_dp_scalar_VEC_FMA_24(uint64 iterations)
double test_dp_scalar_VEC_FMA_12(uint64 iterations)
double test_dp_scalar_VEC_FMA_48(uint64 iterations)
void papi_stop_and_print(long long theory, int EventSet, FILE *fp)