/*
 * Entry-point signatures for the double-precision vector FMA test.
 *
 * The x86 name is chosen by the preprocessor from X86_VEC_WIDTH_128B,
 * X86_VEC_WIDTH_512B or X86_VEC_WIDTH_256B (tested in that order).
 * Every variant takes the same parameter list:
 *   instr_per_loop - selector compared against 12, 24 and 48 further down
 *   iterations     - loop bound used by the kernels
 *   EventSet, fp   - passed through; their use is not visible in this excerpt
 *
 * NOTE(review): the leading integers on each line appear to be line numbers
 * carried over from a rendered listing, not C tokens. The function bodies,
 * the closing #else/#endif and any guards around the ARM and POWER variants
 * are not visible in this excerpt.
 */
9#if defined(X86_VEC_WIDTH_128B)
10void test_dp_x86_128B_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
13#elif defined(X86_VEC_WIDTH_512B)
14void test_dp_x86_512B_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
17#elif defined(X86_VEC_WIDTH_256B)
18void test_dp_x86_256B_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
/* ARM variant: same parameter list as the x86 variants above. */
22void test_dp_arm_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
/* POWER variant: same parameter list as the x86 variants above. */
26void test_dp_power_VEC_FMA(
int instr_per_loop,
uint64 iterations,
int EventSet, FILE *
fp ) {
/*
 * Kernel fragment issuing 12 FMA_VEC_PD operations per loop pass.
 *
 * NOTE(review): presumably the body of test_dp_mac_VEC_FMA_12 - confirm.
 * The function header, the declarations of `c` and `out`, the loop
 * increment and the closing braces are not visible in this excerpt, so it
 * cannot be determined here whether the ADD_VEC_PD reduction runs inside
 * or after the loop.
 */
36 register DP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Seed all sixteen vectors with distinct constants 0.01 .. 0.16. */
39 r0 = SET_VEC_PD(0.01);
40 r1 = SET_VEC_PD(0.02);
41 r2 = SET_VEC_PD(0.03);
42 r3 = SET_VEC_PD(0.04);
43 r4 = SET_VEC_PD(0.05);
44 r5 = SET_VEC_PD(0.06);
45 r6 = SET_VEC_PD(0.07);
46 r7 = SET_VEC_PD(0.08);
47 r8 = SET_VEC_PD(0.09);
48 r9 = SET_VEC_PD(0.10);
49 rA = SET_VEC_PD(0.11);
50 rB = SET_VEC_PD(0.12);
51 rC = SET_VEC_PD(0.13);
52 rD = SET_VEC_PD(0.14);
53 rE = SET_VEC_PD(0.15);
54 rF = SET_VEC_PD(0.16);
/* Repeat while the counter `c` is below `iterations`. */
62 while (
c < iterations){
/*
 * First group of six: each call overwrites its first argument (r0..r5),
 * so consecutive calls in a group write different registers. The other
 * operands (r7..rF) are only read in the visible lines.
 */
67 r0 = FMA_VEC_PD(r0,r7,r9);
68 r1 = FMA_VEC_PD(r1,r8,rA);
69 r2 = FMA_VEC_PD(r2,r9,rB);
70 r3 = FMA_VEC_PD(r3,rA,rC);
71 r4 = FMA_VEC_PD(r4,rB,rD);
72 r5 = FMA_VEC_PD(r5,rC,rE);
/* Second group of six on the same accumulators, with a different operand pairing. */
74 r0 = FMA_VEC_PD(r0,rD,rF);
75 r1 = FMA_VEC_PD(r1,rC,rE);
76 r2 = FMA_VEC_PD(r2,rB,rD);
77 r3 = FMA_VEC_PD(r3,rA,rC);
78 r4 = FMA_VEC_PD(r4,r9,rB);
79 r5 = FMA_VEC_PD(r5,r8,rA);
/*
 * Reduce the six accumulators into r0 by pairwise adds. r6 is also folded
 * in; no visible line modifies r6 after its initial SET_VEC_PD.
 */
90 r0 = ADD_VEC_PD(r0,r1);
91 r2 = ADD_VEC_PD(r2,r3);
92 r4 = ADD_VEC_PD(r4,r5);
94 r0 = ADD_VEC_PD(r0,r6);
95 r2 = ADD_VEC_PD(r2,r4);
97 r0 = ADD_VEC_PD(r0,r2);
/*
 * Copy the result to an addressable temporary and add its first two
 * double elements to `out`. Only elements [0] and [1] are read, whatever
 * the width of DP_VEC_TYPE.
 */
100 DP_VEC_TYPE temp = r0;
101 out += ((
double*)&temp)[0];
102 out += ((
double*)&temp)[1];
/*
 * Kernel fragment issuing 24 FMA_VEC_PD operations per loop pass.
 *
 * NOTE(review): presumably the body of test_dp_mac_VEC_FMA_24 - confirm.
 * The function header, the declarations of `c` and `out`, the loop
 * increment and the closing braces are not visible in this excerpt, so it
 * cannot be determined here whether the ADD_VEC_PD reduction runs inside
 * or after the loop.
 */
112 register DP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Seed all sixteen vectors with distinct constants 0.01 .. 0.16. */
115 r0 = SET_VEC_PD(0.01);
116 r1 = SET_VEC_PD(0.02);
117 r2 = SET_VEC_PD(0.03);
118 r3 = SET_VEC_PD(0.04);
119 r4 = SET_VEC_PD(0.05);
120 r5 = SET_VEC_PD(0.06);
121 r6 = SET_VEC_PD(0.07);
122 r7 = SET_VEC_PD(0.08);
123 r8 = SET_VEC_PD(0.09);
124 r9 = SET_VEC_PD(0.10);
125 rA = SET_VEC_PD(0.11);
126 rB = SET_VEC_PD(0.12);
127 rC = SET_VEC_PD(0.13);
128 rD = SET_VEC_PD(0.14);
129 rE = SET_VEC_PD(0.15);
130 rF = SET_VEC_PD(0.16);
/* Repeat while the counter `c` is below `iterations`. */
138 while (
c < iterations){
/*
 * Four groups of six FMA_VEC_PD calls follow (24 in total). Each call
 * overwrites its first argument (r0..r5); the other operands (r7..rF) are
 * only read in the visible lines. The groups alternate between two
 * operand pairings.
 */
143 r0 = FMA_VEC_PD(r0,r7,r9);
144 r1 = FMA_VEC_PD(r1,r8,rA);
145 r2 = FMA_VEC_PD(r2,r9,rB);
146 r3 = FMA_VEC_PD(r3,rA,rC);
147 r4 = FMA_VEC_PD(r4,rB,rD);
148 r5 = FMA_VEC_PD(r5,rC,rE);
/* Group 2: second operand pairing. */
150 r0 = FMA_VEC_PD(r0,rD,rF);
151 r1 = FMA_VEC_PD(r1,rC,rE);
152 r2 = FMA_VEC_PD(r2,rB,rD);
153 r3 = FMA_VEC_PD(r3,rA,rC);
154 r4 = FMA_VEC_PD(r4,r9,rB);
155 r5 = FMA_VEC_PD(r5,r8,rA);
/* Group 3: same operand pairing as group 1. */
157 r0 = FMA_VEC_PD(r0,r7,r9);
158 r1 = FMA_VEC_PD(r1,r8,rA);
159 r2 = FMA_VEC_PD(r2,r9,rB);
160 r3 = FMA_VEC_PD(r3,rA,rC);
161 r4 = FMA_VEC_PD(r4,rB,rD);
162 r5 = FMA_VEC_PD(r5,rC,rE);
/* Group 4: same operand pairing as group 2. */
164 r0 = FMA_VEC_PD(r0,rD,rF);
165 r1 = FMA_VEC_PD(r1,rC,rE);
166 r2 = FMA_VEC_PD(r2,rB,rD);
167 r3 = FMA_VEC_PD(r3,rA,rC);
168 r4 = FMA_VEC_PD(r4,r9,rB);
169 r5 = FMA_VEC_PD(r5,r8,rA);
/*
 * Reduce the six accumulators into r0 by pairwise adds. r6 is also folded
 * in; no visible line modifies r6 after its initial SET_VEC_PD.
 */
180 r0 = ADD_VEC_PD(r0,r1);
181 r2 = ADD_VEC_PD(r2,r3);
182 r4 = ADD_VEC_PD(r4,r5);
184 r0 = ADD_VEC_PD(r0,r6);
185 r2 = ADD_VEC_PD(r2,r4);
187 r0 = ADD_VEC_PD(r0,r2);
/*
 * Copy the result to an addressable temporary and add its first two
 * double elements to `out`. Only elements [0] and [1] are read, whatever
 * the width of DP_VEC_TYPE.
 */
190 DP_VEC_TYPE temp = r0;
191 out += ((
double*)&temp)[0];
192 out += ((
double*)&temp)[1];
/*
 * Kernel fragment issuing 48 FMA_VEC_PD operations per loop pass.
 *
 * NOTE(review): presumably the body of test_dp_mac_VEC_FMA_48 - confirm.
 * The function header, the declarations of `c` and `out`, the loop
 * increment and the closing braces are not visible in this excerpt, so it
 * cannot be determined here whether the ADD_VEC_PD reduction runs inside
 * or after the loop.
 */
202 register DP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
/* Seed all sixteen vectors with distinct constants 0.01 .. 0.16. */
205 r0 = SET_VEC_PD(0.01);
206 r1 = SET_VEC_PD(0.02);
207 r2 = SET_VEC_PD(0.03);
208 r3 = SET_VEC_PD(0.04);
209 r4 = SET_VEC_PD(0.05);
210 r5 = SET_VEC_PD(0.06);
211 r6 = SET_VEC_PD(0.07);
212 r7 = SET_VEC_PD(0.08);
213 r8 = SET_VEC_PD(0.09);
214 r9 = SET_VEC_PD(0.10);
215 rA = SET_VEC_PD(0.11);
216 rB = SET_VEC_PD(0.12);
217 rC = SET_VEC_PD(0.13);
218 rD = SET_VEC_PD(0.14);
219 rE = SET_VEC_PD(0.15);
220 rF = SET_VEC_PD(0.16);
/* Repeat while the counter `c` is below `iterations`. */
228 while (
c < iterations){
/*
 * Eight groups of six FMA_VEC_PD calls follow (48 in total). Each call
 * overwrites its first argument (r0..r5); the other operands (r7..rF) are
 * only read in the visible lines. The groups alternate between two
 * operand pairings.
 */
233 r0 = FMA_VEC_PD(r0,r7,r9);
234 r1 = FMA_VEC_PD(r1,r8,rA);
235 r2 = FMA_VEC_PD(r2,r9,rB);
236 r3 = FMA_VEC_PD(r3,rA,rC);
237 r4 = FMA_VEC_PD(r4,rB,rD);
238 r5 = FMA_VEC_PD(r5,rC,rE);
/* Group 2: second operand pairing. */
240 r0 = FMA_VEC_PD(r0,rD,rF);
241 r1 = FMA_VEC_PD(r1,rC,rE);
242 r2 = FMA_VEC_PD(r2,rB,rD);
243 r3 = FMA_VEC_PD(r3,rA,rC);
244 r4 = FMA_VEC_PD(r4,r9,rB);
245 r5 = FMA_VEC_PD(r5,r8,rA);
/* Group 3: same operand pairing as group 1. */
247 r0 = FMA_VEC_PD(r0,r7,r9);
248 r1 = FMA_VEC_PD(r1,r8,rA);
249 r2 = FMA_VEC_PD(r2,r9,rB);
250 r3 = FMA_VEC_PD(r3,rA,rC);
251 r4 = FMA_VEC_PD(r4,rB,rD);
252 r5 = FMA_VEC_PD(r5,rC,rE);
/* Group 4: same operand pairing as group 2. */
254 r0 = FMA_VEC_PD(r0,rD,rF);
255 r1 = FMA_VEC_PD(r1,rC,rE);
256 r2 = FMA_VEC_PD(r2,rB,rD);
257 r3 = FMA_VEC_PD(r3,rA,rC);
258 r4 = FMA_VEC_PD(r4,r9,rB);
259 r5 = FMA_VEC_PD(r5,r8,rA);
/* Group 5: same operand pairing as group 1. */
261 r0 = FMA_VEC_PD(r0,r7,r9);
262 r1 = FMA_VEC_PD(r1,r8,rA);
263 r2 = FMA_VEC_PD(r2,r9,rB);
264 r3 = FMA_VEC_PD(r3,rA,rC);
265 r4 = FMA_VEC_PD(r4,rB,rD);
266 r5 = FMA_VEC_PD(r5,rC,rE);
/* Group 6: same operand pairing as group 2. */
268 r0 = FMA_VEC_PD(r0,rD,rF);
269 r1 = FMA_VEC_PD(r1,rC,rE);
270 r2 = FMA_VEC_PD(r2,rB,rD);
271 r3 = FMA_VEC_PD(r3,rA,rC);
272 r4 = FMA_VEC_PD(r4,r9,rB);
273 r5 = FMA_VEC_PD(r5,r8,rA);
/* Group 7: same operand pairing as group 1. */
275 r0 = FMA_VEC_PD(r0,r7,r9);
276 r1 = FMA_VEC_PD(r1,r8,rA);
277 r2 = FMA_VEC_PD(r2,r9,rB);
278 r3 = FMA_VEC_PD(r3,rA,rC);
279 r4 = FMA_VEC_PD(r4,rB,rD);
280 r5 = FMA_VEC_PD(r5,rC,rE);
/* Group 8: same operand pairing as group 2. */
282 r0 = FMA_VEC_PD(r0,rD,rF);
283 r1 = FMA_VEC_PD(r1,rC,rE);
284 r2 = FMA_VEC_PD(r2,rB,rD);
285 r3 = FMA_VEC_PD(r3,rA,rC);
286 r4 = FMA_VEC_PD(r4,r9,rB);
287 r5 = FMA_VEC_PD(r5,r8,rA);
/*
 * Reduce the six accumulators into r0 by pairwise adds. r6 is also folded
 * in; no visible line modifies r6 after its initial SET_VEC_PD.
 */
298 r0 = ADD_VEC_PD(r0,r1);
299 r2 = ADD_VEC_PD(r2,r3);
300 r4 = ADD_VEC_PD(r4,r5);
302 r0 = ADD_VEC_PD(r0,r6);
303 r2 = ADD_VEC_PD(r2,r4);
305 r0 = ADD_VEC_PD(r0,r2);
/*
 * Copy the result to an addressable temporary and add its first two
 * double elements to `out`. Only elements [0] and [1] are read, whatever
 * the width of DP_VEC_TYPE.
 */
308 DP_VEC_TYPE temp = r0;
309 out += ((
double*)&temp)[0];
310 out += ((
double*)&temp)[1];
/*
 * Result-check fragment: branches on instr_per_loop (12, 24 or 48), then
 * compares half of `sum` against `scalar_sum` and reports a mismatch on
 * stderr.
 *
 * NOTE(review): the branch bodies, the declaration of `sum` and the closing
 * braces are not visible in this excerpt. Presumably each branch runs the
 * matching vector kernel and its scalar reference - confirm.
 */
319 double scalar_sum = 0.0;
321 if ( instr_per_loop == 12 ) {
325 else if ( instr_per_loop == 24 ) {
329 else if ( instr_per_loop == 48 ) {
/*
 * NOTE(review): this is an exact floating-point inequality, so any rounding
 * difference between the two values triggers the message. The division by
 * 2.0 presumably matches the two vector elements the kernels add into their
 * result - confirm.
 */
334 if( sum/2.0 != scalar_sum ) {
335 fprintf(
stderr,
"FMA: Inconsistent FLOP results detected!\n");
unsigned long long uint64
Start counting hardware events in an event set.
static double c[MATRIX_SIZE][MATRIX_SIZE]
static double test_dp_mac_VEC_FMA_48(uint64 iterations, int EventSet, FILE *fp)
static double test_dp_mac_VEC_FMA_12(uint64 iterations, int EventSet, FILE *fp)
static void test_dp_VEC_FMA(int instr_per_loop, uint64 iterations, int EventSet, FILE *fp)
static double test_dp_mac_VEC_FMA_24(uint64 iterations, int EventSet, FILE *fp)
double test_dp_scalar_VEC_FMA_24(uint64 iterations)
double test_dp_scalar_VEC_FMA_12(uint64 iterations)
double test_dp_scalar_VEC_FMA_48(uint64 iterations)
void papi_stop_and_print(long long theory, int EventSet, FILE *fp)