PAPI 7.1.0.0
Loading...
Searching...
No Matches
vec_scalar_verify.c
Go to the documentation of this file.
1#include "vec_scalar_verify.h"
2
/*
 * Write one result row that carries only the theoretical count.
 *
 * The row has the same two-column layout that papi_stop_and_print() emits,
 * but the second column is the constant 0 instead of a counter reading.
 *
 * theory: value printed in the first column.
 * fp:     stream the row is written to.
 */
void papi_stop_and_print_placeholder(long long theory, FILE *fp)
{
    /* Second column is a fixed 0; no counter is read here. */
    (void)fprintf(fp, "%lld 0\n", theory);
}
7
8void papi_stop_and_print(long long theory, int EventSet, FILE *fp)
9{
10 long long flpins = 0;
11 int retval;
12
13 if ( (retval=PAPI_stop(EventSet, &flpins)) != PAPI_OK){
14 fprintf(stderr, "Problem.\n");
15 return;
16 }
17
18 fprintf(fp, "%lld %lld\n", theory, flpins);
19}
20
21#if defined(ARM)
22half test_hp_scalar_VEC_24( uint64 iterations ){
23 register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
24
25 /* Generate starting data */
26 r0 = SET_VEC_SH(0.01);
27 r1 = SET_VEC_SH(0.02);
28 r2 = SET_VEC_SH(0.03);
29 r3 = SET_VEC_SH(0.04);
30 r4 = SET_VEC_SH(0.05);
31 r5 = SET_VEC_SH(0.06);
32 r6 = SET_VEC_SH(0.07);
33 r7 = SET_VEC_SH(0.08);
34 r8 = SET_VEC_SH(0.09);
35 r9 = SET_VEC_SH(0.10);
36 rA = SET_VEC_SH(0.11);
37 rB = SET_VEC_SH(0.12);
38 rC = SET_VEC_SH(0.13);
39 rD = SET_VEC_SH(0.14);
40 rE = SET_VEC_SH(0.15);
41 rF = SET_VEC_SH(0.16);
42
43 uint64 c = 0;
44 while (c < iterations){
45 size_t i = 0;
46 while (i < 1000){
47
48 /* The performance critical part */
49 r0 = MUL_VEC_SH(r0,rC);
50 r1 = ADD_VEC_SH(r1,rD);
51 r2 = MUL_VEC_SH(r2,rE);
52 r3 = ADD_VEC_SH(r3,rF);
53 r4 = MUL_VEC_SH(r4,rC);
54 r5 = ADD_VEC_SH(r5,rD);
55 r6 = MUL_VEC_SH(r6,rE);
56 r7 = ADD_VEC_SH(r7,rF);
57 r8 = MUL_VEC_SH(r8,rC);
58 r9 = ADD_VEC_SH(r9,rD);
59 rA = MUL_VEC_SH(rA,rE);
60 rB = ADD_VEC_SH(rB,rF);
61
62 r0 = ADD_VEC_SH(r0,rF);
63 r1 = MUL_VEC_SH(r1,rE);
64 r2 = ADD_VEC_SH(r2,rD);
65 r3 = MUL_VEC_SH(r3,rC);
66 r4 = ADD_VEC_SH(r4,rF);
67 r5 = MUL_VEC_SH(r5,rE);
68 r6 = ADD_VEC_SH(r6,rD);
69 r7 = MUL_VEC_SH(r7,rC);
70 r8 = ADD_VEC_SH(r8,rF);
71 r9 = MUL_VEC_SH(r9,rE);
72 rA = ADD_VEC_SH(rA,rD);
73 rB = MUL_VEC_SH(rB,rC);
74
75 i++;
76 }
77 c++;
78 }
79
80 /* Use data so that compiler does not eliminate it when using -O2 */
81 r0 = ADD_VEC_SH(r0,r1);
82 r2 = ADD_VEC_SH(r2,r3);
83 r4 = ADD_VEC_SH(r4,r5);
84 r6 = ADD_VEC_SH(r6,r7);
85 r8 = ADD_VEC_SH(r8,r9);
86 rA = ADD_VEC_SH(rA,rB);
87
88 r0 = ADD_VEC_SH(r0,r2);
89 r4 = ADD_VEC_SH(r4,r6);
90 r8 = ADD_VEC_SH(r8,rA);
91
92 r0 = ADD_VEC_SH(r0,r4);
93 r0 = ADD_VEC_SH(r0,r8);
94
95 half out = 0;
96 half temp = r0;
97 out = ADD_VEC_SH(out,temp);
98
99 return out;
100}
101
102half test_hp_scalar_VEC_48( uint64 iterations ){
103 register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
104
105 /* Generate starting data */
106 r0 = SET_VEC_SH(0.01);
107 r1 = SET_VEC_SH(0.02);
108 r2 = SET_VEC_SH(0.03);
109 r3 = SET_VEC_SH(0.04);
110 r4 = SET_VEC_SH(0.05);
111 r5 = SET_VEC_SH(0.06);
112 r6 = SET_VEC_SH(0.07);
113 r7 = SET_VEC_SH(0.08);
114 r8 = SET_VEC_SH(0.09);
115 r9 = SET_VEC_SH(0.10);
116 rA = SET_VEC_SH(0.11);
117 rB = SET_VEC_SH(0.12);
118 rC = SET_VEC_SH(0.13);
119 rD = SET_VEC_SH(0.14);
120 rE = SET_VEC_SH(0.15);
121 rF = SET_VEC_SH(0.16);
122
123 uint64 c = 0;
124 while (c < iterations){
125 size_t i = 0;
126 while (i < 1000){
127
128 /* The performance critical part */
129 r0 = MUL_VEC_SH(r0,rC);
130 r1 = ADD_VEC_SH(r1,rD);
131 r2 = MUL_VEC_SH(r2,rE);
132 r3 = ADD_VEC_SH(r3,rF);
133 r4 = MUL_VEC_SH(r4,rC);
134 r5 = ADD_VEC_SH(r5,rD);
135 r6 = MUL_VEC_SH(r6,rE);
136 r7 = ADD_VEC_SH(r7,rF);
137 r8 = MUL_VEC_SH(r8,rC);
138 r9 = ADD_VEC_SH(r9,rD);
139 rA = MUL_VEC_SH(rA,rE);
140 rB = ADD_VEC_SH(rB,rF);
141
142 r0 = ADD_VEC_SH(r0,rF);
143 r1 = MUL_VEC_SH(r1,rE);
144 r2 = ADD_VEC_SH(r2,rD);
145 r3 = MUL_VEC_SH(r3,rC);
146 r4 = ADD_VEC_SH(r4,rF);
147 r5 = MUL_VEC_SH(r5,rE);
148 r6 = ADD_VEC_SH(r6,rD);
149 r7 = MUL_VEC_SH(r7,rC);
150 r8 = ADD_VEC_SH(r8,rF);
151 r9 = MUL_VEC_SH(r9,rE);
152 rA = ADD_VEC_SH(rA,rD);
153 rB = MUL_VEC_SH(rB,rC);
154
155 r0 = MUL_VEC_SH(r0,rC);
156 r1 = ADD_VEC_SH(r1,rD);
157 r2 = MUL_VEC_SH(r2,rE);
158 r3 = ADD_VEC_SH(r3,rF);
159 r4 = MUL_VEC_SH(r4,rC);
160 r5 = ADD_VEC_SH(r5,rD);
161 r6 = MUL_VEC_SH(r6,rE);
162 r7 = ADD_VEC_SH(r7,rF);
163 r8 = MUL_VEC_SH(r8,rC);
164 r9 = ADD_VEC_SH(r9,rD);
165 rA = MUL_VEC_SH(rA,rE);
166 rB = ADD_VEC_SH(rB,rF);
167
168 r0 = ADD_VEC_SH(r0,rF);
169 r1 = MUL_VEC_SH(r1,rE);
170 r2 = ADD_VEC_SH(r2,rD);
171 r3 = MUL_VEC_SH(r3,rC);
172 r4 = ADD_VEC_SH(r4,rF);
173 r5 = MUL_VEC_SH(r5,rE);
174 r6 = ADD_VEC_SH(r6,rD);
175 r7 = MUL_VEC_SH(r7,rC);
176 r8 = ADD_VEC_SH(r8,rF);
177 r9 = MUL_VEC_SH(r9,rE);
178 rA = ADD_VEC_SH(rA,rD);
179 rB = MUL_VEC_SH(rB,rC);
180
181 i++;
182 }
183 c++;
184 }
185
186 /* Use data so that compiler does not eliminate it when using -O2 */
187 r0 = ADD_VEC_SH(r0,r1);
188 r2 = ADD_VEC_SH(r2,r3);
189 r4 = ADD_VEC_SH(r4,r5);
190 r6 = ADD_VEC_SH(r6,r7);
191 r8 = ADD_VEC_SH(r8,r9);
192 rA = ADD_VEC_SH(rA,rB);
193
194 r0 = ADD_VEC_SH(r0,r2);
195 r4 = ADD_VEC_SH(r4,r6);
196 r8 = ADD_VEC_SH(r8,rA);
197
198 r0 = ADD_VEC_SH(r0,r4);
199 r0 = ADD_VEC_SH(r0,r8);
200
201 half out = 0;
202 half temp = r0;
203 out = ADD_VEC_SH(out,temp);
204
205 return out;
206}
207
208half test_hp_scalar_VEC_96( uint64 iterations ){
209 register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
210
211 /* Generate starting data */
212 r0 = SET_VEC_SH(0.01);
213 r1 = SET_VEC_SH(0.02);
214 r2 = SET_VEC_SH(0.03);
215 r3 = SET_VEC_SH(0.04);
216 r4 = SET_VEC_SH(0.05);
217 r5 = SET_VEC_SH(0.06);
218 r6 = SET_VEC_SH(0.07);
219 r7 = SET_VEC_SH(0.08);
220 r8 = SET_VEC_SH(0.09);
221 r9 = SET_VEC_SH(0.10);
222 rA = SET_VEC_SH(0.11);
223 rB = SET_VEC_SH(0.12);
224 rC = SET_VEC_SH(0.13);
225 rD = SET_VEC_SH(0.14);
226 rE = SET_VEC_SH(0.15);
227 rF = SET_VEC_SH(0.16);
228
229 uint64 c = 0;
230 while (c < iterations){
231 size_t i = 0;
232 while (i < 1000){
233
234 /* The performance critical part */
235 r0 = MUL_VEC_SH(r0,rC);
236 r1 = ADD_VEC_SH(r1,rD);
237 r2 = MUL_VEC_SH(r2,rE);
238 r3 = ADD_VEC_SH(r3,rF);
239 r4 = MUL_VEC_SH(r4,rC);
240 r5 = ADD_VEC_SH(r5,rD);
241 r6 = MUL_VEC_SH(r6,rE);
242 r7 = ADD_VEC_SH(r7,rF);
243 r8 = MUL_VEC_SH(r8,rC);
244 r9 = ADD_VEC_SH(r9,rD);
245 rA = MUL_VEC_SH(rA,rE);
246 rB = ADD_VEC_SH(rB,rF);
247
248 r0 = ADD_VEC_SH(r0,rF);
249 r1 = MUL_VEC_SH(r1,rE);
250 r2 = ADD_VEC_SH(r2,rD);
251 r3 = MUL_VEC_SH(r3,rC);
252 r4 = ADD_VEC_SH(r4,rF);
253 r5 = MUL_VEC_SH(r5,rE);
254 r6 = ADD_VEC_SH(r6,rD);
255 r7 = MUL_VEC_SH(r7,rC);
256 r8 = ADD_VEC_SH(r8,rF);
257 r9 = MUL_VEC_SH(r9,rE);
258 rA = ADD_VEC_SH(rA,rD);
259 rB = MUL_VEC_SH(rB,rC);
260
261 r0 = MUL_VEC_SH(r0,rC);
262 r1 = ADD_VEC_SH(r1,rD);
263 r2 = MUL_VEC_SH(r2,rE);
264 r3 = ADD_VEC_SH(r3,rF);
265 r4 = MUL_VEC_SH(r4,rC);
266 r5 = ADD_VEC_SH(r5,rD);
267 r6 = MUL_VEC_SH(r6,rE);
268 r7 = ADD_VEC_SH(r7,rF);
269 r8 = MUL_VEC_SH(r8,rC);
270 r9 = ADD_VEC_SH(r9,rD);
271 rA = MUL_VEC_SH(rA,rE);
272 rB = ADD_VEC_SH(rB,rF);
273
274 r0 = ADD_VEC_SH(r0,rF);
275 r1 = MUL_VEC_SH(r1,rE);
276 r2 = ADD_VEC_SH(r2,rD);
277 r3 = MUL_VEC_SH(r3,rC);
278 r4 = ADD_VEC_SH(r4,rF);
279 r5 = MUL_VEC_SH(r5,rE);
280 r6 = ADD_VEC_SH(r6,rD);
281 r7 = MUL_VEC_SH(r7,rC);
282 r8 = ADD_VEC_SH(r8,rF);
283 r9 = MUL_VEC_SH(r9,rE);
284 rA = ADD_VEC_SH(rA,rD);
285 rB = MUL_VEC_SH(rB,rC);
286
287 r0 = MUL_VEC_SH(r0,rC);
288 r1 = ADD_VEC_SH(r1,rD);
289 r2 = MUL_VEC_SH(r2,rE);
290 r3 = ADD_VEC_SH(r3,rF);
291 r4 = MUL_VEC_SH(r4,rC);
292 r5 = ADD_VEC_SH(r5,rD);
293 r6 = MUL_VEC_SH(r6,rE);
294 r7 = ADD_VEC_SH(r7,rF);
295 r8 = MUL_VEC_SH(r8,rC);
296 r9 = ADD_VEC_SH(r9,rD);
297 rA = MUL_VEC_SH(rA,rE);
298 rB = ADD_VEC_SH(rB,rF);
299
300 r0 = ADD_VEC_SH(r0,rF);
301 r1 = MUL_VEC_SH(r1,rE);
302 r2 = ADD_VEC_SH(r2,rD);
303 r3 = MUL_VEC_SH(r3,rC);
304 r4 = ADD_VEC_SH(r4,rF);
305 r5 = MUL_VEC_SH(r5,rE);
306 r6 = ADD_VEC_SH(r6,rD);
307 r7 = MUL_VEC_SH(r7,rC);
308 r8 = ADD_VEC_SH(r8,rF);
309 r9 = MUL_VEC_SH(r9,rE);
310 rA = ADD_VEC_SH(rA,rD);
311 rB = MUL_VEC_SH(rB,rC);
312
313 r0 = MUL_VEC_SH(r0,rC);
314 r1 = ADD_VEC_SH(r1,rD);
315 r2 = MUL_VEC_SH(r2,rE);
316 r3 = ADD_VEC_SH(r3,rF);
317 r4 = MUL_VEC_SH(r4,rC);
318 r5 = ADD_VEC_SH(r5,rD);
319 r6 = MUL_VEC_SH(r6,rE);
320 r7 = ADD_VEC_SH(r7,rF);
321 r8 = MUL_VEC_SH(r8,rC);
322 r9 = ADD_VEC_SH(r9,rD);
323 rA = MUL_VEC_SH(rA,rE);
324 rB = ADD_VEC_SH(rB,rF);
325
326 r0 = ADD_VEC_SH(r0,rF);
327 r1 = MUL_VEC_SH(r1,rE);
328 r2 = ADD_VEC_SH(r2,rD);
329 r3 = MUL_VEC_SH(r3,rC);
330 r4 = ADD_VEC_SH(r4,rF);
331 r5 = MUL_VEC_SH(r5,rE);
332 r6 = ADD_VEC_SH(r6,rD);
333 r7 = MUL_VEC_SH(r7,rC);
334 r8 = ADD_VEC_SH(r8,rF);
335 r9 = MUL_VEC_SH(r9,rE);
336 rA = ADD_VEC_SH(rA,rD);
337 rB = MUL_VEC_SH(rB,rC);
338
339 i++;
340 }
341 c++;
342 }
343
344 /* Use data so that compiler does not eliminate it when using -O2 */
345 r0 = ADD_VEC_SH(r0,r1);
346 r2 = ADD_VEC_SH(r2,r3);
347 r4 = ADD_VEC_SH(r4,r5);
348 r6 = ADD_VEC_SH(r6,r7);
349 r8 = ADD_VEC_SH(r8,r9);
350 rA = ADD_VEC_SH(rA,rB);
351
352 r0 = ADD_VEC_SH(r0,r2);
353 r4 = ADD_VEC_SH(r4,r6);
354 r8 = ADD_VEC_SH(r8,rA);
355
356 r0 = ADD_VEC_SH(r0,r4);
357 r0 = ADD_VEC_SH(r0,r8);
358
359 half out = 0;
360 half temp = r0;
361 out = ADD_VEC_SH(out,temp);
362
363 return out;
364}
365
366#else
367float test_hp_scalar_VEC_24( uint64 iterations ){
368
369 (void)iterations;
370 return 0.0;
371}
372
373float test_hp_scalar_VEC_48( uint64 iterations ){
374
375 (void)iterations;
376 return 0.0;
377}
378
379float test_hp_scalar_VEC_96( uint64 iterations ){
380
381 (void)iterations;
382 return 0.0;
383}
384#endif
385
386/************************************/
387/* Loop unrolling: 24 instructions */
388/************************************/
389float test_sp_scalar_VEC_24( uint64 iterations ){
390 register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
391
392 /* Generate starting data */
393 r0 = SET_VEC_SS(0.01);
394 r1 = SET_VEC_SS(0.02);
395 r2 = SET_VEC_SS(0.03);
396 r3 = SET_VEC_SS(0.04);
397 r4 = SET_VEC_SS(0.05);
398 r5 = SET_VEC_SS(0.06);
399 r6 = SET_VEC_SS(0.07);
400 r7 = SET_VEC_SS(0.08);
401 r8 = SET_VEC_SS(0.09);
402 r9 = SET_VEC_SS(0.10);
403 rA = SET_VEC_SS(0.11);
404 rB = SET_VEC_SS(0.12);
405 rC = SET_VEC_SS(0.13);
406 rD = SET_VEC_SS(0.14);
407 rE = SET_VEC_SS(0.15);
408 rF = SET_VEC_SS(0.16);
409
410 uint64 c = 0;
411 while (c < iterations){
412 size_t i = 0;
413 while (i < 1000){
414
415 /* The performance critical part */
416 r0 = MUL_VEC_SS(r0,rC);
417 r1 = ADD_VEC_SS(r1,rD);
418 r2 = MUL_VEC_SS(r2,rE);
419 r3 = ADD_VEC_SS(r3,rF);
420 r4 = MUL_VEC_SS(r4,rC);
421 r5 = ADD_VEC_SS(r5,rD);
422 r6 = MUL_VEC_SS(r6,rE);
423 r7 = ADD_VEC_SS(r7,rF);
424 r8 = MUL_VEC_SS(r8,rC);
425 r9 = ADD_VEC_SS(r9,rD);
426 rA = MUL_VEC_SS(rA,rE);
427 rB = ADD_VEC_SS(rB,rF);
428
429 r0 = ADD_VEC_SS(r0,rF);
430 r1 = MUL_VEC_SS(r1,rE);
431 r2 = ADD_VEC_SS(r2,rD);
432 r3 = MUL_VEC_SS(r3,rC);
433 r4 = ADD_VEC_SS(r4,rF);
434 r5 = MUL_VEC_SS(r5,rE);
435 r6 = ADD_VEC_SS(r6,rD);
436 r7 = MUL_VEC_SS(r7,rC);
437 r8 = ADD_VEC_SS(r8,rF);
438 r9 = MUL_VEC_SS(r9,rE);
439 rA = ADD_VEC_SS(rA,rD);
440 rB = MUL_VEC_SS(rB,rC);
441
442 i++;
443 }
444 c++;
445 }
446
447 /* Use data so that compiler does not eliminate it when using -O2 */
448 r0 = ADD_VEC_SS(r0,r1);
449 r2 = ADD_VEC_SS(r2,r3);
450 r4 = ADD_VEC_SS(r4,r5);
451 r6 = ADD_VEC_SS(r6,r7);
452 r8 = ADD_VEC_SS(r8,r9);
453 rA = ADD_VEC_SS(rA,rB);
454
455 r0 = ADD_VEC_SS(r0,r2);
456 r4 = ADD_VEC_SS(r4,r6);
457 r8 = ADD_VEC_SS(r8,rA);
458
459 r0 = ADD_VEC_SS(r0,r4);
460 r0 = ADD_VEC_SS(r0,r8);
461
462 float out = 0;
463 SP_SCALAR_TYPE temp = r0;
464 out += ((float*)&temp)[0];
465
466 return out;
467}
468
469/************************************/
470/* Loop unrolling: 48 instructions */
471/************************************/
472float test_sp_scalar_VEC_48( uint64 iterations ){
473 register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
474
475 /* Generate starting data */
476 r0 = SET_VEC_SS(0.01);
477 r1 = SET_VEC_SS(0.02);
478 r2 = SET_VEC_SS(0.03);
479 r3 = SET_VEC_SS(0.04);
480 r4 = SET_VEC_SS(0.05);
481 r5 = SET_VEC_SS(0.06);
482 r6 = SET_VEC_SS(0.07);
483 r7 = SET_VEC_SS(0.08);
484 r8 = SET_VEC_SS(0.09);
485 r9 = SET_VEC_SS(0.10);
486 rA = SET_VEC_SS(0.11);
487 rB = SET_VEC_SS(0.12);
488 rC = SET_VEC_SS(0.13);
489 rD = SET_VEC_SS(0.14);
490 rE = SET_VEC_SS(0.15);
491 rF = SET_VEC_SS(0.16);
492
493 uint64 c = 0;
494 while (c < iterations){
495 size_t i = 0;
496 while (i < 1000){
497
498 /* The performance critical part */
499 r0 = MUL_VEC_SS(r0,rC);
500 r1 = ADD_VEC_SS(r1,rD);
501 r2 = MUL_VEC_SS(r2,rE);
502 r3 = ADD_VEC_SS(r3,rF);
503 r4 = MUL_VEC_SS(r4,rC);
504 r5 = ADD_VEC_SS(r5,rD);
505 r6 = MUL_VEC_SS(r6,rE);
506 r7 = ADD_VEC_SS(r7,rF);
507 r8 = MUL_VEC_SS(r8,rC);
508 r9 = ADD_VEC_SS(r9,rD);
509 rA = MUL_VEC_SS(rA,rE);
510 rB = ADD_VEC_SS(rB,rF);
511
512 r0 = ADD_VEC_SS(r0,rF);
513 r1 = MUL_VEC_SS(r1,rE);
514 r2 = ADD_VEC_SS(r2,rD);
515 r3 = MUL_VEC_SS(r3,rC);
516 r4 = ADD_VEC_SS(r4,rF);
517 r5 = MUL_VEC_SS(r5,rE);
518 r6 = ADD_VEC_SS(r6,rD);
519 r7 = MUL_VEC_SS(r7,rC);
520 r8 = ADD_VEC_SS(r8,rF);
521 r9 = MUL_VEC_SS(r9,rE);
522 rA = ADD_VEC_SS(rA,rD);
523 rB = MUL_VEC_SS(rB,rC);
524
525 r0 = MUL_VEC_SS(r0,rC);
526 r1 = ADD_VEC_SS(r1,rD);
527 r2 = MUL_VEC_SS(r2,rE);
528 r3 = ADD_VEC_SS(r3,rF);
529 r4 = MUL_VEC_SS(r4,rC);
530 r5 = ADD_VEC_SS(r5,rD);
531 r6 = MUL_VEC_SS(r6,rE);
532 r7 = ADD_VEC_SS(r7,rF);
533 r8 = MUL_VEC_SS(r8,rC);
534 r9 = ADD_VEC_SS(r9,rD);
535 rA = MUL_VEC_SS(rA,rE);
536 rB = ADD_VEC_SS(rB,rF);
537
538 r0 = ADD_VEC_SS(r0,rF);
539 r1 = MUL_VEC_SS(r1,rE);
540 r2 = ADD_VEC_SS(r2,rD);
541 r3 = MUL_VEC_SS(r3,rC);
542 r4 = ADD_VEC_SS(r4,rF);
543 r5 = MUL_VEC_SS(r5,rE);
544 r6 = ADD_VEC_SS(r6,rD);
545 r7 = MUL_VEC_SS(r7,rC);
546 r8 = ADD_VEC_SS(r8,rF);
547 r9 = MUL_VEC_SS(r9,rE);
548 rA = ADD_VEC_SS(rA,rD);
549 rB = MUL_VEC_SS(rB,rC);
550
551 i++;
552 }
553 c++;
554 }
555
556 /* Use data so that compiler does not eliminate it when using -O2 */
557 r0 = ADD_VEC_SS(r0,r1);
558 r2 = ADD_VEC_SS(r2,r3);
559 r4 = ADD_VEC_SS(r4,r5);
560 r6 = ADD_VEC_SS(r6,r7);
561 r8 = ADD_VEC_SS(r8,r9);
562 rA = ADD_VEC_SS(rA,rB);
563
564 r0 = ADD_VEC_SS(r0,r2);
565 r4 = ADD_VEC_SS(r4,r6);
566 r8 = ADD_VEC_SS(r8,rA);
567
568 r0 = ADD_VEC_SS(r0,r4);
569 r0 = ADD_VEC_SS(r0,r8);
570
571 float out = 0;
572 SP_SCALAR_TYPE temp = r0;
573 out += ((float*)&temp)[0];
574
575 return out;
576}
577
578/************************************/
579/* Loop unrolling: 96 instructions */
580/************************************/
581float test_sp_scalar_VEC_96( uint64 iterations ){
582 register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
583
584 /* Generate starting data */
585 r0 = SET_VEC_SS(0.01);
586 r1 = SET_VEC_SS(0.02);
587 r2 = SET_VEC_SS(0.03);
588 r3 = SET_VEC_SS(0.04);
589 r4 = SET_VEC_SS(0.05);
590 r5 = SET_VEC_SS(0.06);
591 r6 = SET_VEC_SS(0.07);
592 r7 = SET_VEC_SS(0.08);
593 r8 = SET_VEC_SS(0.09);
594 r9 = SET_VEC_SS(0.10);
595 rA = SET_VEC_SS(0.11);
596 rB = SET_VEC_SS(0.12);
597 rC = SET_VEC_SS(0.13);
598 rD = SET_VEC_SS(0.14);
599 rE = SET_VEC_SS(0.15);
600 rF = SET_VEC_SS(0.16);
601
602 uint64 c = 0;
603 while (c < iterations){
604 size_t i = 0;
605 while (i < 1000){
606
607 /* The performance critical part */
608 r0 = MUL_VEC_SS(r0,rC);
609 r1 = ADD_VEC_SS(r1,rD);
610 r2 = MUL_VEC_SS(r2,rE);
611 r3 = ADD_VEC_SS(r3,rF);
612 r4 = MUL_VEC_SS(r4,rC);
613 r5 = ADD_VEC_SS(r5,rD);
614 r6 = MUL_VEC_SS(r6,rE);
615 r7 = ADD_VEC_SS(r7,rF);
616 r8 = MUL_VEC_SS(r8,rC);
617 r9 = ADD_VEC_SS(r9,rD);
618 rA = MUL_VEC_SS(rA,rE);
619 rB = ADD_VEC_SS(rB,rF);
620
621 r0 = ADD_VEC_SS(r0,rF);
622 r1 = MUL_VEC_SS(r1,rE);
623 r2 = ADD_VEC_SS(r2,rD);
624 r3 = MUL_VEC_SS(r3,rC);
625 r4 = ADD_VEC_SS(r4,rF);
626 r5 = MUL_VEC_SS(r5,rE);
627 r6 = ADD_VEC_SS(r6,rD);
628 r7 = MUL_VEC_SS(r7,rC);
629 r8 = ADD_VEC_SS(r8,rF);
630 r9 = MUL_VEC_SS(r9,rE);
631 rA = ADD_VEC_SS(rA,rD);
632 rB = MUL_VEC_SS(rB,rC);
633
634 r0 = MUL_VEC_SS(r0,rC);
635 r1 = ADD_VEC_SS(r1,rD);
636 r2 = MUL_VEC_SS(r2,rE);
637 r3 = ADD_VEC_SS(r3,rF);
638 r4 = MUL_VEC_SS(r4,rC);
639 r5 = ADD_VEC_SS(r5,rD);
640 r6 = MUL_VEC_SS(r6,rE);
641 r7 = ADD_VEC_SS(r7,rF);
642 r8 = MUL_VEC_SS(r8,rC);
643 r9 = ADD_VEC_SS(r9,rD);
644 rA = MUL_VEC_SS(rA,rE);
645 rB = ADD_VEC_SS(rB,rF);
646
647 r0 = ADD_VEC_SS(r0,rF);
648 r1 = MUL_VEC_SS(r1,rE);
649 r2 = ADD_VEC_SS(r2,rD);
650 r3 = MUL_VEC_SS(r3,rC);
651 r4 = ADD_VEC_SS(r4,rF);
652 r5 = MUL_VEC_SS(r5,rE);
653 r6 = ADD_VEC_SS(r6,rD);
654 r7 = MUL_VEC_SS(r7,rC);
655 r8 = ADD_VEC_SS(r8,rF);
656 r9 = MUL_VEC_SS(r9,rE);
657 rA = ADD_VEC_SS(rA,rD);
658 rB = MUL_VEC_SS(rB,rC);
659
660 r0 = MUL_VEC_SS(r0,rC);
661 r1 = ADD_VEC_SS(r1,rD);
662 r2 = MUL_VEC_SS(r2,rE);
663 r3 = ADD_VEC_SS(r3,rF);
664 r4 = MUL_VEC_SS(r4,rC);
665 r5 = ADD_VEC_SS(r5,rD);
666 r6 = MUL_VEC_SS(r6,rE);
667 r7 = ADD_VEC_SS(r7,rF);
668 r8 = MUL_VEC_SS(r8,rC);
669 r9 = ADD_VEC_SS(r9,rD);
670 rA = MUL_VEC_SS(rA,rE);
671 rB = ADD_VEC_SS(rB,rF);
672
673 r0 = ADD_VEC_SS(r0,rF);
674 r1 = MUL_VEC_SS(r1,rE);
675 r2 = ADD_VEC_SS(r2,rD);
676 r3 = MUL_VEC_SS(r3,rC);
677 r4 = ADD_VEC_SS(r4,rF);
678 r5 = MUL_VEC_SS(r5,rE);
679 r6 = ADD_VEC_SS(r6,rD);
680 r7 = MUL_VEC_SS(r7,rC);
681 r8 = ADD_VEC_SS(r8,rF);
682 r9 = MUL_VEC_SS(r9,rE);
683 rA = ADD_VEC_SS(rA,rD);
684 rB = MUL_VEC_SS(rB,rC);
685
686 r0 = MUL_VEC_SS(r0,rC);
687 r1 = ADD_VEC_SS(r1,rD);
688 r2 = MUL_VEC_SS(r2,rE);
689 r3 = ADD_VEC_SS(r3,rF);
690 r4 = MUL_VEC_SS(r4,rC);
691 r5 = ADD_VEC_SS(r5,rD);
692 r6 = MUL_VEC_SS(r6,rE);
693 r7 = ADD_VEC_SS(r7,rF);
694 r8 = MUL_VEC_SS(r8,rC);
695 r9 = ADD_VEC_SS(r9,rD);
696 rA = MUL_VEC_SS(rA,rE);
697 rB = ADD_VEC_SS(rB,rF);
698
699 r0 = ADD_VEC_SS(r0,rF);
700 r1 = MUL_VEC_SS(r1,rE);
701 r2 = ADD_VEC_SS(r2,rD);
702 r3 = MUL_VEC_SS(r3,rC);
703 r4 = ADD_VEC_SS(r4,rF);
704 r5 = MUL_VEC_SS(r5,rE);
705 r6 = ADD_VEC_SS(r6,rD);
706 r7 = MUL_VEC_SS(r7,rC);
707 r8 = ADD_VEC_SS(r8,rF);
708 r9 = MUL_VEC_SS(r9,rE);
709 rA = ADD_VEC_SS(rA,rD);
710 rB = MUL_VEC_SS(rB,rC);
711
712 i++;
713 }
714 c++;
715 }
716
717 /* Use data so that compiler does not eliminate it when using -O2 */
718 r0 = ADD_VEC_SS(r0,r1);
719 r2 = ADD_VEC_SS(r2,r3);
720 r4 = ADD_VEC_SS(r4,r5);
721 r6 = ADD_VEC_SS(r6,r7);
722 r8 = ADD_VEC_SS(r8,r9);
723 rA = ADD_VEC_SS(rA,rB);
724
725 r0 = ADD_VEC_SS(r0,r2);
726 r4 = ADD_VEC_SS(r4,r6);
727 r8 = ADD_VEC_SS(r8,rA);
728
729 r0 = ADD_VEC_SS(r0,r4);
730 r0 = ADD_VEC_SS(r0,r8);
731
732 float out = 0;
733 SP_SCALAR_TYPE temp = r0;
734 out += ((float*)&temp)[0];
735
736 return out;
737}
738
739/************************************/
740/* Loop unrolling: 24 instructions */
741/************************************/
742double test_dp_scalar_VEC_24( uint64 iterations ){
743 register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
744
745 /* Generate starting data */
746 r0 = SET_VEC_SD(0.01);
747 r1 = SET_VEC_SD(0.02);
748 r2 = SET_VEC_SD(0.03);
749 r3 = SET_VEC_SD(0.04);
750 r4 = SET_VEC_SD(0.05);
751 r5 = SET_VEC_SD(0.06);
752 r6 = SET_VEC_SD(0.07);
753 r7 = SET_VEC_SD(0.08);
754 r8 = SET_VEC_SD(0.09);
755 r9 = SET_VEC_SD(0.10);
756 rA = SET_VEC_SD(0.11);
757 rB = SET_VEC_SD(0.12);
758 rC = SET_VEC_SD(0.13);
759 rD = SET_VEC_SD(0.14);
760 rE = SET_VEC_SD(0.15);
761 rF = SET_VEC_SD(0.16);
762
763 uint64 c = 0;
764 while (c < iterations){
765 size_t i = 0;
766 while (i < 1000){
767
768 /* The performance critical part */
769 r0 = MUL_VEC_SD(r0,rC);
770 r1 = ADD_VEC_SD(r1,rD);
771 r2 = MUL_VEC_SD(r2,rE);
772 r3 = ADD_VEC_SD(r3,rF);
773 r4 = MUL_VEC_SD(r4,rC);
774 r5 = ADD_VEC_SD(r5,rD);
775 r6 = MUL_VEC_SD(r6,rE);
776 r7 = ADD_VEC_SD(r7,rF);
777 r8 = MUL_VEC_SD(r8,rC);
778 r9 = ADD_VEC_SD(r9,rD);
779 rA = MUL_VEC_SD(rA,rE);
780 rB = ADD_VEC_SD(rB,rF);
781
782 r0 = ADD_VEC_SD(r0,rF);
783 r1 = MUL_VEC_SD(r1,rE);
784 r2 = ADD_VEC_SD(r2,rD);
785 r3 = MUL_VEC_SD(r3,rC);
786 r4 = ADD_VEC_SD(r4,rF);
787 r5 = MUL_VEC_SD(r5,rE);
788 r6 = ADD_VEC_SD(r6,rD);
789 r7 = MUL_VEC_SD(r7,rC);
790 r8 = ADD_VEC_SD(r8,rF);
791 r9 = MUL_VEC_SD(r9,rE);
792 rA = ADD_VEC_SD(rA,rD);
793 rB = MUL_VEC_SD(rB,rC);
794
795 i++;
796 }
797 c++;
798 }
799
800 /* Use data so that compiler does not eliminate it when using -O2 */
801 r0 = ADD_VEC_SD(r0,r1);
802 r2 = ADD_VEC_SD(r2,r3);
803 r4 = ADD_VEC_SD(r4,r5);
804 r6 = ADD_VEC_SD(r6,r7);
805 r8 = ADD_VEC_SD(r8,r9);
806 rA = ADD_VEC_SD(rA,rB);
807
808 r0 = ADD_VEC_SD(r0,r2);
809 r4 = ADD_VEC_SD(r4,r6);
810 r8 = ADD_VEC_SD(r8,rA);
811
812 r0 = ADD_VEC_SD(r0,r4);
813 r0 = ADD_VEC_SD(r0,r8);
814
815 double out = 0;
816 DP_SCALAR_TYPE temp = r0;
817 out += ((double*)&temp)[0];
818
819 return out;
820}
821
822/************************************/
823/* Loop unrolling: 48 instructions */
824/************************************/
825double test_dp_scalar_VEC_48( uint64 iterations ){
826 register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
827
828 /* Generate starting data */
829 r0 = SET_VEC_SD(0.01);
830 r1 = SET_VEC_SD(0.02);
831 r2 = SET_VEC_SD(0.03);
832 r3 = SET_VEC_SD(0.04);
833 r4 = SET_VEC_SD(0.05);
834 r5 = SET_VEC_SD(0.06);
835 r6 = SET_VEC_SD(0.07);
836 r7 = SET_VEC_SD(0.08);
837 r8 = SET_VEC_SD(0.09);
838 r9 = SET_VEC_SD(0.10);
839 rA = SET_VEC_SD(0.11);
840 rB = SET_VEC_SD(0.12);
841 rC = SET_VEC_SD(0.13);
842 rD = SET_VEC_SD(0.14);
843 rE = SET_VEC_SD(0.15);
844 rF = SET_VEC_SD(0.16);
845
846 uint64 c = 0;
847 while (c < iterations){
848 size_t i = 0;
849 while (i < 1000){
850
851 /* The performance critical part */
852 r0 = MUL_VEC_SD(r0,rC);
853 r1 = ADD_VEC_SD(r1,rD);
854 r2 = MUL_VEC_SD(r2,rE);
855 r3 = ADD_VEC_SD(r3,rF);
856 r4 = MUL_VEC_SD(r4,rC);
857 r5 = ADD_VEC_SD(r5,rD);
858 r6 = MUL_VEC_SD(r6,rE);
859 r7 = ADD_VEC_SD(r7,rF);
860 r8 = MUL_VEC_SD(r8,rC);
861 r9 = ADD_VEC_SD(r9,rD);
862 rA = MUL_VEC_SD(rA,rE);
863 rB = ADD_VEC_SD(rB,rF);
864
865 r0 = ADD_VEC_SD(r0,rF);
866 r1 = MUL_VEC_SD(r1,rE);
867 r2 = ADD_VEC_SD(r2,rD);
868 r3 = MUL_VEC_SD(r3,rC);
869 r4 = ADD_VEC_SD(r4,rF);
870 r5 = MUL_VEC_SD(r5,rE);
871 r6 = ADD_VEC_SD(r6,rD);
872 r7 = MUL_VEC_SD(r7,rC);
873 r8 = ADD_VEC_SD(r8,rF);
874 r9 = MUL_VEC_SD(r9,rE);
875 rA = ADD_VEC_SD(rA,rD);
876 rB = MUL_VEC_SD(rB,rC);
877
878 r0 = MUL_VEC_SD(r0,rC);
879 r1 = ADD_VEC_SD(r1,rD);
880 r2 = MUL_VEC_SD(r2,rE);
881 r3 = ADD_VEC_SD(r3,rF);
882 r4 = MUL_VEC_SD(r4,rC);
883 r5 = ADD_VEC_SD(r5,rD);
884 r6 = MUL_VEC_SD(r6,rE);
885 r7 = ADD_VEC_SD(r7,rF);
886 r8 = MUL_VEC_SD(r8,rC);
887 r9 = ADD_VEC_SD(r9,rD);
888 rA = MUL_VEC_SD(rA,rE);
889 rB = ADD_VEC_SD(rB,rF);
890
891 r0 = ADD_VEC_SD(r0,rF);
892 r1 = MUL_VEC_SD(r1,rE);
893 r2 = ADD_VEC_SD(r2,rD);
894 r3 = MUL_VEC_SD(r3,rC);
895 r4 = ADD_VEC_SD(r4,rF);
896 r5 = MUL_VEC_SD(r5,rE);
897 r6 = ADD_VEC_SD(r6,rD);
898 r7 = MUL_VEC_SD(r7,rC);
899 r8 = ADD_VEC_SD(r8,rF);
900 r9 = MUL_VEC_SD(r9,rE);
901 rA = ADD_VEC_SD(rA,rD);
902 rB = MUL_VEC_SD(rB,rC);
903
904 i++;
905 }
906 c++;
907 }
908
909 /* Use data so that compiler does not eliminate it when using -O2 */
910 r0 = ADD_VEC_SD(r0,r1);
911 r2 = ADD_VEC_SD(r2,r3);
912 r4 = ADD_VEC_SD(r4,r5);
913 r6 = ADD_VEC_SD(r6,r7);
914 r8 = ADD_VEC_SD(r8,r9);
915 rA = ADD_VEC_SD(rA,rB);
916
917 r0 = ADD_VEC_SD(r0,r2);
918 r4 = ADD_VEC_SD(r4,r6);
919 r8 = ADD_VEC_SD(r8,rA);
920
921 r0 = ADD_VEC_SD(r0,r4);
922 r0 = ADD_VEC_SD(r0,r8);
923
924 double out = 0;
925 DP_SCALAR_TYPE temp = r0;
926 out += ((double*)&temp)[0];
927
928 return out;
929}
930
931/************************************/
932/* Loop unrolling: 96 instructions */
933/************************************/
934double test_dp_scalar_VEC_96( uint64 iterations ){
935 register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
936
937 /* Generate starting data */
938 r0 = SET_VEC_SD(0.01);
939 r1 = SET_VEC_SD(0.02);
940 r2 = SET_VEC_SD(0.03);
941 r3 = SET_VEC_SD(0.04);
942 r4 = SET_VEC_SD(0.05);
943 r5 = SET_VEC_SD(0.06);
944 r6 = SET_VEC_SD(0.07);
945 r7 = SET_VEC_SD(0.08);
946 r8 = SET_VEC_SD(0.09);
947 r9 = SET_VEC_SD(0.10);
948 rA = SET_VEC_SD(0.11);
949 rB = SET_VEC_SD(0.12);
950 rC = SET_VEC_SD(0.13);
951 rD = SET_VEC_SD(0.14);
952 rE = SET_VEC_SD(0.15);
953 rF = SET_VEC_SD(0.16);
954
955 uint64 c = 0;
956 while (c < iterations){
957 size_t i = 0;
958 while (i < 1000){
959
960 /* The performance critical part */
961 r0 = MUL_VEC_SD(r0,rC);
962 r1 = ADD_VEC_SD(r1,rD);
963 r2 = MUL_VEC_SD(r2,rE);
964 r3 = ADD_VEC_SD(r3,rF);
965 r4 = MUL_VEC_SD(r4,rC);
966 r5 = ADD_VEC_SD(r5,rD);
967 r6 = MUL_VEC_SD(r6,rE);
968 r7 = ADD_VEC_SD(r7,rF);
969 r8 = MUL_VEC_SD(r8,rC);
970 r9 = ADD_VEC_SD(r9,rD);
971 rA = MUL_VEC_SD(rA,rE);
972 rB = ADD_VEC_SD(rB,rF);
973
974 r0 = ADD_VEC_SD(r0,rF);
975 r1 = MUL_VEC_SD(r1,rE);
976 r2 = ADD_VEC_SD(r2,rD);
977 r3 = MUL_VEC_SD(r3,rC);
978 r4 = ADD_VEC_SD(r4,rF);
979 r5 = MUL_VEC_SD(r5,rE);
980 r6 = ADD_VEC_SD(r6,rD);
981 r7 = MUL_VEC_SD(r7,rC);
982 r8 = ADD_VEC_SD(r8,rF);
983 r9 = MUL_VEC_SD(r9,rE);
984 rA = ADD_VEC_SD(rA,rD);
985 rB = MUL_VEC_SD(rB,rC);
986
987 r0 = MUL_VEC_SD(r0,rC);
988 r1 = ADD_VEC_SD(r1,rD);
989 r2 = MUL_VEC_SD(r2,rE);
990 r3 = ADD_VEC_SD(r3,rF);
991 r4 = MUL_VEC_SD(r4,rC);
992 r5 = ADD_VEC_SD(r5,rD);
993 r6 = MUL_VEC_SD(r6,rE);
994 r7 = ADD_VEC_SD(r7,rF);
995 r8 = MUL_VEC_SD(r8,rC);
996 r9 = ADD_VEC_SD(r9,rD);
997 rA = MUL_VEC_SD(rA,rE);
998 rB = ADD_VEC_SD(rB,rF);
999
1000 r0 = ADD_VEC_SD(r0,rF);
1001 r1 = MUL_VEC_SD(r1,rE);
1002 r2 = ADD_VEC_SD(r2,rD);
1003 r3 = MUL_VEC_SD(r3,rC);
1004 r4 = ADD_VEC_SD(r4,rF);
1005 r5 = MUL_VEC_SD(r5,rE);
1006 r6 = ADD_VEC_SD(r6,rD);
1007 r7 = MUL_VEC_SD(r7,rC);
1008 r8 = ADD_VEC_SD(r8,rF);
1009 r9 = MUL_VEC_SD(r9,rE);
1010 rA = ADD_VEC_SD(rA,rD);
1011 rB = MUL_VEC_SD(rB,rC);
1012
1013 r0 = MUL_VEC_SD(r0,rC);
1014 r1 = ADD_VEC_SD(r1,rD);
1015 r2 = MUL_VEC_SD(r2,rE);
1016 r3 = ADD_VEC_SD(r3,rF);
1017 r4 = MUL_VEC_SD(r4,rC);
1018 r5 = ADD_VEC_SD(r5,rD);
1019 r6 = MUL_VEC_SD(r6,rE);
1020 r7 = ADD_VEC_SD(r7,rF);
1021 r8 = MUL_VEC_SD(r8,rC);
1022 r9 = ADD_VEC_SD(r9,rD);
1023 rA = MUL_VEC_SD(rA,rE);
1024 rB = ADD_VEC_SD(rB,rF);
1025
1026 r0 = ADD_VEC_SD(r0,rF);
1027 r1 = MUL_VEC_SD(r1,rE);
1028 r2 = ADD_VEC_SD(r2,rD);
1029 r3 = MUL_VEC_SD(r3,rC);
1030 r4 = ADD_VEC_SD(r4,rF);
1031 r5 = MUL_VEC_SD(r5,rE);
1032 r6 = ADD_VEC_SD(r6,rD);
1033 r7 = MUL_VEC_SD(r7,rC);
1034 r8 = ADD_VEC_SD(r8,rF);
1035 r9 = MUL_VEC_SD(r9,rE);
1036 rA = ADD_VEC_SD(rA,rD);
1037 rB = MUL_VEC_SD(rB,rC);
1038
1039 r0 = MUL_VEC_SD(r0,rC);
1040 r1 = ADD_VEC_SD(r1,rD);
1041 r2 = MUL_VEC_SD(r2,rE);
1042 r3 = ADD_VEC_SD(r3,rF);
1043 r4 = MUL_VEC_SD(r4,rC);
1044 r5 = ADD_VEC_SD(r5,rD);
1045 r6 = MUL_VEC_SD(r6,rE);
1046 r7 = ADD_VEC_SD(r7,rF);
1047 r8 = MUL_VEC_SD(r8,rC);
1048 r9 = ADD_VEC_SD(r9,rD);
1049 rA = MUL_VEC_SD(rA,rE);
1050 rB = ADD_VEC_SD(rB,rF);
1051
1052 r0 = ADD_VEC_SD(r0,rF);
1053 r1 = MUL_VEC_SD(r1,rE);
1054 r2 = ADD_VEC_SD(r2,rD);
1055 r3 = MUL_VEC_SD(r3,rC);
1056 r4 = ADD_VEC_SD(r4,rF);
1057 r5 = MUL_VEC_SD(r5,rE);
1058 r6 = ADD_VEC_SD(r6,rD);
1059 r7 = MUL_VEC_SD(r7,rC);
1060 r8 = ADD_VEC_SD(r8,rF);
1061 r9 = MUL_VEC_SD(r9,rE);
1062 rA = ADD_VEC_SD(rA,rD);
1063 rB = MUL_VEC_SD(rB,rC);
1064
1065 i++;
1066 }
1067 c++;
1068 }
1069
1070 /* Use data so that compiler does not eliminate it when using -O2 */
1071 r0 = ADD_VEC_SD(r0,r1);
1072 r2 = ADD_VEC_SD(r2,r3);
1073 r4 = ADD_VEC_SD(r4,r5);
1074 r6 = ADD_VEC_SD(r6,r7);
1075 r8 = ADD_VEC_SD(r8,r9);
1076 rA = ADD_VEC_SD(rA,rB);
1077
1078 r0 = ADD_VEC_SD(r0,r2);
1079 r4 = ADD_VEC_SD(r4,r6);
1080 r8 = ADD_VEC_SD(r8,rA);
1081
1082 r0 = ADD_VEC_SD(r0,r4);
1083 r0 = ADD_VEC_SD(r0,r8);
1084
1085 double out = 0;
1086 DP_SCALAR_TYPE temp = r0;
1087 out += ((double*)&temp)[0];
1088
1089 return out;
1090}
1091
1092#if defined(ARM)
1093half test_hp_scalar_VEC_FMA_12( uint64 iterations ){
1094 register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
1095
1096 /* Generate starting data */
1097 r0 = SET_VEC_SH(0.01);
1098 r1 = SET_VEC_SH(0.02);
1099 r2 = SET_VEC_SH(0.03);
1100 r3 = SET_VEC_SH(0.04);
1101 r4 = SET_VEC_SH(0.05);
1102 r5 = SET_VEC_SH(0.06);
1103 r6 = SET_VEC_SH(0.07);
1104 r7 = SET_VEC_SH(0.08);
1105 r8 = SET_VEC_SH(0.09);
1106 r9 = SET_VEC_SH(0.10);
1107 rA = SET_VEC_SH(0.11);
1108 rB = SET_VEC_SH(0.12);
1109 rC = SET_VEC_SH(0.13);
1110 rD = SET_VEC_SH(0.14);
1111 rE = SET_VEC_SH(0.15);
1112 rF = SET_VEC_SH(0.16);
1113
1114 uint64 c = 0;
1115 while (c < iterations){
1116 size_t i = 0;
1117 while (i < 1000){
1118
1119 /* The performance critical part */
1120 FMA_VEC_SH(r0,r0,r7,r9);
1121 FMA_VEC_SH(r1,r1,r8,rA);
1122 FMA_VEC_SH(r2,r2,r9,rB);
1123 FMA_VEC_SH(r3,r3,rA,rC);
1124 FMA_VEC_SH(r4,r4,rB,rD);
1125 FMA_VEC_SH(r5,r5,rC,rE);
1126
1127 FMA_VEC_SH(r0,r0,rD,rF);
1128 FMA_VEC_SH(r1,r1,rC,rE);
1129 FMA_VEC_SH(r2,r2,rB,rD);
1130 FMA_VEC_SH(r3,r3,rA,rC);
1131 FMA_VEC_SH(r4,r4,r9,rB);
1132 FMA_VEC_SH(r5,r5,r8,rA);
1133
1134 i++;
1135 }
1136 c++;
1137 }
1138
1139 /* Use data so that compiler does not eliminate it when using -O2 */
1140 r0 = ADD_VEC_SH(r0,r1);
1141 r2 = ADD_VEC_SH(r2,r3);
1142 r4 = ADD_VEC_SH(r4,r5);
1143
1144 r0 = ADD_VEC_SH(r0,r6);
1145 r2 = ADD_VEC_SH(r2,r4);
1146
1147 r0 = ADD_VEC_SH(r0,r2);
1148
1149 half out = 0;
1150 half temp = r0;
1151 out = ADD_VEC_SH(out,temp);
1152
1153 return out;
1154}
1155
1156half test_hp_scalar_VEC_FMA_24( uint64 iterations ){
1157 register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
1158
1159 /* Generate starting data */
1160 r0 = SET_VEC_SH(0.01);
1161 r1 = SET_VEC_SH(0.02);
1162 r2 = SET_VEC_SH(0.03);
1163 r3 = SET_VEC_SH(0.04);
1164 r4 = SET_VEC_SH(0.05);
1165 r5 = SET_VEC_SH(0.06);
1166 r6 = SET_VEC_SH(0.07);
1167 r7 = SET_VEC_SH(0.08);
1168 r8 = SET_VEC_SH(0.09);
1169 r9 = SET_VEC_SH(0.10);
1170 rA = SET_VEC_SH(0.11);
1171 rB = SET_VEC_SH(0.12);
1172 rC = SET_VEC_SH(0.13);
1173 rD = SET_VEC_SH(0.14);
1174 rE = SET_VEC_SH(0.15);
1175 rF = SET_VEC_SH(0.16);
1176
1177 uint64 c = 0;
1178 while (c < iterations){
1179 size_t i = 0;
1180 while (i < 1000){
1181
1182 /* The performance critical part */
1183 FMA_VEC_SH(r0,r0,r7,r9);
1184 FMA_VEC_SH(r1,r1,r8,rA);
1185 FMA_VEC_SH(r2,r2,r9,rB);
1186 FMA_VEC_SH(r3,r3,rA,rC);
1187 FMA_VEC_SH(r4,r4,rB,rD);
1188 FMA_VEC_SH(r5,r5,rC,rE);
1189
1190 FMA_VEC_SH(r0,r0,rD,rF);
1191 FMA_VEC_SH(r1,r1,rC,rE);
1192 FMA_VEC_SH(r2,r2,rB,rD);
1193 FMA_VEC_SH(r3,r3,rA,rC);
1194 FMA_VEC_SH(r4,r4,r9,rB);
1195 FMA_VEC_SH(r5,r5,r8,rA);
1196
1197 FMA_VEC_SH(r0,r0,r7,r9);
1198 FMA_VEC_SH(r1,r1,r8,rA);
1199 FMA_VEC_SH(r2,r2,r9,rB);
1200 FMA_VEC_SH(r3,r3,rA,rC);
1201 FMA_VEC_SH(r4,r4,rB,rD);
1202 FMA_VEC_SH(r5,r5,rC,rE);
1203
1204 FMA_VEC_SH(r0,r0,rD,rF);
1205 FMA_VEC_SH(r1,r1,rC,rE);
1206 FMA_VEC_SH(r2,r2,rB,rD);
1207 FMA_VEC_SH(r3,r3,rA,rC);
1208 FMA_VEC_SH(r4,r4,r9,rB);
1209 FMA_VEC_SH(r5,r5,r8,rA);
1210
1211 i++;
1212 }
1213 c++;
1214 }
1215
1216 /* Use data so that compiler does not eliminate it when using -O2 */
1217 r0 = ADD_VEC_SH(r0,r1);
1218 r2 = ADD_VEC_SH(r2,r3);
1219 r4 = ADD_VEC_SH(r4,r5);
1220
1221 r0 = ADD_VEC_SH(r0,r6);
1222 r2 = ADD_VEC_SH(r2,r4);
1223
1224 r0 = ADD_VEC_SH(r0,r2);
1225
1226 half out = 0;
1227 half temp = r0;
1228 out = ADD_VEC_SH(out,temp);
1229
1230 return out;
1231}
1232
/*
 * test_hp_scalar_VEC_FMA_48: FMA kernel on `half` values with the inner
 * loop unrolled to 48 FMA_VEC_SH invocations (8 groups of 6).
 *
 * iterations: number of outer-loop passes. Each outer pass runs the inner
 *             loop 1000 times, so the function issues
 *             iterations * 1000 * 48 FMA_VEC_SH invocations in total.
 *
 * Returns the result of folding r0..r6 together with ADD_VEC_SH once the
 * loops have finished.
 *
 * NOTE(review): `half` and the *_VEC_SH macros are defined outside this
 * listing (presumably in vec_scalar_verify.h); their exact expansion was
 * not checked here -- confirm before relying on instruction-level details.
 */
1233half test_hp_scalar_VEC_FMA_48( uint64 iterations ){
1234 register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
1235
1236 /* Generate starting data */
 /* Sixteen distinct constants, 0.01 through 0.16, one per variable. */
1237 r0 = SET_VEC_SH(0.01);
1238 r1 = SET_VEC_SH(0.02);
1239 r2 = SET_VEC_SH(0.03);
1240 r3 = SET_VEC_SH(0.04);
1241 r4 = SET_VEC_SH(0.05);
1242 r5 = SET_VEC_SH(0.06);
1243 r6 = SET_VEC_SH(0.07);
1244 r7 = SET_VEC_SH(0.08);
1245 r8 = SET_VEC_SH(0.09);
1246 r9 = SET_VEC_SH(0.10);
1247 rA = SET_VEC_SH(0.11);
1248 rB = SET_VEC_SH(0.12);
1249 rC = SET_VEC_SH(0.13);
1250 rD = SET_VEC_SH(0.14);
1251 rE = SET_VEC_SH(0.15);
1252 rF = SET_VEC_SH(0.16);
1253
 /* Outer loop: `iterations` passes; inner loop: fixed 1000 passes each. */
1254 uint64 c = 0;
1255 while (c < iterations){
1256 size_t i = 0;
1257 while (i < 1000){
1258
1259 /* The performance critical part */
 /*
  * 8 groups of 6 invocations. Every group names each of r0..r5 once as
  * both the first and the second argument; the remaining two arguments
  * are always taken from r7..rF, which never appear in the first
  * argument position. Two operand patterns alternate from group to group.
  */
1260 FMA_VEC_SH(r0,r0,r7,r9);
1261 FMA_VEC_SH(r1,r1,r8,rA);
1262 FMA_VEC_SH(r2,r2,r9,rB);
1263 FMA_VEC_SH(r3,r3,rA,rC);
1264 FMA_VEC_SH(r4,r4,rB,rD);
1265 FMA_VEC_SH(r5,r5,rC,rE);
1266
1267 FMA_VEC_SH(r0,r0,rD,rF);
1268 FMA_VEC_SH(r1,r1,rC,rE);
1269 FMA_VEC_SH(r2,r2,rB,rD);
1270 FMA_VEC_SH(r3,r3,rA,rC);
1271 FMA_VEC_SH(r4,r4,r9,rB);
1272 FMA_VEC_SH(r5,r5,r8,rA);
1273
1274 FMA_VEC_SH(r0,r0,r7,r9);
1275 FMA_VEC_SH(r1,r1,r8,rA);
1276 FMA_VEC_SH(r2,r2,r9,rB);
1277 FMA_VEC_SH(r3,r3,rA,rC);
1278 FMA_VEC_SH(r4,r4,rB,rD);
1279 FMA_VEC_SH(r5,r5,rC,rE);
1280
1281 FMA_VEC_SH(r0,r0,rD,rF);
1282 FMA_VEC_SH(r1,r1,rC,rE);
1283 FMA_VEC_SH(r2,r2,rB,rD);
1284 FMA_VEC_SH(r3,r3,rA,rC);
1285 FMA_VEC_SH(r4,r4,r9,rB);
1286 FMA_VEC_SH(r5,r5,r8,rA);
1287
1288 FMA_VEC_SH(r0,r0,r7,r9);
1289 FMA_VEC_SH(r1,r1,r8,rA);
1290 FMA_VEC_SH(r2,r2,r9,rB);
1291 FMA_VEC_SH(r3,r3,rA,rC);
1292 FMA_VEC_SH(r4,r4,rB,rD);
1293 FMA_VEC_SH(r5,r5,rC,rE);
1294
1295 FMA_VEC_SH(r0,r0,rD,rF);
1296 FMA_VEC_SH(r1,r1,rC,rE);
1297 FMA_VEC_SH(r2,r2,rB,rD);
1298 FMA_VEC_SH(r3,r3,rA,rC);
1299 FMA_VEC_SH(r4,r4,r9,rB);
1300 FMA_VEC_SH(r5,r5,r8,rA);
1301
1302 FMA_VEC_SH(r0,r0,r7,r9);
1303 FMA_VEC_SH(r1,r1,r8,rA);
1304 FMA_VEC_SH(r2,r2,r9,rB);
1305 FMA_VEC_SH(r3,r3,rA,rC);
1306 FMA_VEC_SH(r4,r4,rB,rD);
1307 FMA_VEC_SH(r5,r5,rC,rE);
1308
1309 FMA_VEC_SH(r0,r0,rD,rF);
1310 FMA_VEC_SH(r1,r1,rC,rE);
1311 FMA_VEC_SH(r2,r2,rB,rD);
1312 FMA_VEC_SH(r3,r3,rA,rC);
1313 FMA_VEC_SH(r4,r4,r9,rB);
1314 FMA_VEC_SH(r5,r5,r8,rA);
1315
1316 i++;
1317 }
1318 c++;
1319 }
1320
1321 /* Use data so that compiler does not eliminate it when using -O2 */
 /*
  * Pairwise fold of r0..r6 into r0. r6 does not appear inside the loop;
  * it only enters the result here.
  */
1322 r0 = ADD_VEC_SH(r0,r1);
1323 r2 = ADD_VEC_SH(r2,r3);
1324 r4 = ADD_VEC_SH(r4,r5);
1325
1326 r0 = ADD_VEC_SH(r0,r6);
1327 r2 = ADD_VEC_SH(r2,r4);
1328
1329 r0 = ADD_VEC_SH(r0,r2);
1330
 /* Combine the folded value with a zero-initialised result and return it. */
1331 half out = 0;
1332 half temp = r0;
1333 out = ADD_VEC_SH(out,temp);
1334
1335 return out;
1336}
1337
1338#else
1340
/*
 * Stub body: discards `iterations` (the cast to void marks the parameter as
 * intentionally unused) and returns 0.0 without doing any work.
 * NOTE(review): the signature line is absent from this listing. By position
 * (first stub after the #else that follows the half-precision FMA kernels)
 * this is presumably the fallback test_hp_scalar_VEC_FMA_12 -- confirm.
 */
1341 (void)iterations;
1342 return 0.0;
1343}
1344
1346
/*
 * Stub body: discards `iterations` and returns 0.0 without doing any work.
 * NOTE(review): the signature line is absent from this listing. By position
 * (second stub in this #else branch) this is presumably the fallback
 * test_hp_scalar_VEC_FMA_24 -- confirm.
 */
1347 (void)iterations;
1348 return 0.0;
1349}
1350
1352
/*
 * Stub body: discards `iterations` and returns 0.0 without doing any work.
 * NOTE(review): the signature line is absent from this listing. By position
 * (third stub in this #else branch) this is presumably the fallback
 * test_hp_scalar_VEC_FMA_48 -- confirm.
 */
1353 (void)iterations;
1354 return 0.0;
1355}
1356#endif
1357
1358/************************************/
1359/* Loop unrolling: 12 instructions */
1360/************************************/
/*
 * FMA kernel on SP_SCALAR_TYPE values, inner loop unrolled to 12
 * FMA_VEC_SS invocations (2 groups of 6). With the fixed 1000 inner passes
 * this issues iterations * 1000 * 12 invocations in total, then folds
 * r0..r6 with ADD_VEC_SS and returns the first float of the folded value.
 * NOTE(review): the signature line is absent from this listing; by position
 * and the symbol index this is presumably
 * float test_sp_scalar_VEC_FMA_12(uint64 iterations) -- confirm.
 * NOTE(review): SP_SCALAR_TYPE and the *_VEC_SS macros are defined outside
 * this listing; their exact expansion was not checked here.
 */
1362 register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
1363
1364 /* Generate starting data */
 /* Sixteen distinct constants, 0.01 through 0.16, one per variable. */
1365 r0 = SET_VEC_SS(0.01);
1366 r1 = SET_VEC_SS(0.02);
1367 r2 = SET_VEC_SS(0.03);
1368 r3 = SET_VEC_SS(0.04);
1369 r4 = SET_VEC_SS(0.05);
1370 r5 = SET_VEC_SS(0.06);
1371 r6 = SET_VEC_SS(0.07);
1372 r7 = SET_VEC_SS(0.08);
1373 r8 = SET_VEC_SS(0.09);
1374 r9 = SET_VEC_SS(0.10);
1375 rA = SET_VEC_SS(0.11);
1376 rB = SET_VEC_SS(0.12);
1377 rC = SET_VEC_SS(0.13);
1378 rD = SET_VEC_SS(0.14);
1379 rE = SET_VEC_SS(0.15);
1380 rF = SET_VEC_SS(0.16);
1381
 /* Outer loop: `iterations` passes; inner loop: fixed 1000 passes each. */
1382 uint64 c = 0;
1383 while (c < iterations){
1384 size_t i = 0;
1385 while (i < 1000){
1386
1387 /* The performance critical part */
 /*
  * 2 groups of 6 invocations. Each group names r0..r5 once as both the
  * first and the second argument; the other two arguments come from
  * r7..rF, which never appear in the first argument position.
  */
1388 FMA_VEC_SS(r0,r0,r7,r9);
1389 FMA_VEC_SS(r1,r1,r8,rA);
1390 FMA_VEC_SS(r2,r2,r9,rB);
1391 FMA_VEC_SS(r3,r3,rA,rC);
1392 FMA_VEC_SS(r4,r4,rB,rD);
1393 FMA_VEC_SS(r5,r5,rC,rE);
1394
1395 FMA_VEC_SS(r0,r0,rD,rF);
1396 FMA_VEC_SS(r1,r1,rC,rE);
1397 FMA_VEC_SS(r2,r2,rB,rD);
1398 FMA_VEC_SS(r3,r3,rA,rC);
1399 FMA_VEC_SS(r4,r4,r9,rB);
1400 FMA_VEC_SS(r5,r5,r8,rA);
1401
1402 i++;
1403 }
1404 c++;
1405 }
1406
1407 /* Use data so that compiler does not eliminate it when using -O2 */
 /* Pairwise fold of r0..r6 into r0; r6 is not used inside the loop. */
1408 r0 = ADD_VEC_SS(r0,r1);
1409 r2 = ADD_VEC_SS(r2,r3);
1410 r4 = ADD_VEC_SS(r4,r5);
1411
1412 r0 = ADD_VEC_SS(r0,r6);
1413 r2 = ADD_VEC_SS(r2,r4);
1414
1415 r0 = ADD_VEC_SS(r0,r2);
1416
 /*
  * r0 is declared `register`, so its address cannot be taken; copy it to a
  * plain local and read the first float stored there.
  */
1417 float out = 0;
1418 SP_SCALAR_TYPE temp = r0;
1419 out += ((float*)&temp)[0];
1420
1421 return out;
1422}
1423
1424/************************************/
1425/* Loop unrolling: 24 instructions */
1426/************************************/
/*
 * FMA kernel on SP_SCALAR_TYPE values, inner loop unrolled to 24
 * FMA_VEC_SS invocations (4 groups of 6). With the fixed 1000 inner passes
 * this issues iterations * 1000 * 24 invocations in total, then folds
 * r0..r6 with ADD_VEC_SS and returns the first float of the folded value.
 * NOTE(review): the signature line is absent from this listing; by position
 * and the symbol index this is presumably
 * float test_sp_scalar_VEC_FMA_24(uint64 iterations) -- confirm.
 * NOTE(review): SP_SCALAR_TYPE and the *_VEC_SS macros are defined outside
 * this listing; their exact expansion was not checked here.
 */
1428 register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
1429
1430 /* Generate starting data */
 /* Sixteen distinct constants, 0.01 through 0.16, one per variable. */
1431 r0 = SET_VEC_SS(0.01);
1432 r1 = SET_VEC_SS(0.02);
1433 r2 = SET_VEC_SS(0.03);
1434 r3 = SET_VEC_SS(0.04);
1435 r4 = SET_VEC_SS(0.05);
1436 r5 = SET_VEC_SS(0.06);
1437 r6 = SET_VEC_SS(0.07);
1438 r7 = SET_VEC_SS(0.08);
1439 r8 = SET_VEC_SS(0.09);
1440 r9 = SET_VEC_SS(0.10);
1441 rA = SET_VEC_SS(0.11);
1442 rB = SET_VEC_SS(0.12);
1443 rC = SET_VEC_SS(0.13);
1444 rD = SET_VEC_SS(0.14);
1445 rE = SET_VEC_SS(0.15);
1446 rF = SET_VEC_SS(0.16);
1447
 /* Outer loop: `iterations` passes; inner loop: fixed 1000 passes each. */
1448 uint64 c = 0;
1449 while (c < iterations){
1450 size_t i = 0;
1451 while (i < 1000){
1452
1453 /* The performance critical part */
 /*
  * 4 groups of 6 invocations. Each group names r0..r5 once as both the
  * first and the second argument; the other two arguments come from
  * r7..rF, which never appear in the first argument position. Two
  * operand patterns alternate from group to group.
  */
1454 FMA_VEC_SS(r0,r0,r7,r9);
1455 FMA_VEC_SS(r1,r1,r8,rA);
1456 FMA_VEC_SS(r2,r2,r9,rB);
1457 FMA_VEC_SS(r3,r3,rA,rC);
1458 FMA_VEC_SS(r4,r4,rB,rD);
1459 FMA_VEC_SS(r5,r5,rC,rE);
1460
1461 FMA_VEC_SS(r0,r0,rD,rF);
1462 FMA_VEC_SS(r1,r1,rC,rE);
1463 FMA_VEC_SS(r2,r2,rB,rD);
1464 FMA_VEC_SS(r3,r3,rA,rC);
1465 FMA_VEC_SS(r4,r4,r9,rB);
1466 FMA_VEC_SS(r5,r5,r8,rA);
1467
1468 FMA_VEC_SS(r0,r0,r7,r9);
1469 FMA_VEC_SS(r1,r1,r8,rA);
1470 FMA_VEC_SS(r2,r2,r9,rB);
1471 FMA_VEC_SS(r3,r3,rA,rC);
1472 FMA_VEC_SS(r4,r4,rB,rD);
1473 FMA_VEC_SS(r5,r5,rC,rE);
1474
1475 FMA_VEC_SS(r0,r0,rD,rF);
1476 FMA_VEC_SS(r1,r1,rC,rE);
1477 FMA_VEC_SS(r2,r2,rB,rD);
1478 FMA_VEC_SS(r3,r3,rA,rC);
1479 FMA_VEC_SS(r4,r4,r9,rB);
1480 FMA_VEC_SS(r5,r5,r8,rA);
1481
1482 i++;
1483 }
1484 c++;
1485 }
1486
1487 /* Use data so that compiler does not eliminate it when using -O2 */
 /* Pairwise fold of r0..r6 into r0; r6 is not used inside the loop. */
1488 r0 = ADD_VEC_SS(r0,r1);
1489 r2 = ADD_VEC_SS(r2,r3);
1490 r4 = ADD_VEC_SS(r4,r5);
1491
1492 r0 = ADD_VEC_SS(r0,r6);
1493 r2 = ADD_VEC_SS(r2,r4);
1494
1495 r0 = ADD_VEC_SS(r0,r2);
1496
 /*
  * r0 is declared `register`, so its address cannot be taken; copy it to a
  * plain local and read the first float stored there.
  */
1497 float out = 0;
1498 SP_SCALAR_TYPE temp = r0;
1499 out += ((float*)&temp)[0];
1500
1501 return out;
1502}
1503
1504/************************************/
1505/* Loop unrolling: 48 instructions */
1506/************************************/
/*
 * FMA kernel on SP_SCALAR_TYPE values, inner loop unrolled to 48
 * FMA_VEC_SS invocations (8 groups of 6). With the fixed 1000 inner passes
 * this issues iterations * 1000 * 48 invocations in total, then folds
 * r0..r6 with ADD_VEC_SS and returns the first float of the folded value.
 * NOTE(review): the signature line is absent from this listing; by position
 * and the symbol index this is presumably
 * float test_sp_scalar_VEC_FMA_48(uint64 iterations) -- confirm.
 * NOTE(review): SP_SCALAR_TYPE and the *_VEC_SS macros are defined outside
 * this listing; their exact expansion was not checked here.
 */
1508 register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
1509
1510 /* Generate starting data */
 /* Sixteen distinct constants, 0.01 through 0.16, one per variable. */
1511 r0 = SET_VEC_SS(0.01);
1512 r1 = SET_VEC_SS(0.02);
1513 r2 = SET_VEC_SS(0.03);
1514 r3 = SET_VEC_SS(0.04);
1515 r4 = SET_VEC_SS(0.05);
1516 r5 = SET_VEC_SS(0.06);
1517 r6 = SET_VEC_SS(0.07);
1518 r7 = SET_VEC_SS(0.08);
1519 r8 = SET_VEC_SS(0.09);
1520 r9 = SET_VEC_SS(0.10);
1521 rA = SET_VEC_SS(0.11);
1522 rB = SET_VEC_SS(0.12);
1523 rC = SET_VEC_SS(0.13);
1524 rD = SET_VEC_SS(0.14);
1525 rE = SET_VEC_SS(0.15);
1526 rF = SET_VEC_SS(0.16);
1527
 /* Outer loop: `iterations` passes; inner loop: fixed 1000 passes each. */
1528 uint64 c = 0;
1529 while (c < iterations){
1530 size_t i = 0;
1531 while (i < 1000){
1532
1533 /* The performance critical part */
 /*
  * 8 groups of 6 invocations. Each group names r0..r5 once as both the
  * first and the second argument; the other two arguments come from
  * r7..rF, which never appear in the first argument position. Two
  * operand patterns alternate from group to group.
  */
1534 FMA_VEC_SS(r0,r0,r7,r9);
1535 FMA_VEC_SS(r1,r1,r8,rA);
1536 FMA_VEC_SS(r2,r2,r9,rB);
1537 FMA_VEC_SS(r3,r3,rA,rC);
1538 FMA_VEC_SS(r4,r4,rB,rD);
1539 FMA_VEC_SS(r5,r5,rC,rE);
1540
1541 FMA_VEC_SS(r0,r0,rD,rF);
1542 FMA_VEC_SS(r1,r1,rC,rE);
1543 FMA_VEC_SS(r2,r2,rB,rD);
1544 FMA_VEC_SS(r3,r3,rA,rC);
1545 FMA_VEC_SS(r4,r4,r9,rB);
1546 FMA_VEC_SS(r5,r5,r8,rA);
1547
1548 FMA_VEC_SS(r0,r0,r7,r9);
1549 FMA_VEC_SS(r1,r1,r8,rA);
1550 FMA_VEC_SS(r2,r2,r9,rB);
1551 FMA_VEC_SS(r3,r3,rA,rC);
1552 FMA_VEC_SS(r4,r4,rB,rD);
1553 FMA_VEC_SS(r5,r5,rC,rE);
1554
1555 FMA_VEC_SS(r0,r0,rD,rF);
1556 FMA_VEC_SS(r1,r1,rC,rE);
1557 FMA_VEC_SS(r2,r2,rB,rD);
1558 FMA_VEC_SS(r3,r3,rA,rC);
1559 FMA_VEC_SS(r4,r4,r9,rB);
1560 FMA_VEC_SS(r5,r5,r8,rA);
1561
1562 FMA_VEC_SS(r0,r0,r7,r9);
1563 FMA_VEC_SS(r1,r1,r8,rA);
1564 FMA_VEC_SS(r2,r2,r9,rB);
1565 FMA_VEC_SS(r3,r3,rA,rC);
1566 FMA_VEC_SS(r4,r4,rB,rD);
1567 FMA_VEC_SS(r5,r5,rC,rE);
1568
1569 FMA_VEC_SS(r0,r0,rD,rF);
1570 FMA_VEC_SS(r1,r1,rC,rE);
1571 FMA_VEC_SS(r2,r2,rB,rD);
1572 FMA_VEC_SS(r3,r3,rA,rC);
1573 FMA_VEC_SS(r4,r4,r9,rB);
1574 FMA_VEC_SS(r5,r5,r8,rA);
1575
1576 FMA_VEC_SS(r0,r0,r7,r9);
1577 FMA_VEC_SS(r1,r1,r8,rA);
1578 FMA_VEC_SS(r2,r2,r9,rB);
1579 FMA_VEC_SS(r3,r3,rA,rC);
1580 FMA_VEC_SS(r4,r4,rB,rD);
1581 FMA_VEC_SS(r5,r5,rC,rE);
1582
1583 FMA_VEC_SS(r0,r0,rD,rF);
1584 FMA_VEC_SS(r1,r1,rC,rE);
1585 FMA_VEC_SS(r2,r2,rB,rD);
1586 FMA_VEC_SS(r3,r3,rA,rC);
1587 FMA_VEC_SS(r4,r4,r9,rB);
1588 FMA_VEC_SS(r5,r5,r8,rA);
1589
1590 i++;
1591 }
1592 c++;
1593 }
1594
1595 /* Use data so that compiler does not eliminate it when using -O2 */
 /* Pairwise fold of r0..r6 into r0; r6 is not used inside the loop. */
1596 r0 = ADD_VEC_SS(r0,r1);
1597 r2 = ADD_VEC_SS(r2,r3);
1598 r4 = ADD_VEC_SS(r4,r5);
1599
1600 r0 = ADD_VEC_SS(r0,r6);
1601 r2 = ADD_VEC_SS(r2,r4);
1602
1603 r0 = ADD_VEC_SS(r0,r2);
1604
 /*
  * r0 is declared `register`, so its address cannot be taken; copy it to a
  * plain local and read the first float stored there.
  */
1605 float out = 0;
1606 SP_SCALAR_TYPE temp = r0;
1607 out += ((float*)&temp)[0];
1608
1609 return out;
1610}
1611
1612/************************************/
1613/* Loop unrolling: 12 instructions */
1614/************************************/
/*
 * FMA kernel on DP_SCALAR_TYPE values, inner loop unrolled to 12
 * FMA_VEC_SD invocations (2 groups of 6). With the fixed 1000 inner passes
 * this issues iterations * 1000 * 12 invocations in total, then folds
 * r0..r6 with ADD_VEC_SD and returns the first double of the folded value.
 * NOTE(review): the signature line is absent from this listing; by position
 * and the symbol index this is presumably
 * double test_dp_scalar_VEC_FMA_12(uint64 iterations) -- confirm.
 * NOTE(review): DP_SCALAR_TYPE and the *_VEC_SD macros are defined outside
 * this listing; their exact expansion was not checked here.
 */
1616 register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
1617
1618 /* Generate starting data */
 /* Sixteen distinct constants, 0.01 through 0.16, one per variable. */
1619 r0 = SET_VEC_SD(0.01);
1620 r1 = SET_VEC_SD(0.02);
1621 r2 = SET_VEC_SD(0.03);
1622 r3 = SET_VEC_SD(0.04);
1623 r4 = SET_VEC_SD(0.05);
1624 r5 = SET_VEC_SD(0.06);
1625 r6 = SET_VEC_SD(0.07);
1626 r7 = SET_VEC_SD(0.08);
1627 r8 = SET_VEC_SD(0.09);
1628 r9 = SET_VEC_SD(0.10);
1629 rA = SET_VEC_SD(0.11);
1630 rB = SET_VEC_SD(0.12);
1631 rC = SET_VEC_SD(0.13);
1632 rD = SET_VEC_SD(0.14);
1633 rE = SET_VEC_SD(0.15);
1634 rF = SET_VEC_SD(0.16);
1635
 /* Outer loop: `iterations` passes; inner loop: fixed 1000 passes each. */
1636 uint64 c = 0;
1637 while (c < iterations){
1638 size_t i = 0;
1639 while (i < 1000){
1640
1641 /* The performance critical part */
 /*
  * 2 groups of 6 invocations. Each group names r0..r5 once as both the
  * first and the second argument; the other two arguments come from
  * r7..rF, which never appear in the first argument position.
  */
1642 FMA_VEC_SD(r0,r0,r7,r9);
1643 FMA_VEC_SD(r1,r1,r8,rA);
1644 FMA_VEC_SD(r2,r2,r9,rB);
1645 FMA_VEC_SD(r3,r3,rA,rC);
1646 FMA_VEC_SD(r4,r4,rB,rD);
1647 FMA_VEC_SD(r5,r5,rC,rE);
1648
1649 FMA_VEC_SD(r0,r0,rD,rF);
1650 FMA_VEC_SD(r1,r1,rC,rE);
1651 FMA_VEC_SD(r2,r2,rB,rD);
1652 FMA_VEC_SD(r3,r3,rA,rC);
1653 FMA_VEC_SD(r4,r4,r9,rB);
1654 FMA_VEC_SD(r5,r5,r8,rA);
1655
1656 i++;
1657 }
1658 c++;
1659 }
1660
1661 /* Use data so that compiler does not eliminate it when using -O2 */
 /* Pairwise fold of r0..r6 into r0; r6 is not used inside the loop. */
1662 r0 = ADD_VEC_SD(r0,r1);
1663 r2 = ADD_VEC_SD(r2,r3);
1664 r4 = ADD_VEC_SD(r4,r5);
1665
1666 r0 = ADD_VEC_SD(r0,r6);
1667 r2 = ADD_VEC_SD(r2,r4);
1668
1669 r0 = ADD_VEC_SD(r0,r2);
1670
 /*
  * r0 is declared `register`, so its address cannot be taken; copy it to a
  * plain local and read the first double stored there.
  */
1671 double out = 0;
1672 DP_SCALAR_TYPE temp = r0;
1673 out += ((double*)&temp)[0];
1674
1675 return out;
1676}
1677
1678/************************************/
1679/* Loop unrolling: 24 instructions */
1680/************************************/
/*
 * FMA kernel on DP_SCALAR_TYPE values, inner loop unrolled to 24
 * FMA_VEC_SD invocations (4 groups of 6). With the fixed 1000 inner passes
 * this issues iterations * 1000 * 24 invocations in total, then folds
 * r0..r6 with ADD_VEC_SD and returns the first double of the folded value.
 * NOTE(review): the signature line is absent from this listing; by position
 * and the symbol index this is presumably
 * double test_dp_scalar_VEC_FMA_24(uint64 iterations) -- confirm.
 * NOTE(review): DP_SCALAR_TYPE and the *_VEC_SD macros are defined outside
 * this listing; their exact expansion was not checked here.
 */
1682 register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
1683
1684 /* Generate starting data */
 /* Sixteen distinct constants, 0.01 through 0.16, one per variable. */
1685 r0 = SET_VEC_SD(0.01);
1686 r1 = SET_VEC_SD(0.02);
1687 r2 = SET_VEC_SD(0.03);
1688 r3 = SET_VEC_SD(0.04);
1689 r4 = SET_VEC_SD(0.05);
1690 r5 = SET_VEC_SD(0.06);
1691 r6 = SET_VEC_SD(0.07);
1692 r7 = SET_VEC_SD(0.08);
1693 r8 = SET_VEC_SD(0.09);
1694 r9 = SET_VEC_SD(0.10);
1695 rA = SET_VEC_SD(0.11);
1696 rB = SET_VEC_SD(0.12);
1697 rC = SET_VEC_SD(0.13);
1698 rD = SET_VEC_SD(0.14);
1699 rE = SET_VEC_SD(0.15);
1700 rF = SET_VEC_SD(0.16);
1701
 /* Outer loop: `iterations` passes; inner loop: fixed 1000 passes each. */
1702 uint64 c = 0;
1703 while (c < iterations){
1704 size_t i = 0;
1705 while (i < 1000){
1706
1707 /* The performance critical part */
 /*
  * 4 groups of 6 invocations. Each group names r0..r5 once as both the
  * first and the second argument; the other two arguments come from
  * r7..rF, which never appear in the first argument position. Two
  * operand patterns alternate from group to group.
  */
1708 FMA_VEC_SD(r0,r0,r7,r9);
1709 FMA_VEC_SD(r1,r1,r8,rA);
1710 FMA_VEC_SD(r2,r2,r9,rB);
1711 FMA_VEC_SD(r3,r3,rA,rC);
1712 FMA_VEC_SD(r4,r4,rB,rD);
1713 FMA_VEC_SD(r5,r5,rC,rE);
1714
1715 FMA_VEC_SD(r0,r0,rD,rF);
1716 FMA_VEC_SD(r1,r1,rC,rE);
1717 FMA_VEC_SD(r2,r2,rB,rD);
1718 FMA_VEC_SD(r3,r3,rA,rC);
1719 FMA_VEC_SD(r4,r4,r9,rB);
1720 FMA_VEC_SD(r5,r5,r8,rA);
1721
1722 FMA_VEC_SD(r0,r0,r7,r9);
1723 FMA_VEC_SD(r1,r1,r8,rA);
1724 FMA_VEC_SD(r2,r2,r9,rB);
1725 FMA_VEC_SD(r3,r3,rA,rC);
1726 FMA_VEC_SD(r4,r4,rB,rD);
1727 FMA_VEC_SD(r5,r5,rC,rE);
1728
1729 FMA_VEC_SD(r0,r0,rD,rF);
1730 FMA_VEC_SD(r1,r1,rC,rE);
1731 FMA_VEC_SD(r2,r2,rB,rD);
1732 FMA_VEC_SD(r3,r3,rA,rC);
1733 FMA_VEC_SD(r4,r4,r9,rB);
1734 FMA_VEC_SD(r5,r5,r8,rA);
1735
1736 i++;
1737 }
1738 c++;
1739 }
1740
1741 /* Use data so that compiler does not eliminate it when using -O2 */
 /* Pairwise fold of r0..r6 into r0; r6 is not used inside the loop. */
1742 r0 = ADD_VEC_SD(r0,r1);
1743 r2 = ADD_VEC_SD(r2,r3);
1744 r4 = ADD_VEC_SD(r4,r5);
1745
1746 r0 = ADD_VEC_SD(r0,r6);
1747 r2 = ADD_VEC_SD(r2,r4);
1748
1749 r0 = ADD_VEC_SD(r0,r2);
1750
 /*
  * r0 is declared `register`, so its address cannot be taken; copy it to a
  * plain local and read the first double stored there.
  */
1751 double out = 0;
1752 DP_SCALAR_TYPE temp = r0;
1753 out += ((double*)&temp)[0];
1754
1755 return out;
1756}
1757
1758/************************************/
1759/* Loop unrolling: 48 instructions */
1760/************************************/
/*
 * FMA kernel on DP_SCALAR_TYPE values, inner loop unrolled to 48
 * FMA_VEC_SD invocations (8 groups of 6). With the fixed 1000 inner passes
 * this issues iterations * 1000 * 48 invocations in total, then folds
 * r0..r6 with ADD_VEC_SD and returns the first double of the folded value.
 * NOTE(review): the signature line is absent from this listing; by position
 * and the symbol index this is presumably
 * double test_dp_scalar_VEC_FMA_48(uint64 iterations) -- confirm.
 * NOTE(review): DP_SCALAR_TYPE and the *_VEC_SD macros are defined outside
 * this listing; their exact expansion was not checked here.
 */
1762 register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
1763
1764 /* Generate starting data */
 /* Sixteen distinct constants, 0.01 through 0.16, one per variable. */
1765 r0 = SET_VEC_SD(0.01);
1766 r1 = SET_VEC_SD(0.02);
1767 r2 = SET_VEC_SD(0.03);
1768 r3 = SET_VEC_SD(0.04);
1769 r4 = SET_VEC_SD(0.05);
1770 r5 = SET_VEC_SD(0.06);
1771 r6 = SET_VEC_SD(0.07);
1772 r7 = SET_VEC_SD(0.08);
1773 r8 = SET_VEC_SD(0.09);
1774 r9 = SET_VEC_SD(0.10);
1775 rA = SET_VEC_SD(0.11);
1776 rB = SET_VEC_SD(0.12);
1777 rC = SET_VEC_SD(0.13);
1778 rD = SET_VEC_SD(0.14);
1779 rE = SET_VEC_SD(0.15);
1780 rF = SET_VEC_SD(0.16);
1781
 /* Outer loop: `iterations` passes; inner loop: fixed 1000 passes each. */
1782 uint64 c = 0;
1783 while (c < iterations){
1784 size_t i = 0;
1785 while (i < 1000){
1786
1787 /* The performance critical part */
 /*
  * 8 groups of 6 invocations. Each group names r0..r5 once as both the
  * first and the second argument; the other two arguments come from
  * r7..rF, which never appear in the first argument position. Two
  * operand patterns alternate from group to group.
  */
1788 FMA_VEC_SD(r0,r0,r7,r9);
1789 FMA_VEC_SD(r1,r1,r8,rA);
1790 FMA_VEC_SD(r2,r2,r9,rB);
1791 FMA_VEC_SD(r3,r3,rA,rC);
1792 FMA_VEC_SD(r4,r4,rB,rD);
1793 FMA_VEC_SD(r5,r5,rC,rE);
1794
1795 FMA_VEC_SD(r0,r0,rD,rF);
1796 FMA_VEC_SD(r1,r1,rC,rE);
1797 FMA_VEC_SD(r2,r2,rB,rD);
1798 FMA_VEC_SD(r3,r3,rA,rC);
1799 FMA_VEC_SD(r4,r4,r9,rB);
1800 FMA_VEC_SD(r5,r5,r8,rA);
1801
1802 FMA_VEC_SD(r0,r0,r7,r9);
1803 FMA_VEC_SD(r1,r1,r8,rA);
1804 FMA_VEC_SD(r2,r2,r9,rB);
1805 FMA_VEC_SD(r3,r3,rA,rC);
1806 FMA_VEC_SD(r4,r4,rB,rD);
1807 FMA_VEC_SD(r5,r5,rC,rE);
1808
1809 FMA_VEC_SD(r0,r0,rD,rF);
1810 FMA_VEC_SD(r1,r1,rC,rE);
1811 FMA_VEC_SD(r2,r2,rB,rD);
1812 FMA_VEC_SD(r3,r3,rA,rC);
1813 FMA_VEC_SD(r4,r4,r9,rB);
1814 FMA_VEC_SD(r5,r5,r8,rA);
1815
1816 FMA_VEC_SD(r0,r0,r7,r9);
1817 FMA_VEC_SD(r1,r1,r8,rA);
1818 FMA_VEC_SD(r2,r2,r9,rB);
1819 FMA_VEC_SD(r3,r3,rA,rC);
1820 FMA_VEC_SD(r4,r4,rB,rD);
1821 FMA_VEC_SD(r5,r5,rC,rE);
1822
1823 FMA_VEC_SD(r0,r0,rD,rF);
1824 FMA_VEC_SD(r1,r1,rC,rE);
1825 FMA_VEC_SD(r2,r2,rB,rD);
1826 FMA_VEC_SD(r3,r3,rA,rC);
1827 FMA_VEC_SD(r4,r4,r9,rB);
1828 FMA_VEC_SD(r5,r5,r8,rA);
1829
1830 FMA_VEC_SD(r0,r0,r7,r9);
1831 FMA_VEC_SD(r1,r1,r8,rA);
1832 FMA_VEC_SD(r2,r2,r9,rB);
1833 FMA_VEC_SD(r3,r3,rA,rC);
1834 FMA_VEC_SD(r4,r4,rB,rD);
1835 FMA_VEC_SD(r5,r5,rC,rE);
1836
1837 FMA_VEC_SD(r0,r0,rD,rF);
1838 FMA_VEC_SD(r1,r1,rC,rE);
1839 FMA_VEC_SD(r2,r2,rB,rD);
1840 FMA_VEC_SD(r3,r3,rA,rC);
1841 FMA_VEC_SD(r4,r4,r9,rB);
1842 FMA_VEC_SD(r5,r5,r8,rA);
1843
1844 i++;
1845 }
1846 c++;
1847 }
1848
1849 /* Use data so that compiler does not eliminate it when using -O2 */
 /* Pairwise fold of r0..r6 into r0; r6 is not used inside the loop. */
1850 r0 = ADD_VEC_SD(r0,r1);
1851 r2 = ADD_VEC_SD(r2,r3);
1852 r4 = ADD_VEC_SD(r4,r5);
1853
1854 r0 = ADD_VEC_SD(r0,r6);
1855 r2 = ADD_VEC_SD(r2,r4);
1856
1857 r0 = ADD_VEC_SD(r0,r2);
1858
 /*
  * r0 is declared `register`, so its address cannot be taken; copy it to a
  * plain local and read the first double stored there.
  */
1859 double out = 0;
1860 DP_SCALAR_TYPE temp = r0;
1861 out += ((double*)&temp)[0];
1862
1863 return out;
1864}
int i
unsigned long long uint64
Definition: cat_arch.h:3
Stop counting hardware events in an event set.
#define PAPI_OK
Definition: f90papi.h:73
static int EventSet
Definition: init_fini.c:8
static double c[MATRIX_SIZE][MATRIX_SIZE]
Definition: libmsr_basic.c:40
FILE * stderr
static FILE * fp
float test_hp_scalar_VEC_FMA_48(uint64 iterations)
float test_hp_scalar_VEC_24(uint64 iterations)
double test_dp_scalar_VEC_FMA_24(uint64 iterations)
double test_dp_scalar_VEC_FMA_12(uint64 iterations)
float test_sp_scalar_VEC_96(uint64 iterations)
float test_hp_scalar_VEC_FMA_24(uint64 iterations)
double test_dp_scalar_VEC_FMA_48(uint64 iterations)
void papi_stop_and_print_placeholder(long long theory, FILE *fp)
float test_sp_scalar_VEC_24(uint64 iterations)
double test_dp_scalar_VEC_48(uint64 iterations)
float test_hp_scalar_VEC_FMA_12(uint64 iterations)
float test_sp_scalar_VEC_FMA_12(uint64 iterations)
void papi_stop_and_print(long long theory, int EventSet, FILE *fp)
float test_sp_scalar_VEC_FMA_48(uint64 iterations)
double test_dp_scalar_VEC_96(uint64 iterations)
float test_hp_scalar_VEC_48(uint64 iterations)
double test_dp_scalar_VEC_24(uint64 iterations)
float test_sp_scalar_VEC_48(uint64 iterations)
float test_hp_scalar_VEC_96(uint64 iterations)
float test_sp_scalar_VEC_FMA_24(uint64 iterations)
int retval
Definition: zero_fork.c:53