00001
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #include "kflops.h"
00033
00034 #ifdef NORUSAGE
00035 #include <sys/times.h>
00036 #include <unistd.h>
00037 static REAL ClockTick = 0.0;
00038 struct tms ru;
00039 #else
00040 struct rusage ru;
00041 #endif
00042
00043 #include <stdio.h>
00044 #include <math.h>
00045 #include <sys/time.h>
00046 #include <sys/resource.h>
00047
00048 static REAL my_time[9][9];
00049 static int idamax();
00050 static REAL ddot();
00051
00052
00053
00054 #ifdef __STDC__
00055 static void
00056 daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy)
00057 #else
00058 static void
00059 daxpy(n, da, dx, incx, dy, incy)
00060
00061
00062 REAL dx[], dy[], da;
00063 int incx, incy, n;
00064 #endif
00065 {
00066 int i, ix, iy;
00067
00068 if(n <= 0)
00069 return;
00070 if(da == ZERO)
00071 return;
00072
00073 if(incx != 1 || incy != 1) {
00074
00075
00076
00077 ix = 0;
00078 iy = 0;
00079 if(incx < 0)
00080 ix = (-n + 1) * incx;
00081 if(incy < 0)
00082 iy = (-n + 1) * incy;
00083 for(i = 0; i < n; i++) {
00084 dy[iy] = dy[iy] + da * dx[ix];
00085 ix = ix + incx;
00086 iy = iy + incy;
00087 }
00088 return;
00089 }
00090
00091
00092
00093 #ifdef ROLL
00094 for(i = 0; i < n; i++) {
00095 dy[i] = dy[i] + da * dx[i];
00096 }
00097 #endif
00098 #ifdef UNROLL
00099
00100 m = n % 4;
00101 if(m != 0) {
00102 for(i = 0; i < m; i++)
00103 dy[i] = dy[i] + da * dx[i];
00104 if(n < 4)
00105 return;
00106 }
00107 for(i = m; i < n; i = i + 4) {
00108 dy[i] = dy[i] + da * dx[i];
00109 dy[i + 1] = dy[i + 1] + da * dx[i + 1];
00110 dy[i + 2] = dy[i + 2] + da * dx[i + 2];
00111 dy[i + 3] = dy[i + 3] + da * dx[i + 3];
00112 }
00113 #endif
00114 }
00115
00116
00117 #ifdef __STDC__
00118 static void
00119 matgen(REAL a[], int lda, int n, REAL b[], REAL * norma)
00120 #else
00121 static void
00122 matgen(a, lda, n, b, norma)
00123 REAL a[], b[], *norma;
00124 int lda, n;
00125 #endif
00126
00127
00128
00129 {
00130 int init, i, j;
00131
00132 init = 1325;
00133 *norma = 0.0;
00134 for(j = 0; j < n; j++) {
00135 for(i = 0; i < n; i++) {
00136 init = 3125 * init % 65536;
00137 a[lda * j + i] = (init - 32768.0) / 16384.0;
00138 *norma = (a[lda * j + i] > *norma) ? a[lda * j + i] : *norma;
00139 }
00140 }
00141 for(i = 0; i < n; i++) {
00142 b[i] = 0.0;
00143 }
00144 for(j = 0; j < n; j++) {
00145 for(i = 0; i < n; i++) {
00146 b[i] = b[i] + a[lda * j + i];
00147 }
00148 }
00149 }
00150
00151
00152
00153
00154 #ifdef __STDC__
00155 static void
00156 dgesl(REAL a[], int lda, int n, int ipvt[], REAL b[], int job)
00157 #else
00158 static void
00159 dgesl(a, lda, n, ipvt, b, job)
00160 int lda, n, ipvt[], job;
00161 REAL a[], b[];
00162 #endif
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207 {
00208
00209
00210 REAL t;
00211 int k, kb, l, nm1;
00212
00213 nm1 = n - 1;
00214 if(job == 0) {
00215
00216
00217
00218 if(nm1 >= 1) {
00219 for(k = 0; k < nm1; k++) {
00220 l = ipvt[k];
00221 t = b[l];
00222 if(l != k) {
00223 b[l] = b[k];
00224 b[k] = t;
00225 }
00226 daxpy(n - (k + 1), t, &a[lda * k + k + 1], 1, &b[k + 1], 1);
00227 }
00228 }
00229
00230
00231
00232 for(kb = 0; kb < n; kb++) {
00233 k = n - (kb + 1);
00234 b[k] = b[k] / a[lda * k + k];
00235 t = -b[k];
00236 daxpy(k, t, &a[lda * k + 0], 1, &b[0], 1);
00237 }
00238 }
00239 else {
00240
00241
00242
00243 for(k = 0; k < n; k++) {
00244 t = ddot(k, &a[lda * k + 0], 1, &b[0], 1);
00245 b[k] = (b[k] - t) / a[lda * k + k];
00246 }
00247
00248
00249
00250 if(nm1 >= 1) {
00251 for(kb = 1; kb < nm1; kb++) {
00252 k = n - (kb + 1);
00253 b[k] = b[k] + ddot(n - (k + 1), &a[lda * k + k + 1], 1, &b[k + 1], 1);
00254 l = ipvt[k];
00255 if(l != k) {
00256 t = b[l];
00257 b[l] = b[k];
00258 b[k] = t;
00259 }
00260 }
00261 }
00262 }
00263 }
00264
00265
00266
00267
00268 #ifdef __STDC__
00269 static REAL
00270 ddot(int n, REAL dx[], int incx, REAL dy[], int incy)
00271 #else
00272 static REAL
00273 ddot(n, dx, incx, dy, incy)
00274
00275
00276 REAL dx[], dy[];
00277 int incx, incy, n;
00278 #endif
00279 {
00280 REAL dtemp;
00281 int i, ix, iy;
00282
00283 dtemp = ZERO;
00284
00285 if(n <= 0)
00286 return (ZERO);
00287
00288 if(incx != 1 || incy != 1) {
00289
00290
00291
00292 ix = 0;
00293 iy = 0;
00294 if(incx < 0)
00295 ix = (-n + 1) * incx;
00296 if(incy < 0)
00297 iy = (-n + 1) * incy;
00298 for(i = 0; i < n; i++) {
00299 dtemp = dtemp + dx[ix] * dy[iy];
00300 ix = ix + incx;
00301 iy = iy + incy;
00302 }
00303 return (dtemp);
00304 }
00305
00306
00307
00308 #ifdef ROLL
00309 for(i = 0; i < n; i++)
00310 dtemp = dtemp + dx[i] * dy[i];
00311 return (dtemp);
00312 #endif
00313 #ifdef UNROLL
00314
00315 m = n % 5;
00316 if(m != 0) {
00317 for(i = 0; i < m; i++)
00318 dtemp = dtemp + dx[i] * dy[i];
00319 if(n < 5)
00320 return (dtemp);
00321 }
00322 for(i = m; i < n; i = i + 5) {
00323 dtemp = dtemp + dx[i] * dy[i] +
00324 dx[i + 1] * dy[i + 1] + dx[i + 2] * dy[i + 2] +
00325 dx[i + 3] * dy[i + 3] + dx[i + 4] * dy[i + 4];
00326 }
00327 return (dtemp);
00328 #endif
00329 }
00330
00331
00332 #ifdef __STDC__
00333 static void
00334 dscal(int n, REAL da, REAL dx[], int incx)
00335 #else
00336 static void
00337 dscal(n, da, dx, incx)
00338
00339
00340 REAL da, dx[];
00341 int n, incx;
00342 #endif
00343 {
00344 int i, nincx;
00345
00346 if(n <= 0)
00347 return;
00348 if(incx != 1) {
00349
00350
00351
00352 nincx = n * incx;
00353 for(i = 0; i < nincx; i = i + incx)
00354 dx[i] = da * dx[i];
00355 return;
00356 }
00357
00358
00359
00360 #ifdef ROLL
00361 for(i = 0; i < n; i++)
00362 dx[i] = da * dx[i];
00363 #endif
00364 #ifdef UNROLL
00365
00366 m = n % 5;
00367 if(m != 0) {
00368 for(i = 0; i < m; i++)
00369 dx[i] = da * dx[i];
00370 if(n < 5)
00371 return;
00372 }
00373 for(i = m; i < n; i = i + 5) {
00374 dx[i] = da * dx[i];
00375 dx[i + 1] = da * dx[i + 1];
00376 dx[i + 2] = da * dx[i + 2];
00377 dx[i + 3] = da * dx[i + 3];
00378 dx[i + 4] = da * dx[i + 4];
00379 }
00380 #endif
00381
00382 }
00383
00384
00385 #ifdef __STDC__
00386 static void
00387 dgefa(REAL a[], int lda, int n, int ipvt[], int *info)
00388 #else
00389 static void
00390 dgefa(a, lda, n, ipvt, info)
00391 REAL a[];
00392 int lda, n, ipvt[], *info;
00393 #endif
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431 {
00432
00433
00434 REAL t;
00435 int j, k, kp1, l, nm1;
00436
00437
00438
00439 *info = 0;
00440 nm1 = n - 1;
00441 if(nm1 >= 0) {
00442 for(k = 0; k < nm1; k++) {
00443 kp1 = k + 1;
00444
00445
00446
00447 l = idamax(n - k, &a[lda * k + k], 1) + k;
00448 ipvt[k] = l;
00449
00450
00451
00452 if(a[lda * k + l] != ZERO) {
00453
00454
00455
00456 if(l != k) {
00457 t = a[lda * k + l];
00458 a[lda * k + l] = a[lda * k + k];
00459 a[lda * k + k] = t;
00460 }
00461
00462
00463
00464 t = -ONE / a[lda * k + k];
00465 dscal(n - (k + 1), t, &a[lda * k + k + 1], 1);
00466
00467
00468
00469 for(j = kp1; j < n; j++) {
00470 t = a[lda * j + l];
00471 if(l != k) {
00472 a[lda * j + l] = a[lda * j + k];
00473 a[lda * j + k] = t;
00474 }
00475 daxpy(n - (k + 1), t, &a[lda * k + k + 1], 1,
00476 &a[lda * j + k + 1], 1);
00477 }
00478 }
00479 else {
00480 *info = k;
00481 }
00482 }
00483 }
00484 ipvt[n - 1] = n - 1;
00485 if(a[lda * (n - 1) + (n - 1)] == ZERO)
00486 *info = n - 1;
00487 }
00488
00489
00490 #ifdef __STDC__
00491 static int
00492 idamax(int n, REAL dx[], int incx)
00493 #else
00494 static int
00495 idamax(n, dx, incx)
00496
00497
00498
00499 REAL dx[];
00500 int incx, n;
00501 #endif
00502 {
00503 REAL dmax;
00504 int i, ix, itemp;
00505
00506 if(n < 1)
00507 return (-1);
00508 if(n == 1)
00509 return (0);
00510 if(incx != 1) {
00511
00512
00513
00514 ix = 1;
00515 itemp = 0;
00516 dmax = fabs((double) dx[0]);
00517 ix = ix + incx;
00518 for(i = 1; i < n; i++) {
00519 if(fabs((double) dx[ix]) > dmax) {
00520 itemp = i;
00521 dmax = fabs((double) dx[ix]);
00522 }
00523 ix = ix + incx;
00524 }
00525 }
00526 else {
00527
00528
00529
00530 itemp = 0;
00531 dmax = fabs((double) dx[0]);
00532 for(i = 1; i < n; i++) {
00533 if(fabs((double) dx[i]) > dmax) {
00534 itemp = i;
00535 dmax = fabs((double) dx[i]);
00536 }
00537 }
00538 }
00539 return (itemp);
00540 }
00541
00542
00543
00544 #ifdef __STDC__
00545 static void
00546 dmxpy(int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[])
00547 #else
00548 static void
00549 dmxpy(n1, y, n2, ldm, x, m)
00550 REAL y[], x[], m[];
00551 int n1, n2, ldm;
00552 #endif
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574
00575 {
00576 int j, i, jmin;
00577
00578
00579 j = n2 % 2;
00580 if(j >= 1) {
00581 j = j - 1;
00582 for(i = 0; i < n1; i++)
00583 y[i] = (y[i]) + x[j] * m[ldm * j + i];
00584 }
00585
00586
00587
00588 j = n2 % 4;
00589 if(j >= 2) {
00590 j = j - 1;
00591 for(i = 0; i < n1; i++)
00592 y[i] = ((y[i])
00593 + x[j - 1] * m[ldm * (j - 1) + i]) + x[j] * m[ldm * j + i];
00594 }
00595
00596
00597
00598 j = n2 % 8;
00599 if(j >= 4) {
00600 j = j - 1;
00601 for(i = 0; i < n1; i++)
00602 y[i] = ((((y[i])
00603 + x[j - 3] * m[ldm * (j - 3) + i])
00604 + x[j - 2] * m[ldm * (j - 2) + i])
00605 + x[j - 1] * m[ldm * (j - 1) + i]) + x[j] * m[ldm * j + i];
00606 }
00607
00608
00609
00610 j = n2 % 16;
00611 if(j >= 8) {
00612 j = j - 1;
00613 for(i = 0; i < n1; i++)
00614 y[i] = ((((((((y[i])
00615 + x[j - 7] * m[ldm * (j - 7) + i]) + x[j -
00616 6] * m[ldm * (j -
00617 6) +
00618 i])
00619 + x[j - 5] * m[ldm * (j - 5) + i]) + x[j - 4] * m[ldm * (j -
00620 4)
00621 + i])
00622 + x[j - 3] * m[ldm * (j - 3) + i]) + x[j - 2] * m[ldm * (j -
00623 2) +
00624 i])
00625 + x[j - 1] * m[ldm * (j - 1) + i]) + x[j] * m[ldm * j + i];
00626 }
00627
00628
00629
00630 jmin = (n2 % 16) + 16;
00631 for(j = jmin - 1; j < n2; j = j + 16) {
00632 for(i = 0; i < n1; i++)
00633 y[i] = ((((((((((((((((y[i])
00634 + x[j - 15] * m[ldm * (j - 15) + i])
00635 + x[j - 14] * m[ldm * (j - 14) + i])
00636 + x[j - 13] * m[ldm * (j - 13) + i])
00637 + x[j - 12] * m[ldm * (j - 12) + i])
00638 + x[j - 11] * m[ldm * (j - 11) + i])
00639 + x[j - 10] * m[ldm * (j - 10) + i])
00640 + x[j - 9] * m[ldm * (j - 9) + i])
00641 + x[j - 8] * m[ldm * (j - 8) + i])
00642 + x[j - 7] * m[ldm * (j - 7) + i])
00643 + x[j - 6] * m[ldm * (j - 6) + i])
00644 + x[j - 5] * m[ldm * (j - 5) + i])
00645 + x[j - 4] * m[ldm * (j - 4) + i])
00646 + x[j - 3] * m[ldm * (j - 3) + i])
00647 + x[j - 2] * m[ldm * (j - 2) + i])
00648 + x[j - 1] * m[ldm * (j - 1) + i])
00649 + x[j] * m[ldm * j + i];
00650 }
00651 }
00652
00653
00654 #ifdef __STDC__
00655 static REAL
00656 second()
00657 #else
00658 static REAL
00659 second()
00660 #endif
00661 {
00662 REAL t;
00663
00664 #ifndef MPP
00665
00666
00667 #ifndef NORUSAGE
00668 getrusage(RUSAGE_SELF, &ru);
00669 #else
00670 if(ClockTick == 0.0)
00671 ClockTick = (REAL) sysconf(_SC_CLK_TCK);
00672 if(times(&ru) == -1)
00673 fprintf(stderr, "second() : Oups\n");
00674 #endif
00675
00676
00677 #ifndef NORUSAGE
00678 t = (REAL) (ru.ru_utime.tv_sec) + ((REAL) (ru.ru_utime.tv_usec)) / 1.0e6;
00679 #else
00680 t = (REAL) ((REAL) ru.tms_utime / ClockTick);
00681 #endif
00682 #else
00683
00684 #endif
00685
00686 return t;
00687 }
00688
00689
00690
00691 int
00692 kflops()
00693 {
00694 static REAL aa[SIZE][SIZE], a[SIZE][SIZE + 1], b[SIZE], x[SIZE];
00695 REAL Newcray, ops, total, norma, normx;
00696 REAL resid, eps, t1, tm, tm2;
00697 REAL kflops_epslon(), second(), kf;
00698 static int ipvt[SIZE], n, i, ntimes, info, lda, ldaa, kflops;
00699
00700 lda = SIZE + 1;
00701 ldaa = SIZE;
00702 Newcray = .056;
00703 n = SIZE/2;
00704
00705 ops = (2.0e0 * (n * n * n)) / 3.0 + 2.0 * (n * n);
00706
00707 matgen((REAL *) a, lda, n, b, &norma);
00708 t1 = second();
00709 dgefa((REAL *) a, lda, n, ipvt, &info);
00710 my_time[0][0] = second() - t1;
00711 t1 = second();
00712 dgesl((REAL *) a, lda, n, ipvt, b, 0);
00713 my_time[1][0] = second() - t1;
00714 total = my_time[0][0] + my_time[1][0];
00715
00716
00717
00718 for(i = 0; i < n; i++) {
00719 x[i] = b[i];
00720 }
00721 matgen((REAL *) a, lda, n, b, &norma);
00722 for(i = 0; i < n; i++) {
00723 b[i] = -b[i];
00724 }
00725 dmxpy(n, b, n, lda, x, (REAL *) a);
00726 resid = 0.0;
00727 normx = 0.0;
00728 for(i = 0; i < n; i++) {
00729 resid = (resid > fabs((double) b[i]))
00730 ? resid : fabs((double) b[i]);
00731 normx = (normx > fabs((double) x[i]))
00732 ? normx : fabs((double) x[i]);
00733 }
00734 eps = kflops_epslon((REAL) ONE);
00735
00736
00737 my_time[2][0] = total;
00738 my_time[3][0] = ops / (1.0e3 * total);
00739 my_time[4][0] = 2.0e3 / my_time[3][0];
00740 my_time[5][0] = total / Newcray;
00741
00742 matgen((REAL *) a, lda, n, b, &norma);
00743 t1 = second();
00744 dgefa((REAL *) a, lda, n, ipvt, &info);
00745 my_time[0][1] = second() - t1;
00746 t1 = second();
00747 dgesl((REAL *) a, lda, n, ipvt, b, 0);
00748 my_time[1][1] = second() - t1;
00749 total = my_time[0][1] + my_time[1][1];
00750 my_time[2][1] = total;
00751 my_time[3][1] = ops / (1.0e3 * total);
00752 my_time[4][1] = 2.0e3 / my_time[3][1];
00753 my_time[5][1] = total / Newcray;
00754
00755 matgen((REAL *) a, lda, n, b, &norma);
00756 t1 = second();
00757 dgefa((REAL *) a, lda, n, ipvt, &info);
00758 my_time[0][2] = second() - t1;
00759 t1 = second();
00760 dgesl((REAL *) a, lda, n, ipvt, b, 0);
00761 my_time[1][2] = second() - t1;
00762 total = my_time[0][2] + my_time[1][2];
00763 my_time[2][2] = total;
00764 my_time[3][2] = ops / (1.0e3 * total);
00765 my_time[4][2] = 2.0e3 / my_time[3][2];
00766 my_time[5][2] = total / Newcray;
00767
00768 ntimes = NTIMES;
00769 tm2 = 0.0;
00770 t1 = second();
00771
00772 for(i = 0; i < ntimes; i++) {
00773 tm = second();
00774 matgen((REAL *) a, lda, n, b, &norma);
00775 tm2 = tm2 + second() - tm;
00776 dgefa((REAL *) a, lda, n, ipvt, &info);
00777 }
00778
00779 my_time[0][3] = (second() - t1 - tm2) / ntimes;
00780 t1 = second();
00781
00782 for(i = 0; i < ntimes; i++) {
00783 dgesl((REAL *) a, lda, n, ipvt, b, 0);
00784 }
00785
00786 my_time[1][3] = (second() - t1) / ntimes;
00787 total = my_time[0][3] + my_time[1][3];
00788 my_time[2][3] = total;
00789 my_time[3][3] = ops / (1.0e3 * total);
00790 my_time[4][3] = 2.0e3 / my_time[3][3];
00791 my_time[5][3] = total / Newcray;
00792
00793 matgen((REAL *) aa, ldaa, n, b, &norma);
00794 t1 = second();
00795 dgefa((REAL *) aa, ldaa, n, ipvt, &info);
00796 my_time[0][4] = second() - t1;
00797 t1 = second();
00798 dgesl((REAL *) aa, ldaa, n, ipvt, b, 0);
00799 my_time[1][4] = second() - t1;
00800 total = my_time[0][4] + my_time[1][4];
00801 my_time[2][4] = total;
00802 my_time[3][4] = ops / (1.0e3 * total);
00803 my_time[4][4] = 2.0e3 / my_time[3][4];
00804 my_time[5][4] = total / Newcray;
00805
00806 matgen((REAL *) aa, ldaa, n, b, &norma);
00807 t1 = second();
00808 dgefa((REAL *) aa, ldaa, n, ipvt, &info);
00809 my_time[0][5] = second() - t1;
00810 t1 = second();
00811 dgesl((REAL *) aa, ldaa, n, ipvt, b, 0);
00812 my_time[1][5] = second() - t1;
00813 total = my_time[0][5] + my_time[1][5];
00814 my_time[2][5] = total;
00815 my_time[3][5] = ops / (1.0e3 * total);
00816 my_time[4][5] = 2.0e3 / my_time[3][5];
00817 my_time[5][5] = total / Newcray;
00818
00819 matgen((REAL *) aa, ldaa, n, b, &norma);
00820 t1 = second();
00821 dgefa((REAL *) aa, ldaa, n, ipvt, &info);
00822 my_time[0][6] = second() - t1;
00823 t1 = second();
00824 dgesl((REAL *) aa, ldaa, n, ipvt, b, 0);
00825 my_time[1][6] = second() - t1;
00826 total = my_time[0][6] + my_time[1][6];
00827 my_time[2][6] = total;
00828 my_time[3][6] = ops / (1.0e3 * total);
00829 my_time[4][6] = 2.0e3 / my_time[3][6];
00830 my_time[5][6] = total / Newcray;
00831
00832 ntimes = NTIMES;
00833 tm2 = 0;
00834 t1 = second();
00835 for(i = 0; i < ntimes; i++) {
00836 tm = second();
00837 matgen((REAL *) aa, ldaa, n, b, &norma);
00838 tm2 = tm2 + second() - tm;
00839 dgefa((REAL *) aa, ldaa, n, ipvt, &info);
00840 }
00841 my_time[0][7] = (second() - t1 - tm2) / ntimes;
00842 t1 = second();
00843 for(i = 0; i < ntimes; i++) {
00844 dgesl((REAL *) aa, ldaa, n, ipvt, b, 0);
00845 }
00846 my_time[1][7] = (second() - t1) / ntimes;
00847 total = my_time[0][7] + my_time[1][7];
00848
00849 my_time[2][7] = total;
00850 my_time[3][7] = ops / (1.0e3 * total);
00851 my_time[4][7] = 2.0e3 / my_time[3][7];
00852 my_time[5][7] = total / Newcray;
00853
00854
00855
00856
00857 kf = (my_time[3][3] < my_time[3][7]) ? my_time[3][3] : my_time[3][7];
00858 kf = (kf > ZERO) ? (kf + .5) : (kf - .5);
00859 if(fabs((double) kf) < ONE)
00860 kflops = 0;
00861 else {
00862 kflops = (int) fabs((double) kf);
00863 if(kf < ZERO)
00864 kflops = -kflops;
00865 }
00866
00867 return kflops;
00868 }