4#include <sys/resource.h>
18 long long int ev_values[2];
19 int i32_00, i32_01, i32_02, i32_03, i32_04, i32_05, i32_06, i32_07, i32_08, i32_09;
41 for(
int i=0;
i<M;
i++){
42 for(
int j=0; j<
N; j++){
108 fprintf(
fp,
"%d %lld # INT_ADD_count: %lld (%.3lf)\n",
N, ev_values[0], 50LL*
N*M, ev_values[0]/(50.0*
N*M));
110 sum_i32 += i32_00 + i32_01 + i32_02 + i32_03 + i32_04 + i32_05 + i32_06 + i32_07 + i32_08 + i32_09;
123 long long int ev_values[2];
124 double f64_00, f64_01, f64_02, f64_03;
127 f64_00 = (double)p/1.02;
128 f64_01 = -(double)p/1.03;
129 f64_02 = (double)p/1.04;
130 f64_03 = -(double)p/1.05;
140#define FADD_BLOCK() {f64_01 += f64_00; f64_02 += f64_01; f64_03 += f64_02; f64_00 += f64_03;}
142 for(
int i=0;
i<M;
i++){
143 for(
int j=0; j<
N; j++){
163 long long int fp_op_count = 40LL*
N*M;
164 fprintf(
fp,
"%d %lld # FP_ADD_count: %lld (%.3lf)\n",
N, ev_values[0], fp_op_count, (
double)ev_values[0]/fp_op_count);
166 sum_f64 += f64_00 + f64_01 + f64_02 + f64_03;
176 long long int ev_values[2];
177 double f64_00, f64_01, f64_02, f64_03, f64_04, f64_05, f64_06, f64_07;
178 double f64_08, f64_09, f64_10, f64_11;
179 double f64_100, f64_101, f64_102;
214#define F64_ADDS(_X) {f64_00 += _X; f64_01 += _X; f64_02 += _X; f64_03 += _X; f64_04 += _X; f64_05 += _X; f64_06 += _X; f64_07 += _X; f64_08 += _X; f64_09 += _X; f64_10 += _X; f64_11 += _X;}
216 for(
int i=0;
i<M;
i++){
217 for(
int j=0; j<
N; j++){
231 fprintf(
fp,
"%d %lld # FP_ADD_count_ILP12: %lld (%.3lf)\n",
N, ev_values[0], 12LL*3LL*
N*M, (
double)ev_values[0]/(12.0*3.0*
N*M));
233 sum_f64 += f64_00 + f64_01 + f64_02 + f64_03 + f64_04 + f64_05 + f64_06 + f64_07;
234 sum_f64 += f64_08 + f64_09 + f64_10 + f64_11;
246 long long int ev_values[2];
/* Buffer length for this test: 512 elements plus 2 extra.
 * NOTE(review): the +2 presumably provides head-room for the a[j-2]
 * look-back in the loop that follows -- confirm against the array
 * declarations.
 * The expansion is parenthesized so it behaves as a single value inside
 * any larger expression (e.g. 2*BUFFER_SIZE), consistent with the
 * (256+1) and (256+8) definitions used by the memory tests in this file. */
#define BUFFER_SIZE (512+2)
267 for(
int i=0;
i<UB;
i++){
269 a[j] =
a[j-2] +
b[j];
279 fprintf(
fp,
"%d %lld # FP_ADD_DVEC128_count: %lld (%.3lf)\n",
N, ev_values[0],
N*M/2LL, (
double)ev_values[0]/(
N*M/2.0));
292 long long int ev_values[2];
/* Buffer length for this test: 512 elements plus 4 extra.
 * NOTE(review): the +4 presumably provides head-room for the a[j-4]
 * look-back in the loop that follows -- confirm against the array
 * declarations.
 * The expansion is parenthesized so it behaves as a single value inside
 * any larger expression (e.g. 2*BUFFER_SIZE), consistent with the
 * (256+1) and (256+8) definitions used by the memory tests in this file. */
#define BUFFER_SIZE (512+4)
313 for(
int i=0;
i<UB;
i++){
315 a[j] =
a[j-4] +
b[j];
325 fprintf(
fp,
"%d %lld # FP_ADD_DVEC256_count: %lld (%.3lf)\n",
N, ev_values[0],
N*M/4LL, (
double)ev_values[0]/(
N*M/4.0));
338 long long int ev_values[2];
/* Buffer length for this test: 512 elements plus 8 extra.
 * NOTE(review): the +8 presumably provides head-room for the a[j-8]
 * look-back in the loop that follows -- confirm against the array
 * declarations.
 * The expansion is parenthesized so it behaves as a single value inside
 * any larger expression (e.g. 2*BUFFER_SIZE), consistent with the
 * (256+1) and (256+8) definitions used by the memory tests in this file. */
#define BUFFER_SIZE (512+8)
359 for(
int i=0;
i<UB;
i++){
361 a[j] =
a[j-8] +
b[j];
371 fprintf(
fp,
"%d %lld # FP_ADD_DVEC512_count: %lld (%.3lf)\n",
N, ev_values[0],
N*M/8LL, (
double)ev_values[0]/(
N*M/8.0));
387 long long int ev_values[2];
/* Buffer length for this test: 512 elements plus 2 extra.
 * NOTE(review): the +2 presumably provides head-room for the a[j-2]
 * look-back in the loop that follows -- confirm against the array
 * declarations.
 * The expansion is parenthesized so it behaves as a single value inside
 * any larger expression (e.g. 2*BUFFER_SIZE), consistent with the
 * (256+1) and (256+8) definitions used by the memory tests in this file. */
#define BUFFER_SIZE (512+2)
408 for(
int i=0;
i<UB;
i++){
410 a[j] =
a[j-2] +
b[j];
420 fprintf(
fp,
"%d %lld # FP_ADD_SVEC128_count: %lld (%.3lf)\n",
N, ev_values[0],
N*M/2LL, (
double)ev_values[0]/(
N*M/2.0));
433 long long int ev_values[2];
/* Buffer length for this test: 512 elements plus 4 extra.
 * NOTE(review): the +4 presumably provides head-room for the a[j-4]
 * look-back in the loop that follows -- confirm against the array
 * declarations.
 * The expansion is parenthesized so it behaves as a single value inside
 * any larger expression (e.g. 2*BUFFER_SIZE), consistent with the
 * (256+1) and (256+8) definitions used by the memory tests in this file. */
#define BUFFER_SIZE (512+4)
454 for(
int i=0;
i<UB;
i++){
456 a[j] =
a[j-4] +
b[j];
466 fprintf(
fp,
"%d %lld # FP_ADD_SVEC256_count: %lld (%.3lf)\n",
N, ev_values[0],
N*M/4LL, (
double)ev_values[0]/(
N*M/4.0));
479 long long int ev_values[2];
/* Buffer length for this test: 512 elements plus 8 extra.
 * NOTE(review): the +8 presumably provides head-room for the a[j-8]
 * look-back in the loop that follows -- confirm against the array
 * declarations.
 * The expansion is parenthesized so it behaves as a single value inside
 * any larger expression (e.g. 2*BUFFER_SIZE), consistent with the
 * (256+1) and (256+8) definitions used by the memory tests in this file. */
#define BUFFER_SIZE (512+8)
500 for(
int i=0;
i<UB;
i++){
502 a[j] =
a[j-8] +
b[j];
512 fprintf(
fp,
"%d %lld # FP_ADD_SVEC512_count: %lld (%.3lf)\n",
N, ev_values[0],
N*M/8LL, (
double)ev_values[0]/(
N*M/8.0));
528 long long int ev_values[2];
529 double f64_00, f64_01, f64_02, f64_03;
532 f64_00 = (double)p/1.02;
533 f64_01 = -(double)p/1.03;
534 f64_02 = (double)p/1.04;
535 f64_03 = -(double)p/1.05;
545#define FSUB_BLOCK() {f64_01 -= f64_00; f64_02 -= f64_01; f64_03 -= f64_02; f64_00 -= f64_03;}
547 for(
int i=0;
i<M;
i++){
548 for(
int j=0; j<
N; j++){
568 long long int fp_op_count = 40LL*
N*M;
569 fprintf(
fp,
"%d %lld # FP_SUB_count: %lld (%.3lf)\n",
N, ev_values[0], fp_op_count, (
double)ev_values[0]/fp_op_count);
571 sum_f64 += f64_00 + f64_01 + f64_02 + f64_03;
581 long long int ev_values[2];
582 double f64_00, f64_01, f64_02, f64_03, f64_04, f64_05, f64_06, f64_07;
583 double f64_08, f64_09, f64_10, f64_11;
584 double f64_100, f64_101, f64_102;
619#define F64_SUBS(_X) {f64_00 -= _X; f64_01 -= _X; f64_02 -= _X; f64_03 -= _X; f64_04 -= _X; f64_05 -= _X; f64_06 -= _X; f64_07 -= _X; f64_08 -= _X; f64_09 -= _X; f64_10 -= _X; f64_11 -= _X;}
621 for(
int i=0;
i<M;
i++){
622 for(
int j=0; j<
N; j++){
636 fprintf(
fp,
"%d %lld # FP_SUB_count_ILP12: %lld (%.3lf)\n",
N, ev_values[0], 12LL*3LL*
N*M, (
double)ev_values[0]/(12.0*3.0*
N*M));
638 sum_f64 += f64_00 + f64_01 + f64_02 + f64_03 + f64_04 + f64_05 + f64_06 + f64_07;
639 sum_f64 += f64_08 + f64_09 + f64_10 + f64_11;
651 long long int ev_values[2];
652 double f64_00, f64_01, f64_02, f64_03;
655 f64_00 = (double)p/1.02;
656 f64_01 = 1.03/(double)p;
657 f64_02 = (double)p/1.04;
658 f64_03 = 1.05/(double)p;
668#define FMUL_BLOCK() {f64_01 *= f64_00; f64_02 *= f64_01; f64_03 *= f64_02; f64_00 *= f64_03;}
670 for(
int i=0;
i<M;
i++){
671 for(
int j=0; j<
N; j++){
691 long long int fp_op_count = 40LL*
N*M;
692 fprintf(
fp,
"%d %lld # FP_MUL_count: %lld (%.3lf)\n",
N, ev_values[0], fp_op_count, (
double)ev_values[0]/fp_op_count);
694 sum_f64 += f64_00 + f64_01 + f64_02 + f64_03;
704 long long int ev_values[2];
705 double f64_00, f64_01, f64_02, f64_03, f64_04, f64_05, f64_06, f64_07;
706 double f64_08, f64_09, f64_10, f64_11;
707 double f64_100, f64_101, f64_102;
742#define F64_MULS(_X) {f64_00 *= _X; f64_01 *= _X; f64_02 *= _X; f64_03 *= _X; f64_04 *= _X; f64_05 *= _X; f64_06 *= _X; f64_07 *= _X; f64_08 *= _X; f64_09 *= _X; f64_10 *= _X; f64_11 *= _X;}
744 for(
int i=0;
i<M;
i++){
745 for(
int j=0; j<
N; j++){
759 fprintf(
fp,
"%d %lld # FP_MUL_count_ILP12: %lld (%.3lf)\n",
N, ev_values[0], 12LL*3LL*
N*M, (
double)ev_values[0]/(12.0*3.0*
N*M));
761 sum_f64 += f64_00 + f64_01 + f64_02 + f64_03 + f64_04 + f64_05 + f64_06 + f64_07;
762 sum_f64 += f64_08 + f64_09 + f64_10 + f64_11;
774 long long int ev_values[2];
775 double f64_00, f64_01, f64_02, f64_03;
778 f64_00 = 1.0 + 1.0/(1000.1*(double)p);
779 f64_01 = 1.0 + 1.0/(1000.2*(double)p);
780 f64_02 = 1.0 + 1.0/(1000.3*(double)p);
781 f64_03 = 1.0 + 1.0/(1000.4*(double)p);
791#define FDIV_BLOCK() {f64_01 /= f64_00; f64_02 /= f64_01; f64_03 /= f64_02; f64_00 /= f64_03;}
793 for(
int i=0;
i<M;
i++){
794 for(
int j=0; j<
N; j++){
814 long long int fp_op_count = 40LL*
N*M;
815 fprintf(
fp,
"%d %lld # FP_DIV_count: %lld (%.3lf)\n",
N, ev_values[0], fp_op_count, (
double)ev_values[0]/fp_op_count);
817 sum_f64 += f64_00 + f64_01 + f64_02 + f64_03;
827 long long int ev_values[2];
828 double f64_00, f64_01, f64_02, f64_03, f64_04, f64_05, f64_06, f64_07;
829 double f64_08, f64_09, f64_10, f64_11;
830 double f64_100, f64_101, f64_102;
865#define F64_DIVS(_X) {f64_00 /= _X; f64_01 /= _X; f64_02 /= _X; f64_03 /= _X; f64_04 /= _X; f64_05 /= _X; f64_06 /= _X; f64_07 /= _X; f64_08 /= _X; f64_09 /= _X; f64_10 /= _X; f64_11 /= _X;}
867 for(
int i=0;
i<M;
i++){
868 for(
int j=0; j<
N; j++){
882 fprintf(
fp,
"%d %lld # FP_DIV_count_ILP12: %lld (%.3lf)\n",
N, ev_values[0], 12LL*3LL*
N*M, (
double)ev_values[0]/(12.0*3.0*
N*M));
884 sum_f64 += f64_00 + f64_01 + f64_02 + f64_03 + f64_04 + f64_05 + f64_06 + f64_07;
885 sum_f64 += f64_08 + f64_09 + f64_10 + f64_11;
897 long long int ev_values[2];
899#define BUFFER_SIZE 256
904 buffer[
i] = p/(1223+
i);
915 uintptr_t index = buffer[0];
916 for(
int i=0;
i<M;
i++){
917 for(
int j=0; j<
N; j++){
928 fprintf(
fp,
"%d %lld # MEM_OPS_RO_count: %lld (%.3lf)\n",
N, ev_values[0], 1LL*
N*M, ev_values[0]/(1.0*
N*M));
939 long long int ev_values[2];
941#define BUFFER_SIZE (256+1)
946 buffer[
i] = p/(1223+
i);
957 for(
int i=0;
i<M;
i++){
958 for(
int j=0; j<
N; j++){
960 buffer[index+1] += buffer[index];
970 fprintf(
fp,
"%d %lld # MEM_OPS_RW_count: %lld (%.3lf)\n",
N, ev_values[0], 2LL*
N*M, ev_values[0]/(2.0*
N*M));
981 long long int ev_values[2];
984#define BUFFER_SIZE (256+8)
989 buffer[
i] = p/(223+
i);
993 c0 = (
int)((5+p)/(12+1));
994 c1 = (
int)((7+p)/(12+2));
995 c2 = (
int)((11+p)/(12+3));
996 c3 = (
int)((13+p)/(12+4));
1006 for(
int i=0;
i<M;
i++){
1008 uintptr_t base =
i*(c0+c1)/(c2+c3+1);
1009 for(
int j=0; j<
N; j++){
1012 c0 += buffer[offset+1];
1013 c1 += buffer[offset+2];
1014 c2 += buffer[offset+3];
1015 c3 += buffer[offset+4];
1017 c0 += buffer[offset+5];
1018 c1 += buffer[offset+6];
1019 c2 += buffer[offset+7];
1020 c3 += buffer[offset+8];
1031 fprintf(
fp,
"%d %lld # MEM_OPS_RO_count(par): %lld (%.3lf)\n",
N, ev_values[0], 8LL*
N*M, ev_values[0]/(8.0*
N*M));
1042 long long int ev_values[2];
1045#define BUFFER_SIZE 256
1050 buffer[
i] = p/(1223+
i);
1062 for(
int i=0;
i<M;
i++){
1063 for(
int j=0; j<
N; j++){
1074 fprintf(
fp,
"%d %lld # MEM_OPS_WO_count(par): %lld (%.3lf)\n",
N, ev_values[0], 1LL*
N*M, ev_values[0]/(1.0*
N*M));
1088 int minM=64, minN=64;
1089 double f[4] = {1.0, 1.1892, 1.4142, 1.6818};
1090 int p = (
int)getpid();
1093 fprintf(
fp,
"# (((2+3)*N)+3)*M\n");
1094 for(
i=16;
i<50;
i*=2){
1096 M = (
int)(
i*
f[j]*minM);
1104 fprintf(
fp,
"# (((2+3)*N)+3)*M\n");
1105 for(
i=16;
i<50;
i*=2){
1107 M = (
int)(
i*
f[j]*minM);
1115 fprintf(
fp,
"# (((9+3)*N)+3)*M\n");
1116 for(
i=16;
i<50;
i*=2){
1118 M = (
int)(
i*
f[j]*minM);
1126 fprintf(
fp,
"# (((2+3)*N)+3)*M\n");
1127 for(
i=16;
i<50;
i*=2){
1129 M = (
int)(
i*
f[j]*minM);
1137 fprintf(
fp,
"# (((50.0+3)*N)+3)*M\n");
1138 for(
i=16;
i<50;
i*=2){
1140 M = (
int)(
i*
f[j]*minM);
1148 fprintf(
fp,
"# (((40+3)*N)+3)*M\n");
1149 for(
i=16;
i<50;
i*=2){
1151 M = (
int)(
i*
f[j]*minM);
1159 fprintf(
fp,
"# (((12.0*3+5)*N)+3)*M\n");
1160 for(
i=16;
i<50;
i*=2){
1162 M = (
int)(
i*
f[j]*minM);
1170 fprintf(
fp,
"# (((40+3)*N)+3)*M\n");
1171 for(
i=16;
i<50;
i*=2){
1173 M = (
int)(
i*
f[j]*minM);
1181 fprintf(
fp,
"# (((12.0*3+5)*N)+3)*M\n");
1182 for(
i=16;
i<50;
i*=2){
1184 M = (
int)(
i*
f[j]*minM);
1192 fprintf(
fp,
"# (((40+3)*N)+3)*M\n");
1193 for(
i=16;
i<50;
i*=2){
1195 M = (
int)(
i*
f[j]*minM);
1203 fprintf(
fp,
"# (((12.0*3+5)*N)+3)*M\n");
1204 for(
i=16;
i<50;
i*=2){
1206 M = (
int)(
i*
f[j]*minM);
1214 fprintf(
fp,
"# (((40+3)*N)+3)*M\n");
1215 for(
i=16;
i<50;
i*=2){
1217 M = (
int)(
i*
f[j]*minM/4);
1218 N = (
int)(
i*
f[j]*minN/4);
1225 fprintf(
fp,
"# (((12.0*3+5)*N)+3)*M\n");
1226 for(
i=16;
i<50;
i*=2){
1228 M = (
int)(
i*
f[j]*minM/4);
1229 N = (
int)(
i*
f[j]*minN/4);
1236 fprintf(
fp,
"# (((2+3)*N)+3)*M\n");
1237 for(
i=16;
i<50;
i*=2){
1239 M = (
int)(
i*
f[j]*minM*2);
1240 N = (
int)(
i*
f[j]*minN*2);
1247 fprintf(
fp,
"# (((2+3)*N)+3)*M\n");
1248 for(
i=16;
i<50;
i*=2){
1250 M = (
int)(
i*
f[j]*minM*2);
1251 N = (
int)(
i*
f[j]*minN*2);
1258 fprintf(
fp,
"# (((2+3)*N)+3)*M\n");
1259 for(
i=16;
i<50;
i*=2){
1261 M = (
int)(
i*
f[j]*minM*2);
1262 N = (
int)(
i*
f[j]*minN*2);
1269 fprintf(
fp,
"# (((2+3)*N)+3)*M\n");
1270 for(
i=16;
i<50;
i*=2){
1272 M = (
int)(
i*
f[j]*minM*2);
1273 N = (
int)(
i*
f[j]*minN*2);
1280 fprintf(
fp,
"# (((2+3)*N)+3)*M\n");
1281 for(
i=16;
i<50;
i*=2){
1283 M = (
int)(
i*
f[j]*minM*2);
1284 N = (
int)(
i*
f[j]*minN*2);
1291 fprintf(
fp,
"# (((2+3)*N)+3)*M\n");
1292 for(
i=16;
i<50;
i*=2){
1294 M = (
int)(
i*
f[j]*minM*2);
1295 N = (
int)(
i*
f[j]*minN*2);
1302 fprintf(
stderr,
"Side-effect to disable dead code elimination by the compiler. Please ignore.\n");
1313 const char *sufx =
".instr";
1318 int l = strlen(outdir)+strlen(papi_event_name)+strlen(sufx);
1319 if (NULL == (papiFileName = (
char *)calloc( 1+l,
sizeof(
char)))) {
1322 if (l != (sprintf(papiFileName,
"%s%s%s", outdir, papi_event_name, sufx))) {
1325 if (NULL == (ofp_papi = fopen(papiFileName,
"w"))) {
1326 fprintf(
stderr,
"Failed to open file %s.\n", papiFileName);
Add a PAPI preset or native hardware event, by name, to an EventSet.
Empty and destroy an EventSet.
Create a new empty PAPI EventSet.
Empty and destroy an EventSet.
Start counting hardware events in an event set.
Stop counting hardware events in an event set.
Returns a string describing the PAPI error code.
void test_f64_add_DVEC512(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_div(int p, int M, int N, int EventSet, FILE *fp)
void test_mem_ops_parallel_WO(int p, int M, int N, int EventSet, FILE *fp)
void test_mem_ops_parallel_RO(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_add(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_add_DVEC128(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_add_SVEC256(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_mul(int p, int M, int N, int EventSet, FILE *fp)
void test_mem_ops_serial_RO(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_add_SVEC512(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_add_SVEC128(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_sub_max(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_add_DVEC256(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_div_max(int p, int M, int N, int EventSet, FILE *fp)
void test_int_add(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_sub(int p, int M, int N, int EventSet, FILE *fp)
void instr_driver(char *papi_event_name, hw_desc_t *hw_desc, char *outdir)
void test_f64_add_max(int p, int M, int N, int EventSet, FILE *fp)
void test_f64_mul_max(int p, int M, int N, int EventSet, FILE *fp)
void test_mem_ops_serial_RW(int p, int M, int N, int EventSet, FILE *fp)
void instr_test(int EventSet, FILE *fp)
static double a[MATRIX_SIZE][MATRIX_SIZE]
static double b[MATRIX_SIZE][MATRIX_SIZE]
Return codes and API definitions.
int fclose(FILE *__stream)