33 void matmul (
double *A,
double *B,
double *C,
int NB )
41 C[i+j*NB] += A[i+k*NB] * B[k+j*NB];
63 sizeof(
double)*NB*NB, A,
INPUT,
64 sizeof(
double)*NB*NB, B,
INPUT,
65 sizeof(
double)*NB*NB, C,
INOUT,
66 sizeof(
int), &NB,
VALUE,
74 printf(
"%s (intial 4x4 submatrix)\n", label);
75 for (i=0; i<N && i<4; i++) {
76 for (j=0; j<N && j<4; j++) {
77 printf(
"%5.2f ", A[i + j*N] );
90 diff = fabs( A[i*N+j] - A2[i*N+j] );
91 if ( diff > 0.0001 ) {
104 if (x->tv_usec < y->tv_usec) {
105 int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
106 y->tv_usec -= 1000000 * nsec;
109 if (x->tv_usec - y->tv_usec > 1000000) {
110 int nsec = (x->tv_usec - y->tv_usec) / 1000000;
111 y->tv_usec += 1000000 * nsec;
116 result->tv_sec = x->tv_sec - y->tv_sec;
117 result->tv_usec = x->tv_usec - y->tv_usec;
119 return (
double)result->tv_sec + (double)result->tv_usec/1000000.0;
127 for (X = 0; X < BB; X++)
128 for (Y = 0; Y < BB; Y++)
129 for (x = 0; x < NB; x++)
130 for (y = 0; y < NB; y++)
131 Ablk[Y*NB*NB + y + X*NB*NB*BB + x*NB] = A[Y*NB + y + X*NB*N + x*N];
139 for (X = 0; X < BB; X++)
140 for (Y = 0; Y < BB; Y++)
141 for (x = 0; x < NB; x++)
142 for (y = 0; y < NB; y++) {
143 A[Y*NB + y + X*NB*N + x*N] = Ablk[Y*NB*NB + y + X*NB*NB*BB + x*NB];
155 double *A = (
double*)malloc(N*N*
sizeof(
double));
156 double *Ablk = (
double*)malloc(N*N*
sizeof(
double));
157 double *B = (
double*)malloc(N*N*
sizeof(
double));
158 double *Bblk = (
double*)malloc(N*N*
sizeof(
double));
159 double *C_direct = (
double*)malloc(N*N*
sizeof(
double));
160 double *C = (
double*)malloc(N*N*
sizeof(
double));
161 double *Cblk = (
double*)malloc(N*N*
sizeof(
double));
162 double *C_quark = (
double*)malloc(N*N*
sizeof(
double));
163 double *C_quark_blk = (
double*)malloc(N*N*
sizeof(
double));
164 struct timeval tstart, tend, tdiff;
165 double t_blk=0, t_quark=0, t_direct=0;
168 for (i = 0; i < N; i++) {
169 for (j = 0; j < N; j++) {
170 A[i+j*N] = (double)1.0+i;
171 B[i+j*N] = (double)2.0+i+j;
172 C_quark[i+j*N] = C_direct[i+j*N] = C[i+j*N] = 3.0;
188 printf(
"Doing matrix multiplication using serial tile-by-tile algorithm\n");
189 gettimeofday( &tstart, NULL );
190 for (i = 0; i < BB; i++)
191 for (j = 0; j < BB; j++)
192 for (k = 0; k < BB; k++)
193 matmul ( &Ablk[NB*NB*i + NB*NB*BB*k], &Bblk[NB*NB*k + NB*NB*BB*j], &Cblk[NB*NB*i + NB*NB*BB*j], NB);
194 gettimeofday( &tend, NULL );
196 printf(
"Time taken: %f\n", tdiff.tv_sec + (
double)tdiff.tv_usec/1000000 );
198 matrix_print(
"Printing C produced by serial tile-algorithm after computation", C, N);
203 printf(
"Doing matrix multiplication using the multi-threaded QUARK runtime for a tile based algorithm\n");
205 gettimeofday( &tstart, NULL );
206 for (i = 0; i < BB; i++)
207 for (j = 0; j < BB; j++)
208 for (k = 0; k < BB; k++)
209 matmul_quark_call ( quark, &Ablk[NB*NB*i + NB*NB*BB*k], &Bblk[NB*NB*k + NB*NB*BB*j], &C_quark_blk[NB*NB*i + NB*NB*BB*j], NB);
211 gettimeofday( &tend, NULL );
213 printf(
"Time taken: %f\n", tdiff.tv_sec + (
double)tdiff.tv_usec/1000000 );
216 matrix_print(
"Printing C produced by QUARK runtime after computation", C_quark, N);
221 printf(
"Doing matrix multiplication using direct loops (ie, view matrix as one big tile)\n");
222 gettimeofday( &tstart, NULL );
223 matmul ( A, B, C_direct, N );
224 gettimeofday( &tend, NULL );
226 printf(
"Time taken: %f\n", (
double)(tdiff.tv_sec + (
double)tdiff.tv_usec/1000000) );
227 matrix_print(
"Printing C produced by direct matmul after computation", C_direct, N);
231 printf(
"Comparing result matrices (direct versus QUARK)\n");
233 printf(
"Number of differences: %d\n", nerr);
236 printf(
"Summary of time taken\n");
237 printf(
"%-12s %-12s %-12s (%d threads)\n",
"BigLoops",
"SerialBlocks",
"QUARKBlocks", THREADS);
238 printf(
"%-12.5f %-12.5f %-12.5f\n", t_direct, t_blk, t_quark);
240 free(A); free(Ablk); free(B); free(Bblk); free(C); free(Cblk); free(C_direct); free(C_quark); free(C_quark_blk);
245 int main (
int argc,
char **argv)
249 printf(
"Usage: %s NB N THREADS\n", argv[0] );
250 printf(
"Usage: Note: N / NB must be an integer\n" );
251 printf(
"Usage: Assuming a simple test run, with the following parameters\n" );
252 printf(
"%s 200 800 2\n", argv[0] );
259 THREADS = atoi(argv[3]);
261 assert( N % NB == 0 );