14 static int eventNumCore [SVG_TRACE_MAX_CORES];
15 static double eventStartCore[SVG_TRACE_MAX_CORES][SVG_TRACE_MAX_EVENTS];
16 static double eventStopCore [SVG_TRACE_MAX_CORES][SVG_TRACE_MAX_EVENTS];
17 static int eventColorCore[SVG_TRACE_MAX_CORES][SVG_TRACE_MAX_EVENTS];
20 static int eventNumDevice [SVG_TRACE_MAX_DEVICES];
21 static cudaEvent_t evStartDevice [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
22 static cudaEvent_t evStopDevice [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
23 static double eventStartDevice[SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
24 static double eventStopDevice [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
25 static int eventColorDevice[SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
28 static int eventNumDMA [SVG_TRACE_MAX_DEVICES];
29 static cudaEvent_t evStartDMA [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
30 static cudaEvent_t evStopDMA [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
31 static double eventStartDMA[SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
32 static double eventStopDMA [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
33 static int eventColorDMA[SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
36 static long memoryLevelHost;
37 static long memoryMaxHost;
38 static long memoryLevelDevice[SVG_TRACE_MAX_DEVICES];
39 static long memoryMaxDevice [SVG_TRACE_MAX_DEVICES];
40 static pthread_spinlock_t memorySpinlockHost;
41 static pthread_spinlock_t memorySpinlockDevice[SVG_TRACE_MAX_DEVICES];
43 static void svg_trace_print_cores(
44 int mpi_rank,
int mpi_size,
45 int num_cores,
int num_devices,
46 double hscale,
double vscale, FILE *trace_file);
48 static void svg_trace_print_devices(
49 int mpi_rank,
int mpi_size,
50 int num_cores,
int num_devices,
51 double hscale,
double vscale, FILE *trace_file);
53 static void svg_trace_print_dmas(
54 int mpi_rank,
int mpi_size,
55 int num_cores,
int num_devices,
56 double hscale,
double vscale, FILE *trace_file);
58 static void svg_trace_print_memory(
int mpi_rank,
int mpi_size,
int num_devices);
68 struct timeval time_val;
69 struct timezone time_zone;
70 gettimeofday(&time_val, &time_zone);
71 return (
double)(time_val.tv_sec) + (
double)(time_val.tv_usec) / 1000000.0;
83 assert(num_cores <= SVG_TRACE_MAX_CORES);
84 assert(num_devices <= SVG_TRACE_MAX_DEVICES);
85 assert(__builtin_popcount(SVG_TRACE_MAX_EVENTS) == 0x01);
86 assert(__builtin_popcount(SVG_TRACE_MAX_MEM_SNAPSHOTS) == 0x01);
89 int retval = pthread_spin_init(
90 &memorySpinlockHost, PTHREAD_PROCESS_PRIVATE);
95 for (device = 0; device < num_devices; device++) {
97 cudaError_t error = cudaSetDevice(device);
98 assert(error == cudaSuccess);
102 for (event = 0;
event < SVG_TRACE_MAX_EVENTS;
event++) {
103 cudaError_t error1 = cudaEventCreate(&evStartDevice[device][event]);
104 cudaError_t error2 = cudaEventCreate(&evStopDevice [device][event]);
105 assert(error1 == cudaSuccess);
106 assert(error2 == cudaSuccess);
107 error1 = cudaEventCreate(&evStartDMA[device][event]);
108 error2 = cudaEventCreate(&evStopDMA [device][event]);
109 assert(error1 == cudaSuccess);
110 assert(error2 == cudaSuccess);
113 int retval = pthread_spin_init(
114 &memorySpinlockDevice[device], PTHREAD_PROCESS_PRIVATE);
127 assert(thread_rank < SVG_TRACE_MAX_CORES);
128 eventStartCore[thread_rank][eventNumCore[thread_rank]] =
get_time_of_day();
140 assert(thread_rank < SVG_TRACE_MAX_CORES);
141 eventStopCore [thread_rank][eventNumCore[thread_rank]] =
get_time_of_day();
142 eventColorCore[thread_rank][eventNumCore[thread_rank]] = color;
143 eventNumCore[thread_rank]++;
144 eventNumCore[thread_rank] &= (SVG_TRACE_MAX_EVENTS-1);
156 cudaGetDevice(&device);
157 assert(device < SVG_TRACE_MAX_DEVICES);
158 cudaError_t error = cudaEventRecord(
159 evStartDevice[device][eventNumDevice[device]], stream);
160 assert(error == cudaSuccess);
173 cudaGetDevice(&device);
174 assert(device < SVG_TRACE_MAX_DEVICES);
175 cudaError_t error = cudaEventRecord(
176 evStopDevice[device][eventNumDevice[device]], stream);
177 assert(error == cudaSuccess);
178 eventColorDevice[device][eventNumDevice[device]] = color;
179 eventNumDevice[device]++;
180 eventNumDevice[device] &= (SVG_TRACE_MAX_EVENTS-1);
192 cudaGetDevice(&device);
193 assert(device < SVG_TRACE_MAX_DEVICES);
194 cudaError_t error = cudaEventRecord(
195 evStartDMA[device][eventNumDMA[device]], stream);
196 assert(error == cudaSuccess);
209 cudaGetDevice(&device);
210 assert(device < SVG_TRACE_MAX_DEVICES);
211 cudaError_t error = cudaEventRecord(
212 evStopDMA[device][eventNumDMA[device]], stream);
213 assert(error == cudaSuccess);
214 eventColorDMA[device][eventNumDMA[device]] = color;
215 eventNumDMA[device]++;
216 eventNumDMA[device] &= (SVG_TRACE_MAX_EVENTS-1);
230 pthread_spin_lock(&memorySpinlockHost);
231 memoryLevelHost += delta;
232 if (memoryLevelHost > memoryMaxHost)
233 memoryMaxHost = memoryLevelHost;
234 pthread_spin_unlock(&memorySpinlockHost);
246 cudaGetDevice(&device);
247 assert(device < SVG_TRACE_MAX_DEVICES);
248 pthread_spin_lock(&memorySpinlockDevice[device]);
249 memoryLevelDevice[device] += delta;
250 if (memoryLevelDevice[device] > memoryMaxDevice[device])
251 memoryMaxDevice[device] = memoryLevelDevice[device];
252 pthread_spin_unlock(&memorySpinlockDevice[device]);
265 MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
266 MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
274 double max_time_in = 0.0;
277 for (core = 0; core < SVG_TRACE_MAX_CORES; core++) {
279 eventStopCore [core][eventNumCore[core]-1] -
281 if (time > max_time_in)
287 &max_time_in, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
289 max_time = max_time_in;
291 double hscale_cpu = 2000.0 / max_time;
292 double hscale_gpu = hscale_cpu / 1000.0;
295 double vscale = 1000.0 / ((num_cores+num_devices*2)*mpi_size);
299 char trace_file_name[SVG_TRACE_FILE_NAME_SIZE];
300 sprintf(trace_file_name,
"trace_%d.svg", (
int)(time(NULL)));
301 trace_file = fopen(trace_file_name,
"w");
302 assert(trace_file != NULL);
304 "<svg width=\"200mm\" height=\"100mm\" viewBox=\"0 0 2000 1000\">\n"
308 svg_trace_print_cores(
310 num_cores, num_devices,
311 hscale_cpu, vscale, trace_file);
314 svg_trace_print_devices(
316 num_cores, num_devices,
317 hscale_gpu, vscale, trace_file);
320 svg_trace_print_dmas(
322 num_cores, num_devices,
323 hscale_gpu, vscale, trace_file);
332 svg_trace_print_memory(mpi_rank, mpi_size, num_devices);
347 static void svg_trace_print_cores(
348 int mpi_rank,
int mpi_size,
349 int num_cores,
int num_devices,
350 double hscale,
double vscale, FILE *trace_file)
356 for (proc = 0; proc < mpi_size; proc++) {
361 SVG_TRACE_MAX_CORES, MPI_INT,
362 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
364 &eventStartCore[0][0],
365 SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
366 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
368 &eventStopCore[0][0],
369 SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
370 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
372 &eventColorCore[0][0],
373 SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS, MPI_INT,
374 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
376 for (core = 0; core < num_cores; core++) {
377 for (event = 0;
event < eventNumCore[core];
event++) {
378 double start = eventStartCore[core][event] - eventStopCore[0][0];
379 double stop = eventStopCore [core][event] - eventStopCore[0][0];
380 double width = (stop-start) * hscale;
381 int color = eventColorCore[core][event];
382 int thread = proc*(num_cores+2*num_devices);
386 "x=\"%.2lf\" y=\"%.0lf\" width=\"%.2lf\" height=\"%.0lf\" "
387 "fill=\"#%06x\" stroke=\"#%06x\" stroke-width=\"0.5\"/>\n",
390 width < 2.0 ? 2.0 : width,
393 color < 0 ? color : 0);
403 MPI_INT, 0, 0, MPI_COMM_WORLD);
405 &eventStartCore[0][0],
406 SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS,
407 MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
409 &eventStopCore[0][0],
410 SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS,
411 MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
413 &eventColorCore[0][0],
414 SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS,
415 MPI_INT, 0, 0, MPI_COMM_WORLD);
431 static void svg_trace_print_devices(
432 int mpi_rank,
int mpi_size,
433 int num_cores,
int num_devices,
434 double hscale,
double vscale, FILE *trace_file)
439 for (device = 0; device < num_devices; device++) {
440 cudaError_t error = cudaSetDevice(device);
441 assert(error == cudaSuccess);
442 for (event = 0;
event < eventNumDevice[device];
event++)
446 cudaEventElapsedTime(&fstart,
447 evStartDevice[device][0], evStartDevice[device][event]);
448 cudaEventElapsedTime(&fstop,
449 evStartDevice[device][0], evStopDevice[device][event]);
450 double dstart = (double)fstart;
451 double dstop = (double)fstop;
452 eventStartDevice[device][event] = dstart;
453 eventStopDevice[device][event] = dstop;
455 eventStartDevice[device][0] = 0.0;
456 eventStopDevice[device][0] = 0.0;
460 for (proc = 0; proc < mpi_size; proc++) {
465 SVG_TRACE_MAX_DEVICES, MPI_INT,
466 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
468 &eventStartDevice[0][0],
469 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
470 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
472 &eventStopDevice[0][0],
473 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
474 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
476 &eventColorDevice[0][0],
477 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_INT,
478 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
480 for (device = 0; device < num_devices; device++) {
481 for (event = 1;
event < eventNumDevice[device];
event++) {
482 double start = eventStartDevice[device][event];
483 double stop = eventStopDevice [device][event];
484 double width = (stop-start) * hscale;
485 int color = eventColorDevice[device][event];
486 int accel = proc*(num_cores+2*num_devices);
487 accel += num_cores+device;
490 "x=\"%.2lf\" y=\"%.0lf\" width=\"%.2lf\" height=\"%.0lf\" "
491 "fill=\"#%06x\" stroke=\"#%06x\" stroke-width=\"0.5\"/>\n",
494 width < 2.0 ? 2.0 : width,
497 color < 0 ? color : 0);
506 SVG_TRACE_MAX_DEVICES,
507 MPI_INT, 0, 0, MPI_COMM_WORLD);
509 &eventStartDevice[0][0],
510 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
511 MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
513 &eventStopDevice[0][0],
514 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
515 MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
517 &eventColorDevice[0][0],
518 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
519 MPI_INT, 0, 0, MPI_COMM_WORLD);
535 static void svg_trace_print_dmas(
536 int mpi_rank,
int mpi_size,
537 int num_cores,
int num_devices,
538 double hscale,
double vscale, FILE *trace_file)
543 for (device = 0; device < num_devices; device++) {
544 cudaError_t error = cudaSetDevice(device);
545 assert(error == cudaSuccess);
546 for (event = 0;
event < eventNumDMA[device];
event++)
550 cudaEventElapsedTime(&fstart,
551 evStartDMA[device][0], evStartDMA[device][event]);
552 cudaEventElapsedTime(&fstop,
553 evStartDMA[device][0], evStopDMA[device][event]);
554 double dstart = (double)fstart;
555 double dstop = (double)fstop;
556 eventStartDMA[device][event] = dstart;
557 eventStopDMA[device][event] = dstop;
559 eventStartDMA[device][0] = 0.0;
560 eventStopDMA[device][0] = 0.0;
564 for (proc = 0; proc < mpi_size; proc++) {
569 SVG_TRACE_MAX_DEVICES, MPI_INT,
570 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
572 &eventStartDMA[0][0],
573 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
574 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
577 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
578 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
580 &eventColorDMA[0][0],
581 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_INT,
582 proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
584 for (device = 0; device < num_devices; device++) {
585 for (event = 1;
event < eventNumDMA[device];
event++) {
586 double start = eventStartDMA[device][event];
587 double stop = eventStopDMA [device][event];
588 double width = (stop-start) * hscale;
589 int color = eventColorDMA[device][event];
590 int accel = proc*(num_cores+2*num_devices);
591 accel += num_cores+num_devices+device;
594 "x=\"%.2lf\" y=\"%.0lf\" width=\"%.2lf\" height=\"%.0lf\" "
595 "fill=\"#%06x\" stroke=\"#%06x\" stroke-width=\"0.5\"/>\n",
598 width < 2.0 ? 2.0 : width,
601 color < 0 ? color : 0);
610 SVG_TRACE_MAX_DEVICES,
611 MPI_INT, 0, 0, MPI_COMM_WORLD);
613 &eventStartDMA[0][0],
614 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
615 MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
618 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
619 MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
621 &eventColorDMA[0][0],
622 SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
623 MPI_INT, 0, 0, MPI_COMM_WORLD);
635 static void svg_trace_print_memory(
int mpi_rank,
int mpi_size,
int num_devices)
639 for (rank = 0; rank < mpi_size; rank++) {
640 if (rank == mpi_rank) {
643 if (labs(memoryMaxHost) < 1024)
644 printf(
"Host %d max:\t%ld B\n",
645 rank, memoryMaxHost);
646 else if (labs(memoryMaxHost) < 1024*1024)
647 printf(
"Host %d max:\t%.1lf KB\n",
648 rank, memoryMaxHost/1024.0);
649 else if (labs(memoryMaxHost) < 1024*1024*1024)
650 printf(
"Host %d max:\t%.1lf MB\n",
651 rank, memoryMaxHost/1024.0/1024.0);
653 printf(
"Host %d max:\t%.1lf GB\n",
654 rank, memoryMaxHost/1024.0/1024.0/1024.0);
657 if (labs(memoryLevelHost) < 1024)
658 printf(
"Host %d end:\t%ld B\n",
659 rank, memoryLevelHost);
660 else if (labs(memoryLevelHost) < 1024*1024)
661 printf(
"Host %d end:\t%.1lf KB\n",
662 rank, memoryLevelHost/1024.0);
663 else if (labs(memoryLevelHost) < 1024*1024*1024)
664 printf(
"Host %d end:\t%.1lf MB\n",
665 rank, memoryLevelHost/1024.0/1024.0);
667 printf(
"Host %d end:\t%.1lf GB\n",
668 rank, memoryLevelHost/1024.0/1024.0/1024.0);
671 for (device = 0; device < num_devices; device++) {
674 if (labs(memoryMaxDevice[device]) < 1024)
675 printf(
" Device %d max:\t%ld B\n",
676 device, memoryMaxDevice[device]);
677 else if (labs(memoryMaxDevice[device]) < 1024*1024)
678 printf(
" Device %d max:\t%.1lf KB\n",
679 device, memoryMaxDevice[device]/1024.0);
680 else if (labs(memoryMaxDevice[device]) < 1024*1024*1024)
681 printf(
" Device %d max:\t%.1lf MB\n",
682 device, memoryMaxDevice[device]/1024.0/1024.0);
684 printf(
" Device %d max:\t%.1lf GB\n",
685 device, memoryMaxDevice[device]/1024.0/1024.0/1024.0);
688 if (labs(memoryLevelDevice[device]) < 1024)
689 printf(
" Device %d end:\t%ld B\n",
690 device, memoryLevelDevice[device]);
691 else if (labs(memoryLevelDevice[device]) < 1024*1024)
692 printf(
" Device %d end:\t%.1lf KB\n",
693 device, memoryLevelDevice[device]/1024.0);
694 else if (labs(memoryLevelDevice[device]) < 1024*1024*1024)
695 printf(
" Device %d end:\t%.1lf MB\n",
696 device, memoryLevelDevice[device]/1024.0/1024.0);
698 printf(
" Device %d end:\t%.1lf GB\n",
699 device, memoryLevelDevice[device]/1024.0/1024.0/1024.0);
702 MPI_Barrier(MPI_COMM_WORLD);