29 int num_threads,
int num_devices,
void *global_store,
33 prt_assert(num_threads >= 0,
"negative number of threads");
34 prt_assert(num_devices >= 0,
"negative number of devices");
35 prt_assert(vdp_mapping != NULL,
"NULL mapping function");
39 prt_assert(vsa != NULL,
"malloc failed");
43 int retval = MPI_Initialized(&initialized);
44 prt_assert(retval == MPI_SUCCESS,
"MPI_Initialized failed");
46 MPI_Comm_rank(MPI_COMM_WORLD, &vsa->node_rank);
47 MPI_Comm_size(MPI_COMM_WORLD, &vsa->num_nodes);
54 vsa->num_threads = num_threads;
55 vsa->num_cores = vsa->num_nodes*vsa->num_threads;
56 vsa->thread_warmup_func = NULL;
58 vsa->num_devices = num_devices;
59 vsa->num_accelerators = vsa->num_nodes*vsa->num_devices;
60 vsa->device_warmup_func = NULL;
62 vsa->vdp_mapping = vdp_mapping;
65 vsa->global_store = global_store;
68 vsa->concurrency = num_threads;
69 if (vsa->num_nodes > 1 || vsa->num_devices > 0) {
71 vsa->proxy->vsa = vsa;
75 pthread_setconcurrency(vsa->concurrency);
76 pthread_attr_init(&vsa->thread_attr);
77 pthread_attr_setscope(&vsa->thread_attr, PTHREAD_SCOPE_SYSTEM);
82 prt_assert(vsa->thread != NULL,
"malloc failed");
83 for (i = 0; i < vsa->num_threads; i++) {
84 vsa->thread[i] =
prt_thread_new(i, vsa->node_rank*vsa->num_threads+i, i);
85 vsa->thread[i]->vsa = vsa;
90 prt_assert(vsa->device != NULL,
"malloc failed");
91 for (i = 0; i < vsa->num_devices; i++) {
92 int agent = vsa->num_threads+i;
95 vsa->device[i]->vsa = vsa;
100 prt_assert(vsa->devmem != NULL,
"malloc failed");
101 for (i = 0; i < vsa->num_devices; i++) {
105 error = cudaSetDevice(i);
106 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
107 error = cudaMemGetInfo(&mem_free, &mem_total);
108 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
110 num_segments = num_segments * 4 / 5;
111 prt_assert(num_segments > 0,
"zero segments available");
114 prt_assert(vsa->devmem[i] != NULL,
"gpu_malloc_init failed");
118 pthread_barrier_init(&vsa->barrier, NULL, vsa->concurrency);
127 prt_assert(vsa->channel_lists != NULL,
"malloc failed");
143 prt_assert(vsa != NULL,
"NULL VSA");
152 if (vsa->proxy != NULL)
156 pthread_barrier_destroy(&vsa->barrier);
160 for (i = 0; i < vsa->num_threads; i++)
165 for (i = 0; i < vsa->num_devices; i++)
170 for (i = 0; i < vsa->num_devices; i++) {
171 cudaError_t error = cudaSetDevice(i);
172 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
174 prt_assert(retval == 0,
"gpu_malloc_fini failed");
179 pthread_attr_destroy(&vsa->thread_attr);
203 prt_assert(vsa != NULL,
"NULL VSA");
204 prt_assert(vdp != NULL,
"NULL VDP");
209 vdp->tuple, vsa->global_store,
210 vsa->num_cores, vsa->num_accelerators);
214 if (mapping.location == PRT_LOCATION_HOST) {
216 node_rank = mapping.rank / vsa->num_threads;
217 int thread_rank = mapping.rank % vsa->num_threads;
220 if (node_rank != vsa->node_rank) {
228 prt_assert(node != NULL,
"icl_list_append failed");
229 vdp->thread = vsa->thread[thread_rank];
234 node_rank = mapping.rank / vsa->num_devices;
235 int device_rank = mapping.rank % vsa->num_devices;
238 if (node_rank != vsa->node_rank) {
245 prt_assert(node != NULL,
"icl_list_append failed");
246 vdp->device = vsa->device[device_rank];
250 error = cudaSetDevice(device_rank);
251 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
252 error = cudaStreamCreateWithFlags(&vdp->stream, cudaStreamNonBlocking);
253 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
256 vdp->location = mapping.location;
257 vdp->global_store = vsa->global_store;
261 for (i = 0; i < vdp->num_inputs; i++)
262 if (vdp->input[i] != NULL)
263 vdp->input[i]->proxy = vsa->proxy;
266 for (i = 0; i < vdp->num_outputs; i++)
267 if (vdp->output[i] != NULL)
268 vdp->output[i]->proxy = vsa->proxy;
272 vsa->vdps_hash, (
void*)vdp->tuple, (
void*)vdp);
273 prt_assert(entry != NULL,
"icl_hash_insert failed");
294 for (i = 0; i < vdp->num_inputs; i++) {
296 if (channel != NULL) {
303 if (src_vdp != NULL) {
305 int *src_vdp_dst_tuple =
306 src_vdp->output[channel->src_slot]->dst_tuple;
308 "VDP channel tuple mismatch");
311 src_vdp->output[channel->src_slot] = channel;
313 channel->src_vdp = src_vdp;
318 for (i = 0; i < vdp->num_outputs; i++) {
320 if (channel != NULL) {
327 if (dst_vdp != NULL) {
329 int *dst_vdp_src_tuple =
330 dst_vdp->input[channel->dst_slot]->src_tuple;
332 "VDP channel tuple mismatch");
334 vdp->output[i] = dst_vdp->input[channel->dst_slot];
336 vdp->output[i]->src_vdp = vdp;
355 for (i = 0; i < vdp->num_inputs; i++) {
357 if (channel != NULL) {
359 channel->dst_node = vsa->node_rank;
365 channel->src_tuple, vsa->global_store,
366 vsa->num_cores, vsa->num_accelerators);
367 if (src_mapping.location == PRT_LOCATION_HOST)
368 src_node = src_mapping.rank / vsa->num_threads;
370 src_node = src_mapping.rank / vsa->num_devices;
371 channel->src_node = src_node;
374 if (src_node != vsa->node_rank) {
376 if (vsa->channel_lists[src_node] == NULL) {
378 prt_assert(vsa->channel_lists[src_node] != NULL,
379 "icl_list_new failed");
384 prt_assert(node != NULL,
"icl_list_isort failed");
389 for (i = 0; i < vdp->num_outputs; i++) {
391 if (channel != NULL) {
393 channel->src_node = vsa->node_rank;
399 channel->dst_tuple, vsa->global_store,
400 vsa->num_cores, vsa->num_accelerators);
401 if (dst_mapping.location == PRT_LOCATION_HOST)
402 dst_node = dst_mapping.rank / vsa->num_threads;
404 dst_node = dst_mapping.rank / vsa->num_devices;
405 channel->dst_node = dst_node;
408 if (dst_node != vsa->node_rank) {
410 if (vsa->channel_lists[dst_node] == NULL) {
412 prt_assert(vsa->channel_lists[dst_node] != NULL,
413 "icl_list_new failed");
418 prt_assert(node != NULL,
"icl_list_isort failed");
435 for (i = 0; i < vsa->num_nodes; i++) {
436 if (vsa->channel_lists[i] != NULL) {
440 icl_list_foreach(vsa->channel_lists[i], node) {
442 channel->tag = tag++;
445 if (channel->dst_node == vsa->node_rank)
446 node_tag = prt_tuple_new2(channel->src_node, channel->tag);
448 node_tag = prt_tuple_new2(channel->dst_node, channel->tag);
451 vsa->proxy->tags_hash, (
void*)node_tag, (
void*)channel);
452 prt_assert(entry != NULL,
"icl_hash_insert failed");
456 prt_assert(status == 0,
"icl_list_destroy failed");
460 free(vsa->channel_lists);
473 for (j = 0; j < vsa->num_devices; j++) {
477 icl_list_foreach(device->vdps, vdp_node) {
481 for (i = 0; i < vdp->num_inputs; i++) {
484 if (channel != NULL) {
489 if (channel->src_vdp == NULL ||
490 channel->src_vdp->location == PRT_LOCATION_HOST ||
491 channel->src_vdp->device->rank !=
492 channel->dst_vdp->device->rank) {
495 error = cudaSetDevice(device->rank);
496 prt_assert(error == cudaSuccess,
497 cudaGetErrorString(error));
498 error = cudaStreamCreateWithFlags(
499 &channel->in_stream, cudaStreamNonBlocking);
500 prt_assert(error == cudaSuccess,
501 cudaGetErrorString(error));
506 for (i = 0; i < vdp->num_outputs; i++) {
509 if (channel != NULL) {
514 if (channel->dst_vdp == NULL ||
515 channel->dst_vdp->location == PRT_LOCATION_HOST ||
516 channel->dst_vdp->device->rank !=
517 channel->src_vdp->device->rank) {
520 error = cudaSetDevice(device->rank);
521 prt_assert(error == cudaSuccess,
522 cudaGetErrorString(error));
523 error = cudaStreamCreateWithFlags(
524 &channel->out_stream, cudaStreamNonBlocking);
525 prt_assert(error == cudaSuccess,
526 cudaGetErrorString(error));
549 prt_assert(vsa != NULL,
"NULL VSA");
563 i = vsa->proxy == NULL;
564 for (; i < vsa->num_threads; i++) {
565 status = pthread_create(
566 &vsa->thread[i]->id, &vsa->thread_attr,
568 prt_assert(status == 0,
"pthread_create failed");
572 if (vsa->proxy == NULL) {
574 vsa->thread[0]->id = pthread_self();
576 time = vsa->thread[0]->time;
585 i = vsa->proxy == NULL;
586 for (; i < vsa->num_threads; i++) {
587 status = pthread_join(vsa->thread[i]->id, NULL);
588 prt_assert(status == 0,
"pthread_join failed");
591 if (vsa->config->svg_tracing == PRT_SVG_TRACING_ON)
610 prt_assert(vsa != NULL,
"NULL VSA");
614 case PRT_VDP_SCHEDULING:
616 case PRT_VDP_SCHEDULING_LAZY:
617 case PRT_VDP_SCHEDULING_AGGRESSIVE:
618 vsa->config->vdp_scheduling = value;
621 prt_error(
"invalid value PRT_VDP_SCHEDULING");
625 case PRT_SVG_TRACING:
627 case PRT_SVG_TRACING_ON:
628 case PRT_SVG_TRACING_OFF:
629 vsa->config->svg_tracing = value;
632 prt_error(
"invalid value for PRT_SVG_TRACING");
637 prt_error(
"invalid parameter");
659 prt_assert(vsa != NULL,
"NULL VSA");
662 vsa->thread_warmup_func = func;
682 prt_assert(vsa != NULL,
"NULL VSA");
685 vsa->device_warmup_func = func;
697 if (vsa->device_warmup_func == NULL)
702 for (dev = 0; dev < vsa->num_devices; dev++) {
703 cudaError_t error = cudaSetDevice(dev);
704 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
705 vsa->device_warmup_func();
708 for (dev = 0; dev < vsa->num_devices; dev++) {
709 cudaError_t error = cudaSetDevice(dev);
710 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
711 error = cudaDeviceSynchronize();
712 prt_assert(error == cudaSuccess, cudaGetErrorString(error));