/* Extracted from: html/doxygen/prt__vsa_8c_source.html */

#include "prt_vsa.h"


// Tuple comparison routine defined outside this file.
// Passed to icl_hash_create() when the VDP hash is built, and used directly
// to check that merged channels agree on their endpoint tuples.
extern int prt_tuple_equal(void *tuple_a, void *tuple_b);

// Tuple hashing routine defined outside this file.
// Passed to icl_hash_create() when the VDP hash is built.
extern unsigned int prt_tuple_hash(void *tuple);


prt_vsa_t* prt_vsa_new(

    int num_threads, int num_devices, void *global_store,

    struct prt_mapping_s (*vdp_mapping)(int*, void*, int, int))

{

    // Check input parameters.

    prt_assert(num_threads >= 0, "negative number of threads");

    prt_assert(num_devices >= 0, "negative number of devices");

    prt_assert(vdp_mapping != NULL, "NULL mapping function");


    // Allocate the VSA.

    prt_vsa_t *vsa = (prt_vsa_t*)malloc(sizeof(prt_vsa_t));

    prt_assert(vsa != NULL, "malloc failed");


    // Check for MPI.

    int initialized;

    int retval = MPI_Initialized(&initialized);

    prt_assert(retval == MPI_SUCCESS, "MPI_Initialized failed");

    if (initialized) {

        MPI_Comm_rank(MPI_COMM_WORLD, &vsa->node_rank);

        MPI_Comm_size(MPI_COMM_WORLD, &vsa->num_nodes);

    }

    else {

        vsa->num_nodes = 1;

        vsa->node_rank = 0;

    }

    // Init the VSA.

    vsa->num_threads = num_threads;

    vsa->num_cores = vsa->num_nodes*vsa->num_threads;

    vsa->thread_warmup_func = NULL;


    vsa->num_devices = num_devices;

    vsa->num_accelerators = vsa->num_nodes*vsa->num_devices;

    vsa->device_warmup_func = NULL;


    vsa->vdp_mapping = vdp_mapping;

    vsa->proxy = NULL;

    vsa->config = prt_config_new();

    vsa->global_store = global_store;


    // Init proxy if required.

    vsa->concurrency = num_threads;

    if (vsa->num_nodes > 1 || vsa->num_devices > 0) {

        vsa->proxy = prt_proxy_new(num_threads+num_devices);

        vsa->proxy->vsa = vsa;

        vsa->concurrency++;

    }

    // Init pthreads.

    pthread_setconcurrency(vsa->concurrency);

    pthread_attr_init(&vsa->thread_attr);

    pthread_attr_setscope(&vsa->thread_attr, PTHREAD_SCOPE_SYSTEM);


    int i;

    // Initialize threads.

    vsa->thread = (prt_thread_t**)malloc(vsa->num_threads*sizeof(prt_thread_t*));

    prt_assert(vsa->thread != NULL, "malloc failed");

    for (i = 0; i < vsa->num_threads; i++) {

        vsa->thread[i] = prt_thread_new(i, vsa->node_rank*vsa->num_threads+i, i);

        vsa->thread[i]->vsa = vsa;

    }


    // Initialize devices.

    vsa->device = (prt_device_t**)malloc(vsa->num_devices*sizeof(prt_device_t));

    prt_assert(vsa->device != NULL, "malloc failed");

    for (i = 0; i < vsa->num_devices; i++) {

        int agent = vsa->num_threads+i;

        vsa->device[i] =

            prt_device_new(i, vsa->node_rank*vsa->num_devices+i, agent);

        vsa->device[i]->vsa = vsa;

    }

    // Initialize device memory allocators.

    // Allocating 80% of available GPU memory.

    vsa->devmem = (gpu_malloc_t**)malloc(vsa->num_devices*sizeof(gpu_malloc_t*));

    prt_assert(vsa->devmem != NULL, "malloc failed");

    for (i = 0; i < vsa->num_devices; i++) {

        size_t mem_free;

        size_t mem_total;

        cudaError_t error;

        error = cudaSetDevice(i);

        prt_assert(error == cudaSuccess, cudaGetErrorString(error));

        error = cudaMemGetInfo(&mem_free, &mem_total);

        prt_assert(error == cudaSuccess, cudaGetErrorString(error));

        int num_segments = mem_free / PRT_VSA_GPU_ALLOC_UNIT_SIZE;

        num_segments = num_segments * 4 / 5;

        prt_assert(num_segments > 0, "zero segments available");

        vsa->devmem[i] = gpu_malloc_init(

            num_segments, PRT_VSA_GPU_ALLOC_UNIT_SIZE);

        prt_assert(vsa->devmem[i] != NULL, "gpu_malloc_init failed");

    }


    // Initialize thread barrier.

    pthread_barrier_init(&vsa->barrier, NULL, vsa->concurrency);


    // Initialize the VDPs hash.

    int nbuckets = PRT_VSA_MAX_VDPS_PER_NODE;

    vsa->vdps_hash = icl_hash_create(nbuckets, prt_tuple_hash, prt_tuple_equal);


    // Allocate the array of channel lists.

    vsa->channel_lists =

        (icl_list_t**)calloc(vsa->num_nodes, sizeof(icl_list_t*));

    prt_assert(vsa->channel_lists != NULL, "malloc failed");


    // Return the VSA.

    return vsa;

}


// Releases a VSA together with the resources it holds: the VDP hash
// (VDPs are released through prt_vdp_delete), the configuration, the proxy
// if one exists, the thread barrier, all threads, all devices, the device
// memory allocators, the thread attributes, and finally the VSA itself.
//
// vsa  the VSA to destroy (must not be NULL)
void prt_vsa_delete(prt_vsa_t *vsa)
{
    prt_assert(vsa != NULL, "NULL VSA");

    // The hash releases every stored VDP; keys are left alone (NULL).
    icl_hash_destroy(vsa->vdps_hash, NULL, (void(*)(void*))prt_vdp_delete);

    prt_config_delete(vsa->config);

    // A proxy exists only for multi-node or device-enabled runs.
    if (vsa->proxy != NULL) {
        prt_proxy_delete(vsa->proxy);
    }

    pthread_barrier_destroy(&vsa->barrier);

    // Threads first, then the array that held them.
    for (int t = 0; t < vsa->num_threads; t++) {
        prt_thread_delete(vsa->thread[t]);
    }
    free(vsa->thread);

    // Devices, then the array that held them.
    for (int d = 0; d < vsa->num_devices; d++) {
        prt_device_delete(vsa->device[d]);
    }
    free(vsa->device);

    // Each allocator is finalized with its own device selected.
    for (int d = 0; d < vsa->num_devices; d++) {
        cudaError_t error = cudaSetDevice(d);
        prt_assert(error == cudaSuccess, cudaGetErrorString(error));

        int retval = gpu_malloc_fini(vsa->devmem[d]);
        prt_assert(retval == 0, "gpu_malloc_fini failed");
    }
    free(vsa->devmem);

    pthread_attr_destroy(&vsa->thread_attr);

    free(vsa);
}


// Inserts a VDP into the VSA.
//
// vsa  the VSA receiving the VDP (must not be NULL)
// vdp  the VDP to insert (must not be NULL)
//
// The VSA's mapping function decides where the VDP lives. A VDP mapped to
// another node is destroyed with prt_vdp_annihilate() and nothing else
// happens. Otherwise the VDP is appended to the owning thread's or device's
// list (device VDPs also get a non-blocking CUDA stream), its channels are
// given the VSA's proxy, it is stored in the VDP hash, its intra-node
// channels are merged and its inter-node channels are tracked for tagging.
void prt_vsa_vdp_insert(prt_vsa_t *vsa, prt_vdp_t *vdp)
{
    // Check input parameters.
    prt_assert(vsa != NULL, "NULL VSA");
    prt_assert(vdp != NULL, "NULL VDP");

    // Find the mapping.
    prt_mapping_t mapping =
        vsa->vdp_mapping(
            vdp->tuple, vsa->global_store,
            vsa->num_cores, vsa->num_accelerators);

    // A negative rank would produce a negative index into the thread or
    // device array below.
    prt_assert(mapping.rank >= 0, "negative mapping rank");

    int node_rank;
    // IF host VDP.
    if (mapping.location == PRT_LOCATION_HOST) {
        // Guard the division: a VSA may be created with zero threads.
        prt_assert(vsa->num_threads > 0, "host VDP in a VSA with no threads");

        // Compute node rank and thread rank;
        node_rank = mapping.rank / vsa->num_threads;
        int thread_rank = mapping.rank % vsa->num_threads;

        // IF VDP not in this node.
        if (node_rank != vsa->node_rank) {
            // Destroy along with all channels and return.
            prt_vdp_annihilate(vdp);
            return;
        }

        // Insert in the thread's list of VDPs.
        icl_list_t *node = icl_list_append(vsa->thread[thread_rank]->vdps, vdp);
        prt_assert(node != NULL, "icl_list_append failed");
        vdp->thread = vsa->thread[thread_rank];
    }
    // ELSE IF device VDP.
    else {
        // Guard the division: a VSA may be created with zero devices.
        prt_assert(vsa->num_devices > 0, "device VDP in a VSA with no devices");

        // Compute node rank and device rank.
        node_rank = mapping.rank / vsa->num_devices;
        int device_rank = mapping.rank % vsa->num_devices;

        // IF VDP not in this node.
        if (node_rank != vsa->node_rank) {
            // Destroy along with all channels and return.
            prt_vdp_annihilate(vdp);
            return;
        }

        // Insert in the device's list of VDPs.
        icl_list_t *node = icl_list_append(vsa->device[device_rank]->vdps, vdp);
        prt_assert(node != NULL, "icl_list_append failed");
        vdp->device = vsa->device[device_rank];

        cudaError_t error;
        // Create the VDP's stream.
        error = cudaSetDevice(device_rank);
        prt_assert(error == cudaSuccess, cudaGetErrorString(error));
        error = cudaStreamCreateWithFlags(&vdp->stream, cudaStreamNonBlocking);
        prt_assert(error == cudaSuccess, cudaGetErrorString(error));
    }

    vdp->vsa = vsa;
    vdp->location = mapping.location;
    vdp->global_store = vsa->global_store;

    int i;
    // Provide proxy to the input channels.
    for (i = 0; i < vdp->num_inputs; i++)
        if (vdp->input[i] != NULL)
            vdp->input[i]->proxy = vsa->proxy;

    // Provide proxy to the output channels.
    for (i = 0; i < vdp->num_outputs; i++)
        if (vdp->output[i] != NULL)
            vdp->output[i]->proxy = vsa->proxy;

    // Insert in the VSA's VDP hash.
    icl_entry_t *entry = icl_hash_insert(
        vsa->vdps_hash, (void*)vdp->tuple, (void*)vdp);
    prt_assert(entry != NULL, "icl_hash_insert failed");

    // Merge intra-node channels.
    prt_vsa_vdp_merge_channels(vsa, vdp);

    // Track tags for inter-node communication.
    prt_vsa_vdp_track_tags(vsa, vdp);
}


// Merges the channels of a newly inserted VDP with the matching channels of
// VDPs already stored in the VSA's hash, so that both ends share one channel
// object.
//
// vsa  the VSA holding the VDP hash and the proxy
// vdp  the VDP whose channels are merged
//
// For an input channel whose source VDP is already present, the source's
// old output channel is deleted and replaced by this VDP's channel.
// For an output channel whose destination VDP is already present, this
// VDP's channel is deleted and replaced by the destination's input channel.
// Every non-NULL channel is also reported to prt_proxy_max_channel_size().
void prt_vsa_vdp_merge_channels(prt_vsa_t *vsa, prt_vdp_t *vdp)
{
    int slot;

    // Inputs: the producer, if already inserted, adopts our channel.
    for (slot = 0; slot < vdp->num_inputs; slot++) {
        prt_channel_t *in = vdp->input[slot];
        if (in == NULL)
            continue;

        prt_proxy_max_channel_size(vsa->proxy, in);

        prt_vdp_t *producer =
            icl_hash_find(vsa->vdps_hash, (void*)in->src_tuple);
        if (producer == NULL)
            continue;

        // The producer's channel in that slot must point back at this VDP.
        prt_channel_t *stale = producer->output[in->src_slot];
        prt_assert(prt_tuple_equal(stale->dst_tuple, vdp->tuple),
            "VDP channel tuple mismatch");

        // Drop the producer's copy and hook up ours instead.
        prt_channel_delete(stale);
        producer->output[in->src_slot] = in;
        in->src_vdp = producer;
    }

    // Outputs: we adopt the consumer's channel, if it is already inserted.
    for (slot = 0; slot < vdp->num_outputs; slot++) {
        prt_channel_t *out = vdp->output[slot];
        if (out == NULL)
            continue;

        prt_proxy_max_channel_size(vsa->proxy, out);

        prt_vdp_t *consumer =
            icl_hash_find(vsa->vdps_hash, (void*)out->dst_tuple);
        if (consumer == NULL)
            continue;

        // The consumer's channel in that slot must originate at this VDP.
        prt_channel_t *kept = consumer->input[out->dst_slot];
        prt_assert(prt_tuple_equal(kept->src_tuple, vdp->tuple),
            "VDP channel tuple mismatch");

        // Use the consumer's copy, record ourselves as its source,
        // and only then drop our own copy.
        vdp->output[slot] = kept;
        kept->src_vdp = vdp;
        prt_channel_delete(out);
    }
}


void prt_vsa_vdp_track_tags(prt_vsa_t *vsa, prt_vdp_t *vdp)

{

    int i;

    // FOR each input channel.

    for (i = 0; i < vdp->num_inputs; i++) {

        prt_channel_t *channel = vdp->input[i];

        if (channel != NULL) {

            // Assing destination node.

            channel->dst_node = vsa->node_rank;


            int src_node;

            // Find source node.

            prt_mapping_t src_mapping =

                vsa->vdp_mapping(

                    channel->src_tuple, vsa->global_store,

                    vsa->num_cores, vsa->num_accelerators);

            if (src_mapping.location == PRT_LOCATION_HOST)

                src_node = src_mapping.rank / vsa->num_threads;

            else

                src_node = src_mapping.rank / vsa->num_devices;

            channel->src_node = src_node;


            // IF another node is the source.

            if (src_node != vsa->node_rank) {

                // Create the list if empty.

                if (vsa->channel_lists[src_node] == NULL) {

                    vsa->channel_lists[src_node] = icl_list_new();

                    prt_assert(vsa->channel_lists[src_node] != NULL,

                        "icl_list_new failed");

                }

                // Add the channel to the list.

                icl_list_t *node = icl_list_isort(

                    vsa->channel_lists[src_node], channel, prt_channel_compare);

                prt_assert(node != NULL, "icl_list_isort failed");

            }

        }

    }

    // FOR each output channel.

    for (i = 0; i < vdp->num_outputs; i++) {

        prt_channel_t *channel = vdp->output[i];

        if (channel != NULL) {

            // Assing source node.

            channel->src_node = vsa->node_rank;


            int dst_node;

            // Find destination node.

            prt_mapping_t dst_mapping =

                vsa->vdp_mapping(

                    channel->dst_tuple, vsa->global_store,

                    vsa->num_cores, vsa->num_accelerators);

            if (dst_mapping.location == PRT_LOCATION_HOST)

                dst_node = dst_mapping.rank / vsa->num_threads;

            else

                dst_node = dst_mapping.rank / vsa->num_devices;

            channel->dst_node = dst_node;


            // IF another node is the destination.

            if (dst_node != vsa->node_rank) {

                // Create the list if empty.

                if (vsa->channel_lists[dst_node] == NULL) {

                    vsa->channel_lists[dst_node] = icl_list_new();

                    prt_assert(vsa->channel_lists[dst_node] != NULL,

                        "icl_list_new failed");

                }

                // Add the channel to the list.

                icl_list_t *node = icl_list_isort(

                    vsa->channel_lists[dst_node], channel, prt_channel_compare);

                prt_assert(node != NULL, "icl_list_isort failed");

            }

        }

    }

}


// Assigns consecutive tags to the inter-node channels collected in the
// VSA's per-node channel lists and registers each channel in the proxy's
// tag hash under a (remote node, tag) key.
//
// vsa  the VSA whose channel lists are consumed
//
// The lists and the array holding them are released here. The array pointer
// is cleared afterwards, so calling this function again is a no-op rather
// than a walk over freed memory.
void prt_vsa_channel_tags(prt_vsa_t *vsa)
{
    // Quick return: the lists were already consumed by an earlier call.
    if (vsa->channel_lists == NULL)
        return;

    int i;
    for (i = 0; i < vsa->num_nodes; i++) {
        if (vsa->channel_lists[i] != NULL) {
            int tag = 0;
            icl_list_t *node;
            // Assign consecutive tags to the elements.
            icl_list_foreach(vsa->channel_lists[i], node) {
                prt_channel_t *channel = (prt_channel_t*)node->data;
                channel->tag = tag++;

                // The key names the node at the other end of the channel.
                int *node_tag;
                if (channel->dst_node == vsa->node_rank)
                    node_tag = prt_tuple_new2(channel->src_node, channel->tag);
                else
                    node_tag = prt_tuple_new2(channel->dst_node, channel->tag);

                icl_entry_t *entry = icl_hash_insert(
                    vsa->proxy->tags_hash, (void*)node_tag, (void*)channel);
                prt_assert(entry != NULL, "icl_hash_insert failed");
            }
            // Destroy the list (the channels themselves are kept).
            int status = icl_list_destroy(vsa->channel_lists[i], NULL);
            prt_assert(status == 0, "icl_list_destroy failed");
            vsa->channel_lists[i] = NULL;
        }
    }
    // Free the array of lists and forget the stale pointer.
    free(vsa->channel_lists);
    vsa->channel_lists = NULL;
}


// Creates the CUDA streams used to move data in and out of device VDPs.
//
// vsa  the VSA whose devices are scanned
//
// For every VDP on every device, an input channel gets an in_stream and an
// output channel gets an out_stream whenever the VDP at the other end is
// unknown (NULL), lives on the host, or lives on a device of a different
// rank. Streams are created non-blocking, with the owning device selected.
void prt_vsa_channel_streams(prt_vsa_t *vsa)
{
    for (int d = 0; d < vsa->num_devices; d++) {
        prt_device_t *device = vsa->device[d];
        icl_list_t *item;

        icl_list_foreach(device->vdps, item) {
            prt_vdp_t *vdp = (prt_vdp_t*)item->data;
            int slot;

            // Inputs the device has to pull.
            for (slot = 0; slot < vdp->num_inputs; slot++) {
                prt_channel_t *in = vdp->input[slot];
                if (in == NULL)
                    continue;

                // Short-circuit order matters: the device ranks are only
                // compared when the source is a known, non-host VDP.
                int must_pull =
                    in->src_vdp == NULL ||
                    in->src_vdp->location == PRT_LOCATION_HOST ||
                    in->src_vdp->device->rank != in->dst_vdp->device->rank;
                if (!must_pull)
                    continue;

                cudaError_t error = cudaSetDevice(device->rank);
                prt_assert(error == cudaSuccess,
                    cudaGetErrorString(error));
                error = cudaStreamCreateWithFlags(
                    &in->in_stream, cudaStreamNonBlocking);
                prt_assert(error == cudaSuccess,
                    cudaGetErrorString(error));
            }

            // Outputs the device has to push.
            for (slot = 0; slot < vdp->num_outputs; slot++) {
                prt_channel_t *out = vdp->output[slot];
                if (out == NULL)
                    continue;

                // Same short-circuit order as for the inputs.
                int must_push =
                    out->dst_vdp == NULL ||
                    out->dst_vdp->location == PRT_LOCATION_HOST ||
                    out->dst_vdp->device->rank != out->src_vdp->device->rank;
                if (!must_push)
                    continue;

                cudaError_t error = cudaSetDevice(device->rank);
                prt_assert(error == cudaSuccess,
                    cudaGetErrorString(error));
                error = cudaStreamCreateWithFlags(
                    &out->out_stream, cudaStreamNonBlocking);
                prt_assert(error == cudaSuccess,
                    cudaGetErrorString(error));
            }
        }
    }
}


// Runs the VSA to completion.
//
// vsa  the VSA to run (must not be NULL)
//
// Channel tags are assigned and channel streams created first. Worker
// threads are then launched with prt_thread_run(). Without a proxy the
// calling thread itself acts as thread zero, so only threads 1..n-1 are
// spawned; with a proxy all threads are spawned, the device warmup function
// is invoked, and the calling thread acts as the proxy. All spawned threads
// are joined before returning.
//
// Returns the time reported by thread zero (no proxy) or by
// prt_proxy_run() (with proxy).
double prt_vsa_run(prt_vsa_t *vsa)
{
    // Check input parameters.
    prt_assert(vsa != NULL, "NULL VSA");

    // Without a proxy the caller runs as thread zero, which must exist.
    prt_assert(vsa->proxy != NULL || vsa->num_threads > 0,
        "no threads to run the VSA");

    // Assign channel tags.
    prt_vsa_channel_tags(vsa);

    // Create channel streams.
    prt_vsa_channel_streams(vsa);

    // Initialize SVG tracing.
    svg_trace_init(vsa->concurrency, vsa->num_devices);

    // Index of the first thread that gets its own pthread:
    // 1 when the caller doubles as thread zero, 0 otherwise.
    const int first_spawned = (vsa->proxy == NULL) ? 1 : 0;

    int i;
    int status;
    // Launch threads.
    for (i = first_spawned; i < vsa->num_threads; i++) {
        status = pthread_create(
            &vsa->thread[i]->id, &vsa->thread_attr,
            prt_thread_run, vsa->thread[i]);
        prt_assert(status == 0, "pthread_create failed");
    }

    double time;
    // IF no proxy.
    if (vsa->proxy == NULL) {
        // Serve as thread zero.
        vsa->thread[0]->id = pthread_self();
        prt_thread_run((void*)vsa->thread[0]);
        time = vsa->thread[0]->time;
    }
    else {
        // Call devices warmup function.
        prt_vsa_devices_warmup(vsa);
        // Serve as the proxy.
        time = prt_proxy_run(vsa->proxy);
    }

    // Join threads.
    for (i = first_spawned; i < vsa->num_threads; i++) {
        status = pthread_join(vsa->thread[i]->id, NULL);
        prt_assert(status == 0, "pthread_join failed");
    }

    // Finish tracing.
    if (vsa->config->svg_tracing == PRT_SVG_TRACING_ON)
        svg_trace_finish(vsa->concurrency, vsa->num_devices);

    return time;
}


// Sets a configuration parameter of the VSA.
//
// vsa    the VSA to configure (must not be NULL)
// param  the parameter to set: PRT_VDP_SCHEDULING or PRT_SVG_TRACING
// value  the new value; must be one of the values accepted for the
//        parameter (LAZY/AGGRESSIVE for scheduling, ON/OFF for tracing)
//
// An unknown parameter, or a value not valid for the parameter, is reported
// through prt_error() and leaves the configuration unchanged.
void prt_vsa_config_set(
    prt_vsa_t *vsa, prt_config_param_t param, prt_config_value_t value)
{
    // Check input parameters.
    prt_assert(vsa != NULL, "NULL VSA");

    // Set the value for the parameter.
    switch (param) {
        case PRT_VDP_SCHEDULING:
            switch (value) {
                case PRT_VDP_SCHEDULING_LAZY:
                case PRT_VDP_SCHEDULING_AGGRESSIVE:
                    vsa->config->vdp_scheduling = value;
                    break;
                default:
                    prt_error("invalid value for PRT_VDP_SCHEDULING");
                    break;
            }
            break;
        case PRT_SVG_TRACING:
            switch (value) {
                case PRT_SVG_TRACING_ON:
                case PRT_SVG_TRACING_OFF:
                    vsa->config->svg_tracing = value;
                    break;
                default:
                    prt_error("invalid value for PRT_SVG_TRACING");
                    break;
            }
            break;
        default:
            prt_error("invalid parameter");
            break;
    }
}


// Registers the warmup function for host threads.
//
// vsa   the VSA to update (must not be NULL)
// func  the function to store; it replaces any previously stored one
void prt_vsa_thread_warmup_func_set(prt_vsa_t *vsa, void (*func)())
{
    prt_assert(vsa != NULL, "NULL VSA");

    // Only the pointer is recorded here; nothing is called.
    vsa->thread_warmup_func = func;
}


// Registers the warmup function for devices.
//
// vsa   the VSA to update (must not be NULL)
// func  the function to store; it replaces any previously stored one
void prt_vsa_device_warmup_func_set(prt_vsa_t *vsa, void (*func)())
{
    prt_assert(vsa != NULL, "NULL VSA");

    // Only the pointer is recorded here; nothing is called.
    vsa->device_warmup_func = func;
}


// Invokes the registered device warmup function once per device and then
// waits for every device to finish.
//
// vsa  the VSA whose devices are warmed up
//
// Does nothing when no device warmup function has been registered.
void prt_vsa_devices_warmup(prt_vsa_t *vsa)
{
    if (vsa->device_warmup_func == NULL) {
        return;
    }

    cudaError_t error;

    // First pass: call the warmup function with each device selected in turn.
    for (int d = 0; d < vsa->num_devices; d++) {
        error = cudaSetDevice(d);
        prt_assert(error == cudaSuccess, cudaGetErrorString(error));
        vsa->device_warmup_func();
    }

    // Second pass: synchronize each device, only after all calls were made.
    for (int d = 0; d < vsa->num_devices; d++) {
        error = cudaSetDevice(d);
        prt_assert(error == cudaSuccess, cudaGetErrorString(error));
        error = cudaDeviceSynchronize();
        prt_assert(error == cudaSuccess, cudaGetErrorString(error));
    }
}