/**
 * Copyright (c) 2016      The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * AUTHOR: George Bosilca
 */ 
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "header.h"

/**
 * Fill a contiguous nb x mb matrix with pseudo-random values.
 *
 * Each element is obtained from rand(), scaled by RAND_MAX and shifted by
 * -0.5, so the generated values lie in [-0.5, 0.5] before the conversion
 * to TYPE. The values depend on the current srand() seed: callers that need
 * reproducible (or rank-specific) data must seed the generator beforehand.
 *
 * @param matrix  destination buffer, must hold at least nb * mb elements
 * @param nb      extent of the outer dimension
 * @param mb      extent of the inner dimension
 * @return 0 on success (including the case where nothing has to be written
 *         because nb or mb is not positive), -1 if matrix is NULL.
 */
int init_matrix(TYPE* matrix, int nb, int mb)
{
    size_t k, count;

    if (NULL == matrix) {
        return -1;
    }
    if (nb <= 0 || mb <= 0) {
        return 0;  /* empty matrix: nothing to initialize */
    }

    /* Compute the element count in size_t so that large local matrices do
     * not overflow an int index. The elements are visited in storage order,
     * which keeps the rand() sequence identical to a nested (i, j) loop. */
    count = (size_t)nb * (size_t)mb;
    for (k = 0; k < count; k++) {
        matrix[k] = (TYPE)(((double) rand()) / ((double) RAND_MAX) - 0.5);
    }
    return 0;
}

/**
 * Driver of the distributed Jacobi example.
 *
 * Command line options:
 *   -p #   number of processes per row (required, must be >= 1)
 *   -N #   first dimension of the matrix (required)
 *   -M #   second dimension of the matrix (defaults to N)
 *
 * The processes are arranged as P rows of Q = size / P processes, and each
 * process allocates a local (N/P + 2) x (M/Q + 2) matrix filled with
 * rank-dependent pseudo-random values before handing it to jacobi_gpu().
 * NOTE(review): the two extra rows/columns are presumably ghost cells used
 * by the Jacobi kernels -- confirm against jacobi_gpu().
 *
 * @return 0 on success, EXIT_FAILURE if the setup or the solver failed.
 */
int main( int argc, char* argv[] )
{
    int i, rc = 0, size, rank, N = -1, M = -1, NB, MB, P = -1, Q;
    int local_rank = 0, num_devices = 0, device = -1;
    /* volatile: the flag is meant to be flipped from an attached debugger,
     * so the compiler must reload it on every iteration. */
    volatile int do_sleep = 0;
    unsigned int seed;
    size_t bytes;
    TYPE *om = NULL, *som = NULL;
    char* lrank;
    cudaError_t cerr;

    /* get the problem size from the command arguments */
    for( i = 1; i < argc; i++ ) {
        int* target = NULL;

        if( !strcmp(argv[i], "-p") ) {
            target = &P;
        } else if( !strcmp(argv[i], "-N") ) {
            target = &N;
        } else if( !strcmp(argv[i], "-M") ) {
            target = &M;
        }
        if( NULL == target ) {
            continue;  /* unknown arguments are silently ignored */
        }
        /* every recognized option needs a value: never read past argv */
        if( (i + 1) >= argc ) {
            printf("Missing value for option %s\n", argv[i]);
            exit(-1);
        }
        i++;
        *target = atoi(argv[i]);
    }
    if( M == -1 ) M = N;
    if( P < 1 ) {
        printf("Missing number of processes per row (-p #)\n");
        exit(-1);
    }
    if( N == -1 ) {
        printf("Missing the first dimension of th matrix (-N #)\n");
        exit(-1);
    }

    /* Interaction with the CUDA aware MPI library. In Open MPI CUDA
     * must be initialized before the MPI_Init in order to enable CUDA
     * support in the library.
     * In the case multiple GPUs are available per node and we have
     * multiple processes per node, let's distribute the processes
     * across all GPUs.
     */
    lrank = getenv("OMPI_COMM_WORLD_LOCAL_RANK");
    if( NULL != lrank ) {
        local_rank = (int)strtol(lrank, NULL, 10);
        if( local_rank < 0 ) {
            local_rank = 0;
        }
    }  /* else: not started by Open MPI, keep the default local rank 0 */

    cerr = cudaGetDeviceCount(&num_devices);
    if( cudaSuccess != cerr ) {
        printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(cerr));
        num_devices = 0;
    }
    if( num_devices < 1 ) {
        printf("No CUDA devices on this node. Disable CUDA!\n");
    } else {
        cerr = cudaSetDevice(local_rank % num_devices);
        if( cudaSuccess == cerr ) {
            device = local_rank % num_devices;
        } else {
            /* best effort: report and keep going, the solver will fail
             * later if it really cannot use a device */
            printf("cudaSetDevice(%d) failed: %s\n",
                   local_rank % num_devices, cudaGetErrorString(cerr));
        }
    }

    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    Q = size / P;
    if( Q < 1 ) {
        /* More processes per row requested than processes available: the
         * grid would have no column and M / Q would divide by zero. All
         * ranks take this branch, so a regular shutdown is safe. */
        if( 0 == rank ) {
            printf("Not enough processes (%d) for %d processes per row (-p #)\n",
                   size, P);
        }
        rc = -1;
        goto cleanup_and_be_gone;
    }
    NB = N / P;
    MB = M / Q;

    /**
     * Ugly hack to allow us to attach with a ssh-based debugger to the application.
     */
    while(do_sleep) {
        sleep(1);
    }

    if( device >= 0 ) {
        printf("Rank %d uses device %d\n", rank, device);
    } else {
        printf("Rank %d has no usable CUDA device\n", rank);
    }

    /* make sure we have some randomness: use unsigned arithmetic so that a
     * large rank*N*M wraps around instead of overflowing a signed int */
    seed = (unsigned int)rank * (unsigned int)N * (unsigned int)M;
    srand(seed);

    bytes = sizeof(TYPE) * (size_t)(NB+2) * (size_t)(MB+2);
    om = (TYPE*)malloc(bytes);
    som = (TYPE*)malloc(bytes);
    if( (NULL == om) || (NULL == som) ) {
        printf("Rank %d failed to allocate %lu bytes for the local matrices\n",
               rank, (unsigned long)bytes);
        free(om);
        free(som);
        /* The failure is local to this rank: abort the whole job instead
         * of leaving the other ranks blocked in the solver. */
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        return EXIT_FAILURE;
    }
    init_matrix(om, NB+2, MB+2);
    /* save a copy for the GPU computation */
    memcpy(som, om, bytes);
#if 0
    rc = jacobi_cpu( om, N, M, P, MPI_COMM_WORLD, 0 /* no epsilon */);
    if( rc < 0 ) {
        printf("The CPU Jacobi failed\n");
        goto cleanup_and_be_gone;
    }
#endif
    rc = jacobi_gpu( som, N, M, P, MPI_COMM_WORLD, 0 /* no epsilon */);
    if( rc < 0 ) {
        printf("The GPU Jacobi failed\n");
        goto cleanup_and_be_gone;
    }

 cleanup_and_be_gone:
    /* free the resources and shutdown (free(NULL) is a no-op) */
    free(om);
    free(som);

    MPI_Finalize();
    /* propagate a setup/solver failure to the launcher */
    return (rc < 0) ? EXIT_FAILURE : 0;
}
