/**
 *
 * @file psshift.c
 *
 *  PLASMA InPlaceTransformation module
 *  PLASMA is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 *  This work is the implementation of an inplace transformation 
 *  based on the GKK algorithm by Gustavson, Karlsson, Kagstrom 
 *  and its fortran implementation.
 *
 * @version 2.3.0
 * @author Mathieu Faverge
 * @date 2010-11-15
 *
 * @generated s
 *
 **/

#include <stdlib.h>
#include <sys/types.h>
#include <assert.h>
#include "common.h"
#include "primes.h"
#include "gkkleader.h"

/**
 * Distribute the cycles described by `leaders` over `nthreads` threads.
 *
 * `leaders` is a flat array of `nleaders` ints organised as triplets; this
 * helper reads entry [i+1] (used as the weight of the cycle, scaled by L) and
 * writes entry [i+2] (the owner of the cycle). `Tp` accumulates the load given
 * to each thread and must be zeroed by the caller.
 *
 * With more than one thread, each cycle goes to the thread returned by
 * minloc() (presumably the index of the smallest entry of Tp - confirm in
 * gkkleader.h) and the result is then handed to GKK_BalanceLoad(). Owner
 * values of -2 found afterwards are treated as "split" cycles by
 * plasma_psshift. With a single thread, every cycle is owned by thread 0.
 */
static void sshift_assign_cycles(int nthreads, int *Tp,
                                 int *leaders, int nleaders, int L)
{
    int k, owner;

    if (nthreads > 1) {
        for (k = 0; k < nleaders; k += 3) {
            owner = minloc(nthreads, Tp);

            Tp[owner] = Tp[owner] + leaders[k+1] * L;
            leaders[k+2] = owner;
        }
        GKK_BalanceLoad(nthreads, Tp, leaders, nleaders, L);
    }
    else {
        for (k = 0; k < nleaders; k += 3) {
            Tp[0] = Tp[0] + leaders[k+1] * L;
            leaders[k+2] = 0;
        }
    }
}

/** ****************************************************************************
 *
 * @ingroup InPlaceTransformation
 *
 *  plasma_sshift Implementation of inplace transposition
 *    based on the GKK algorithm by Gustavson, Karlsson, Kagstrom.
 *    This algorithm shifts cycles of chunks to transpose the matrix.
 *    The array A is processed as nprob consecutive, independent problems,
 *    each made of me*ne chunks of L contiguous elements.
 *
 *******************************************************************************
 *
 * @param[in] plasma
 *         Plasma context
 *
 * @param[in] m
 *         Number of rows of matrix A
 *
 * @param[in] n
 *         Number of columns of matrix A
 *
 * @param[in,out] A
 *         Array of m*n elements; m*n must be equal to nprob*me*ne*L.
 *
 * @param[in] nprob
 *         Number of parallel and independent problems
 *
 * @param[in] me
 *         Number of rows of the problem
 *
 * @param[in] ne
 *         Number of columns in the problem
 *
 * @param[in] L
 *         Size of chunk to use for transformation
 *
 * @param[in] sequence
 *         Sequence the operation belongs to; forwarded to the workers and
 *         to plasma_request_fail() on invalid input.
 *
 * @param[in] request
 *         Request associated with this call; forwarded like sequence.
 *
 *******************************************************************************
 *
 * @return PLASMA_SUCCESS when the work was issued or when there is nothing
 *         to do (me < 2, ne < 2 or nprob < 1); otherwise the value returned
 *         by plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE).
 *
 ******************************************************************************/
int plasma_sshift(plasma_context_t *plasma, int m, int n, float *A,
                  int nprob, int me, int ne, int L,
                  PLASMA_sequence *sequence, PLASMA_request *request)
{
    int *leaders = NULL;
    int ngrp, thrdbypb, thrdtot, nleaders;
    size_t pbsize;

    /* Threads available in total and per problem */
    thrdtot  = PLASMA_SIZE;
    thrdbypb = PLASMA_GRPSIZE;

    /* check input; the products are formed in 64 bits so that large
     * matrices do not overflow the comparison */
    if( ((int64_t)nprob * me * ne * L) != ((int64_t)m * n) ) {
        plasma_error(__func__, "problem size does not match matrix size");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    /* must be rejected before thrdbypb is used as a divisor below */
    if( thrdbypb < 1 ) {
        plasma_error(__func__, "number of thread per problem must be at least 1");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( thrdbypb > thrdtot ) {
        plasma_error(__func__, "number of thread per problem must be less or equal to total number of threads");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( (thrdtot % thrdbypb) != 0 ) {
        plasma_error(__func__, "number of thread per problem must divide the total number of thread");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* Number of groups of thrdbypb threads, i.e. problems handled at once */
    ngrp = thrdtot / thrdbypb;

    /* quick return */
    if( (me < 2) || (ne < 2) || (nprob < 1) ) {
        return PLASMA_SUCCESS;
    }

    /* Number of elements of one problem, kept in size_t for the offsets */
    pbsize = (size_t)me * ne * L;

    GKK_getLeaderNbr(me, ne, &nleaders, &leaders);
    /* leaders is a flat array of triplets: nleaders becomes its int count */
    nleaders *= 3;

    if (PLASMA_SCHEDULING == PLASMA_STATIC_SCHEDULING) {
        int   *Tp = NULL;
        float *Apb;
        int    i, ipb;

        /* NOTE(review): the result of plasma_shared_alloc is not checked */
        Tp = (int *)plasma_shared_alloc(plasma, thrdtot, PlasmaInteger);
        for (i=0; i<thrdtot; i++)
            Tp[i] = 0;

        /* index of the first problem left for the fine-grain phase */
        ipb = 0;

        /* First part with coarse parallelism:
         * ngrp problems per call, thrdbypb threads on each of them */
        if (nprob > ngrp) {
            ipb = (nprob / ngrp)*ngrp;

            sshift_assign_cycles(thrdbypb, Tp, leaders, nleaders, L);

            /* shift in parallel */
            for (i=0; i< (nprob/ngrp); i++) {
                Apb = A + (size_t)i * ngrp * pbsize;
                plasma_static_call_9(plasma_psshift,
                                     int,                 me,
                                     int,                 ne,
                                     int,                 L,
                                     float*,              Apb,
                                     int *,               leaders,
                                     int,                 nleaders,
                                     int,                 thrdbypb,
                                     PLASMA_sequence*,    sequence,
                                     PLASMA_request*,     request);
            }
        }

        /* Second part with fine parallelism:
         * one problem per call, shared by all thrdtot threads */
        if (ipb < nprob) {
            for (i=0; i<thrdtot; i++)
                Tp[i] = 0;

            sshift_assign_cycles(thrdtot, Tp, leaders, nleaders, L);

            /* shift in parallel */
            for (i=ipb; i<nprob; i++) {
                Apb = A + (size_t)i * pbsize;
                plasma_static_call_9(plasma_psshift,
                                     int,                 me,
                                     int,                 ne,
                                     int,                 L,
                                     float*,              Apb,
                                     int *,               leaders,
                                     int,                 nleaders,
                                     int,                 thrdtot,
                                     PLASMA_sequence*,    sequence,
                                     PLASMA_request*,     request);
            }
        }

        plasma_shared_free(plasma, Tp);
    }
    /* Dynamic scheduling: all nprob problems are handed over in one call */
    else {
        plasma_dynamic_call_9(plasma_psshift,
                              int,                 me,
                              int,                 ne,
                              int,                 L,
                              float*,              A,
                              int *,               leaders,
                              int,                 nleaders,
                              int,                 nprob,
                              PLASMA_sequence*,    sequence,
                              PLASMA_request*,     request);
    }

    free(leaders);

    return PLASMA_SUCCESS;
}

/** ****************************************************************************
 *
 * @ingroup InPlaceTransformation
 *
 * plasma_psshift shifts a batch of cycles in parallel.
 * The arguments below are not C parameters: they are unpacked from the
 * Plasma context with plasma_unpack_args_9.
 *
 *******************************************************************************
 *
 * @param[in] plasma
 *         Plasma context
 *
 * @param[in] m Number of rows of the panel to shift.
 *
 * @param[in] n Number of columns of the panel to shift
 *
 * @param[in] L Size of each chunk to shift (Usually mb)
 *
 * @param[in,out] A
 *         Base address of the data. The calling thread works on the panel
 *         starting at A + (PLASMA_RANK / thrdbypb) * m*n*L.
 *
 * @param[in] leaders
 *         Flat array of triplets describing the cycles: [i] is the chunk
 *         index where the cycle starts, [i+1] its length in chunks, [i+2]
 *         the local rank of the owning thread, or -2 when the cycle is
 *         split between all the threads of the group.
 *
 * @param[in] nleaders
 *         Number of ints stored in leaders (3 per cycle).
 *
 * @param[in] thrdbypb
 *         Number of threads working together on one panel.
 *
 ******************************************************************************/
void plasma_psshift(plasma_context_t *plasma) {
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    float *A, *Al, *W;
    int     locrnk, myrank;
    int     i, x, snix, cl, iprob;
    int     n, m, L, nleaders, thrdbypb;
    int    *leaders;
    int64_t s, q;

    plasma_unpack_args_9(m, n, L, A, leaders, nleaders, thrdbypb, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* rank inside the group, and index of the panel handled by the group */
    myrank   = PLASMA_RANK;
    locrnk   = myrank % thrdbypb;
    iprob    = myrank / thrdbypb;

    /* modulus of the permutation; formed in 64 bits to avoid int overflow */
    q  = (int64_t)m * n - 1;
    /* element offsets are computed in size_t: they may exceed INT_MAX */
    Al = A + (size_t)iprob * m * n * L;

    /* private buffer holding one chunk.
     * NOTE(review): the result of plasma_private_alloc is not checked;
     * returning early here would leave the peers stuck in the barrier below */
    W = (float*)plasma_private_alloc(plasma, L, PlasmaRealFloat);

    /* shift cycles in parallel. */
    /* each thread shifts the cycles it owns. */
    for(i=0; i<nleaders; i+=3) {
        if( leaders[i+2] == locrnk ) {
            /* cycle #i belongs to this thread, so shift it */
            memcpy(W, Al + (size_t)leaders[i] * L, L*sizeof(float));
            CORE_sshiftw(leaders[i], leaders[i+1], m, n, L, Al, W);
        }
        else if( leaders[i+2] == -2 ) {
            /* cycle #i has been split, so shift in parallel:
             * every thread takes x chunks, local rank 0 takes the rest */
            x  = leaders[i+1] / thrdbypb;
            cl = x;
            if( locrnk == 0 ) {
                cl = leaders[i+1] - x * (thrdbypb - 1);
            }
            s    = leaders[i];
            snix = (s * modpow(n, locrnk*x, q)) % q;

            /* copy the block at s*n^(thid*x) (snix) */
            memcpy(W, Al + (size_t)snix * L, L*sizeof(float));

            /* wait for peers to finish copy their block. */
            plasma_barrier(plasma);

            /* shift the linear array. */
            if( cl > 0 ) {
                CORE_sshiftw(snix, cl, m, n, L, Al, W);
            }
        }
    }

    plasma_private_free(plasma, W);
}


/**
 * Dynamic scheduling counterpart of plasma_psshift: inserts into QUARK the
 * tasks shifting the cycles of nprob consecutive panels of m*n chunks of L
 * elements stored in A.
 *
 * For each panel a CORE_foo_quark task declaring the whole panel INOUT is
 * inserted, then one QUARK_CORE_sshift task per cycle, then a second
 * CORE_foo_quark task on the whole panel. Only the first entry of each
 * triplet of `leaders` (start of the cycle) is used here; the owner entry
 * is ignored, every cycle being shifted by a single task.
 *
 * Nothing is inserted when the sequence is already in error.
 * `request` is part of the common calling convention and is not used.
 */
void plasma_psshift_quark(int m, int n, int L, float *A,
                          int *leaders, int nleaders, int nprob,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t   *plasma;
    Quark_Task_Flags    task_flags = Quark_Task_Flags_Initializer;
    float *Al;
    int     i, iprob;
    size_t  size;

    (void)request;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    /* number of elements of one panel; size_t so that neither the product
     * nor the panel offset below overflows int */
    size = (size_t)m * n * L;

    for(iprob=0; iprob<nprob; iprob++) {
        Al = A + (size_t)iprob * size;

        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
                          sizeof(float)*size, Al,  INOUT,
#ifdef TRACE_IPT
                          13, "Foo In shift",   VALUE | TASKLABEL,
                          4, "red",  VALUE | TASKCOLOR,
#endif
                          0);

        /* shift cycles in parallel: one task per cycle of the panel */
        for(i=0; i<nleaders; i+=3) {
            QUARK_CORE_sshift(plasma->quark, &task_flags,
                              leaders[i], m, n, L, Al);
        }

        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
                          sizeof(float)*size, Al,  INOUT,
#ifdef TRACE_IPT
                          14, "Foo Out shift",   VALUE | TASKLABEL,
                          4, "red",  VALUE | TASKCOLOR,
#endif
                          0);
    }
}


