/**
 *
 * @file pssymm.c
 *
 *  PLASMA auxiliary routines
 *  PLASMA is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 * @version 2.2.0
 * @author Emmanuel Agullo
 * @author Mathieu Faverge
 * @date 2009-11-15
 *
 **/
#include "common.h"

/*
 * Tile accessors: X(m,n) forwards the descriptor X, the element type float
 * and the tile indices (m, n) to BLKADDR.  In this file the results are used
 * as float* pointers to individual tiles.  Each macro refers to a variable
 * literally named A, B or C, so a descriptor with that name must be in scope
 * wherever the macro is used.
 */
#define A(m,n) BLKADDR(A, float, m, n)
#define B(m,n) BLKADDR(B, float, m, n)
#define C(m,n) BLKADDR(C, float, m, n)
/***************************************************************************//**
 *  Parallel tile SSYMM matrix-matrix operations (static scheduling)
 *
 *  Unpacks side, uplo, alpha, A, B, beta, C, sequence and request from the
 *  context and returns immediately when the sequence status is not
 *  PLASMA_SUCCESS.
 *
 *  Tiles of C are visited column by column: the walk starts PLASMA_RANK
 *  tiles into that ordering and advances by PLASMA_SIZE tiles each time.
 *  Every visited tile C(m,n) receives the complete accumulation over k:
 *    - side == PlasmaLeft : tile (m,k) of A combined with B(k,n),
 *                           k = 0 .. C.mt-1;
 *    - otherwise          : B(m,k) combined with tile (k,n) of A,
 *                           k = 0 .. C.nt-1.
 *  The diagonal tile A(k,k) is handed to CORE_ssymm.  Any other tile of A is
 *  read from the triangle selected by uplo and handed to CORE_sgemm; when the
 *  tile that is needed lies in the other triangle, its mirror image is read
 *  and the transpose flag is set.  beta is passed for k == 0 only, every
 *  later step passes 1.0.
 **/
void plasma_pssymm(plasma_context_t *plasma)
{
    /* Arguments unpacked from the context */
    PLASMA_enum side;
    PLASMA_enum uplo;
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_desc C;
    float alpha;
    float beta;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    PLASMA_enum trans_below;    /* flag for a tile of A below the diagonal */
    PLASMA_enum trans_above;    /* flag for a tile of A above the diagonal */
    PLASMA_enum trans;
    float *tileA, *tileB, *tileC;
    float scale;                /* factor applied to C at the current step */
    int rows, cols, depth;      /* dimensions of the current tile product  */
    int is_lower;
    int ilo, ihi;               /* ordered pair of tile indices into A     */
    int k, m, n;
    int m_next, n_next;

    plasma_unpack_args_9(side, uplo, alpha, A, B, beta, C, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    is_lower    = (uplo == PlasmaLower);
    trans_below = is_lower ? PlasmaNoTrans : PlasmaTrans;
    trans_above = is_lower ? PlasmaTrans   : PlasmaNoTrans;

    /* Locate the first tile: fold the starting offset into (m, n) */
    m = PLASMA_RANK;
    for (n = 0; m >= C.mt && n < C.nt; n++)
        m -= C.mt;

    while (n < C.nt) {
        /* Position of the tile to process after this one */
        m_next = m + (PLASMA_SIZE);
        for (n_next = n; m_next >= C.mt && n_next < C.nt; n_next++)
            m_next -= C.mt;

        /* The last tile row / column may be narrower than a full tile */
        rows = C.mb;
        if (m == C.mt-1)
            rows = C.m - m * C.mb;
        cols = C.nb;
        if (n == C.nt-1)
            cols = C.n - n * C.nb;
        tileC = C(m, n);

        if (side == PlasmaLeft) {
            for (k = 0; k < C.mt; k++) {
                depth = (k == C.mt-1) ? C.m - k * C.mb : C.mb;
                scale = (k == 0) ? beta : (float) 1.0;
                tileB = B(k, n);

                if (k == m) {
                    /* Diagonal tile of A */
                    CORE_ssymm(side, uplo, rows, cols,
                               alpha, A(k, k), A.mb, tileB, B.mb,
                               scale, tileC, C.mb);
                    continue;
                }

                /* Tile (m,k) of A is needed: below the diagonal when k < m */
                if (k < m) {
                    ilo = k;
                    ihi = m;
                    trans = trans_below;
                } else {
                    ilo = m;
                    ihi = k;
                    trans = trans_above;
                }
                tileA = is_lower ? A(ihi, ilo) : A(ilo, ihi);
                CORE_sgemm(trans, PlasmaNoTrans, rows, cols, depth,
                           alpha, tileA, A.mb, tileB, B.mb,
                           scale, tileC, C.mb);
            }
        }
        else { /* PlasmaRight */
            for (k = 0; k < C.nt; k++) {
                depth = (k == C.nt-1) ? C.n - k * C.nb : C.nb;
                scale = (k == 0) ? beta : (float) 1.0;
                tileB = B(m, k);

                if (k == n) {
                    /* Diagonal tile of A */
                    CORE_ssymm(side, uplo, rows, cols,
                               alpha, A(k, k), A.mb, tileB, B.mb,
                               scale, tileC, C.mb);
                    continue;
                }

                /* Tile (k,n) of A is needed: above the diagonal when k < n */
                if (k < n) {
                    ilo = k;
                    ihi = n;
                    trans = trans_above;
                } else {
                    ilo = n;
                    ihi = k;
                    trans = trans_below;
                }
                tileA = is_lower ? A(ihi, ilo) : A(ilo, ihi);
                CORE_sgemm(PlasmaNoTrans, trans, rows, cols, depth,
                           alpha, tileB, B.mb, tileA, A.mb,
                           scale, tileC, C.mb);
            }
        }

        m = m_next;
        n = n_next;
    }
}

/***************************************************************************//**
 *  Parallel tile SSYMM matrix-matrix operations (dynamic)
 *
 *  Returns without inserting anything when the sequence status is not
 *  PLASMA_SUCCESS.  Otherwise loops over every tile C(m,n) (m outermost,
 *  then n, then k) and inserts one QUARK task per step k:
 *    - side == PlasmaLeft : tile (m,k) of A combined with B(k,n),
 *                           k = 0 .. C.mt-1;
 *    - otherwise          : B(m,k) combined with tile (k,n) of A,
 *                           k = 0 .. C.nt-1.
 *  The diagonal tile A(k,k) is given to CORE_ssymm_quark, every other tile
 *  to CORE_sgemm_quark.  Off-diagonal tiles are always read from the
 *  triangle selected by uplo; when the tile that is needed lies in the other
 *  triangle, its mirror image is read and the transpose flag is set.
 *  beta is passed for k == 0 only, every later step passes 1.0.
 *
 *  Scalars and dimensions are passed with the VALUE flag, the A and B tiles
 *  as INPUT and the C tile as INOUT | LOCALITY.  All tasks carry the QUARK
 *  sequence stored in the PLASMA sequence.
 *
 *  NOTE(review): the per-iteration locals handed over by address rely on the
 *  runtime copying VALUE arguments at insertion time -- confirm against the
 *  QUARK documentation.
 **/
void plasma_pssymm_quark(PLASMA_enum side, PLASMA_enum uplo,
                          float alpha, PLASMA_desc A, PLASMA_desc B,
                          float beta, PLASMA_desc C,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    PLASMA_enum trans_below;    /* flag for a tile of A below the diagonal */
    PLASMA_enum trans_above;    /* flag for a tile of A above the diagonal */
    PLASMA_enum trans;
    PLASMA_enum no_trans = PlasmaNoTrans;
    float *tileA, *tileB, *tileC;
    float scale;                /* factor applied to C at the current step */
    int rows, cols, depth;      /* dimensions of the current tile product  */
    int is_lower;
    int ilo, ihi;               /* ordered pair of tile indices into A     */
    int k, m, n;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    is_lower    = (uplo == PlasmaLower);
    trans_below = is_lower ? PlasmaNoTrans : PlasmaTrans;
    trans_above = is_lower ? PlasmaTrans   : PlasmaNoTrans;

    for (m = 0; m < C.mt; m++) {
        /* The last tile row may be shorter than a full tile */
        rows = C.mb;
        if (m == C.mt-1)
            rows = C.m - m * C.mb;

        for (n = 0; n < C.nt; n++) {
            /* The last tile column may be narrower than a full tile */
            cols = C.nb;
            if (n == C.nt-1)
                cols = C.n - n * C.nb;
            tileC = C(m, n);

            if (side == PlasmaLeft) {
                for (k = 0; k < C.mt; k++) {
                    depth = (k == C.mt-1) ? C.m - k * C.mb : C.mb;
                    scale = (k == 0) ? beta : (float) 1.0;
                    tileB = B(k, n);

                    if (k == m) {
                        /* Diagonal tile of A */
                        tileA = A(k, k);
                        QUARK_Insert_Task(plasma->quark, CORE_ssymm_quark, &task_flags,
                            sizeof(PLASMA_enum),     &side,  VALUE,
                            sizeof(PLASMA_enum),     &uplo,  VALUE,
                            sizeof(int),             &rows,  VALUE,
                            sizeof(int),             &cols,  VALUE,
                            sizeof(float),           &alpha, VALUE,
                            sizeof(float)*A.mb*A.nb, tileA,  INPUT,
                            sizeof(int),             &A.mb,  VALUE,
                            sizeof(float)*B.mb*B.nb, tileB,  INPUT,
                            sizeof(int),             &B.mb,  VALUE,
                            sizeof(float),           &scale, VALUE,
                            sizeof(float)*C.mb*C.nb, tileC,  INOUT | LOCALITY,
                            sizeof(int),             &C.mb,  VALUE,
                            0);
                        continue;
                    }

                    /* Tile (m,k) of A is needed: below the diagonal when k < m */
                    if (k < m) {
                        ilo = k;
                        ihi = m;
                        trans = trans_below;
                    } else {
                        ilo = m;
                        ihi = k;
                        trans = trans_above;
                    }
                    tileA = is_lower ? A(ihi, ilo) : A(ilo, ihi);
                    QUARK_Insert_Task(plasma->quark, CORE_sgemm_quark, &task_flags,
                        sizeof(PLASMA_enum),     &trans,    VALUE,
                        sizeof(PLASMA_enum),     &no_trans, VALUE,
                        sizeof(int),             &rows,     VALUE,
                        sizeof(int),             &cols,     VALUE,
                        sizeof(int),             &depth,    VALUE,
                        sizeof(float),           &alpha,    VALUE,
                        sizeof(float)*A.mb*A.nb, tileA,     INPUT,
                        sizeof(int),             &A.mb,     VALUE,
                        sizeof(float)*B.mb*B.nb, tileB,     INPUT,
                        sizeof(int),             &B.mb,     VALUE,
                        sizeof(float),           &scale,    VALUE,
                        sizeof(float)*C.mb*C.nb, tileC,     INOUT | LOCALITY,
                        sizeof(int),             &C.mb,     VALUE,
                        0);
                }
            }
            else { /* PlasmaRight */
                for (k = 0; k < C.nt; k++) {
                    depth = (k == C.nt-1) ? C.n - k * C.nb : C.nb;
                    scale = (k == 0) ? beta : (float) 1.0;
                    tileB = B(m, k);

                    if (k == n) {
                        /* Diagonal tile of A */
                        tileA = A(k, k);
                        QUARK_Insert_Task(plasma->quark, CORE_ssymm_quark, &task_flags,
                            sizeof(PLASMA_enum),     &side,  VALUE,
                            sizeof(PLASMA_enum),     &uplo,  VALUE,
                            sizeof(int),             &rows,  VALUE,
                            sizeof(int),             &cols,  VALUE,
                            sizeof(float),           &alpha, VALUE,
                            sizeof(float)*A.mb*A.nb, tileA,  INPUT,
                            sizeof(int),             &A.mb,  VALUE,
                            sizeof(float)*B.mb*B.nb, tileB,  INPUT,
                            sizeof(int),             &B.mb,  VALUE,
                            sizeof(float),           &scale, VALUE,
                            sizeof(float)*C.mb*C.nb, tileC,  INOUT | LOCALITY,
                            sizeof(int),             &C.mb,  VALUE,
                            0);
                        continue;
                    }

                    /* Tile (k,n) of A is needed: above the diagonal when k < n */
                    if (k < n) {
                        ilo = k;
                        ihi = n;
                        trans = trans_above;
                    } else {
                        ilo = n;
                        ihi = k;
                        trans = trans_below;
                    }
                    tileA = is_lower ? A(ihi, ilo) : A(ilo, ihi);
                    QUARK_Insert_Task(plasma->quark, CORE_sgemm_quark, &task_flags,
                        sizeof(PLASMA_enum),     &no_trans, VALUE,
                        sizeof(PLASMA_enum),     &trans,    VALUE,
                        sizeof(int),             &rows,     VALUE,
                        sizeof(int),             &cols,     VALUE,
                        sizeof(int),             &depth,    VALUE,
                        sizeof(float),           &alpha,    VALUE,
                        sizeof(float)*B.mb*B.nb, tileB,     INPUT,
                        sizeof(int),             &B.mb,     VALUE,
                        sizeof(float)*A.mb*A.nb, tileA,     INPUT,
                        sizeof(int),             &A.mb,     VALUE,
                        sizeof(float),           &scale,    VALUE,
                        sizeof(float)*C.mb*C.nb, tileC,     INOUT | LOCALITY,
                        sizeof(int),             &C.mb,     VALUE,
                        0);
                }
            }
        }
    }
}
