/**
 *
 * @file pcsyrk.c
 *
 *  PLASMA auxiliary routines
 *  PLASMA is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 * @version 2.2.0
 * @author Mathieu Faverge
 * @date 2009-11-15
 *
 **/
#include "common.h"

#define A(m,n) BLKADDR(A, PLASMA_Complex32_t, m, n)
#define C(m,n) BLKADDR(C, PLASMA_Complex32_t, m, n)
/***************************************************************************//**
 *  Parallel csyrk - static scheduling
 *
 *  The arguments (uplo, trans, alpha, A, beta, C, sequence, request) are
 *  unpacked from the context. Nothing is computed when the sequence status
 *  is not PLASMA_SUCCESS.
 *
 *  Tiles of C are enumerated over the lower part, column after column. The
 *  caller starts at offset PLASMA_RANK in that enumeration and advances by
 *  PLASMA_SIZE tiles at a time (presumably the thread rank and the number of
 *  threads - confirm against common.h). Diagonal tiles are updated with
 *  CORE_csyrk, the other ones with CORE_cgemm; when uplo is not PlasmaLower
 *  the mirrored tile C(n, m) is written instead of C(m, n).
 **/
void plasma_pcsyrk(plasma_context_t *plasma)
{
    /* Unpacked arguments: names and types must match the packing side */
    PLASMA_enum uplo;
    PLASMA_enum trans;
    PLASMA_Complex32_t alpha;
    PLASMA_desc A;
    PLASMA_Complex32_t beta;
    PLASMA_desc C;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    PLASMA_Complex32_t *tileM, *tileN;
    PLASMA_Complex32_t *gemmA, *gemmB, *gemmC;
    PLASMA_Complex32_t kbeta;
    int tm, tn, tk;
    int mrows, ncols, kdepth;
    int gemm_m, gemm_n;
    int inner_nb, inner_len, inner_nt, transB;
    int is_notrans;
    int succ_m, succ_n;

    plasma_unpack_args_8(uplo, trans, alpha, A, beta, C, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* The inner (k) dimension runs over the columns of A when A is not
     * transposed, and over its rows otherwise */
    is_notrans = (trans == PlasmaNoTrans);
    if (is_notrans) {
        inner_nt  = A.nt;
        inner_len = A.n;
        inner_nb  = A.nb;
        transB    = PlasmaTrans;
    }
    else {
        inner_nt  = A.mt;
        inner_len = A.m;
        inner_nb  = A.mb;
        transB    = PlasmaNoTrans;
    }

    /* Locate the first tile (tm, tn) of the lower part for this caller */
    tn = 0;
    tm = PLASMA_RANK;
    while (tm >= C.mt && tn < C.nt) {
        tn += 1;
        /* Column tn starts on its diagonal tile, i.e. at row tn */
        tm += tn - C.mt;
    }

    while (tn < C.nt) {

        /* Work out which tile comes after the current one */
        succ_n = tn;
        succ_m = tm + PLASMA_SIZE;
        while (succ_m >= C.mt && succ_n < C.nt) {
            succ_n += 1;
            succ_m += succ_n - C.mt;
        }

        /* Tiles on the last row/column may be smaller than a full tile */
        mrows = C.mb;
        if (tm == C.mt-1)
            mrows = C.m - tm * C.mb;

        ncols = C.nb;
        if (tn == C.nt-1)
            ncols = C.n - tn * C.nb;

        for (tk = 0; tk < inner_nt; tk++) {
            kdepth = inner_nb;
            if (tk == inner_nt-1)
                kdepth = inner_len - tk * inner_nb;

            /* beta scales C on the first step only; later steps accumulate */
            kbeta = (PLASMA_Complex32_t)1.0;
            if (tk == 0)
                kbeta = beta;

            if (is_notrans)
                tileM = A(tm, tk);
            else
                tileM = A(tk, tm);

            if (tm == tn) {
                /* Diagonal tile: symmetric rank-k update */
                CORE_csyrk( uplo, trans, ncols, kdepth,
                            alpha, tileM, A.mb,
                            kbeta,
                            C(tm, tn), C.mb);
                continue;
            }

            if (is_notrans)
                tileN = A(tn, tk);
            else
                tileN = A(tk, tn);

            /* Off-diagonal tile: pick the operand order and the target tile
             * according to the triangle being updated */
            if ( uplo == PlasmaLower ) {
                gemm_m = mrows;
                gemm_n = ncols;
                gemmA  = tileM;
                gemmB  = tileN;
                gemmC  = C(tm, tn);
            }
            else {
                gemm_m = ncols;
                gemm_n = mrows;
                gemmA  = tileN;
                gemmB  = tileM;
                gemmC  = C(tn, tm);
            }

            CORE_cgemm( trans, transB, gemm_m, gemm_n, kdepth,
                        alpha, gemmA, A.mb, gemmB, A.mb,
                        kbeta,
                        gemmC, C.mb);
        }

        tm = succ_m;
        tn = succ_n;
    }
}

/***************************************************************************//**
 *  Parallel csyrk - dynamic scheduling
 *
 *  Inserts the tile tasks into plasma->quark, all flagged with the QUARK
 *  sequence of the given PLASMA sequence. Nothing is inserted when the
 *  sequence status is not PLASMA_SUCCESS.
 *
 *  For every tile column n of C, one CORE_csyrk_quark task per k step is
 *  inserted for the diagonal tile C(n, n); then, for every m > n, one
 *  CORE_cgemm_quark task per k step is inserted, targeting C(m, n) when uplo
 *  is PlasmaLower and C(n, m) otherwise. The request argument is not used
 *  by this function.
 **/
void plasma_pcsyrk_quark(PLASMA_enum uplo, PLASMA_enum trans,
                          PLASMA_Complex32_t alpha, PLASMA_desc A,
                          PLASMA_Complex32_t beta,  PLASMA_desc C,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    int tm, tn, tk;
    int mrows, ncols, kdepth;
    int gemm_m, gemm_n;
    int inner_nb, inner_len, inner_nt;
    int is_notrans;
    PLASMA_Complex32_t kbeta;
    PLASMA_Complex32_t *tileM, *tileN;
    PLASMA_Complex32_t *gemmA, *gemmB, *gemmC;
    PLASMA_enum transB;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    /* The inner (k) dimension runs over the columns of A when A is not
     * transposed, and over its rows otherwise */
    is_notrans = (trans == PlasmaNoTrans);
    if (is_notrans) {
        inner_nt  = A.nt;
        inner_len = A.n;
        inner_nb  = A.nb;
        transB    = PlasmaTrans;
    }
    else {
        inner_nt  = A.mt;
        inner_len = A.m;
        inner_nb  = A.mb;
        transB    = PlasmaNoTrans;
    }

    for (tn = 0; tn < C.nt; tn++) {
        /* The last tile column may be narrower than a full tile */
        ncols = C.nb;
        if (tn == C.nt-1)
            ncols = C.n - tn * C.nb;

        /* Diagonal tile: chain of symmetric rank-k updates */
        for (tk = 0; tk < inner_nt; tk++) {
            kdepth = inner_nb;
            if (tk == inner_nt-1)
                kdepth = inner_len - tk * inner_nb;

            /* beta scales C on the first step only; later steps accumulate */
            kbeta = (PLASMA_Complex32_t) 1.0;
            if (tk == 0)
                kbeta = beta;

            if (is_notrans)
                tileN = A(tn, tk);
            else
                tileN = A(tk, tn);

            QUARK_Insert_Task(plasma->quark, CORE_csyrk_quark, &task_flags,
                sizeof(PLASMA_enum),                  &uplo,     VALUE,
                sizeof(PLASMA_enum),                  &trans,    VALUE,
                sizeof(int),                          &ncols,    VALUE,
                sizeof(int),                          &kdepth,   VALUE,
                sizeof(PLASMA_Complex32_t),           &alpha,    VALUE,
                sizeof(PLASMA_Complex32_t)*A.mb*A.nb, tileN,     INPUT,
                sizeof(int),                          &A.mb,     VALUE,
                sizeof(PLASMA_Complex32_t),           &kbeta,    VALUE,
                sizeof(PLASMA_Complex32_t)*C.mb*C.nb, C(tn, tn), INOUT | LOCALITY,
                sizeof(int),                          &C.mb,     VALUE,
                0);
        }

        /* Tiles below the diagonal (or their mirror images for the upper case) */
        for (tm = tn+1; tm < C.mt; tm++) {
            mrows = C.mb;
            if (tm == C.mt-1)
                mrows = C.m - tm * C.mb;

            for (tk = 0; tk < inner_nt; tk++) {
                kdepth = inner_nb;
                if (tk == inner_nt-1)
                    kdepth = inner_len - tk * inner_nb;

                kbeta = (PLASMA_Complex32_t) 1.0;
                if (tk == 0)
                    kbeta = beta;

                if (is_notrans) {
                    tileM = A(tm, tk);
                    tileN = A(tn, tk);
                }
                else {
                    tileM = A(tk, tm);
                    tileN = A(tk, tn);
                }

                /* Pick the operand order and the target tile according to
                 * the triangle being updated */
                if ( uplo == PlasmaLower ) {
                    gemm_m = mrows;
                    gemm_n = ncols;
                    gemmA  = tileM;
                    gemmB  = tileN;
                    gemmC  = C(tm, tn);
                }
                else {
                    gemm_m = ncols;
                    gemm_n = mrows;
                    gemmA  = tileN;
                    gemmB  = tileM;
                    gemmC  = C(tn, tm);
                }

                QUARK_Insert_Task(plasma->quark, CORE_cgemm_quark, &task_flags,
                    sizeof(PLASMA_enum),                  &trans,   VALUE,
                    sizeof(PLASMA_enum),                  &transB,  VALUE,
                    sizeof(int),                          &gemm_m,  VALUE,
                    sizeof(int),                          &gemm_n,  VALUE,
                    sizeof(int),                          &kdepth,  VALUE,
                    sizeof(PLASMA_Complex32_t),           &alpha,   VALUE,
                    sizeof(PLASMA_Complex32_t)*A.mb*A.nb, gemmA,    INPUT,
                    sizeof(int),                          &A.mb,    VALUE,
                    sizeof(PLASMA_Complex32_t)*A.mb*A.nb, gemmB,    INPUT,
                    sizeof(int),                          &A.mb,    VALUE,
                    sizeof(PLASMA_Complex32_t),           &kbeta,   VALUE,
                    sizeof(PLASMA_Complex32_t)*C.mb*C.nb, gemmC,    INOUT | LOCALITY,
                    sizeof(int),                          &C.mb,    VALUE,
                    0);
            }
        }
    }
}
