/**
 *
 * @file psormlq.c
 *
 *  PLASMA auxiliary routines
 *  PLASMA is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 * @version 2.2.0
 * @author Hatem Ltaief
 * @author Jakub Kurzak
 * @date 2009-11-15
 *
 **/
#include "common.h"

/*
 * Tile shorthands: X(m,n) expands to BLKADDR(X, float, m, n), where X is
 * whatever object named A, B or T is in scope at the point of use (the
 * PLASMA_desc locals/parameters in the functions below).
 * These are function-like macros, so they only expand when the name is
 * immediately followed by '('; plain member accesses such as A.nb or B.mt
 * are not affected.
 * NOTE(review): BLKADDR is defined outside this file; presumably it yields
 * the address of tile (m,n) of the descriptor as a float pointer - confirm.
 */
#define A(m,n) BLKADDR(A, float, m, n)
#define B(m,n) BLKADDR(B, float, m, n)
#define T(m,n) BLKADDR(T, float, m, n)
/***************************************************************************//**
 *  Parallel application of Q using tile V - LQ factorization - static scheduling
 *
 *  The descriptors A, B, T plus the sequence/request handles are read from
 *  the context through plasma_unpack_args_5(). Nothing is done when the
 *  sequence status is not PLASMA_SUCCESS.
 *
 *  Work distribution as implemented below:
 *   - steps k run from min(A.mt, A.nt)-1 down to 0;
 *   - this thread starts at tile column n = PLASMA_RANK and moves on by
 *     PLASMA_SIZE columns at a time; each time n runs past B.nt it wraps
 *     around and k is decremented;
 *   - inside one (k, n) pair, m runs from B.mt-1 down to k: tiles with m > k
 *     go through CORE_sssmlq (updating B(k,n) and B(m,n)), the tile m == k
 *     goes through CORE_sormlq (updating B(k,n)).
 *
 *  NOTE(review): the pointer returned by plasma_private_alloc() is handed to
 *  the kernels without a NULL check.
 **/
void plasma_psormlq(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_desc T;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int nsteps;                  /* min(A.mt, A.nt): number of steps k      */
    int dim_m, dim_n, dim_k;     /* extents passed to the current kernel    */
    float *work;

    plasma_unpack_args_5(A, B, T, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    nsteps = min(A.mt, A.nt);

    work = (float*)plasma_private_alloc(plasma, T.mb*T.nb, T.dtyp);
    ss_init(B.mt, B.nt, nsteps);

    /* Locate the first (k, n) pair owned by this thread. */
    k = nsteps-1;
    for (n = PLASMA_RANK; n >= B.nt; n -= B.nt)
        k--;
    m = B.mt-1;

    /*
     * NOTE(review): ss_cond_wait/ss_cond_set are macros defined elsewhere;
     * they are kept directly inside this loop, in the same order relative to
     * the kernel calls, in case they carry control flow of their own.
     */
    while (k >= 0 && n < B.nt) {
        /* The last tile column of B may be narrower than B.nb. */
        dim_n = (n == B.nt-1) ? B.n-n*B.nb : B.nb;

        if (m != k) {
            /* Off-diagonal tile: couple B(k,n) with B(m,n). */
            dim_m = (m == B.mt-1) ? B.m-m*B.nb : B.nb;
            dim_k = (k == A.mt-1) ? A.m-k*A.nb : A.nb;

            ss_cond_wait(m, n, k+1);
            CORE_sssmlq(
                PlasmaLeft, PlasmaTrans,
                A.nb, dim_m, dim_n,
                T.mb, dim_k,
                B(k, n), B.nb,
                B(m, n), B.nb,
                A(k, m), A.nb,
                T(k, m), T.mb,
                work, T.mb);
            ss_cond_set(m, n, k);
        }
        else {
            /* Diagonal tile: finish step k on B(k,n). */
            dim_m = (k == A.nt-1)   ? A.n-k*A.nb           : A.nb;
            dim_k = (k == nsteps-1) ? min(A.m, A.n)-k*A.nb : A.nb;

            CORE_sormlq(
                PlasmaLeft, PlasmaTrans,
                dim_m, dim_n,
                T.mb, dim_k,
                A(k, k), A.nb,
                T(k, k), T.mb,
                B(k, n), B.nb,
                work, T.nb);
            ss_cond_set(k, n, k);
        }

        /*
         * Advance to the next tile: walk m upwards (towards k); once the
         * diagonal tile has been handled, jump PLASMA_SIZE columns ahead,
         * wrapping into lower steps k as needed, and restart at the bottom
         * tile row.
         */
        m--;
        if (m == k-1) {
            n += PLASMA_SIZE;
            while (n >= B.nt && k >= 0) {
                k--;
                n -= B.nt;
            }
            m = B.mt-1;
        }
    }

    plasma_private_free(plasma, work);
    ss_finalize();
}

/***************************************************************************//**
 *  Parallel application of Q using tile V - LQ factorization - dynamic scheduling
 *
 *  Inserts one QUARK task per tile kernel, tagged with the sequence's
 *  quark_sequence. Nothing is inserted when the sequence status is not
 *  PLASMA_SUCCESS.
 *
 *  Insertion order: steps k from min(A.mt, A.nt)-1 down to 0; for each tile
 *  column n of B, the CORE_sssmlq tasks for m = B.mt-1 .. k+1 are inserted
 *  first, followed by the CORE_sormlq task for the diagonal tile (k, n).
 *
 *  Integer arguments are passed with the VALUE flag through the address of
 *  a local/parameter; the locals are overwritten on later iterations, so
 *  this relies on QUARK copying VALUE arguments at insertion time.
 *  The request argument is not used in this routine.
 **/
void plasma_psormlq_quark(PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    PLASMA_enum side  = PlasmaLeft;
    PLASMA_enum trans = PlasmaTrans;
    int k, m, n;
    int nsteps;        /* min(A.mt, A.nt)                                  */
    int b_rows;        /* row extent of tile row m of B                    */
    int b_cols;        /* column extent of tile column n of B              */
    int ssm_k;         /* K argument of the CORE_sssmlq tasks at step k    */
    int orm_m;         /* M argument of the CORE_sormlq task at step k     */
    int orm_k;         /* K argument of the CORE_sormlq task at step k     */

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    nsteps = min(A.mt, A.nt);

    for (k = nsteps-1; k >= 0; k--) {
        /* Extents that depend on k only; edge tiles may be partial. */
        ssm_k = (k == A.mt-1)   ? A.m-k*A.nb           : A.nb;
        orm_m = (k == A.nt-1)   ? A.n-k*A.nb           : A.nb;
        orm_k = (k == nsteps-1) ? min(A.m, A.n)-k*A.nb : A.nb;

        for (n = 0; n < B.nt; n++) {
            b_cols = (n == B.nt-1) ? B.n-n*B.nb : B.nb;

            /* Off-diagonal tiles, bottom tile row first. */
            for (m = B.mt-1; m > k; m--) {
                b_rows = (m == B.mt-1) ? B.m-m*B.nb : B.nb;

                QUARK_Insert_Task(plasma->quark, CORE_sssmlq_quark, &task_flags,
                    sizeof(PLASMA_enum),     &side,   VALUE,
                    sizeof(PLASMA_enum),     &trans,  VALUE,
                    sizeof(int),             &A.nb,   VALUE,
                    sizeof(int),             &b_rows, VALUE,
                    sizeof(int),             &b_cols, VALUE,
                    sizeof(int),             &T.mb,   VALUE,
                    sizeof(int),             &ssm_k,  VALUE,
                    sizeof(float)*A.mb*A.nb, B(k, n), INOUT | LOCALITY,
                    sizeof(int),             &B.nb,   VALUE,
                    sizeof(float)*A.mb*A.nb, B(m, n), INOUT,
                    sizeof(int),             &B.nb,   VALUE,
                    sizeof(float)*A.mb*A.nb, A(k, m), INPUT,
                    sizeof(int),             &A.nb,   VALUE,
                    sizeof(float)*T.mb*T.nb, T(k, m), INPUT,
                    sizeof(int),             &T.mb,   VALUE,
                    sizeof(float)*T.mb*T.nb, NULL,    SCRATCH,
                    sizeof(int),             &T.mb,   VALUE,
                    0);
            }

            /* Diagonal tile of step k for this column. */
            QUARK_Insert_Task(plasma->quark, CORE_sormlq_quark, &task_flags,
                sizeof(PLASMA_enum),     &side,   VALUE,
                sizeof(PLASMA_enum),     &trans,  VALUE,
                sizeof(int),             &orm_m,  VALUE,
                sizeof(int),             &b_cols, VALUE,
                sizeof(int),             &T.mb,   VALUE,
                sizeof(int),             &orm_k,  VALUE,
                sizeof(float)*A.mb*A.nb, A(k, k), INPUT,
                sizeof(int),             &A.nb,   VALUE,
                sizeof(float)*T.mb*T.nb, T(k, k), INPUT,
                sizeof(int),             &T.mb,   VALUE,
                sizeof(float)*A.mb*A.nb, B(k, n), INOUT | LOCALITY,
                sizeof(int),             &B.nb,   VALUE,
                sizeof(float)*T.mb*T.nb, NULL,    SCRATCH,
                sizeof(int),             &T.nb,   VALUE,
                0);
        }
    }
}
