/**
 *
 * @file pzgetrf.c
 *
 *  PLASMA auxiliary routines
 *  PLASMA is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 * @version 2.2.0
 * @author Jakub Kurzak
 * @author Hatem Ltaief
 * @date 2009-11-15
 *
 **/
#include "common.h"

/*
 * Tile accessors used by both schedulers below.
 *
 * A(m,n) and L(m,n) expand to BLKADDR() applied to whatever objects named
 * `A` and `L` are in scope at the point of use, with PLASMA_Complex64_t as
 * the element type.  BLKADDR is defined outside this file (presumably it
 * yields the address of tile (m,n) of the descriptor -- confirm in common.h).
 */
#define A(m,n) BLKADDR(A, PLASMA_Complex64_t, m, n)
#define L(m,n) BLKADDR(L, PLASMA_Complex64_t, m, n)
/*
 * IPIV(m,n) is the address of element A.mb * (m + A.lmt * n) of the in-scope
 * `IPIV` array.  Every operand is widened to int64_t before the arithmetic so
 * the offset is computed in 64 bits.  NOTE(review): A.lmt looks like the
 * number of tile rows of the full matrix -- confirm against PLASMA_desc.
 */
#define IPIV(m,n) &(IPIV[(int64_t)A.mb*((int64_t)(m)+(int64_t)A.lmt*(int64_t)(n))])
/***************************************************************************//**
 *  Parallel tile LU factorization - static scheduling
 *
 *  Arguments (A, L, IPIV, sequence, request) are unpacked from the PLASMA
 *  context.  The routine returns immediately when the sequence is already
 *  flagged with an error.
 *
 *  Work distribution: every thread starts at column n = PLASMA_RANK of step
 *  k = 0 and advances by PLASMA_SIZE columns, wrapping into the trailing
 *  columns of the following steps.  For each (k, n) pair the thread walks the
 *  tile rows m = k .. A.mt-1 and applies one kernel per tile:
 *
 *    n == k, m == k : CORE_zgetrf  (diagonal tile)
 *    n == k, m >  k : CORE_ztstrf  (tile below the diagonal)
 *    n != k, m == k : CORE_zgessm  (tile to the right of the diagonal)
 *    n != k, m >  k : CORE_zssssm  (trailing tile)
 *
 *  Ordering between threads is enforced with the ss_cond_wait/ss_cond_set
 *  progress flags.  A non-zero kernel `info` is only reported for the last
 *  tile row (m == A.mt-1); it is reported as info + A.nb*k through
 *  plasma_request_fail() and the whole sweep is aborted with ss_abort().
 *
 *  @param[in] plasma  PLASMA context holding the packed arguments.
 **/
void plasma_pzgetrf(plasma_context_t *plasma)
{
    PLASMA_desc A;
    PLASMA_desc L;
    int *IPIV;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int k, m, n;
    int next_k;
    int next_m;
    int next_n;
    int info;
    PLASMA_Complex64_t *work;

    plasma_unpack_args_5(A, L, IPIV, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;
    /* Per-thread scratch of L.mb*L.nb elements handed to CORE_ztstrf. */
    work = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, L.mb*L.nb, L.dtyp);
    ss_init(A.mt, A.nt, -1);

    /*
     * Locate this thread's first (k, n) panel.
     *
     * The bound on k mirrors the one used when computing next_k below.
     * Without it, a thread whose rank is not smaller than the total number of
     * panels (e.g. a single tile column and more than one thread) would keep
     * incrementing k: once k >= A.nt the update n-A.nt+k no longer decreases
     * n, so the loop would only stop after signed overflow of n, which is
     * undefined behaviour.  A thread stopped by the bound has
     * k >= min(A.mt, A.nt) and therefore skips the main loop, exactly as it
     * did before.
     */
    k = 0;
    n = PLASMA_RANK;
    while (n >= A.nt && k < min(A.mt, A.nt)) {
        k++;
        n = n-A.nt+k;
    }
    m = k;

    while (k < min(A.mt, A.nt) && n < A.nt && !ss_aborted()) {
        /* Pre-compute the tile this thread will visit after the current one. */
        next_n = n;
        next_m = m;
        next_k = k;

        next_m++;
        if (next_m == A.mt) {
            /* Column finished: jump PLASMA_SIZE columns ahead, wrapping steps. */
            next_n += PLASMA_SIZE;
            while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
                next_k++;
                next_n = next_n-A.nt+next_k;
            }
            next_m = next_k;
        }

        if (n == k) {
            /* Panel column of step k. */
            if (m == k) {
                /* Diagonal tile: wait until step k-1 has updated it. */
                ss_cond_wait(k, k, k-1);
                CORE_zgetrf(
                    k == A.mt-1 ? A.m-k*A.mb : A.mb,
                    k == A.nt-1 ? A.n-k*A.nb : A.nb,
                    L.mb,
                    A(k, k), A.nb,
                    IPIV(k, k), &info);
                /* Errors are only reported from the last tile row. */
                if (info != 0 && m == A.mt-1) {
                    plasma_request_fail(sequence, request, info + A.nb*k);
                    ss_abort();
                }
                ss_cond_set(k, k, k);
            }
            else {
                /* Sub-diagonal tile: wait until step k-1 has updated it. */
                ss_cond_wait(m, k, k-1);
                CORE_ztstrf(
                    m == A.mt-1 ? A.m-m*A.mb : A.mb,
                    k == A.nt-1 ? A.n-k*A.nb : A.nb,
                    L.mb,
                    A.nb,
                    A(k, k), A.nb,
                    A(m, k), A.nb,
                    L(m, k), L.mb,
                    IPIV(m, k),
                    work, L.nb, &info);
                /* Errors are only reported from the last tile row. */
                if (info != 0 && m == A.mt-1) {
                    plasma_request_fail(sequence, request, info + A.nb*k);
                    ss_abort();
                }
                ss_cond_set(m, k, k);
            }
        }
        else {
            /* Trailing column n of step k. */
            if (m == k) {
                /* Needs the factored diagonal tile and tile (k, n) from step k-1. */
                ss_cond_wait(k, k, k);
                ss_cond_wait(k, n, k-1);
                CORE_zgessm(
                    k == A.mt-1 ? A.m-k*A.mb : A.mb,
                    n == A.nt-1 ? A.n-n*A.nb : A.nb,
                    k == A.mt-1 ? A.m-k*A.mb : A.mb,
                    L.mb,
                    IPIV(k, k),
                    A(k, n), A.nb);
                /*
                 * No progress flag is set for (k, n): the rows below it in
                 * this column are processed next by this same thread.
                 */
            }
            else {
                /* Needs the panel tile (m, k) of this step and (m, n) from step k-1. */
                ss_cond_wait(m, k, k);
                ss_cond_wait(m, n, k-1);
                CORE_zssssm(
                    A.nb,
                    m == A.mt-1 ? A.m-m*A.mb : A.mb,
                    n == A.nt-1 ? A.n-n*A.nb : A.nb,
                    L.mb,
                    A.nb,
                    A(k, n), A.nb,
                    A(m, n), A.nb,
                    L(m, k), L.mb,
                    A(m, k), A.nb,
                    IPIV(m, k));
                ss_cond_set(m, n, k);
            }
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
    plasma_private_free(plasma, work);
    ss_finalize();
}

/***************************************************************************//**
 *  Parallel tile LU factorization - dynamic scheduling
 *
 *  Submits one QUARK task per tile kernel; nothing is computed in this
 *  function itself.  For every step k the tasks are inserted in this order:
 *  CORE_zgetrf on the diagonal tile, CORE_zgessm for every tile to its right,
 *  then for every tile row m below the diagonal a CORE_ztstrf followed by a
 *  CORE_zssssm for every trailing column n.  All tasks are attached to the
 *  QUARK sequence of `sequence`.  Nothing is submitted when the sequence
 *  already carries an error status.
 *
 *  @param[in] A         Descriptor of the matrix being factored.
 *  @param[in] L         Descriptor of the auxiliary L tiles.
 *  @param[in] IPIV      Pivot index array, addressed through IPIV(m,n).
 *  @param[in] sequence  Sequence the tasks belong to.
 *  @param[in] request   Request handed to the kernels for error reporting.
 **/
void plasma_pzgetrf_quark(PLASMA_desc A, PLASMA_desc L, int *IPIV, PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int k, m, n;
    int nsteps;
    /* Dimensions of the tile in row/column k, m, n; the last tile may be smaller. */
    int tempkm, tempkn, tempmm, tempnn;
    /* Offset added by the kernel to a non-zero error code. */
    int iinfo;
    /* TRUE only for the last tile row, where the kernel is asked to check info. */
    PLASMA_bool check_info;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    nsteps = min(A.mt, A.nt);
    for (k = 0; k < nsteps; k++) {
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.nb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        iinfo = A.nb*k;
        check_info = (k == A.mt-1);

        /* Factor the diagonal tile. */
        QUARK_Insert_Task(plasma->quark, CORE_zgetrf_quark, &task_flags,
            sizeof(int),                          &tempkm,     VALUE,
            sizeof(int),                          &tempkn,     VALUE,
            sizeof(int),                          &L.mb,       VALUE,
            sizeof(PLASMA_Complex64_t)*A.mb*A.nb, A(k, k),     INOUT | LOCALITY,
            sizeof(int),                          &A.nb,       VALUE,
            sizeof(int)*A.mb,                     IPIV(k, k),  OUTPUT,
            sizeof(PLASMA_sequence*),             &sequence,   VALUE,
            sizeof(PLASMA_request*),              &request,    VALUE,
            sizeof(PLASMA_bool),                  &check_info, VALUE,
            sizeof(int),                          &iinfo,      VALUE,
            0);

        /* Update the tiles to the right of the diagonal. */
        for (n = k+1; n < A.nt; n++) {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            QUARK_Insert_Task(plasma->quark, CORE_zgessm_quark, &task_flags,
                sizeof(int),                          &tempkm,    VALUE,
                sizeof(int),                          &tempnn,    VALUE,
                sizeof(int),                          &tempkm,    VALUE,
                sizeof(int),                          &L.mb,      VALUE,
                sizeof(int)*A.mb,                     IPIV(k, k), INPUT,
                sizeof(PLASMA_Complex64_t)*A.mb*A.nb, A(k, k),    INPUT,
                sizeof(int),                          &A.nb,      VALUE,
                sizeof(PLASMA_Complex64_t)*A.mb*A.nb, A(k, n),    INOUT | LOCALITY,
                sizeof(int),                          &A.nb,      VALUE,
                0);
        }

        for (m = k+1; m < A.mt; m++) {
            tempmm = m == A.mt-1 ? A.m-m*A.mb : A.nb;
            check_info = (m == A.mt-1);

            /* Factor tile (m, k) against the diagonal tile. */
            QUARK_Insert_Task(plasma->quark, CORE_ztstrf_quark, &task_flags,
                sizeof(int),                          &tempmm,     VALUE,
                sizeof(int),                          &tempkn,     VALUE,
                sizeof(int),                          &L.mb,       VALUE,
                sizeof(int),                          &A.nb,       VALUE,
                sizeof(PLASMA_Complex64_t)*A.mb*A.nb, A(k, k),     INOUT | LOCALITY,
                sizeof(int),                          &A.nb,       VALUE,
                sizeof(PLASMA_Complex64_t)*A.mb*A.nb, A(m, k),     INOUT,
                sizeof(int),                          &A.nb,       VALUE,
                sizeof(PLASMA_Complex64_t)*L.mb*L.nb, L(m, k),     OUTPUT,
                sizeof(int),                          &L.mb,       VALUE,
                sizeof(int)*A.mb,                     IPIV(m, k),  OUTPUT,
                sizeof(PLASMA_Complex64_t)*L.mb*L.nb, NULL,        SCRATCH,
                sizeof(int),                          &L.nb,       VALUE,
                sizeof(PLASMA_sequence*),             &sequence,   VALUE,
                sizeof(PLASMA_request*),              &request,    VALUE,
                sizeof(PLASMA_bool),                  &check_info, VALUE,
                sizeof(int),                          &iinfo,      VALUE,
                0);

            /* Apply that transformation across tile row m of the trailing matrix. */
            for (n = k+1; n < A.nt; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_Insert_Task(plasma->quark, CORE_zssssm_quark, &task_flags,
                    sizeof(int),                          &A.nb,      VALUE,
                    sizeof(int),                          &tempmm,    VALUE,
                    sizeof(int),                          &tempnn,    VALUE,
                    sizeof(int),                          &L.mb,      VALUE,
                    sizeof(int),                          &A.nb,      VALUE,
                    sizeof(PLASMA_Complex64_t)*A.mb*A.nb, A(k, n),    INOUT | LOCALITY,
                    sizeof(int),                          &A.nb,      VALUE,
                    sizeof(PLASMA_Complex64_t)*A.mb*A.nb, A(m, n),    INOUT,
                    sizeof(int),                          &A.nb,      VALUE,
                    sizeof(PLASMA_Complex64_t)*L.mb*L.nb, L(m, k),    INPUT,
                    sizeof(int),                          &L.mb,      VALUE,
                    sizeof(PLASMA_Complex64_t)*A.mb*A.nb, A(m, k),    INPUT,
                    sizeof(int),                          &A.nb,      VALUE,
                    sizeof(int)*A.mb,                     IPIV(m, k), INPUT,
                    0);
            }
        }
    }
}
