/* ///////////////////////////// P /// L /// A /// S /// M /// A /////////////////////////////// */
/* ///                    PLASMA computational routines (version 2.1.0)                      ///
 * ///                    Author: Jakub Kurzak                                               ///
 * ///                    Release Date: November, 15th 2009                                  ///
 * ///                    PLASMA is a software package provided by Univ. of Tennessee,       ///
 * ///                    Univ. of California Berkeley and Univ. of Colorado Denver          /// */
/* ///////////////////////////////////////////////////////////////////////////////////////////// */
#include "common.h"

/* /////////////////////////// P /// U /// R /// P /// O /// S /// E /////////////////////////// */
// PLASMA_dgels - solves overdetermined or underdetermined linear systems involving an M-by-N
// matrix A using the QR or the LQ factorization of A.  It is assumed that A has full rank.
// The following options are provided:
//
// 1. trans = PlasmaNoTrans and M >= N: find the least squares solution of an overdetermined
//    system, i.e., solve the least squares problem: minimize || B - A*X ||.
//
// 2. trans = PlasmaNoTrans and M < N:  find the minimum norm solution of an underdetermined
//    system A * X = B.
//
// Several right hand side vectors B and solution vectors X can be handled in a single call;
// they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS
// solution
// matrix X.

/* ///////////////////// A /// R /// G /// U /// M /// E /// N /// T /// S ///////////////////// */
// trans    PLASMA_enum (IN)
//          Intended usage:
//          = PlasmaNoTrans:   the linear system involves A;
//          = PlasmaTrans: the linear system involves A**T.
//          Currently only PlasmaNoTrans is supported.
//
// M        int (IN)
//          The number of rows of the matrix A. M >= 0.
//
// N        int (IN)
//          The number of columns of the matrix A. N >= 0.
//
// NRHS     int (IN)
//          The number of right hand sides, i.e., the number of columns of the matrices B and X.
//          NRHS >= 0.
//
// A        double* (INOUT)
//          On entry, the M-by-N matrix A.
//          On exit,
//          if M >= N, A is overwritten by details of its QR factorization as returned by
//                     PLASMA_dgeqrf;
//          if M < N, A is overwritten by details of its LQ factorization as returned by
//                      PLASMA_dgelqf.
//
// LDA      int (IN)
//          The leading dimension of the array A. LDA >= max(1,M).
//
// T        double* (OUT)
//          On exit, auxiliary factorization data.
//
// B        double* (INOUT)
//          On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise;
//          On exit, if return value = 0, B is overwritten by the solution vectors, stored
//          columnwise:
//          if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual
//          sum of squares for the solution in each column is given by the sum of squares of the
//          modulus of elements N+1 to M in that column;
//          if M < N, rows 1 to N of B contain the minimum norm solution vectors;
//
// LDB      int (IN)
//          The leading dimension of the array B. LDB >= MAX(1,M,N).

/* ///////////// R /// E /// T /// U /// R /// N /////// V /// A /// L /// U /// E ///////////// */
//          = 0: successful exit
//          < 0: if -i, the i-th argument had an illegal value

/* //////////////////////////////////// C /// O /// D /// E //////////////////////////////////// */
int PLASMA_dgels(PLASMA_enum trans, int M, int N, int NRHS, double *A, int LDA,
                 double *T, double *B, int LDB)
{
    int i, j;
    int NB, MT, NT, NTRHS;
    int status;
    double *Abdl;
    double *Bbdl;
    double *Tbdl;
    plasma_context_t *plasma;
    PLASMA_desc descA;
    PLASMA_desc descB;
    PLASMA_desc descT;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_dgels", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    /* Check input arguments */
    if (trans != PlasmaNoTrans) {
        plasma_error("PLASMA_dgels", "only PlasmaNoTrans supported");
        return PLASMA_ERR_NOT_SUPPORTED;
    }
    if (M < 0) {
        plasma_error("PLASMA_dgels", "illegal value of M");
        return -2;
    }
    if (N < 0) {
        plasma_error("PLASMA_dgels", "illegal value of N");
        return -3;
    }
    if (NRHS < 0) {
        plasma_error("PLASMA_dgels", "illegal value of NRHS");
        return -4;
    }
    if (LDA < max(1, M)) {
        plasma_error("PLASMA_dgels", "illegal value of LDA");
        return -6;
    }
    if (LDB < max(1, max(M, N))) {
        plasma_error("PLASMA_dgels", "illegal value of LDB");
        return -9;
    }
    /* Quick return */
    if (min(M, min(N, NRHS)) == 0) {
        for (i = 0; i < max(M, N); i++)
            for (j = 0; j < NRHS; j++)
                B[j*LDB+i] = 0.0;
        return PLASMA_SUCCESS;
    }

    /* Tune NB & IB depending on M, N & NRHS; Set NBNBSIZE */
    status = plasma_tune(PLASMA_FUNC_DGELS, M, N, NRHS);
    if (status != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dgels", "plasma_tune() failed");
        return status;
    }

    /* Set MT, NT & NTRHS */
    NB = PLASMA_NB;
    NT = (N%NB==0) ? (N/NB) : (N/NB+1);
    MT = (M%NB==0) ? (M/NB) : (M/NB+1);
    NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);

    /* Allocate memory for matrices in block layout */
    Abdl = (double *)plasma_shared_alloc(plasma, MT*NT*PLASMA_NBNBSIZE, PlasmaRealDouble);
    Tbdl = (double *)plasma_shared_alloc(plasma, MT*NT*PLASMA_IBNBSIZE, PlasmaRealDouble);
    Bbdl = (double *)plasma_shared_alloc(plasma, max(MT, NT)*NTRHS*PLASMA_NBNBSIZE, PlasmaRealDouble);
    if (Abdl == NULL || Tbdl == NULL || Bbdl == NULL) {
        plasma_error("PLASMA_dgels", "plasma_shared_alloc() failed");
        plasma_shared_free(plasma, Abdl);
        plasma_shared_free(plasma, Tbdl);
        plasma_shared_free(plasma, Bbdl);
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }

    if (M >= N) {
        descA = plasma_desc_init(
            Abdl, PlasmaRealDouble,
            PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
            M, N, 0, 0, M, N);

        descB = plasma_desc_init(
            Bbdl, PlasmaRealDouble,
            PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
            M, NRHS, 0, 0, M, NRHS);

        descT = plasma_desc_init(
            Tbdl, PlasmaRealDouble,
            PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
            M, N, 0, 0, M, N);

        plasma_parallel_call_3(plasma_lapack_to_tile,
            double*, A,
            int, LDA,
            PLASMA_desc, descA);

        plasma_parallel_call_3(plasma_lapack_to_tile,
            double*, B,
            int, LDB,
            PLASMA_desc, descB);
    }
    else {
        descA = plasma_desc_init(
            Abdl, PlasmaRealDouble,
            PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
            M, N, 0, 0, M, N);

        descB = plasma_desc_init(
            Bbdl, PlasmaRealDouble,
            PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
            N, NRHS, 0, 0, N, NRHS);

        descT = plasma_desc_init(
            Tbdl, PlasmaRealDouble,
            PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
            M, N, 0, 0, M, N);

        plasma_parallel_call_3(plasma_lapack_to_tile,
            double*, A,
            int, LDA,
            PLASMA_desc, descA);

        plasma_parallel_call_3(plasma_lapack_to_tile,
            double*, B,
            int, LDB,
            PLASMA_desc, descB);
    }

    /* Call the native interface */
    status = PLASMA_dgels_Tile(PlasmaNoTrans, &descA, &descT, &descB);

    if (status == PLASMA_SUCCESS) {
        if (M >= N) {
            /* Return T to the user */
            plasma_memcpy(T, Tbdl, MT*NT*PLASMA_IBNBSIZE, PlasmaRealDouble);

            plasma_parallel_call_3(plasma_tile_to_lapack,
                PLASMA_desc, descA,
                double*, A,
                int, LDA);

            plasma_parallel_call_3(plasma_tile_to_lapack,
                PLASMA_desc, descB,
                double*, B,
                int, LDB);
        }
        else {
            /* Return T to the user */
            plasma_memcpy(T, Tbdl, MT*NT*PLASMA_IBNBSIZE, PlasmaRealDouble);

            plasma_parallel_call_3(plasma_tile_to_lapack,
                PLASMA_desc, descA,
                double*, A,
                int, LDA);

            plasma_parallel_call_3(plasma_tile_to_lapack,
                PLASMA_desc, descB,
                double*, B,
                int, LDB);
        }
    }
    plasma_shared_free(plasma, Abdl);
    plasma_shared_free(plasma, Tbdl);
    plasma_shared_free(plasma, Bbdl);
    return status;
}

/* /////////////////////////// P /// U /// R /// P /// O /// S /// E /////////////////////////// */
// PLASMA_dgels_Tile - solves overdetermined or underdetermined linear systems involving an M-by-N
// matrix A using the QR or the LQ factorization of A.  It is assumed that A has full rank.
// The following options are provided:
//
// 1. trans = PlasmaNoTrans and M >= N: find the least squares solution of an overdetermined
//    system, i.e., solve the least squares problem: minimize || B - A*X ||.
//
// 2. trans = PlasmaNoTrans and M < N:  find the minimum norm solution of an underdetermined
//    system A * X = B.
//
// Several right hand side vectors B and solution vectors X can be handled in a single call;
// they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS
// solution
// matrix X.
// All matrices are passed through descriptors. All dimensions are taken from the descriptors.

/* ///////////////////// A /// R /// G /// U /// M /// E /// N /// T /// S ///////////////////// */
// trans    PLASMA_enum (IN)
//          Intended usage:
//          = PlasmaNoTrans:   the linear system involves A;
//          = PlasmaTrans: the linear system involves A**T.
//          Currently only PlasmaNoTrans is supported.
//
// A        double* (INOUT)
//          On entry, the M-by-N matrix A.
//          On exit,
//          if M >= N, A is overwritten by details of its QR factorization as returned by
//                     PLASMA_dgeqrf;
//          if M < N, A is overwritten by details of its LQ factorization as returned by
//                      PLASMA_dgelqf.
//
// T        double* (OUT)
//          On exit, auxiliary factorization data.
//
// B        double* (INOUT)
//          On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise;
//          On exit, if return value = 0, B is overwritten by the solution vectors, stored
//          columnwise:
//          if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual
//          sum of squares for the solution in each column is given by the sum of squares of the
//          modulus of elements N+1 to M in that column;
//          if M < N, rows 1 to N of B contain the minimum norm solution vectors;

/* ///////////// R /// E /// T /// U /// R /// N /////// V /// A /// L /// U /// E ///////////// */
//          = 0: successful exit

/* //////////////////////////////////// C /// O /// D /// E //////////////////////////////////// */
int PLASMA_dgels_Tile(PLASMA_enum trans, PLASMA_desc *A, PLASMA_desc *T, PLASMA_desc *B)
{
    PLASMA_desc descA = *A;
    PLASMA_desc descT = *T;
    PLASMA_desc descB = *B;
    plasma_context_t *plasma;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_dgels_Tile", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    /* Check descriptors for correctness */
    if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dgels_Tile", "invalid first descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (plasma_desc_check(&descT) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dgels_Tile", "invalid second descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dgels_Tile", "invalid third descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    /* Check input arguments */
    if (descA.nb != descA.mb || descB.nb != descB.mb) {
        plasma_error("PLASMA_dgels_Tile", "only square tiles supported");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (trans != PlasmaNoTrans) {
        plasma_error("PLASMA_dgels_Tile", "only PlasmaNoTrans supported");
        return PLASMA_ERR_NOT_SUPPORTED;
    }
    /* Quick return  - currently NOT equivalent to LAPACK's:
    if (min(M, min(N, NRHS)) == 0) {
        for (i = 0; i < max(M, N); i++)
            for (j = 0; j < NRHS; j++)
                B[j*LDB+i] = 0.0;
        return PLASMA_SUCCESS;
    }
*/
    if (descA.m >= descA.n) {
        plasma_parallel_call_2(plasma_pdgeqrf,
            PLASMA_desc, descA,
            PLASMA_desc, descT);

        plasma_parallel_call_3(plasma_pdormqr,
            PLASMA_desc, descA,
            PLASMA_desc, descB,
            PLASMA_desc, descT);

        plasma_parallel_call_7(plasma_pdtrsm,
            PLASMA_enum, PlasmaLeft,
            PLASMA_enum, PlasmaUpper,
            PLASMA_enum, PlasmaNoTrans,
            PLASMA_enum, PlasmaNonUnit,
            double, 1.0,
            PLASMA_desc, plasma_desc_submatrix(descA, 0, 0, descA.n, descA.n),
            PLASMA_desc, plasma_desc_submatrix(descB, 0, 0, descA.n, descB.n));
    }
    else {
        plasma_parallel_call_1(plasma_tile_zero,
            PLASMA_desc, plasma_desc_submatrix(descB, descA.m, 0, descA.n-descA.m, descB.n));

        plasma_parallel_call_2(plasma_pdgelqf,
            PLASMA_desc, descA,
            PLASMA_desc, descT);

        plasma_parallel_call_7(plasma_pdtrsm,
            PLASMA_enum, PlasmaLeft,
            PLASMA_enum, PlasmaLower,
            PLASMA_enum, PlasmaNoTrans,
            PLASMA_enum, PlasmaNonUnit,
            double, 1.0,
            PLASMA_desc, plasma_desc_submatrix(descA, 0, 0, descA.m, descA.m),
            PLASMA_desc, plasma_desc_submatrix(descB, 0, 0, descA.m, descB.n));

        plasma_parallel_call_3(plasma_pdormlq,
            PLASMA_desc, descA,
            PLASMA_desc, descB,
            PLASMA_desc, descT);
    }
    return PLASMA_SUCCESS;
}
