/**
 *
 * @file dsungesv.c
 *
 *  PLASMA computational routines
 *  PLASMA is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 * @version 2.2.0
 * @author Emmanuel Agullo
 * @date 2009-11-15
 *
 **/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <lapack.h>
#include "common.h"

#define PLASMA_dlag2s(_descA, _descB) plasma_static_call_4(plasma_pdlag2s, PLASMA_desc, _descA, PLASMA_desc, _descB, PLASMA_sequence*, sequence, PLASMA_request*, request)
#define PLASMA_slag2d(_descA, _descB) plasma_static_call_4(plasma_pslag2d, PLASMA_desc, _descA, PLASMA_desc, _descB, PLASMA_sequence*, sequence, PLASMA_request*, request)
#define PLASMA_dlange(_norm, _descA, _result, _work, _counter) _result = 0;    \
                                                     plasma_static_call_5(plasma_pdlange, int, _norm, PLASMA_desc, _descA, double*, _work, PLASMA_sequence*, sequence, PLASMA_request*, request);\
                                                     for (_counter = 0; _counter < PLASMA_SIZE; _counter++){if (((double *)_work)[_counter] > _result) _result = ((double *)_work)[_counter];}

#define PLASMA_dlacpy(_descA, _descB) plasma_static_call_4(plasma_pdlacpy, PLASMA_desc, _descA, PLASMA_desc, _descB, PLASMA_sequence*, sequence, PLASMA_request*, request)
#define PLASMA_daxpy(_alpha, _descA, _descB) plasma_static_call_5(plasma_pdaxpy, double, _alpha, PLASMA_desc, _descA, PLASMA_desc, _descB, PLASMA_sequence*, sequence, PLASMA_request*, request)

/***************************************************************************//**
 *
 * @ingroup double
 *
 *  PLASMA_dsungesv - Solves overdetermined or underdetermined linear systems involving an M-by-N
 *  matrix A using the QR or the LQ factorization of A.  It is assumed that A has full rank.
 *  The following options are provided:
 *
 *  # trans = PlasmaNoTrans and M >= N: find the least squares solution of an overdetermined
 *    system, i.e., solve the least squares problem: minimize || B - A*X ||.
 *
 *  # trans = PlasmaNoTrans and M < N:  find the minimum norm solution of an underdetermined
 *    system A * X = B.
 *
 *  Several right hand side vectors B and solution vectors X can be handled in a single call;
 *  they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS
 *  solution matrix X.
 *
 *  PLASMA_dsungesv first attempts to factorize the matrix in COMPLEX and use this
 *  factorization within an iterative refinement procedure to produce a
 *  solution with COMPLEX*16 normwise backward error quality (see below).
 *  If the approach fails the method switches to a COMPLEX*16
 *  factorization and solve.
 *
 *  The iterative refinement is not going to be a winning strategy if
 *  the ratio COMPLEX performance over COMPLEX*16 performance is too
 *  small. A reasonable strategy should take the number of right-hand
 *  sides and the size of the matrix into account. This might be done
 *  with a call to ILAENV in the future. Up to now, we always try
 *  iterative refinement.
 *
 *  The iterative refinement process is stopped if ITER > ITERMAX or
 *  for all the RHS we have: RNRM < N*XNRM*ANRM*EPS*BWDMAX
 *  where:
 *
 *  - ITER is the number of the current iteration in the iterative refinement process
 *  - RNRM is the infinity-norm of the residual
 *  - XNRM is the infinity-norm of the solution
 *  - ANRM is the infinity-operator-norm of the matrix A
 *  - EPS is the machine epsilon returned by DLAMCH('Epsilon').
 *
 *  Actually, in its current state (PLASMA 2.2.0), the test is slightly relaxed.
 *
 *  The values ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.
 *
 *  We follow Bjorck's algorithm proposed in "Iterative Refinement of Linear
 *  Least Squares solutions I", BIT, 7:257-278, 1967.
 *
 *******************************************************************************
 *
 * @param[in] trans
 *          Intended usage:
 *          = PlasmaNoTrans:   the linear system involves A;
 *          = PlasmaTrans: the linear system involves A**H.
 *          Currently only PlasmaNoTrans is supported.
 *
 * @param[in] M
 *          The number of rows of the matrix A. M >= 0.
 *
 * @param[in] N
 *          The number of columns of the matrix A. N >= 0.
 *
 * @param[in] NRHS
 *          The number of right hand sides, i.e., the number of columns of the matrices B and X.
 *          NRHS >= 0.
 *
 * @param[in] A
 *          The M-by-N matrix A. This matrix is not modified.
 *
 * @param[in] LDA
 *          The leading dimension of the array A. LDA >= max(1,M).
 *
 * @param[in] B
 *          The M-by-NRHS matrix B of right hand side vectors, stored columnwise. Not modified.
 *
 * @param[in] LDB
 *          The leading dimension of the array B. LDB >= MAX(1,M,N).
 *
 * @param[out] X
 *          If return value = 0, the solution vectors, stored columnwise.
 *          if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual
 *          sum of squares for the solution in each column is given by the sum of squares of the
 *          modulus of elements N+1 to M in that column;
 *          if M < N, rows 1 to N of B contain the minimum norm solution vectors;
 *
 * @param[in] LDX
 *          The leading dimension of the array B. LDB >= MAX(1,M,N).
 *
 * @param[out] ITER
 *          The number of the current iteration in the iterative refinement process
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *          \retval <0 if -i, the i-th argument had an illegal value
 *
 *******************************************************************************
 *
 * @sa PLASMA_dsungesv_Tile
 * @sa PLASMA_dsungesv_Tile_Async
 * @sa PLASMA_dsungesv
 * @sa PLASMA_zgels
 *
 ******************************************************************************/
int PLASMA_dsungesv(PLASMA_enum trans, int N, int NRHS,
                    double *A, int LDA,
                    double *B, int LDB,
                    double *X, int LDX, int *ITER)
{
    int NB, NT, NTRHS;
    int status;
    double *Abdl;
    double *Bbdl;
    double *Xbdl;
    PLASMA_desc  descA;
    PLASMA_desc  descB;
    PLASMA_desc *descT;
    PLASMA_desc  descX;
    plasma_context_t *plasma;
    PLASMA_sequence *sequence = NULL;
    PLASMA_request request = PLASMA_REQUEST_INITIALIZER;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_dsungesv", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    /* Check input arguments */
    if (trans != PlasmaNoTrans) {
        plasma_error("PLASMA_dsungesv", "only PlasmaNoTrans supported");
        return PLASMA_ERR_NOT_SUPPORTED;
    }
    if (N < 0) {
        plasma_error("PLASMA_dsungesv", "illegal value of N");
        return -2;
    }
    if (NRHS < 0) {
        plasma_error("PLASMA_dsungesv", "illegal value of NRHS");
        return -3;
    }
    if (LDA < max(1, N)) {
        plasma_error("PLASMA_dsungesv", "illegal value of LDA");
        return -5;
    }
    if (LDB < max(1, N)) {
        plasma_error("PLASMA_dsungesv", "illegal value of LDB");
        return -8;
    }
    if (LDX < max(1, N)) {
        plasma_error("PLASMA_dsungesv", "illegal value of LDX");
        return -9;
    }

    /* Quick return */
    if ( N == 0 )
        return PLASMA_SUCCESS;

    /* Tune NB & IB depending on M, N & NRHS; Set NBNBSIZE */
    status = plasma_tune(PLASMA_FUNC_DSGELS, N, N, NRHS);
    if (status != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dsungesv", "plasma_tune() failed");
        return status;
    }

    /* Set MT, NT & NTRHS */
    NB = PLASMA_NB;
    NT    = (N%NB   ==0) ? (N/NB)    : (N/NB+1);
    NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);

    /* DOUBLE PRECISION INITIALIZATION */
    /* Allocate memory for matrices in block layout */
    Abdl = (double *)plasma_shared_alloc(plasma, NT*NT   *PLASMA_NBNBSIZE, PlasmaRealDouble);
    Bbdl = (double *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaRealDouble);
    Xbdl = (double *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaRealDouble);
    if (Abdl == NULL || Bbdl == NULL || Xbdl == NULL) {
        plasma_error("PLASMA_dsungesv", "plasma_shared_alloc() failed");
        plasma_shared_free(plasma, Abdl);
        plasma_shared_free(plasma, Bbdl);
        plasma_shared_free(plasma, Xbdl);
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }

    plasma_sequence_create(plasma, &sequence);

    descA = plasma_desc_init(
        Abdl, PlasmaRealDouble,
        PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
        N, N, 0, 0, N, N);

    descB = plasma_desc_init(
        Bbdl, PlasmaRealDouble,
        PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
        N, NRHS, 0, 0, N, NRHS);

    descX = plasma_desc_init(
        Xbdl, PlasmaRealDouble,
        PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
        N, NRHS, 0, 0, N, NRHS);

    plasma_parallel_call_5(plasma_dlapack_to_tile,
        double*, A,
        int, LDA,
        PLASMA_desc, descA,
        PLASMA_sequence*, sequence,
        PLASMA_request*, &request);

    plasma_parallel_call_5(plasma_dlapack_to_tile,
        double*, B,
        int, LDB,
        PLASMA_desc, descB,
        PLASMA_sequence*, sequence,
        PLASMA_request*, &request);

    /* Allocate workspace */
    PLASMA_Alloc_Workspace_dgels_Tile(N, N, &descT);

    /* Call the native interface */
    status = PLASMA_dsungesv_Tile_Async(PlasmaNoTrans, &descA, descT, &descB, &descX, ITER,
                                        sequence, &request);

    if (status == PLASMA_SUCCESS) {
        plasma_parallel_call_5(plasma_dtile_to_lapack,
            PLASMA_desc, descX,
            double*, X,
            int, LDX,
            PLASMA_sequence*, sequence,
            PLASMA_request*, &request);
    }

    plasma_dynamic_sync();
    PLASMA_Dealloc_Handle_Tile(&descT);
    plasma_sequence_destroy(plasma, sequence);
    plasma_shared_free(plasma, Abdl);
    plasma_shared_free(plasma, Bbdl);
    plasma_shared_free(plasma, Xbdl);
    return status;
}

/***************************************************************************//**
 *
 * @ingroup double_Tile
 *
 *  PLASMA_dsungesv_Tile - Solves overdetermined or underdetermined linear systems involving an M-by-N
 *  matrix A using the QR or the LQ factorization of A.  It is assumed that A has full rank.
 *  All matrices are passed through descriptors. All dimensions are taken from the descriptors.
 *  The following options are provided:
 *
 *  # trans = PlasmaNoTrans and M >= N: find the least squares solution of an overdetermined
 *    system, i.e., solve the least squares problem: minimize || B - A*X ||.
 *
 *  # trans = PlasmaNoTrans and M < N:  find the minimum norm solution of an underdetermined
 *    system A * X = B.
 *
 *  Several right hand side vectors B and solution vectors X can be handled in a single call;
 *  they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS
 *  solution matrix X.
 *
 *  PLASMA_dsungesv first attempts to factorize the matrix in COMPLEX and use this
 *  factorization within an iterative refinement procedure to produce a
 *  solution with COMPLEX*16 normwise backward error quality (see below).
 *  If the approach fails the method switches to a COMPLEX*16
 *  factorization and solve.
 *
 *  The iterative refinement is not going to be a winning strategy if
 *  the ratio COMPLEX performance over COMPLEX*16 performance is too
 *  small. A reasonable strategy should take the number of right-hand
 *  sides and the size of the matrix into account. This might be done
 *  with a call to ILAENV in the future. Up to now, we always try
 *  iterative refinement.
 *
 *  The iterative refinement process is stopped if ITER > ITERMAX or
 *  for all the RHS we have: RNRM < N*XNRM*ANRM*EPS*BWDMAX
 *  where:
 *
 *  - ITER is the number of the current iteration in the iterative refinement process
 *  - RNRM is the infinity-norm of the residual
 *  - XNRM is the infinity-norm of the solution
 *  - ANRM is the infinity-operator-norm of the matrix A
 *  - EPS is the machine epsilon returned by DLAMCH('Epsilon').
 *
 *  Actually, in its current state (PLASMA 2.2.0), the test is slightly relaxed.
 *
 *  The values ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.
 *
 *  We follow Bjorck's algorithm proposed in "Iterative Refinement of Linear
 *  Least Squares solutions I", BIT, 7:257-278, 1967.
 *
 *******************************************************************************
 *
 * @param[in] trans
 *          Intended usage:
 *          = PlasmaNoTrans:   the linear system involves A;
 *          = PlasmaTrans: the linear system involves A**H.
 *          Currently only PlasmaNoTrans is supported.
 *
 * @param[in,out] A
 *          - If the iterative refinement converged, A is not modified;
 *          - otherwise, it fell back to double precision solution, and
 *          on exit the M-by-N matrix A contains:
 *          if M >= N, A is overwritten by details of its QR factorization as returned by
 *                     PLASMA_zgeqrf;
 *          if M < N, A is overwritten by details of its LQ factorization as returned by
 *                      PLASMA_zgelqf.
 *
 * @param[out] T
 *          On exit:
 *          - if the iterative refinement converged, T is not modified;
 *          - otherwise, it fell back to double precision solution,
 *          and then T is an auxiliary factorization data.
 *
 * @param[in,out] B
 *          On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise;
 *          On exit, if return value = 0, B is overwritten by the solution vectors, stored
 *          columnwise:
 *          if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual
 *          sum of squares for the solution in each column is given by the sum of squares of the
 *          modulus of elements N+1 to M in that column;
 *          if M < N, rows 1 to N of B contain the minimum norm solution vectors;
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *
 *******************************************************************************
 *
 * @sa PLASMA_dsungesv
 * @sa PLASMA_dsungesv_Tile_Async
 * @sa PLASMA_dsungesv_Tile
 * @sa PLASMA_zgels_Tile
 *
 ******************************************************************************/
int PLASMA_dsungesv_Tile(PLASMA_enum trans, PLASMA_desc *A, PLASMA_desc *T,
                         PLASMA_desc *B, PLASMA_desc *X, int *ITER)
{
    plasma_context_t *plasma;
    PLASMA_sequence *sequence = NULL;
    PLASMA_request request = PLASMA_REQUEST_INITIALIZER;
    int status;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_dsungesv_Tile", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    plasma_sequence_create(plasma, &sequence);
    status = PLASMA_dsungesv_Tile_Async(trans, A, T, B, X, ITER, sequence, &request);
    if (status != PLASMA_SUCCESS)
        return status;
    plasma_dynamic_sync();
    status = sequence->status;
    plasma_sequence_destroy(plasma, sequence);
    return status;
}

/***************************************************************************//**
 *
 * @ingroup double_Tile_Async
 *
 *  Non-blocking equivalent of PLASMA_dsungesv_Tile().
 *  Returns control to the user thread before worker threads finish the computation
 *  to allow for pipelined execution of diferent routines.
 *
 *******************************************************************************
 *
 * @param[in] sequence
 *          Identifies the sequence of function calls that this call belongs to
 *          (for completion checks and exception handling purposes).
 *
 * @param[out] request
 *          Identifies this function call (for exception handling purposes).
 *
 *******************************************************************************
 *
 * @sa PLASMA_dsungesv
 * @sa PLASMA_dsungesv_Tile
 * @sa PLASMA_dsungesv_Tile_Async
 * @sa PLASMA_zgels_Tile_Async
 *
 ******************************************************************************/
int PLASMA_dsungesv_Tile_Async(PLASMA_enum trans, PLASMA_desc *A, PLASMA_desc *T,
                               PLASMA_desc *B, PLASMA_desc *X, int *ITER,
                               PLASMA_sequence *sequence, PLASMA_request *request)
{
    int N, NRHS, NB, NT, NTRHS;
    PLASMA_desc descA = *A;
    PLASMA_desc descT = *T;
    PLASMA_desc descB = *B;
    PLASMA_desc descX = *X;
    float *SAbdl;
    float *STbdl;
    float *SXbdl;
    double *Rbdl;
    plasma_context_t *plasma;
    int counter;
    double *work;

    const int    itermax = 30;
    const double bwdmax  = 1.0;
    const double negone = -1.0;
    const double one = 1.0;
    int iiter;
    double Anorm, cte, eps, Rnorm, Xnorm;
    *ITER=0;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_dsungesv_Tile", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    /* Check descriptors for correctness */
    if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dsungesv_Tile", "invalid first descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (plasma_desc_check(&descT) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dsungesv_Tile", "invalid second descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dsungesv_Tile", "invalid third descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (plasma_desc_check(&descX) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dsungesv_Tile", "invalid fourth descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    /* Check input arguments */
    if ( (descA.nb != descA.mb) || (descB.nb != descB.mb) || (descX.nb != descX.mb) ||
         (descA.mb != descB.mb) || (descB.mb != descX.mb) ) {
        plasma_error("PLASMA_dsungesv_Tile", "only square tiles of same size are supported");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (trans != PlasmaNoTrans) {
        plasma_error("PLASMA_dsungesv_Tile", "only PlasmaNoTrans supported");
        return PLASMA_ERR_NOT_SUPPORTED;
    }

    /* Set N, NRHS, NB, NT & NTRHS */
    N     = descA.lm;
    NRHS  = descB.ln;
    NB    = PLASMA_NB;
    NT    = (   N%NB==0) ?    (N/NB) :    (N/NB+1);
    NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);

    work = (double *)plasma_shared_alloc(plasma, PLASMA_SIZE, PlasmaRealDouble);
    if (work == NULL) {
        plasma_error("PLASMA_dsungesv", "plasma_shared_alloc() failed");
        plasma_shared_free(plasma, work);
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }

    Rbdl = (double *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaRealDouble);
    if (Rbdl == NULL) {
        plasma_error("PLASMA_dsgesv", "plasma_shared_alloc() failed");
        plasma_shared_free(plasma, Rbdl);
        plasma_shared_free(plasma, work);
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }

    PLASMA_desc descR = plasma_desc_init(
        Rbdl, PlasmaRealDouble,
        PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
        N, NRHS, 0, 0, N, NRHS);

    /* Allocate memory for single precision matrices in block layout */
    SAbdl = (float *)plasma_shared_alloc(plasma, NT*NT   *PLASMA_NBNBSIZE, PlasmaRealFloat);
    STbdl = (float *)plasma_shared_alloc(plasma, NT*NT   *PLASMA_IBNBSIZE, PlasmaRealFloat);
    SXbdl = (float *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaRealFloat);
    if (SAbdl == NULL || STbdl == NULL || SXbdl == NULL) {
        plasma_error("PLASMA_dsgesv", "plasma_shared_alloc() failed");
        plasma_shared_free(plasma, SAbdl);
        plasma_shared_free(plasma, STbdl);
        plasma_shared_free(plasma, SXbdl);
        plasma_shared_free(plasma, Rbdl);
        plasma_shared_free(plasma, work);
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }

    PLASMA_desc descSA = plasma_desc_init(
            SAbdl, PlasmaRealFloat,
            PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
            N, N, 0, 0, N, N);

    PLASMA_desc descST = plasma_desc_init(
            STbdl, PlasmaRealFloat,
            PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
            N, N, 0, 0, N, N);

    PLASMA_desc descSX = plasma_desc_init(
            SXbdl, PlasmaRealFloat,
            PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
            N, NRHS, 0, 0, N, NRHS);

    /* Compute some constants */
    PLASMA_dlange(lapack_inf_norm, descA, Anorm, work, counter);
    eps = lapack_dlamch(lapack_eps);
    cte = Anorm*eps*((double) N)*bwdmax;

    /* Convert B from double precision to single precision and store
       the result in SX. */
    PLASMA_dlag2s(descB, descSX);

    /* Convert A from double precision to single precision and store
       the result in SA. */
    PLASMA_dlag2s(descA, descSA);

    /* Set INFO to ZERO */
    PLASMA_INFO = PLASMA_SUCCESS;

    /* Compute the QR factorization of SA */
    plasma_parallel_call_4(plasma_psgeqrf,
        PLASMA_desc, descSA,
        PLASMA_desc, descST,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    /* Compute the solve in simple */
    plasma_parallel_call_5(plasma_psormqr,
        PLASMA_desc, descSA,
        PLASMA_desc, descSX,
        PLASMA_desc, descST,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    plasma_parallel_call_9(plasma_pstrsm,
        PLASMA_enum, PlasmaLeft,
        PLASMA_enum, PlasmaUpper,
        PLASMA_enum, PlasmaNoTrans,
        PLASMA_enum, PlasmaNonUnit,
        float, 1.0,
        PLASMA_desc, descSA,
        PLASMA_desc, descSX,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    /* Convert SX back to double precision */
    PLASMA_slag2d(descSX, descX);

    /* Compute R = B - AX. */
    PLASMA_dlacpy(descB, descR);

    plasma_parallel_call_9(plasma_pdgemm,
        PLASMA_enum, PlasmaNoTrans,
        PLASMA_enum, PlasmaNoTrans,
        double, negone,
        PLASMA_desc, descA,
        PLASMA_desc, descX,
        double, one,
        PLASMA_desc, descR,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    /* Check whether the NRHS normwise backward error satisfies the
       stopping criterion. If yes return. Note that ITER=0 (already set). */
    PLASMA_dlange(lapack_inf_norm, descX, Xnorm, work, counter);
    PLASMA_dlange(lapack_inf_norm, descR, Rnorm, work, counter);

    if (Rnorm < Xnorm * cte){
        /* The NRHS normwise backward errors satisfy the
           stopping criterion. We are good to exit. */
        plasma_shared_free(plasma, SAbdl);
        plasma_shared_free(plasma, STbdl);
        plasma_shared_free(plasma, SXbdl);
        plasma_shared_free(plasma, Rbdl);
        plasma_shared_free(plasma, work);
        return PLASMA_INFO;
    }

    /* Iterative refinement */
    for (iiter = 0; iiter < itermax; iiter++){

        /* Convert R from double precision to single precision
           and store the result in SX. */
        PLASMA_dlag2s(descR, descSX);

        plasma_parallel_call_5(plasma_psormqr,
            PLASMA_desc, descSA,
            PLASMA_desc, descSX,
            PLASMA_desc, descST,
            PLASMA_sequence*, sequence,
            PLASMA_request*, request);

        plasma_parallel_call_9(plasma_pstrsm,
            PLASMA_enum, PlasmaLeft,
            PLASMA_enum, PlasmaUpper,
            PLASMA_enum, PlasmaNoTrans,
            PLASMA_enum, PlasmaNonUnit,
            float, (float)1.0,
            PLASMA_desc, descSA,
            PLASMA_desc, descSX,
            PLASMA_sequence*, sequence,
            PLASMA_request*, request);

        /* Convert SX back to double precision and update the current
           iterate. */
        PLASMA_slag2d(descSX, descR);
        PLASMA_daxpy(one, descR, descX);

        /* Compute R = B - AX. */
        PLASMA_dlacpy(descB,descR);
        plasma_parallel_call_9(plasma_pdgemm,
            PLASMA_enum, PlasmaNoTrans,
            PLASMA_enum, PlasmaNoTrans,
            double, negone,
            PLASMA_desc, descA,
            PLASMA_desc, descX,
            double, one,
            PLASMA_desc, descR,
            PLASMA_sequence*, sequence,
            PLASMA_request*, request);

        /* Check whether the NRHS normwise backward errors satisfy the
           stopping criterion. If yes, set ITER=IITER>0 and return. */
        PLASMA_dlange(lapack_inf_norm, descX, Xnorm, work, counter);
        PLASMA_dlange(lapack_inf_norm, descR, Rnorm, work, counter);

        if (Rnorm < Xnorm * cte){
            /* The NRHS normwise backward errors satisfy the
               stopping criterion. We are good to exit. */
            *ITER = iiter;

            plasma_shared_free(plasma, SAbdl);
            plasma_shared_free(plasma, STbdl);
            plasma_shared_free(plasma, SXbdl);
            plasma_shared_free(plasma, Rbdl);
            plasma_shared_free(plasma, work);
            return PLASMA_INFO;
        }
    }

    /* We have performed ITER=itermax iterations and never satisified
       the stopping criterion, set up the ITER flag accordingly and
       follow up on double precision routine. */
    *ITER = -itermax - 1;

    plasma_shared_free(plasma, SAbdl);
    plasma_shared_free(plasma, STbdl);
    plasma_shared_free(plasma, SXbdl);
    plasma_shared_free(plasma, Rbdl);
    plasma_shared_free(plasma, work);

    /* Single-precision iterative refinement failed to converge to a
       satisfactory solution, so we restart to double precision. */
    PLASMA_dlacpy(descB, descX);

    plasma_parallel_call_4(plasma_pdgeqrf,
        PLASMA_desc, descA,
        PLASMA_desc, descT,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    plasma_parallel_call_5(plasma_pdormqr,
        PLASMA_desc, descA,
        PLASMA_desc, descX,
        PLASMA_desc, descT,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    plasma_parallel_call_9(plasma_pdtrsm,
        PLASMA_enum, PlasmaLeft,
        PLASMA_enum, PlasmaUpper,
        PLASMA_enum, PlasmaNoTrans,
        PLASMA_enum, PlasmaNonUnit,
        double, (double)1.0,
        PLASMA_desc, descA,
        PLASMA_desc, descX,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    return PLASMA_SUCCESS;
}
