/* ///////////////////////////////////////////////////////////////////////////////////////
 *  -- PLASMA --
 *     University of Tennessee
 */
#include "common.h"

#include <stdio.h>
#include <math.h>
#include <cblas.h>
#include "../src/lapack.h"

/* ///////////////////////////// P /// L /// A /// S /// M /// A /////////////////////////////// */
/* ///                    PLASMA computational routine (version 2.0.0)                       ///
 * ///                    Release Date: July, 4th 2009                                       ///
 * ///                    PLASMA is a software package provided by Univ. of Tennessee,       ///
 * ///                    Univ. of California Berkeley and Univ. of Colorado Denver          /// */

/* /////////////////////////// P /// U /// R /// P /// O /// S /// E /////////////////////////// */
// PLASMA_zcgesv - Computes the solution to a system of linear equations A * X = B,
// where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
// The tile LU decomposition with partial tile pivoting and row interchanges is used to factor A.
// The factored form of A is then used to solve the system of equations A * X = B.
// All matrices are passed through descriptors. All dimensions are taken from the descriptors.
//
// IMPORTANT NOTICE: in its current state, this routine only intends to be a proof-of-concept. 
// There are still some costly serial parts and one may NOT expect to achieve high performance.

// PLASMA_zcgesv first attempts to factorize the matrix in COMPLEX and use this
// factorization within an iterative refinement procedure to produce a
// solution with COMPLEX*16 normwise backward error quality (see below).
// If the approach fails the method switches to a COMPLEX*16
// factorization and solve.
//
// The iterative refinement is not going to be a winning strategy if
// the ratio COMPLEX performance over COMPLEX*16 performance is too
// small. A reasonable strategy should take the number of right-hand
// sides and the size of the matrix into account. This might be done
// with a call to ILAENV in the future. Up to now, we always try
// iterative refinement.
//
// The iterative refinement process is stopped if ITER > ITERMAX or
// for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
// where
//
// - ITER is the number of the current iteration in the iterative refinement process
// - RNRM is the infinity-norm of the residual
// - XNRM is the infinity-norm of the solution
// - ANRM is the infinity-operator-norm of the matrix A
// - EPS is the machine epsilon returned by DLAMCH('Epsilon').
//
// The values ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.

/* ///////////////////// A /// R /// G /// U /// M /// E /// N /// T /// S ///////////////////// */
// N        int (IN)
//          The number of linear equations, i.e., the order of the matrix A. N >= 0.
//
// NRHS     int (IN)
//          The number of right hand sides, i.e., the number of columns of the matrix B.
//          NRHS >= 0.
//
// A        PLASMA_Complex64_t* (INOUT)
//          On entry, the N-by-N coefficient matrix A.
//          On exit, the tile L and U factors from the factorization (not equivalent to LAPACK).
//
// LDA      int (IN)
//          The leading dimension of the array A. LDA >= max(1,N).
//
// L        PLASMA_Complex64_t* (OUT)
//          On exit, auxiliary factorization data, related to the tile L factor,
//          necessary to solve the system of equations.
//
// IPIV     int* (OUT)
//          On exit, the pivot indices that define the permutations (not equivalent to LAPACK).
//
// B        PLASMA_Complex64_t* (IN)
//          The N-by-NRHS matrix of right hand side matrix B.
//
// LDB      int (IN)
//          The leading dimension of the array B. LDB >= max(1,N).
//
// X        PLASMA_Complex64_t* (OUT)
//          If return value = 0, the N-by-NRHS solution matrix X.
//
// LDX      int (IN)
//          The leading dimension of the array B. LDX >= max(1,N).
//
// ITER     int* (OUT)is the number of the current iteration in the iterative refinement process


/* ///////////// R /// E /// T /// U /// R /// N /////// V /// A /// L /// U /// E ///////////// */
//          = 0: successful exit
//          < 0: if -i, the i-th argument had an illegal value
//          > 0: if i, U(i,i) is exactly zero. The factorization has been completed,
//               but the factor U is exactly singular, so the solution could not be computed.

/* //////////////////////////////////// C /// O /// D /// E //////////////////////////////////// */

  int PLASMA_zcgesv_Tile(PLASMA_desc *Atile, PLASMA_desc *Ltile, int *IPIV, PLASMA_desc *Btile, PLASMA_desc *Xtile, int *ITER)
{

    PLASMA_desc descA = *Atile;
    PLASMA_desc descL = *Ltile;
    PLASMA_desc descX = *Xtile;
    PLASMA_desc descB = *Btile;

    int N, LDA, LDB, LDX, NRHS, NTRHS, NB, NT;
    N=LDA=LDB=LDX=descA.n;
    NRHS=descX.n;

    PLASMA_Complex32_t *SAbdl;
    PLASMA_Complex32_t *SXbdl;
    PLASMA_Complex32_t *SLbdl;
    PLASMA_Complex64_t *work;
    PLASMA_Complex32_t *swork;
    double *rwork;
    PLASMA_Complex64_t *Abdl;
    PLASMA_Complex64_t *Xbdl;
    PLASMA_Complex64_t *Lbdl;
    plasma_context_t *plasma;

    const int itermax = 30;
    const double bwdmax = 1.0;
    const PLASMA_Complex64_t negone = -1.0;
    const PLASMA_Complex64_t one = 1.0;

    char norm='I';
    char all='A';

    int info;
    int i, iiter, ptsa, ptsx;
    double Anorm, cte, eps, Rnorm, Xnorm;

    *ITER=0;

    plasma = plasma_context_self();

    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_dgesv_Tile", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    /* Check descriptors for correctness */
    if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dgesv_Tile", "invalid first descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (plasma_desc_check(&descL) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dgesv_Tile", "invalid second descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dgesv_Tile", "invalid third descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (plasma_desc_check(&descX) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_dgesv_Tile", "invalid fourth descriptor");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }

    /* Set NT & NTRHS */
    NB = PLASMA_NB;
    NT = (N%NB==0) ? (N/NB) : (N/NB+1);
    NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);

    /* Allocate memory for working arrays */
    work = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, N*NRHS, PlasmaComplexDouble);
    swork = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, N*(N+NRHS), PlasmaComplexFloat);
    rwork = (double *)plasma_shared_alloc(plasma, N, PlasmaRealDouble);
    /* Allocate memory for matrices in Lapack layout */
    PLASMA_Complex64_t *A = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_NBNBSIZE, PlasmaComplexDouble);
    PLASMA_Complex64_t *L = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexDouble);
    PLASMA_Complex64_t *B = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaComplexDouble);
    PLASMA_Complex64_t *X = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaComplexDouble);
    /* Allocate memory for matrices in block layout */
    SAbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_NBNBSIZE, PlasmaComplexFloat);
    SLbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexFloat);
    SXbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaComplexFloat);
    if (work == NULL || swork == NULL || rwork == NULL || SAbdl == NULL || SLbdl == NULL || SXbdl == NULL || A == NULL || L == NULL || B == NULL || X == NULL) {
        plasma_error("PLASMA_zcgesv", "plasma_shared_alloc() failed");
        plasma_shared_free(plasma, work);
        plasma_shared_free(plasma, swork);
        plasma_shared_free(plasma, rwork);
        plasma_shared_free(plasma, A);
        plasma_shared_free(plasma, L);
        plasma_shared_free(plasma, B);
        plasma_shared_free(plasma, X);
        plasma_shared_free(plasma, SAbdl);
        plasma_shared_free(plasma, SLbdl);
        plasma_shared_free(plasma, SXbdl);
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }

    plasma_parallel_call_3(plasma_tile_to_lapack,
        PLASMA_desc, descA,
        PLASMA_Complex64_t*, A,
        int, LDA);

    plasma_parallel_call_3(plasma_tile_to_lapack,
        PLASMA_desc, descB,
        PLASMA_Complex64_t*, B,
        int, LDB);

    /* Compute some constants */
    Anorm = zlange(&norm, &N, &N, A, &LDA, rwork);
    eps = dlamch("Epsilon");
    cte = Anorm*eps*sqrt((double) N)*bwdmax;

    /* Set the indices ptsa, ptsx for referencing sa and sx in swork. */

    ptsa = 0;
    ptsx = ptsa + N*N;

    /* Convert B from double precision to single precision and store
       the result in SX. */

    zlag2c(&N, &NRHS, B, &LDB, swork+ptsx, &N, &info);

    /* Convert A from double precision to single precision and store
       the result in SA. */

    zlag2c(&N, &N, A, &LDA, swork+ptsa, &N, &info);

    PLASMA_desc descSA = plasma_desc_init(
        SAbdl, PlasmaComplexFloat,
        PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
        N, N, 0, 0, N, N);

    PLASMA_desc descSX = plasma_desc_init(
        SXbdl, PlasmaComplexFloat,
        PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
        N, NRHS, 0, 0, N, NRHS);

    PLASMA_desc descSL = plasma_desc_init(
        SLbdl, PlasmaComplexFloat,
        PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
        N, N, 0, 0, N, N);

    plasma_parallel_call_3(plasma_lapack_to_tile,
        PLASMA_Complex32_t*, swork+ptsa,
        int, N,
        PLASMA_desc, descSA);

    plasma_parallel_call_3(plasma_lapack_to_tile,
        PLASMA_Complex32_t*, swork+ptsx,
        int, N,
        PLASMA_desc, descSX);

    /* Clear IPIV and Lbdl */
    plasma_memzero(IPIV, NT*NT*PLASMA_NB, PlasmaInteger);
    plasma_memzero(SLbdl, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexFloat);

    /* Set INFO to ZERO */
    PLASMA_INFO = PLASMA_SUCCESS;

    /* Compute the LU factorization of SA */
    plasma_parallel_call_3(plasma_pcgetrf,
        PLASMA_desc, descSA,
        PLASMA_desc, descSL,
        int*, IPIV);

    if (PLASMA_INFO == PLASMA_SUCCESS)
    {
        /* Solve the system SA*SX = SB */

        /* Forward substitution */
        plasma_parallel_call_4(plasma_pctrsmpl,
            PLASMA_desc, descSA,
            PLASMA_desc, descSX,
            PLASMA_desc, descSL,
            int*, IPIV);

        /* Backward substitution */
        plasma_parallel_call_7(plasma_pctrsm,
            PLASMA_enum, PlasmaLeft,
            PLASMA_enum, PlasmaUpper,
            PLASMA_enum, PlasmaNoTrans,
            PLASMA_enum, PlasmaNonUnit,
            PLASMA_Complex32_t, 1.0,
            PLASMA_desc, descSA,
            PLASMA_desc, descSX);

        /* Come back to lapack layout for A */
        plasma_parallel_call_3(plasma_tile_to_lapack,
            PLASMA_desc, descSA,
            PLASMA_Complex32_t*, swork+ptsa,
            int, N);

        /* Come back to lapack layout for X */
        plasma_parallel_call_3(plasma_tile_to_lapack,
            PLASMA_desc, descSX,
            PLASMA_Complex32_t*, swork+ptsx,
            int, N);
    }

    /* Convert SX back to double precision */
    clag2z( &N, &NRHS, swork+ptsx, &N, X, &LDX, &info );

    /* Compute R = B - AX (R is WORK). */
    zlacpy( &all, &N, &NRHS, B, &LDB, work, &N);
    cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, NRHS, N, CBLAS_SADDR(negone), A, LDA, X, LDX, CBLAS_SADDR(one), work, N);

    /* Check whether the NRHS normwise backward errors satisfy the
       stopping criterion. If yes return. Note that ITER=0 (already set). */
    Xnorm = zlange(&norm, &N, &NRHS, X, &LDX, rwork);
    Rnorm = zlange(&norm, &N, &NRHS, work, &N, rwork);
    if (Rnorm < Xnorm * cte){
      /* The NRHS normwise backward errors satisfy the
         stopping criterion. We are good to exit. */
      plasma_parallel_call_3(plasma_lapack_to_tile,
          PLASMA_Complex64_t*, X,
          int, N,
          PLASMA_desc, descX);
      return PLASMA_INFO;;
    }

    for (iiter = 0; iiter < itermax; iiter++){

      /* Convert R (in WORK) from double precision to single precision
         and store the result in SX. */
      zlag2c( &N, &NRHS, work, &N, swork+ptsx, &N, &info );

      /* Set X to bdl layout again */
      plasma_parallel_call_3(plasma_lapack_to_tile,
        PLASMA_Complex32_t*, swork+ptsx,
        int, N,
        PLASMA_desc, descSX);

      /* Solve the system SA*SX = SB */

      /* Forward substitution */
      plasma_parallel_call_4(plasma_pctrsmpl,
            PLASMA_desc, descSA,
            PLASMA_desc, descSX,
            PLASMA_desc, descSL,
            int*, IPIV);

      /* Backward substitution */
      plasma_parallel_call_7(plasma_pctrsm,
            PLASMA_enum, PlasmaLeft,
            PLASMA_enum, PlasmaUpper,
            PLASMA_enum, PlasmaNoTrans,
            PLASMA_enum, PlasmaNonUnit,
            PLASMA_Complex32_t, 1.0,
            PLASMA_desc, descSA,
            PLASMA_desc, descSX);

      /* Come back to lapack layout for X */
      plasma_parallel_call_3(plasma_tile_to_lapack,
            PLASMA_desc, descSX,
            PLASMA_Complex32_t*, swork+ptsx,
            int, N);

      /* Convert SX back to double precision and update the current
         iterate. */
      clag2z( &N, &NRHS, swork+ptsx, &N, work, &N, &info );

      for (i = 0; i < NRHS; i++){
        cblas_zaxpy(N, CBLAS_SADDR(one), work+i*N, 1, X+i*LDX, 1);
      }

      /* Compute R = B - AX (R is WORK). */
      zlacpy( &all, &N, &NRHS, B, &LDB, work, &N);
      cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, NRHS, N, CBLAS_SADDR(negone), A, LDA, X, LDX, CBLAS_SADDR(one), work, N);

      /* Check whether the NRHS normwise backward errors satisfy the
         stopping criterion. If yes, set ITER=IITER>0 and return. */
      Xnorm = zlange(&norm, &N, &NRHS, X, &LDX, rwork);
      Rnorm = zlange(&norm, &N, &NRHS, work, &N, rwork);
      if (Rnorm < Xnorm * cte){
        /* The NRHS normwise backward errors satisfy the
           stopping criterion. We are good to exit. */
        *ITER = iiter;
        plasma_parallel_call_3(plasma_lapack_to_tile,
            PLASMA_Complex64_t*, X,
            int, N,
            PLASMA_desc, descX);
        plasma_shared_free(plasma, SAbdl);
        plasma_shared_free(plasma, SLbdl);
        plasma_shared_free(plasma, SXbdl);
        plasma_shared_free(plasma, work);
        plasma_shared_free(plasma, swork);
        plasma_shared_free(plasma, rwork);
        return PLASMA_INFO;
      }

    }

    /* We have performed ITER=itermax iterations and never satisified
       the stopping criterion, set up the ITER flag accordingly and
       follow up on double precision routine. */

    *ITER = -itermax - 1;

    plasma_shared_free(plasma, SAbdl);
    plasma_shared_free(plasma, SLbdl);
    plasma_shared_free(plasma, SXbdl);
    plasma_shared_free(plasma, work);
    plasma_shared_free(plasma, swork);
    plasma_shared_free(plasma, rwork);

    /* Single-precision iterative refinement failed to converge to a
       satisfactory solution, so we resort to double precision. */

    /* Allocate memory for matrices in block layout */
    Abdl = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_NBNBSIZE, PlasmaComplexDouble);
    Lbdl = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexDouble);
    Xbdl = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaComplexDouble);
    if (Abdl == NULL || Lbdl == NULL || Xbdl == NULL) {
        plasma_error("PLASMA_zcgesv", "plasma_shared_alloc() failed");
        plasma_shared_free(plasma, Abdl);
        plasma_shared_free(plasma, Lbdl);
        plasma_shared_free(plasma, Xbdl);
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }

    zlacpy( &all, &N, &NRHS, B, &LDB, X, &LDX );

    descA = plasma_desc_init(
        Abdl, PlasmaComplexDouble,
        PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
        N, N, 0, 0, N, N);

    descX = plasma_desc_init(
        Xbdl, PlasmaComplexDouble,
        PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
        N, NRHS, 0, 0, N, NRHS);

    descL = plasma_desc_init(
        Lbdl, PlasmaComplexDouble,
        PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
        N, N, 0, 0, N, N);

    plasma_parallel_call_3(plasma_lapack_to_tile,
        PLASMA_Complex64_t*, A,
        int, LDA,
        PLASMA_desc, descA);

    plasma_parallel_call_3(plasma_lapack_to_tile,
        PLASMA_Complex64_t*, X,
        int, LDX,
        PLASMA_desc, descX);

    /* Clear IPIV and Lbdl */
    plasma_memzero(IPIV, NT*NT*PLASMA_NB, PlasmaInteger);
    plasma_memzero(Lbdl, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexDouble);

    /* Set INFO to ZERO */
    PLASMA_INFO = PLASMA_SUCCESS;

    plasma_parallel_call_3(plasma_pzgetrf,
        PLASMA_desc, descA,
        PLASMA_desc, descL,
        int*, IPIV);

    /* Return L to the user */
    plasma_memcpy(L, Lbdl, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexDouble);

    if (PLASMA_INFO == PLASMA_SUCCESS)
    {
        plasma_parallel_call_4(plasma_pztrsmpl,
            PLASMA_desc, descA,
            PLASMA_desc, descX,
            PLASMA_desc, descL,
            int*, IPIV);

        plasma_parallel_call_7(plasma_pztrsm,
            PLASMA_enum, PlasmaLeft,
            PLASMA_enum, PlasmaUpper,
            PLASMA_enum, PlasmaNoTrans,
            PLASMA_enum, PlasmaNonUnit,
            PLASMA_Complex64_t, 1.0,
            PLASMA_desc, descA,
            PLASMA_desc, descX);

        plasma_parallel_call_3(plasma_tile_to_lapack,
            PLASMA_desc, descA,
            PLASMA_Complex64_t*, A,
            int, LDA);

        plasma_parallel_call_3(plasma_tile_to_lapack,
            PLASMA_desc, descX,
            PLASMA_Complex64_t*, X,
            int, LDX);
    }

    return PLASMA_INFO;

}
