/**
 * @file zstedc.c
 *
 *  PLASMA computational routines
 *  PLASMA is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 * @version 2.7.0
 * @author Gregoire Pichon
 * @author Azzam Haidar
 * @date 2014-07
 * @precisions normal z -> s d c
 *
 **/
#include <math.h>
#include <lapacke.h>
#include "common.h"

#undef REAL
#define COMPLEX

/**
 *****************************************************************************
 *
 * @ingroup PLASMA_Complex64_t_Tile
 *
 *  PLASMA_zstedc - Computes all eigenpairs of a symmetric tridiagonal matrix
 *
 *  Blocking driver: creates a private sequence, submits the computation
 *  through PLASMA_zstedc_Async, waits for its completion and returns the
 *  resulting status.
 *
 *******************************************************************************
 *
 * @param[in] jobz
 *          Intended usage:
 *          = PlasmaIvec: computes eigenpairs of the symmetric tridiagonal matrix
 *          = PlasmaVec: computes eigenpairs of the original matrix (not supported now)
 *
 * @param[in] n
 *          n specifies the dimension of the original matrix
 *
 * @param[in,out] D
 *          On entry, D contains the diagonal elements of the tridiagonal matrix
 *          On exit, D contains the eigenvalues
 *
 * @param[in] E
 *          On entry, E contains the extra-diagonal elements of the tridiagonal matrix
 *
 * @param[out] Z
 *          On exit, if jobz = PlasmaIvec and the call succeeded, the eigenvectors.
 *
 * @param[in] LDZ
 *          The leading dimension of the eigenvectors matrix Z. LDZ >= max(1,N).
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *          \retval <0 if -i, the i-th argument had an illegal value
 *          \retval <0 any other error code reported by the sequence creation,
 *                  by PLASMA_zstedc_Async or recorded in the sequence
 *
 *******************************************************************************
 *
 * @sa PLASMA_zstedc
 * @sa PLASMA_zstedc_Async
 * @sa PLASMA_cstedc
 * @sa PLASMA_dstedc
 * @sa PLASMA_sstedc
 *
 ******************************************************************************/
int PLASMA_zstedc(PLASMA_enum jobz, int n,
                  double *D, double *E,
                  PLASMA_Complex64_t *Z, int LDZ)
{
    plasma_context_t *plasma;
    PLASMA_sequence *sequence = NULL;
    PLASMA_request request = PLASMA_REQUEST_INITIALIZER;
    int status;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_zstedc", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }

    /* The sequence is dereferenced below to read the final status, so stop
     * here when its creation reports a failure (non-zero code). */
    status = plasma_sequence_create(plasma, &sequence);
    if (status != 0) {
        return status;
    }

    /* Keep the submission status: an error detected while submitting is not
     * necessarily recorded in the sequence. */
    status = PLASMA_zstedc_Async(jobz, n, D, E, Z, LDZ, sequence, &request);
    plasma_dynamic_sync();

    /* A failure recorded in the sequence takes precedence over the
     * submission status. */
    if (sequence->status != 0) {
        status = sequence->status;
    }
    plasma_sequence_destroy(plasma, sequence);
    return status;
}

/**
 *****************************************************************************
 *
 * @ingroup PLASMA_Complex64_t_Tile_Async
 *
 *  PLASMA_zstedc_Async - Computes all eigenpairs of a symmetric tridiagonal matrix
 *
 *******************************************************************************
 *
 * @param[in] jobz
 *          Intended usage:
 *          = PlasmaIvec: computes eigenpairs of the symmetric tridiagonal matrix
 *          = PlasmaVec: computes eigenpairs of the original matrix (not supported now)
 *
 * @param[in] n
 *          n specifies the dimension of the original matrix
 *
 * @param[in,out] D
 *          On entry, D contains the diagonal elements of the tridiagonal matrix
 *          On exit, D contains the eigenvalues
 *
 * @param[in] E
 *          On entry, E contains the extra-diagonal elements of the tridiagonal matrix
 *
 * @param[out] Z
 *          On exit, if jobz = PlasmaIvec and the call succeeded, the eigenvectors.
 *
 * @param[in] LDZ
 *          The leading dimension of the eigenvectors matrix Z. LDZ >= max(1,N).
 *
 * @param[in] sequence
 *          Identifies the sequence of function calls that this call belongs to
 *          (for completion checks and exception handling purposes).
 *
 * @param[out] request
 *          Identifies this function call (for exception handling purposes).
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *          \retval <0 if -i, the i-th argument had an illegal value
 *
 *******************************************************************************/
#ifdef REAL
/*
 * Real-precision implementation (compiled only when REAL is defined).
 *
 * After validating the context, sequence, request and arguments, the routine
 *   - hands the whole problem to LAPACKE when n < plasma->ev_smlsze or when
 *     jobz == PlasmaNoVec;
 *   - otherwise walks E looking for negligible entries, which split the
 *     matrix into independent diagonal blocks. Blocks of size 1 need no
 *     work, blocks smaller than ev_smlsze are submitted to plasma_pdstedc
 *     and larger ones to plasma_pzlaed0. Once all tasks are done, a
 *     permutation is built with CORE_dlapst(PlasmaIncreasingOrder, ...),
 *     applied to D, and handed to plasma_pzswaps together with WORK and Z.
 *
 * Returns PLASMA_SUCCESS on success. Argument, allocation and LAPACKE
 * failures are recorded with plasma_request_fail and its result is returned;
 * a missing context, sequence or request goes through plasma_fatal_error.
 */
int PLASMA_zstedc_Async(PLASMA_enum jobz, int n,
                        double *D, double *E,
                        PLASMA_Complex64_t *Z, int LDZ,
                        PLASMA_sequence *sequence, PLASMA_request *request)
{
    int info = 0;
    int SMLSIZ;
    plasma_context_t *plasma;

    /* Variables for sorting eigenpairs */
    int act_perm = 0;

    plasma = plasma_context_self();

    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_zstedc_Async", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    if (sequence == NULL) {
        plasma_fatal_error("PLASMA_zstedc_Async", "NULL sequence");
        return PLASMA_ERR_UNALLOCATED;
    }
    if (request == NULL) {
        plasma_fatal_error("PLASMA_zstedc_Async", "NULL request");
        return PLASMA_ERR_UNALLOCATED;
    }
    /* Check sequence status */
    if (sequence->status == PLASMA_SUCCESS)
        request->status = PLASMA_SUCCESS;
    else
        return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);

    SMLSIZ = plasma->ev_smlsze;

    /* Check input arguments */
    if (jobz != PlasmaNoVec && jobz != PlasmaIvec) {
        plasma_error("PLASMA_zstedc_Async", "illegal value of jobz");
        /* Flag the sequence like every other argument error, so that callers
         * that only look at the sequence status see the failure too. */
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    if (n < 0){
        plasma_error("PLASMA_zstedc_Async", "illegal value of n");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    if (D == NULL){
        plasma_error("PLASMA_zstedc_Async", "illegal value of D");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    if (E == NULL){
        plasma_error("PLASMA_zstedc_Async", "illegal value of E");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    if (Z == NULL){
        plasma_error("PLASMA_zstedc_Async", "illegal value of Z");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    if (LDZ < max(1, n)){
        plasma_error("PLASMA_zstedc_Async", "illegal value of LDZ");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* Quick return: an empty matrix has nothing to compute, and skipping it
     * keeps zero-sized allocations out of the failure check below. */
    if (n == 0) {
        return PLASMA_SUCCESS;
    }

    /* Small problems and eigenvalue-only requests go straight to LAPACK */
    if ((n < SMLSIZ) || (jobz == PlasmaNoVec)) {
        LAPACKE_zlaset_work( LAPACK_COL_MAJOR, lapack_const(PlasmaUpperLower), n, n, 0.0, 1.0, Z, LDZ);
        info = LAPACKE_zstedc( LAPACK_COL_MAJOR, lapack_const(jobz),
                               n, D, E, Z, LDZ);
        if (info != 0){
            plasma_error("PLASMA_zstedc_Async", "LAPACKE zstedc failed");
            return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
        }
        else{
            return PLASMA_SUCCESS;
        }
    }

#if defined(ENABLE_TIMER)
    printf("\n");
    PLASMA_Double_t timestedc=0.0, timeonesolve=0.0, timeallsolve=0.0, timeswap=0.0, timesort=0.0;
    timestedc   = PLASMA_Wtime();
#endif

    /* Sizes are computed in size_t: n*n overflows an int once n > 46340 */
    int work_pos2  = 0;
    double *WORK   = malloc((size_t)n * n * sizeof(double));
    double *WORK2  = malloc((size_t)4 * n * sizeof(double));
    int *IWORK     = malloc((size_t)5 * n * sizeof(int));
    int *localdata = calloc((size_t)n, sizeof(int));   /* zero-initialized */
    int LDWORK = LDZ;

    double eps = LAPACKE_dlamch_work('e');
    double tiny;
    int current = 0;
    int nsubsml = 0, nsubdlaed0 = 0, nsub1 = 0;

    int il = 0;
    int iu = n;
    double vl = 0., vu = 0.;
    char range = 'A';

    int start, size;
    int i = 0;

    /* Nothing has been submitted yet, so a failed allocation can simply
     * release whatever was obtained and report the error. */
    if (WORK == NULL || WORK2 == NULL || IWORK == NULL || localdata == NULL) {
        plasma_error("PLASMA_zstedc_Async", "workspace allocation failed");
        free(localdata);
        free(WORK);
        free(WORK2);
        free(IWORK);
        return plasma_request_fail(sequence, request, PLASMA_ERR_UNALLOCATED);
    }

    /* Set WORK to identity */
    plasma_dynamic_call_5( plasma_pdlaset_identity,
                           int, n,
                           double*, WORK,
                           int, n,
                           PLASMA_sequence*, sequence,
                           PLASMA_request*, request);
    plasma_dynamic_sync();

#if defined(ENABLE_TIMER)
    plasma_dynamic_sync();
    timeallsolve   = PLASMA_Wtime();
#endif

#if defined(ENABLE_DEBUG2)
    plasma_dynamic_sync();
    printf("start dstedc\n");
#endif


    /******************************************************
     * loop over the main or possible-subproblem and solve
     * ****************************************************
     * */
    i = 0;
    while (i<n-1){
#if defined(ENABLE_TIMER) && defined(ENABLE_DEBUG2)
        plasma_dynamic_sync();
        timeonesolve   = PLASMA_Wtime();
#endif

        /* Threshold under which the off-diagonal entry E[i] is treated as
         * zero, which decouples rows [current..i] from the rest */
        tiny = eps*sqrt(fabs(D[i]))*sqrt(fabs(D[i+1]));

        if ((fabs(E[i]) <= tiny) || (i==n-2)){
            start = current;
            size = i-current+1;

            /* The last index (n-1) is not treated in the loop  */
            if (i == n-2){
                size++;
            }

            if (size == 1){
                /* 1x1 block: only counted, no task is submitted */
                nsub1 += 1;
            }

            else if (size < SMLSIZ){
                nsubsml += 1;
                plasma_dynamic_call_8( plasma_pdstedc,
                                       PLASMA_enum, jobz,
                                       int,     size,
                                       double*, D+start,
                                       double*, E+start,
                                       double*, WORK+(size_t)n*start+start,
                                       int,     n,
                                       PLASMA_sequence*, sequence,
                                       PLASMA_request*, request);
            }
            else{

#if defined(ENABLE_DEBUG2)
                plasma_dynamic_sync();
                printf("  start  solving subproblems of size %5d\n",size);
#endif


                nsubdlaed0 += 1;

                plasma_dynamic_call_21( plasma_pzlaed0,
                                        int, 2, /* int or Plasma_enum ??? */
                                        char, range,
                                        int, size,
                                        int, size,
                                        double*, D+start,
                                        double*, E+start,
                                        int, il,
                                        int, iu,
                                        double, vl,
                                        double, vu,
                                        double*, WORK+(size_t)n*start+start,
                                        int, n,
                                        double*, NULL,
                                        int, size,
                                        double*, Z+(size_t)LDZ*start+start,
                                        double*, WORK2+work_pos2,
                                        int, LDWORK,
                                        int*, IWORK+5*start,
                                        int*, localdata+start,
                                        PLASMA_sequence*, sequence,
                                        PLASMA_request*, request);

                /* Each large subproblem gets its own 4*size slice of WORK2 */
                work_pos2 += 4*size;
            }

            current += size;

#if defined(ENABLE_TIMER) && defined(ENABLE_DEBUG2)
            plasma_dynamic_sync();
            timeonesolve   = PLASMA_Wtime()-timeonesolve;
            printf("  Finish solving subproblems of size %5d timing= %lf \n",size, timeonesolve);
#endif

        }    /* End solving one independent subproblem */

        i++;
    } /* End While */

#if defined(ENABLE_TIMER)
    plasma_dynamic_sync();
    timeallsolve   = PLASMA_Wtime()-timeallsolve;
    printf("  Finish all solve dlaed0  nbsub %5d     timing= %lf \n",nsubdlaed0, timeallsolve);
#endif


    /* Wait for the end of each independent subproblem */
    plasma_dynamic_sync();


#if defined(ENABLE_TIMER)
    plasma_dynamic_sync();
    timesort   = PLASMA_Wtime();
#endif


    /* Create the permutation to sort D eigenvalues into increasing order;
     * IWORK and WORK2 are reused here, all tasks using them are finished */
    CORE_dlapst(PlasmaIncreasingOrder, n, D, IWORK);
    memcpy(WORK2, D, n*sizeof(double));
    for (i=0; i<n; i++){
        if (IWORK[i] != i){
            act_perm = 1;
            D[i] = WORK2[IWORK[i]];
        }
    }

#if defined(ENABLE_TIMER)
    plasma_dynamic_sync();
    timeswap   = PLASMA_Wtime();
#endif

    act_perm = 1;                 /* always copy back to Q */
    if (act_perm == 1){
        plasma_dynamic_call_7(plasma_pzswaps,
                              int, n,
                              int*, IWORK,
                              double*, Z,
                              int, LDZ,
                              double*, WORK,
                              PLASMA_sequence*, sequence,
                              PLASMA_request*, request);
    }

    /* The workspaces are released below, so wait for the swap to finish */
    plasma_dynamic_sync();

#if defined(ENABLE_TIMER)
    plasma_dynamic_sync();
    timesort   = timeswap-timesort;
    timeswap   = PLASMA_Wtime()-timeswap;
    timestedc  = PLASMA_Wtime()-timestedc;
    printf("  Finish sort                              timing= %lf \n", timesort);
    printf("  Finish swap                              timing= %lf \n", timeswap);
    printf("  Finish dstedc with nsub_dlaed0 %5d     nsub_smlsiz %5d   nsub1 %5d timing= %lf \n",nsubdlaed0, nsubsml, nsub1, timestedc);
#endif


    free(localdata);
    free(WORK);
    free(WORK2);
    free(IWORK);
    return info;
}





#else  /* COMPLEX */
/*
 * Complex-precision implementation (the #else branch of the REAL switch).
 *
 * D and E are real, so the problem is solved by PLASMA_dstedc_Async into a
 * temporary zero-initialized real buffer Q of leading dimension max(1,n);
 * plasma_pdlag2z is then submitted with Q as input and the complex matrix Z
 * as output. The routine waits on a QUARK barrier before releasing Q.
 *
 * jobz, D and E are passed to PLASMA_dstedc_Async unchanged; the arguments
 * used directly here (n, Z, LDZ, sequence, request) are checked first.
 *
 * Returns PLASMA_SUCCESS on success, otherwise the error code recorded with
 * plasma_request_fail or the one returned by PLASMA_dstedc_Async.
 */
int PLASMA_zstedc_Async(PLASMA_enum jobz, int n,
                        double *D, double *E,
                        PLASMA_Complex64_t *Z, int LDZ,
                        PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    double *Q;
    int ldq;
    int status;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_zstedc_Async", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    if (sequence == NULL) {
        plasma_fatal_error("PLASMA_zstedc_Async", "NULL sequence");
        return PLASMA_ERR_UNALLOCATED;
    }
    if (request == NULL) {
        plasma_fatal_error("PLASMA_zstedc_Async", "NULL request");
        return PLASMA_ERR_UNALLOCATED;
    }
    /* Check sequence status first so an earlier failure is not overwritten */
    if (sequence->status == PLASMA_SUCCESS)
        request->status = PLASMA_SUCCESS;
    else
        return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);

    /* Check the arguments used directly by this routine: n sizes Q, and
     * plasma_pdlag2z is given Z with leading dimension LDZ. */
    if (n < 0) {
        plasma_error("PLASMA_zstedc_Async", "illegal value of n");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if (Z == NULL) {
        plasma_error("PLASMA_zstedc_Async", "illegal value of Z");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if (LDZ < max(1, n)) {
        plasma_error("PLASMA_zstedc_Async", "illegal value of LDZ");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* Q is ldq-by-ldq with ldq = max(1,n): identical to the n-by-n buffer
     * for n >= 1, and still a valid leading dimension (and a non-empty
     * allocation) when n == 0. The element count is computed in size_t so
     * that n*n cannot overflow an int. */
    ldq = max(1, n);
    Q = calloc((size_t)ldq * (size_t)ldq, sizeof(double));
    if (Q == NULL) {
        plasma_error("PLASMA_zstedc_Async", "workspace allocation failed");
        return plasma_request_fail(sequence, request, PLASMA_ERR_UNALLOCATED);
    }

    status = PLASMA_dstedc_Async(jobz, n, D, E,
                                 Q, ldq,
                                 sequence, request);

    /* Only convert the eigenvectors when the real solver accepted the job */
    if (status == PLASMA_SUCCESS) {
        plasma_dynamic_call_8(plasma_pdlag2z,
                              int,     n,
                              int,     n,
                              double*,             Q,
                              int,                 ldq,
                              PLASMA_Complex64_t*, Z,
                              int,                 LDZ,
                              PLASMA_sequence*, sequence,
                              PLASMA_request*, request);
    }

    // TODO: need to be removed by using dependencies on Q
    /* Q must outlive every task that reads it, hence the barrier before free */
    QUARK_Barrier(plasma->quark);
    free(Q);

    return status;
}

#endif
