/*////////////////////////////////////////////////////////////////////////////////////////
 *  -- PLASMA --
 *     University of Tennessee
 */

#include <stdio.h>    // Standard stuff
#include <stdlib.h>
#include <assert.h>
#include <malloc.h>
#include <string.h>

#include <fcntl.h>    // Huge TLB pages
#include <sys/mman.h>

#include <math.h>     // Math and MKL
#include <mkl_cblas.h>

#define __USE_UNIX98  // Pthreads
#include <pthread.h>

#include <sys/time.h> // Time


#include "plasma.h"


#define MAX_NB 1024
#define MAX_BB 1024

#define CORES_MAX 1024

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Global PLASMA control block: action dispatch state, tiling parameters, core/thread
 *  bookkeeping and memory-system information shared by every routine in this file.
 */

struct {
    /* Action hand-off to the worker threads (see plasma_parallel_section) */
    pthread_mutex_t action_mutex;   /* protects 'action' */
    pthread_cond_t action_condt;    /* workers wait here while action is STAND_BY */
    PLASMA_enum action;             /* action the workers should execute next */

    PLASMA_bool initialized;        /* checked by plasma_set_int / plasma_get_int */
    PLASMA_bool bdl_huge_pages;     /* PLASMA_TRUE when bdl_mem was mmap()ed from huge pages */

    /* Tiling parameters */
    int NB_max;                     /* tile size the per-core WORK/TAU buffers are sized for */
    int NB_min;
    int IB_max;
    int NB;                         /* set by plasma_tune */
    int IB;                         /* set by plasma_tune */
    int NBNBSIZE;                   /* set by plasma_tune from NB*NB */
    int IBNBSIZE;                   /* set by plasma_tune from IB*NB */

    /* Cores and storage sizes */
    int cores_max;                  /* number of per-core WORK/TAU slices allocated */
    int cores_num;                  /* number of cores taking part in barriers and kernels */
    int bdl_size_elems;             /* size of bdl_mem in doubles (0 after release) */
    int progress_size_elems;        /* size of the progress table in ints (0 after release) */

    /* Thread bookkeeping */
    int core_num[CORES_MAX];        /* not used in the visible part of the file */
    pthread_t core_id[CORES_MAX];   /* not used in the visible part of the file */
    pthread_attr_t core_attr;       /* not used in the visible part of the file */
    pthread_t thread_id[CORES_MAX]; /* pthread_self() of each core, see plasma_parallel_init */

    /* Memory-system information */
    int cache_line_size;
    int page_size;
    int huge_page_size;
    int huge_pages_free;
    int huge_pages_total;
    int huge_page_fmem;             /* descriptor of the open huge page file */
}
plasma_cntrl = {
    .action_mutex   = PTHREAD_MUTEX_INITIALIZER,
    .action_condt   = PTHREAD_COND_INITIALIZER,
    .action         = PLASMA_ACT_STAND_BY,
    .initialized    = PLASMA_FALSE,
    .bdl_huge_pages = PLASMA_FALSE,
    .NB_max         = 256,
    .NB_min         = 5,            /* eventually change to 32 */
    .IB_max         = 128
    /* every member not listed here is zero-initialized */
};


/* Auxiliary storage shared by the parallel kernels; everything starts out unallocated. */
struct {
    double *bdl_mem;            /* Block Data Layout storage, see plasma_alloc_aux_bdl */
    volatile int *progress;     /* progress table, see plasma_alloc_aux_progress */
    double *WORK[CORES_MAX];    /* per-core workspace, see plasma_alloc_aux_work_tau */
    double *TAU[CORES_MAX];     /* per-core TAU buffer, see plasma_alloc_aux_work_tau */
}
plasma_aux = {
    .bdl_mem  = NULL,
    .progress = NULL,
    .WORK     = {NULL},         /* remaining entries are zero-initialized as well */
    .TAU      = {NULL}
};


/*
 * Argument block read by plasma_parallel_section() when it dispatches a parallel
 * kernel: the worker threads take their operands from here rather than from
 * function parameters.
 */
struct {
    PLASMA_enum trans;      /* not read by the dispatch code visible in this file */
    PLASMA_enum side;       /* not read by the dispatch code visible in this file */
    PLASMA_enum uplo;       /* not read by the dispatch code visible in this file */
    int M;                  /* problem dimensions */
    int N;
    int NRHS;
    int NB;                 /* tile size */
    int NBNBSIZE;           /* storage stride of an NB x NB tile */
    int IBNBSIZE;           /* storage stride of an IB x NB tile */
    int MT;                 /* tile counts - presumably ceil(M/NB), ceil(N/NB), ceil(NRHS/NB) */
    int NT;
    int NTRHS;
    int IB;                 /* inner block size */
    int LDA;
    int LDB;
    double *A;              /* operands handed to the kernels */
    double *B;
    double *T;
    int INFO;               /* passed by address to plasma_pDGEQRF / plasma_pDORMQR */
}
plasma_args;

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Forward declarations: external numerical kernels, helpers defined elsewhere and
 *  the parallel PLASMA routines dispatched by plasma_parallel_section()
 */

/* Map plain routine names to symbols with a trailing underscore
 * (presumably the Fortran name-mangling convention of the linked libraries) */
#define dormqr dormqr_
#define dtsqr2  dtsqr2_
#define dssrfb  dssrfb_
#define dlarfib dlarfib_
#define dsqr2ib dsqr2ib_

/* Performance figure; only referenced from commented-out code in the visible part of the file */
double GFLOPS;

/* Helpers whose definitions are not in the visible part of the file */
void *parallel_section(void *thread_id);
void tile_cholesky_parallel(int my_core_id);
void dump_trace(int cores_num, int NB);
void diff_matrix(double *A, double *B, int NB, int BBM, int BBN, int M, int N);

/* Tile kernels (named after SSYRK/SPOTRF/SGEMM/STRSM but taking double data) */
void tile_ssyrk(int k, int n, double *IN, int NB, int BB);
void tile_spotrf(int k, double *IN, int NB, int BB, int *INFO);
void tile_sgemm(int k, int m, int n, double *IN, int NB, int BB);
void tile_strsm(int k, int m, double *IN, int NB, int BB);

/* LAPACK-style routines; every argument is passed by address */
void dpotrf(char*, int*, double*, int*, int*);
void dpotrs(char*, int*, int*, double*, int*, double*, int*, int*);
void dgeqrf(int*, int*, double*, int*, double*, double*, int*, int*);

/* Expands to dormqr_ through the macro above */
void dormqr(char*, char*, int*, int*, int*, double*, int*, double*, double*, int*,
            double*, int*, int*);


/* Single-character option flags whose addresses can be handed to the routines above */
char Left = 'L', Right = 'R', Transpose = 'T', Forward = 'F', Columnwise = 'C', Upper = 'U';

/* Core (single tile) kernels */
void core_dtsqrt_(int*, int*, int*, double*, int*, double*, int*, double*, int*, double*, double*, int*);

void core_dssrfb_(char*, char*, int*, int*, int*, int*, int*, double*, int*, double*, int*, double*, int*, double*, int*, double*, int*);

void core_dlarfb_(char*,char* , char*, char*, int*, int*, int*, int*, double*, int*, double*, int*,
    double*, int*, double*, int*, int*);

void core_dgeqrt_(int*, int*, int*, double*, int*, double* , int*, double*, double* , int*);

/* Parallel routines; each participating core calls them with its own my_core_id */
void plasma_pDGEQRF(int M, int N, double *A, int NB, int NBNBSIZE, int IBNBSIZE, int IB, int MT,
                    int NT, double *T, int *INFO, int cores_num, int my_core_id);

void plasma_pDORMQR(int M, int NRHS, int N, double *A, int NB, int NBNBSIZE, int IBNBSIZE, int IB,
                    int MT, int NTRHS, int NT, double *T, double *B, int *INFO,
                    int cores_num, int my_core_id);

void plasma_pDTRSM(int side, int uplo, int transA, int diag, int N, int NRHS,
                   double alpha, double *A, int NB, int NBNBSIZE, int NT, int MT,
                   double *B, int NTRHS, int cores_num, int my_core_id);
/*////////////////////////////////////////////////////////////////////////////////////////
 *  Wall-clock time in seconds (microsecond resolution) as reported by gettimeofday()
 */

double get_current_time(void)
{
    struct timeval time_val;

    /* The timezone argument of gettimeofday() is obsolete; NULL is the portable choice */
    gettimeofday(&time_val, NULL);
    return (double)(time_val.tv_sec) + (double)(time_val.tv_usec) / 1000000.0;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Per-core event log used for tracing
 *
 *  Each core owns one row of event_log. A record takes four consecutive doubles:
 *  core id, start time, end time and event code. event_num[core] is the index of the
 *  next free slot in that core's row.
 */

/* Doubles per core; must be a power of two because core_event_log wraps with a mask */
#define EVENTS_MAX 16384
//#define EVENTS_MAX 1048576

static int    event_num        [CORES_MAX];
static double event_start_time [CORES_MAX];
static double event_end_time   [CORES_MAX];
static double event_log        [CORES_MAX][EVENTS_MAX];
/* 1: records advance the write index; 0: the index stays put and slot 0..3 is overwritten */
static int log_events = 1;

/* Stamp the start time of the current event on this core.
 * NOTE(review): the definition ends with a line continuation, so the blank line
 * that follows it is part of the macro and must be kept. */
#define core_event_start(my_core_id)\
    event_start_time[my_core_id] = get_current_time();\

/* Stamp the end time of the current event on this core (same trailing-backslash caveat) */
#define core_event_end(my_core_id)\
    event_end_time[my_core_id] = get_current_time();\

/* Append one record and advance the write index by 4 (log_events << 2), wrapping at
 * EVENTS_MAX. Expands to several statements, so it is not safe as the unbraced body
 * of an if/else or loop. */
#define core_event_log(event, my_core_id)\
    event_log[my_core_id][event_num[my_core_id]+0] = my_core_id;\
    event_log[my_core_id][event_num[my_core_id]+1] = event_start_time[my_core_id];\
    event_log[my_core_id][event_num[my_core_id]+2] = event_end_time[my_core_id];\
    event_log[my_core_id][event_num[my_core_id]+3] = (event);\
    event_num[my_core_id] += (log_events << 2);\
    event_num[my_core_id] &= (EVENTS_MAX-1);


/*////////////////////////////////////////////////////////////////////////////////////////
 *  Flags of the busy-waiting barrier, one pair per core:
 *  barrier_in[c]  is set by core c on arrival and cleared by core 0;
 *  barrier_out[c] is set by core 0 to release core c and cleared by core c.
 */

static volatile int barrier_in[CORES_MAX];
static volatile int barrier_out[CORES_MAX];

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Busy-waiting barrier initialization: clear the arrival and release flag of every
 *  one of the CORES_MAX slots
 */
void plasma_barrier_init()
{
    int slot = 0;

    while (slot < CORES_MAX) {
        barrier_in[slot] = 0;
        barrier_out[slot] = 0;
        slot++;
    }
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Busy-waiting barrier
 *
 *  Core 0 coordinates: it waits for cores 1..cores_num-1 to arrive, clears their
 *  arrival flags and then releases them. Every other core announces its arrival and
 *  spins until core 0 releases it.
 */
void plasma_barrier(int my_core_id, int cores_num)
{
    int peer;

    if (my_core_id != 0) {
        /* Announce arrival, spin until released, then re-arm the release flag */
        barrier_in[my_core_id] = 1;
        while (barrier_out[my_core_id] == 0)
            ;
        barrier_out[my_core_id] = 0;
        return;
    }

    /* Core 0: wait until every other core has checked in */
    for (peer = 1; peer < cores_num; peer++) {
        while (barrier_in[peer] == 0)
            ;
    }

    /* Clear all arrival flags before anyone is released */
    for (peer = 1; peer < cores_num; peer++)
        barrier_in[peer] = 0;

    /* Release the waiting cores */
    for (peer = 1; peer < cores_num; peer++)
        barrier_out[peer] = 1;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Conversion from LAPACK F77 matrix layout to Block Data Layout
 *
 *  A is M x N, column-major with leading dimension LDA. Abdl holds MT x NT tiles of
 *  NB x NB elements; tiles are NBNBSIZE elements apart and stored tile-column by
 *  tile-column, each tile column-major. Tile elements that lie outside the M x N
 *  matrix are left untouched.
 */
void plasma_lapack_to_bdl(double *A, double *Abdl, int M, int N, int LDA,
                          int NB, int MT, int NT, int NBNBSIZE)
{
    int tile_col, tile_row, j, i;

    for (tile_col = 0; tile_col < NT; tile_col++) {
        /* Matrix columns that actually fall inside this tile column */
        int cols = N - tile_col*NB;
        if (cols > NB)
            cols = NB;
        if (cols <= 0)
            continue;

        for (tile_row = 0; tile_row < MT; tile_row++) {
            /* Matrix rows that actually fall inside this tile row */
            int rows = M - tile_row*NB;
            if (rows > NB)
                rows = NB;
            if (rows <= 0)
                continue;

            /* Index of the tile's first element in each layout */
            int src = tile_row*NB + tile_col*NB*LDA;
            int dst = tile_row*NBNBSIZE + tile_col*NBNBSIZE*MT;

            for (j = 0; j < cols; j++)
                for (i = 0; i < rows; i++)
                    Abdl[dst + i + j*NB] = A[src + i + j*LDA];
        }
    }
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Conversion from Block Data Layout to LAPACK F77 matrix layout
 *
 *  Inverse of plasma_lapack_to_bdl(): copies the M x N matrix out of MT x NT tiles of
 *  NB x NB elements (tiles NBNBSIZE elements apart) into column-major A with leading
 *  dimension LDA. Only the M x N entries of A are written.
 */
void plasma_bdl_to_lapack(double *Abdl, double *A, int M, int N, int LDA,
                          int NB, int MT, int NT, int NBNBSIZE)
{
    int tile_col, tile_row, j, i;

    for (tile_col = 0; tile_col < NT; tile_col++) {
        /* Matrix columns that actually fall inside this tile column */
        int cols = N - tile_col*NB;
        if (cols > NB)
            cols = NB;
        if (cols <= 0)
            continue;

        for (tile_row = 0; tile_row < MT; tile_row++) {
            /* Matrix rows that actually fall inside this tile row */
            int rows = M - tile_row*NB;
            if (rows > NB)
                rows = NB;
            if (rows <= 0)
                continue;

            /* Index of the tile's first element in each layout */
            int dst = tile_row*NB + tile_col*NB*LDA;
            int src = tile_row*NBNBSIZE + tile_col*NBNBSIZE*MT;

            for (j = 0; j < cols; j++)
                for (i = 0; i < rows; i++)
                    A[dst + i + j*LDA] = Abdl[src + i + j*NB];
        }
    }
}

/*////////////////////////////////////////////////////////////////////////////////////////
 */
int main (int argc, char **argv)
{
    assert(argc == 7);
    int M = atoi(argv[1]);
    int N = atoi(argv[2]);
    int NB = atoi(argv[3]); assert(NB <= MAX_NB);
    int IB = atoi(argv[4]); assert(NB%IB == 0);
    int NT = (N%NB==0) ? (N/NB) : (N/NB+1);
    int MT = (M%NB==0) ? (M/NB) : (M/NB+1);
    int NRHS = atoi(argv[5]);
    int NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
    int THREADS = atoi(argv[6]); assert(THREADS <= CORES_MAX);

    int NBNBSIZE = NB*NB;
    NBNBSIZE *= sizeof(double);
    NBNBSIZE = roundup(NBNBSIZE, CACHE_LINE_SIZE);
    NBNBSIZE /= sizeof(double);

    int thread;

    char mem_file_name[32];
    int  huge_size;
    int  fmem;
    char *mem_block = 0;

    int INFO;

    #define HUGE_PAGE_SIZE 2048*1024
    // Allocate memory in huge TLB pages
    double *Ablk2 = (double*)mem_block; mem_block +=  MT*NT*NBNBSIZE*sizeof(double);
    double *A1    = (double*)mem_block; mem_block +=  M*N*sizeof(double);
    double *A2    = (double*)mem_block; mem_block +=  M*N*sizeof(double);

    double *Bblk2 = (double*)mem_block; mem_block +=  MT*NTRHS*NBNBSIZE*sizeof(double);
    double *B1    = (double*)mem_block; mem_block +=  M*NRHS*sizeof(double);
    double *B2    = (double*)mem_block; mem_block +=  M*NRHS*sizeof(double);

    double *WORK   = (double*)mem_block; mem_block += MT*MT*NBNBSIZE+NT*NT*NBNBSIZE*sizeof(double);
    double *TAU    = (double*)mem_block; mem_block += N*sizeof(double);
    double *T      = (double*)mem_block; mem_block += MT*NT*NBNBSIZE*sizeof(double);

    huge_size = (int)mem_block;
    huge_size = (huge_size + HUGE_PAGE_SIZE-1) & ~(HUGE_PAGE_SIZE-1);
    sprintf(mem_file_name, "/huge/huge_tlb_page.bin");
    assert((fmem = open(mem_file_name, O_CREAT | O_RDWR, 0755)) != -1);
    remove(mem_file_name);
    mem_block = (char*)mmap(0, huge_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fmem, 0);
    assert(mem_block != MAP_FAILED);
    #undef HUGE_PAGE_SIZE

    Ablk2 = (double*)(mem_block + (unsigned int)Ablk2);
    A1    = (double*)(mem_block + (unsigned int)A1);
    A2    = (double*)(mem_block + (unsigned int)A2);

    Bblk2 = (double*)(mem_block + (unsigned int)Bblk2);
    B1    = (double*)(mem_block + (unsigned int)B1);
    B2    = (double*)(mem_block + (unsigned int)B2);

    WORK  = (double*)(mem_block + (unsigned int)WORK);
    TAU   = (double*)(mem_block + (unsigned int)TAU);
    T     = (double*)(mem_block + (unsigned int)T);

    // Init log
    for (thread = 0; thread < THREADS; thread++)
        event_num[thread] = 0;


//  double *A1    = (double*)memalign(4096,  NxN*sizeof(double));
//  double *A2    = (double*)memalign(4096,  NxN*sizeof(double));


/*
    // Initialize A1 and A2 - Cholesky
    int m, n;
    for (m = 0; m < N; m++)
      for (n = 0; n <= m; n++)
      {
        A1[n*N+m] = A1[m*N+n] =
        A2[n*N+m] = A2[m*N+n] = 0.5 - (double)rand() / RAND_MAX;
      }

    // Make SPD for Cholesky
    int i;
    for (i = 0; i < N; i++)
    {
        A1[i*N+i] += sqrtf(N);
        A2[i*N+i] += sqrtf(N);
    }

//  Generate zero pivot form error check
//  A1[13*N+13] = 0.0;
//  A2[13*N+13] = 0.0;
*/


    int MN = M*N;
    int IONE = 1;
    int ISEED[4] = {0,0,0,1};
    dlarnv_(&IONE, ISEED, &MN, A1);

    // Initialize A1 and A2 - QR
    int m, n;
    for (m = 0; m < M; m++)
      for (n = 0; n < N; n++)
        A2[M*n+m] = A1[M*n+m]; // = 0.5 - (double)rand() / RAND_MAX;

    // Initialize B1 and B2
    for (m = 0; m < M; m++)
      for (n = 0; n < NRHS; n++)
      {
        B1[m+M*n] = B2[m+M*n] = 0.5 - (double)rand() / RAND_MAX;
      }
/*
    // Initialize B1 and B2
    memset(B1, 0, M*NRHS*sizeof(double));
    for (m = 0; m < min(M, NRHS); m++)
        B1[m+M*m] = B2[m+M*m] = 1.0;
*/
/*
    // LAPACK Cholesky
    dpotrf(lapack_const(PlasmaUpper), &N, A1, &N, &INFO);
    printf("INFO: %d\n\n", INFO);
*/

    // LAPACK QR
    int LWORK = MT*MT*NBNBSIZE+NT*NT*NBNBSIZE;
    //dgeqrf(&M, &N, A1, &M, TAU, WORK, &LWORK, &INFO);


    int LDA = M;
    int LDB = M;

    // Move from F77 to BDL
    //plasma_lapack_to_bdl(A2, Ablk2, M, N, LDA, NB, MT, NT, NBNBSIZE);
    //plasma_lapack_to_bdl(B2, Bblk2, M, NRHS, LDB, NB, MT, NTRHS, NBNBSIZE);

/*
    // Set input parameters
    plasma_cntrl.cores_num = THREADS;

    plasma_args.uplo = PlasmaUpper;
    plasma_args.M = M;
    plasma_args.N = N;
    plasma_args.NRHS = NRHS;
    plasma_args.NB = NB;
    plasma_args.NBNBSIZE = NBNBSIZE;
    plasma_args.IB = IB;
    plasma_args.MT = MT;
    plasma_args.NT = NT;
    plasma_args.NTRHS = NTRHS;
    plasma_args.A = Ablk2;
    plasma_args.B = Bblk2;
    plasma_args.T = T;
*/





    plasma_init(6000, 6000, 6000);

    plasma_DGELS(PlasmaNoTrans, M, N, NRHS, A2, LDA, B2, LDB);

    plasma_finalize();





/*
    // Move from BDL to F77
    plasma_bdl_to_lapack(Ablk2, A2, M, N, LDA, NB, MT, NT, NBNBSIZE);
    plasma_bdl_to_lapack(Bblk2, B2, M, NRHS, LDB, NB, MT, NTRHS, NBNBSIZE);
*/


    dgels_( "No Transpose", &M, &N, &NRHS, A1, &M, B1, &M, WORK, &LWORK, &INFO);





//  dormqr_(&Left, &Transpose, &M, &NRHS, &N, A1, &M,
//          TAU, B1, &M, WORK, &LWORK, &INFO);

//  dtrtrs_("Upper", "No transpose", "Non-unit", &N, &NRHS,
//           A2, &M, B2, &M, &INFO);

//  double ONE = 1.0;
//  dtrsm_( "Left", "Upper", "NoTrans", "Non-unit", &N, &NRHS, &ONE, A2, &M, B2, &M);


//    print_lmat(M, N, B1, 0);


//  dgeqrf(&M, &N, A2, &M, TAU, WORK, &LWORK, &INFO);

//  dpotrs(lapack_const(PlasmaUpper), &N, &NRHS, A1, &N, B1, &N, &INFO);

/*
    cblas_dtrsm(CblasColMajor, CblasLeft, CblasUpper,
          CblasTrans, CblasNonUnit,
          N, NRHS, 1.0, A1, N, B1, N);

    cblas_dtrsm(CblasColMajor, CblasLeft, CblasUpper,
               CblasNoTrans, CblasNonUnit,
               N, NRHS, 1.0, A1, N, B1, N);

    cblas_dtrsm(CblasColMajor, CblasLeft, CblasUpper,
          CblasTrans, CblasNonUnit,
          N, NRHS, 1.0, A2, N, B2, N);

    cblas_dtrsm(CblasColMajor, CblasLeft, CblasUpper,
               CblasNoTrans, CblasNonUnit,
               N, NRHS, 1.0, A2, N, B2, N);

    cblas_dtrsm(CblasColMajor, CblasLeft, CblasUpper,
          CblasTrans, CblasNonUnit,
          N, NRHS, 1.0, A1, N, B1, N);

    cblas_dtrsm(CblasColMajor, CblasLeft, CblasUpper,
               CblasNoTrans, CblasNonUnit,
               N, NRHS, 1.0, A1, N, B1, N);
*/


//  Cholesky
//  diff_matrix(A1, A2, NB, NT, NT,    N,    N);
//  diff_matrix(B1, B2, NB, NT, NTRHS, N, NRHS);




    // QR
    diff_matrix(A1, A2, NB, MT, NT, M, N);
    diff_matrix(B1, B2, NB, MT, NTRHS, M, NRHS);


    for (n = 0; n < NRHS; n++) {
      double temp = 0.0;
      for (m = N; m < M; m++)
        temp += B1[M*n+m]*B1[M*n+m];
      printf("%6.2lf", temp);
    }
    printf("\n");
    for (n = 0; n < NRHS; n++) {
      double temp = 0.0;
      for (m = N; m < M; m++)
        temp += B2[M*n+m]*B2[M*n+m];
      printf("%6.2lf", temp);
    }
    printf("\n");
/*
{
int mm, nn;
for (mm = 0; mm < 2*NB; mm++){
  for (nn = 0; nn < NB; nn++)
    printf("%6.2lf", A1[mm+nn*2*NB]);
  printf("\n");
  }
}


{
int mm, nn;
for (mm = 0; mm < 2*NB; mm++){
  for (nn = 0; nn < NB; nn++)
    printf("%6.2lf", A2[mm+nn*2*NB]);
  printf("\n");
  }
}
*/

//  dump_trace(THREADS, NB);
//  printf("\t%.2lf\t%.2lf\n", GFLOPS, GFLOPS / (2.393895 * 8 * THREADS) * 100.0);


}

//##################################################################################################

#include <unistd.h>

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Report a warning on stderr as "PLASMA warning: <func_name>(): <msg_text>"
 */
void plasma_warning(char *func_name, char* msg_text)
{
    FILE *out = stderr;

    fprintf(out, "PLASMA warning: %s(): %s\n", func_name, msg_text);
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Report an error on stderr as "PLASMA error: <func_name>(): <msg_text>"
 */
void plasma_error(char *func_name, char* msg_text)
{
    FILE *out = stderr;

    fprintf(out, "PLASMA error: %s(): %s\n", func_name, msg_text);
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Set PLASMA integer parameter
 *
 *  param: parameter to set; only PLASMA_CONCURRENCY is recognized
 *  value: new value; for PLASMA_CONCURRENCY it must be in 1..cores_max
 *
 *  Returns PLASMA_SUCCESS, PLASMA_ERR_NOT_INITIALIZED when PLASMA is not initialized,
 *  or PLASMA_ERR_ILLEGAL_VALUE for an unknown parameter or an out-of-range value.
 */
int plasma_set_int(PLASMA_enum param, int value)
{
    if (!plasma_cntrl.initialized) {
        /* Diagnostics name this function, matching plasma_get_int() */
        plasma_warning("plasma_set_int", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    switch (param) {
        case PLASMA_CONCURRENCY:
            if (value <= 0 || value > plasma_cntrl.cores_max) {
                plasma_warning("plasma_set_int", "illegal parameter value");
                return PLASMA_ERR_ILLEGAL_VALUE;
            }
            plasma_cntrl.cores_num = value;
            break;
        default:
            plasma_error("plasma_set_int", "illegal parameter value");
            return PLASMA_ERR_ILLEGAL_VALUE;
    }
    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Get PLASMA integer parameter
 *
 *  Returns the value of the requested parameter (only PLASMA_CONCURRENCY is
 *  recognized), PLASMA_ERR_NOT_INITIALIZED when PLASMA is not initialized, or
 *  PLASMA_ERR_ILLEGAL_VALUE for an unknown parameter. Values and error codes share
 *  the same return channel.
 */
int plasma_get_int(PLASMA_enum param)
{
    if (!plasma_cntrl.initialized) {
        plasma_warning("plasma_get_int", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }

    if (param == PLASMA_CONCURRENCY)
        return plasma_cntrl.cores_num;

    plasma_error("plasma_get_int", "illegal parameter value");
    return PLASMA_ERR_ILLEGAL_VALUE;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Allocate auxiliary structures WORK[cores_max][NB_max^2] and TAU[cores_max][NB_max]
 *
 *  Each structure is one memalign() block cut into cores_max slices. The slice size
 *  is rounded up to a multiple of CACHE_LINE_SIZE and the slices are spaced by that
 *  rounded size, so every core's slice starts on a cache line boundary (assuming
 *  CACHE_LINE_SIZE is a multiple of sizeof(double)).
 *
 *  Returns PLASMA_SUCCESS, or PLASMA_ERR_OUT_OF_MEMORY, in which case nothing stays
 *  allocated and plasma_aux is left unchanged.
 */
int plasma_alloc_aux_work_tau()
{
    int size_elems;
    int size_bytes;
    int work_stride;
    int tau_stride;
    size_t mem_size;
    double *work_block;
    double *tau_block;
    int core;

    /* Allocate cache line aligned workspace of size NB_max^2 for each core */
    size_elems = plasma_cntrl.NB_max * plasma_cntrl.NB_max;
    size_bytes = size_elems * sizeof(double);
    size_bytes = roundup(size_bytes, CACHE_LINE_SIZE);
    work_stride = size_bytes / sizeof(double);
    /* Total in size_t: per-core size times core count can exceed INT_MAX */
    mem_size = (size_t)size_bytes * plasma_cntrl.cores_max;

    work_block = (double*)memalign(CACHE_LINE_SIZE, mem_size);
    if (work_block == NULL) {
        plasma_error("plasma_alloc_aux_work_tau", "out of memory");
        return PLASMA_ERR_OUT_OF_MEMORY;
    }

    /* Allocate cache line aligned workspace of size NB_max for each core */
    size_elems = plasma_cntrl.NB_max;
    size_bytes = size_elems * sizeof(double);
    size_bytes = roundup(size_bytes, CACHE_LINE_SIZE);
    tau_stride = size_bytes / sizeof(double);
    mem_size = (size_t)size_bytes * plasma_cntrl.cores_max;

    tau_block = (double*)memalign(CACHE_LINE_SIZE, mem_size);
    if (tau_block == NULL) {
        /* Do not leak the WORK block when the second allocation fails */
        free(work_block);
        plasma_error("plasma_alloc_aux_work_tau", "out of memory");
        return PLASMA_ERR_OUT_OF_MEMORY;
    }

    /* Publish the per-core slices only once both allocations have succeeded */
    for (core = 0; core < plasma_cntrl.cores_max; core++) {
        plasma_aux.WORK[core] = work_block + (size_t)core * work_stride;
        plasma_aux.TAU[core] = tau_block + (size_t)core * tau_stride;
    }
    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Free auxiliary structures WORK[cores_max][NB_max^2] and TAU[cores_max][NB_max]
 */
int plasma_free_aux_work_tau()
{
    if (plasma_aux.WORK[0] == NULL) {
        plasma_error("plasma_free_aux_work_tau", "attempting to free null pointer");
        return PLASMA_ERR_UNALLOCATED;
    }
    free(plasma_aux.WORK[0]);

    if (plasma_aux.TAU[0] == NULL) {
        plasma_error("plasma_free_aux_work_tau", "attempting to free null pointer");
        return PLASMA_ERR_UNALLOCATED;
    }
    free(plasma_aux.TAU[0]);
    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Allocate auxiliary structure progress
 *
 *  Reserves room for at least size_elems ints, page aligned and rounded up to whole
 *  pages, and records the resulting capacity in plasma_cntrl.progress_size_elems.
 *
 *  Returns PLASMA_SUCCESS or PLASMA_ERR_OUT_OF_MEMORY.
 */
int plasma_alloc_aux_progress(PLASMA_long size_elems)
{
    PLASMA_long nbytes = size_elems * sizeof(int);
    int *table;

    nbytes = roundup(nbytes, plasma_cntrl.page_size);
    table = (int*)memalign(plasma_cntrl.page_size, nbytes);
    if (table != NULL) {
        plasma_aux.progress = table;
        plasma_cntrl.progress_size_elems = nbytes / sizeof(int);
        return PLASMA_SUCCESS;
    }

    plasma_error("plasma_alloc_aux_progress", "out of memory");
    return PLASMA_ERR_OUT_OF_MEMORY;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Free auxiliary structure progress
 *
 *  The pointer is reset to NULL after the release, so a repeated call is reported as
 *  PLASMA_ERR_UNALLOCATED instead of freeing the same memory twice.
 *
 *  Returns PLASMA_SUCCESS or PLASMA_ERR_UNALLOCATED.
 */
int plasma_free_aux_progress()
{
    if (plasma_aux.progress == NULL) {
        plasma_error("plasma_free_aux_progress", "attempting to free null pointer");
        return PLASMA_ERR_UNALLOCATED;
    }
    free((void*)plasma_aux.progress);
    plasma_aux.progress = NULL;
    plasma_cntrl.progress_size_elems = 0;
    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Clear auxiliary structure progress: store 'value' into its first 'size' entries
 */
void plasma_clear_aux_progress(int size, int value)
{
    volatile int *entry = plasma_aux.progress;
    int remaining;

    for (remaining = size; remaining > 0; remaining--)
        *entry++ = value;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Try to place the Block Data Layout storage in huge pages.
 *  Returns 1 when plasma_aux.bdl_mem now points to a huge page mapping, 0 otherwise.
 *  Capacity problems are only reported when 'warning' is set; open() and mmap()
 *  failures are always reported.
 */
static int plasma_alloc_bdl_huge(PLASMA_long size_elems, PLASMA_bool warning)
{
    PLASMA_long size_bytes;
    PLASMA_long byte;
    double *mem_block;
    int status;
    int fmem;

    if (plasma_cntrl.huge_pages_total <= 0) {
        if (warning)
            plasma_warning("plasma_alloc_aux_bdl", "huge pages not available");
        return 0;
    }

    size_bytes = size_elems * sizeof(double);
    size_bytes = roundup(size_bytes, plasma_cntrl.huge_page_size);

    /* Widen before multiplying: page count times page size can overflow an int
     * (this assumes PLASMA_long is wider than int) */
    if (size_bytes > (PLASMA_long)plasma_cntrl.huge_pages_total * plasma_cntrl.huge_page_size) {
        if (warning)
            plasma_warning("plasma_alloc_aux_bdl", "not enough total huge pages");
        return 0;
    }
    if (size_bytes > (PLASMA_long)plasma_cntrl.huge_pages_free * plasma_cntrl.huge_page_size) {
        if (warning)
            plasma_warning("plasma_alloc_aux_bdl", "not enough free huge pages");
        return 0;
    }

    fmem = open(HUGE_PAGE_FILE_NAME, O_CREAT | O_RDWR, 0755);
    if (fmem == -1) {
        plasma_warning("plasma_alloc_aux_bdl", "failed to open huge page file");
        return 0;
    }

    mem_block = (double*)mmap(0, size_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE, fmem, 0);
    if (mem_block == MAP_FAILED) {
        plasma_warning("plasma_alloc_aux_bdl", "mmap() failed");
        status = close(fmem);
        if (status != 0) {
            plasma_error("plasma_alloc_aux_bdl", "close() failed");
        }
        else {
            status = remove(HUGE_PAGE_FILE_NAME);
            if (status != 0)
                plasma_error("plasma_alloc_aux_bdl", "remove() failed");
        }
        return 0;
    }

    plasma_aux.bdl_mem = mem_block;
    plasma_cntrl.huge_page_fmem = fmem;
    plasma_cntrl.bdl_huge_pages = PLASMA_TRUE;
    plasma_cntrl.bdl_size_elems = size_bytes / sizeof(double);
    /* Fault-in huge pages */
    for (byte = 0; byte < size_bytes; byte += plasma_cntrl.huge_page_size)
        ((char*)mem_block)[byte] = 0;
    return 1;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Allocate storage for Block Data Layout
 *
 *  size_elems: requested capacity in doubles
 *  warning:    when set, report why huge pages could not be used
 *
 *  Huge pages are tried first; if that fails the storage comes from memalign() in
 *  standard pages. On success plasma_aux.bdl_mem, plasma_cntrl.bdl_huge_pages and
 *  plasma_cntrl.bdl_size_elems describe the allocation.
 *
 *  Returns PLASMA_SUCCESS or PLASMA_ERR_OUT_OF_MEMORY.
 */
int plasma_alloc_aux_bdl(PLASMA_long size_elems, PLASMA_bool warning)
{
    PLASMA_long size_bytes;
    double *mem_block;

    /* Attempt allocation in huge pages */
    if (plasma_alloc_bdl_huge(size_elems, warning))
        return PLASMA_SUCCESS;
    if (warning)
        plasma_warning("plasma_alloc_aux_bdl", "failed to allocate huge pages");

    /* Allocate in standard pages */
    plasma_cntrl.bdl_huge_pages = PLASMA_FALSE;
    size_bytes = size_elems * sizeof(double);
    size_bytes = roundup(size_bytes, plasma_cntrl.page_size);
    mem_block = (double*)memalign(plasma_cntrl.page_size, size_bytes);
    if (mem_block == NULL) {
        /* Report the failure like the other allocators in this file do */
        plasma_error("plasma_alloc_aux_bdl", "out of memory");
        return PLASMA_ERR_OUT_OF_MEMORY;
    }
    plasma_aux.bdl_mem = mem_block;
    plasma_cntrl.bdl_size_elems = size_bytes / sizeof(double);
    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Free storage of Block Data Layout
 *
 *  Huge page storage is unmapped, then its file is closed and removed; standard
 *  storage is released with free(). The pointer is reset to NULL afterwards, so a
 *  repeated call is reported as PLASMA_ERR_UNALLOCATED.
 *
 *  Returns PLASMA_SUCCESS or PLASMA_ERR_UNALLOCATED. Failures of munmap(), close() or
 *  remove() are reported through plasma_error() but still return PLASMA_SUCCESS.
 */
int plasma_free_aux_bdl()
{
    int status;

    if (plasma_cntrl.bdl_huge_pages) {
        /* bdl_mem starts out as NULL and is never set to MAP_FAILED by the allocator,
         * so NULL is the state that actually marks "nothing mapped" */
        if (plasma_aux.bdl_mem == NULL || plasma_aux.bdl_mem == MAP_FAILED) {
            plasma_error("plasma_free_aux_bdl", "attempting to free unmapped memory");
            return PLASMA_ERR_UNALLOCATED;
        }
        status = munmap(plasma_aux.bdl_mem, plasma_cntrl.bdl_size_elems * sizeof(double));
        if (status != 0) {
            plasma_error("plasma_free_aux_bdl", "munmap() failed");
        }
        else {
            status = close(plasma_cntrl.huge_page_fmem);
            if (status != 0) {
                plasma_error("plasma_free_aux_bdl", "close() failed");
            }
            else {
                status = remove(HUGE_PAGE_FILE_NAME);
                if (status != 0)
                    plasma_error("plasma_free_aux_bdl", "remove() failed");
            }
        }
    }
    else {
        if (plasma_aux.bdl_mem == NULL) {
            plasma_error("plasma_free_aux_bdl", "attempting to free null pointer");
            return PLASMA_ERR_UNALLOCATED;
        }
        free(plasma_aux.bdl_mem);
    }
    plasma_aux.bdl_mem = NULL;
    plasma_cntrl.bdl_size_elems = 0;
    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Parallel (per-core) initialization: record the calling thread as the owner of
 *  core my_core_id so that plasma_self() can find it later. Always succeeds.
 */
int plasma_parallel_init(int my_core_id)
{
    pthread_t self = pthread_self();

    plasma_cntrl.thread_id[my_core_id] = self;
    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Returns core id
 *
 *  Looks the calling thread up among the first cores_num entries recorded by
 *  plasma_parallel_init(). Returns the core id, or PLASMA_ERR_NOT_FOUND when the
 *  calling thread is not registered.
 */
int plasma_self()
{
    int core;
    pthread_t thread_id;

    thread_id = pthread_self();
    for (core = 0; core < plasma_cntrl.cores_num; core++) {
        /* pthread_t is an opaque type: compare with pthread_equal(), not with == */
        if (pthread_equal(plasma_cntrl.thread_id[core], thread_id))
            return core;
    }
    return PLASMA_ERR_NOT_FOUND;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Main thread control
 *
 *  Body of a PLASMA thread. core_id points to an int holding this thread's core id.
 *  After per-core initialization the thread loops: wait until plasma_cntrl.action
 *  leaves PLASMA_ACT_STAND_BY, synchronize with the other cores, run the requested
 *  parallel routine with the operands in plasma_args, synchronize again.
 *
 *  Returns NULL when PLASMA_ACT_FINALIZE is received, or the initialization status
 *  converted to a pointer when plasma_parallel_init() fails.
 */
void *plasma_parallel_section(void *core_id)
{
    int my_core_id = *((int*)core_id);
    int status;
    PLASMA_enum action;

    /* Parallel initializations */
    status = plasma_parallel_init(my_core_id);
    if (status != PLASMA_SUCCESS) {
        plasma_error("plasma_parallel_section", "plasma_parallel_init() failed");
        /* NOTE(review): int converted to a pointer of possibly different width */
        return (void*)status;
    }
    plasma_barrier(my_core_id, plasma_cntrl.cores_num);

    while(1) {
        /* Sleep until an action is posted; the action is sampled under the mutex */
        pthread_mutex_lock(&plasma_cntrl.action_mutex);
        while ((action = plasma_cntrl.action) == PLASMA_ACT_STAND_BY)
            pthread_cond_wait(&plasma_cntrl.action_condt, &plasma_cntrl.action_mutex);
        pthread_mutex_unlock(&plasma_cntrl.action_mutex);
        /* Nobody starts working before every core has picked up the action.
         * NOTE(review): the code that sets the action back to PLASMA_ACT_STAND_BY
         * is not in the visible part of the file */
        plasma_barrier(my_core_id, plasma_cntrl.cores_num);

        switch (action) {
            case PLASMA_ACT_DGEQRF:
                plasma_pDGEQRF(plasma_args.M, plasma_args.N, plasma_args.A,
                               plasma_args.NB, plasma_args.NBNBSIZE, plasma_args.IBNBSIZE,
                               plasma_args.IB, plasma_args.MT, plasma_args.NT, plasma_args.T,
                               &plasma_args.INFO, plasma_cntrl.cores_num, my_core_id);
                break;
            case PLASMA_ACT_DORMQR:
                plasma_pDORMQR(plasma_args.M, plasma_args.NRHS, plasma_args.N,
                               plasma_args.A, plasma_args.NB, plasma_args.NBNBSIZE,
                               plasma_args.IBNBSIZE, plasma_args.IB, plasma_args.MT,
                               plasma_args.NTRHS, plasma_args.NT, plasma_args.T,
                               plasma_args.B, &plasma_args.INFO, plasma_cntrl.cores_num,
                               my_core_id);
                break;
            case PLASMA_ACT_DTRSM:
                /* side, uplo, trans and diag are fixed here; plasma_args.side/uplo/trans are not read */
                plasma_pDTRSM(PlasmaLeft, PlasmaUpper, PlasmaNoTrans, PlasmaNonUnit,
                              plasma_args.N, plasma_args.NRHS, 1.0, plasma_args.A,
                              plasma_args.NB, plasma_args.NBNBSIZE, plasma_args.NT,
                              plasma_args.MT, plasma_args.B, plasma_args.NTRHS,
                              plasma_cntrl.cores_num, my_core_id);
                break;
            case PLASMA_ACT_FINALIZE:
                /* Leaves without taking part in the trailing barrier below */
                return NULL;
            default:
                break;
        }
        /* Every core has finished the routine before the next action is awaited */
        plasma_barrier(my_core_id, plasma_cntrl.cores_num);
    }
    return NULL;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Select tiling parameters
 *
 *  Sets plasma_cntrl.NB and plasma_cntrl.IB to fixed values (6 and 3; func, M, N and
 *  NRHS are ignored at present) and derives NBNBSIZE and IBNBSIZE as the byte size
 *  of an NB x NB resp. IB x NB tile of doubles, rounded up to the cache line size.
 *  NOTE(review): main() in this file keeps its own NBNBSIZE in elements (it divides
 *  by sizeof(double) after rounding) - confirm which unit the kernels expect.
 *
 *  Always returns PLASMA_SUCCESS.
 */
int plasma_tune(PLASMA_enum func, int M, int N, int NRHS)
{
    const int line_size = plasma_cntrl.cache_line_size;
    int tile_bytes;

    plasma_cntrl.NB = 6;
    plasma_cntrl.IB = 3;

    /* A, B tiles: NB x NB doubles, padded to whole cache lines */
    tile_bytes = plasma_cntrl.NB * plasma_cntrl.NB * sizeof(double);
    plasma_cntrl.NBNBSIZE = roundup(tile_bytes, line_size);

    /* T tiles: IB x NB doubles, padded to whole cache lines */
    tile_bytes = plasma_cntrl.IB * plasma_cntrl.NB * sizeof(double);
    plasma_cntrl.IBNBSIZE = roundup(tile_bytes, line_size);

    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  PLASMA counterpart of LAPACK's DGELS driver; only trans == PlasmaNoTrans is supported
 */
int plasma_DGELS(PLASMA_enum trans, int M, int N, int NRHS, double *A,
                 int LDA, double *B, int LDB)
{
    int NB, MT, NT, NTRHS;
    int status;
    double *Abdl;
    double *Bbdl;
    double *Tbdl;
    double *bdl_mem;
    PLASMA_long size_elems;

    /* Check input arguments */
    if (trans != PlasmaNoTrans) {
        plasma_error("plasma_DGELS", "only PlasmaNoTrans supported");
        return PLASMA_ERR_NOT_SUPPORTED;
    }
    if (M < 0) {
        plasma_error("plasma_DGELS", "illegal value of M");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (N < 0) {
        plasma_error("plasma_DGELS", "illegal value of N");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (NRHS < 0) {
        plasma_error("plasma_DGELS", "illegal value of NRHS");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (LDA < max(1, M)) {
        plasma_error("plasma_DGELS", "illegal value of LDA");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    if (LDB < max(1, max(M, N))) {
        plasma_error("plasma_DGELS", "illegal value of LDB");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }
    /* Quick return - currently NOT equivalent to LAPACK's:
     * CALL DLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) */
    if (min(M, min(N, NRHS)) == 0)
        return PLASMA_SUCCESS;

    /* Tune NB & IB depending on M, N & NRHS; Set NBNBSIZE */
    status = plasma_tune(PLASMA_TUNE_DGELS, M, N, NRHS);
    if (status != PLASMA_SUCCESS) {
        plasma_error("plasma_DGELS", "plasma_tune() failed");
        return status;
    }

    /* Set MT, NT & NTRHS */
    NB = plasma_cntrl.NB;
    NT = (N%NB==0) ? (N/NB) : (N/NB+1);
    MT = (M%NB==0) ? (M/NB) : (M/NB+1);
    NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);

    /* If NB larger than NB_max, set NB_max to NB, reallocate WORK & TAU */
    if (plasma_cntrl.NB > plasma_cntrl.NB_max) {
        status = plasma_free_aux_work_tau();
        if (status != PLASMA_SUCCESS) {
            plasma_error("plasma_DGELS", "plasma_free_aux_work_tau() failed");
            return status;
        }
        plasma_cntrl.NB_max = plasma_cntrl.NB;
        status = plasma_alloc_aux_work_tau();
        if (status != PLASMA_SUCCESS) {
            plasma_error("plasma_DGELS", "plasma_alloc_aux_work_tau() failed");
            return status;
        }
    }

    /* If progress table too small, reallocate */
    size_elems = max(MT*max(NT, NTRHS), NT*NTRHS);
    if (plasma_cntrl.progress_size_elems < size_elems) {
        status = plasma_free_aux_progress();
        if (status != PLASMA_SUCCESS) {
            plasma_error("plasma_DGELS", "plasma_free_aux_progress() failed");
        }
        status = plasma_alloc_aux_progress(size_elems);
        if (status != PLASMA_SUCCESS) {
            plasma_error("plasma_DGELS", "plasma_alloc_aux_progress");
            return status;
        }
    }

    /* Assign arrays to BDL storage */
    bdl_mem = plasma_aux.bdl_mem;
    Abdl = bdl_mem; bdl_mem += MT*NT*plasma_cntrl.NBNBSIZE;
    Tbdl = bdl_mem; bdl_mem += MT*NT*plasma_cntrl.IBNBSIZE;
    Bbdl = bdl_mem; bdl_mem += MT*NTRHS*plasma_cntrl.NBNBSIZE;
    /* If BDL storage too small, reallocate & reassign */
    size_elems = bdl_mem - plasma_aux.bdl_mem;
    if (plasma_cntrl.bdl_size_elems < size_elems) {
        status = plasma_free_aux_bdl();
        if (status != PLASMA_SUCCESS) {
            plasma_error("plasma_DGELS", "plasma_free_aux_bdl() failed");
            return status;
        }
        status = plasma_alloc_aux_bdl(size_elems, PLASMA_TRUE);
        if (status != PLASMA_SUCCESS) {
            plasma_error("plasma_DGELS", "plasma_alloc_aux_bdl() failed");
            return status;
        }
        bdl_mem = plasma_aux.bdl_mem;
        Abdl = bdl_mem; bdl_mem += MT*NT*plasma_cntrl.NBNBSIZE;
        Tbdl = bdl_mem; bdl_mem += MT*NT*plasma_cntrl.IBNBSIZE;
        Bbdl = bdl_mem; bdl_mem += MT*NTRHS*plasma_cntrl.NBNBSIZE;
    }

    /* Convert arrays from LAPACK to BDL */
    plasma_lapack_to_bdl(A, Abdl, M, N, LDA,
        plasma_cntrl.NB, MT, NT, plasma_cntrl.NBNBSIZE);
    plasma_lapack_to_bdl(B, Bbdl, M, NRHS, LDB,
         plasma_cntrl.NB, MT, NTRHS, plasma_cntrl.NBNBSIZE);

    /* Call parallel DGEQRF */
    /* Set arguments */
    plasma_args.M = M;
    plasma_args.N = N;
    plasma_args.A = Abdl;
    plasma_args.NB = plasma_cntrl.NB;
    plasma_args.NBNBSIZE = plasma_cntrl.NBNBSIZE;
    plasma_args.IBNBSIZE = plasma_cntrl.IBNBSIZE;
    plasma_args.IB = plasma_cntrl.IB;
    plasma_args.MT = MT;
    plasma_args.NT = NT;
    plasma_args.T = Tbdl;
    /* Clear progress table */
    plasma_clear_aux_progress(MT*NT, -1);
    /* Signal workers */
    pthread_mutex_lock(&plasma_cntrl.action_mutex);
    plasma_cntrl.action = PLASMA_ACT_DGEQRF;
    pthread_mutex_unlock(&plasma_cntrl.action_mutex);
    pthread_cond_broadcast(&plasma_cntrl.action_condt);
    /* Call for master */
    plasma_barrier(0, plasma_cntrl.cores_num);
    plasma_cntrl.action = PLASMA_ACT_STAND_BY;
    plasma_pDGEQRF(plasma_args.M, plasma_args.N, plasma_args.A, plasma_args.NB,
                    plasma_args.NBNBSIZE, plasma_args.IBNBSIZE, plasma_args.IB,
                    plasma_args.MT, plasma_args.NT, plasma_args.T, &plasma_args.INFO,
                    plasma_cntrl.cores_num, 0);
    plasma_barrier(0, plasma_cntrl.cores_num);

    /* Call parallel DORMQR */
    /* Set arguments */
    plasma_args.M = M;
    plasma_args.N = N;
    plasma_args.NRHS = NRHS;
    plasma_args.A = Abdl;
    plasma_args.NB = plasma_cntrl.NB;
    plasma_args.NBNBSIZE = plasma_cntrl.NBNBSIZE;
    plasma_args.IBNBSIZE = plasma_cntrl.IBNBSIZE;
    plasma_args.IB = plasma_cntrl.IB;
    plasma_args.MT = MT;
    plasma_args.NTRHS = NTRHS;
    plasma_args.NT = NT;
    plasma_args.T = Tbdl;
    plasma_args.B = Bbdl;
    /* Clear progress table */
    plasma_clear_aux_progress(MT*NTRHS, -1);
    /* Signal workers */
    pthread_mutex_lock(&plasma_cntrl.action_mutex);
    plasma_cntrl.action = PLASMA_ACT_DORMQR;
    pthread_mutex_unlock(&plasma_cntrl.action_mutex);
    pthread_cond_broadcast(&plasma_cntrl.action_condt);
    /* Call for master */
    plasma_barrier(0, plasma_cntrl.cores_num);
    plasma_cntrl.action = PLASMA_ACT_STAND_BY;
    plasma_pDORMQR(plasma_args.M, plasma_args.NRHS, plasma_args.N, plasma_args.A,
                   plasma_args.NB, plasma_args.NBNBSIZE, plasma_args.IBNBSIZE,
                   plasma_args.IB, plasma_args.MT, plasma_args.NTRHS, plasma_args.NT,
                   plasma_args.T, plasma_args.B, &plasma_args.INFO, plasma_cntrl.cores_num, 0);
    plasma_barrier(0, plasma_cntrl.cores_num);

    /* Call parallel DTRSM */
    /* Set arguments */
    plasma_args.N = N;
    plasma_args.NRHS = NRHS;
    plasma_args.A = Abdl;
    plasma_args.NB = plasma_cntrl.NB;
    plasma_args.NBNBSIZE = plasma_cntrl.NBNBSIZE;
    plasma_args.IBNBSIZE = plasma_cntrl.IBNBSIZE;
    plasma_args.NT = NT;
    plasma_args.MT = MT;
    plasma_args.B = Bbdl;
    plasma_args.NTRHS = NTRHS;
    /* Clear progress table */
    plasma_clear_aux_progress(NT*NTRHS, -1);
    /* Signal workers */
    pthread_mutex_lock(&plasma_cntrl.action_mutex);
    plasma_cntrl.action = PLASMA_ACT_DTRSM;
    pthread_mutex_unlock(&plasma_cntrl.action_mutex);
    pthread_cond_broadcast(&plasma_cntrl.action_condt);
    /* Call for master */
    plasma_barrier(0, plasma_cntrl.cores_num);
    plasma_cntrl.action = PLASMA_ACT_STAND_BY;
    plasma_pDTRSM(PlasmaLeft, PlasmaUpper, PlasmaNoTrans, PlasmaNonUnit, plasma_args.N,
                  plasma_args.NRHS, 1.0, plasma_args.A, plasma_args.NB, plasma_args.NBNBSIZE,
                  plasma_args.NT, plasma_args.MT, plasma_args.B, plasma_args.NTRHS,
                  plasma_cntrl.cores_num, 0);
    plasma_barrier(0, plasma_cntrl.cores_num);

    /* Convert arrays from BDL to LAPACK */
    plasma_bdl_to_lapack(Abdl, A, M, N, LDA,
        plasma_cntrl.NB, MT, NT, plasma_cntrl.NBNBSIZE);
    plasma_bdl_to_lapack(Bbdl, B, M, NRHS, LDB,
        plasma_cntrl.NB, MT, NTRHS, plasma_cntrl.NBNBSIZE);

    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Run a shell command and capture the first line of its output
 *
 *  cmd      - command line handed to popen()
 *  out      - buffer receiving the first output line (as written by fgets())
 *  out_size - size of out in bytes
 *
 *  Returns PLASMA_SHCMD_OK on success, PLASMA_SHCMD_POPEN_FAILED if the pipe could not
 *  be opened, PLASMA_SHCMD_PCLOSE_FAILED if pclose() returned -1, and
 *  PLASMA_SHCMD_NO_OUTPUT if the command produced no line. A pclose() failure takes
 *  precedence over missing output.
 */
enum {
    PLASMA_SHCMD_OK = 0,
    PLASMA_SHCMD_POPEN_FAILED,
    PLASMA_SHCMD_NO_OUTPUT,
    PLASMA_SHCMD_PCLOSE_FAILED
};

static int plasma_shell_first_line(const char *cmd, char *out, int out_size)
{
    FILE *shell_cmd_fp;
    char *fgets_status;

    shell_cmd_fp = popen(cmd, "r");
    /* Must be checked before the stream is handed to fgets()/pclose() */
    if (shell_cmd_fp == NULL)
        return PLASMA_SHCMD_POPEN_FAILED;

    fgets_status = fgets(out, out_size, shell_cmd_fp);
    if (pclose(shell_cmd_fp) == -1)
        return PLASMA_SHCMD_PCLOSE_FAILED;
    if (fgets_status == NULL)
        return PLASMA_SHCMD_NO_OUTPUT;
    return PLASMA_SHCMD_OK;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  Get memory information
 *
 *  Fills plasma_cntrl.cache_line_size (from CACHE_LINE_SIZE), page_size (from sysconf)
 *  and the huge page fields huge_pages_total, huge_pages_free and huge_page_size, which
 *  are parsed from /proc/meminfo through shell pipelines. If /proc/meminfo has no line
 *  matching "Huge", the three huge page fields are set to 0.
 *
 *  Returns PLASMA_SUCCESS, PLASMA_ERR_NOT_FOUND (page size or a huge page value could
 *  not be obtained), PLASMA_ERR_NOT_SUPPORTED (first popen() failed),
 *  PLASMA_ERR_FILESYSTEM (first pclose() failed) or PLASMA_ERR_ILLEGAL_VALUE
 *  (non-positive huge page size).
 */
int plasma_get_meminfo()
{
    char sh_cmd_out[SHELL_CMD_OUT_MAX];
    int cmd_status;

    /* Set cache line size */
    plasma_cntrl.cache_line_size = CACHE_LINE_SIZE;

    /* Get standard page size */
    plasma_cntrl.page_size = sysconf(_SC_PAGESIZE);
    if (plasma_cntrl.page_size <= 0) {
        plasma_error("plasma_get_meminfo", "failed to get page size");
        return PLASMA_ERR_NOT_FOUND;
    }

    /* Find out if huge pages are supported */
    cmd_status = plasma_shell_first_line("cat /proc/meminfo | grep Huge",
                                         sh_cmd_out, SHELL_CMD_OUT_MAX);
    if (cmd_status == PLASMA_SHCMD_POPEN_FAILED) {
        plasma_error("plasma_get_meminfo", "popen() failed");
        return PLASMA_ERR_NOT_SUPPORTED;
    }
    if (cmd_status == PLASMA_SHCMD_PCLOSE_FAILED) {
        plasma_error("plasma_get_meminfo", "pclose() failed");
        return PLASMA_ERR_FILESYSTEM;
    }
    if (cmd_status == PLASMA_SHCMD_NO_OUTPUT) {
        /* No huge page entries at all */
        plasma_cntrl.huge_pages_total = 0;
        plasma_cntrl.huge_pages_free = 0;
        plasma_cntrl.huge_page_size = 0;
        return PLASMA_SUCCESS;
    }

    /* Get total number of huge pages */
    cmd_status = plasma_shell_first_line(
        "cat /proc/meminfo | awk '/HugePages_Total/ {print $2}'",
        sh_cmd_out, SHELL_CMD_OUT_MAX);
    if (cmd_status != PLASMA_SHCMD_OK) {
        plasma_error("plasma_get_meminfo", "failed to get total huge pages");
        return PLASMA_ERR_NOT_FOUND;
    }
    plasma_cntrl.huge_pages_total = atoi(sh_cmd_out);

    /* Get number of free huge pages */
    cmd_status = plasma_shell_first_line(
        "cat /proc/meminfo | awk '/HugePages_Free/ {print $2}'",
        sh_cmd_out, SHELL_CMD_OUT_MAX);
    if (cmd_status != PLASMA_SHCMD_OK) {
        plasma_error("plasma_get_meminfo", "failed to get free huge pages");
        return PLASMA_ERR_NOT_FOUND;
    }
    plasma_cntrl.huge_pages_free = atoi(sh_cmd_out);

    /* Get huge page size; the parsed value is multiplied by 1024 */
    cmd_status = plasma_shell_first_line(
        "cat /proc/meminfo | awk '/Hugepagesize/ {print $2}'",
        sh_cmd_out, SHELL_CMD_OUT_MAX);
    if (cmd_status != PLASMA_SHCMD_OK) {
        plasma_error("plasma_get_meminfo", "failed to get huge page size");
        return PLASMA_ERR_NOT_FOUND;
    }
    plasma_cntrl.huge_page_size = atoi(sh_cmd_out) * 1024;
    if (plasma_cntrl.huge_page_size <= 0) {
        plasma_error("plasma_get_meminfo", "invalid huge page size");
        return PLASMA_ERR_ILLEGAL_VALUE;
    }

    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  PLASMA initialization
 */
int plasma_init(int M, int N, int NRHS)
{
    PLASMA_bool huge_page_warning = PLASMA_TRUE;
    PLASMA_long size_elems;
    PLASMA_long size_bytes;
    int NB, IB, MT, NT, NTRHS;
    int status;
    int core;

    /* Check if not initialized */
    if (plasma_cntrl.initialized) {
        plasma_warning("plasma_init", "PLASMA re-initialized");
        return PLASMA_ERR_REINITIALIZED;
    }

    /* Get system size (number of cores)
       Set number of cores to system size */
    plasma_cntrl.cores_max = sysconf(_SC_NPROCESSORS_ONLN);
    plasma_cntrl.cores_num = plasma_cntrl.cores_max;
    if (plasma_cntrl.cores_max <= 0) {
        plasma_error("plasma_init", "failed to get system size");
        return PLASMA_ERR_NOT_FOUND;
    }
    /* Check if not more cores than the hard limit */
    if (plasma_cntrl.cores_max > CORES_MAX) {
        plasma_error("plasma_init", "not supporting so many cores");
        return PLASMA_ERR_INTERNAL_LIMIT;
    }

    status = plasma_get_meminfo();
    if (status != PLASMA_SUCCESS) {
        plasma_error("plasma_init", "plasma_get_meminfo() failed");
    }

    /* Allocate temporary kernel workspace */
    status = plasma_alloc_aux_work_tau();
    if (status != PLASMA_SUCCESS) {
        plasma_error("plasma_init", "plasma_alloc_work_tau() failed");
        return status;
    }

    /* Allocate progress table using hinted problem size values
     * On failure recursively decrease the size by 25% */
    NB = plasma_cntrl.NB_min;
    MT = (M%NB==0) ? (M/NB) : (M/NB+1);
    NT = (N%NB==0) ? (N/NB) : (N/NB+1);
    NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
    size_elems = MT*max(NT, NTRHS);
    do {
        status = plasma_alloc_aux_progress(size_elems);
        if (status != PLASMA_SUCCESS) {
            size_elems = size_elems / 4 * 3;
            if (size_elems == 0) {
                plasma_error("plasma_init", "plasma_alloc_aux_progress() failed");
                return PLASMA_ERR_OUT_OF_MEMORY;
            }
        }
    }
    while (status != PLASMA_SUCCESS);

    /* Allocate bdl memory using hinted problem size values
     * On failure recursively decrease the size by 25% */
    NB = plasma_cntrl.NB_max;
    IB = plasma_cntrl.IB_max;
    MT = (M%NB==0) ? (M/NB) : (M/NB+1);
    NT = (N%NB==0) ? (N/NB) : (N/NB+1);
    NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
    size_elems  = (MT*NT + MT*NTRHS)*NB*NB + (MT*NT)*IB*NB;
    size_bytes = size_elems * sizeof(double);
    /* Warn if free huge pages is less than total */
    if (plasma_cntrl.huge_pages_free < plasma_cntrl.huge_pages_total)
        plasma_warning("plasma_init", "not all huge pages are free");
    /* If huge pages available but more requested than free
     * attempt to allocate the number of huge pages free */
    if (plasma_cntrl.huge_pages_free > 0) {
        size_bytes = min(size_bytes, plasma_cntrl.huge_pages_free * plasma_cntrl.huge_page_size);
        size_elems = size_bytes / sizeof(double);
    }
    do {
        status = plasma_alloc_aux_bdl(size_elems, huge_page_warning);
        if (status != PLASMA_SUCCESS) {
            huge_page_warning = PLASMA_FALSE;
            size_elems = size_elems / 4 * 3;
            if (size_elems == 0) {
                plasma_error("plasma_init", "plasma_alloc_aux_bld() failed");
                return PLASMA_ERR_OUT_OF_MEMORY;
            }
        }
    }
    while (status != PLASMA_SUCCESS);

    /* Initialize barrier */
    plasma_barrier_init();

    /* Initialize default thread attributes */
    status = pthread_attr_init(&plasma_cntrl.core_attr);
    if (status != 0) {
        plasma_error("plasma_init()", "pthread_attr_init() failed");
        return status;
    }

    /* Set scope to system */
    status = pthread_attr_setscope(&plasma_cntrl.core_attr, PTHREAD_SCOPE_SYSTEM);
    if (status != 0) {
        plasma_error("plasma_init()", "pthread_attr_setscope() failed");
        return status;
    }

    /* Set concurrency */
    status = pthread_setconcurrency(plasma_cntrl.cores_num);
    if (status != 0) {
        plasma_error("plasma_init()", "pthread_setconcurrency() failed");
        return status;
    }

    /*  Launch threads */
    for (core = 1; core < plasma_cntrl.cores_num; core++) {
        plasma_cntrl.core_num[core] = core;
        pthread_create(
            &plasma_cntrl.core_id[core],
            &plasma_cntrl.core_attr,
             plasma_parallel_section,
            &plasma_cntrl.core_num[core]);
    }
    plasma_cntrl.core_num[0] = 0;

    /* Parallel initializations for core 0 */
    status = plasma_parallel_init(0);
    if (status != PLASMA_SUCCESS) {
        plasma_error("plasma_init", "plasma_parallel_init() failed");
        return status;
    }

    plasma_barrier(0, plasma_cntrl.cores_num);
    plasma_cntrl.initialized = PLASMA_TRUE;
    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 *  PLASMA completion
 *
 *  Broadcasts PLASMA_ACT_FINALIZE to the workers, meets them at the barrier, joins
 *  every worker thread, destroys the thread attributes and releases the BDL storage,
 *  the progress table and the kernel workspace.
 *
 *  Returns PLASMA_SUCCESS, PLASMA_ERR_NOT_INITIALIZED if PLASMA was not initialized, or
 *  the non-zero code of a failing pthread_join() / pthread_attr_destroy(). Failures
 *  while releasing memory are reported but do not change the return value.
 */
int plasma_finalize()
{
    int worker;
    int rc;
    void *worker_ret;

    /* Nothing to tear down unless plasma_init() completed */
    if (!plasma_cntrl.initialized) {
        plasma_warning("plasma_finalize", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }

    /* Publish the termination action and wake the workers */
    pthread_mutex_lock(&plasma_cntrl.action_mutex);
    plasma_cntrl.action = PLASMA_ACT_FINALIZE;
    pthread_mutex_unlock(&plasma_cntrl.action_mutex);
    pthread_cond_broadcast(&plasma_cntrl.action_condt);

    /* Meet the workers at the barrier, then reset the action */
    plasma_barrier(0, plasma_cntrl.cores_num);
    plasma_cntrl.action = PLASMA_ACT_STAND_BY;

    /* Wait for every worker (cores 1 .. cores_num-1) to exit */
    worker = 1;
    while (worker < plasma_cntrl.cores_num) {
        rc = pthread_join(plasma_cntrl.core_id[worker], &worker_ret);
        if (rc != 0) {
            plasma_error("plasma_finalize", "pthread_join() failed");
            return rc;
        }
        worker++;
    }

    /* Thread attributes are no longer needed */
    rc = pthread_attr_destroy(&plasma_cntrl.core_attr);
    if (rc != 0) {
        plasma_error("plasma_finalize()", "pthread_attr_destroy() failed");
        return rc;
    }

    /* Release auxiliary memory; failures are logged and teardown continues */
    rc = plasma_free_aux_bdl();
    if (rc != PLASMA_SUCCESS)
        plasma_error("plasma_finalize", "plasma_free_aux_bdl() failed");

    rc = plasma_free_aux_progress();
    if (rc != PLASMA_SUCCESS)
        plasma_error("plasma_finalize", "plasma_free_aux_progress() failed");

    rc = plasma_free_aux_work_tau();
    if (rc != PLASMA_SUCCESS)
        plasma_error("plasma_finalize", "plasma_free_aux_work_tau() failed");

    plasma_cntrl.initialized = PLASMA_FALSE;
    return PLASMA_SUCCESS;
}

/*////////////////////////////////////////////////////////////////////////////////////////
 */

/* Tile kernel: forwards to dpotrf, translating uplo with lapack_const() and passing
 * the scalar arguments by address. The callee's status is stored through INFO. */
void core_DPOTRF(int uplo, int N, double *A, int LDA, int *INFO)
{
    int order = N;
    int lda = LDA;

    dpotrf(lapack_const(uplo), &order, A, &lda, INFO);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/* Tile kernel: column-major cblas_dsyrk. The PLASMA uplo/trans values are cast
 * directly to the CBLAS enum types. */
void core_DSYRK(int uplo, int trans, int N, int K, double alpha, double *A,
               int LDA, double beta, double *C, int LDC)
{
    cblas_dsyrk(CblasColMajor, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans,
                N, K, alpha, A, LDA, beta, C, LDC);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/* Tile kernel: column-major cblas_dtrsm. The PLASMA side/uplo/transA/diag values are
 * cast directly to the CBLAS enum types. */
void core_DTRSM(int side, int uplo, int transA, int diag, int M, int N,
                double alpha, double *A, int LDA, double *B, int LDB)
{
    cblas_dtrsm(CblasColMajor,
                (CBLAS_SIDE)side, (CBLAS_UPLO)uplo,
                (CBLAS_TRANSPOSE)transA, (CBLAS_DIAG)diag,
                M, N, alpha, A, LDA, B, LDB);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/* Tile kernel: column-major cblas_dgemm. The PLASMA transA/transB values are cast
 * directly to the CBLAS enum type. */
void core_DGEMM(int transA, int transB, int M, int N, int K, double alpha,
                double *A, int LDA, double *B, int LDB, double beta, double *C,
                int LDC)
{
    cblas_dgemm(CblasColMajor,
                (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
                M, N, K,
                alpha, A, LDA, B, LDB,
                beta, C, LDC);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/* Tile addressing for plasma_pDTRSM: tile (m,n) of A or B starts NBNBSIZE*(m + MT*(n))
 * elements into the array; the progress table is indexed with leading dimension NT. */
#define A(m,n) &A[NBNBSIZE*(m)+NBNBSIZE*MT*(n)]
#define B(m,n) &B[NBNBSIZE*(m)+NBNBSIZE*MT*(n)]
#define progress(m,n) plasma_aux.progress[(m)+NT*(n)]

/*
 * Per-core share of a tiled triangular solve on B using the NT x NT tiles of A.
 *
 * Each core walks a sequence of (k, m) pairs starting from m = my_core_id and
 * advancing m by cores_num, wrapping into the next step k whenever m reaches NT; for
 * every pair it sweeps n over the NTRHS tile columns of B.
 *   - m == k: core_DTRSM on the diagonal tile, then progress(k, n) = k.
 *   - m != k: core_DGEMM update of a B tile, then progress(m, n) = k.
 * Dependencies are enforced by spinning on the progress table: a task of step k waits
 * until its target tile holds k-1 (and, for the update, until the solved tile holds k).
 * For k == 0 that means the table must hold -1 on entry.
 *
 * uplo and transA select one of four variants. For (Lower, Trans) and (Upper, NoTrans)
 * the data tiles are addressed mirrored as NT-1-k / NT-1-m, while the progress table is
 * still indexed with the unmirrored k and m.
 *
 * side, diag and alpha are not read: the calls hard-code PlasmaLeft, PlasmaNonUnit and
 * 1.0. N and NRHS are only used to size the last (possibly partial) tile.
 */
void plasma_pDTRSM(int side, int uplo, int transA, int diag, int N, int NRHS,
                   double alpha, double *A, int NB, int NBNBSIZE, int NT, int MT,
                   double *B, int NTRHS, int cores_num, int my_core_id)
{
    int k, m, n;
    int next_k;
    int next_m;
    int next_n;

    /* Locate this core's first (k, m) pair */
    k = 0;
    m = my_core_id;
    while (m >= NT) {
        k++;
        m = m-NT+k;
    }
    n = 0;

    while (k < NT && m < NT) {
        /* Compute the task that follows the current one */
        next_n = n;
        next_m = m;
        next_k = k;

        next_n++;
        if (next_n >= NTRHS) {
            /* Finished all tile columns of B: move on to the next (k, m) pair */
            next_m += cores_num;
            while (next_m >= NT && next_k < NT) {
                next_k++;
                next_m = next_m-NT+next_k;
            }
            next_n = 0;
        }

        if (m == k)
        {
            /* Diagonal step: wait for the step k-1 update of this tile */
            while (progress(m, n) != k-1);
            /* NOTE: the brace before the "else" below closes "if (uplo == PlasmaLower)";
             * the indentation of the following lines is misleading */
            if (uplo == PlasmaLower) {
                if (transA == PlasmaNoTrans)
                    core_DTRSM(
                        PlasmaLeft, PlasmaLower,
                        PlasmaNoTrans, PlasmaNonUnit,
                        k == NT-1 ? N-k*NB : NB,
                        n == NTRHS-1 ? NRHS-n*NB : NB,
                        1.0, A(k, k), NB,
                             B(k, n), NB);
                else
                    core_DTRSM(
                        PlasmaLeft, PlasmaLower,
                        PlasmaTrans, PlasmaNonUnit,
                        k == 0 ? N-(NT-1)*NB : NB,
                        n == NTRHS-1 ? NRHS-n*NB : NB,
                        1.0, A(NT-1-k, NT-1-k), NB,
                             B(NT-1-k, n), NB);
                }
                else {
                    if (transA == PlasmaNoTrans)
                        core_DTRSM(
                            PlasmaLeft, PlasmaUpper,
                            PlasmaNoTrans, PlasmaNonUnit,
                            k == 0 ? N-(NT-1)*NB : NB,
                            n == NTRHS-1 ? NRHS-n*NB : NB,
                            1.0, A(NT-1-k, NT-1-k), NB,
                                 B(NT-1-k, n), NB);
                    else
                        core_DTRSM(
                            PlasmaLeft, PlasmaUpper,
                            PlasmaTrans, PlasmaNonUnit,
                            k == NT-1 ? N-k*NB : NB,
                            n == NTRHS-1 ? NRHS-n*NB : NB,
                            1.0, A(k, k), NB,
                                 B(k, n), NB);
                }
                /* Publish: tile (k, n) is solved for step k */
                progress(k, n) = k;
        }
        else
        {
            /* Update step: needs the solved tile of step k and this tile at step k-1 */
            while (progress(k, n) != k);
            while (progress(m, n) != k-1);
            if (uplo == PlasmaLower) {
                if (transA == PlasmaNoTrans)
                    core_DGEMM(
                        PlasmaNoTrans, PlasmaNoTrans,
                        m == NT-1 ? N-m*NB : NB,
                        n == NTRHS-1 ? NRHS-n*NB : NB,
                        NB,
                       -1.0, A(m, k), NB,
                             B(k, n), NB,
                        1.0, B(m, n), NB);
                else
                    core_DGEMM(
                        PlasmaTrans, PlasmaNoTrans,
                        m == 0 ? N-(NT-1)*NB : NB,
                        n == NTRHS-1 ? NRHS-n*NB : NB,
                        NB,
                       -1.0, A(NT-1-k, NT-1-m), NB,
                             B(NT-1-k, n), NB,
                        1.0, B(NT-1-m, n), NB);
                }
                else {
                    if (transA == PlasmaNoTrans)
                        core_DGEMM(
                            PlasmaNoTrans, PlasmaNoTrans,
                            m == 0 ? N-(NT-1)*NB : NB,
                            n == NTRHS-1 ? NRHS-n*NB : NB,
                            NB,
                           -1.0, A(NT-1-m, NT-1-k), NB,
                                 B(NT-1-k, n), NB,
                            1.0, B(NT-1-m, n), NB);
                    else
                        core_DGEMM(
                            PlasmaTrans, PlasmaNoTrans,
                            m == NT-1 ? N-m*NB : NB,
                            n == NTRHS-1 ? NRHS-n*NB : NB,
                            NB,
                           -1.0, A(k, m), NB,
                                 B(k, n), NB,
                            1.0, B(m, n), NB);
                }
                /* Publish: tile (m, n) has received the step k update */
                progress(m, n) = k;
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
}

#undef A
#undef B
#undef progress

////////////////////////////////////////////////////////////////////////////////////////////////////

/* Tile addressing for plasma_DPOTRF: tile (m,n) of A starts NBNBSIZE*(m + NT*(n))
 * elements into the array; the progress table uses the same NT leading dimension. */
#define A(m,n) &A[NBNBSIZE*(m)+NBNBSIZE*NT*(n)]
#define progress(m,n) plasma_aux.progress[(m)+NT*(n)]

/*
 * Per-core share of a tiled factorization built from core_DPOTRF, core_DSYRK,
 * core_DTRSM and core_DGEMM on the NT x NT tiles of A.
 *
 * Each core walks a sequence of (k, m) pairs starting from m = my_core_id and
 * advancing m by cores_num, wrapping into the next step k whenever m reaches NT; for
 * every pair n runs from 0 to k.
 *   - m == k, n == k: core_DPOTRF on tile (k,k); sets progress(k, k) = 1.
 *   - m == k, n != k: core_DSYRK update of tile (k,k).
 *   - m != k, n == k: core_DTRSM on tile (m,k) or (k,m); sets progress(m, k) = 1.
 *   - m != k, n != k: core_DGEMM update of tile (m,k) or (k,m).
 * A task spins until the tiles it reads are flagged 1 in the progress table or *INFO
 * becomes non-zero; the function returns as soon as *INFO is non-zero.
 *
 * When core_DPOTRF reports a non-zero *INFO, NB*k is added to it.
 * N is only used to size the last (possibly partial) tile.
 *
 * NOTE(review): the spin conditions read *INFO through a plain (non-volatile) pointer
 * that other cores may be writing - confirm this is intended.
 */
void plasma_DPOTRF(int uplo, int N, double *A, int NB, int NBNBSIZE, int NT,
                   int *INFO, int cores_num, int my_core_id)
{
    int k, m, n;
    int next_k;
    int next_m;
    int next_n;

    /* Locate this core's first (k, m) pair */
    k = 0;
    m = my_core_id;
    while (m >= NT) {
        k++;
        m = m-NT+k;
    }
    n = 0;

    while (k < NT && m < NT) {
        /* Compute the task that follows the current one */
        next_n = n;
        next_m = m;
        next_k = k;

        next_n++;
        if (next_n > next_k) {
            /* n has covered 0..k: move on to the next (k, m) pair */
            next_m += cores_num;
            while (next_m >= NT && next_k < NT) {
                next_k++;
                next_m = next_m-NT+next_k;
            }
            next_n = 0;
        }

        if (m == k) {
            if (n == k) {
                /* Factor the diagonal tile */
                if (uplo == PlasmaLower)
                    core_DPOTRF(
                        PlasmaLower,
                        k == NT-1 ? N-k*NB : NB,
                        A(k, k), NB,
                        INFO);
                else
                    core_DPOTRF(
                        PlasmaUpper,
                        k == NT-1 ? N-k*NB : NB,
                        A(k, k), NB,
                        INFO);
                  /* Shift a tile-local error index by the tile's offset NB*k */
                  if (*INFO != 0)
                      *INFO += NB*k;
                progress(k, k) = 1;
            }
            else {
                /* Update the diagonal tile with tile (k,n) / (n,k) */
                while(progress(k, n) != 1 && *INFO == 0);
                if (uplo == PlasmaLower)
                    core_DSYRK(
                         PlasmaLower, PlasmaNoTrans,
                         k == NT-1 ? N-k*NB : NB,
                         NB,
                        -1.0, A(k, n), NB,
                         1.0, A(k, k), NB);
                else
                    core_DSYRK(
                         PlasmaUpper, PlasmaTrans,
                         k == NT-1 ? N-k*NB : NB,
                         NB,
                        -1.0, A(n, k), NB,
                         1.0, A(k, k), NB);
            }
        }
        else {
            if (n == k) {
                /* Triangular solve against the factored diagonal tile */
                while(progress(k, k) != 1 && *INFO == 0);
                if (uplo == PlasmaLower)
                    core_DTRSM(
                        PlasmaRight, PlasmaLower, PlasmaTrans, PlasmaNonUnit,
                        m == NT-1 ? N-m*NB : NB,
                        NB,
                        1.0, A(k, k), NB,
                             A(m, k), NB);
                else
                    core_DTRSM(
                        PlasmaLeft, PlasmaUpper, PlasmaTrans, PlasmaNonUnit,
                        NB,
                        m == NT-1 ? N-m*NB : NB,
                        1.0, A(k, k), NB,
                             A(k, m), NB);
                progress(m, k) = 1;
            }
            else {
                /* Off-diagonal update; both source tiles must be flagged first */
                while(progress(k, n) != 1 && *INFO == 0);
                while(progress(m, n) != 1 && *INFO == 0);
                if (uplo == PlasmaLower)
                    core_DGEMM(
                        PlasmaNoTrans, PlasmaTrans,
                        m == NT-1 ? N-m*NB : NB,
                        NB,
                        NB,
                       -1.0, A(m, n), NB,
                             A(k, n), NB,
                        1.0, A(m, k), NB);
                else
                    core_DGEMM(
                        PlasmaTrans, PlasmaNoTrans,
                        NB,
                        m == NT-1 ? N-m*NB : NB,
                        NB,
                       -1.0, A(n, k), NB,
                             A(n, m), NB,
                        1.0, A(k, m), NB);
            }
        }
        /* Stop as soon as any core has reported a failure */
        if (*INFO != 0)
            return;

        n = next_n;
        m = next_m;
        k = next_k;
    }
}
#undef A
#undef progress

////////////////////////////////////////////////////////////////////////////////////////////////////

/* Map the kernel names used below to symbols with a trailing underscore
 * (presumably the Fortran-compiled entry points - confirm against the build).
 * Note that core_DSSRFB calls core_dssrfb_ directly rather than through its macro. */
#define core_dgeqrt core_dgeqrt_
#define core_dtsqrt core_dtsqrt_
#define core_dlarfb core_dlarfb_
#define core_dssrfb core_dssrfb_

/* Tile kernel: forwards to core_dgeqrt, passing the scalar arguments by address.
 * The status written by the callee is kept in a local and not reported. */
void core_DGEQRT(int M, int N, int IB, double *A, int LDA, double *T , int LDT,
                 double *TAU, double *WORK)
{
    int info;

    core_dgeqrt(&M, &N, &IB, A, &LDA, T, &LDT, TAU, WORK, &info);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/* Tile kernel: forwards to core_dtsqrt, passing the scalar arguments by address.
 * The status written by the callee is kept in a local and not reported. */
void core_DTSQRT(int M, int N, int IB, double *A1, int LDA1, double *A2,
                 int LDA2, double *T, int LDT, double *TAU, double *WORK)
{
    int info;

    core_dtsqrt(&M, &N, &IB, A1, &LDA1, A2, &LDA2, T, &LDT, TAU, WORK, &info);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/* Tile kernel: forwards to core_dlarfb. side, trans, direct and storev are translated
 * with lapack_const(); the remaining scalars are passed by address. The status written
 * by the callee is kept in a local and not reported. */
void core_DLARFB(int side, int trans, int direct, int storev, int M, int N,
                 int K, int IB, double *V, int LDV, double *T, int LDT,
                 double *C, int LDC, double *WORK, int LDWORK)
{
    int info;

    core_dlarfb(lapack_const(side), lapack_const(trans),
                lapack_const(direct), lapack_const(storev),
                &M, &N, &K, &IB,
                V, &LDV, T, &LDT, C, &LDC,
                WORK, &LDWORK, &info);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/* Tile kernel: forwards to core_dssrfb_ with the side fixed to PlasmaLeft and the
 * storage fixed to PlasmaColumnwise (both translated with lapack_const()); the
 * remaining scalars are passed by address. The status written by the callee is kept
 * in a local and not reported. */
void core_DSSRFB(int M1, int M2, int NN, int IB, int K, double *A1, int LDA1,
                 double *A2, int LDA2, double *V, int LDV, double *T, int LDT,
                 double *WORK)
{
    int info;

    core_dssrfb_(lapack_const(PlasmaLeft), lapack_const(PlasmaColumnwise),
                 &M1, &M2, &NN, &IB, &K,
                 A1, &LDA1, A2, &LDA2, V, &LDV, T, &LDT,
                 WORK, &info);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/* Tile addressing for plasma_pDGEQRF: tile (m,n) of A starts NBNBSIZE*(m + MT*(n))
 * elements into A and tile (m,n) of T starts IBNBSIZE*(m + MT*(n)) elements into T;
 * the progress table is indexed with leading dimension MT. */
#define A(m,n) &A[NBNBSIZE*(m)+NBNBSIZE*MT*(n)]
#define T(m,n) &T[IBNBSIZE*(m)+IBNBSIZE*MT*(n)]
#define progress(m,n) plasma_aux.progress[(m)+MT*(n)]

/*
 * Per-core share of a tiled factorization of the MT x NT tiles of A built from
 * core_DGEQRT, core_DTSQRT, core_DLARFB and core_DSSRFB, with results also written to
 * the tiles of T.
 *
 * Each core walks a sequence of (k, n) pairs starting from n = my_core_id and
 * advancing n by cores_num, wrapping into the next step k whenever n reaches NT; for
 * every pair m runs from k to MT-1.
 *   - n == k, m == k: core_DGEQRT on tile (k,k); sets progress(k, k) = k.
 *   - n == k, m != k: core_DTSQRT on tiles (k,k) and (m,k); sets progress(m, k) = k.
 *   - n != k, m == k: core_DLARFB applied to tile (k,n); no progress entry is written.
 *   - n != k, m != k: core_DSSRFB on tiles (k,n) and (m,n); sets progress(m, n) = k.
 * A task of step k spins until its target tile holds k-1 and, for the two "apply"
 * cases, until the tile it reads from column k holds k. For k == 0 that means the
 * table must hold -1 on entry.
 *
 * WORK and TAU are this core's buffers from plasma_aux. M and N are only used to size
 * the last (possibly partial) tile row/column. INFO is not read or written here.
 */
void plasma_pDGEQRF(int M, int N, double *A, int NB, int NBNBSIZE, int IBNBSIZE,
                    int IB, int MT, int NT, double *T, int *INFO, int cores_num,
                    int my_core_id)
{
    double *WORK = plasma_aux.WORK[my_core_id];
    double *TAU = plasma_aux.TAU[my_core_id];
    int k, m, n;
    int next_k;
    int next_m;
    int next_n;

    /* Locate this core's first (k, n) pair */
    k = 0;
    n = my_core_id;
    while (n >= NT) {
        k++;
        n = n-NT+k;
    }
    m = k;

    while (k < min(MT, NT) && n < NT) {
        /* Compute the task that follows the current one */
        next_n = n;
        next_m = m;
        next_k = k;

        next_m++;
        if (next_m == MT) {
            /* Reached the last tile row: move on to the next (k, n) pair */
            next_n += cores_num;
            while (next_n >= NT && next_k < min(MT, NT)) {
                next_k++;
                next_n = next_n-NT+next_k;
            }
            next_m = next_k;
        }

        if (n == k) {
            if (m == k) {
                /* Diagonal tile of step k */
                while(progress(k, k) != k-1);
                core_DGEQRT(
                    k == MT-1 ? M-k*NB : NB,
                    k == NT-1 ? N-k*NB : NB,
                    IB,
                    A(k, k), NB,
                    T(k, k), IB,
                    TAU, WORK);
                progress(k, k) = k;
            }
            else {
                /* Tile (m,k) below the diagonal, coupled with tile (k,k) */
                while(progress(m, k) != k-1);
                core_DTSQRT(
                    m == MT-1 ? M-m*NB : NB,
                    k == NT-1 ? N-k*NB : NB,
                    IB,
                    A(k, k), NB,
                    A(m, k), NB,
                    T(m, k), IB,
                    TAU, WORK);
                progress(m, k) = k;
            }
        }
        else {
            if (m == k) {
                /* Apply the step k result of tile (k,k) to tile (k,n) */
                while(progress(k, k) != k);
                while(progress(k, n) != k-1);
                core_DLARFB(
                    PlasmaLeft, PlasmaTrans,
                    PlasmaForward, PlasmaColumnwise,
                    k == MT-1 ? M-k*NB : NB,
                    n == NT-1 ? N-n*NB : NB,
                    NB,
                    IB,
                    A(k, k), NB,
                    T(k, k), IB,
                    A(k, n), NB,
                    WORK, NB);
            }
            else {
                /* Apply the step k result of tile (m,k) to tiles (k,n) and (m,n) */
                while(progress(m, k) != k);
                while(progress(m, n) != k-1);
                core_DSSRFB(
                    NB,
                    m == MT-1 ? M-m*NB : NB,
                    n == NT-1 ? N-n*NB : NB,
                    IB,
                    NB,
                    A(k, n), NB,
                    A(m, n), NB,
                    A(m, k), NB,
                    T(m, k), IB,
                    WORK);
                progress(m, n) = k;
            }
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
}

#undef A
#undef T
#undef progress

////////////////////////////////////////////////////////////////////////////////////////////////////

#define A(m,n) &A[NBNBSIZE*(m)+NBNBSIZE*MT*(n)]
#define B(m,n) &B[NBNBSIZE*(m)+NBNBSIZE*MT*(n)]
#define T(m,n) &T[IBNBSIZE*(m)+IBNBSIZE*MT*(n)]
#define progress(m,n) plasma_aux.progress[(m)+MT*(n)]

/*
 * Per-core worker that sweeps the tiles of B, calling core_DLARFB on the
 * diagonal tile row (m == k) and core_DSSRFB on every tile row below it
 * (m > k), using the tiles of A and T as the kernel inputs.
 *
 * Work distribution: tasks are (k, n) pairs enumerated with n running
 * fastest (0 .. NTRHS-1) inside each step k.  A core starts at task number
 * my_core_id, walks m from k to MT-1 for that pair, then jumps cores_num
 * tasks ahead.
 *
 * Synchronization: plasma_aux.progress holds one counter per tile of B.
 * A task spins until the counters it depends on reach the expected step and
 * publishes k for the tile it has just updated.  The caller is expected to
 * have reset the counters beforehand (the spin on "k-1" at k == 0 waits
 * for -1).
 *
 * M, NRHS, N     element dimensions; used only to size the trailing
 *                (possibly partial) tiles
 * NB, IB         tile size and inner block size passed to the kernels
 * NBNBSIZE       element stride between consecutive tiles of A and B
 * IBNBSIZE       element stride between consecutive tiles of T
 * MT, NTRHS, NT  tile counts matching M, NRHS and N
 * INFO           not referenced by this routine
 */
void plasma_pDORMQR(int M, int NRHS, int N, double *A, int NB, int NBNBSIZE,
                    int IBNBSIZE, int IB, int MT, int NTRHS, int NT, double *T,
                    double *B, int *INFO, int cores_num, int my_core_id)
{
    double *WORK = plasma_aux.WORK[my_core_id];
    const int steps = min(MT, NT);
    int k, m, n;

    /* Decode the starting task number into (step k, tile column n). */
    for (k = 0, n = my_core_id; n >= NTRHS; n -= NTRHS)
        k++;
    m = k;

    while (k < steps && n < NTRHS) {
        /* Extents of the current tiles; only the last tile may be short. */
        const int n_width = (n == NTRHS-1) ? NRHS - n*NB : NB;
        const int k_width = (k == NT-1) ? N - k*NB : NB;

        /* Successor task: next tile row, or this core's next (k, n) pair. */
        int k_after = k;
        int m_after = m + 1;
        int n_after = n;

        if (m_after == MT) {
            n_after += cores_num;
            while (n_after >= NTRHS && k_after < steps) {
                k_after++;
                n_after -= NTRHS;
            }
            m_after = k_after;
        }

        if (m != k) {
            /* Needs tile (k, n) finished for step k and tile (m, n)
               finished for step k-1. */
            while (progress(k, n) != k);
            while (progress(m, n) != k-1);
            core_DSSRFB(
                NB,
                (m == MT-1) ? M - m*NB : NB,
                n_width,
                IB,
                k_width,
                B(k, n), NB,
                B(m, n), NB,
                A(m, k), NB,
                T(m, k), IB,
                WORK);
            progress(m, n) = k;
        }
        else {
            /* Diagonal tile row: wait for step k-1 on tile (k, n). */
            while (progress(k, n) != k-1);
            core_DLARFB(
                PlasmaLeft, PlasmaTrans,
                PlasmaForward, PlasmaColumnwise,
                (k == MT-1) ? M - k*NB : NB,
                n_width,
                k_width,
                IB,
                A(k, k), NB,
                T(k, k), IB,
                B(k, n), NB,
                WORK, NB);
            progress(k, n) = k;
        }

        k = k_after;
        m = m_after;
        n = n_after;
    }
}

#undef A
#undef B
#undef T
#undef progress

////////////////////////////////////////////////////////////////////////////////////////////////////

#define A(m,n) &A[NBNBSIZE*(m)+NBNBSIZE*MT*(n)]
#define B(m,n) &B[NBNBSIZE*(m)+NBNBSIZE*MT*(n)]
#define T(m,n) &T[NBNBSIZE*(m)+NBNBSIZE*MT*(n)]
#define progress(m,n) plasma_aux.progress[(m)+MT*(n)]

/*
 * Per-core worker with the same task schedule as plasma_pDORMQR, except that
 * a kernel call is skipped for tiles of B lying to the right of the current
 * tile row: core_DLARFB runs only when n <= k and core_DSSRFB only when
 * n <= m.  The progress counter is published either way, so dependent tasks
 * are never left waiting on a skipped tile.
 * NOTE(review): the skip presumably relies on B starting out as an identity
 * matrix, so that the skipped tiles are still zero -- confirm with the caller.
 *
 * Work distribution: tasks are (k, n) pairs enumerated with n running
 * fastest; a core starts at task number my_core_id, walks m from k to MT-1,
 * then jumps cores_num tasks ahead.
 *
 * Synchronization: plasma_aux.progress holds one counter per tile of B; the
 * caller is expected to have reset the counters beforehand (the spin on
 * "k-1" at k == 0 waits for -1).
 *
 * M, NRHS, N     element dimensions; used only to size the trailing tiles
 * NB, IB         tile size and inner block size passed to the kernels
 * NBNBSIZE       element stride between consecutive tiles of A, B and T
 * MT, NTRHS, NT  tile counts matching M, NRHS and N
 * INFO           not referenced by this routine
 *
 * NOTE(review): T is addressed with stride NBNBSIZE and leading dimension NB
 * here, whereas plasma_pDORMQR uses IBNBSIZE and IB for the same array.
 * Verify which layout the factorization actually produces.
 */
void plasma_pDORGQR(int M, int NRHS, int N, double *A, int NB, int NBNBSIZE, int IB,
                    int MT, int NTRHS, int NT, double *T, double *B, int *INFO,
                    int cores_num, int my_core_id)
{
    double *WORK = plasma_aux.WORK[my_core_id];
    int k, m, n;
    int next_k;
    int next_m;
    int next_n;

    /* Decode the starting task number into (step k, tile column n). */
    k = 0;
    n = my_core_id;
    while (n >= NTRHS) {
        k++;
        n = n-NTRHS;
    }
    m = k;

    while (k < min(MT, NT) && n < NTRHS) {
        /* Successor task: next tile row, or this core's next (k, n) pair. */
        next_n = n;
        next_m = m;
        next_k = k;

        next_m++;
        if (next_m == MT) {
            next_n += cores_num;
            while (next_n >= NTRHS && next_k < min(MT, NT)) {
                next_k++;
                next_n = next_n-NTRHS;
            }
            next_m = next_k;
        }

        if (m == k) {
            while (progress(k, n) != k-1);
            if (n < min(k+1, NTRHS))
                core_DLARFB(
                    PlasmaLeft, PlasmaTrans,
                    PlasmaForward, PlasmaColumnwise,
                    k == MT-1 ? M-k*NB : NB,
                    n == NTRHS-1 ? NRHS-n*NB : NB,
                    k == NT-1 ? N-k*NB : NB,
                    IB,
                    A(k, k), NB,
                    T(k, k), NB,
                    B(k, n), NB,
                    WORK, NB);
            progress(k, n) = k;
        }
        else {
            while (progress(k, n) != k);
            while (progress(m, n) != k-1);
            /* The original read "(m+1, NTRHS)": without min() that is a
               comma expression equal to NTRHS, so the guard never skipped
               anything.  This mirrors the diagonal-row guard above. */
            if (n < min(m+1, NTRHS))
                core_DSSRFB(
                    NB,
                    m == MT-1 ? M-m*NB : NB,
                    n == NTRHS-1 ? NRHS-n*NB : NB,
                    IB,
                    k == NT-1 ? N-k*NB : NB,
                    B(k, n), NB,
                    B(m, n), NB,
                    A(m, k), NB,
                    T(m, k), NB,
                    WORK);
            progress(m, n) = k;
        }
        n = next_n;
        m = next_m;
        k = next_k;
    }
}

#undef A
#undef B
#undef T
#undef progress

//##################################################################################################

/*
 * plasma_lockstep(M, N, c): barrier, then core 0 stores c into the first
 * (M)*(N) entries of plasma_aux.progress, then a second barrier so no core
 * proceeds before the reset is complete.
 *
 * Expects my_core_id and cores_num to be in scope at the expansion site.
 * Arguments are parenthesized so compound expressions expand correctly, the
 * body is wrapped in do { } while (0) so the macro behaves as one statement,
 * and the loop counter has a name unlikely to collide with an argument.
 */
#define plasma_lockstep(M,N,c) do { \
    int plasma_lockstep_i_; \
    plasma_barrier(my_core_id, cores_num); \
    if (my_core_id == 0) \
        for (plasma_lockstep_i_ = 0; plasma_lockstep_i_ < (M)*(N); plasma_lockstep_i_++) \
            plasma_aux.progress[plasma_lockstep_i_] = (c); \
    plasma_barrier(my_core_id, cores_num); \
} while (0)


/*
 * Thread entry point for one worker core.
 *
 * thread_id points to an int holding this worker's core index.  Every worker
 * logs the time spent in the initial barrier as a trace event, resets the
 * progress table, runs the currently enabled parallel kernel (plasma_pDTRSM
 * with the arguments stored in plasma_args) and meets at a final barrier.
 * Core 0 alone measures the wall time between the two barriers and, when it
 * is non-zero, stores a rate in the global GFLOPS.
 *
 * The commented-out sections are alternative drivers (Cholesky solve, QR
 * factorization + Q generation) kept for reference.
 *
 * Always returns NULL.
 */
void *parallel_section(void *thread_id)
{
    int my_core_id = *((int*)thread_id);
    int cores_num = plasma_cntrl.cores_num;
    double start_time = 0.0;
    double end_time;
    double elapsed_time;
    int N = plasma_args.N;
    int NT = plasma_args.NT;
    int MT = plasma_args.MT;        // only referenced by the disabled QR driver below
    int NTRHS = plasma_args.NTRHS;
/*
    plasma_barrier(my_core_id, cores_num);
    tile_cholesky_parallel(my_core_id);
    plasma_barrier(my_core_id, cores_num);

    int i;
    if (my_core_id == 0)
        for (i = 0; i < BB*BB; i++)
            plasma.progress[i] = 0;
*/
    // Record the wait in the start-up barrier as the first trace event.
    log_events = 1;
    core_event_start(my_core_id);
    plasma_barrier(my_core_id, cores_num);
    core_event_end(my_core_id);
    core_event_log(0x010101, my_core_id);

    if (my_core_id == 0)
        start_time = get_current_time();



/*
    plasma_lockstep(NT, NT, 0);
    plasma_DPOTRF(
        plasma.uplo,
        plasma.N,
        plasma.A,
        plasma.NB,
        plasma.NBNBSIZE,
        plasma.NT,
        &plasma.INFO,
        cores_num,
        my_core_id);

    plasma_lockstep(NT, NTRHS, 0);
    plasma_pDTRSM(
        plasma.side,
        plasma.uplo,
        PlasmaTrans,
        PlasmaNonUnit,
        plasma.N,
        plasma.NRHS,
        1.0,
        plasma.A,
        plasma.NB,
        plasma.NBNBSIZE,
        plasma.NT,
        plasma.NT,
        plasma.B,
        plasma.NTRHS,
        cores_num,
        my_core_id);

    plasma_lockstep(NT, NTRHS, 0);
    plasma_pDTRSM(
        plasma.side,
        plasma.uplo,
        PlasmaNoTrans,
        PlasmaNonUnit,
        plasma.N,
        plasma.NRHS,
        1.0,
        plasma.A,
        plasma.NB,
        plasma.NBNBSIZE,
        plasma.NT,
        plasma.NT,
        plasma.B,
        plasma.NTRHS,
        cores_num,
        my_core_id);

    plasma_barrier(my_core_id, cores_num);
    printf("%2d INFO: %d\n", my_core_id, plasma.INFO);
*/

/*
    plasma_lockstep(MT, NT, -1);
    plasma_pDGEQRF(
        plasma_args.M,
        plasma_args.N,
        plasma_args.A,
        plasma_args.NB,
        plasma_args.NBNBSIZE,
        plasma_args.IB,
        plasma_args.MT,
        plasma_args.NT,
        plasma_args.T,
        &plasma_args.INFO,
        cores_num,
        my_core_id);

    plasma_lockstep(MT, NTRHS, -1);
    plasma_pDORGQR(
        plasma_args.M,
        plasma_args.NRHS,
        plasma_args.N,
        plasma_args.A,
        plasma_args.NB,
        plasma_args.NBNBSIZE,
        plasma_args.IB,
        plasma_args.MT,
        plasma_args.NTRHS,
        plasma_args.NT,
        plasma_args.T,
        plasma_args.B,
        &plasma_args.INFO,
        cores_num,
        my_core_id);
*/
    // Reset the NT x NTRHS progress entries to -1, then run the solve.
    plasma_lockstep(NT, NTRHS, -1);
    plasma_pDTRSM(
        PlasmaLeft,
        PlasmaUpper,
        PlasmaNoTrans,
        PlasmaNonUnit,
        plasma_args.N,
        plasma_args.NRHS,
        1.0,
        plasma_args.A,
        plasma_args.NB,
        plasma_args.NBNBSIZE,
        plasma_args.NT,
        plasma_args.MT,
        plasma_args.B,
        plasma_args.NTRHS,
        cores_num,
        my_core_id);

    plasma_barrier(my_core_id, cores_num);

    // Only core 0 took timestamps, so only core 0 may do the arithmetic;
    // on the other cores start_time/end_time were never assigned.
    if (my_core_id == 0) {
        end_time = get_current_time();
        elapsed_time = end_time - start_time;
        // NOTE(review): N^3/3 looks like a Cholesky operation count rather
        // than one for the triangular solve that is enabled above -- confirm.
        if (elapsed_time != 0.0)
            GFLOPS = 1.0*N*N*N/3.0 / elapsed_time / 1000000000;
    }

    return NULL;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/*
 * Write the recorded per-core events to an SVG file named
 * "trace_<unix time>.svg" in the current directory.
 *
 * Each event occupies four consecutive entries of event_log[core]:
 * lane (thread) number, start time, end time and fill color.  Times are
 * shifted by the end time of that core's first event (entry 2) and scaled
 * to SVG units; every event becomes one rectangle in its lane.
 *
 * cores_num  number of per-core logs to dump
 * NB         currently unused (only the disabled scale formula needs it)
 *
 * If the file cannot be created the error is reported with perror(); debug
 * builds then abort through assert() as before, NDEBUG builds return
 * without writing anything.
 */
void dump_trace(int cores_num, int NB)
{
    char trace_file_name[32];
    FILE *trace_file;
    int event;
    int core;
//  double scale = 200000000000.0 / (NB*NB*NB);
    double scale = 300000.0;

    (void)NB;

    snprintf(trace_file_name, sizeof trace_file_name,
             "trace_%d.svg", (int)(time(NULL)));
    trace_file = fopen(trace_file_name, "w");
    if (trace_file == NULL) {
        perror(trace_file_name);
        assert(trace_file != NULL);
        return;
    }

    fprintf(trace_file,
        "<svg width=\"200mm\" height=\"40mm\" viewBox=\"0 0 20000 4000\">\n"
        "  <g>\n");

    for (core = 0; core < cores_num; core++)
        // Require a complete 4-entry record so a partial tail is never read.
        for (event = 0; event + 3 < event_num[core]; event += 4)
        {
            int    thread = event_log[core][event+0];
            double start  = event_log[core][event+1];
            double end    = event_log[core][event+2];
            int    color  = event_log[core][event+3];

            // Time origin: end of this core's first logged event.
            start -= event_log[core][2];
            end   -= event_log[core][2];

            fprintf(trace_file,
                "    "
                "<rect x=\"%.2lf\" y=\"%.0lf\" width=\"%.2lf\" height=\"%.0lf\" "
                "fill=\"#%06x\" stroke=\"#000000\" stroke-width=\"1\"/>\n",
                start * scale,
                thread * 100.0,
                (end - start) * scale,
                90.0,
                (unsigned int)color);
        }

    fprintf(trace_file,
        "  </g>\n"
        "</svg>\n");

    // fclose flushes buffered output, so a failure here means a short file.
    if (fclose(trace_file) != 0)
        perror(trace_file_name);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/*
 * Print a tile-by-tile character map comparing the magnitudes of two
 * column-major matrices with leading dimension M.
 *
 *   '.'  |A(i,j)| and |B(i,j)| are equal, or differ by a relative amount
 *        (measured against the smaller magnitude) below 1e-11
 *   '#'  the magnitudes differ
 *   '+'  padding: position lies outside the M x N matrix
 *
 * Signs are ignored because both entries go through fabs() first.
 *
 * A, B      matrices to compare; at least M*N elements each
 * NB        tile size (rows and columns per tile)
 * BBM, BBN  number of tile rows / tile columns to print
 * M, N      matrix dimensions in elements
 *
 * Each tile column is followed by "  |" and tile rows are separated by a
 * rule of '=' spanning the printed row width.
 */
void diff_matrix(double *A, double *B, int NB, int BBM, int BBN, int M, int N)
{
    int X, Y, x, y, i;

    printf("\n");
    for (Y = 0; Y < BBM; Y++) {
      for (y = 0; y < NB; y++) {
        int row = Y*NB + y;

        for (X = 0; X < BBN; X++) {
          for (x = 0; x < NB; x++) {
            int col = X*NB + x;

            if (row < M && col < N) {
              double a = fabs(A[row + col*M]);
              double b = fabs(B[row + col*M]);
              int same;

              if (a == b) {
                // Covers 0 vs 0, which used to evaluate 0/0 and print '#'.
                same = 1;
              }
              else {
                double smaller = a < b ? a : b;
                // A zero magnitude against a non-zero one always differs.
                same = smaller > 0.0 &&
                       fabs(a - b) / smaller < 0.00000000001;
              }

              printf("%c", same ? '.' : '#');
//            if (x == 3) x = NB-5;
//            if (x == 7) x = NB-1;
            }
            else
            {
              printf("%c", '+');
            }
          }
          printf("  |");
        }
        printf("\n");
//      if (y == 3) y = NB-5;
//      if (y == 7) y = NB-1;
      }
      // One printed row is BBN groups of NB cells plus the 3-char "  |".
      if (Y < BBM-1)
        for (i = 0; i < BBN*(NB+3); i++) printf("=");
      printf("\n");
    }
    printf("\n");
}

////////////////////////////////////////////////////////////////////////////////////////////////////























