//==============================================================================================
//
//  Innovative Computing Laboratory - Computer Science Department - University of Tennessee
//  Written by Jakub Kurzak
//
//==============================================================================================

#include <cbe_mfc.h>
#include <spu_mfcio.h>

#include "../cbe/cbe_blas.h"
#include "spu_blas.h"

//----------------------------------------------------------------------------------------------

// Cached copies of spu_global_params.NB, .DNB and .spus_num, assigned in
// spu_spu_init(). NOTE(review): NB/DNB look like single/double-precision
// block sizes - confirm against GlobalParams.
int NB;
int DNB;
int spus_num;
// Value read from the inbound mailbox at the start of spu_spu_init(); used in
// main() to index spu_global_params.spu_event_log[].
int my_spu_id;

// Not written anywhere in this part of the file; presumably filled by the
// individual routines from ppu_call_args_p - confirm.
CallArgs spu_call_args __attribute__ ((aligned (128)));
// Filled by the mfc_get in spu_spu_init() from ppu_global_params_p.
GlobalParams spu_global_params __attribute__ ((aligned (128)));

// Argument of spu_spu_init(): source address of the GlobalParams transfer.
unsigned int ppu_global_params_p;
// Copied from spu_global_params.ppu_call_args_p in spu_spu_init().
unsigned int ppu_call_args_p;

//----------------------------------------------------------

// Working storage: 64*64*4*8 = 131072 bytes, aligned to 64*64*4 = 16384 bytes.
// The buffer*/dbuffer* pointers defined below are fixed offsets into this pool.
unsigned char mem_pool[64*64*4*8] __attribute__ ((aligned(64*64*4)));

// 4096 bytes, 16-byte aligned. spu_spu_init() DMAs byte 0 to this same
// local-store offset on every SPU listed in spu_global_params.local_store[].
// NOTE(review): volatile presumably because other SPUs update it by DMA - confirm.
volatile unsigned char spu_progress[4096] __attribute__ ((aligned (16)));

// 1024 ints, 128-byte aligned. main() DMAs 4096 bytes of it to
// spu_global_params.spu_event_log[my_spu_id] just before returning.
int spu_event_log[1024] __attribute__ ((aligned (128)));
// Starts at 0; not otherwise touched in this part of the file
// (presumably the number of entries recorded in spu_event_log - confirm).
int spu_event_num = 0;

//----------------------------------------------------------

// Single-precision view of mem_pool: eight buffers spaced 16384 bytes apart,
// covering the whole 131072-byte pool. The dbuffer* pointers start from the
// same pool base, so the two views occupy the same storage.
float *buffer1 = (float*)&mem_pool[0*16384];
float *buffer2 = (float*)&mem_pool[1*16384];
float *buffer3 = (float*)&mem_pool[2*16384];
float *buffer4 = (float*)&mem_pool[3*16384];
float *buffer5 = (float*)&mem_pool[4*16384];
float *buffer6 = (float*)&mem_pool[5*16384];
float *buffer7 = (float*)&mem_pool[6*16384];
float *buffer8 = (float*)&mem_pool[7*16384];

// The same eight addresses in indexable form: buffer[k] equals buffer<k+1>.
float *buffer[BUFFERS] =
{
    (float*)&mem_pool[0*16384],
    (float*)&mem_pool[1*16384],
    (float*)&mem_pool[2*16384],
    (float*)&mem_pool[3*16384],
    (float*)&mem_pool[4*16384],
    (float*)&mem_pool[5*16384],
    (float*)&mem_pool[6*16384],
    (float*)&mem_pool[7*16384]
};

//----------------------------------------------------------

// Double-precision view of mem_pool: sixteen buffers spaced 8192 bytes apart,
// covering the whole 131072-byte pool. The buffer* (float) pointers start from
// the same pool base, so the two views occupy the same storage.
double *dbuffer1  = (double*)&mem_pool[ 0*8192];
double *dbuffer2  = (double*)&mem_pool[ 1*8192];
double *dbuffer3  = (double*)&mem_pool[ 2*8192];
double *dbuffer4  = (double*)&mem_pool[ 3*8192];
double *dbuffer5  = (double*)&mem_pool[ 4*8192];
double *dbuffer6  = (double*)&mem_pool[ 5*8192];
double *dbuffer7  = (double*)&mem_pool[ 6*8192];
double *dbuffer8  = (double*)&mem_pool[ 7*8192];
double *dbuffer9  = (double*)&mem_pool[ 8*8192];
double *dbuffer10 = (double*)&mem_pool[ 9*8192];
double *dbuffer11 = (double*)&mem_pool[10*8192];
double *dbuffer12 = (double*)&mem_pool[11*8192];
double *dbuffer13 = (double*)&mem_pool[12*8192];
double *dbuffer14 = (double*)&mem_pool[13*8192];
double *dbuffer15 = (double*)&mem_pool[14*8192];
double *dbuffer16 = (double*)&mem_pool[15*8192];

// The same sixteen addresses in indexable form: dbuffer[k] equals dbuffer<k+1>.
double *dbuffer[DBUFFERS] =
{
    (double*)&mem_pool[ 0*8192],
    (double*)&mem_pool[ 1*8192],
    (double*)&mem_pool[ 2*8192],
    (double*)&mem_pool[ 3*8192],
    (double*)&mem_pool[ 4*8192],
    (double*)&mem_pool[ 5*8192],
    (double*)&mem_pool[ 6*8192],
    (double*)&mem_pool[ 7*8192],
    (double*)&mem_pool[ 8*8192],
    (double*)&mem_pool[ 9*8192],
    (double*)&mem_pool[10*8192],
    (double*)&mem_pool[11*8192],
    (double*)&mem_pool[12*8192],
    (double*)&mem_pool[13*8192],
    (double*)&mem_pool[14*8192],
    (double*)&mem_pool[15*8192]
};

//----------------------------------------------------------------------------------------------

// One-time SPU start-up, called from main() before the command loop.
//
// ppu_glob_par_p - address of the PPU-side GlobalParams structure; saved in
//                  ppu_global_params_p and used as the source of the fetch below.
//
// In order: reads this SPU's id from the inbound mailbox, fetches GlobalParams
// into spu_global_params, caches selected fields in globals, writes
// spu_progress[0] into every SPU's local store, then runs the
// barrier / decrementer / barrier sequence.
void spu_spu_init(unsigned int ppu_glob_par_p)
{
    unsigned int progress_ls;
    int spu;


    my_spu_id = spu_read_in_mbox();
    ppu_global_params_p = ppu_glob_par_p;

    // Fetch the global parameters on tag 0 and wait for the transfer to finish
    // before any field is read.
    mfc_get(&spu_global_params, ppu_global_params_p, sizeof(GlobalParams), 0, 0, 0);
    mfc_write_tag_mask(0x01 << 0);
    mfc_read_tag_status_all();

    ppu_call_args_p = spu_global_params.ppu_call_args_p;
    spus_num = spu_global_params.spus_num;
    DNB = spu_global_params.DNB;
    NB = spu_global_params.NB;

    //----------------------------------------------------------

    // Local-store address of the first progress byte; added to each entry of
    // local_store[] to address the same byte on that SPU.
    progress_ls = (unsigned int)(&spu_progress[0]);

    for (spu = 0; spu < spus_num; spu++)
    {
        // One-byte put on tag 0, completed before the next one is issued.
        mfc_put(
            (void*)&spu_progress[0],
            spu_global_params.local_store[spu] + progress_ls,
            sizeof(unsigned char), 0, 0, 0);
        mfc_write_tag_mask(0x01 << 0);
        mfc_read_tag_status_all();
    }

    //----------------------------------------------------------

    // NOTE(review): spu_barrier_() (trailing underscore) and spu_barrier() are
    // both defined outside this file - confirm they are intentionally distinct.
    spu_barrier_();
    // Load the decrementer with -1 between the two barriers (presumably so all
    // SPUs start timing from roughly the same instant - confirm).
    spu_write_decrementer(-1);
    spu_barrier();
}

//----------------------------------------------------------------------------------------------

// SPU entry point: initialise once, then execute commands read from the
// inbound mailbox until END is received, and finally ship the event log out.
//
// speid - not used by this function.
// argp  - argp.ui[1] is handed to spu_spu_init() as the address of the
//         PPU-side global parameters.
// envp  - not used by this function.
//
// Returns 0.
int main(int speid, addr64 argp, addr64 envp)
{
    Command command;


    spu_spu_init(argp.ui[1]);

    do
    {
        command = spu_read_in_mbox();

        switch (command)
        {
            case STRSM_NOTRANS: spu_strsm_no_trans();   break;
            case STRSM_TRANS:   spu_strsm_trans();      break;
            case SPOTRF:        spu_spotrf();           break;
            case DGEMM:         spu_dgemm();            break;
            case CONVERT_D2S:   spu_convert_d2s();      break;

            // The layout-conversion routines return a value that this loop
            // never uses; discard it explicitly.
            case LAPACK2BLOCKED:        (void)spu_lapack2blocked();         break;
            case BLOCKED2LAPACK:        (void)spu_blocked2lapack();         break;
            case LAPACK2BLOCKED_DOUBLE: (void)spu_lapack2blocked_double();  break;
            case BLOCKED2LAPACK_DOUBLE: (void)spu_blocked2lapack_double();  break;

            // END and any unrecognised command do no work here but still go
            // through the status read, barrier and acknowledgement below.
            default: break;
        }

        // Tag-status read with update mode 2 (presumably MFC_TAG_UPDATE_ALL -
        // confirm), using whatever tag mask was written last.
        spu_mfcstat(2);
        spu_barrier();

        spu_ppu_ack(DONE, 0);
    }
    while (command != END);

    //----------------------------------------------------------

    // Write the whole event log to this SPU's slot on tag 0 and wait for the
    // transfer before exiting. sizeof keeps the length tied to the array
    // declaration (1024 ints) instead of a separate literal.
    mfc_put((void*)&spu_event_log[0], spu_global_params.spu_event_log[my_spu_id],
            sizeof(spu_event_log), 0, 0, 0);
    mfc_write_tag_mask(0x01 << 0);
    mfc_read_tag_status_all();

    //----------------------------------------------------------

    return (0);
}

//----------------------------------------------------------------------------------------------
