//==============================================================================================
//
//  Innovative Computing Laboratory - Computer Science Department - University of Tennessee
//  Written by Jakub Kurzak
//
//==============================================================================================

#include <cbe_mfc.h>
#include <spu_mfcio.h>

#include "spu_blas.h"
#include "spu_comm.h"

//----------------------------------------------------------------------------------------------

// Globals defined in other translation units and used by the transfer routines below.

// Edge length of a single precision tile: the blocked routines move NB*NB floats and the
// list routines build NB list elements.
extern int NB;
// Edge length of a double precision tile (same role as NB for the *_double routines).
extern int DNB;
// Not referenced in this file; presumably the number of SPUs in use - TODO confirm.
extern int spus_num;
// Index of this SPU; selects this SPU's entry in spu_global_params.ppu_spu_ack_p.
extern int my_spu_id;
// Local store destination of the DMA issued by spu_recv_call_args().
extern CallArgs spu_call_args __attribute__ ((aligned (128)));

// Holds ppu_spu_ack_p[], the per-SPU destination addresses used by spu_ppu_ack().
extern GlobalParams spu_global_params __attribute__ ((aligned (128)));
// Not referenced in this file; presumably the main memory address of the global
// parameters - TODO confirm.
extern unsigned int ppu_global_params_p;
// Main memory (effective) address from which spu_recv_call_args() fetches the call arguments.
extern unsigned int ppu_call_args_p;

//----------------------------------------------------------------------------------------------

// Pool of DMA lists used round-robin by the list based tile transfers in this file.
// Each list holds up to 64 elements.  The lists are filled through
// 'vector unsigned int *' (quadword) stores, so the pool must be aligned to 16 bytes,
// not just the 8 bytes a list element itself needs.
static mfc_list_element_t  dma_list_pool[DMA_LISTS][64] __attribute__ ((aligned (16)));
// List selected by the most recent list based transfer.
static mfc_list_element_t *dma_list;
// Index of the next list to hand out; wrapped with '& (DMA_LISTS - 1)' by its users,
// which assumes DMA_LISTS is a power of two.
static int dma_list_cnt = 0;

//----------------------------------------------------------------------------------------------

// Fetches the call arguments from main memory (ppu_call_args_p) into spu_call_args
// and blocks until that transfer has completed.  Tag group 0 is used for the transfer.
void spu_recv_call_args()
{
    const int tag = 0;

    // Start the transfer ...
    mfc_get(&spu_call_args, ppu_call_args_p, sizeof(CallArgs), tag, 0, 0);

    // ... and wait until everything in the tag group is done.
    mfc_write_tag_mask(0x01 << tag);
    mfc_read_tag_status_all();
}

//----------------------------------------------------------------------------------------------

// Blocks until all outstanding DMA commands in tag group 'tag' have completed.
//
//   tag - tag group to wait for; used as a bit index into the tag mask.
void spu_wait_tag(int tag)
{
    // Unsigned shift: shifting a signed 1 into bit 31 (tag == 31) is undefined in C.
    mfc_write_tag_mask(1u << tag);
    mfc_read_tag_status_all();
}

//----------------------------------------------------------------------------------------------

// Writes 'response' to this SPU's acknowledgement slot in main memory
// (spu_global_params.ppu_spu_ack_p[my_spu_id]) and waits for the write to complete.
//
//   response - value to send; sizeof(int) bytes of it are transferred.
//   tag      - tag group used for the transfer and for the completion wait.
//
// The barrier form of put (mfc_putb) is used, and the function does not return before
// the transfer is done, so passing the address of the by-value parameter is safe.
// NOTE(review): small DMA transfers have alignment requirements on both addresses;
// confirm that the placement of 'response' and of the ack slot satisfies them.
void spu_ppu_ack(Response response, int tag)
{
    mfc_putb(&response, spu_global_params.ppu_spu_ack_p[my_spu_id], sizeof(int), tag, 0, 0);

    // Unsigned shift: shifting a signed 1 into bit 31 (tag == 31) is undefined in C.
    mfc_write_tag_mask(1u << tag);
    mfc_read_tag_status_all();
}

//----------------------------------------------------------------------------------------------

void spu_recv_tile(float *spu_block, unsigned int ppu_block, int LDA, int tag)
{
    unsigned int i, increment;

    vector unsigned int *vec_dma_list;
    vector unsigned int *vec_ptr;

    vector unsigned int addend;
    vector unsigned int vec0, vec1, vec2, vec3;
    vector unsigned int vec4, vec5, vec6, vec7;


    dma_list = dma_list_pool[dma_list_cnt];
    dma_list_cnt ++;
    dma_list_cnt &= DMA_LISTS - 1;
    vec_dma_list = (vector unsigned int *) dma_list;

    increment = LDA * sizeof(float);

    vec0 = spu_insert(ppu_block, (vector unsigned int) (256, 0, 256, 0), 1);
    vec0 = spu_insert(ppu_block + increment, vec0, 3);

    addend = (vector unsigned int) (0, 0, 0, 0);
    addend = spu_insert(increment, addend, 1);
    addend = spu_insert(increment, addend, 3);

    addend = spu_add(addend, addend);

    vec1 = spu_add(vec0, addend);
    vec2 = spu_add(vec1, addend);
    vec3 = spu_add(vec2, addend);
    vec4 = spu_add(vec3, addend);
    vec5 = spu_add(vec4, addend);
    vec6 = spu_add(vec5, addend);
    vec7 = spu_add(vec6, addend);

    addend = spu_add(addend, addend);
    addend = spu_add(addend, addend);
    addend = spu_add(addend, addend);           

    vec_ptr = vec_dma_list;
    for (i = 0; i < NB/2; i+=8)
    {
        vec_ptr[0] = vec0;
        vec_ptr[1] = vec1;
        vec_ptr[2] = vec2;
        vec_ptr[3] = vec3;
        vec_ptr[4] = vec4;
        vec_ptr[5] = vec5;
        vec_ptr[6] = vec6;
        vec_ptr[7] = vec7;

        vec_ptr += 8;
        
        vec0 = spu_add(vec0, addend);
        vec1 = spu_add(vec1, addend);
        vec2 = spu_add(vec2, addend);
        vec3 = spu_add(vec3, addend);
        vec4 = spu_add(vec4, addend);
        vec5 = spu_add(vec5, addend);
        vec6 = spu_add(vec6, addend);
        vec7 = spu_add(vec7, addend);
    }

    mfc_getl(spu_block, 0, (unsigned int)&dma_list[0], NB*sizeof(mfc_list_element_t), tag, 0, 0);
}

//----------------------------------------------------------------------------------------------

void spu_send_tile(float *spu_block, unsigned int ppu_block, int LDA, int tag)
{
    unsigned int i, increment;

    vector unsigned int *vec_dma_list;
    vector unsigned int *vec_ptr;

    vector unsigned int addend;
    vector unsigned int vec0, vec1, vec2, vec3;
    vector unsigned int vec4, vec5, vec6, vec7;


    dma_list = dma_list_pool[dma_list_cnt];
    dma_list_cnt ++;
    dma_list_cnt &= DMA_LISTS - 1;
    vec_dma_list = (vector unsigned int *) dma_list;

    increment = LDA * sizeof(float);

    vec0 = spu_insert(ppu_block, (vector unsigned int) (256, 0, 256, 0), 1);
    vec0 = spu_insert(ppu_block + increment, vec0, 3);

    addend = (vector unsigned int) (0, 0, 0, 0);
    addend = spu_insert(increment, addend, 1);
    addend = spu_insert(increment, addend, 3);

    addend = spu_add(addend, addend);

    vec1 = spu_add(vec0, addend);
    vec2 = spu_add(vec1, addend);
    vec3 = spu_add(vec2, addend);
    vec4 = spu_add(vec3, addend);
    vec5 = spu_add(vec4, addend);
    vec6 = spu_add(vec5, addend);
    vec7 = spu_add(vec6, addend);

    addend = spu_add(addend, addend);
    addend = spu_add(addend, addend);
    addend = spu_add(addend, addend);           

    vec_ptr = vec_dma_list;
    for (i = 0; i < NB/2; i+=8)
    {
        vec_ptr[0] = vec0;
        vec_ptr[1] = vec1;
        vec_ptr[2] = vec2;
        vec_ptr[3] = vec3;
        vec_ptr[4] = vec4;
        vec_ptr[5] = vec5;
        vec_ptr[6] = vec6;
        vec_ptr[7] = vec7;

        vec_ptr += 8;
        
        vec0 = spu_add(vec0, addend);
        vec1 = spu_add(vec1, addend);
        vec2 = spu_add(vec2, addend);
        vec3 = spu_add(vec3, addend);
        vec4 = spu_add(vec4, addend);
        vec5 = spu_add(vec5, addend);
        vec6 = spu_add(vec6, addend);
        vec7 = spu_add(vec7, addend);
    }

    mfc_putl(spu_block, 0, (unsigned int)&dma_list[0], NB*sizeof(mfc_list_element_t), tag, 0, 0);
}

//==============================================================================================

// Largest amount of data a single DMA command may move, in bytes and in floats.
#define MAX_DMA_SIZE_BYTES  16384
#define MAX_DMA_SIZE_FLOATS  4096
// Starts the transfer of a contiguous NB x NB single precision tile from main memory
// into local store.  The function only enqueues commands; it does not wait for them.
//
//   spu_block - local store destination.
//   ppu_block - main memory address of the tile.
//   tag       - tag group for all commands issued.
//
// The tile is split into pieces of at most MAX_DMA_SIZE_BYTES, as many as needed, so
// no single command exceeds the limit even when the tile is larger than two pieces.
void spu_recv_tile_blocked(float *spu_block, unsigned int ppu_block, int tag)
{
    int remaining = NB*NB*sizeof(float);

    for (;;)
    {
        int chunk = remaining <= MAX_DMA_SIZE_BYTES ? remaining : MAX_DMA_SIZE_BYTES;

        mfc_get(spu_block, ppu_block, chunk, tag, 0, 0);
        remaining -= chunk;
        if (remaining <= 0)
            break;

        // More data left, so the piece just issued was a full one.
        spu_block += MAX_DMA_SIZE_FLOATS;
        ppu_block += MAX_DMA_SIZE_BYTES;
    }
}

//----------------------------------------------------------------------------------------------

// Same as spu_recv_tile_blocked(), but every command is issued in its fenced form
// (mfc_getf).  The function only enqueues commands; it does not wait for them.
//
//   spu_block - local store destination.
//   ppu_block - main memory address of the tile.
//   tag       - tag group for all commands issued.
//
// The tile (NB*NB floats) is split into pieces of at most MAX_DMA_SIZE_BYTES, as many
// as needed, so no single command exceeds the limit.
void spu_recv_tile_blocked_fence(float *spu_block, unsigned int ppu_block, int tag)
{
    int remaining = NB*NB*sizeof(float);

    for (;;)
    {
        int chunk = remaining <= MAX_DMA_SIZE_BYTES ? remaining : MAX_DMA_SIZE_BYTES;

        mfc_getf(spu_block, ppu_block, chunk, tag, 0, 0);
        remaining -= chunk;
        if (remaining <= 0)
            break;

        // More data left, so the piece just issued was a full one.
        spu_block += MAX_DMA_SIZE_FLOATS;
        ppu_block += MAX_DMA_SIZE_BYTES;
    }
}

//----------------------------------------------------------------------------------------------

// Starts the transfer of a contiguous NB x NB single precision tile from local store
// to main memory.  The function only enqueues commands; it does not wait for them.
//
//   spu_block - local store source.
//   ppu_block - main memory address of the tile.
//   tag       - tag group for all commands issued.
//
// The tile is split into pieces of at most MAX_DMA_SIZE_BYTES, as many as needed, so
// no single command exceeds the limit even when the tile is larger than two pieces.
void spu_send_tile_blocked(float *spu_block, unsigned int ppu_block, int tag)
{
    int remaining = NB*NB*sizeof(float);

    for (;;)
    {
        int chunk = remaining <= MAX_DMA_SIZE_BYTES ? remaining : MAX_DMA_SIZE_BYTES;

        mfc_put(spu_block, ppu_block, chunk, tag, 0, 0);
        remaining -= chunk;
        if (remaining <= 0)
            break;

        // More data left, so the piece just issued was a full one.
        spu_block += MAX_DMA_SIZE_FLOATS;
        ppu_block += MAX_DMA_SIZE_BYTES;
    }
}
#undef MAX_DMA_SIZE_BYTES
#undef MAX_DMA_SIZE_FLOATS

//==============================================================================================

void spu_recv_tile_double(double *spu_block, unsigned int ppu_block, int LDA, int tag)
{
    unsigned int i, increment;

    vector unsigned int *vec_dma_list;
    vector unsigned int *vec_ptr;

    vector unsigned int addend;
    vector unsigned int vec0, vec1, vec2, vec3;
    vector unsigned int vec4, vec5, vec6, vec7;


    dma_list = dma_list_pool[dma_list_cnt];
    dma_list_cnt ++;
    dma_list_cnt &= DMA_LISTS - 1;
    vec_dma_list = (vector unsigned int *) dma_list;

    increment = LDA * sizeof(double);

    vec0 = spu_insert(ppu_block, (vector unsigned int) (256, 0, 256, 0), 1);
    vec0 = spu_insert(ppu_block + increment, vec0, 3);

    addend = (vector unsigned int) (0, 0, 0, 0);
    addend = spu_insert(increment, addend, 1);
    addend = spu_insert(increment, addend, 3);

    addend = spu_add(addend, addend);

    vec1 = spu_add(vec0, addend);
    vec2 = spu_add(vec1, addend);
    vec3 = spu_add(vec2, addend);
    vec4 = spu_add(vec3, addend);
    vec5 = spu_add(vec4, addend);
    vec6 = spu_add(vec5, addend);
    vec7 = spu_add(vec6, addend);

    addend = spu_add(addend, addend);
    addend = spu_add(addend, addend);
    addend = spu_add(addend, addend);           

    vec_ptr = vec_dma_list;
    for (i = 0; i < DNB/2; i+=8)
    {
        vec_ptr[0] = vec0;
        vec_ptr[1] = vec1;
        vec_ptr[2] = vec2;
        vec_ptr[3] = vec3;
        vec_ptr[4] = vec4;
        vec_ptr[5] = vec5;
        vec_ptr[6] = vec6;
        vec_ptr[7] = vec7;

        vec_ptr += 8;
        
        vec0 = spu_add(vec0, addend);
        vec1 = spu_add(vec1, addend);
        vec2 = spu_add(vec2, addend);
        vec3 = spu_add(vec3, addend);
        vec4 = spu_add(vec4, addend);
        vec5 = spu_add(vec5, addend);
        vec6 = spu_add(vec6, addend);
        vec7 = spu_add(vec7, addend);
    }

    mfc_getl(spu_block, 0, (unsigned int)&dma_list[0], DNB*sizeof(mfc_list_element_t), tag, 0, 0);
}

//----------------------------------------------------------------------------------------------

void spu_send_tile_double(double *spu_block, unsigned int ppu_block, int LDA, int tag)
{
    unsigned int i, increment;

    vector unsigned int *vec_dma_list;
    vector unsigned int *vec_ptr;

    vector unsigned int addend;
    vector unsigned int vec0, vec1, vec2, vec3;
    vector unsigned int vec4, vec5, vec6, vec7;


    dma_list = dma_list_pool[dma_list_cnt];
    dma_list_cnt ++;
    dma_list_cnt &= DMA_LISTS - 1;
    vec_dma_list = (vector unsigned int *) dma_list;

    increment = LDA * sizeof(double);

    vec0 = spu_insert(ppu_block, (vector unsigned int) (256, 0, 256, 0), 1);
    vec0 = spu_insert(ppu_block + increment, vec0, 3);

    addend = (vector unsigned int) (0, 0, 0, 0);
    addend = spu_insert(increment, addend, 1);
    addend = spu_insert(increment, addend, 3);

    addend = spu_add(addend, addend);

    vec1 = spu_add(vec0, addend);
    vec2 = spu_add(vec1, addend);
    vec3 = spu_add(vec2, addend);
    vec4 = spu_add(vec3, addend);
    vec5 = spu_add(vec4, addend);
    vec6 = spu_add(vec5, addend);
    vec7 = spu_add(vec6, addend);

    addend = spu_add(addend, addend);
    addend = spu_add(addend, addend);
    addend = spu_add(addend, addend);           

    vec_ptr = vec_dma_list;
    for (i = 0; i < DNB/2; i+=8)
    {
        vec_ptr[0] = vec0;
        vec_ptr[1] = vec1;
        vec_ptr[2] = vec2;
        vec_ptr[3] = vec3;
        vec_ptr[4] = vec4;
        vec_ptr[5] = vec5;
        vec_ptr[6] = vec6;
        vec_ptr[7] = vec7;

        vec_ptr += 8;
        
        vec0 = spu_add(vec0, addend);
        vec1 = spu_add(vec1, addend);
        vec2 = spu_add(vec2, addend);
        vec3 = spu_add(vec3, addend);
        vec4 = spu_add(vec4, addend);
        vec5 = spu_add(vec5, addend);
        vec6 = spu_add(vec6, addend);
        vec7 = spu_add(vec7, addend);
    }

    mfc_putl(spu_block, 0, (unsigned int)&dma_list[0], DNB*sizeof(mfc_list_element_t), tag, 0, 0);
}

//----------------------------------------------------------------------------------------------

// Starts the transfer of a contiguous DNB x DNB double precision tile from main memory
// into local store with a single DMA command.  Does not wait for completion.
//
//   spu_block - local store destination.
//   ppu_block - main memory address of the tile.
//   tag       - tag group for the command.
void spu_recv_tile_blocked_double(double *spu_block, unsigned int ppu_block, int tag)
{
    const unsigned int tile_bytes = DNB*DNB*sizeof(double);

    mfc_get(spu_block, ppu_block, tile_bytes, tag, 0, 0);
}

//----------------------------------------------------------------------------------------------

// Starts the transfer of a contiguous DNB x DNB double precision tile from local store
// to main memory with a single DMA command.  Does not wait for completion.
//
//   spu_block - local store source.
//   ppu_block - main memory address of the tile.
//   tag       - tag group for the command.
void spu_send_tile_blocked_double(double *spu_block, unsigned int ppu_block, int tag)
{
    const unsigned int tile_bytes = DNB*DNB*sizeof(double);

    mfc_put(spu_block, ppu_block, tile_bytes, tag, 0, 0);
}

//----------------------------------------------------------------------------------------------
