//==============================================================================================
//
//  Innovative Computing Laboratory - Computer Science Department - University of Tennessee
//  Written by Jakub Kurzak
//
//==============================================================================================

#include <cbe_mfc.h>
#include <spu_mfcio.h>
#include <vec_literal.h>
#include <spu_intrinsics.h>

#include "spu_blas.h"
#include "spu_convert.h"

//----------------------------------------------------------------------------------------------

extern int NB;
extern int spus_num;
extern int my_spu_id;
extern CallArgs spu_call_args __attribute__ ((aligned (128)));

extern float *buffer3;
extern float *buffer4;

extern double  *dbuffer1;
extern double  *dbuffer3;

//----------------------------------------------------------------------------------------------

/*
 * Convert this SPU's share of a double precision array in main memory to
 * single precision.
 *
 * The call arguments (read via spu_recv_call_args() into spu_call_args.convert)
 * supply the main-memory addresses of the source (D) and destination (S)
 * arrays and the dimensions M and N.  The data is processed in tiles of
 * 2048 elements; tiles are dealt out round-robin, so this SPU handles tiles
 * my_spu_id, my_spu_id + spus_num, my_spu_id + 2*spus_num, ...
 *
 * Two sets of local buffers ("front" and "back"), each with its own DMA tag,
 * are used so that the transfer of one tile overlaps the conversion of
 * another.  Elements past num_tiles*2048 (the remainder of the integer
 * division) are not touched by this function.
 *
 * An SPU that has no tile assigned returns right after reading the call
 * arguments, without issuing DMA or running the conversion kernel.
 */
void spu_convert_d2s()
{
    double *spuD_front;
    double *spuD_back;

    float *spuS_front;
    float *spuS_back;

    unsigned int tag_front, tag_back;
    unsigned int ppuD, ppuS;

    int tile, next_tile, prev_tile;
    int num_tiles;
    int M, N;


    spu_recv_call_args();


    ppuD = spu_call_args.convert.D;
    ppuS = spu_call_args.convert.S;

    M = spu_call_args.convert.M;
    N = spu_call_args.convert.N;

    /* Integer division: a trailing partial tile is ignored. */
    num_tiles = M*N / 2048;

    spuD_front = dbuffer1;
    spuD_back  = dbuffer3;

    spuS_front = buffer3;
    spuS_back  = buffer4;

    tag_front = 1;
    tag_back  = 2;


    /* Start a DMA get of one tile of doubles from main memory into spuD. */
    #define receive_tile(tile, spuD, tag)                       \
    do {                                                        \
        unsigned int D = ppuD + (tile) * 2048*sizeof(double);   \
        mfc_get(spuD, D, 2048*sizeof(double), tag, 0, 0);       \
    } while (0)

    /* Start a DMA put of one tile of floats from spuS to main memory. */
    #define send_tile(tile, spuS, tag)                          \
    do {                                                        \
        unsigned int S = ppuS + (tile) * 2048*sizeof(float);    \
        mfc_put(spuS, S, 2048*sizeof(float), tag, 0, 0);        \
    } while (0)

    /* Exchange the roles (buffers and tags) of the front and back sets. */
    #define swap_buffers()                                      \
    do {                                                        \
        float *s; double *d; unsigned int t;                    \
        s = spuS_front; spuS_front = spuS_back; spuS_back = s;  \
        d = spuD_front; spuD_front = spuD_back; spuD_back = d;  \
        t = tag_front; tag_front = tag_back; tag_back = t;      \
    } while (0)


    /* No tile for this SPU: nothing was fetched, so there is nothing to
       convert or send.  Leaving here avoids converting an unfilled buffer
       and logging a conversion event for work that never happened. */
    if (my_spu_id >= num_tiles)
        return;

    /* Fetch the first tile into the front buffer */
    receive_tile(my_spu_id, spuD_front, tag_front);

    /* Fetch the second tile, if there is one, into the back buffer */
    if (my_spu_id + spus_num < num_tiles)
        receive_tile(my_spu_id + spus_num, spuD_back, tag_back);

    /* Convert the first tile once its transfer has completed */
    spu_wait_tag(tag_front);
    spu_convert_d2s_tile(spuD_front, spuS_front);

    /* The converted tile is now in the back set */
    swap_buffers();

    /* Pipelined loop: runs while a further tile (tile + spus_num) exists.
       On entry to each iteration the back set holds the converted previous
       tile and the front set is receiving the current tile. */
    for (tile = my_spu_id + spus_num; tile < num_tiles - spus_num; tile += spus_num)
    {
        prev_tile = tile - spus_num;
        next_tile = tile + spus_num;

        /* Write back the previously converted tile */
        send_tile(prev_tile, spuS_back, tag_back);

        /* Prefetch the next tile.  It shares tag_back with the send above,
           so the wait on that tag in the next iteration (after the swap)
           covers both transfers. */
        receive_tile(next_tile, spuD_back, tag_back);

        /* Convert the current tile */
        spu_wait_tag(tag_front);
        spu_convert_d2s_tile(spuD_front, spuS_front);

        /* Swap buffers */
        swap_buffers();
    }

    /* Write back the tile converted last (now in the back set) */
    send_tile(tile - spus_num, spuS_back, tag_back);

    /* If this SPU owns at least two tiles, one more tile has been fetched
       into the front set but not yet converted: convert and send it.  The
       send uses tag_back so that the single wait below covers both of the
       final sends. */
    if (my_spu_id + spus_num < num_tiles)
    {
        spu_wait_tag(tag_front);
        spu_convert_d2s_tile(spuD_front, spuS_front);
        send_tile(tile, spuS_front, tag_back);
    }

    /* Wait for last send completions */
    spu_wait_tag(tag_back);
}

#undef receive_tile
#undef send_tile
#undef swap_buffers

//----------------------------------------------------------------------------------------------

void spu_convert_d2s_tile(double *D, float *S)
{
    int i;

    vector double *vecD = (vector double*)D;
    vector double *Dp = vecD;

    vector float *vecS = (vector float*)S;
    vector float *Sp = vecS;

    vector unsigned char shuffle = VEC_LITERAL(vector unsigned char,
        0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
        0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B);

    //----------------------------------------------------------

    extern int spu_event_num;
    extern int spu_event_log[];

    #define spu_log_event(start, end, event)\
        spu_event_log[spu_event_num+0] = start;\
        spu_event_log[spu_event_num+1] = end;\
        spu_event_log[spu_event_num+2] = event;\
        spu_event_num += 4;\
        spu_event_num &= 1024-1;\

    int start;
    int end;

    //----------------------------------------------------------

    start = spu_read_decrementer();

    for (i = 0; i < 2048; i += 8)
    {
        Sp[0] = spu_shuffle(spu_roundtf(Dp[i  ]), spu_roundtf(Dp[i+1]), shuffle);
        Sp[1] = spu_shuffle(spu_roundtf(Dp[i+2]), spu_roundtf(Dp[i+3]), shuffle);
        Sp[2] = spu_shuffle(spu_roundtf(Dp[i+4]), spu_roundtf(Dp[i+5]), shuffle);
        Sp[3] = spu_shuffle(spu_roundtf(Dp[i+6]), spu_roundtf(Dp[i+7]), shuffle);
        Sp += 4;
    }

    end = spu_read_decrementer();
    spu_log_event(start, end, 0x6080A0);
}

//----------------------------------------------------------------------------------------------
