//==============================================================================================
//
//  Innovative Computing Laboratory - Computer Science Department - University of Tennessee
//  Written by Jakub Kurzak
//
//==============================================================================================

#include <vec_literal.h>

//----------------------------------------------------------------------------------------------

// BLK:  tile edge length in floats. ppu_strsm_tile uses it as the row stride of T
//       (Tp[BLK*r + c]) and as the vector-pointer step that skips four rows of B
//       (4 rows * VBLK vectors per row == BLK vectors).
// VBLK: number of 4-float vectors in one tile row (BLK / 4); the row stride of B
//       when B is addressed through a vector float pointer.
#define  BLK 64
#define VBLK 16

//----------------------------------------------------------------------------------------------

void ppu_strsm_tile(float *T, float *B)
{
    vector float *Bp  = (vector float*)B;
    vector float *BIp = (vector float*)B;
    vector float *BJp = (vector float*)B;
    
    float *Tp = T;

    vector float bi0, bi1, bi2, bi3;

    #define bj(N)\
    \
        vector float bj##N##_0;\
        vector float bj##N##_1;\
        vector float bj##N##_2;\
        vector float bj##N##_3;\

    bj(0)  bj(1)  bj(2)  bj(3)
    bj(4)  bj(5)  bj(6)  bj(7)
    bj(8)  bj(9)  bj(10) bj(11)
    bj(12) bj(13) bj(14) bj(15)

    vector float t0_0, t0_1, t0_2, t0_3;
    vector float t1_0, t1_1, t1_2, t1_3;
    vector float t2_0, t2_1, t2_2, t2_3;
    vector float t3_0, t3_1, t3_2, t3_3;

    float t0 __attribute__ ((aligned (16)));
    float t1 __attribute__ ((aligned (16)));
    float t2 __attribute__ ((aligned (16)));
    float t3 __attribute__ ((aligned (16)));

    vector float zero = VEC_LITERAL(vector float, 0.0, 0.0, 0.0, 0.0);

    vector unsigned char shufflehi = VEC_LITERAL(vector unsigned char,
        0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
        0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17);

    vector unsigned char shufflelo = VEC_LITERAL(vector unsigned char,
        0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
        0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F);

    vector float aibj;
    vector float ckdl;
    vector float emfn;
    vector float gohp;

    int i, j, k;

    //----------------------------------------------------------

    #define shuffle_4x1(abcd, efgh, ijkl, mnop)\
    \
        aibj = vec_perm(abcd, ijkl, shufflehi);\
        ckdl = vec_perm(abcd, ijkl, shufflelo);\
        emfn = vec_perm(efgh, mnop, shufflehi);\
        gohp = vec_perm(efgh, mnop, shufflelo);\
        \
        abcd = vec_perm(aibj, emfn, shufflehi);\
        efgh = vec_perm(aibj, emfn, shufflelo);\
        ijkl = vec_perm(ckdl, gohp, shufflehi);\
        mnop = vec_perm(ckdl, gohp, shufflelo);\

    //----------------------------------------------------------

    #define trsm_shuffle_4x4(OFFX, OFFY)\
        shuffle_4x1(\
            BJp[OFFX+OFFY+0*VBLK],\
            BJp[OFFX+OFFY+1*VBLK],\
            BJp[OFFX+OFFY+2*VBLK],\
            BJp[OFFX+OFFY+3*VBLK]);\


    #define trsm_shuffle_4xNB(OFFY)\
        trsm_shuffle_4x4( 0, OFFY);\
        trsm_shuffle_4x4( 1, OFFY);\
        trsm_shuffle_4x4( 2, OFFY);\
        trsm_shuffle_4x4( 3, OFFY);\
        trsm_shuffle_4x4( 4, OFFY);\
        trsm_shuffle_4x4( 5, OFFY);\
        trsm_shuffle_4x4( 6, OFFY);\
        trsm_shuffle_4x4( 7, OFFY);\
        trsm_shuffle_4x4( 8, OFFY);\
        trsm_shuffle_4x4( 9, OFFY);\
        trsm_shuffle_4x4(10, OFFY);\
        trsm_shuffle_4x4(11, OFFY);\
        trsm_shuffle_4x4(12, OFFY);\
        trsm_shuffle_4x4(13, OFFY);\
        trsm_shuffle_4x4(14, OFFY);\
        trsm_shuffle_4x4(15, OFFY);\

    //----------------------------------------------------------

    #define trsm_4x4x4_load(OFFI, N)\
    \
        bj##N##_0 = BJp[OFFI+0*VBLK];\
        bj##N##_1 = BJp[OFFI+1*VBLK];\
        bj##N##_2 = BJp[OFFI+2*VBLK];\
        bj##N##_3 = BJp[OFFI+3*VBLK];\


    #define trsm_4x4xRHS_load(OFFJ) \
    \
        trsm_4x4x4_load(OFFJ+ 0*BLK, 0);\
        trsm_4x4x4_load(OFFJ+ 1*BLK, 1);\
        trsm_4x4x4_load(OFFJ+ 2*BLK, 2);\
        trsm_4x4x4_load(OFFJ+ 3*BLK, 3);\

    //----------------------------------------------------------

    #define trsm_4x4x4(OFFI, N)\
    \
        bi0 = BIp[OFFI+0*VBLK];\
        bi1 = BIp[OFFI+1*VBLK];\
        bi2 = BIp[OFFI+2*VBLK];\
        bi3 = BIp[OFFI+3*VBLK];\
        \
        bj##N##_0 = vec_nmsub(bi0, t0_0, bj##N##_0);\
        bj##N##_0 = vec_nmsub(bi1, t0_1, bj##N##_0);\
        bj##N##_0 = vec_nmsub(bi2, t0_2, bj##N##_0);\
        bj##N##_0 = vec_nmsub(bi3, t0_3, bj##N##_0);\
        \
        bj##N##_1 = vec_nmsub(bi0, t1_0, bj##N##_1);\
        bj##N##_1 = vec_nmsub(bi1, t1_1, bj##N##_1);\
        bj##N##_1 = vec_nmsub(bi2, t1_2, bj##N##_1);\
        bj##N##_1 = vec_nmsub(bi3, t1_3, bj##N##_1);\
        \
        bj##N##_2 = vec_nmsub(bi0, t2_0, bj##N##_2);\
        bj##N##_2 = vec_nmsub(bi1, t2_1, bj##N##_2);\
        bj##N##_2 = vec_nmsub(bi2, t2_2, bj##N##_2);\
        bj##N##_2 = vec_nmsub(bi3, t2_3, bj##N##_2);\
        \
        bj##N##_3 = vec_nmsub(bi0, t3_0, bj##N##_3);\
        bj##N##_3 = vec_nmsub(bi1, t3_1, bj##N##_3);\
        bj##N##_3 = vec_nmsub(bi2, t3_2, bj##N##_3);\
        bj##N##_3 = vec_nmsub(bi3, t3_3, bj##N##_3);\


    #define trsm_4x4xRHS(OFFJ) \
    \
        t0_0 = vec_splat(vec_lde(0, &Tp[BLK*0+0]), 0);\
        t0_1 = vec_splat(vec_lde(0, &Tp[BLK*0+1]), 1);\
        t0_2 = vec_splat(vec_lde(0, &Tp[BLK*0+2]), 2);\
        t0_3 = vec_splat(vec_lde(0, &Tp[BLK*0+3]), 3);\
        \
        t1_0 = vec_splat(vec_lde(0, &Tp[BLK*1+0]), 0);\
        t1_1 = vec_splat(vec_lde(0, &Tp[BLK*1+1]), 1);\
        t1_2 = vec_splat(vec_lde(0, &Tp[BLK*1+2]), 2);\
        t1_3 = vec_splat(vec_lde(0, &Tp[BLK*1+3]), 3);\
        \
        t2_0 = vec_splat(vec_lde(0, &Tp[BLK*2+0]), 0);\
        t2_1 = vec_splat(vec_lde(0, &Tp[BLK*2+1]), 1);\
        t2_2 = vec_splat(vec_lde(0, &Tp[BLK*2+2]), 2);\
        t2_3 = vec_splat(vec_lde(0, &Tp[BLK*2+3]), 3);\
        \
        t3_0 = vec_splat(vec_lde(0, &Tp[BLK*3+0]), 0);\
        t3_1 = vec_splat(vec_lde(0, &Tp[BLK*3+1]), 1);\
        t3_2 = vec_splat(vec_lde(0, &Tp[BLK*3+2]), 2);\
        t3_3 = vec_splat(vec_lde(0, &Tp[BLK*3+3]), 3);\
        \
        trsm_4x4x4(OFFJ+ 0*BLK, 0);\
        trsm_4x4x4(OFFJ+ 1*BLK, 1);\
        trsm_4x4x4(OFFJ+ 2*BLK, 2);\
        trsm_4x4x4(OFFJ+ 3*BLK, 3);\

    //----------------------------------------------------------

    #define trsm_4x4x4_(OFFI, N)\
    \
        bj##N##_0 = vec_madd(bj##N##_0, t0_0, zero);\
        \
        bj##N##_1 = vec_nmsub(bj##N##_0, t1_0, bj##N##_1);\
        bj##N##_1 = vec_madd(bj##N##_1, t1_1, zero);\
        \
        bj##N##_2 = vec_nmsub(bj##N##_0, t2_0, bj##N##_2);\
        bj##N##_2 = vec_nmsub(bj##N##_1, t2_1, bj##N##_2);\
        bj##N##_2 = vec_madd(bj##N##_2, t2_2, zero);\
        \
        bj##N##_3 = vec_nmsub(bj##N##_0, t3_0, bj##N##_3);\
        bj##N##_3 = vec_nmsub(bj##N##_1, t3_1, bj##N##_3);\
        bj##N##_3 = vec_nmsub(bj##N##_2, t3_2, bj##N##_3);\
        bj##N##_3 = vec_madd(bj##N##_3, t3_3, zero);\


    #define trsm_4x4xRHS_(OFFJ)\
    \
        t0 = 1.0f/Tp[BLK*0+0];\
        t1 = 1.0f/Tp[BLK*1+1];\
        t2 = 1.0f/Tp[BLK*2+2];\
        t3 = 1.0f/Tp[BLK*3+3];\
        \
        t0_0 = vec_splat(vec_lde(0, &t0), 0);\
        t1_1 = vec_splat(vec_lde(0, &t1), 0);\
        t2_2 = vec_splat(vec_lde(0, &t2), 0);\
        t3_3 = vec_splat(vec_lde(0, &t3), 0);\
        \
        t1_0 = vec_splat(vec_lde(0, &Tp[BLK*1+0]), 0);\
        t2_0 = vec_splat(vec_lde(0, &Tp[BLK*2+0]), 0);\
        t2_1 = vec_splat(vec_lde(0, &Tp[BLK*2+1]), 1);\
        t3_0 = vec_splat(vec_lde(0, &Tp[BLK*3+0]), 0);\
        t3_1 = vec_splat(vec_lde(0, &Tp[BLK*3+1]), 1);\
        t3_2 = vec_splat(vec_lde(0, &Tp[BLK*3+2]), 2);\
        \
        trsm_4x4x4_(OFFJ+ 0*BLK, 0);\
        trsm_4x4x4_(OFFJ+ 1*BLK, 1);\
        trsm_4x4x4_(OFFJ+ 2*BLK, 2);\
        trsm_4x4x4_(OFFJ+ 3*BLK, 3);\

    //----------------------------------------------------------

    #define trsm_4x4x4_store(OFFI, N)\
    \
        BJp[OFFI+0*VBLK] = bj##N##_0;\
        BJp[OFFI+1*VBLK] = bj##N##_1;\
        BJp[OFFI+2*VBLK] = bj##N##_2;\
        BJp[OFFI+3*VBLK] = bj##N##_3;\


    #define trsm_4x4xRHS_store(OFFJ) \
    \
        trsm_4x4x4_store(OFFJ+ 0*BLK, 0);\
        trsm_4x4x4_store(OFFJ+ 1*BLK, 1);\
        trsm_4x4x4_store(OFFJ+ 2*BLK, 2);\
        trsm_4x4x4_store(OFFJ+ 3*BLK, 3);\

    //----------------------------------------------------------

    for (i = 0; i < VBLK; i++)
    {
        trsm_shuffle_4xNB(0);
        BJp += BLK;
    }
    BJp = (vector float*)B;


    for (k = 0; k < VBLK; k += 4)
    {    
        Bp  = (vector float*)B;
        BIp = (vector float*)B;
        BJp = (vector float*)B;

        Bp  += k*BLK;
        BIp += k*BLK;
        BJp += k*BLK;

        Tp = T;

        for (j = 0; j < VBLK; j++)
        {
            trsm_4x4xRHS_load(0);
            for (i = 0; i < j; i++)
            {
                trsm_4x4xRHS(0);
                Tp += 4;        
                BIp++;
            }
            trsm_4x4xRHS_(0);
            trsm_4x4xRHS_store(0);

            Tp  -= j*4;
            BIp -= j;

            Tp += BLK*4;
            BJp++;
        }
        BJp = (vector float*)B;
    }


    for (i = 0; i < VBLK; i++)
    {
        trsm_shuffle_4xNB(0);
        BJp += BLK;
    }
}

//----------------------------------------------------------------------------------------------
