//==============================================================================================
//
//  Innovative Computing Laboratory - Computer Science Department - University of Tennessee
//  Written by Jakub Kurzak
//
//==============================================================================================

#include <vec_literal.h>

//----------------------------------------------------------------------------------------------

// Byte-selection patterns for vec_perm(x, y, pattern).  Pattern bytes
// 0x00-0x0F pick bytes from the first operand, 0x10-0x1F from the second.
// The tables are never written, so they are const-qualified.
//
// shufflehi interleaves the first two 32-bit words of each operand:
//     x = {x0 x1 x2 x3}, y = {y0 y1 y2 y3}  ->  {x0 y0 x1 y1}
static const vector unsigned char shufflehi = VEC_LITERAL(vector unsigned char,
        0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
        0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17);

// shufflelo interleaves the last two 32-bit words of each operand:
//     x = {x0 x1 x2 x3}, y = {y0 y1 y2 y3}  ->  {x2 y2 x3 y3}
static const vector unsigned char shufflelo = VEC_LITERAL(vector unsigned char,
        0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
        0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F);

// All-zero vector used as the initial addend of each multiply-add chain.
// NOTE(review): this symbol has external linkage, so other translation units
// may reference it; it is deliberately left non-static and non-const here.
vector float zero = VEC_LITERAL(vector float, 0.0, 0.0, 0.0, 0.0);

// BLK  - tile edge length in floats.  It is also used below as a pointer
//        stride in units of vector float, where BLK vectors == 4 runs of
//        VBLK vectors.
// VBLK - number of 4-float vectors in one 64-float run (BLK / 4).
#define  BLK 64
#define VBLK 16

//----------------------------------------------------------------------------------------------

// Tile update kernel: C <- C - A * B^T when each tile is read as BLK
// contiguous runs ("rows") of BLK floats.
//
// For every pair (r, s) the dot product of run r of A with run s of B is
// subtracted from element s of run r of C.
// NOTE(review): the "A * B^T" reading assumes row-major tiles - confirm
// against the callers' storage layout.
//
// A, B - input tiles, BLK*BLK floats each, only read.
// C    - tile updated in place, BLK*BLK floats.
//
// NOTE(review): the pointers are dereferenced as vector float, so they are
// presumably required to be 16-byte aligned - confirm with the callers.
//
// The helper macros below expand inside this function and refer to its
// locals (Ap, Bp, Cp and the c<r>_<k> accumulators) by name.  Transpose
// temporaries are block-scoped inside shuffle_4x1, so the function keeps no
// mutable file-scope state and is safe to call from several threads on
// distinct C tiles.
void ppu_sgemm_tile(float *A, float *B, float *C)
{
    int i, j;

    vector float *Ap = (vector float*)A;
    vector float *Bp = (vector float*)B;
    vector float *Cp = (vector float*)C;

    // c<s>_<r>: four partial sums of dot(A run r, B run s) for the current
    // 4x4 block (s = B run within the block, r = A run within the block).
    vector float c0_0, c0_1, c0_2, c0_3;
    vector float c1_0, c1_1, c1_2, c1_3;
    vector float c2_0, c2_1, c2_2, c2_3;
    vector float c3_0, c3_1, c3_2, c3_3;

    //----------------------------------------------------------
    // c = element-wise sum over the VBLK vectors of one run of A (starting at
    // Ap[OFFA]) times one run of B (starting at Bp[OFFB]).  The four lanes of
    // c still have to be added together to obtain the dot product.

    #define sgemm_1x1xNB(c, OFFA, OFFB)\
        do {\
            c = vec_madd(Ap[(OFFA)+ 0], Bp[(OFFB)+ 0], zero);\
            c = vec_madd(Ap[(OFFA)+ 1], Bp[(OFFB)+ 1], c);\
            c = vec_madd(Ap[(OFFA)+ 2], Bp[(OFFB)+ 2], c);\
            c = vec_madd(Ap[(OFFA)+ 3], Bp[(OFFB)+ 3], c);\
            c = vec_madd(Ap[(OFFA)+ 4], Bp[(OFFB)+ 4], c);\
            c = vec_madd(Ap[(OFFA)+ 5], Bp[(OFFB)+ 5], c);\
            c = vec_madd(Ap[(OFFA)+ 6], Bp[(OFFB)+ 6], c);\
            c = vec_madd(Ap[(OFFA)+ 7], Bp[(OFFB)+ 7], c);\
            c = vec_madd(Ap[(OFFA)+ 8], Bp[(OFFB)+ 8], c);\
            c = vec_madd(Ap[(OFFA)+ 9], Bp[(OFFB)+ 9], c);\
            c = vec_madd(Ap[(OFFA)+10], Bp[(OFFB)+10], c);\
            c = vec_madd(Ap[(OFFA)+11], Bp[(OFFB)+11], c);\
            c = vec_madd(Ap[(OFFA)+12], Bp[(OFFB)+12], c);\
            c = vec_madd(Ap[(OFFA)+13], Bp[(OFFB)+13], c);\
            c = vec_madd(Ap[(OFFA)+14], Bp[(OFFB)+14], c);\
            c = vec_madd(Ap[(OFFA)+15], Bp[(OFFB)+15], c);\
        } while (0)

    //----------------------------------------------------------
    // Four consecutive runs of A (at Ap) against the single run of B at
    // Bp[OFFB].

    #define sgemm_4x1xNB(c0, c1, c2, c3, OFFB)\
        do {\
            sgemm_1x1xNB(c0, 0*VBLK, OFFB);\
            sgemm_1x1xNB(c1, 1*VBLK, OFFB);\
            sgemm_1x1xNB(c2, 2*VBLK, OFFB);\
            sgemm_1x1xNB(c3, 3*VBLK, OFFB);\
        } while (0)

    //----------------------------------------------------------
    // Four consecutive runs of A against four consecutive runs of B: fills
    // all sixteen accumulators.

    #define sgemm_4x4xNB(OFFB)\
        do {\
            sgemm_4x1xNB(c0_0, c0_1, c0_2, c0_3, (OFFB) + 0*VBLK);\
            sgemm_4x1xNB(c1_0, c1_1, c1_2, c1_3, (OFFB) + 1*VBLK);\
            sgemm_4x1xNB(c2_0, c2_1, c2_2, c2_3, (OFFB) + 2*VBLK);\
            sgemm_4x1xNB(c3_0, c3_1, c3_2, c3_3, (OFFB) + 3*VBLK);\
        } while (0)

    //----------------------------------------------------------
    // In-place 4x4 transpose of four vectors, in two interleave passes:
    //     {a b c d}        {a e i m}
    //     {e f g h}   ->   {b f j n}
    //     {i j k l}        {c g k o}
    //     {m n o p}        {d h l p}
    // The intermediates are locals of the expansion's own block.

    #define shuffle_4x1(abcd, efgh, ijkl, mnop)\
        do {\
            vector float t_aibj = vec_perm(abcd, ijkl, shufflehi);\
            vector float t_ckdl = vec_perm(abcd, ijkl, shufflelo);\
            vector float t_emfn = vec_perm(efgh, mnop, shufflehi);\
            vector float t_gohp = vec_perm(efgh, mnop, shufflelo);\
            \
            abcd = vec_perm(t_aibj, t_emfn, shufflehi);\
            efgh = vec_perm(t_aibj, t_emfn, shufflelo);\
            ijkl = vec_perm(t_ckdl, t_gohp, shufflehi);\
            mnop = vec_perm(t_ckdl, t_gohp, shufflelo);\
        } while (0)

    //----------------------------------------------------------
    // Reduce the sixteen accumulators to a 4x4 block of dot products and
    // subtract it from C:
    //   1. transpose each group of four so that a lane-wise sum of the group
    //      adds up the four lanes of every original accumulator;
    //   2. sum each group into c<s>_0 (same left-to-right order as before, so
    //      the rounding is unchanged);
    //   3. transpose the four sums so that each vector holds four adjacent
    //      elements of one run of C;
    //   4. subtract from four consecutive runs of C starting at Cp[OFFC].

    #define shuffle_4x4(OFFC)\
        do {\
            shuffle_4x1(c0_0, c0_1, c0_2, c0_3);\
            shuffle_4x1(c1_0, c1_1, c1_2, c1_3);\
            shuffle_4x1(c2_0, c2_1, c2_2, c2_3);\
            shuffle_4x1(c3_0, c3_1, c3_2, c3_3);\
            \
            c0_0 = vec_add(c0_0, c0_1); c0_0 = vec_add(c0_0, c0_2); c0_0 = vec_add(c0_0, c0_3);\
            c1_0 = vec_add(c1_0, c1_1); c1_0 = vec_add(c1_0, c1_2); c1_0 = vec_add(c1_0, c1_3);\
            c2_0 = vec_add(c2_0, c2_1); c2_0 = vec_add(c2_0, c2_2); c2_0 = vec_add(c2_0, c2_3);\
            c3_0 = vec_add(c3_0, c3_1); c3_0 = vec_add(c3_0, c3_2); c3_0 = vec_add(c3_0, c3_3);\
            \
            shuffle_4x1(c0_0, c1_0, c2_0, c3_0);\
            \
            Cp[(OFFC) + 0*VBLK] = vec_sub(Cp[(OFFC) + 0*VBLK], c0_0);\
            Cp[(OFFC) + 1*VBLK] = vec_sub(Cp[(OFFC) + 1*VBLK], c1_0);\
            Cp[(OFFC) + 2*VBLK] = vec_sub(Cp[(OFFC) + 2*VBLK], c2_0);\
            Cp[(OFFC) + 3*VBLK] = vec_sub(Cp[(OFFC) + 3*VBLK], c3_0);\
        } while (0)

    //----------------------------------------------------------

    // j walks A (and C) in groups of four runs; i walks B in groups of four
    // runs, which corresponds to one vector (four floats) along a run of C.
    for (j = 0; j < VBLK; j++)
    {
        for (i = 0; i < VBLK; i++)
        {
            sgemm_4x4xNB(0);
            shuffle_4x4(0);

            Bp += BLK;          // next four runs of B (BLK vectors = 4 * VBLK)
            Cp++;               // next four floats along the current runs of C
        }
        Ap +=  BLK;             // next four runs of A
        Bp -=  BLK*VBLK;        // rewind B to the start of the tile
        Cp += (BLK-VBLK);       // Cp already moved VBLK vectors; land four runs further on
    }
}

//----------------------------------------------------------------------------------------------
