/**
 *
 * @file core_zttrfb.c
 *
 *  PLASMA core_blas kernel
 *  PLASMA is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 * @version 2.2.0
 * @author Hatem Ltaief
 * @author Mathieu Faverge
 * @date 2009-11-15
 *
 **/
#include <cblas.h>
#include <lapack.h>
#include "common.h"

/***************************************************************************//**
 *
 * @ingroup CORE_PLASMA_Complex64_t
 *
 *  CORE_zttrfb applies a complex upper triangular block reflector H
 *  or its transpose H' to a
 *  complex rectangular matrix formed by coupling two tiles A1 and A2.
 *  (Only SIDE='L' and STOREV='C' supported!)
 *
 *******************************************************************************
 *
 * @param[in] SIDE
 *         @arg PlasmaLeft  : apply Q or Q**H from the Left;
 *         @arg PlasmaRight : apply Q or Q**H from the Right.
 *
 * @param[in] TRANS
 *         @arg PlasmaNoTrans   : No transpose, apply Q;
 *         @arg PlasmaConjTrans : ConjTranspose, apply Q**H.
 *
 * @param[in] DIRECT
 *         Indicates how H is formed from a product of elementary
 *         reflectors
 *         @arg PlasmaForward  : H = H(1) H(2) . . . H(k) (Forward)
 *         @arg PlasmaBackward : H = H(k) . . . H(2) H(1) (Backward)
 *
 * @param[in] STOREV
 *         Indicates how the vectors which define the elementary
 *         reflectors are stored:
 *         @arg PlasmaColumnwise
 *         @arg PlasmaRowwise
 *
 * @param[in] M1
 *         The number of rows of the tile A1. M1 >= 0.
 *
 * @param[in] M2
 *         The number of rows of the tile A2. M2 >= 0.
 *
 * @param[in] NN
 *         The number of columns of the tiles A1 and A2. NN >= 0.
 *
 * @param[in] IB
 *         The inner-blocking size.  IB >= 0.
 *
 * @param[in] K
 *         The number of elementary reflectors whose product defines
 *         the matrix Q.
 *
 * @param[in,out] A1
 *         On entry, the M1-by-NN tile A1.
 *         On exit, A1 is overwritten by the application of Q.
 *
 * @param[in] LDA1
 *         The leading dimension of the array A1. LDA1 >= max(1,M1).
 *
 * @param[in,out] A2
 *         On entry, the M2-by-NN tile A2.
 *         On exit, A2 is overwritten by the application of Q.
 *
 * @param[in] LDA2
 *         The leading dimension of the tile A2. LDA2 >= max(1,M2).
 *
 * @param[in] V
 *         (LDV,K) if STOREV = 'C'
 *         (LDV,M) if STOREV = 'R' and SIDE = 'L'
 *         (LDV,N) if STOREV = 'R' and SIDE = 'R'
 *         The upper triangular matrix V.
 *
 * @param[in] LDV
 *         The leading dimension of the array V.
 *         If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
 *         if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
 *         if STOREV = 'R', LDV >= K.
 *
 * @param[out] T
 *         The triangular K-by-K matrix T in the representation of the
 *         block reflector.
 *
 * @param[in] LDT
 *         The leading dimension of the array T. LDT >= K.
 *
 * @param[in,out] WORK
 *
 * @param[in] LDWORK
 *         The dimension of the array WORK.
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *          \retval <0 if -i, the i-th argument had an illegal value
 *
 ******************************************************************************/
int CORE_zttrfb(int side, int trans, int direct, int storev,
                int M1, int M2, int NN, int IB,
                PLASMA_Complex64_t *A1,   int LDA1,
                PLASMA_Complex64_t *A2,   int LDA2,
                PLASMA_Complex64_t *V,    int LDV,
                PLASMA_Complex64_t *T,    int LDT,
                PLASMA_Complex64_t *WORK, int LDWORK)
{
    static PLASMA_Complex64_t zone  =  1.0;
    static PLASMA_Complex64_t mzone = -1.0;

    int j;

    /* Check input arguments */
    if (M1 < 0) {
        plasma_error("CORE_zttrfb", "illegal value of M1");
        return -5;
    }
    if (M2 < 0) {
        plasma_error("CORE_zttrfb", "illegal value of M2");
        return -6;
    }
    if (NN < 0) {
        plasma_error("CORE_zttrfb", "illegal value of NN");
        return -7;
    }
    if (IB < 0) {
        plasma_error("CORE_zttrfb", "illegal value of IB");
        return -8;
    }

    /* Quick return */
    if ( (M1 == 0) || (M2 == 0) || (NN == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    if (storev == PlasmaColumnwise) {
        if (direct == PlasmaForward) {
            if (side == PlasmaLeft) {
                /*
                 * B = A1 + V' * A2
                 */
                lapack_zlacpy((enum lapack_uplo_type)lapack_upper_lower, 
                              IB, NN,
                              &A2[M2-IB], LDA2, WORK, LDWORK);

                cblas_ztrmm(CblasColMajor, CblasLeft, CblasUpper,
                            CblasConjTrans, CblasNonUnit, IB, NN,
                            CBLAS_SADDR(zone), &V[M2-IB], LDV,
                            WORK, LDWORK);

                if (M2 > IB) {
                    cblas_zgemm(CblasColMajor, CblasConjTrans, CblasNoTrans,
                                IB, NN, M2-IB,
                                CBLAS_SADDR(zone), V, LDV,
                                A2, LDA2,
                                CBLAS_SADDR(zone), WORK, LDWORK);
                }

                for(j=0; j<NN; j++) {
                    cblas_zaxpy(IB, CBLAS_SADDR(zone),
                                &A1[LDA1*j],     1,
                                &WORK[LDWORK*j], 1);
                }

                /*
                 * A2 = A2 - V*T*B --->  B = T*B, A2 = A2 - V*B
                 */
                cblas_ztrmm(CblasColMajor, CblasLeft, CblasUpper,
                            (CBLAS_TRANSPOSE)trans, CblasNonUnit, IB, NN,
                            CBLAS_SADDR(zone), T, LDT, WORK, LDWORK);

                /*
                 * A1 = A1 - T*B
                 */
                for(j=0; j<NN; j++) {
                    cblas_zaxpy(IB, CBLAS_SADDR(mzone),
                                &WORK[LDWORK*j], 1,
                                &A1[LDA1*j],     1);
                }

                if ( M2 > IB ) {
                    cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                                M2-IB, NN, IB,
                                CBLAS_SADDR(mzone), V, LDV,
                                WORK, LDWORK,
                                CBLAS_SADDR(zone), A2, LDA2);
                }

                cblas_ztrmm(CblasColMajor, CblasLeft, CblasUpper,
                            CblasNoTrans, CblasNonUnit, IB, NN,
                            CBLAS_SADDR(mzone), &V[M2-IB], LDV,
                            WORK, LDWORK);

                for(j=0; j<NN; j++) {
                    cblas_zaxpy(IB, CBLAS_SADDR(zone),
                                &WORK[LDWORK*j],     1,
                                &A2[LDA2*j+(M2-IB)], 1);
                }
            }
            else {
                plasma_error("CORE_zttrfb",
                             "Not implemented (ColMajor / Forward / Right)");

                return PLASMA_ERR_NOT_SUPPORTED;
            }
        }
        else {
            /* Backward */
            plasma_error("CORE_zttrfb",
                         "Not implemented (ColMajor / Backward / Left or Right)");
            return PLASMA_ERR_NOT_SUPPORTED;
        }
    }
    else {
        /* RowWise */
        plasma_error("CORE_zttrfb",
                     "Not implemented (RowMajor / Forward or Backward / Left or Right)");
        return PLASMA_ERR_NOT_SUPPORTED;
    }
    return PLASMA_SUCCESS;
}
