/**
 *
 * @file core_ctsqrt.c
 *
 *  PLASMA core_blas kernel
 *  PLASMA is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 * @version 2.2.0
 * @author Hatem Ltaief
 * @author Mathieu Faverge
 * @date 2009-11-15
 *
 **/
#include <lapack.h>
#include "common.h"
#undef REAL
#define COMPLEX

/***************************************************************************//**
 *
 * @ingroup CORE_PLASMA_Complex32_t
 *
 * CORE_ZTSQRT computes a QR factorization of a rectangular matrix
 * formed by coupling a complex N-by-N upper triangular tile A1
 * on top of a complex M-by-N tile A2:
 *
 *    | A1 | = Q * R
 *    | A2 |
 *
 *******************************************************************************
 *
 * @param[in] M
 *         The number of columns of the tile A2. M >= 0.
 *
 * @param[in] N
 *         The number of rows of the tile A1.
 *         The number of columns of the tiles A1 and A2. N >= 0.
 *
 * @param[in] IB
 *         The inner-blocking size.  IB >= 0.
 *
 * @param[in,out] A1
 *         On entry, the N-by-N tile A1.
 *         On exit, the elements on and above the diagonal of the array
 *         contain the N-by-N upper trapezoidal tile R;
 *         the elements below the diagonal are not referenced.
 *
 * @param[in] LDA1
 *         The leading dimension of the array A1. LDA1 >= max(1,N).
 *
 * @param[in,out] A2
 *         On entry, the M-by-N tile A2.
 *         On exit, all the elements with the array TAU, represent
 *         the unitary tile Q as a product of elementary reflectors
 *         (see Further Details).
 *
 * @param[in] LDA2
 *         The leading dimension of the tile A2. LDA2 >= max(1,M).
 *
 * @param[out] T
 *         The IB-by-N triangular factor T of the block reflector.
 *         T is upper triangular by block (economic storage);
 *         The rest of the array is not referenced.
 *
 *         The IB-by-NN triangular factor T of the block reflector.
 *         T is upper triangular by block (economic storage);
 *         The rest of the array is not referenced.
 *
 * @param[in] LDT
 *         The leading dimension of the array T. LDT >= IB.
 *
 * @param[out] TAU
 *         The scalar factors of the elementary reflectors (see Further
 *         Details).
 *
 * @param[in,out] WORK
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *          \retval <0 if -i, the i-th argument had an illegal value
 *
 ******************************************************************************/
int CORE_ctsqrt(int M, int N, int IB,
                PLASMA_Complex32_t *A1, int LDA1,
                PLASMA_Complex32_t *A2, int LDA2,
                PLASMA_Complex32_t *T, int LDT,
                PLASMA_Complex32_t *TAU, PLASMA_Complex32_t *WORK)
{
    static PLASMA_Complex32_t zone  = 1.0;
    static PLASMA_Complex32_t zzero = 0.0;
    static int                ione  = 1;

    PLASMA_Complex32_t alpha;
    int i, ii, sb, tmp;

    /*
     * Check input arguments
     */
    if (M < 0) {
        plasma_error("CORE_ctsqrt", "illegal value of M");
        return -1;
    }
    if (N < 0) {
        plasma_error("CORE_ctsqrt", "illegal value of N");
        return -2;
    }
    if (IB < 0) {
        plasma_error("CORE_ctsqrt", "illegal value of IB");
        return -3;
    }
    if ((LDA2 < max(1,M)) && (M > 0)) {
        plasma_error("CORE_ctsqrt", "illegal value of LDA2");
        return -8;
    }

    /*
     * Quick return
     */
    if ( (M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    for(ii=0; ii<N; ii+=IB) {
        sb = min(N-ii, IB);
        for(i=0; i<sb; i++) {
            /*
             * Generate elementary reflector H( II*IB+I ) to annihilate
             * A( II*IB+I:M, II*IB+I ).
             */
            tmp = M+1;
            lapack_clarfg(tmp, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], ione, &TAU[ii+i]);

            if ((ii+i) < N) {
                /*
                 * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left.
                 */
                cblas_ccopy(sb-i-1,
                            &A1[LDA1*(ii+i+1)+(ii+i)], LDA1,
                            WORK, 1);
#ifdef COMPLEX
                /* LAPACK_clacgv( sb-i-1, WORK, ione ); */
                tmp = sb-i-1;
                lapack_clacgv( tmp, WORK, ione );
#endif
                cblas_cgemv(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans,
                            M, sb-i-1,
                            CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2,
                            &A2[LDA2*(ii+i)], 1,
                            CBLAS_SADDR(zone), WORK, 1);
#ifdef COMPLEX
                /* LAPACK_clacgv( sb-i-1, WORK, ione ); */
                tmp = sb-i-1;
                lapack_clacgv( tmp, WORK, ione );
#endif
                alpha = -conjf(TAU[ii+i]);
                cblas_caxpy(sb-i-1, CBLAS_SADDR(alpha),
                            WORK, 1,
                            &A1[LDA1*(ii+i+1)+ii+i], LDA1);

#ifdef COMPLEX
                /* LAPACK_clacgv( sb-i-1, WORK, ione ); */
                tmp = sb-i-1;
                lapack_clacgv( tmp, WORK, ione );
#endif
                alpha = -conjf(TAU[ii+i]);
                cblas_cgerc(CblasColMajor, M, sb-i-1, CBLAS_SADDR(alpha),
                            &A2[LDA2*(ii+i)], 1,
                            WORK, 1,
                            &A2[LDA2*(ii+i+1)], LDA2);
            }

            /*
             * Calculate T.
             */
            alpha = -TAU[ii+i];
            cblas_cgemv(CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, M, i,
                        CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2,
                        &A2[LDA2*(ii+i)], 1,
                        CBLAS_SADDR(zzero), &T[LDT*(ii+i)], 1);

            cblas_ctrmv(CblasColMajor, (CBLAS_UPLO)PlasmaUpper,
                        (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i,
                        &T[LDT*ii], LDT,
                        &T[LDT*(ii+i)], 1);

            T[LDT*(ii+i)+i] = TAU[ii+i];
        }

        if (N > (ii+IB)) {
            CORE_cssmqr(PlasmaLeft, PlasmaConjTrans,
                        sb, M, N-(ii+sb), IB, IB,
                        &A1[LDA1*(ii+sb)+ii], LDA1,
                        &A2[LDA2*(ii+sb)],    LDA2,
                        &A2[LDA2*ii],         LDA2,
                        &T[LDT*ii],           LDT,
                        WORK, sb);
        }
    }
    return PLASMA_SUCCESS;
}

/***************************************************************************//**
 *
 **/
void CORE_ctsqrt_quark(Quark* quark)
{
    int M;
    int N;
    int IB;
    PLASMA_Complex32_t *A1;
    int LDA1;
    PLASMA_Complex32_t *A2;
    int LDA2;
    PLASMA_Complex32_t *T;
    int LDT;
    PLASMA_Complex32_t *TAU;
    PLASMA_Complex32_t *WORK;

    quark_unpack_args_11(quark, M, N, IB, A1, LDA1, A2, LDA2, T, LDT, TAU, WORK);
    CORE_ctsqrt(M, N, IB, A1, LDA1, A2, LDA2, T, LDT, TAU, WORK);
}
