/* /home4/luszczek/mscratch/build/SCALAPACK/SRC/pzhettrd.f -- translated by f2c (version 20031025).
   You must link the resulting object file with libf2c:
	on Microsoft Windows system, link with libf2c.lib;
	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
	or, if you install libf2c.a in a standard place, with -lf2c -lm
	-- in that order, at the end of the command line, as in
		cc *.o -lf2c -lm
	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

		http://www.netlib.org/f2c/libf2c.zip
*/

#include "f2c.h"

/* Table of constant values */

static integer c__2 = 2;
static integer c__0 = 0;
static integer c__3 = 3;
static integer c__4 = 4;
static integer c__1 = 1;
static integer c__6 = 6;
static doublecomplex c_b60 = {0.,0.};
static integer c__5 = 5;
static integer c_n1 = -1;
static doublecomplex c_b82 = {1.,0.};
static doublecomplex c_b144 = {-1.,0.};

/* Subroutine */ int pzhettrd_(char *uplo, integer *n, doublecomplex *a, 
	integer *ia, integer *ja, integer *desca, doublereal *d__, doublereal 
	*e, doublecomplex *tau, doublecomplex *work, integer *lwork, integer *
	info, ftnlen uplo_len)
{
    /* System generated locals */
    integer i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9;
    doublereal d__1;
    doublecomplex z__1, z__2, z__3, z__4, z__5, z__6;

    /* Builtin functions */
    double sqrt(doublereal);
    void d_cnjg(doublecomplex *, doublecomplex *);
    double d_imag(doublecomplex *), d_sign(doublereal *, doublereal *);
    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);

    /* Local variables */
    logical balanced;
    integer indexinh, minindex, maxindex, indexinv;
    doublecomplex conjtoph;
    logical twogemms;
    doublecomplex conjtopv;
    integer mysetnum;
    extern /* Subroutine */ int dcombnrm2_();
    doublecomplex c__;
    integer i__, j;
    doublecomplex cc[3];
    integer np, nq;
    extern /* Subroutine */ int pdtreecomb_(integer *, char *, integer *, 
	    doublereal *, integer *, integer *, U_fp, ftnlen);
    logical interleave;
    integer anb, lda, lii, inh, lij, npb, nqb, myfirstrow, pnb, ldv, inv, nps;
    doublecomplex oneoverbeta;
    integer npm0, npm1, nqm1;
    doublecomplex beta;
    integer liib, inhb, lijb, rowsperproc, invb, ldzg, nbzg, inht;
    doublereal dtmp[5];
    doublecomplex toph;
    doublereal norm;
    integer invt;
    doublecomplex topv;
    integer liip1, lijp1, idum1[1], idum2[1], ltnm1;
    doublecomplex alpha;
    extern integer iceil_(integer *, integer *);
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    integer inhtb, pbmin, index, pbmax, npcol;
    extern /* Subroutine */ int lfc_SLzscal(integer *, doublecomplex *, 
	    doublecomplex *, integer *), lfc_SLzgemm(char *, char *, integer *, 
	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
	     doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
	    integer *, ftnlen, ftnlen);
    integer invtb, mycol, lwmin, intmp;
    extern /* Subroutine */ int lfc_SLzgemv(char *, integer *, integer *, 
	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
	    integer *, doublecomplex *, doublecomplex *, integer *, ftnlen);
    integer npset;
    logical upper;
    integer ictxt, nprow;
    doublecomplex topnv;
    integer myrow, ltlip1;
    extern doublereal lfc_SLdznrm2(integer *, doublecomplex *, integer *);
    integer indexa, bindex;
    doublereal safmin, safmax;
    integer curcol, pbsize;
    extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, 
	    doublecomplex *, integer *, doublecomplex *, integer *, ftnlen), 
	    blacs_gridinfo__(integer *, integer *, integer *, integer *, 
	    integer *);
    extern integer numroc_(integer *, integer *, integer *, integer *, 
	    integer *);
    integer nxtcol;
    extern /* Subroutine */ int dgebr2d_(integer *, char *, char *, integer *,
	     integer *, doublereal *, integer *, integer *, integer *, ftnlen,
	     ftnlen), dgebs2d_(integer *, char *, char *, integer *, integer *
	    , doublereal *, integer *, ftnlen, ftnlen);
    doublecomplex toptau;
    integer currow;
    extern /* Subroutine */ int chk1mat_(integer *, integer *, integer *, 
	    integer *, integer *, integer *, integer *, integer *, integer *),
	     zgebr2d_(integer *, char *, char *, integer *, integer *, 
	    doublecomplex *, integer *, integer *, integer *, ftnlen, ftnlen),
	     zgebs2d_(integer *, char *, char *, integer *, integer *, 
	    doublecomplex *, integer *, ftnlen, ftnlen);
    integer nxtrow;
    extern /* Subroutine */ int zgesd2d_(integer *, integer *, integer *, 
	    doublecomplex *, integer *, integer *, integer *), dgsum2d_(
	    integer *, char *, char *, integer *, integer *, doublereal *, 
	    integer *, integer *, integer *, ftnlen, ftnlen), ztrmvt_(char *, 
	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
	     doublecomplex *, integer *, doublecomplex *, integer *, 
	    doublecomplex *, integer *, ftnlen), zgerv2d_(integer *, integer *
	    , integer *, doublecomplex *, integer *, integer *, integer *), 
	    zgsum2d_(integer *, char *, char *, integer *, integer *, 
	    doublecomplex *, integer *, integer *, integer *, ftnlen, ftnlen);
    extern doublereal pdlamch_(integer *, char *, ftnlen);
    extern /* Subroutine */ int pxerbla_(integer *, char *, integer *, ftnlen)
	    ;
    extern integer pjlaenv_(integer *, integer *, char *, char *, integer *, 
	    integer *, integer *, integer *, ftnlen, ftnlen);
    extern /* Subroutine */ int pchk1mat_(integer *, integer *, integer *, 
	    integer *, integer *, integer *, integer *, integer *, integer *, 
	    integer *, integer *, integer *);
    integer numrows;


/*  -- ScaLAPACK routine (version 1.7) -- */
/*     University of Tennessee, Knoxville, Oak Ridge National Laboratory, */
/*     and University of California, Berkeley. */
/*     October 15, 1999 */

/*     .. Scalar Arguments .. */
/*     .. */
/*     .. Array Arguments .. */
/*     .. */

/*     Purpose */

/*     ======= */

/*     PZHETTRD reduces a complex Hermitian matrix sub( A ) to Hermitian */
/*     tridiagonal form T by an unitary similarity transformation: */
/*     Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1). */

/*     Notes */
/*     ===== */

/*     Each global data object is described by an associated description */
/*     vector.  This vector stores the information required to establish */
/*     the mapping between an object element and its corresponding */
/*     process and memory location. */

/*     Let A be a generic term for any 2D block cyclicly distributed */
/*     array. */
/*     Such a global array has an associated description vector DESCA. */
/*     In the following comments, the character _ should be read as */
/*     "of the global array". */

/*     NOTATION        STORED IN      EXPLANATION */
/*     --------------- -------------- ----------------------------------- */
/*     DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case, */
/*     DTYPE_A = 1. */
/*     CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, */
/*     indicating the BLACS process grid A is distribu- */
/*     ted over. The context itself is glo- */
/*     bal, but the handle (the integer */
/*     value) may vary. */
/*     M_A    (global) DESCA( M_ )    The number of rows in the global */
/*     array A. */
/*     N_A    (global) DESCA( N_ )    The number of columns in the global */
/*     array A. */
/*     MB_A   (global) DESCA( MB_ )   The blocking factor used to */
/*     distribute the rows of the array. */
/*     NB_A   (global) DESCA( NB_ )   The blocking factor used to */
/*     distribute the columns of the array. */
/*     RSRC_A (global) DESCA( RSRC_ ) The process row over which the */
/*     first row of the array A is distributed. */
/*     CSRC_A (global) DESCA( CSRC_ ) The process column over which the */
/*     first column of the array A is */
/*     distributed. */
/*     LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local */
/*     array.  LLD_A >= MAX(1,LOCp(M_A)). */

/*     Let K be the number of rows or columns of a distributed matrix, */
/*     and assume that its process grid has dimension p x q. */
/*     LOCp( K ) denotes the number of elements of K that a process */
/*     would receive if K were distributed over the p processes of its */
/*     process column. */
/*     Similarly, LOCq( K ) denotes the number of elements of K that a */
/*     process would receive if K were distributed over the q processes */
/*     of its process row. */
/*     The values of LOCp() and LOCq() may be determined via a call to */
/*     the ScaLAPACK tool function, NUMROC: */
/*     LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), */
/*     LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). */
/*     An upper bound for these quantities may be computed by: */
/*     LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A */
/*     LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A */

/*     Arguments */
/*     ========= */

/*     UPLO    (global input) CHARACTER */
/*     Specifies whether the upper or lower triangular part of the */
/*     Hermitian matrix sub( A ) is stored: */
/*     = 'U':  Upper triangular */
/*     = 'L':  Lower triangular */

/*     N       (global input) INTEGER */
/*     The number of rows and columns to be operated on, i.e. the */
/*     order of the distributed submatrix sub( A ). N >= 0. */

/*     A       (local input/local output) COMPLEX*16 pointer into the */
/*     local memory to an array of dimension (LLD_A,LOCq(JA+N-1)). */
/*     On entry, this array contains the local pieces of the */
/*     Hermitian distributed matrix sub( A ).  If UPLO = 'U', the */
/*     leading N-by-N upper triangular part of sub( A ) contains */
/*     the upper triangular part of the matrix, and its strictly */
/*     lower triangular part is not referenced. If UPLO = 'L', the */
/*     leading N-by-N lower triangular part of sub( A ) contains the */
/*     lower triangular part of the matrix, and its strictly upper */
/*     triangular part is not referenced. On exit, if UPLO = 'U', */
/*     the diagonal and first superdiagonal of sub( A ) are over- */
/*     written by the corresponding elements of the tridiagonal */
/*     matrix T, and the elements above the first superdiagonal, */
/*     with the array TAU, represent the unitary matrix Q as a */
/*     product of elementary reflectors; if UPLO = 'L', the diagonal */
/*     and first subdiagonal of sub( A ) are overwritten by the */
/*     corresponding elements of the tridiagonal matrix T, and the */
/*     elements below the first subdiagonal, with the array TAU, */
/*     represent the unitary matrix Q as a product of elementary */
/*     reflectors. See Further Details. */

/*     IA      (global input) INTEGER */
/*     The row index in the global array A indicating the first */
/*     row of sub( A ). */

/*     JA      (global input) INTEGER */
/*     The column index in the global array A indicating the */
/*     first column of sub( A ). */

/*     DESCA   (global and local input) INTEGER array of dimension DLEN_. */
/*     The array descriptor for the distributed matrix A. */

/*     D       (local output) DOUBLE PRECISION array, dim LOCq(JA+N-1) */
/*     The diagonal elements of the tridiagonal matrix T: */
/*     D(i) = A(i,i). D is tied to the distributed matrix A. */

/*     E       (local output) DOUBLE PRECISION array, dim LOCq(JA+N-1) */
/*     if UPLO = 'U', LOCq(JA+N-2) otherwise. The off-diagonal */
/*     elements of the tridiagonal matrix T: E(i) = A(i,i+1) if */
/*     UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the */
/*     distributed matrix A. */

/*     TAU     (local output) COMPLEX*16, array, dimension */
/*     LOCq(JA+N-1). This array contains the scalar factors TAU of */
/*     the elementary reflectors. TAU is tied to the distributed */
/*     matrix A. */

/*     WORK    (local workspace) COMPLEX*16 array, dimension (LWORK) */
/*     On exit, WORK( 1 ) returns the minimal and optimal workspace */

/*     LWORK   (local input) INTEGER */
/*     The dimension of the array WORK. */
/*     LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + NPS */
/*     Where: */
/*         NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB ) */
/*         ANB = PJLAENV( DESCA( CTXT_ ), 3, 'PZHETTRD', 'L', 0, 0, */
/*           0, 0 ) */

/*         NUMROC is a ScaLAPACK tool function; */
/*         PJLAENV is a ScaLAPACK envionmental inquiry function */
/*         MYROW, MYCOL, NPROW and NPCOL can be determined by calling */
/*         the subroutine BLACS_GRIDINFO. */

/*     INFO    (global output) INTEGER */
/*     = 0:  successful exit */
/*     < 0:  If the i-th argument is an array and the j-entry had */
/*     an illegal value, then INFO = -(i*100+j), if the i-th */
/*     argument is a scalar and had an illegal value, then */
/*     INFO = -i. */

/*     Further Details */
/*     =============== */

/*     If UPLO = 'U', the matrix Q is represented as a product of */
/*     elementary reflectors */

/*     Q = H(n-1) . . . H(2) H(1). */

/*     Each H(i) has the form */

/*     H(i) = I - tau * v * v' */

/*     where tau is a complex scalar, and v is a complex vector with */
/*     v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in */
/*     A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1). */

/*     If UPLO = 'L', the matrix Q is represented as a product of */
/*     elementary reflectors */

/*     Q = H(1) H(2) . . . H(n-1). */

/*     Each H(i) has the form */

/*     H(i) = I - tau * v * v' */

/*     where tau is a complex scalar, and v is a complex vector with */
/*     v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in */
/*     A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1). */

/*     The contents of sub( A ) on exit are illustrated by the following */
/*     examples with n = 5: */

/*     if UPLO = 'U':                       if UPLO = 'L': */

/*     (  d   e   v2  v3  v4 )              (  d                  ) */
/*     (      d   e   v3  v4 )              (  e   d              ) */
/*     (          d   e   v4 )              (  v1  e   d          ) */
/*     (              d   e  )              (  v1  v2  e   d      ) */
/*     (                  d  )              (  v1  v2  v3  e   d  ) */

/*     where d and e denote diagonal and off-diagonal elements of T, and */
/*     vi denotes an element of the vector defining H(i). */

/*     Data storage requirements */
/*     ========================= */

/*     PZHETTRD is not intended to be called directly.  All users are */
/*     encourage to call PZHETRD which will then call PZHETTRD if */
/*     appropriate.  A must be in cyclic format (i.e. MB = NB = 1), */
/*     the process grid must be square ( i.e. NPROW = NPCOL ) and */
/*     only lower triangular storage is supported. */

/*     Local variables */
/*     =============== */

/*     PZHETTRD uses five local arrays: */
/*       WORK ( InV ) dimension ( NP, ANB+1): array V */
/*       WORK ( InH ) dimension ( NP, ANB+1): array H */
/*       WORK ( InVT ) dimension ( NQ, ANB+1): transpose of the array V */
/*       WORK ( InHT ) dimension ( NQ, ANB+1): transpose of the array H */
/*       WORK ( InVTT ) dimension ( NQ, 1): transpose of the array VT */

/*     Arrays V and H are replicated across all processor columns. */
/*     Arrays V^T and H^T are replicated across all processor rows. */

/*         WORK ( InVT ), or V^T, is stored as a tall skinny */
/*         array ( NQ x ANB-1 ) for efficiency.  Since only the lower */
/*         triangular portion of A is updated, Av is computed as: */
/*         tril(A) * v + v^T * tril(A,-1).  This is performed as */
/*         two local triangular matrix-vector multiplications (both in */
/*         MVR2) followed by a transpose and a sum across the columns. */
/*         In the local computation, WORK( InVT ) is used to compute */
/*         tril(A) * v and WORK( InV ) is used to compute */
/*         v^T * tril(A,-1) */

/*     The following variables are global indices into A: */
/*       INDEX:  The current global row and column number. */
/*       MAXINDEX:  The global row and column for the first row and */
/*       column in the trailing block of A. */
/*       LIIB, LIJB:  The first row, column in */

/*     The following variables point into the arrays A, V, H, V^T, H^T: */
/*       BINDEX  =INDEX-MININDEX: The column index in V, H, V^T, H^T. */
/*       LII:  local index I:  The local row number for row INDEX */
/*       LIJ:  local index J:  The local column number for column INDEX */
/*       LIIP1:  local index I+1:  The local row number for row INDEX+1 */
/*       LIJP1:  local index J+1:  The local col number for col INDEX+1 */
/*       LTLI: lower triangular local index I:  The local row for the */
/*         upper left entry in tril( A(INDEX, INDEX) ) */
/*       LTLIP1: lower triangular local index I+1:  The local row for the */
/*         upper left entry in tril( A(INDEX+1, INDEX+1) ) */

/*         Details:  The distinction between LII and LTLI (and between */
/*         LIIP1 and LTLIP1) is subtle.  Within the current processor */
/*         column (i.e. MYCOL .eq. CURCOL) they are the same.  However, */
/*         on some processors, A( LII, LIJ ) points to an element */
/*         above the diagonal, on these processors, LTLI = LII+1. */

/*     The following variables give the number of rows and/or columns */
/*     in various matrices: */
/*       NP:  The number of local rows in A( 1:N, 1:N ) */
/*       NQ:  The number of local columns in A( 1:N, 1:N ) */
/*       NPM0:  The number of local rows in A( INDEX:N, INDEX:N ) */
/*       NQM0:  The number of local columns in A( INDEX:N, INDEX:N ) */
/*       NPM1:  The number of local rows in A( INDEX+1:N, INDEX:N ) */
/*       NQM1:  The number of local columns in A( INDEX+1:N, INDEX:N ) */
/*       LTNM0:  The number of local rows & columns in */
/*         tril( A( INDEX:N, INDEX:N ) ) */
/*       LTNM1:  The number of local rows & columns in */
/*         tril( A( INDEX+1:N, INDEX+1:N ) ) */
/*         NOTE:  LTNM0 == LTNM1 on all processors except the diagonal */
/*         processors, i.e. those where MYCOL == MYROW. */

/*         Invariants: */
/*           NP = NPM0 + LII - 1 */
/*           NQ = NQM0 + LIJ - 1 */
/*           NP = NPM1 + LIIP1 - 1 */
/*           NQ = NQM1 + LIJP1 - 1 */
/*           NP = LTLI + LTNM0 - 1 */
/*           NP = LTLIP1 + LTNM1 - 1 */

/*       Temporary variables.  The following variables are used within */
/*       a few lines after they are set and do hold state from one loop */
/*       iteration to the next: */

/*     The matrix A: */
/*       The matrix A does not hold the same values that it would */
/*       in an unblocked code nor the values that it would hold in */
/*       in a blocked code. */

/*       The value of A is confusing.  It is easiest to state the */
/*       difference between trueA and A at the point that MVR2 is called, */
/*       so we will start there. */

/*       Let trueA be the value that A would */
/*       have at a given point in an unblocked code and A */
/*       be the value that A has in this code at the same point. */

/*       At the time of the call to MVR2, */
/*       trueA = A + V' * H + H' * V */
/*       where H = H( MAXINDEX:N, 1:BINDEX ) and */
/*       V = V( MAXINDEX:N, 1:BINDEX ). */

/*       At the bottom of the inner loop, */
/*       trueA = A +  V' * H + H' * V + v' * h + h' * v */
/*       where H = H( MAXINDEX:N, 1:BINDEX ) and */
/*       V = V( MAXINDEX:N, 1:BINDEX ) and */
/*       v = V( liip1:N, BINDEX+1 ) and */
/*       h = H( liip1:N, BINDEX+1 ) */

/*       At the top of the loop, BINDEX gets incremented, hence: */
/*       trueA = A +  V' * H + H' * V + v' * h + h' * v */
/*       where H = H( MAXINDEX:N, 1:BINDEX-1 ) and */
/*       V = V( MAXINDEX:N, 1:BINDEX-1 ) and */
/*       v = V( liip1:N, BINDEX ) and */
/*       h = H( liip1:N, BINDEX ) */


/*       A gets updated at the bottom of the outer loop */
/*       After this update, trueA = A + v' * h + h' * v */
/*       where v = V( liip1:N, BINDEX ) and */
/*       h = H( liip1:N, BINDEX ) and BINDEX = 0 */
/*       Indeed, the previous loop invariant as stated above for the */
/*       top of the loop still holds, but with BINDEX = 0, H and V */
/*       are null matrices. */

/*       After the current column of A is updated, */
/*         trueA( INDEX, INDEX:N ) = A( INDEX, INDEX:N ) */
/*       the rest of A is untouched. */

/*       After the current block column of A is updated, */
/*       trueA = A + V' * H + H' * V */
/*       where H = H( MAXINDEX:N, 1:BINDEX ) and */
/*       V = V( MAXINDEX:N, 1:BINDEX ) */

/*       This brings us back to the point at which mvr2 is called. */


/*     Details of the parallelization: */

/*       We delay spreading v across to all processor columns (which */
/*       would naturally happen at the bottom of the loop) in order to */
/*       combine the spread of v( : , i-1 ) with the spread of h( : , i ) */

/*       In order to compute h( :, i ), we must update A( :, i ) */
/*       which means that the processor column owning A( :, i ) must */
/*       have: c, tau, v( i, i ) and h( i, i ). */

/*       The traditional */
/*       way of computing v (and the one used in pzlatrd.f and */
/*       zlatrd.f) is: */
/*         v = tau * v */
/*         c = v' * h */
/*         alpha = - tau * c / 2 */
/*         v = v + alpha * h */
/*       However, the traditional way of computing v requires that tau */
/*       be broadcast to all processors in the current column (to compute */
/*       v = tau * v) and then a sum-to-all is required (to */
/*       compute v' * h ).  We use the following formula instead: */
/*         c = v' * h */
/*         v = tau * ( v - c * tau' * h / 2 ) */
/*       The above formula allows tau to be spread down in the */
/*       same call to DGSUM2D which performs the sum-to-all of c. */

/*       The computation of v, which could be performed in any processor */
/*       column (or other procesor subsets), is performed in the */
/*       processor column that owns A( :, i+1 ) so that A( :, i+1 ) */
/*       can be updated prior to spreading v across. */

/*       We keep the block column of A up-to-date to minimize the */
/*       work required in updating the current column of A.  Updating */
/*       the block column of A is reasonably load balanced whereas */
/*       updating the current column of A is not (only the current */
/*       processor column is involved). */

/*     In the following overview of the steps performed, M in the */
/*     margin indicates message traffic and C indicates O(n^2 nb/sqrt(p)) */
/*     or more flops per processor. */

/*     Inner loop: */
/*       A( index:n, index ) -= ( v * ht(bindex) + h * vt( bindex) ) */
/* M      h = house( A(index:n, index) ) */
/* M      Spread v, h across */
/* M      vt = v^T; ht = h^T */
/*       A( index+1:n, index+1:maxindex ) -= */
/*         ( v * ht(index+1:maxindex) + h *vt(index+1:maxindex) ) */
/* C      v = tril(A) * h; vt = ht * tril(A,-1) */
/* MorC   v = v - H*V*h - V*H*h */
/* M      v = v + vt^T */
/* M      c = v' * h */
/*       v = tau * ( v - c * tau' * h / 2 ) */
/* C    A = A - H*V - V*H */



/*     ================================================================= */

/*     .. Parameters .. */
/*     .. */


/*     .. Local Scalars .. */


/*     .. */
/*     .. Local Arrays .. */




/*     .. */
/*     .. External Subroutines .. */
/*     .. */
/*     .. External Functions .. */

/*     .. */
/*     .. Intrinsic Functions .. */
/*     .. */


/*     .. Executable Statements .. */
/*       This is just to keep ftnchek and toolpack/1 happy */
    /* Parameter adjustments */
    --work;
    --tau;
    --e;
    --d__;
    --desca;
    --a;

    /* Function Body */
    if (FALSE_) {
	return 0;
    }



/*     Further details */
/*     =============== */

/*     At the top of the loop, v and nh have been computed but not */
/*     spread across.  Hence, A is out-of-date even after the */
/*     rank 2k update.  Furthermore, we compute the next v before */
/*     nh is spread across. */

/*     I claim that if we used a sum-to-all on NV, by summing CC within */
/*     each column, that we could compute NV locally and could avoid */
/*     spreading V across.  Bruce claims that sum-to-all can be made */
/*     to cost no more than sum-to-one on the Paragon.  If that is */
/*     true, this would be a win.  But, */
/*     the BLACS sum-to-all is just a sum-to-one followed by a broadcast, */
/*     and hence the present scheme is better for now. */

/*     Get grid parameters */

    ictxt = desca[2];
    blacs_gridinfo__(&ictxt, &nprow, &npcol, &myrow, &mycol);

    safmax = sqrt(pdlamch_(&ictxt, "O", (ftnlen)1)) / *n;
    safmin = sqrt(pdlamch_(&ictxt, "S", (ftnlen)1));

/*     Test the input parameters */

    *info = 0;
    if (nprow == -1) {
	*info = -602;
    } else {

/*     Here we set execution options for PZHETTRD */

	pnb = pjlaenv_(&ictxt, &c__2, "PZHETTRD", "L", &c__0, &c__0, &c__0, &
		c__0, (ftnlen)8, (ftnlen)1);
	anb = pjlaenv_(&ictxt, &c__3, "PZHETTRD", "L", &c__0, &c__0, &c__0, &
		c__0, (ftnlen)8, (ftnlen)1);

	interleave = pjlaenv_(&ictxt, &c__4, "PZHETTRD", "L", &c__1, &c__0, &
		c__0, &c__0, (ftnlen)8, (ftnlen)1) == 1;
	twogemms = pjlaenv_(&ictxt, &c__4, "PZHETTRD", "L", &c__2, &c__0, &
		c__0, &c__0, (ftnlen)8, (ftnlen)1) == 1;
	balanced = pjlaenv_(&ictxt, &c__4, "PZHETTRD", "L", &c__3, &c__0, &
		c__0, &c__0, (ftnlen)8, (ftnlen)1) == 1;

	chk1mat_(n, &c__2, n, &c__2, ia, ja, &desca[1], &c__6, info);


	upper = lsame_(uplo, "U", (ftnlen)1, (ftnlen)1);
	if (*info == 0 && desca[6] != 1) {
	    *info = 606;
	}
	if (*info == 0) {


/*           Here is the arithmetic: */
/*             Let maxnpq = max( np, nq, 2 * ANB ) */
/*             LDV = 4 * max( np, nq ) + 2 */
/*             LWMIN = 2 * ( ANB + 1 ) * LDV + MAX( np, 2 * ANB ) */
/*             = 2 * ( ANB + 1 ) * ( 4 * NPS + 2 ) + NPS */

/*           This overestimates memory requirements when ANB > NP/2 */
/*           Memory requirements are lower when interleave = .false. */
/*           Hence, we could have two sets of memory requirements, */
/*           one for interleave and one for */


/* Computing MAX */
	    i__1 = numroc_(n, &c__1, &c__0, &c__0, &nprow), i__2 = anb << 1;
	    nps = max(i__1,i__2);
	    lwmin = (anb + 1 << 1) * ((nps << 2) + 2) + nps;

	    z__1.r = (doublereal) lwmin, z__1.i = 0.;
	    work[1].r = z__1.r, work[1].i = z__1.i;
	    if (! lsame_(uplo, "L", (ftnlen)1, (ftnlen)1)) {
		*info = -1;
	    } else if (*ia != 1) {
		*info = -4;
	    } else if (*ja != 1) {
		*info = -5;
	    } else if (nprow != npcol) {
		*info = -602;
	    } else if (desca[1] != 1) {
		*info = -601;
	    } else if (desca[5] != 1) {
		*info = -605;
	    } else if (desca[6] != 1) {
		*info = -606;
	    } else if (desca[7] != 0) {
		*info = -607;
	    } else if (desca[8] != 0) {
		*info = -608;
	    } else if (*lwork < lwmin) {
		*info = -11;
	    }
	}
	if (upper) {
	    idum1[0] = 'U';
	} else {
	    idum1[0] = 'L';
	}
	idum2[0] = 1;

	pchk1mat_(n, &c__2, n, &c__2, ia, ja, &desca[1], &c__6, &c__1, idum1, 
		idum2, info);
    }

    if (*info != 0) {
	i__1 = -(*info);
	pxerbla_(&ictxt, "PZHETTRD", &i__1, (ftnlen)8);
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0) {
	return 0;
    }



/*     Reduce the lower triangle of sub( A ) */
    np = numroc_(n, &c__1, &myrow, &c__0, &nprow);
    nq = numroc_(n, &c__1, &mycol, &c__0, &npcol);

    nxtrow = 0;
    nxtcol = 0;

    liip1 = 1;
    lijp1 = 1;
    npm1 = np;
    nqm1 = nq;

    lda = desca[9];
    ictxt = desca[2];



/*     Miscellaneous details: */
/*     Put tau, D and E in the right places */
/*     Check signs */
/*     Place all the arrays in WORK, control their placement */
/*     in  memory. */



/*     Loop invariants */
/*     A(LIIP1, LIJ) points to the first element of A(I+1,J) */
/*     NPM1,NQM1 = the number of rows, cols in A( LII+1:N,LIJ+1:N ) */
/*     A(LII:N,LIJ:N) is one step out of date. */
/*     proc( CURROW, CURCOL ) owns A(LII,LIJ) */
/*     proc( NXTROW, CURCOL ) owns A(LIIP1,LIJ) */

    inh = 1;

    if (interleave) {

/*        H and V are interleaved to minimize memory movement */
/*        LDV has to be twice as large to accomodate interleaving. */
/*        In addition, LDV is doubled again to allow v, h and */
/*        toptau to be spreaad across and transposed in a */
/*        single communication operation with minimum memory */
/*        movement. */

/*        We could reduce LDV back to 2*MAX(NPM1,NQM1) */
/*        by increasing the memory movement required in */
/*        the spread and transpose of v, h and toptau. */
/*        However, since the non-interleaved path already */
/*        provides a mear minimum memory requirement option, */
/*        we did not provide this additional path. */

	ldv = (max(npm1,nqm1) << 2) + 2;

	inh = 1;

	inv = inh + ldv / 2;
	invt = inh + (anb + 1) * ldv;

	inht = invt + ldv / 2;
	intmp = invt + ldv * (anb + 1);

    } else {
	ldv = max(npm1,nqm1);

	inht = inh + ldv * (anb + 1);
	inv = inht + ldv * (anb + 1);

/*        The code works without this +1, but only because of a */
/*        coincidence.  Without the +1, WORK(INVT) gets trashed, but */
/*        WORK(INVT) is only used once and when it is used, it is */
/*        multiplied by WORK( INH ) which is zero.  Hence, the fact */
/*        that WORK(INVT) is trashed has no effect. */

	invt = inv + ldv * (anb + 1) + 1;
	intmp = invt + ldv * (anb << 1);

    }

    if (*info != 0) {
	i__1 = -(*info);
	pxerbla_(&ictxt, "PZHETTRD", &i__1, (ftnlen)8);
	z__1.r = (doublereal) lwmin, z__1.i = 0.;
	work[1].r = z__1.r, work[1].i = z__1.i;
	return 0;
    }


/*        The satisfies the loop invariant: trueA = A - V * HT - H * VT, */
/*        (where V, H, VT and HT all have BINDEX+1 rows/columns) */
/*        the first ANB times through the loop. */



/*     Setting either ( InH and InHT ) or InV to Z_ZERO */
/*     is adequate except in the face of NaNs. */


    i__1 = np;
    for (i__ = 1; i__ <= i__1; ++i__) {
	i__2 = inh + i__ - 1;
	work[i__2].r = 0., work[i__2].i = 0.;
	i__2 = inv + i__ - 1;
	work[i__2].r = 0., work[i__2].i = 0.;
/* L10: */
    }
    i__1 = nq;
    for (i__ = 1; i__ <= i__1; ++i__) {
	i__2 = inht + i__ - 1;
	work[i__2].r = 0., work[i__2].i = 0.;
/* L20: */
    }



    topnv.r = 0., topnv.i = 0.;

    ltlip1 = lijp1;
    ltnm1 = npm1;
    if (mycol > myrow) {
	++ltlip1;
	--ltnm1;
    }


    i__1 = *n - 1;
    i__2 = anb;
    for (minindex = 1; i__2 < 0 ? minindex >= i__1 : minindex <= i__1; 
	    minindex += i__2) {


/* Computing MIN */
	i__3 = minindex + anb - 1;
	maxindex = min(i__3,*n);
	lijb = numroc_(&maxindex, &c__1, &mycol, &c__0, &npcol) + 1;
	liib = numroc_(&maxindex, &c__1, &myrow, &c__0, &nprow) + 1;

	nqb = nq - lijb + 1;
	npb = np - liib + 1;
	inhtb = inht + lijb - 1;
	invtb = invt + lijb - 1;
	inhb = inh + liib - 1;
	invb = inv + liib - 1;




/* Computing MIN */
	i__4 = maxindex, i__5 = *n - 1;
	i__3 = min(i__4,i__5);
	for (index = minindex; index <= i__3; ++index) {

	    bindex = index - minindex;

	    currow = nxtrow;
	    curcol = nxtcol;

	    nxtrow = (currow + 1) % nprow;
	    nxtcol = (curcol + 1) % npcol;

	    lii = liip1;
	    lij = lijp1;
	    npm0 = npm1;

	    if (myrow == currow) {
		--npm1;
		++liip1;
	    }
	    if (mycol == curcol) {
		--nqm1;
		++lijp1;
		++ltlip1;
		--ltnm1;
	    }




/*     V = NV, VT = NVT, H = NH, HT = NHT */


/*     Update the current column of A */


	    if (mycol == curcol) {

		indexa = lii + (lij - 1) * lda;
		indexinv = inv + lii - 1 + (bindex - 1) * ldv;
		indexinh = inh + lii - 1 + (bindex - 1) * ldv;
		d_cnjg(&z__1, &work[inht + lij - 1 + bindex * ldv]);
		conjtoph.r = z__1.r, conjtoph.i = z__1.i;
		d_cnjg(&z__1, &topnv);
		conjtopv.r = z__1.r, conjtopv.i = z__1.i;

		if (index > 1) {
		    i__4 = npm0 - 1;
		    for (i__ = 0; i__ <= i__4; ++i__) {
/*                  A( INDEXA+I ) = A( INDEXA+I ) */
			i__5 = indexa + i__;
			i__6 = indexa + i__;
			i__7 = indexinv + ldv + i__;
			z__3.r = work[i__7].r * conjtoph.r - work[i__7].i * 
				conjtoph.i, z__3.i = work[i__7].r * 
				conjtoph.i + work[i__7].i * conjtoph.r;
			z__2.r = a[i__6].r - z__3.r, z__2.i = a[i__6].i - 
				z__3.i;
			i__8 = indexinh + ldv + i__;
			z__4.r = work[i__8].r * conjtopv.r - work[i__8].i * 
				conjtopv.i, z__4.i = work[i__8].r * 
				conjtopv.i + work[i__8].i * conjtopv.r;
			z__1.r = z__2.r - z__4.r, z__1.i = z__2.i - z__4.i;
			a[i__5].r = z__1.r, a[i__5].i = z__1.i;
/* L30: */
		    }
		}


	    }


	    if (mycol == curcol) {

/*     Compute the householder vector */

		if (myrow == currow) {
		    i__4 = lii + (lij - 1) * lda;
		    dtmp[1] = a[i__4].r;
		} else {
		    dtmp[1] = 0.;
		}
		if (myrow == nxtrow) {
		    i__4 = liip1 + (lij - 1) * lda;
		    dtmp[2] = a[i__4].r;
		    dtmp[3] = d_imag(&a[liip1 + (lij - 1) * lda]);
		} else {
		    dtmp[2] = 0.;
		    dtmp[3] = 0.;
		}

		norm = lfc_SLdznrm2(&npm1, &a[liip1 + (lij - 1) * lda], &c__1);
		dtmp[0] = norm;

/*              IF DTMP(5) = 1.0, NORM is too large and might cause */
/*              overflow, hence PDTREECOMB must be called.  IF DTMP(5) */
/*              is zero on output, DTMP(1) can be trusted. */

		dtmp[4] = 0.;
		if (dtmp[0] >= safmax || dtmp[0] < safmin) {
		    dtmp[4] = 1.;
		    dtmp[0] = 0.;
		}

		dtmp[0] *= dtmp[0];
		dgsum2d_(&ictxt, "C", " ", &c__5, &c__1, dtmp, &c__5, &c_n1, &
			curcol, (ftnlen)1, (ftnlen)1);
		if (dtmp[4] == 0.) {
		    dtmp[0] = sqrt(dtmp[0]);
		} else {
		    dtmp[0] = norm;
		    pdtreecomb_(&ictxt, "C", &c__1, dtmp, &c_n1, &mycol, (
			    U_fp)dcombnrm2_, (ftnlen)1);
		}

		norm = dtmp[0];

		d__[lij] = dtmp[1];
		if (myrow == currow && mycol == curcol) {
		    i__4 = lii + (lij - 1) * lda;
		    i__5 = lij;
		    z__1.r = d__[i__5], z__1.i = 0.;
		    a[i__4].r = z__1.r, a[i__4].i = z__1.i;
		}


		z__1.r = dtmp[2], z__1.i = dtmp[3];
		alpha.r = z__1.r, alpha.i = z__1.i;

		d__1 = alpha.r;
		norm = d_sign(&norm, &d__1);

		if (norm == 0.) {
		    toptau.r = 0., toptau.i = 0.;
		} else {
		    z__1.r = norm + alpha.r, z__1.i = alpha.i;
		    beta.r = z__1.r, beta.i = z__1.i;
		    z__1.r = beta.r / norm, z__1.i = beta.i / norm;
		    toptau.r = z__1.r, toptau.i = z__1.i;
		    z_div(&z__1, &c_b82, &beta);
		    oneoverbeta.r = z__1.r, oneoverbeta.i = z__1.i;

		    lfc_SLzscal(&npm1, &oneoverbeta, &a[liip1 + (lij - 1) * lda], &
			    c__1);
		}

		if (myrow == nxtrow) {
		    i__4 = liip1 + (lij - 1) * lda;
		    a[i__4].r = 1., a[i__4].i = 0.;
		}

		i__4 = lij;
		tau[i__4].r = toptau.r, tau[i__4].i = toptau.i;
		e[lij] = -norm;

	    }


/*     Spread v, nh, toptau across */

	    i__4 = npm1 - 1;
	    for (i__ = 0; i__ <= i__4; ++i__) {
		i__5 = inv + liip1 - 1 + bindex * ldv + npm1 + i__;
		i__6 = liip1 + i__ + (lij - 1) * lda;
		work[i__5].r = a[i__6].r, work[i__5].i = a[i__6].i;
/* L40: */
	    }

	    if (mycol == curcol) {
		i__4 = inv + liip1 - 1 + bindex * ldv + npm1 + npm1;
		work[i__4].r = toptau.r, work[i__4].i = toptau.i;
		i__4 = npm1 + npm1 + 1;
		i__5 = npm1 + npm1 + 1;
		zgebs2d_(&ictxt, "R", " ", &i__4, &c__1, &work[inv + liip1 - 
			1 + bindex * ldv], &i__5, (ftnlen)1, (ftnlen)1);
	    } else {
		i__4 = npm1 + npm1 + 1;
		i__5 = npm1 + npm1 + 1;
		zgebr2d_(&ictxt, "R", " ", &i__4, &c__1, &work[inv + liip1 - 
			1 + bindex * ldv], &i__5, &myrow, &curcol, (ftnlen)1, 
			(ftnlen)1);
		i__4 = inv + liip1 - 1 + bindex * ldv + npm1 + npm1;
		toptau.r = work[i__4].r, toptau.i = work[i__4].i;
	    }
	    i__4 = npm1 - 1;
	    for (i__ = 0; i__ <= i__4; ++i__) {
		i__5 = inh + liip1 - 1 + (bindex + 1) * ldv + i__;
		i__6 = inv + liip1 - 1 + bindex * ldv + npm1 + i__;
		work[i__5].r = work[i__6].r, work[i__5].i = work[i__6].i;
/* L50: */
	    }

	    if (index < *n) {
		if (myrow == nxtrow && mycol == curcol) {
		    i__4 = liip1 + (lij - 1) * lda;
		    i__5 = lij;
		    a[i__4].r = e[i__5], a[i__4].i = 0.;
		}
	    }

/*     Transpose v, nh */


	    if (myrow == mycol) {
		i__4 = npm1 + npm1;
		for (i__ = 0; i__ <= i__4; ++i__) {
		    i__5 = invt + lijp1 - 1 + bindex * ldv + i__;
		    i__6 = inv + liip1 - 1 + bindex * ldv + i__;
		    work[i__5].r = work[i__6].r, work[i__5].i = work[i__6].i;
/* L60: */
		}
	    } else {
		i__4 = npm1 + npm1;
		i__5 = npm1 + npm1;
		zgesd2d_(&ictxt, &i__4, &c__1, &work[inv + liip1 - 1 + bindex 
			* ldv], &i__5, &mycol, &myrow);
		i__4 = nqm1 + nqm1;
		i__5 = nqm1 + nqm1;
		zgerv2d_(&ictxt, &i__4, &c__1, &work[invt + lijp1 - 1 + 
			bindex * ldv], &i__5, &mycol, &myrow);
	    }

	    i__4 = nqm1 - 1;
	    for (i__ = 0; i__ <= i__4; ++i__) {
		i__5 = inht + lijp1 - 1 + (bindex + 1) * ldv + i__;
		i__6 = invt + lijp1 - 1 + bindex * ldv + nqm1 + i__;
		work[i__5].r = work[i__6].r, work[i__5].i = work[i__6].i;
/* L70: */
	    }


/*           Update the current block column of A */

	    if (index > 1) {
		i__4 = lijb - 1;
		for (j = lijp1; j <= i__4; ++j) {
		    i__5 = npm1 - 1;
		    for (i__ = 0; i__ <= i__5; ++i__) {

			i__6 = liip1 + i__ + (j - 1) * lda;
			i__7 = liip1 + i__ + (j - 1) * lda;
			i__8 = inv + liip1 - 1 + bindex * ldv + i__;
			d_cnjg(&z__4, &work[inht + j - 1 + bindex * ldv]);
			z__3.r = work[i__8].r * z__4.r - work[i__8].i * 
				z__4.i, z__3.i = work[i__8].r * z__4.i + work[
				i__8].i * z__4.r;
			z__2.r = a[i__7].r - z__3.r, z__2.i = a[i__7].i - 
				z__3.i;
			i__9 = inh + liip1 - 1 + bindex * ldv + i__;
			d_cnjg(&z__6, &work[invt + j - 1 + bindex * ldv]);
			z__5.r = work[i__9].r * z__6.r - work[i__9].i * 
				z__6.i, z__5.i = work[i__9].r * z__6.i + work[
				i__9].i * z__6.r;
			z__1.r = z__2.r - z__5.r, z__1.i = z__2.i - z__5.i;
			a[i__6].r = z__1.r, a[i__6].i = z__1.i;
/* L80: */
		    }
/* L90: */
		}
	    }



/*     Compute NV = A * NHT; NVT = A * NH */

/*           These two lines are necessary because these elements */
/*           are not always involved in the calls to ZTRMVT */
/*           for two reasons: */
/*           1)  On diagonal processors, the call to TRMVT */
/*               involves only LTNM1-1 elements */
/*           2)  On some processes, NQM1 < LTM1 or  LIIP1 < LTLIP1 */
/*               and when the results are combined across all processes, */
/*               uninitialized values may be included. */
	    i__4 = inv + liip1 - 1 + (bindex + 1) * ldv;
	    work[i__4].r = 0., work[i__4].i = 0.;
	    i__4 = invt + lijp1 - 1 + (bindex + 1) * ldv + nqm1 - 1;
	    work[i__4].r = 0., work[i__4].i = 0.;


	    if (myrow == mycol) {
		if (ltnm1 > 1) {
		    i__4 = ltnm1 - 1;
		    ztrmvt_("L", &i__4, &a[ltlip1 + 1 + (lijp1 - 1) * lda], &
			    lda, &work[invt + lijp1 - 1 + (bindex + 1) * ldv],
			     &c__1, &work[inh + ltlip1 + (bindex + 1) * ldv], 
			    &c__1, &work[inv + ltlip1 + (bindex + 1) * ldv], &
			    c__1, &work[inht + lijp1 - 1 + (bindex + 1) * ldv]
			    , &c__1, (ftnlen)1);
		}
		i__4 = ltnm1;
		for (i__ = 1; i__ <= i__4; ++i__) {
		    i__5 = invt + lijp1 + i__ - 2 + (bindex + 1) * ldv;
		    i__6 = invt + lijp1 + i__ - 2 + (bindex + 1) * ldv;
		    i__7 = ltlip1 + i__ - 1 + (lijp1 + i__ - 2) * lda;
		    i__8 = inh + ltlip1 + i__ - 2 + (bindex + 1) * ldv;
		    z__2.r = a[i__7].r * work[i__8].r - a[i__7].i * work[i__8]
			    .i, z__2.i = a[i__7].r * work[i__8].i + a[i__7].i 
			    * work[i__8].r;
		    z__1.r = work[i__6].r + z__2.r, z__1.i = work[i__6].i + 
			    z__2.i;
		    work[i__5].r = z__1.r, work[i__5].i = z__1.i;
/* L100: */
		}
	    } else {
		if (ltnm1 > 0) {
		    ztrmvt_("L", &ltnm1, &a[ltlip1 + (lijp1 - 1) * lda], &lda,
			     &work[invt + lijp1 - 1 + (bindex + 1) * ldv], &
			    c__1, &work[inh + ltlip1 - 1 + (bindex + 1) * ldv]
			    , &c__1, &work[inv + ltlip1 - 1 + (bindex + 1) * 
			    ldv], &c__1, &work[inht + lijp1 - 1 + (bindex + 1)
			     * ldv], &c__1, (ftnlen)1);
		}

	    }


/*     We take advantage of the fact that: */
/*     A * sum( B ) = sum ( A * B ) for matrices A,B */

/*     trueA = A + V * HT + H * VT */
/*     hence:  (trueA)v = Av' + V * HT * v + H * VT * v */
/*     VT * v = sum_p_in_NPROW ( VTp * v ) */
/*     H * VT * v = H * sum (VTp * v) = sum ( H * VTp * v ) */

/*     v = v + V * HT * h + H * VT * h */



/*     tmp = HT * nh1 */
	    i__4 = bindex + 1 << 1;
	    for (i__ = 1; i__ <= i__4; ++i__) {
		i__5 = intmp - 1 + i__;
		work[i__5].r = 0., work[i__5].i = 0.;
/* L110: */
	    }

	    if (balanced) {
		npset = nprow;
		mysetnum = myrow;
		rowsperproc = iceil_(&nqb, &npset);
/* Computing MIN */
		i__4 = nqb + 1, i__5 = rowsperproc * mysetnum + 1;
		myfirstrow = min(i__4,i__5);
/* Computing MIN */
		i__4 = rowsperproc, i__5 = nqb - myfirstrow + 1;
		numrows = min(i__4,i__5);


/*     tmp = HT * v */

		i__4 = bindex + 1;
		lfc_SLzgemv("C", &numrows, &i__4, &c_b82, &work[inhtb + myfirstrow 
			- 1], &ldv, &work[inhtb + myfirstrow - 1 + (bindex + 
			1) * ldv], &c__1, &c_b60, &work[intmp], &c__1, (
			ftnlen)1);
/*     tmp2 = VT * v */
		i__4 = bindex + 1;
		lfc_SLzgemv("C", &numrows, &i__4, &c_b82, &work[invtb + myfirstrow 
			- 1], &ldv, &work[inhtb + myfirstrow - 1 + (bindex + 
			1) * ldv], &c__1, &c_b60, &work[intmp + bindex + 1], &
			c__1, (ftnlen)1);


		i__4 = bindex + 1 << 1;
		i__5 = bindex + 1 << 1;
		zgsum2d_(&ictxt, "C", " ", &i__4, &c__1, &work[intmp], &i__5, 
			&c_n1, &c_n1, (ftnlen)1, (ftnlen)1);
	    } else {
/*     tmp = HT * v */

		i__4 = bindex + 1;
		lfc_SLzgemv("C", &nqb, &i__4, &c_b82, &work[inhtb], &ldv, &work[
			inhtb + (bindex + 1) * ldv], &c__1, &c_b60, &work[
			intmp], &c__1, (ftnlen)1);
/*     tmp2 = VT * v */
		i__4 = bindex + 1;
		lfc_SLzgemv("C", &nqb, &i__4, &c_b82, &work[invtb], &ldv, &work[
			inhtb + (bindex + 1) * ldv], &c__1, &c_b60, &work[
			intmp + bindex + 1], &c__1, (ftnlen)1);

	    }



	    if (balanced) {
		mysetnum = mycol;

		rowsperproc = iceil_(&npb, &npset);
/* Computing MIN */
		i__4 = npb + 1, i__5 = rowsperproc * mysetnum + 1;
		myfirstrow = min(i__4,i__5);
/* Computing MIN */
		i__4 = rowsperproc, i__5 = npb - myfirstrow + 1;
		numrows = min(i__4,i__5);

		i__4 = bindex + 1 << 1;
		i__5 = bindex + 1 << 1;
		zgsum2d_(&ictxt, "R", " ", &i__4, &c__1, &work[intmp], &i__5, 
			&c_n1, &c_n1, (ftnlen)1, (ftnlen)1);


/*     v = v + V * tmp */
		if ((real) index > 1.f) {
		    i__4 = bindex + 1;
		    lfc_SLzgemv("N", &numrows, &i__4, &c_b144, &work[invb + 
			    myfirstrow - 1], &ldv, &work[intmp], &c__1, &
			    c_b82, &work[invb + myfirstrow - 1 + (bindex + 1) 
			    * ldv], &c__1, (ftnlen)1);

/*     v = v + H * tmp2 */
		    i__4 = bindex + 1;
		    lfc_SLzgemv("N", &numrows, &i__4, &c_b144, &work[inhb + 
			    myfirstrow - 1], &ldv, &work[intmp + bindex + 1], 
			    &c__1, &c_b82, &work[invb + myfirstrow - 1 + (
			    bindex + 1) * ldv], &c__1, (ftnlen)1);
		}

	    } else {
/*     v = v + V * tmp */
		i__4 = bindex + 1;
		lfc_SLzgemv("N", &npb, &i__4, &c_b144, &work[invb], &ldv, &work[
			intmp], &c__1, &c_b82, &work[invb + (bindex + 1) * 
			ldv], &c__1, (ftnlen)1);


/*     v = v + H * tmp2 */
		i__4 = bindex + 1;
		lfc_SLzgemv("N", &npb, &i__4, &c_b144, &work[inhb], &ldv, &work[
			intmp + bindex + 1], &c__1, &c_b82, &work[invb + (
			bindex + 1) * ldv], &c__1, (ftnlen)1);

	    }


/*     Transpose NV and add it back into NVT */

	    if (myrow == mycol) {
		i__4 = nqm1 - 1;
		for (i__ = 0; i__ <= i__4; ++i__) {
		    i__5 = intmp + i__;
		    i__6 = invt + lijp1 - 1 + (bindex + 1) * ldv + i__;
		    work[i__5].r = work[i__6].r, work[i__5].i = work[i__6].i;
/* L120: */
		}
	    } else {
		zgesd2d_(&ictxt, &nqm1, &c__1, &work[invt + lijp1 - 1 + (
			bindex + 1) * ldv], &nqm1, &mycol, &myrow);
		zgerv2d_(&ictxt, &npm1, &c__1, &work[intmp], &npm1, &mycol, &
			myrow);

	    }
	    i__4 = npm1 - 1;
	    for (i__ = 0; i__ <= i__4; ++i__) {
		i__5 = inv + liip1 - 1 + (bindex + 1) * ldv + i__;
		i__6 = inv + liip1 - 1 + (bindex + 1) * ldv + i__;
		i__7 = intmp + i__;
		z__1.r = work[i__6].r + work[i__7].r, z__1.i = work[i__6].i + 
			work[i__7].i;
		work[i__5].r = z__1.r, work[i__5].i = z__1.i;
/* L130: */
	    }

/*     Sum-to-one NV rowwise (within a row) */

	    zgsum2d_(&ictxt, "R", " ", &npm1, &c__1, &work[inv + liip1 - 1 + (
		    bindex + 1) * ldv], &npm1, &myrow, &nxtcol, (ftnlen)1, (
		    ftnlen)1);


/*     Dot product c = NV * NH */
/*     Sum-to-all c within next processor column */


	    if (mycol == nxtcol) {
		cc[0].r = 0., cc[0].i = 0.;
		i__4 = npm1 - 1;
		for (i__ = 0; i__ <= i__4; ++i__) {
		    d_cnjg(&z__3, &work[inv + liip1 - 1 + (bindex + 1) * ldv 
			    + i__]);
		    i__5 = inh + liip1 - 1 + (bindex + 1) * ldv + i__;
		    z__2.r = z__3.r * work[i__5].r - z__3.i * work[i__5].i, 
			    z__2.i = z__3.r * work[i__5].i + z__3.i * work[
			    i__5].r;
		    z__1.r = cc[0].r + z__2.r, z__1.i = cc[0].i + z__2.i;
		    cc[0].r = z__1.r, cc[0].i = z__1.i;
/* L140: */
		}
		if (myrow == nxtrow) {
		    i__4 = inv + liip1 - 1 + (bindex + 1) * ldv;
		    cc[1].r = work[i__4].r, cc[1].i = work[i__4].i;
		    i__4 = inh + liip1 - 1 + (bindex + 1) * ldv;
		    cc[2].r = work[i__4].r, cc[2].i = work[i__4].i;
		} else {
		    cc[1].r = 0., cc[1].i = 0.;
		    cc[2].r = 0., cc[2].i = 0.;
		}
		zgsum2d_(&ictxt, "C", " ", &c__3, &c__1, cc, &c__3, &c_n1, &
			nxtcol, (ftnlen)1, (ftnlen)1);

		topv.r = cc[1].r, topv.i = cc[1].i;
		c__.r = cc[0].r, c__.i = cc[0].i;
		toph.r = cc[2].r, toph.i = cc[2].i;

		d_cnjg(&z__6, &toptau);
		z__5.r = c__.r * z__6.r - c__.i * z__6.i, z__5.i = c__.r * 
			z__6.i + c__.i * z__6.r;
		d__1 = 2.;
		z__4.r = z__5.r / d__1, z__4.i = z__5.i / d__1;
		z__3.r = z__4.r * toph.r - z__4.i * toph.i, z__3.i = z__4.r * 
			toph.i + z__4.i * toph.r;
		z__2.r = topv.r - z__3.r, z__2.i = topv.i - z__3.i;
		z__1.r = toptau.r * z__2.r - toptau.i * z__2.i, z__1.i = 
			toptau.r * z__2.i + toptau.i * z__2.r;
		topnv.r = z__1.r, topnv.i = z__1.i;


/*     Compute V = Tau * (V - C * Tau' / 2 * H ) */


		i__4 = npm1 - 1;
		for (i__ = 0; i__ <= i__4; ++i__) {
		    i__5 = inv + liip1 - 1 + (bindex + 1) * ldv + i__;
		    i__6 = inv + liip1 - 1 + (bindex + 1) * ldv + i__;
		    d_cnjg(&z__6, &toptau);
		    z__5.r = c__.r * z__6.r - c__.i * z__6.i, z__5.i = c__.r *
			     z__6.i + c__.i * z__6.r;
		    d__1 = 2.;
		    z__4.r = z__5.r / d__1, z__4.i = z__5.i / d__1;
		    i__7 = inh + liip1 - 1 + (bindex + 1) * ldv + i__;
		    z__3.r = z__4.r * work[i__7].r - z__4.i * work[i__7].i, 
			    z__3.i = z__4.r * work[i__7].i + z__4.i * work[
			    i__7].r;
		    z__2.r = work[i__6].r - z__3.r, z__2.i = work[i__6].i - 
			    z__3.i;
		    z__1.r = toptau.r * z__2.r - toptau.i * z__2.i, z__1.i = 
			    toptau.r * z__2.i + toptau.i * z__2.r;
		    work[i__5].r = z__1.r, work[i__5].i = z__1.i;
/* L150: */
		}

	    }


/* L160: */
	}


/*     Perform the rank2k update */

	if (maxindex < *n) {

	    i__3 = npm1 - 1;
	    for (i__ = 0; i__ <= i__3; ++i__) {
		i__4 = intmp + i__;
		i__5 = inh + liip1 - 1 + anb * ldv + i__;
		work[i__4].r = work[i__5].r, work[i__4].i = work[i__5].i;
/* L170: */
	    }



	    if (! twogemms) {
		if (interleave) {
		    ldzg = ldv / 2;
		} else {
		    zlacpy_("A", &ltnm1, &anb, &work[inht + lijp1 - 1], &ldv, 
			    &work[invt + lijp1 - 1 + anb * ldv], &ldv, (
			    ftnlen)1);

		    zlacpy_("A", &ltnm1, &anb, &work[inv + ltlip1 - 1], &ldv, 
			    &work[inh + ltlip1 - 1 + anb * ldv], &ldv, (
			    ftnlen)1);
		    ldzg = ldv;
		}
		nbzg = anb << 1;
	    } else {
		ldzg = ldv;
		nbzg = anb;
	    }


	    i__3 = ltnm1;
	    i__4 = pnb;
	    for (pbmin = 1; i__4 < 0 ? pbmin >= i__3 : pbmin <= i__3; pbmin +=
		     i__4) {

/* Computing MIN */
		i__5 = pnb, i__6 = ltnm1 - pbmin + 1;
		pbsize = min(i__5,i__6);
/* Computing MIN */
		i__5 = ltnm1, i__6 = pbmin + pnb - 1;
		pbmax = min(i__5,i__6);
		lfc_SLzgemm("N", "C", &pbsize, &pbmax, &nbzg, &c_b144, &work[inh + 
			ltlip1 - 1 + pbmin - 1], &ldzg, &work[invt + lijp1 - 
			1], &ldzg, &c_b82, &a[ltlip1 + pbmin - 1 + (lijp1 - 1)
			 * lda], &lda, (ftnlen)1, (ftnlen)1);
		if (twogemms) {
		    lfc_SLzgemm("N", "C", &pbsize, &pbmax, &anb, &c_b144, &work[
			    inv + ltlip1 - 1 + pbmin - 1], &ldzg, &work[inht 
			    + lijp1 - 1], &ldzg, &c_b82, &a[ltlip1 + pbmin - 
			    1 + (lijp1 - 1) * lda], &lda, (ftnlen)1, (ftnlen)
			    1);
		}
/* L180: */
	    }



	    i__4 = npm1 - 1;
	    for (i__ = 0; i__ <= i__4; ++i__) {
		i__3 = inv + liip1 - 1 + i__;
		i__5 = inv + liip1 - 1 + anb * ldv + i__;
		work[i__3].r = work[i__5].r, work[i__3].i = work[i__5].i;
		i__3 = inh + liip1 - 1 + i__;
		i__5 = intmp + i__;
		work[i__3].r = work[i__5].r, work[i__3].i = work[i__5].i;
/* L190: */
	    }
	    i__4 = nqm1 - 1;
	    for (i__ = 0; i__ <= i__4; ++i__) {
		i__3 = inht + lijp1 - 1 + i__;
		i__5 = inht + lijp1 - 1 + anb * ldv + i__;
		work[i__3].r = work[i__5].r, work[i__3].i = work[i__5].i;
/* L200: */
	    }


	}

/*     End of the update A code */

/* L210: */
    }

    if (mycol == nxtcol) {
	if (myrow == nxtrow) {

	    i__2 = np + (nq - 1) * lda;
	    d__[nq] = a[i__2].r;
	    i__2 = np + (nq - 1) * lda;
	    i__1 = nq;
	    a[i__2].r = d__[i__1], a[i__2].i = 0.;

	    dgebs2d_(&ictxt, "C", " ", &c__1, &c__1, &d__[nq], &c__1, (ftnlen)
		    1, (ftnlen)1);
	} else {
	    dgebr2d_(&ictxt, "C", " ", &c__1, &c__1, &d__[nq], &c__1, &nxtrow,
		     &nxtcol, (ftnlen)1, (ftnlen)1);
	}
    }




    z__1.r = (doublereal) lwmin, z__1.i = 0.;
    work[1].r = z__1.r, work[1].i = z__1.i;
    return 0;

/*     End of PZHETTRD */


} /* pzhettrd_ */

