/* /home4/luszczek/mscratch/build/SCALAPACK/SRC/pzdbtrf.f -- translated by f2c (version 20031025).
   You must link the resulting object file with libf2c:
	on Microsoft Windows system, link with libf2c.lib;
	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
	or, if you install libf2c.a in a standard place, with -lf2c -lm
	-- in that order, at the end of the command line, as in
		cc *.o -lf2c -lm
	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

		http://www.netlib.org/f2c/libf2c.zip
*/

#include "f2c.h"

/* Table of constant values */

static doublecomplex c_b1 = {1.,0.};
static doublecomplex c_b2 = {0.,0.};
static integer c__9 = 9;
static integer c__1 = 1;
static integer c__0 = 0;
static integer c_n1 = -1;

/* Subroutine */ int pzdbtrf_(integer *n, integer *bwl, integer *bwu, 
	doublecomplex *a, integer *ja, integer *desca, doublecomplex *af, 
	integer *laf, doublecomplex *work, integer *lwork, integer *info)
{
    /* System generated locals */
    integer i__1, i__2, i__3, i__4, i__5, i__6;
    doublecomplex z__1;

    /* Local variables */
    integer prev_tri_size_m__, prev_tri_size_n__, next_tri_size_m__, 
	    next_tri_size_n__, odd_size__, desca_1xp__[7], i__, store_n_a__, 
	    comm_proc__, part_size__, ictxt_new__, nb, np, level_dist__, 
	    ictxt_save__, first_proc__, param_check__[27]	/* was [9][3] 
	    */, up_prev_tri_size_m__, up_prev_tri_size_n__, mbw2, 
	    return_code__, part_offset__, my_num_cols__, llda, csrc, ofst;
    extern /* Subroutine */ int desc_convert__(integer *, integer *, integer *
	    );
    integer idum3, npcol;
    extern /* Subroutine */ int lfc_SLzgemm(char *, char *, integer *, integer *, 
	    integer *, doublecomplex *, doublecomplex *, integer *, 
	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
	    integer *, ftnlen, ftnlen);
    integer mycol, ictxt, nprow;
    extern /* Subroutine */ int lfc_SLztrmm(char *, char *, char *, char *, 
	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
	     doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), 
	    lfc_SLzaxpy(integer *, doublecomplex *, doublecomplex *, integer *, 
	    doublecomplex *, integer *);
    integer myrow, work_size_min__, ja_new__, max_bw__;
    extern /* Subroutine */ int zdbtrf_(integer *, integer *, integer *, 
	    integer *, doublecomplex *, integer *, integer *), zlacpy_(char *,
	     integer *, integer *, doublecomplex *, integer *, doublecomplex *
	    , integer *, ftnlen), blacs_gridinfo__(integer *, integer *, 
	    integer *, integer *, integer *);
    extern integer numroc_(integer *, integer *, integer *, integer *, 
	    integer *);
    integer work_u__;
    extern /* Subroutine */ int igebr2d_(integer *, char *, char *, integer *,
	     integer *, integer *, integer *, integer *, integer *, ftnlen, 
	    ftnlen), igebs2d_(integer *, char *, char *, integer *, integer *,
	     integer *, integer *, ftnlen, ftnlen), blacs_gridexit__(integer *
	    ), ztbtrs_(char *, char *, char *, integer *, integer *, integer *
	    , doublecomplex *, integer *, doublecomplex *, integer *, integer 
	    *, ftnlen, ftnlen, ftnlen), igamx2d_(integer *, char *, char *, 
	    integer *, integer *, integer *, integer *, integer *, integer *, 
	    integer *, integer *, integer *, ftnlen, ftnlen), zgesd2d_(
	    integer *, integer *, integer *, doublecomplex *, integer *, 
	    integer *, integer *), zgerv2d_(integer *, integer *, integer *, 
	    doublecomplex *, integer *, integer *, integer *), ztrsd2d_(
	    integer *, char *, char *, integer *, integer *, doublecomplex *, 
	    integer *, integer *, integer *, ftnlen, ftnlen);
    integer laf_min__;
    extern /* Subroutine */ int globchk_(integer *, integer *, integer *, 
	    integer *, integer *, integer *), ztrrv2d_(integer *, char *, 
	    char *, integer *, integer *, doublecomplex *, integer *, integer 
	    *, integer *, ftnlen, ftnlen), reshape_(integer *, integer *, 
	    integer *, integer *, integer *, integer *, integer *);
    integer np_save__;
    extern /* Subroutine */ int pxerbla_(integer *, char *, integer *, ftnlen)
	    , zlatcpy_(char *, integer *, integer *, doublecomplex *, integer 
	    *, doublecomplex *, integer *, ftnlen);




/*  -- ScaLAPACK routine (version 1.7) -- */
/*     University of Tennessee, Knoxville, Oak Ridge National Laboratory, */
/*     and University of California, Berkeley. */
/*     May 25, 2001 */

/*     .. Scalar Arguments .. */
/*     .. */
/*     .. Array Arguments .. */
/*     .. */


/*  Purpose */
/*  ======= */

/*  PZDBTRF computes a LU factorization */
/*  of an N-by-N complex banded */
/*  diagonally dominant-like distributed matrix */
/*  with bandwidth BWL, BWU: A(1:N, JA:JA+N-1). */
/*  Reordering is used to increase parallelism in the factorization. */
/*  This reordering results in factors that are DIFFERENT from those */
/*  produced by equivalent sequential codes. These factors cannot */
/*  be used directly by users; however, they can be used in */
/*  subsequent calls to PZDBTRS to solve linear systems. */

/*  The factorization has the form */

/*          P A(1:N, JA:JA+N-1) P^T = L U */

/*  where U is a banded upper triangular matrix and L is banded */
/*  lower triangular, and P is a permutation matrix. */

/*  ===================================================================== */

/*  Arguments */
/*  ========= */


/*  N       (global input) INTEGER */
/*          The number of rows and columns to be operated on, i.e. the */
/*          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. */

/*  BWL     (global input) INTEGER */
/*          Number of subdiagonals. 0 <= BWL <= N-1 */

/*  BWU     (global input) INTEGER */
/*          Number of superdiagonals. 0 <= BWU <= N-1 */

/*  A       (local input/local output) COMPLEX*16 pointer into */
/*          local memory to an array with first dimension */
/*          LLD_A >=(bwl+bwu+1) (stored in DESCA). */
/*          On entry, this array contains the local pieces of the */
/*          N-by-N unsymmetric banded distributed matrix */
/*          A(1:N, JA:JA+N-1) to be factored. */
/*          This local portion is stored in the packed banded format */
/*            used in LAPACK. Please see the Notes below and the */
/*            ScaLAPACK manual for more detail on the format of */
/*            distributed matrices. */
/*          On exit, this array contains information containing details */
/*            of the factorization. */
/*          Note that permutations are performed on the matrix, so that */
/*            the factors returned are different from those returned */
/*            by LAPACK. */

/*  JA      (global input) INTEGER */
/*          The index in the global array A that points to the start of */
/*          the matrix to be operated on (which may be either all of A */
/*          or a submatrix of A). */

/*  DESCA   (global and local input) INTEGER array of dimension DLEN. */
/*          if 1D type (DTYPE_A=501), DLEN >= 7; */
/*          if 2D type (DTYPE_A=1), DLEN >= 9 . */
/*          The array descriptor for the distributed matrix A. */
/*          Contains information of mapping of A to memory. Please */
/*          see NOTES below for full description and options. */

/*  AF      (local output) COMPLEX*16 array, dimension LAF. */
/*          Auxiliary Fillin Space. */
/*          Fillin is created during the factorization routine */
/*          PZDBTRF and this is stored in AF. If a linear system */
/*          is to be solved using PZDBTRS after the factorization */
/*          routine, AF *must not be altered* after the factorization. */

/*  LAF     (local input) INTEGER */
/*          Size of user-input Auxiliary Fillin space AF. Must be >= */
/*          NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu) */
/*          If LAF is not large enough, an error code will be returned */
/*          and the minimum acceptable size will be returned in AF( 1 ) */

/*  WORK    (local workspace/local output) */
/*          COMPLEX*16 temporary workspace. This space may */
/*          be overwritten in between calls to routines. WORK must be */
/*          the size given in LWORK. */
/*          On exit, WORK( 1 ) contains the minimal LWORK. */

/*  LWORK   (local input or global input) INTEGER */
/*          Size of user-input workspace WORK. */
/*          If LWORK is too small, the minimal acceptable size will be */
/*          returned in WORK(1) and an error code is returned. LWORK>= */
/*          max(bwl,bwu)*max(bwl,bwu) */

/*  INFO    (global output) INTEGER */
/*          = 0:  successful exit */
/*          < 0:  If the i-th argument is an array and the j-entry had */
/*                an illegal value, then INFO = -(i*100+j), if the i-th */
/*                argument is a scalar and had an illegal value, then */
/*                INFO = -i. */
/*          > 0:  If INFO = K<=NPROCS, the submatrix stored on processor */
/*                INFO and factored locally was not */
/*                diagonally dominant-like,  and */
/*                the factorization was not completed. */
/*                If INFO = K>NPROCS, the submatrix stored on processor */
/*                INFO-NPROCS representing interactions with other */
/*                processors was not */
/*                stably factorable wo/interchanges, */
/*                and the factorization was not completed. */

/*  ===================================================================== */


/*  Restrictions */
/*  ============ */

/*  The following are restrictions on the input parameters. Some of these */
/*    are temporary and will be removed in future releases, while others */
/*    may reflect fundamental technical limitations. */

/*    Non-cyclic restriction: VERY IMPORTANT! */
/*      P*NB>= mod(JA-1,NB)+N. */
/*      The mapping for matrices must be blocked, reflecting the nature */
/*      of the divide and conquer algorithm as a task-parallel algorithm. */
/*      This formula in words is: no processor may have more than one */
/*      chunk of the matrix. */

/*    Blocksize cannot be too small: */
/*      If the matrix spans more than one processor, the following */
/*      restriction on NB, the size of each block on each processor, */
/*      must hold: */
/*      NB >= 2*MAX(BWL,BWU) */
/*      The bulk of parallel computation is done on the matrix of size */
/*      O(NB) on each processor. If this is too small, divide and conquer */
/*      is a poor choice of algorithm. */

/*    Submatrix reference: */
/*      JA = IB */
/*      Alignment restriction that prevents unnecessary communication. */


/*  ===================================================================== */


/*  Notes */
/*  ===== */

/*  If the factorization routine and the solve routine are to be called */
/*    separately (to solve various sets of righthand sides using the same */
/*    coefficient matrix), the auxiliary space AF *must not be altered* */
/*    between calls to the factorization routine and the solve routine. */

/*  The best algorithm for solving banded and tridiagonal linear systems */
/*    depends on a variety of parameters, especially the bandwidth. */
/*    Currently, only algorithms designed for the case N/P >> bw are */
/*    implemented. These go by many names, including Divide and Conquer, */
/*    Partitioning, domain decomposition-type, etc. */

/*  Algorithm description: Divide and Conquer */

/*    The Divide and Conqer algorithm assumes the matrix is narrowly */
/*      banded compared with the number of equations. In this situation, */
/*      it is best to distribute the input matrix A one-dimensionally, */
/*      with columns atomic and rows divided amongst the processes. */
/*      The basic algorithm divides the banded matrix up into */
/*      P pieces with one stored on each processor, */
/*      and then proceeds in 2 phases for the factorization or 3 for the */
/*      solution of a linear system. */
/*      1) Local Phase: */
/*         The individual pieces are factored independently and in */
/*         parallel. These factors are applied to the matrix creating */
/*         fillin, which is stored in a non-inspectable way in auxiliary */
/*         space AF. Mathematically, this is equivalent to reordering */
/*         the matrix A as P A P^T and then factoring the principal */
/*         leading submatrix of size equal to the sum of the sizes of */
/*         the matrices factored on each processor. The factors of */
/*         these submatrices overwrite the corresponding parts of A */
/*         in memory. */
/*      2) Reduced System Phase: */
/*         A small (max(bwl,bwu)* (P-1)) system is formed representing */
/*         interaction of the larger blocks, and is stored (as are its */
/*         factors) in the space AF. A parallel Block Cyclic Reduction */
/*         algorithm is used. For a linear system, a parallel front solve */
/*         followed by an analagous backsolve, both using the structure */
/*         of the factored matrix, are performed. */
/*      3) Backsubsitution Phase: */
/*         For a linear system, a local backsubstitution is performed on */
/*         each processor in parallel. */


/*  Descriptors */
/*  =========== */

/*  Descriptors now have *types* and differ from ScaLAPACK 1.0. */

/*  Note: banded codes can use either the old two dimensional */
/*    or new one-dimensional descriptors, though the processor grid in */
/*    both cases *must be one-dimensional*. We describe both types below. */

/*  Each global data object is described by an associated description */
/*  vector.  This vector stores the information required to establish */
/*  the mapping between an object element and its corresponding process */
/*  and memory location. */

/*  Let A be a generic term for any 2D block cyclicly distributed array. */
/*  Such a global array has an associated description vector DESCA. */
/*  In the following comments, the character _ should be read as */
/*  "of the global array". */

/*  NOTATION        STORED IN      EXPLANATION */
/*  --------------- -------------- -------------------------------------- */
/*  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case, */
/*                                 DTYPE_A = 1. */
/*  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating */
/*                                 the BLACS process grid A is distribu- */
/*                                 ted over. The context itself is glo- */
/*                                 bal, but the handle (the integer */
/*                                 value) may vary. */
/*  M_A    (global) DESCA( M_ )    The number of rows in the global */
/*                                 array A. */
/*  N_A    (global) DESCA( N_ )    The number of columns in the global */
/*                                 array A. */
/*  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute */
/*                                 the rows of the array. */
/*  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute */
/*                                 the columns of the array. */
/*  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first */
/*                                 row of the array A is distributed. */
/*  CSRC_A (global) DESCA( CSRC_ ) The process column over which the */
/*                                 first column of the array A is */
/*                                 distributed. */
/*  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local */
/*                                 array.  LLD_A >= MAX(1,LOCr(M_A)). */

/*  Let K be the number of rows or columns of a distributed matrix, */
/*  and assume that its process grid has dimension p x q. */
/*  LOCr( K ) denotes the number of elements of K that a process */
/*  would receive if K were distributed over the p processes of its */
/*  process column. */
/*  Similarly, LOCc( K ) denotes the number of elements of K that a */
/*  process would receive if K were distributed over the q processes of */
/*  its process row. */
/*  The values of LOCr() and LOCc() may be determined via a call to the */
/*  ScaLAPACK tool function, NUMROC: */
/*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), */
/*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). */
/*  An upper bound for these quantities may be computed by: */
/*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A */
/*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A */


/*  One-dimensional descriptors: */

/*  One-dimensional descriptors are a new addition to ScaLAPACK since */
/*    version 1.0. They simplify and shorten the descriptor for 1D */
/*    arrays. */

/*  Since ScaLAPACK supports two-dimensional arrays as the fundamental */
/*    object, we allow 1D arrays to be distributed either over the */
/*    first dimension of the array (as if the grid were P-by-1) or the */
/*    2nd dimension (as if the grid were 1-by-P). This choice is */
/*    indicated by the descriptor type (501 or 502) */
/*    as described below. */

/*    IMPORTANT NOTE: the actual BLACS grid represented by the */
/*    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P */
/*    irrespective of which one-dimensional descriptor type */
/*    (501 or 502) is input. */
/*    This routine will interpret the grid properly either way. */
/*    ScaLAPACK routines *do not support intercontext operations* so that */
/*    the grid passed to a single ScaLAPACK routine *must be the same* */
/*    for all array descriptors passed to that routine. */

/*    NOTE: In all cases where 1D descriptors are used, 2D descriptors */
/*    may also be used, since a one-dimensional array is a special case */
/*    of a two-dimensional array with one dimension of size unity. */
/*    The two-dimensional array used in this case *must* be of the */
/*    proper orientation: */
/*      If the appropriate one-dimensional descriptor is DTYPEA=501 */
/*      (1 by P type), then the two dimensional descriptor must */
/*      have a CTXT value that refers to a 1 by P BLACS grid; */
/*      If the appropriate one-dimensional descriptor is DTYPEA=502 */
/*      (P by 1 type), then the two dimensional descriptor must */
/*      have a CTXT value that refers to a P by 1 BLACS grid. */


/*  Summary of allowed descriptors, types, and BLACS grids: */
/*  DTYPE           501         502         1         1 */
/*  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1 */
/*  ----------------------------------------------------- */
/*  A               OK          NO          OK        NO */
/*  B               NO          OK          NO        OK */

/*  Let A be a generic term for any 1D block cyclicly distributed array. */
/*  Such a global array has an associated description vector DESCA. */
/*  In the following comments, the character _ should be read as */
/*  "of the global array". */

/*  NOTATION        STORED IN  EXPLANATION */
/*  --------------- ---------- ------------------------------------------ */
/*  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, */
/*                                TYPE_A = 501: 1-by-P grid. */
/*                                TYPE_A = 502: P-by-1 grid. */
/*  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating */
/*                                the BLACS process grid A is distribu- */
/*                                ted over. The context itself is glo- */
/*                                bal, but the handle (the integer */
/*                                value) may vary. */
/*  N_A    (global) DESCA( 3 ) The size of the array dimension being */
/*                                distributed. */
/*  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute */
/*                                the distributed dimension of the array. */
/*  SRC_A  (global) DESCA( 5 ) The process row or column over which the */
/*                                first row or column of the array */
/*                                is distributed. */
/*  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array */
/*                                storing the local blocks of the distri- */
/*                                buted array A. Minimum value of LLD_A */
/*                                depends on TYPE_A. */
/*                                TYPE_A = 501: LLD_A >= */
/*                                   size of undistributed dimension, 1. */
/*                                TYPE_A = 502: LLD_A >=NB_A, 1. */
/*  Reserved        DESCA( 7 ) Reserved for future use. */



/*  ===================================================================== */

/*  Code Developer: Andrew J. Cleary, University of Tennessee. */
/*    Current address: Lawrence Livermore National Labs. */
/*  This version released: August, 2001. */

/*  ===================================================================== */

/*     .. */
/*     .. Parameters .. */
/*     .. */
/*     .. Local Scalars .. */
/*     .. */
/*     .. Local Arrays .. */
/*     .. */
/*     .. External Subroutines .. */
/*     .. */
/*     .. External Functions .. */
/*     .. */
/*     .. Intrinsic Functions .. */
/*     .. */
/*     .. Executable Statements .. */

/*     Test the input parameters */

    /* Parameter adjustments */
    --work;
    --af;
    --desca;
    --a;

    /* Function Body */
    *info = 0;

/*     Convert descriptor into standard form for easy access to */
/*        parameters, check that grid is of right shape. */

    desca_1xp__[0] = 501;

    desc_convert__(&desca[1], desca_1xp__, &return_code__);

    if (return_code__ != 0) {
	*info = -602;
    }

/*     Get values out of descriptor for use in code. */

    ictxt = desca_1xp__[1];
    csrc = desca_1xp__[4];
    nb = desca_1xp__[3];
    llda = desca_1xp__[5];
    store_n_a__ = desca_1xp__[2];

/*     Get grid parameters */


/*     Size of separator blocks is maximum of bandwidths */

    max_bw__ = max(*bwl,*bwu);
    mbw2 = max_bw__ * max_bw__;

    blacs_gridinfo__(&ictxt, &nprow, &npcol, &myrow, &mycol);
    np = nprow * npcol;



    if (*lwork < -1) {
	*info = -10;
    } else if (*lwork == -1) {
	idum3 = -1;
    } else {
	idum3 = 1;
    }

    if (*n < 0) {
	*info = -1;
    }

    if (*n + *ja - 1 > store_n_a__) {
	*info = -606;
    }

    if (*bwl > *n - 1 || *bwl < 0) {
	*info = -2;
    }

    if (*bwu > *n - 1 || *bwu < 0) {
	*info = -3;
    }

    if (llda < *bwl + *bwu + 1) {
	*info = -606;
    }

    if (nb <= 0) {
	*info = -604;
    }

/*     Argument checking that is specific to Divide & Conquer routine */

    if (nprow != 1) {
	*info = -602;
    }

    if (*n > np * nb - (*ja - 1) % nb) {
	*info = -1;
	i__1 = -(*info);
	pxerbla_(&ictxt, "PZDBTRF, D&C alg.: only 1 block per proc", &i__1, (
		ftnlen)40);
	return 0;
    }

    if (*ja + *n - 1 > nb && nb < max(*bwl,*bwu) << 1) {
	*info = -604;
	i__1 = -(*info);
	pxerbla_(&ictxt, "PZDBTRF, D&C alg.: NB too small", &i__1, (ftnlen)31)
		;
	return 0;
    }


/*     Check auxiliary storage size */

    laf_min__ = nb * (*bwl + *bwu) + max(*bwl,*bwu) * 6 * max(*bwl,*bwu);

    if (*laf < laf_min__) {
	*info = -8;
/*        put minimum value of laf into AF( 1 ) */
	af[1].r = (doublereal) laf_min__, af[1].i = 0.;
	i__1 = -(*info);
	pxerbla_(&ictxt, "PZDBTRF: auxiliary storage error ", &i__1, (ftnlen)
		33);
	return 0;
    }

/*     Check worksize */

    work_size_min__ = max(*bwl,*bwu) * max(*bwl,*bwu);

    work[1].r = (doublereal) work_size_min__, work[1].i = 0.;

    if (*lwork < work_size_min__) {
	if (*lwork != -1) {
	    *info = -10;
	    i__1 = -(*info);
	    pxerbla_(&ictxt, "PZDBTRF: worksize error ", &i__1, (ftnlen)24);
	}
	return 0;
    }

/*     Pack params and positions into arrays for global consistency check */

    param_check__[8] = desca[5];
    param_check__[7] = desca[4];
    param_check__[6] = desca[3];
    param_check__[5] = desca[1];
    param_check__[4] = *ja;
    param_check__[3] = *bwu;
    param_check__[2] = *bwl;
    param_check__[1] = *n;
    param_check__[0] = idum3;

    param_check__[17] = 605;
    param_check__[16] = 604;
    param_check__[15] = 603;
    param_check__[14] = 601;
    param_check__[13] = 5;
    param_check__[12] = 3;
    param_check__[11] = 2;
    param_check__[10] = 1;
    param_check__[9] = 10;

/*     Want to find errors with MIN( ), so if no error, set it to a big */
/*     number. If there already is an error, multiply by the the */
/*     descriptor multiplier. */

    if (*info >= 0) {
	*info = 10000;
    } else if (*info < -100) {
	*info = -(*info);
    } else {
	*info = -(*info) * 100;
    }

/*     Check consistency across processors */

    globchk_(&ictxt, &c__9, param_check__, &c__9, &param_check__[18], info);

/*     Prepare output: set info = 0 if no error, and divide by DESCMULT */
/*     if error is not in a descriptor entry. */

    if (*info == 10000) {
	*info = 0;
    } else if (*info % 100 == 0) {
	*info = -(*info) / 100;
    } else {
	*info = -(*info);
    }

    if (*info < 0) {
	i__1 = -(*info);
	pxerbla_(&ictxt, "PZDBTRF", &i__1, (ftnlen)7);
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0) {
	return 0;
    }


/*     Adjust addressing into matrix space to properly get into */
/*        the beginning part of the relevant data */

    part_offset__ = nb * ((*ja - 1) / (npcol * nb));

    if (mycol - csrc < (*ja - part_offset__ - 1) / nb) {
	part_offset__ += nb;
    }

    if (mycol < csrc) {
	part_offset__ -= nb;
    }

/*     Form a new BLACS grid (the "standard form" grid) with only procs */
/*        holding part of the matrix, of size 1xNP where NP is adjusted, */
/*        starting at csrc=0, with JA modified to reflect dropped procs. */

/*     First processor to hold part of the matrix: */

    first_proc__ = ((*ja - 1) / nb + csrc) % npcol;

/*     Calculate new JA one while dropping off unused processors. */

    ja_new__ = (*ja - 1) % nb + 1;

/*     Save and compute new value of NP */

    np_save__ = np;
    np = (ja_new__ + *n - 2) / nb + 1;

/*     Call utility routine that forms "standard-form" grid */

    reshape_(&ictxt, &c__1, &ictxt_new__, &c__1, &first_proc__, &c__1, &np);

/*     Use new context from standard grid as context. */

    ictxt_save__ = ictxt;
    ictxt = ictxt_new__;
    desca_1xp__[1] = ictxt_new__;

/*     Get information about new grid. */

    blacs_gridinfo__(&ictxt, &nprow, &npcol, &myrow, &mycol);

/*     Drop out processors that do not have part of the matrix. */

    if (myrow < 0) {
	goto L1234;
    }

/*     ******************************** */
/*     Values reused throughout routine */

/*     User-input value of partition size */

    part_size__ = nb;

/*     Number of columns in each processor */

    my_num_cols__ = numroc_(n, &part_size__, &mycol, &c__0, &npcol);

/*     Offset in columns to beginning of main partition in each proc */

    if (mycol == 0) {
	part_offset__ += (ja_new__ - 1) % part_size__;
	my_num_cols__ -= (ja_new__ - 1) % part_size__;
    }

/*     Offset in elements */

    ofst = part_offset__ * llda;

/*     Size of main (or odd) partition in each processor */

    odd_size__ = my_num_cols__;
    if (mycol < np - 1) {
	odd_size__ -= max_bw__;
    }

/*     Offset to workspace for Upper triangular factor */

    work_u__ = *bwu * odd_size__ + mbw2 * 3;


/*       Zero out space for fillin */

    i__1 = laf_min__;
    for (i__ = 1; i__ <= i__1; ++i__) {
	i__2 = i__;
	af[i__2].r = 0., af[i__2].i = 0.;
/* L10: */
    }

/*       Zero out space for work */

    i__1 = work_size_min__;
    for (i__ = 1; i__ <= i__1; ++i__) {
	i__2 = i__;
	work[i__2].r = 0., work[i__2].i = 0.;
/* L20: */
    }

/*     Begin main code */


/* ******************************************************************* */
/*       PHASE 1: Local computation phase. */
/* ******************************************************************* */


/*       Sizes of the extra triangles communicated bewtween processors */

    if (mycol > 0) {
/* Computing MIN */
	i__1 = *bwl, i__2 = numroc_(n, &part_size__, &mycol, &c__0, &npcol);
	prev_tri_size_m__ = min(i__1,i__2);
/* Computing MIN */
	i__3 = mycol - 1;
	i__1 = *bwl, i__2 = numroc_(n, &part_size__, &i__3, &c__0, &npcol);
	prev_tri_size_n__ = min(i__1,i__2);
    }

    if (mycol > 0) {
/* Computing MIN */
	i__1 = *bwu, i__2 = numroc_(n, &part_size__, &mycol, &c__0, &npcol);
	up_prev_tri_size_m__ = min(i__1,i__2);
/* Computing MIN */
	i__3 = mycol - 1;
	i__1 = *bwu, i__2 = numroc_(n, &part_size__, &i__3, &c__0, &npcol);
	up_prev_tri_size_n__ = min(i__1,i__2);
    }

    if (mycol < npcol - 1) {
/* Computing MIN */
	i__3 = mycol + 1;
	i__1 = *bwl, i__2 = numroc_(n, &part_size__, &i__3, &c__0, &npcol);
	next_tri_size_m__ = min(i__1,i__2);
/* Computing MIN */
	i__1 = *bwl, i__2 = numroc_(n, &part_size__, &mycol, &c__0, &npcol);
	next_tri_size_n__ = min(i__1,i__2);
    }

    if (mycol < np - 1) {
/*         Transfer last triangle D_i of local matrix to next processor */
/*         which needs it to calculate fillin due to factorization of */
/*         its main (odd) block A_i. */
/*         Overlap the send with the factorization of A_i. */

	i__1 = llda - 1;
	i__2 = mycol + 1;
	ztrsd2d_(&ictxt, "U", "N", &next_tri_size_m__, &next_tri_size_n__, &a[
		ofst + (my_num_cols__ - *bwl) * llda + (*bwl + *bwu + 1)], &
		i__1, &c__0, &i__2, (ftnlen)1, (ftnlen)1);

    }


/*       Factor main partition A_i = L_i {U_i} in each processor */

    zdbtrf_(&odd_size__, &odd_size__, bwl, bwu, &a[ofst + 1], &llda, info);

    if (*info != 0) {
	*info = mycol + 1;
	goto L1500;
    }


    if (mycol < np - 1) {

/*         Apply factorization to lower connection block BL_i */
/*         conjugate transpose the connection block in preparation. */
/*         Apply factorization to upper connection block BU_i */
/*         Move the connection block in preparation. */

	i__1 = llda - 1;
	zlatcpy_("U", bwl, bwl, &a[ofst + (*bwl + *bwu + 1) + (odd_size__ - *
		bwl) * llda], &i__1, &af[odd_size__ * *bwu + (mbw2 << 1) + 1 
		+ max_bw__ - *bwl], &max_bw__, (ftnlen)1);
	i__1 = llda - 1;
	zlacpy_("L", bwu, bwu, &a[ofst + 1 + odd_size__ * llda], &i__1, &af[
		work_u__ + odd_size__ * *bwl + (mbw2 << 1) + 1 + max_bw__ - *
		bwu], &max_bw__, (ftnlen)1);

/*         Perform the triangular system solve {L_i}{{BU'}_i} = {B_i} */

	ztbtrs_("L", "N", "U", bwu, bwl, bwu, &a[ofst + *bwu + 1 + (
		odd_size__ - *bwu) * llda], &llda, &af[work_u__ + odd_size__ *
		 *bwl + (mbw2 << 1) + 1 + max_bw__ - *bwu], &max_bw__, info, (
		ftnlen)1, (ftnlen)1, (ftnlen)1);

/*         Perform the triangular solve {U_i}^C{BL'}_i^C = {BL_i}^C */

	ztbtrs_("U", "C", "N", bwl, bwu, bwl, &a[ofst + 1 + (odd_size__ - *
		bwl) * llda], &llda, &af[odd_size__ * *bwu + (mbw2 << 1) + 1 
		+ max_bw__ - *bwl], &max_bw__, info, (ftnlen)1, (ftnlen)1, (
		ftnlen)1);

/*         conjugate transpose resulting block to its location */
/*           in main storage. */

	i__1 = llda - 1;
	zlatcpy_("L", bwl, bwl, &af[odd_size__ * *bwu + (mbw2 << 1) + 1 + 
		max_bw__ - *bwl], &max_bw__, &a[ofst + (*bwl + *bwu + 1) + (
		odd_size__ - *bwl) * llda], &i__1, (ftnlen)1);

/*         Move the resulting block back to its location in main storage. */

	i__1 = llda - 1;
	zlacpy_("L", bwu, bwu, &af[work_u__ + odd_size__ * *bwl + (mbw2 << 1) 
		+ 1 + max_bw__ - *bwu], &max_bw__, &a[ofst + 1 + odd_size__ * 
		llda], &i__1, (ftnlen)1);


/*         Compute contribution to diagonal block(s) of reduced system. */
/*          {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i} */

/*         The following method uses more flops than necessary but */
/*           does not necessitate the writing of a new BLAS routine. */


	z__1.r = -1., z__1.i = -0.;
	i__1 = llda - 1;
	lfc_SLzgemm("C", "N", &max_bw__, &max_bw__, &max_bw__, &z__1, &af[
		odd_size__ * *bwu + (mbw2 << 1) + 1], &max_bw__, &af[work_u__ 
		+ odd_size__ * *bwl + (mbw2 << 1) + 1], &max_bw__, &c_b1, &a[
		ofst + odd_size__ * llda + 1 + *bwu], &i__1, (ftnlen)1, (
		ftnlen)1);

    }
/*       End of "if ( MYCOL .lt. NP-1 )..." loop */


L1500:
/*       If the processor could not locally factor, it jumps here. */

    if (mycol != 0) {
/*         Discard temporary matrix stored beginning in */
/*           AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for */
/*           off_diagonal block of reduced system. */

/*         Receive previously transmitted matrix section, which forms */
/*         the right-hand-side for the triangular solve that calculates */
/*         the "spike" fillin. */


	i__1 = mycol - 1;
	ztrrv2d_(&ictxt, "U", "N", &prev_tri_size_m__, &prev_tri_size_n__, &
		af[work_u__ + 1], &odd_size__, &c__0, &i__1, (ftnlen)1, (
		ftnlen)1);

	if (*info == 0) {

/*         Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ . */

	    ztbtrs_("L", "N", "U", &odd_size__, bwl, bwl, &a[ofst + *bwu + 1],
		     &llda, &af[work_u__ + 1], &odd_size__, info, (ftnlen)1, (
		    ftnlen)1, (ftnlen)1);


/*         Calculate the "spike" fillin, ${U_i}^C {{GL}_i}^C = {DU_i}^C$ */


/*         Copy D block into AF storage for solve. */

	    i__1 = llda - 1;
	    zlatcpy_("L", &up_prev_tri_size_n__, &up_prev_tri_size_m__, &a[
		    ofst + 1], &i__1, &af[1], &odd_size__, (ftnlen)1);

	    ztbtrs_("U", "C", "N", &odd_size__, bwu, bwu, &a[ofst + 1], &llda,
		     &af[1], &odd_size__, info, (ftnlen)1, (ftnlen)1, (ftnlen)
		    1);


/*         Calculate the update block for previous proc, E_i = GL_i{GU_i} */


/*         Zero out space in case result is smaller than storage block */

	    i__1 = mbw2;
	    for (i__ = 1; i__ <= i__1; ++i__) {
		i__2 = odd_size__ * *bwu + (mbw2 << 1) + i__;
		af[i__2].r = 0., af[i__2].i = 0.;
/* L30: */
	    }

	    z__1.r = -1., z__1.i = -0.;
/* Computing MAX */
	    i__1 = 0, i__2 = *bwl - *bwu;
/* Computing MAX */
	    i__3 = 0, i__4 = *bwu - *bwl;
	    lfc_SLzgemm("C", "N", bwu, bwl, &odd_size__, &z__1, &af[1], &
		    odd_size__, &af[work_u__ + 1], &odd_size__, &c_b2, &af[
		    max(i__1,i__2) + 1 + odd_size__ * *bwu + ((max_bw__ << 1) 
		    + max(i__3,i__4)) * max_bw__], &max_bw__, (ftnlen)1, (
		    ftnlen)1);


/*         Initiate send of E_i to previous processor to overlap */
/*           with next computation. */

	    i__1 = mycol - 1;
	    zgesd2d_(&ictxt, &max_bw__, &max_bw__, &af[odd_size__ * *bwu + (
		    mbw2 << 1) + 1], &max_bw__, &c__0, &i__1);


	    if (mycol < np - 1) {

/*           Calculate off-diagonal block(s) of reduced system. */
/*           Note: for ease of use in solution of reduced system, store */
/*           L's off-diagonal block in conjugate transpose form. */

/*           Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage */
/*             as per requirements of BLAS routine ZTRMM. */
/*             Since we have GU_i stored, */
/*             conjugate transpose HU_i to HU_i^C. */

		zlatcpy_("N", bwl, bwl, &af[work_u__ + odd_size__ - *bwl + 1],
			 &odd_size__, &af[odd_size__ * *bwu + 1 + (max_bw__ - 
			*bwl)], &max_bw__, (ftnlen)1);

		z__1.r = -1., z__1.i = -0.;
		i__1 = llda - 1;
		lfc_SLztrmm("R", "U", "C", "N", bwl, bwl, &z__1, &a[ofst + (*bwl + 
			*bwu + 1) + (odd_size__ - *bwl) * llda], &i__1, &af[
			odd_size__ * *bwu + 1 + (max_bw__ - *bwl)], &max_bw__,
			 (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1);


/*           Copy matrix HL_i (the last bwu rows of GL_i^C) to AFU store */
/*             as per requirements of BLAS routine ZTRMM. */
/*             Since we have GL_i^C stored, */
/*             conjugate transpose HL_i^C to HL_i. */

		zlatcpy_("N", bwu, bwu, &af[odd_size__ - *bwu + 1], &
			odd_size__, &af[work_u__ + odd_size__ * *bwl + 1 + 
			max_bw__ - *bwu], &max_bw__, (ftnlen)1);

		z__1.r = -1., z__1.i = -0.;
		i__1 = llda - 1;
		lfc_SLztrmm("R", "L", "N", "N", bwu, bwu, &z__1, &a[ofst + 1 + 
			odd_size__ * llda], &i__1, &af[work_u__ + odd_size__ *
			 *bwl + 1 + max_bw__ - *bwu], &max_bw__, (ftnlen)1, (
			ftnlen)1, (ftnlen)1, (ftnlen)1);

	    }

	}
/*       End of "if ( MYCOL .ne. 0 )..." */

    }
/*       End of "if (info.eq.0) then" */


/*       Check to make sure no processors have found errors */

    igamx2d_(&ictxt, "A", " ", &c__1, &c__1, info, &c__1, info, info, &c_n1, &
	    c__0, &c__0, (ftnlen)1, (ftnlen)1);

    if (mycol == 0) {
	igebs2d_(&ictxt, "A", " ", &c__1, &c__1, info, &c__1, (ftnlen)1, (
		ftnlen)1);
    } else {
	igebr2d_(&ictxt, "A", " ", &c__1, &c__1, info, &c__1, &c__0, &c__0, (
		ftnlen)1, (ftnlen)1);
    }

    if (*info != 0) {
	goto L1000;
    }
/*       No errors found, continue */


/* ******************************************************************* */
/*       PHASE 2: Formation and factorization of Reduced System. */
/* ******************************************************************* */

/*       Gather up local sections of reduced system */


/*     The last processor does not participate in the factorization of */
/*       the reduced system, having sent its E_i already. */
    if (mycol == npcol - 1) {
	goto L14;
    }

/*       Initiate send of off-diag block(s) to overlap with next part. */
/*       Off-diagonal block needed on neighboring processor to start */
/*       algorithm. */

    if ((mycol + 1) % 2 == 0 && mycol > 0) {

	i__1 = mycol - 1;
	zgesd2d_(&ictxt, &max_bw__, &max_bw__, &af[odd_size__ * *bwu + 1], &
		max_bw__, &c__0, &i__1);

	i__1 = mycol - 1;
	zgesd2d_(&ictxt, &max_bw__, &max_bw__, &af[work_u__ + odd_size__ * *
		bwl + 1], &max_bw__, &c__0, &i__1);

    }

/*       Copy last diagonal block into AF storage for subsequent */
/*         operations. */

    i__1 = llda - 1;
    zlacpy_("N", &max_bw__, &max_bw__, &a[ofst + odd_size__ * llda + *bwu + 1]
	    , &i__1, &af[odd_size__ * *bwu + mbw2 + 1], &max_bw__, (ftnlen)1);

/*       Receive cont. to diagonal block that is stored on this proc. */

    if (mycol < npcol - 1) {

	i__1 = mycol + 1;
	zgerv2d_(&ictxt, &max_bw__, &max_bw__, &af[odd_size__ * *bwu + (mbw2 
		<< 1) + 1], &max_bw__, &c__0, &i__1);

/*          Add contribution to diagonal block */

	lfc_SLzaxpy(&mbw2, &c_b1, &af[odd_size__ * *bwu + (mbw2 << 1) + 1], &c__1, 
		&af[odd_size__ * *bwu + mbw2 + 1], &c__1);

    }


/*       ************************************* */
/*       Modification Loop */

/*       The distance for sending and receiving for each level starts */
/*         at 1 for the first level. */
    level_dist__ = 1;

/*       Do until this proc is needed to modify other procs' equations */

L12:
    if ((mycol + 1) / level_dist__ % 2 != 0) {
	goto L11;
    }

/*         Receive and add contribution to diagonal block from the left */

    if (mycol - level_dist__ >= 0) {
	i__1 = mycol - level_dist__;
	zgerv2d_(&ictxt, &max_bw__, &max_bw__, &work[1], &max_bw__, &c__0, &
		i__1);

	lfc_SLzaxpy(&mbw2, &c_b1, &work[1], &c__1, &af[odd_size__ * *bwu + mbw2 + 
		1], &c__1);

    }

/*         Receive and add contribution to diagonal block from the right */

    if (mycol + level_dist__ < npcol - 1) {
	i__1 = mycol + level_dist__;
	zgerv2d_(&ictxt, &max_bw__, &max_bw__, &work[1], &max_bw__, &c__0, &
		i__1);

	lfc_SLzaxpy(&mbw2, &c_b1, &work[1], &c__1, &af[odd_size__ * *bwu + mbw2 + 
		1], &c__1);

    }

    level_dist__ <<= 1;

    goto L12;
L11:
/*       [End of GOTO Loop] */


/*       ********************************* */
/*       Calculate and use this proc's blocks to modify other procs'... */

/*       Factor diagonal block */

/* Computing MIN */
    i__2 = max_bw__ - 1;
    i__1 = min(i__2,*bwl);
/* Computing MIN */
    i__4 = max_bw__ - 1;
    i__3 = min(i__4,*bwu);
/* Computing MIN */
    i__5 = max_bw__ - 1;
    i__6 = max_bw__ + 1;
    zdbtrf_(&max_bw__, &max_bw__, &i__1, &i__3, &af[odd_size__ * *bwu + mbw2 
	    + 1 - min(i__5,*bwu)], &i__6, info);

    if (*info != 0) {
	*info = npcol + mycol;
    }

/*       **************************************************************** */
/*       Receive offdiagonal block from processor to right. */
/*         If this is the first group of processors, the receive comes */
/*         from a different processor than otherwise. */

    if (level_dist__ == 1) {
	comm_proc__ = mycol + 1;

/*           Move block into place that it will be expected to be for */
/*             calcs. */

	zlacpy_("N", &max_bw__, &max_bw__, &af[odd_size__ * *bwu + 1], &
		max_bw__, &af[work_u__ + odd_size__ * *bwl + (mbw2 << 1) + 1],
		 &max_bw__, (ftnlen)1);

	zlacpy_("N", &max_bw__, &max_bw__, &af[work_u__ + odd_size__ * *bwl + 
		1], &max_bw__, &af[odd_size__ * *bwu + (mbw2 << 1) + 1], &
		max_bw__, (ftnlen)1);

    } else {
	comm_proc__ = mycol + level_dist__ / 2;
    }

    if (mycol / level_dist__ <= (npcol - 1) / level_dist__ - 2) {

	zgerv2d_(&ictxt, &max_bw__, &max_bw__, &af[odd_size__ * *bwu + 1], &
		max_bw__, &c__0, &comm_proc__);

	zgerv2d_(&ictxt, &max_bw__, &max_bw__, &af[work_u__ + odd_size__ * *
		bwl + 1], &max_bw__, &c__0, &comm_proc__);

	if (*info == 0) {


/*         Modify upper off_diagonal block with diagonal block */


/* Computing MIN */
	    i__2 = *bwl, i__3 = *bwu - 1;
	    i__1 = min(i__2,i__3);
	    i__4 = max_bw__ + 1;
	    ztbtrs_("L", "N", "U", bwu, &i__1, bwu, &af[odd_size__ * *bwu + 
		    mbw2 + 1 + (max_bw__ + 1) * (max_bw__ - *bwu)], &i__4, &
		    af[work_u__ + odd_size__ * *bwl + 1 + max_bw__ - *bwu], &
		    max_bw__, info, (ftnlen)1, (ftnlen)1, (ftnlen)1);

/*         Modify lower off_diagonal block with diagonal block */


/* Computing MIN */
	    i__2 = *bwu, i__3 = *bwl - 1;
	    i__1 = min(i__2,i__3);
/* Computing MIN */
	    i__4 = *bwu, i__5 = *bwl - 1;
	    i__6 = max_bw__ + 1;
	    ztbtrs_("U", "C", "N", bwl, &i__1, bwl, &af[odd_size__ * *bwu + 
		    mbw2 + 1 - min(i__4,i__5) + (max_bw__ + 1) * (max_bw__ - *
		    bwl)], &i__6, &af[odd_size__ * *bwu + 1 + max_bw__ - *bwl]
		    , &max_bw__, info, (ftnlen)1, (ftnlen)1, (ftnlen)1);

	}
/*         End of "if ( info.eq.0 ) then" */

/*         Calculate contribution from this block to next diagonal block */

	z__1.r = -1., z__1.i = -0.;
	lfc_SLzgemm("C", "N", &max_bw__, &max_bw__, &max_bw__, &z__1, &af[
		odd_size__ * *bwu + 1], &max_bw__, &af[work_u__ + odd_size__ *
		 *bwl + 1], &max_bw__, &c_b2, &work[1], &max_bw__, (ftnlen)1, 
		(ftnlen)1);

/*         Send contribution to diagonal block's owning processor. */

	i__1 = mycol + level_dist__;
	zgesd2d_(&ictxt, &max_bw__, &max_bw__, &work[1], &max_bw__, &c__0, &
		i__1);

    }
/*       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..." */


/*       **************************************************************** */
/*       Receive off_diagonal block from left and use to finish with this */
/*         processor. */

    if (mycol / level_dist__ > 0 && mycol / level_dist__ <= (npcol - 1) / 
	    level_dist__ - 1) {

	if (level_dist__ > 1) {

/*           Receive offdiagonal block(s) from proc level_dist/2 to the */
/*           left */

	    i__1 = mycol - level_dist__ / 2;
	    zgerv2d_(&ictxt, &max_bw__, &max_bw__, &af[work_u__ + odd_size__ *
		     *bwl + (mbw2 << 1) + 1], &max_bw__, &c__0, &i__1);

/*           Receive offdiagonal block(s) from proc level_dist/2 to the */
/*           left */

	    i__1 = mycol - level_dist__ / 2;
	    zgerv2d_(&ictxt, &max_bw__, &max_bw__, &af[odd_size__ * *bwu + (
		    mbw2 << 1) + 1], &max_bw__, &c__0, &i__1);

	}


	if (*info == 0) {

/*         Use diagonal block(s) to modify this offdiagonal block */


/*         Since ZTBTRS has no "left-right" option, we must transpose */

	    zlatcpy_("N", &max_bw__, &max_bw__, &af[work_u__ + odd_size__ * *
		    bwl + (mbw2 << 1) + 1], &max_bw__, &work[1], &max_bw__, (
		    ftnlen)1);

/* Computing MIN */
	    i__2 = *bwl, i__3 = max_bw__ - 1;
	    i__1 = min(i__2,i__3);
	    i__4 = max_bw__ + 1;
	    ztbtrs_("L", "N", "U", &max_bw__, &i__1, bwl, &af[odd_size__ * *
		    bwu + mbw2 + 1], &i__4, &work[max_bw__ * (max_bw__ - *bwl)
		     + 1], &max_bw__, info, (ftnlen)1, (ftnlen)1, (ftnlen)1);

/*         Transpose back */

	    zlatcpy_("N", &max_bw__, &max_bw__, &work[1], &max_bw__, &af[
		    work_u__ + odd_size__ * *bwl + (mbw2 << 1) + 1], &
		    max_bw__, (ftnlen)1);



/*         Since ZTBTRS has no "left-right" option, we must transpose */

	    zlatcpy_("N", &max_bw__, &max_bw__, &af[odd_size__ * *bwu + (mbw2 
		    << 1) + 1], &max_bw__, &work[1], &max_bw__, (ftnlen)1);

/* Computing MIN */
	    i__2 = *bwu, i__3 = max_bw__ - 1;
	    i__1 = min(i__2,i__3);
/* Computing MIN */
	    i__4 = *bwu, i__5 = max_bw__ - 1;
	    i__6 = max_bw__ + 1;
	    ztbtrs_("U", "C", "N", &max_bw__, &i__1, bwu, &af[odd_size__ * *
		    bwu + mbw2 + 1 - min(i__4,i__5)], &i__6, &work[max_bw__ * 
		    (max_bw__ - *bwu) + 1], &max_bw__, info, (ftnlen)1, (
		    ftnlen)1, (ftnlen)1);

/*         Transpose back */

	    zlatcpy_("N", &max_bw__, &max_bw__, &work[1], &max_bw__, &af[
		    odd_size__ * *bwu + (mbw2 << 1) + 1], &max_bw__, (ftnlen)
		    1);


	}
/*         End of "if( info.eq.0 ) then" */

/*         Use offdiag block(s) to calculate modification to diag block */
/*           of processor to the left */

	z__1.r = -1., z__1.i = -0.;
	lfc_SLzgemm("N", "C", &max_bw__, &max_bw__, &max_bw__, &z__1, &af[
		odd_size__ * *bwu + (mbw2 << 1) + 1], &max_bw__, &af[work_u__ 
		+ odd_size__ * *bwl + (mbw2 << 1) + 1], &max_bw__, &c_b2, &
		work[1], &max_bw__, (ftnlen)1, (ftnlen)1);

/*         Send contribution to diagonal block's owning processor. */

	i__1 = mycol - level_dist__;
	zgesd2d_(&ictxt, &max_bw__, &max_bw__, &work[1], &max_bw__, &c__0, &
		i__1);

/*         ******************************************************* */

	if (mycol / level_dist__ <= (npcol - 1) / level_dist__ - 2) {

/*           Decide which processor offdiagonal block(s) goes to */

	    if (mycol / (level_dist__ << 1) % 2 == 0) {
		comm_proc__ = mycol + level_dist__;
	    } else {
		comm_proc__ = mycol - level_dist__;
	    }

/*           Use offdiagonal blocks to calculate offdiag */
/*             block to send to neighboring processor. Depending */
/*             on circumstances, may need to transpose the matrix. */

	    z__1.r = -1., z__1.i = -0.;
	    lfc_SLzgemm("N", "N", &max_bw__, &max_bw__, &max_bw__, &z__1, &af[
		    work_u__ + odd_size__ * *bwl + (mbw2 << 1) + 1], &
		    max_bw__, &af[odd_size__ * *bwu + 1], &max_bw__, &c_b2, &
		    work[1], &max_bw__, (ftnlen)1, (ftnlen)1);

/*           Send contribution to offdiagonal block's owning processor. */

	    zgesd2d_(&ictxt, &max_bw__, &max_bw__, &work[1], &max_bw__, &c__0,
		     &comm_proc__);

	    z__1.r = -1., z__1.i = -0.;
	    lfc_SLzgemm("N", "N", &max_bw__, &max_bw__, &max_bw__, &z__1, &af[
		    odd_size__ * *bwu + (mbw2 << 1) + 1], &max_bw__, &af[
		    work_u__ + odd_size__ * *bwl + 1], &max_bw__, &c_b2, &
		    work[1], &max_bw__, (ftnlen)1, (ftnlen)1);

/*           Send contribution to offdiagonal block's owning processor. */

	    zgesd2d_(&ictxt, &max_bw__, &max_bw__, &work[1], &max_bw__, &c__0,
		     &comm_proc__);

	}

    }
/*       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..." */

L14:


L1000:


/*     Free BLACS space used to hold standard-form grid. */

    if (ictxt_save__ != ictxt_new__) {
	blacs_gridexit__(&ictxt_new__);
    }

L1234:

/*     Restore saved input parameters */

    ictxt = ictxt_save__;
    np = np_save__;

/*     Output minimum worksize */

    work[1].r = (doublereal) work_size_min__, work[1].i = 0.;

/*         Make INFO consistent across processors */

    igamx2d_(&ictxt, "A", " ", &c__1, &c__1, info, &c__1, info, info, &c_n1, &
	    c__0, &c__0, (ftnlen)1, (ftnlen)1);

    if (mycol == 0) {
	igebs2d_(&ictxt, "A", " ", &c__1, &c__1, info, &c__1, (ftnlen)1, (
		ftnlen)1);
    } else {
	igebr2d_(&ictxt, "A", " ", &c__1, &c__1, info, &c__1, &c__0, &c__0, (
		ftnlen)1, (ftnlen)1);
    }


    return 0;

/*     End of PZDBTRF */

} /* pzdbtrf_ */

