/* /home4/luszczek/mscratch/build/SCALAPACK/SRC/pdgbtrs.f -- translated by f2c (version 20031025).
   You must link the resulting object file with libf2c:
	on Microsoft Windows system, link with libf2c.lib;
	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
	or, if you install libf2c.a in a standard place, with -lf2c -lm
	-- in that order, at the end of the command line, as in
		cc *.o -lf2c -lm
	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

		http://www.netlib.org/f2c/libf2c.zip
*/

#include "f2c.h"

/* Table of constant values */

static integer c__17 = 17;
static integer c__1 = 1;
static integer c__0 = 0;
static doublereal c_b25 = -1.;
static doublereal c_b41 = 1.;

/* Subroutine */ int pdgbtrs_(char *trans, integer *n, integer *bwl, integer *
	bwu, integer *nrhs, doublereal *a, integer *ja, integer *desca, 
	integer *ipiv, doublereal *b, integer *ib, integer *descb, doublereal 
	*af, integer *laf, doublereal *work, integer *lwork, integer *info, 
	ftnlen trans_len)
{
    /* System generated locals */
    integer i__1, i__2, i__3;
    doublereal d__1;

    /* Local variables */
    integer odd_size__, desca_1xp__[7], descb_px1__[7], j, l, store_m_b__, 
	    store_n_a__, ictxt_new__, bm, bn, nb, bw, lm, ln, np, 
	    ictxt_save__, first_proc__, bmn, bnn, lmj, ldw, param_check__[51]	
	    /* was [17][3] */, return_code__, part_offset__, ldbb, llda, lldb;
    extern /* Subroutine */ int lfc_SLdger(integer *, integer *, doublereal *, 
	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
	    integer *);
    integer csrc, lbwl, aptr, lbwu, lptr, wptr;
    extern /* Subroutine */ int desc_convert__(integer *, integer *, integer *
	    );
    integer idum2, idum3, recovery_val__;
    extern /* Subroutine */ int lfc_SLdscal(integer *, doublereal *, doublereal *, 
	    integer *), lfc_SLdgemm(char *, char *, integer *, integer *, integer *
	    , doublereal *, doublereal *, integer *, doublereal *, integer *, 
	    doublereal *, doublereal *, integer *, ftnlen, ftnlen);
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int lfc_SLdgemv(char *, integer *, integer *, 
	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
	    doublereal *, doublereal *, integer *, ftnlen);
    integer npact, bbptr, npcol;
    extern /* Subroutine */ int lfc_SLdcopy(integer *, doublereal *, integer *, 
	    doublereal *, integer *), lfc_SLdswap(integer *, doublereal *, integer 
	    *, doublereal *, integer *);
    integer mycol;
    extern /* Subroutine */ int lfc_SLdtrsm(char *, char *, char *, char *, 
	    integer *, integer *, doublereal *, doublereal *, integer *, 
	    doublereal *, integer *, ftnlen, ftnlen, ftnlen, ftnlen);
    integer ictxt, nprow, npstr, myrow, work_size_min__, ja_new__, neicol;
    extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, 
	    doublereal *, integer *, doublereal *, integer *, ftnlen), 
	    dgetrs_(char *, integer *, integer *, doublereal *, integer *, 
	    integer *, doublereal *, integer *, integer *, ftnlen), dlaswp_(
	    integer *, doublereal *, integer *, integer *, integer *, integer 
	    *, integer *), blacs_gridinfo__(integer *, integer *, integer *, 
	    integer *, integer *);
    extern integer numroc_(integer *, integer *, integer *, integer *, 
	    integer *);
    extern /* Subroutine */ int dgesd2d_(integer *, integer *, integer *, 
	    doublereal *, integer *, integer *, integer *), blacs_gridexit__(
	    integer *), dgerv2d_(integer *, integer *, integer *, doublereal *
	    , integer *, integer *, integer *), globchk_(integer *, integer *,
	     integer *, integer *, integer *, integer *), reshape_(integer *, 
	    integer *, integer *, integer *, integer *, integer *, integer *);
    integer np_save__;
    extern /* Subroutine */ int pxerbla_(integer *, char *, integer *, ftnlen)
	    ;


/*  -- ScaLAPACK routine (version 1.7) -- */
/*     University of Tennessee, Knoxville, Oak Ridge National Laboratory, */
/*     and University of California, Berkeley. */
/*     April 3, 2000 */

/*     .. Scalar Arguments .. */
/*     .. */
/*     .. Array Arguments .. */
/*     .. */

/*  Purpose */
/*  ======= */

/*  PDGBTRS solves a system of linear equations */

/*            A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS) */
/*                                    or */
/*            A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS) */

/*  where A(1:N, JA:JA+N-1) is the matrix used to produce the factors */
/*  stored in A(1:N,JA:JA+N-1) and AF by PDGBTRF. */
/*  A(1:N, JA:JA+N-1) is an N-by-N real */
/*  banded distributed */
/*  matrix with bandwidth BWL, BWU. */

/*  Routine PDGBTRF MUST be called first. */

/*  ===================================================================== */

/*  Arguments */
/*  ========= */


/*  TRANS   (global input) CHARACTER */
/*          = 'N':  Solve with A(1:N, JA:JA+N-1); */
/*          = 'T' or 'C':  Solve with A(1:N, JA:JA+N-1)^T; */

/*  N       (global input) INTEGER */
/*          The number of rows and columns to be operated on, i.e. the */
/*          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0. */

/*  BWL     (global input) INTEGER */
/*          Number of subdiagonals. 0 <= BWL <= N-1 */

/*  BWU     (global input) INTEGER */
/*          Number of superdiagonals. 0 <= BWU <= N-1 */

/*  NRHS    (global input) INTEGER */
/*          The number of right hand sides, i.e., the number of columns */
/*          of the distributed submatrix B(IB:IB+N-1, 1:NRHS). */
/*          NRHS >= 0. */

/*  A       (local input/local output) DOUBLE PRECISION pointer into */
/*          local memory to an array with first dimension */
/*          LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA). */
/*          On entry, this array contains the local pieces of the */
/*          N-by-N unsymmetric banded distributed Cholesky factor L or */
/*          L^T A(1:N, JA:JA+N-1). */
/*          This local portion is stored in the packed banded format */
/*            used in LAPACK. Please see the Notes below and the */
/*            ScaLAPACK manual for more detail on the format of */
/*            distributed matrices. */

/*  JA      (global input) INTEGER */
/*          The index in the global array A that points to the start of */
/*          the matrix to be operated on (which may be either all of A */
/*          or a submatrix of A). */

/*  DESCA   (global and local input) INTEGER array of dimension DLEN. */
/*          if 1D type (DTYPE_A=501), DLEN >= 7; */
/*          if 2D type (DTYPE_A=1), DLEN >= 9 . */
/*          The array descriptor for the distributed matrix A. */
/*          Contains information of mapping of A to memory. Please */
/*          see NOTES below for full description and options. */

/*  IPIV    (local output) INTEGER array, dimension >= DESCA( NB ). */
/*          Pivot indices for local factorizations. */
/*          Users *should not* alter the contents between */
/*          factorization and solve. */

/*  B       (local input/local output) DOUBLE PRECISION pointer into */
/*          local memory to an array of local lead dimension lld_b>=NB. */
/*          On entry, this array contains the */
/*          the local pieces of the right hand sides */
/*          B(IB:IB+N-1, 1:NRHS). */
/*          On exit, this contains the local piece of the solutions */
/*          distributed matrix X. */

/*  IB      (global input) INTEGER */
/*          The row index in the global array B that points to the first */
/*          row of the matrix to be operated on (which may be either */
/*          all of B or a submatrix of B). */

/*  DESCB   (global and local input) INTEGER array of dimension DLEN. */
/*          if 1D type (DTYPE_B=502), DLEN >=7; */
/*          if 2D type (DTYPE_B=1), DLEN >= 9. */
/*          The array descriptor for the distributed matrix B. */
/*          Contains information of mapping of B to memory. Please */
/*          see NOTES below for full description and options. */

/*  AF      (local output) DOUBLE PRECISION array, dimension LAF. */
/*          Auxiliary Fillin Space. */
/*          Fillin is created during the factorization routine */
/*          PDGBTRF and this is stored in AF. If a linear system */
/*          is to be solved using PDGBTRS after the factorization */
/*          routine, AF *must not be altered* after the factorization. */

/*  LAF     (local input) INTEGER */
/*          Size of user-input Auxiliary Fillin space AF. Must be >= */
/*          (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu) */
/*          If LAF is not large enough, an error code will be returned */
/*          and the minimum acceptable size will be returned in AF( 1 ) */

/*  WORK    (local workspace/local output) */
/*          DOUBLE PRECISION temporary workspace. This space may */
/*          be overwritten in between calls to routines. WORK must be */
/*          the size given in LWORK. */
/*          On exit, WORK( 1 ) contains the minimal LWORK. */

/*  LWORK   (local input or global input) INTEGER */
/*          Size of user-input workspace WORK. */
/*          If LWORK is too small, the minimal acceptable size will be */
/*          returned in WORK(1) and an error code is returned. LWORK>= */
/*          NRHS*(NB+2*bwl+4*bwu) */

/*  INFO    (global output) INTEGER */
/*          = 0:  successful exit */
/*          < 0:  If the i-th argument is an array and the j-entry had */
/*                an illegal value, then INFO = -(i*100+j), if the i-th */
/*                argument is a scalar and had an illegal value, then */
/*                INFO = -i. */

/*  ===================================================================== */

/*  Restrictions */
/*  ============ */

/*  The following are restrictions on the input parameters. Some of these */
/*    are temporary and will be removed in future releases, while others */
/*    may reflect fundamental technical limitations. */

/*    Non-cyclic restriction: VERY IMPORTANT! */
/*      P*NB>= mod(JA-1,NB)+N. */
/*      The mapping for matrices must be blocked, reflecting the nature */
/*      of the divide and conquer algorithm as a task-parallel algorithm. */
/*      This formula in words is: no processor may have more than one */
/*      chunk of the matrix. */

/*    Blocksize cannot be too small: */
/*      If the matrix spans more than one processor, the following */
/*      restriction on NB, the size of each block on each processor, */
/*      must hold: */
/*      NB >= (BWL+BWU)+1 */
/*      The bulk of parallel computation is done on the matrix of size */
/*      O(NB) on each processor. If this is too small, divide and conquer */
/*      is a poor choice of algorithm. */

/*    Submatrix reference: */
/*      JA = IB */
/*      Alignment restriction that prevents unnecessary communication. */

/*  ===================================================================== */

/*  Notes */
/*  ===== */

/*  If the factorization routine and the solve routine are to be called */
/*    separately (to solve various sets of righthand sides using the same */
/*    coefficient matrix), the auxiliary space AF *must not be altered* */
/*    between calls to the factorization routine and the solve routine. */

/*  The best algorithm for solving banded and tridiagonal linear systems */
/*    depends on a variety of parameters, especially the bandwidth. */
/*    Currently, only algorithms designed for the case N/P >> bw are */
/*    implemented. These go by many names, including Divide and Conquer, */
/*    Partitioning, domain decomposition-type, etc. */

/*  Algorithm description: Divide and Conquer */

/*    The Divide and Conqer algorithm assumes the matrix is narrowly */
/*      banded compared with the number of equations. In this situation, */
/*      it is best to distribute the input matrix A one-dimensionally, */
/*      with columns atomic and rows divided amongst the processes. */
/*      The basic algorithm divides the banded matrix up into */
/*      P pieces with one stored on each processor, */
/*      and then proceeds in 2 phases for the factorization or 3 for the */
/*      solution of a linear system. */
/*      1) Local Phase: */
/*         The individual pieces are factored independently and in */
/*         parallel. These factors are applied to the matrix creating */
/*         fillin, which is stored in a non-inspectable way in auxiliary */
/*         space AF. Mathematically, this is equivalent to reordering */
/*         the matrix A as P A P^T and then factoring the principal */
/*         leading submatrix of size equal to the sum of the sizes of */
/*         the matrices factored on each processor. The factors of */
/*         these submatrices overwrite the corresponding parts of A */
/*         in memory. */
/*      2) Reduced System Phase: */
/*         A small (max(bwl,bwu)* (P-1)) system is formed representing */
/*         interaction of the larger blocks, and is stored (as are its */
/*         factors) in the space AF. A parallel Block Cyclic Reduction */
/*         algorithm is used. For a linear system, a parallel front solve */
/*         followed by an analagous backsolve, both using the structure */
/*         of the factored matrix, are performed. */
/*      3) Backsubsitution Phase: */
/*         For a linear system, a local backsubstitution is performed on */
/*         each processor in parallel. */


/*  Descriptors */
/*  =========== */

/*  Descriptors now have *types* and differ from ScaLAPACK 1.0. */

/*  Note: banded codes can use either the old two dimensional */
/*    or new one-dimensional descriptors, though the processor grid in */
/*    both cases *must be one-dimensional*. We describe both types below. */

/*  Each global data object is described by an associated description */
/*  vector.  This vector stores the information required to establish */
/*  the mapping between an object element and its corresponding process */
/*  and memory location. */

/*  Let A be a generic term for any 2D block cyclicly distributed array. */
/*  Such a global array has an associated description vector DESCA. */
/*  In the following comments, the character _ should be read as */
/*  "of the global array". */

/*  NOTATION        STORED IN      EXPLANATION */
/*  --------------- -------------- -------------------------------------- */
/*  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case, */
/*                                 DTYPE_A = 1. */
/*  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating */
/*                                 the BLACS process grid A is distribu- */
/*                                 ted over. The context itself is glo- */
/*                                 bal, but the handle (the integer */
/*                                 value) may vary. */
/*  M_A    (global) DESCA( M_ )    The number of rows in the global */
/*                                 array A. */
/*  N_A    (global) DESCA( N_ )    The number of columns in the global */
/*                                 array A. */
/*  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute */
/*                                 the rows of the array. */
/*  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute */
/*                                 the columns of the array. */
/*  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first */
/*                                 row of the array A is distributed. */
/*  CSRC_A (global) DESCA( CSRC_ ) The process column over which the */
/*                                 first column of the array A is */
/*                                 distributed. */
/*  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local */
/*                                 array.  LLD_A >= MAX(1,LOCr(M_A)). */

/*  Let K be the number of rows or columns of a distributed matrix, */
/*  and assume that its process grid has dimension p x q. */
/*  LOCr( K ) denotes the number of elements of K that a process */
/*  would receive if K were distributed over the p processes of its */
/*  process column. */
/*  Similarly, LOCc( K ) denotes the number of elements of K that a */
/*  process would receive if K were distributed over the q processes of */
/*  its process row. */
/*  The values of LOCr() and LOCc() may be determined via a call to the */
/*  ScaLAPACK tool function, NUMROC: */
/*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), */
/*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). */
/*  An upper bound for these quantities may be computed by: */
/*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A */
/*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A */


/*  One-dimensional descriptors: */

/*  One-dimensional descriptors are a new addition to ScaLAPACK since */
/*    version 1.0. They simplify and shorten the descriptor for 1D */
/*    arrays. */

/*  Since ScaLAPACK supports two-dimensional arrays as the fundamental */
/*    object, we allow 1D arrays to be distributed either over the */
/*    first dimension of the array (as if the grid were P-by-1) or the */
/*    2nd dimension (as if the grid were 1-by-P). This choice is */
/*    indicated by the descriptor type (501 or 502) */
/*    as described below. */

/*    IMPORTANT NOTE: the actual BLACS grid represented by the */
/*    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P */
/*    irrespective of which one-dimensional descriptor type */
/*    (501 or 502) is input. */
/*    This routine will interpret the grid properly either way. */
/*    ScaLAPACK routines *do not support intercontext operations* so that */
/*    the grid passed to a single ScaLAPACK routine *must be the same* */
/*    for all array descriptors passed to that routine. */

/*    NOTE: In all cases where 1D descriptors are used, 2D descriptors */
/*    may also be used, since a one-dimensional array is a special case */
/*    of a two-dimensional array with one dimension of size unity. */
/*    The two-dimensional array used in this case *must* be of the */
/*    proper orientation: */
/*      If the appropriate one-dimensional descriptor is DTYPEA=501 */
/*      (1 by P type), then the two dimensional descriptor must */
/*      have a CTXT value that refers to a 1 by P BLACS grid; */
/*      If the appropriate one-dimensional descriptor is DTYPEA=502 */
/*      (P by 1 type), then the two dimensional descriptor must */
/*      have a CTXT value that refers to a P by 1 BLACS grid. */


/*  Summary of allowed descriptors, types, and BLACS grids: */
/*  DTYPE           501         502         1         1 */
/*  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1 */
/*  ----------------------------------------------------- */
/*  A               OK          NO          OK        NO */
/*  B               NO          OK          NO        OK */

/*  Note that a consequence of this chart is that it is not possible */
/*    for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead */
/*    to opposite requirements for the orientation of the BLACS grid, */
/*    and as noted before, the *same* BLACS context must be used in */
/*    all descriptors in a single ScaLAPACK subroutine call. */

/*  Let A be a generic term for any 1D block cyclicly distributed array. */
/*  Such a global array has an associated description vector DESCA. */
/*  In the following comments, the character _ should be read as */
/*  "of the global array". */

/*  NOTATION        STORED IN  EXPLANATION */
/*  --------------- ---------- ------------------------------------------ */
/*  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids, */
/*                                TYPE_A = 501: 1-by-P grid. */
/*                                TYPE_A = 502: P-by-1 grid. */
/*  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating */
/*                                the BLACS process grid A is distribu- */
/*                                ted over. The context itself is glo- */
/*                                bal, but the handle (the integer */
/*                                value) may vary. */
/*  N_A    (global) DESCA( 3 ) The size of the array dimension being */
/*                                distributed. */
/*  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute */
/*                                the distributed dimension of the array. */
/*  SRC_A  (global) DESCA( 5 ) The process row or column over which the */
/*                                first row or column of the array */
/*                                is distributed. */
/*  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array */
/*                                storing the local blocks of the distri- */
/*                                buted array A. Minimum value of LLD_A */
/*                                depends on TYPE_A. */
/*                                TYPE_A = 501: LLD_A >= */
/*                                   size of undistributed dimension, 1. */
/*                                TYPE_A = 502: LLD_A >=NB_A, 1. */
/*  Reserved        DESCA( 7 ) Reserved for future use. */

/*  ===================================================================== */

/*  Implemented for ScaLAPACK by: */
/*     Andrew J. Cleary, Livermore National Lab and University of Tenn., */
/*     and Markus Hegland, Australian National University. Feb., 1997. */
/*  Based on code written by    : Peter Arbenz, ETH Zurich, 1996. */
/*  Last modified by:  Peter Arbenz, Institute of Scientific Computing, */
/*    ETH, Zurich. */

/*  ===================================================================== */

/*     .. Parameters .. */
/*     .. */
/*     .. Local Scalars .. */
/*     .. */
/*     .. Local Arrays .. */
/*     .. */
/*     .. External Subroutines .. */
/*     .. */
/*     .. External Functions .. */
/*     .. */
/*     .. Intrinsic Functions .. */
/*     .. */
/*     .. Executable Statements .. */


/*     Test the input parameters */

    /* Parameter adjustments */
    --work;
    --af;
    --descb;
    --b;
    --ipiv;
    --desca;
    --a;

    /* Function Body */
    *info = 0;

/*     Convert descriptor into standard form for easy access to */
/*        parameters, check that grid is of right shape. */

    desca_1xp__[0] = 501;
    descb_px1__[0] = 502;

    desc_convert__(&desca[1], desca_1xp__, &return_code__);

    if (return_code__ != 0) {
	*info = -802;
    }

    desc_convert__(&descb[1], descb_px1__, &return_code__);

    if (return_code__ != 0) {
	*info = -1102;
    }

/*     Consistency checks for DESCA and DESCB. */

/*     Context must be the same */
    if (desca_1xp__[1] != descb_px1__[1]) {
	*info = -1102;
    }

/*        These are alignment restrictions that may or may not be removed */
/*        in future releases. -Andy Cleary, April 14, 1996. */

/*     Block sizes must be the same */
    if (desca_1xp__[3] != descb_px1__[3]) {
	*info = -1104;
    }

/*     Source processor must be the same */

    if (desca_1xp__[4] != descb_px1__[4]) {
	*info = -1105;
    }

/*     Get values out of descriptor for use in code. */

    ictxt = desca_1xp__[1];
    csrc = desca_1xp__[4];
    nb = desca_1xp__[3];
    llda = desca_1xp__[5];
    store_n_a__ = desca_1xp__[2];
    lldb = descb_px1__[5];
    store_m_b__ = descb_px1__[2];

/*     Get grid parameters */


    blacs_gridinfo__(&ictxt, &nprow, &npcol, &myrow, &mycol);
    np = nprow * npcol;



    if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
	idum2 = 'N';
    } else if (lsame_(trans, "T", (ftnlen)1, (ftnlen)1)) {
	idum2 = 'T';
    } else if (lsame_(trans, "C", (ftnlen)1, (ftnlen)1)) {
	idum2 = 'T';
    } else {
	*info = -1;
    }

    if (*lwork < -1) {
	*info = -16;
    } else if (*lwork == -1) {
	idum3 = -1;
    } else {
	idum3 = 1;
    }

    if (*n < 0) {
	*info = -2;
    }

    if (*n + *ja - 1 > store_n_a__) {
	*info = -806;
    }

    if (*bwl > *n - 1 || *bwl < 0) {
	*info = -3;
    }

    if (*bwu > *n - 1 || *bwu < 0) {
	*info = -4;
    }

    if (llda < (*bwl << 1) + (*bwu << 1) + 1) {
	*info = -806;
    }

    if (nb <= 0) {
	*info = -804;
    }

    bw = *bwu + *bwl;

    if (*n + *ib - 1 > store_m_b__) {
	*info = -1103;
    }

    if (lldb < nb) {
	*info = -1106;
    }

    if (*nrhs < 0) {
	*info = -5;
    }

/*     Current alignment restriction */

    if (*ja != *ib) {
	*info = -7;
    }

/*     Argument checking that is specific to Divide & Conquer routine */

    if (nprow != 1) {
	*info = -802;
    }

    if (*n > np * nb - (*ja - 1) % nb) {
	*info = -2;
	i__1 = -(*info);
	pxerbla_(&ictxt, "PDGBTRS, D&C alg.: only 1 block per proc", &i__1, (
		ftnlen)40);
	return 0;
    }

    if (*ja + *n - 1 > nb && nb < *bwl + *bwu + 1) {
	*info = -804;
	i__1 = -(*info);
	pxerbla_(&ictxt, "PDGBTRS, D&C alg.: NB too small", &i__1, (ftnlen)31)
		;
	return 0;
    }


/*     Check worksize */

    work_size_min__ = *nrhs * (nb + (*bwl << 1) + (*bwu << 2));

    work[1] = (doublereal) work_size_min__;

    if (*lwork < work_size_min__) {
	if (*lwork != -1) {
	    *info = -16;
	    i__1 = -(*info);
	    pxerbla_(&ictxt, "PDGBTRS: worksize error ", &i__1, (ftnlen)24);
	}
	return 0;
    }

/*     Pack params and positions into arrays for global consistency check */

    param_check__[16] = descb[5];
    param_check__[15] = descb[4];
    param_check__[14] = descb[3];
    param_check__[13] = descb[2];
    param_check__[12] = descb[1];
    param_check__[11] = *ib;
    param_check__[10] = desca[5];
    param_check__[9] = desca[4];
    param_check__[8] = desca[3];
    param_check__[7] = desca[1];
    param_check__[6] = *ja;
    param_check__[5] = *nrhs;
    param_check__[4] = *bwu;
    param_check__[3] = *bwl;
    param_check__[2] = *n;
    param_check__[1] = idum3;
    param_check__[0] = idum2;

    param_check__[33] = 1105;
    param_check__[32] = 1104;
    param_check__[31] = 1103;
    param_check__[30] = 1102;
    param_check__[29] = 1101;
    param_check__[28] = 10;
    param_check__[27] = 805;
    param_check__[26] = 804;
    param_check__[25] = 803;
    param_check__[24] = 801;
    param_check__[23] = 7;
    param_check__[22] = 5;
    param_check__[21] = 4;
    param_check__[20] = 3;
    param_check__[19] = 2;
    param_check__[18] = 16;
    param_check__[17] = 1;

/*     Want to find errors with MIN( ), so if no error, set it to a big */
/*     number. If there already is an error, multiply by the the */
/*     descriptor multiplier. */

    if (*info >= 0) {
	*info = 10000;
    } else if (*info < -100) {
	*info = -(*info);
    } else {
	*info = -(*info) * 100;
    }

/*     Check consistency across processors */

    globchk_(&ictxt, &c__17, param_check__, &c__17, &param_check__[34], info);

/*     Prepare output: set info = 0 if no error, and divide by DESCMULT */
/*     if error is not in a descriptor entry. */

    if (*info == 10000) {
	*info = 0;
    } else if (*info % 100 == 0) {
	*info = -(*info) / 100;
    } else {
	*info = -(*info);
    }

    if (*info < 0) {
	i__1 = -(*info);
	pxerbla_(&ictxt, "PDGBTRS", &i__1, (ftnlen)7);
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0) {
	return 0;
    }

    if (*nrhs == 0) {
	return 0;
    }


/*     Adjust addressing into matrix space to properly get into */
/*        the beginning part of the relevant data */

    part_offset__ = nb * ((*ja - 1) / (npcol * nb));

    if (mycol - csrc < (*ja - part_offset__ - 1) / nb) {
	part_offset__ += nb;
    }

    if (mycol < csrc) {
	part_offset__ -= nb;
    }

/*     Form a new BLACS grid (the "standard form" grid) with only procs */
/*        holding part of the matrix, of size 1xNP where NP is adjusted, */
/*        starting at csrc=0, with JA modified to reflect dropped procs. */

/*     First processor to hold part of the matrix: */

    first_proc__ = ((*ja - 1) / nb + csrc) % npcol;

/*     Calculate new JA one while dropping off unused processors. */

    ja_new__ = (*ja - 1) % nb + 1;

/*     Save and compute new value of NP */

    np_save__ = np;
    np = (ja_new__ + *n - 2) / nb + 1;

/*     Call utility routine that forms "standard-form" grid */

    reshape_(&ictxt, &c__1, &ictxt_new__, &c__1, &first_proc__, &c__1, &np);

/*     Use new context from standard grid as context. */

    ictxt_save__ = ictxt;
    ictxt = ictxt_new__;
    desca_1xp__[1] = ictxt_new__;
    descb_px1__[1] = ictxt_new__;

/*     Get information about new grid. */

    blacs_gridinfo__(&ictxt, &nprow, &npcol, &myrow, &mycol);

/*     Drop out processors that do not have part of the matrix. */

    if (myrow < 0) {
	goto L100;
    }



/*     Begin main code */

/*     Move data into workspace - communicate/copy (overlap) */

    if (mycol < npcol - 1) {
	i__1 = mycol + 1;
	dgesd2d_(&ictxt, bwu, nrhs, &b[nb - *bwu + 1], &lldb, &c__0, &i__1);
    }

    if (mycol < npcol - 1) {
	lm = nb - *bwu;
    } else {
	lm = nb;
    }

    if (mycol > 0) {
	wptr = *bwu + 1;
    } else {
	wptr = 1;
    }

    ldw = nb + *bwu + (bw << 1) + *bwu;

    dlacpy_("G", &lm, nrhs, &b[1], &lldb, &work[wptr], &ldw, (ftnlen)1);

/*     Zero out rest of work */

    i__1 = *nrhs;
    for (j = 1; j <= i__1; ++j) {
	i__2 = ldw;
	for (l = wptr + lm; l <= i__2; ++l) {
	    work[(j - 1) * ldw + l] = 0.;
/* L10: */
	}
/* L20: */
    }

    if (mycol > 0) {
	i__1 = mycol - 1;
	dgerv2d_(&ictxt, bwu, nrhs, &work[1], &ldw, &c__0, &i__1);
    }

/* ******************************************************************* */
/*       PHASE 1: Local computation phase -- Solve L*X = B */
/* ******************************************************************* */

/*     Size of main (or odd) partition in each processor */

    odd_size__ = numroc_(n, &nb, &mycol, &c__0, &npcol);

    if (mycol != 0) {
	lbwl = bw;
	lbwu = 0;
	aptr = 1;
    } else {
	lbwl = *bwl;
	lbwu = *bwu;
	aptr = *bwu + 1;
    }

    if (mycol != npcol - 1) {
	lm = nb - lbwu;
	ln = nb - bw;
    } else if (mycol != 0) {
	lm = odd_size__ + *bwu;
/* Computing MAX */
	i__1 = odd_size__ - bw;
	ln = max(i__1,0);
    } else {
	lm = *n;
/* Computing MAX */
	i__1 = *n - bw;
	ln = max(i__1,0);
    }

    i__1 = ln;
    for (j = 1; j <= i__1; ++j) {

/* Computing MIN */
	i__2 = lbwl, i__3 = lm - j;
	lmj = min(i__2,i__3);
	l = ipiv[j];

	if (l != j) {
	    lfc_SLdswap(nrhs, &work[l], &ldw, &work[j], &ldw);
	}

	lptr = bw + 1 + (j - 1) * llda + aptr;

	lfc_SLdger(&lmj, nrhs, &c_b25, &a[lptr], &c__1, &work[j], &ldw, &work[j + 
		1], &ldw);

/* L30: */
    }

/* ******************************************************************* */
/*       PHASE 2: Global computation phase -- Solve L*X = B */
/* ******************************************************************* */

/*     Define the initial dimensions of the diagonal blocks */
/*     The offdiagonal blocks (for MYCOL > 0) are of size BM by BW */

    if (mycol != npcol - 1) {
	bm = bw - lbwu;
	bn = bw;
    } else {
	bm = min(bw,odd_size__) + *bwu;
	bn = min(bw,odd_size__);
    }

/*     Pointer to first element of block bidiagonal matrix in AF */
/*     Leading dimension of block bidiagonal system */

    bbptr = (nb + *bwu) * bw + 1;
    ldbb = (bw << 1) + *bwu;

    if (npcol == 1) {

/*        In this case the loop over the levels will not be */
/*        performed. */
	i__1 = *n - ln;
	dgetrs_("N", &i__1, nrhs, &af[bbptr + bw * ldbb], &ldbb, &ipiv[ln + 1]
		, &work[ln + 1], &ldw, info, (ftnlen)1);

    }

/* Loop over levels ... */

/*     The two integers NPACT (nu. of active processors) and NPSTR */
/*     (stride between active processors) is used to control the */
/*     loop. */

    npact = npcol;
    npstr = 1;

/*     Begin loop over levels */
L40:
    if (npact <= 1) {
	goto L50;
    }

/*     Test if processor is active */
    if (mycol % npstr == 0) {

/*   Send/Receive blocks */

	if (mycol % (npstr << 1) == 0) {

	    neicol = mycol + npstr;

	    if (neicol / npstr <= npact - 1) {

		if (neicol / npstr < npact - 1) {
		    bmn = bw;
		} else {
/* Computing MIN */
		    i__1 = bw, i__2 = numroc_(n, &nb, &neicol, &c__0, &npcol);
		    bmn = min(i__1,i__2) + *bwu;
		}

		dgesd2d_(&ictxt, &bm, nrhs, &work[ln + 1], &ldw, &c__0, &
			neicol);

		if (npact != 2) {

/*                     Receive answers back from partner processor */

		    i__1 = bm + bmn - bw;
		    dgerv2d_(&ictxt, &i__1, nrhs, &work[ln + 1], &ldw, &c__0, 
			    &neicol);

		    bm = bm + bmn - bw;

		}

	    }

	} else {

	    neicol = mycol - npstr;

	    if (neicol == 0) {
		bmn = bw - *bwu;
	    } else {
		bmn = bw;
	    }

	    dlacpy_("G", &bm, nrhs, &work[ln + 1], &ldw, &work[nb + *bwu + 
		    bmn + 1], &ldw, (ftnlen)1);

	    dgerv2d_(&ictxt, &bmn, nrhs, &work[nb + *bwu + 1], &ldw, &c__0, &
		    neicol);

/*               and do the permutations and eliminations */

	    if (npact != 2) {

/*                  Solve locally for BW variables */

		dlaswp_(nrhs, &work[nb + *bwu + 1], &ldw, &c__1, &bw, &ipiv[
			ln + 1], &c__1);

		lfc_SLdtrsm("L", "L", "N", "U", &bw, nrhs, &c_b41, &af[bbptr + bw *
			 ldbb], &ldbb, &work[nb + *bwu + 1], &ldw, (ftnlen)1, 
			(ftnlen)1, (ftnlen)1, (ftnlen)1);

/*                  Use soln just calculated to update RHS */

		i__1 = bm + bmn - bw;
		lfc_SLdgemm("N", "N", &i__1, nrhs, &bw, &c_b25, &af[bbptr + bw * 
			ldbb + bw], &ldbb, &work[nb + *bwu + 1], &ldw, &c_b41,
			 &work[nb + *bwu + 1 + bw], &ldw, (ftnlen)1, (ftnlen)
			1);

/*                  Give answers back to partner processor */

		i__1 = bm + bmn - bw;
		dgesd2d_(&ictxt, &i__1, nrhs, &work[nb + *bwu + 1 + bw], &ldw,
			 &c__0, &neicol);

	    } else {

/*                  Finish up calculations for final level */

		i__1 = bm + bmn;
		dlaswp_(nrhs, &work[nb + *bwu + 1], &ldw, &c__1, &i__1, &ipiv[
			ln + 1], &c__1);

		i__1 = bm + bmn;
		lfc_SLdtrsm("L", "L", "N", "U", &i__1, nrhs, &c_b41, &af[bbptr + 
			bw * ldbb], &ldbb, &work[nb + *bwu + 1], &ldw, (
			ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1);
	    }

	}

	npact = (npact + 1) / 2;
	npstr <<= 1;
	goto L40;

    }

L50:


/* ************************************* */
/*     BACKSOLVE */
/* ******************************************************************* */
/*       PHASE 2: Global computation phase -- Solve U*Y = X */
/* ******************************************************************* */

    if (npcol == 1) {

/*        In this case the loop over the levels will not be */
/*        performed. */
/*        In fact, the backsolve portion was done in the call to */
/*          DGETRS in the frontsolve. */

    }

/*     Compute variable needed to reverse loop structure in */
/*        reduced system. */

    recovery_val__ = npact * npstr - npcol;

/*     Loop over levels */
/*      Terminal values of NPACT and NPSTR from frontsolve are used */

L60:
    if (npact >= npcol) {
	goto L80;
    }

    npstr /= 2;

    npact <<= 1;

/*        Have to adjust npact for non-power-of-2 */

    npact -= recovery_val__ / npstr % 2;

/*        Find size of submatrix in this proc at this level */

    if (mycol / npstr < npact - 1) {
	bn = bw;
    } else {
/* Computing MIN */
	i__3 = npcol - 1;
	i__1 = bw, i__2 = numroc_(n, &nb, &i__3, &c__0, &npcol);
	bn = min(i__1,i__2);
    }

/*        If this processor is even in this level... */

    if (mycol % (npstr << 1) == 0) {

	neicol = mycol + npstr;

	if (neicol / npstr <= npact - 1) {

	    if (neicol / npstr < npact - 1) {
		bmn = bw;
		bnn = bw;
	    } else {
/* Computing MIN */
		i__1 = bw, i__2 = numroc_(n, &nb, &neicol, &c__0, &npcol);
		bmn = min(i__1,i__2) + *bwu;
/* Computing MIN */
		i__1 = bw, i__2 = numroc_(n, &nb, &neicol, &c__0, &npcol);
		bnn = min(i__1,i__2);
	    }

	    if (npact > 2) {

		i__1 = bw << 1;
		dgesd2d_(&ictxt, &i__1, nrhs, &work[ln + 1], &ldw, &c__0, &
			neicol);

		dgerv2d_(&ictxt, &bw, nrhs, &work[ln + 1], &ldw, &c__0, &
			neicol);

	    } else {

		dgerv2d_(&ictxt, &bw, nrhs, &work[ln + 1], &ldw, &c__0, &
			neicol);

	    }

	}

    } else {
/*           This processor is odd on this level */

	neicol = mycol - npstr;

	if (neicol == 0) {
	    bmn = bw - *bwu;
	} else {
	    bmn = bw;
	}

	if (neicol < npcol - 1) {
	    bnn = bw;
	} else {
/* Computing MIN */
	    i__1 = bw, i__2 = numroc_(n, &nb, &neicol, &c__0, &npcol);
	    bnn = min(i__1,i__2);
	}

	if (npact > 2) {

/*              Move RHS to make room for received solutions */

	    dlacpy_("G", &bw, nrhs, &work[nb + *bwu + 1], &ldw, &work[nb + *
		    bwu + bw + 1], &ldw, (ftnlen)1);

	    i__1 = bw << 1;
	    dgerv2d_(&ictxt, &i__1, nrhs, &work[ln + 1], &ldw, &c__0, &neicol)
		    ;

	    lfc_SLdgemm("N", "N", &bw, nrhs, &bn, &c_b25, &af[bbptr], &ldbb, &work[
		    ln + 1], &ldw, &c_b41, &work[nb + *bwu + bw + 1], &ldw, (
		    ftnlen)1, (ftnlen)1);


	    if (mycol > npstr) {

		lfc_SLdgemm("N", "N", &bw, nrhs, &bw, &c_b25, &af[bbptr + (bw << 1)
			 * ldbb], &ldbb, &work[ln + bw + 1], &ldw, &c_b41, &
			work[nb + *bwu + bw + 1], &ldw, (ftnlen)1, (ftnlen)1);

	    }

	    lfc_SLdtrsm("L", "U", "N", "N", &bw, nrhs, &c_b41, &af[bbptr + bw * 
		    ldbb], &ldbb, &work[nb + *bwu + bw + 1], &ldw, (ftnlen)1, 
		    (ftnlen)1, (ftnlen)1, (ftnlen)1);

/*              Send new solution to neighbor */

	    dgesd2d_(&ictxt, &bw, nrhs, &work[nb + *bwu + bw + 1], &ldw, &
		    c__0, &neicol);

/*              Copy new solution into expected place */

	    dlacpy_("G", &bw, nrhs, &work[nb + *bwu + 1 + bw], &ldw, &work[ln 
		    + bw + 1], &ldw, (ftnlen)1);

	} else {

/*              Solve with local diagonal block */

	    i__1 = bn + bnn;
	    lfc_SLdtrsm("L", "U", "N", "N", &i__1, nrhs, &c_b41, &af[bbptr + bw * 
		    ldbb], &ldbb, &work[nb + *bwu + 1], &ldw, (ftnlen)1, (
		    ftnlen)1, (ftnlen)1, (ftnlen)1);

/*              Send new solution to neighbor */

	    dgesd2d_(&ictxt, &bw, nrhs, &work[nb + *bwu + 1], &ldw, &c__0, &
		    neicol);

/*              Shift solutions into expected positions */

	    i__1 = bnn + bn - bw;
	    dlacpy_("G", &i__1, nrhs, &work[nb + *bwu + 1 + bw], &ldw, &work[
		    ln + 1], &ldw, (ftnlen)1);


	    if (nb + *bwu + 1 != ln + 1 + bw) {

/*                 Copy one row at a time since spaces may overlap */

		i__1 = bw;
		for (j = 1; j <= i__1; ++j) {
		    lfc_SLdcopy(nrhs, &work[nb + *bwu + j], &ldw, &work[ln + bw + 
			    j], &ldw);
/* L70: */
		}

	    }

	}

    }

    goto L60;

L80:
/*     End of loop over levels */

/* ******************************************************************* */
/*       PHASE 1: (Almost) Local computation phase -- Solve U*Y = X */
/* ******************************************************************* */

/*     Reset BM to value it had before reduced system frontsolve... */

    if (mycol != npcol - 1) {
	bm = bw - lbwu;
    } else {
	bm = min(bw,odd_size__) + *bwu;
    }

/*     First metastep is to account for the fillin blocks AF */

    if (mycol < npcol - 1) {

	i__1 = mycol + 1;
	dgesd2d_(&ictxt, &bw, nrhs, &work[nb - bw + 1], &ldw, &c__0, &i__1);

    }

    if (mycol > 0) {

	i__1 = mycol - 1;
	dgerv2d_(&ictxt, &bw, nrhs, &work[nb + *bwu + 1], &ldw, &c__0, &i__1);

/*        Modify local right hand sides with received rhs's */

	i__1 = lm - bm;
	lfc_SLdgemm("T", "N", &i__1, nrhs, &bw, &c_b25, &af[1], &bw, &work[nb + *
		bwu + 1], &ldw, &c_b41, &work[1], &ldw, (ftnlen)1, (ftnlen)1);

    }

    for (j = ln; j >= 1; --j) {

/* Computing MIN */
	i__1 = bw, i__2 = odd_size__ - 1;
	lmj = min(i__1,i__2);

	lptr = bw - 1 + j * llda + aptr;

/*        In the following, the TRANS=T option is used to reverse */
/*           the order of multiplication, not as a true transpose */

	i__1 = llda - 1;
	lfc_SLdgemv("T", &lmj, nrhs, &c_b25, &work[j + 1], &ldw, &a[lptr], &i__1, &
		c_b41, &work[j], &ldw, (ftnlen)1);

/*        Divide by diagonal element */

	d__1 = 1. / a[lptr - llda + 1];
	lfc_SLdscal(nrhs, &d__1, &work[j], &ldw);
/* L90: */
    }



    dlacpy_("G", &odd_size__, nrhs, &work[1], &ldw, &b[1], &lldb, (ftnlen)1);

/*     Free BLACS space used to hold standard-form grid. */

    ictxt = ictxt_save__;
    if (ictxt != ictxt_new__) {
	blacs_gridexit__(&ictxt_new__);
    }

L100:

/*     Restore saved input parameters */

    np = np_save__;

/*     Output worksize */

    work[1] = (doublereal) work_size_min__;

    return 0;

/*     End of PDGBTRS */

} /* pdgbtrs_ */

