/* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */
/* lfcdpslv.c

  Solve a system of linear equations from disk in single (64 bit) or double
  precision (64 bit) arithmetic.

  This file is based on Fortran 77 ScaLAPACK example code written by Antoine
  Petitet in August 1995 (petitet@cs.utk.edu). The original code is available
  at http://netlib2.cs.utk.edu/scalapack/examples/scaex.tgz

  A sample command lines might look like this:
  mpirun -np 4 ./lfcdpslv n=6 nrhs=1 mb=2 nb=2 gp=2 gq=2 mat-fname=/../../
  rhs-fname=/../../ sol-fname=/../../ luf-fname=/../../ opcode=1 verbose=1
  mpirun -np 4 ./lfcdpslv n=6 nrhs=1 mb=2 nb=2 gp=2 gq=2 mat-fname=exmat.dat
  rhs-fname=exrhs.dat sol-fname=exsol.dat luf-fname=/../../ verbose=1
  mpirun -nolocal -machinefile torc.mf -np 4 ./lfcdpslv n=6 nrhs=1 mb=2 nb=2
  gp=2 gq=2 mat-fname=exmat.dat rhs-fname=exrhs.dat sol-fname=exsol.dat
*/

#include <mpi.h>

#include <lfc/lfci.h>

#include "lfcpslvr.h"

LFC_BEGIN_C_DECLS

static char nul_fname[] = "/../../";
static int nul_len = 7;

static int
is_io_proc(PInfo *p) {
  if (0 == p->sli.myrow && 0 == p->sli.mycol) return 1;
  return 0;
}
static int
is_in_grid(PInfo *p) {
  int myrow, mycol, nprow, npcol, ctxt;
  myrow = p->sli.myrow;
  mycol = p->sli.mycol;
  nprow = p->gp;
  npcol = p->gq;
  ctxt  = p->sli.ctxt;

  /* this process is not in the process grid */
  if (myrow < 0 || myrow >= nprow || mycol < 0 || mycol >= npcol) return 0;

  return 1;
}

static void
msg(PInfo *p, const char *m) {
  if (is_io_proc( p )) {printf( "# %s\n", m ); fflush( stdout );}
}

static int
dsetup(PInfo *p) {
  int m, n, mn, nrhs, mnrhs, mb, nb, sizeW;
  int i, ctxt, info, iam, nprocs, nprow, npcol, myrow, mycol, np, nq, nqrhs;
  int *desca, *descb, *descx;
  double *ptrA, *ptrAcpy, *ptrB, *ptrX, *ptrW; int *ptrP; void *null = NULL;
  int c__0 = 0;
  int rank, size;
  char *gridOrder;

  MPI_Comm_size( MPI_COMM_WORLD, &size );
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );

  m     = p->m;
  n     = p->n;
  nrhs  = p->nrhs;
  mb    = p->mb;
  nb    = p->nb;
  nprow = p->gp;
  npcol = p->gq;
  desca = p->sli.desca;
  descb = p->sli.descb;
  descx = p->sli.descx;

  mn = MAX( m, n );

  /* Define process grid */
  Cblacs_pinfo( &iam, &nprocs );
  Cblacs_get( -1, 0, &ctxt );

  gridOrder = "Row-major";

  if (nprow * npcol == size - 1) { /* one extra process, skip process 0 */
    int *tmpgrid, *iptr;
    int i, j;

    p->IOrow = p->IOcol = -1; /* I/O node is outside of BLACS grid */

    /* FIXME: check if allocated */
    iptr = tmpgrid = LFC_MALLOC( int, nprow * npcol );

    if ('c' == gridOrder[0] || 'C' == gridOrder[0]) {
      i = npcol * nprow;
      for (j=0; j < i; j++) iptr[j] = j + 1;
    } else {
      for (j=0; j < npcol; j++) {
        for (i=0; i < nprow; i++) iptr[i] = i * npcol + j + 1;
        iptr += nprow;
      }
    }

    Cblacs_gridmap( &ctxt, tmpgrid, nprow, nprow, npcol);
    LFC_FREE( tmpgrid );
  } else {
    p->IOrow = p->IOcol = 0; /* I/O node is inside of BLACS grid */
    Cblacs_gridinit( &ctxt, gridOrder, nprow, npcol );
  }
  Cblacs_gridinfo( ctxt, &nprow, &npcol, &myrow, &mycol );

  /*
  printf( "%d %d %d %d %d %d\n", iam, nprocs, nprow, npcol, myrow, mycol ); fflush(stdout);
  MPI_Barrier( MPI_COMM_WORLD );
  */

  p->sli.myrow = myrow;
  p->sli.mycol = mycol;
  p->sli.ctxt  = ctxt;
  p->sli.dptrA = null;
  p->sli.dptrB = null;
  p->sli.dptrX = null;
  p->sli.ptrP = null;
  p->dptrAcpy = null;
  p->dptrW    = null;

  if (! is_in_grid( p )) { /* this node is not in the process grid*/
    sizeW = MAX( mb, nb );
    p->dptrW = LFC_CALLOC( double, sizeW );
    return 1;
  }

  /* Count the number of rows (np), columns (nq, nqrhs) of the system matrix
     and the right hand side matrix, respectively. */
  np    = numroc_( &m,    &mb, &myrow, &c__0, &nprow );
  nq    = numroc_( &n,    &nb, &mycol, &c__0, &npcol );
  nqrhs = numroc_( &nrhs, &nb, &mycol, &c__0, &npcol );

  mnrhs = numroc_( &mn, &mb, &myrow, &c__0, &nprow );

  /* Initialize the array descriptors for the matrices A, B, and X */
  i = MAX( 1, np );
  descinit_( desca, &m,  &n,    &mb, &nb, &c__0, &c__0, &ctxt, &i, &info );
  i = MAX( 1, mnrhs );
  descinit_( descb, &mn, &nrhs, &mb, &nb, &c__0, &c__0, &ctxt, &i, &info );
  i = MAX( 1, mnrhs );
  descinit_( descx, &mn, &nrhs, &mb, &nb, &c__0, &c__0, &ctxt, &i, &info );

  /*
  if (iam == 0) {
    for (i = 0; i < 9; i++) printf( "%d->%d ", i, desca[i] ); printf( "\n" );
    for (i = 0; i < 9; i++) printf( "%d->%d ", i, descb[i] ); printf( "\n" );
    for (i = 0; i < 9; i++) printf( "%d->%d ", i, descx[i] ); printf( "\n" );
  } */

  msg( p , "Allocating memory..." );

  /* Instead of malloc(), the calloc() function is used. This is because on the
   * Linux boxes I've tried (Red Hat 7.3) malloc() returns non-NULL pointer
   * even though there is not enough memory in the system. This happens in the
   * following scenario. Suppose the system has 256 MB of main memory and
   * 512 MB of swap. Allocate first 512 MB with a call to malloc(), and then
   * allocate another 512 MB, again with a call to malloc(). Both of them
   * (surprisingly) succeed but the process gets killed when it tries to touch
   * the allocated memory. calloc() doesn't seem to have this problem
   * on the same system - the system gets an exception touching it itself so it
   * returns NULL. */
  /* Allocate memory */
  ptrA    = LFC_CALLOC( double, desca[8] * nq );
  ptrB    = LFC_CALLOC( double, descb[8] * nqrhs );
  ptrX    = LFC_CALLOC( double, descx[8] * nqrhs ); /* FIXME: what if B and X are too big together */
  ptrP    = LFC_CALLOC( int, (np + nb) ); /* FIXME: don't allocate pivot for Cholesky */
  /* Functions pdlaread(), pdlaprnt(), and pdlawrite() require work space of
    size 'mb' (row blocking factor). Documentation of pdlange_() states the
    same requirement but in reality it requires MAX('mb', 'np') ('np' being
    number of local matrix rows). It's a bug in pdlange_(), to see it, trace
    the calls to IDAMAX() for infinity norm. */
  sizeW = MAX( np, nq ); /* for QR */
  sizeW = MAX( sizeW, mnrhs ); /* for QR */
  sizeW = MAX( sizeW, MAX( mb, nb ) );

  /* Query/Allocate required workspace */
  if (5 == p->opcode) {
    int i = -1, j; double work[2]; char trans = e2chTrans(p->trans); int c__1 = 1;
    pdgels_( &trans, &p->m, &p->n, &p->nrhs, p->sli.dptrA, &c__1, &c__1,
	    p->sli.desca, p->sli.dptrB, &c__1, &c__1, p->sli.descb, work, &i,
	    &info, 1 );
    i = work[0];

    j = MAX( sizeW, nq );

    /* try optimal workspace first */
    sizeW = MAX( p->nb * i, j );
    ptrW = LFC_CALLOC( double, sizeW );

    if (! ptrW) { /* if not enough space for optimal workspace */
      sizeW = MAX( i, j );
      ptrW = LFC_CALLOC( double, sizeW );
    }
  } else {
    ptrW = LFC_CALLOC( double, sizeW );
  }

  /* matrix memory copy gets allocated last; it's OK if it fails
   * since this program will usually run on the verge of main memory capacity,
   * we cannot afford to make an in-core copy of the matrix */
#if 0
  ptrAcpy = LFC_CALLOC( double, desca[8] * nq );
#else
  ptrAcpy = null;
#endif

  p->sli.dptrA = ptrA;
  p->sli.dptrB = ptrB;
  p->sli.dptrX = ptrX;
  p->sli.ptrP = ptrP;
  p->dptrAcpy  = ptrAcpy;
  p->dptrW     = ptrW;
  p->sizeW    = sizeW;

  info = 0;
  if (! ptrA || ! ptrB || ! ptrX || ! ptrP || ! ptrW) info = 1;

  /* Check all processes for error */
  Cigsum2d( ctxt, "All", " ", 1, 1, &info, 1, -1, -1 );
  if (info > 0) {
    msg( p, "Not enough memory." );
    return -1;
    /*
    Cblacs_gridexit(ctxt);
    Cblacs_exit( 0 );
    exit( EXIT_FAILURE );
    */
  }

  if (! ptrAcpy) info = 1;

  /* Check all processes for error */
  Cigsum2d( ctxt, "All", " ", 1, 1, &info, 1, -1, -1 );
  if (info > 0) { /* if one of processes was not able to allocate matrix copy*/
    msg( p, "Not enough memory for in-core copy of A." );
    LFC_FREE( ptrAcpy );
    p->dptrAcpy = ptrAcpy = null;
  }

  msg( p, "Setup successful." );

  return 0;
}	/* dsetup */

static void
dscale_diagonal(PInfo *p, int m, int n, int i, int j, int mb, double *lba) {
  int mn, iloc;

  mn = MAX( m, n );

  if (i <= j && j < i + mb) {
    iloc = j % mb;
    if (i + iloc < m)
      lba[iloc] += mn;
  }
}

/* Reads from a file named FILNAM a matrix and distributes it to the process
  grid. Only the process of coordinates {'irread', 'icread'} reads the file.
  'work' must be of size >= desca[4] (matrix row blocking factor). */
static void
pdlaread(PInfo *p, char *filnam, double *a, int *desca, int irread, int icread,
	 double *work) {
  int h, i, j, k, m, n, npcol, mycol, ctxt, iwork[2], nprow, myrow, ib, jb, mb;
  int ii, jj, lda, icurcol, icurrow, matrnd; double rcp = 1.0 / RAND_MAX;
  FILE *fin = NULL; char fmt[6] = "%d %d";
  int scaleDiag = 0, useBLACS = 1, tag = 1, dest;
  char gridOrder = 'R';
  MPI_Status stat;

  /* Get grid parameters */
  ctxt = desca[1];
  Cblacs_gridinfo( ctxt, &nprow, &npcol, &myrow, &mycol );

  if (irread < 0 && icread < 0) useBLACS = 0;

  /* update `desca' */
  if (! useBLACS) {
    if (myrow == irread && mycol == icread) {
      MPI_Recv( desca, 9, MPI_INT, 1, tag, MPI_COMM_WORLD, &stat );

      /* nprow,npcol are set to -1 for nodes outside of the grid */
      MPI_Recv( iwork, 2, MPI_INT, 1, tag, MPI_COMM_WORLD, &stat );
      nprow = iwork[0];
      npcol = iwork[1];
    } else if (myrow == 0 && mycol == 0) { /* always rank 1 (no matter what ordering) */
      iwork[0] = nprow;
      iwork[1] = npcol;
      MPI_Send( desca, 9, MPI_INT, 0, tag, MPI_COMM_WORLD );
      MPI_Send( iwork, 2, MPI_INT, 0, tag, MPI_COMM_WORLD );
    }
  }

  if (myrow == irread && mycol == icread) {
    if (strncmp( nul_fname, filnam, nul_len ) == 0) {
      iwork[0] = iwork[1] = -1;
    } else {
      if (! filnam || ! (fin = fopen( filnam, "rb" ))) exit( EXIT_FAILURE );
      while (fgetc( fin ) != '\n') ; /* skip the first line */
      fscanf( fin, fmt, iwork, iwork + 1 );
      while (fgetc( fin ) != '\n') ; /* skip the rest of the line */
    }
    if (useBLACS)
      Cigebs2d( ctxt, "All", " ", 2, 1, iwork, 2 );
    else
      MPI_Bcast( iwork, 2, MPI_INT, 0, MPI_COMM_WORLD );
  } else {
    if (useBLACS)
      Cigebr2d( ctxt, "All", " ", 2, 1, iwork, 2, irread, icread );
    else
      MPI_Bcast( iwork, 2, MPI_INT, 0, MPI_COMM_WORLD );
  }
  m = iwork[0];
  n = iwork[1];

  if (m <= 0 || n <= 0) {
      scaleDiag = 1;
      m = desca[2]; n = desca[3]; matrnd = 1;
  } else matrnd = 0;

  if (m > desca[2] || n > desca[3]) {
    msg( p, "PDLAREAD: Matrix too big to fit in\nAbort ...\n" );
    if (useBLACS)
      Cblacs_abort( ctxt, 0 );
    else
      MPI_Abort( MPI_COMM_WORLD, 1 );
  }

  mb = desca[4];
  if (scaleDiag) { /* if no file name */
    scaleDiag = 0;
    if (m == n && (p->opcode == 3 || p->opcode == 4)) /* if Cholesky */
      scaleDiag = 1;
  }

  ii = 0;
  jj = 0;
  icurrow = desca[6];
  icurcol = desca[7];
  lda = desca[8];

  /* Loop over column blocks */
  for (j = 1; j <= n; j += desca[5]) {
    jb = MIN( desca[5], n - j + 1 );
    for (h = 0; h < jb; ++h) {
      /* Loop over block of rows */
      for (i = 1; i <= m; i += desca[4]){
        ib = MIN( desca[4], m - i + 1 );
        if (icurrow == irread && icurcol == icread) {
          if (myrow == irread && mycol == icread) {
            if (matrnd) {
              for (k = 0; k < ib; k++) a[ii+k+(size_t)(jj+h)*lda] = rcp * rand();
              if (scaleDiag) dscale_diagonal( p, m, n, i-1, j+h-1, mb, a + ii+(size_t)(jj+h)*lda );
            } else {
              fread( a + ii + (size_t)(jj + h) * lda, sizeof *a, ib, fin );
            }
          } else {
            if (matrnd) {
              for (k = 0; k < ib; k++) work[k] = rcp * rand();
            }
          }
        } else {
          if (myrow == icurrow && mycol == icurcol) {
            if (matrnd) {
              for (k = 0; k < ib; k++) a[ii+k+(size_t)(jj+h)*lda] = rcp * rand();
              if (scaleDiag) dscale_diagonal( p, m, n, i-1, j+h-1, mb, a + ii+(size_t)(jj+h)*lda );
            } else {
              if (useBLACS)
                Cdgerv2d( ctxt, ib, 1, a + ii + (size_t)(jj + h) * lda, lda, irread, icread );
              else
                MPI_Recv( a + ii + (size_t)(jj + h) * lda, ib, MPI_DOUBLE, 0,
                          tag, MPI_COMM_WORLD, &stat );
              fflush(stdout);
            }
          } else if (myrow == irread && mycol == icread) {
            if (matrnd) {
              for (k = 0; k < ib; k++) work[k] = rcp * rand();
            } else {
              fread( work, sizeof *work, ib, fin );
              if (useBLACS)
                Cdgesd2d( ctxt, ib, 1, work, desca[4], icurrow, icurcol );
              else {
                if ('R' == gridOrder || 'r' == gridOrder)
                  dest = icurrow * p->gq + icurcol + 1;
                else
                  dest = icurrow + icurcol * p->gp + 1;
                MPI_Send( work, ib, MPI_DOUBLE, dest, tag, MPI_COMM_WORLD );
              }
            }
          } else {
            if (matrnd) {
              for (k = 0; k < ib; k++) work[k] = rcp * rand();
            }
          }
        }
        if (myrow == icurrow) ii += ib;
        icurrow = (icurrow + 1) % nprow;
      }
      ii = 0;
      icurrow = desca[6];
    }
    /*if(myrow == 0 && mycol == 0) printf( "%d %d\n", i, j );*/
    if (mycol == icurcol) jj += jb;
    icurcol = (icurcol + 1) % npcol;
  }

  if (myrow == irread && mycol == icread && ! matrnd) fclose( fin );
}	/* pdlaread */

static void
dread_data(PInfo *p, int readX) {
  int randGen;

  srand( 1313 + 13 * (p->sli.myrow + p->gp * p->sli.mycol) );
  srand( 1313 ); /* every node starts with the same random seed */

  TB0( p->sli.ctxt, p->tread );

  /* Read from file and distribute matrices A and B from I/O node */
  pdlaread( p, p->mat_fname, p->sli.dptrA, p->sli.desca, p->IOrow, p->IOcol, p->dptrW );

  if (strncmp( nul_fname, p->mat_fname, nul_len ) == 0 ||
      strncmp( nul_fname, p->rhs_fname, nul_len ) == 0) {
    randGen = 1;
  } else {
    randGen = 0;
  }

  /* if random generation and (it's QR or QR's residual) */
  if (randGen && (p->opcode == 5 || p->opcode == 6)) {
    char trans = e2chTrans(p->trans);
    int i, j, n1, n2, nprhs, nqrhs, c__0 = 0, c__1 = 1;
    int m, n, nrhs, mb, nb, nprow, npcol, myrow, mycol;
    double c_d1 = 1.0, c_d0 = 0.0;

    m     = p->m;
    n     = p->n;
    nrhs  = p->nrhs;
    mb    = p->mb;
    nb    = p->nb;
    nprow = p->gp;
    npcol = p->gq;
    myrow = p->sli.myrow;
    mycol = p->sli.mycol;

    switch (p->trans) {
      case LFC_NoTrans: n1 = m; n2 = n; break;
      case LFC_Trans:   n1 = n; n2 = m; break;
    }

    nqrhs = numroc_( &nrhs, &nb, &mycol, &c__0, &npcol );
    nprhs = p->sli.descx[8];

    if (is_in_grid( p ))
    /* this will take care of unintialized (in the following pdlaread() call)
       parts of X */
    for (j = 0; j < nqrhs; j++)
      for (i = 0; i < nprhs; i++)
        p->sli.dptrX[i + (size_t)j * nprhs] = 1.0;

    /* depending on 'trans' and relation between 'm' and 'n' this will read to
       few or too many data */
    pdlaread( p, p->rhs_fname, p->sli.dptrX, p->sli.descx, p->IOrow, p->IOcol, p->dptrW );

    if (is_in_grid( p ))
    /* generate right hand side - this makes the scaled residual small */
    pdgemm_( &trans, "No transpose", &n1, &nrhs, &n2, &c_d1, p->sli.dptrA, &c__1, &c__1,
	    p->sli.desca, p->sli.dptrX, &c__1, &c__1, p->sli.descx, &c_d0, p->sli.dptrB,
	    &c__1, &c__1, p->sli.descb );

  } else { /* cannot monkey with user data */
    pdlaread( p, p->rhs_fname, p->sli.dptrB, p->sli.descb, p->IOrow, p->IOcol, p->dptrW );
  }

  if (readX)
    pdlaread( p, p->sol_fname, p->sli.dptrX, p->sli.descx, p->IOrow, p->IOcol, p->dptrW );

  if (is_in_grid( p )) {
  TE( p->sli.ctxt, p->tread );
  }

  msg( p, "Reading successful." );
}	/* dread_data */

static void
dcopy_data(PInfo *p) {
  int c__1 = 1;
  int m, n, mn, nrhs;

  m    = p->m;
  n    = p->n;
  nrhs = p->nrhs;

  mn = MAX( m, n );

  TB( p->sli.ctxt, p->tcopy );

  /* Make a copy of A and the rhs for checking purposes */
  if (p->dptrAcpy)
    pdlacpy_( "All", &m, &n,    p->sli.dptrA, &c__1, &c__1, p->sli.desca,
              p->dptrAcpy, &c__1, &c__1, p->sli.desca, 3 );
  pdlacpy_( "All", &mn, &nrhs, p->sli.dptrB, &c__1, &c__1, p->sli.descb,
            p->sli.dptrX, &c__1, &c__1, p->sli.descx, 3 );

  TE( p->sli.ctxt, p->tcopy );

  msg( p, "Copying successful." );
}	/* dcopy_data */

static void
dsolve_system(PInfo *p) {
  int m, n, mn, nrhs, info, ctxt, c__1 = 1, c__0 = 0, c__6 = 6; char *rtn = "unknown";

  ctxt = p->sli.ctxt;
  m = p->m;
  n = p->n;
  mn = MIN(m,n);
  nrhs = p->nrhs;

  if (p->verbose > 1) {
    pdlaprnt_( &m, &n,     p->sli.dptrA, &c__1, &c__1, p->sli.desca, &c__0,
               &c__0, "A", &c__6, p->dptrW, 1 );
    pdlaprnt_( &mn, &nrhs, p->sli.dptrB, &c__1, &c__1, p->sli.descb, &c__0,
               &c__0, "B", &c__6, p->dptrW, 1 );
  }

  TB( ctxt, p->tpdgesv );

  if (1 == p->opcode) {
    /* Call ScaLAPACK PDGESV routine */
    pdgesv_( &p->n, &p->nrhs, p->sli.dptrA, &c__1, &c__1, p->sli.desca,
	    p->sli.ptrP, p->sli.dptrB, &c__1, &c__1, p->sli.descb, &info );
    rtn = "PDGESV";
  } else if (3 == p->opcode) {
    char uplo = e2chUpLo(p->uplo);
    pdposv_( &uplo, &p->n, &p->nrhs, p->sli.dptrA, &c__1, &c__1, p->sli.desca,
	    p->sli.dptrB, &c__1, &c__1, p->sli.descb, &info, 1 );
    rtn = "PDPOSV";
  } else if (5 == p->opcode) {
    int itmp = p->sizeW; char trans = e2chTrans(p->trans);

    pdgels_( &trans, &p->m, &p->n, &p->nrhs, p->sli.dptrA, &c__1, &c__1,
	    p->sli.desca, p->sli.dptrB, &c__1, &c__1, p->sli.descb, p->dptrW, &itmp,
	    &info, 1 );
    rtn = "PDGELS";
  }

  TE( ctxt, p->tpdgesv );

  if (p->verbose > 1)
    pdlaprnt_( &mn, &nrhs, p->sli.dptrB, &c__1, &c__1, p->sli.descb, &c__0,
               &c__0, "X", &c__6, p->dptrW, 1 );

  if (p->verbose)
    if (is_io_proc(p)) {
      printf( "INFO code returned by %s = %d\n", rtn, info );
      fflush( stdout );
    }
}	/* dsolve_system */

static void
pdlawrite(char *filnam, int m, int n, double *a1, int ia, int ja,
	  int *desca, int irwrit, int icwrit, double *work) {
  /* Local variables */
  int h, i, j;
  int iacol, npcol, iarow, mycol;
  int ctxt, nprow, myrow, ib, jb, ii, jj, in, jn;
  int lda, iia, jja, icurcol, icurrow;

  /* char ofmt[] = "%+30.18e\n"; int k; */
  FILE *fout = NULL;

/*-- ScaLAPACK example --
     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
     and University of California, Berkeley.

     written by Antoine Petitet, August 1995 (petitet@cs.utk.edu)

  Purpose

  PDLAWRITE writes to a file named FILNAM a distributed matrix sub( A )
  denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent to and
  written by the process of coordinates (IRWWRITE, ICWRIT).

  WORK must be of size >= MB_ = DESCA( MB_ ). */

/*     Get grid parameters */

  ctxt = desca[1];
  Cblacs_gridinfo(ctxt, &nprow, &npcol, &myrow, &mycol);

  if (myrow == irwrit && mycol == icwrit) {
    if (strncmp( nul_fname, filnam, nul_len ) == 0) i = 1;
    else { i = 0;
    /* FIXME: make graceful exit */
      if (! filnam || ! (fout = fopen( filnam, "w" ))) exit( EXIT_FAILURE );
      fprintf( fout, "LFCMATRIX\n%d %d\n", m, n );
    }
    Cigebs2d( ctxt, "All", " ", 1, 1, &i, 1 );
  } else
    Cigebr2d( ctxt, "All", " ", 1, 1, &i, 1, irwrit, icwrit );

  /* nothing to write */
  if (i) return;

  infog2l_( &ia,  &ja, desca, &nprow, &npcol, &myrow, &mycol, &iia, &jja,
            &iarow, &iacol );
  icurrow = iarow;
  icurcol = iacol;
  iia--;
  ii = iia;
  jj = jja;
  lda = desca[8];

  /* Handle the first block of columns separately */

  jn = ICEIL( ja, desca[5] ) * desca[5];
  jn = MIN( jn, ja + n - 1 );
  jb = jn - ja + 1;
  for (h = 0; h < jb; ++h) {
    in = ICEIL( ia, desca[4] ) * desca[4];
    in = MIN( in, ia + m - 1 );
    ib = in - ia + 1;
    if (icurrow == irwrit && icurcol == icwrit) {
      if (myrow == irwrit && mycol == icwrit) {
        fwrite( a1 + ii + (jj + h - 1) * lda, sizeof *a1, ib, fout );
        /*
        for (k = 0; k < ib; ++k)
          fprintf( fout, ofmt, a1[ii + k + (jj + h - 1) * lda] );
        */
      }
    } else {
      if (myrow == icurrow && mycol == icurcol) {
        Cdgesd2d( ctxt, ib, 1, a1 + ii + (jj + h - 1) * lda, lda, irwrit,
                  icwrit );
      } else if (myrow == irwrit && mycol == icwrit) {
        Cdgerv2d( ctxt, ib, 1, work, desca[4], icurrow, icurcol );
        fwrite( work, sizeof *work, ib, fout );
        /* for (k = 0; k < ib; ++k) fprintf( fout, ofmt, work[k] ); */
      }
    }
    if (myrow == icurrow) ii += ib;
    icurrow = (icurrow + 1) % nprow;
    Cblacs_barrier( ctxt, "All" );

    /* Loop over remaining block of rows */

    for (i = in + 1; desca[4] < 0 ? i >= ia+m-1 : i <= ia+m-1;
         i += desca[4]) {
      ib = MIN( desca[4], ia + m - i );
      if (icurrow == irwrit && icurcol == icwrit) {
        if (myrow == irwrit && mycol == icwrit) {
          fwrite( a1 + ii + (jj + h - 1) * lda, sizeof *a1, ib, fout );
          /*
          for (k = 0; k < ib; ++k)
            fprintf( fout, ofmt, a1[ii + k + (jj + h - 1) * lda] );
          */
        }
      } else {
        if (myrow == icurrow && mycol == icurcol) {
          Cdgesd2d( ctxt, ib, 1, a1 + ii + (jj + h - 1) * lda, lda, irwrit,
                    icwrit );
        } else if (myrow == irwrit && mycol == icwrit) {
          Cdgerv2d( ctxt, ib, 1, work, desca[4], icurrow, icurcol );
          fwrite( work, sizeof *work, ib, fout );
          /* for (k = 0; k < ib; ++k) fprintf( fout, ofmt, work[k] ); */
        }
      }
      if (myrow == icurrow) ii += ib;
      icurrow = (icurrow + 1) % nprow;
      Cblacs_barrier( ctxt, "All" );
    }

    ii = iia;
    icurrow = iarow;
  }

  if (mycol == icurcol) jj += jb;
  icurcol = (icurcol + 1) % npcol;
  Cblacs_barrier( ctxt, "All" );

  /* Loop over remaining column blocks */

  for (j = jn + 1; desca[5] < 0 ? j >= ja+n-1 : j <= ja+n-1; j += desca[5]) {
    jb = MIN( desca[5], ja + n - j );
    for (h = 0; h < jb; ++h) {
      in = ICEIL( ia, desca[4] ) * desca[4];
      in = MIN( in, ia + m - 1 );
      ib = in - ia + 1;
      if (icurrow == irwrit && icurcol == icwrit) {
        if (myrow == irwrit && mycol == icwrit) {
          fwrite( a1 + ii + (jj + h - 1) * lda, sizeof *a1, ib, fout );
          /*
          for (k = 0; k < ib; ++k)
            fprintf( fout, ofmt, a1[ii + k + (jj + h - 1) * lda] );
          */
        }
      } else {
        if (myrow == icurrow && mycol == icurcol) {
          Cdgesd2d( ctxt, ib, 1, a1 + ii + (jj + h - 1) * lda, lda, irwrit,
                    icwrit );
        } else if (myrow == irwrit && mycol == icwrit) {
          Cdgerv2d( ctxt, ib, 1, work, desca[4], icurrow, icurcol);
          fwrite( work, sizeof *work, ib, fout );
          /* for (k = 0; k < ib; ++k) fprintf( fout, ofmt, work[k] ); */
        }
      }
      if (myrow == icurrow) ii += ib;
      icurrow = (icurrow + 1) % nprow;
      Cblacs_barrier( ctxt, "All" );

      /* Loop over remaining block of rows */

      for (i = in + 1; desca[4] < 0 ? i >= ia+m-1 : i <= ia+m-1;
           i += desca[4]) {
        ib = MIN( desca[4], ia + m - i);
        if (icurrow == irwrit && icurcol == icwrit) {
          if (myrow == irwrit && mycol == icwrit) {
            fwrite( a1 + ii + (jj + h - 1) * lda, sizeof *a1, ib, fout );
            /*
            for (k = 0; k < ib; ++k)
              fprintf( fout, ofmt, a1 + ii + k + (jj + h - 1) * lda );
            */
          }
        } else {
          if (myrow == icurrow && mycol == icurcol) {
            Cdgesd2d( ctxt, ib, 1, a1 + ii + (jj + h-1) * lda, lda, irwrit,
                      icwrit );
          } else if (myrow == irwrit && mycol == icwrit) {
            Cdgerv2d( ctxt, ib, 1, work, desca[4], icurrow, icurcol );
            fwrite( work, (sizeof *work), ib, fout );
            /* for (k = 0; k < ib; ++k) fprintf( fout, ofmt, work[k] ); */
          }
        }
        if (myrow == icurrow) ii += ib;
        icurrow = (icurrow + 1) % nprow;
        Cblacs_barrier( ctxt, "All" );
      }

      ii = iia;
      icurrow = iarow;
    }

    if (mycol == icurcol) jj += jb;
    icurcol = (icurcol + 1) % npcol;
    Cblacs_barrier( ctxt, "All" );
  }

  if (myrow == irwrit && mycol == icwrit) fclose( fout );
}	/* pdlawrite */

static void
dwrite_data(PInfo *p) {
  int m, n, n1, n2;

  m = p->m;
  n = p->n;

  switch (p->trans) {
    case LFC_NoTrans: n1 = m; n2 = n; break;
    case LFC_Trans:   n1 = n; n2 = m; break;
  }

  TB( p->sli.ctxt, p->twrite );
  pdlawrite( p->luf_fname, p->m, p->n, p->sli.dptrA, 1, 1, p->sli.desca,
	     0, 0, p->dptrW );
  pdlawrite( p->sol_fname, n2,p->nrhs, p->sli.dptrB, 1, 1, p->sli.descb,
	     0, 0, p->dptrW );
  TE( p->sli.ctxt, p->twrite );
  msg( p, "Writing successful." );
}	/* dwrite_data */

static int
dcompute_residual(PInfo *p) {
  int ctxt, m, n, mn, nrhs, c__1 = 1, myrow, mycol, n1, n2;
  double anorm, xnorm, eps, resid, rnorm;
  double *ptrA;
  double c_d1 = 1.0, c_dN1 = -1.0;
  char uplo, trans;

  m = p->m;
  n = p->n;
  nrhs = p->nrhs;
  ctxt = p->sli.ctxt;
  myrow = p->sli.myrow;
  mycol = p->sli.mycol;

  mn = MAX( m, n );
  uplo  = e2chUpLo( p->uplo );
  trans = e2chTrans( p->trans );
  n1 = n;
  n2 = n;

#if 0
  ptrA = p->dptrAcpy;
  if (! ptrA) {
    read_data( p, 0 );
    ptrA = p->sli.dptrA;
  }
#else
  ptrA = p->sli.dptrA;
#endif

  TB( ctxt, p->tresid );

  switch (p->opcode) {
    case 2:
      pdgemm_( "No transpose", "No transpose", &n, &nrhs, &n, &c_d1, ptrA, &c__1, &c__1,
	      p->sli.desca, p->sli.dptrX, &c__1, &c__1, p->sli.descx, &c_dN1, p->sli.dptrB,
	      &c__1, &c__1, p->sli.descb );
      break;
    case 4:
      pdsymm_( "Left", &uplo, &n, &nrhs, &c_d1, ptrA, &c__1, &c__1,
	       p->sli.desca, p->sli.dptrX, &c__1, &c__1, p->sli.descx, &c_dN1, p->sli.dptrB,
	       &c__1, &c__1, p->sli.descb );
      break;
    case 6:
      switch (p->trans) {
        case LFC_NoTrans: n1 = m; n2 = n; break;
        case LFC_Trans:   n1 = n; n2 = m; break;
      }

      pdgemm_( &trans, "No transpose", &n1, &nrhs, &n2, &c_d1, ptrA, &c__1, &c__1,
	      p->sli.desca, p->sli.dptrX, &c__1, &c__1, p->sli.descx, &c_dN1, p->sli.dptrB,
	      &c__1, &c__1, p->sli.descb );
      break;
  }

  /* Compute residual ||A * X  - B|| / ( ||X|| * ||A|| * eps * MAX( M, N ) ) */
  eps = pdlamch_( &ctxt, "Epsilon", 1 );
  anorm = pdlange_( "I", &m, &n, ptrA, &c__1, &c__1, p->sli.desca, p->dptrW, 1 );
  xnorm = pdlange_( "I", &n2, &nrhs, p->sli.dptrX, &c__1, &c__1, p->sli.descx,
                    p->dptrW, 1 );

  rnorm = pdlange_( "I", &n1, &nrhs, p->sli.dptrB, &c__1, &c__1, p->sli.descb,
                    p->dptrW, 1 );
  resid = rnorm / (anorm * xnorm * eps * mn);

  /*
  if (is_io_proc( p )) {printf( "eps=%g\n||A||=%g\n||X||=%g\n||A*X-B||=%g\n",
  eps, anorm, xnorm, rnorm ); fflush( stdout );} */

  TE( ctxt, p->tresid );

  p->resid = resid;

  return 0;
}	/* dcompute_residual */

static int
dreport_run(PInfo *p, int reportSolve) {
  int i, ctxt, myrow, mycol, gp, gq, mb, nb, rv = 0;
  double resid, work[5], m, n, nrhs, thflps;
  double tol=10.0;

  ctxt  = p->sli.ctxt;
  myrow = p->sli.myrow;
  mycol = p->sli.mycol;
  resid = p->resid;
  gp = p->gp;
  gq = p->gq;
  m = p->m;
  n = p->n;
  nrhs = p->nrhs;
  mb = p->mb;
  nb = p->nb;

  { char s[2*sizeof(int)*3 + 2 + 1]; sprintf( s, ":%dx%d", myrow, mycol );
  LFC_report_node_names( s ); }

  if (! is_in_grid( p )) return 0;
  if (! p->sli.dptrA) return 0;

  work[0] = p->tread;
  work[1] = p->tcopy;
  work[2] = p->tpdgesv;
  work[3] = p->twrite;
  work[4] = p->tresid;
  Cdgamx2d( ctxt, "All", " ", 5, 1, work, 5, &i, &i, -1, 0, 0 );

  if (is_io_proc( p ) && p->verbose) {

    if (! reportSolve) {
      printf( "||A*X-B||/(||X||*||A||*eps*MAX(M,N))=%11.9f\n", resid );
      if (resid < tol) rv = 0; else rv = 1;
      printf( "Answer_correct=%d\n", 1 - rv );
    }

    /* back-ward substitution */
    thflps = n * (n - 1.0) + n - 1.0;  /* lower triangular solve */
    thflps += n * (n - 1.0) + 2.0 * n; /* upper triangular solve */
    thflps *= nrhs;
    /* factorization */
    thflps += 0.5 * n * (n - 1.0);                   /* column scaling */
    thflps += n * (n - 1.0) * (2.0 * n - 1.0) / 3.0; /* updates */

    printf( "m=%g\n", m );
    printf( "n=%g\n", n );
    printf( "nrhs=%g\n", nrhs );
    printf( "mb=%d\n", mb );
    printf( "nb=%d\n", nb );
    printf( "gp=%d\n", gp );
    printf( "gq=%d\n", gq );
    printf( "Reading_time=%g\n",  work[0] );
    /* printf( "Copying_time=%g\n",  work[1] ); */
    if (reportSolve) {
      printf( "Solution_time=%g\n", work[2] );
      printf( "Writing_time=%g\n",  work[3] );
      printf( "Gflops=%g\n", 1e-9  * thflps );
      if (work[2] == -0.0 || work[2] == 0.0) work[2] = 1e-6;
      printf( "Gflops_per_second=%g\n", 1e-9 * thflps / work[2] );
      printf( "Gflops_per_second_per_cpu=%g\n", 1e-9*thflps / work[2] / gp/gq );
    } else {
      printf( "Residual_time=%g\n", work[4] );
    }
  }
  return rv;
}	/* dreport_run */

static void
dfinalize(PInfo *p) {

  LFC_FREE( p->sli.dptrA );
  LFC_FREE( p->dptrAcpy );
  LFC_FREE( p->sli.dptrB );
  LFC_FREE( p->sli.dptrX );
  LFC_FREE( p->sli.ptrP );
  LFC_FREE( p->dptrW );

  /* can only be called by processes in grid */
  if (is_in_grid( p )) Cblacs_gridexit( p->sli.ctxt );

  Cblacs_exit( 0 ); /* 0 means that this machine won't be used any more */
}	/* dfinalize */

int
LFC_dParallelSolve(PInfo *p) {
  int reportSolve = 0, factorize;

  switch (p->opcode) {
    case 2:
    case 4:
    case 6:
      factorize = 0;
      break;
    default:
      factorize = 1;
      break;
  }

  if (dsetup( p ) == 0) { /* if this process is in the process grid */
    if (factorize) { /* call pdgesv()/pdposv()/pdgels(): factorize and solve */
      dread_data( p, 0 );
      dsolve_system( p );
      dwrite_data( p );
      reportSolve = 1;
    } else { /* compute residual */
      dread_data( p, 1 );
      dcompute_residual( p );
      reportSolve = 0;
    }
  } else
    dread_data( p, factorize ? 0 : 1 );

  MPI_Barrier( MPI_COMM_WORLD );

  dreport_run( p, reportSolve );

  dfinalize( p );

  /* called by BLACS
  MPI_Finalize();
  */

  return 0;
}	/* LFC_dParallelSolve */

LFC_END_C_DECLS
