I have implemented pdsyevd to do eigenvalue-eigenfunction decomposition for a symmetric matrix. But the size of the matrix is huge, I put it into a vector excluding repeating elements to reduce the memory, and the final requested memory reaches to 20Gb. In other words, if the matrix is 100 x 100, then I put 100 (first column) + 99 + 98 + .... +1 (last column) elements into a vector.
Following the post below, I succeeded to get correct eigenvalues and eigenfunctions, and the subroutine in my code where parallelizing is done with Scalapack is attached at the end, where the 20Gb vector is saved in the array of b.
viewtopic.php?f=2&t=2913&p=7803&hilit=pdsyevd#p7803
My problem is that the every node reads the big vector b before distribution through PDELSET, so the code requires more memory as I use more nodes.
This is not ideal for me since I want to use hundreds of nodes due to long run time of about 2 weeks in a serial run.
Could you anyone give some tips for this problem to me?
I'm frustrated, so I'm desperate for any help.
Thank you.
Sung-Joon
- Code: Select all
Subroutine diaghscalapack(b,n,eigenval)
implicit none
integer, parameter :: MAXLINV=40
integer, parameter :: MXLENM=(MAXLINV+1)**2
integer, parameter :: MAXNRD=43
integer(8), parameter :: NDIMATA=MAXNRD*MXLENM
integer(8), parameter :: MXNATA=(NDIMATA*(NDIMATA+1))/2
integer(8) :: ind,ind1
integer :: lwork,maxn,liwork,isum,jsum
double precision :: rtmp(4)
integer :: itmp(4)
integer :: contxt,i,j,iam,info,mycol,myrow,n,nb,mb,npcol,nprocs,nprow,lnrowsa,lncolsa,sqrtnp,iu
integer :: desca(9),descz(9)
double precision :: b(mxnata),eigenval(ndimata)
double precision, allocatable :: a(:,:),z(:,:),work(:)
integer, allocatable :: iwork(:)
integer :: numroc
external :: BLACS_EXIT,BLACS_GET,BLACS_GRIDEXIT,BLACS_GRIDINFO,BLACS_GRIDINIT
external :: BLACS_PINFO,BLACS_SETUP,DESCINIT,PDSYEVD,NUMROC
external :: DGEBS2D,DGEBR2D
! Initialize the BLACS: refer to user's guide or lbll manual
call BLACS_PINFO(iam,nprocs)
! Calculating the number of row, NPROW, and columns, NPCOL of the process grid
call GRIDSETUP(nprocs,nprow,npcol)
! Initialize a single BLACS context
call BLACS_GET(-1,0,contxt)
call BLACS_GRIDINIT(contxt,'Row',nprow,npcol)
call BLACS_GRIDINFO(contxt,nprow,npcol,myrow,mycol)
! Calculating the block sizes
mb=0
nb=min(n/nprow,n/npcol)
if (mb.gt.0) then
nb=min(nb,mb)
endif
nb=max(nb,1)
! Calculate the number of rows, LNROWSA, and the number of columns, LNCOLSA, for the local matrices A
lnrowsa=NUMROC(n,nb,myrow,0,nprow)
lncolsa=NUMROC(n,nb,mycol,0,npcol)
allocate(a(lnrowsa,lncolsa),z(lnrowsa,lncolsa))
a=0.0d0
! Distribute the global b onto the local matrices a
call DESCINIT(desca,n,n,nb,nb,0,0,contxt,lnrowsa,info)
jsum=0
do j=1,n
isum=0
do i=1,n
if (i.ge.j) then
ind=dble(j-1)*dble(n)+dble(i)-dble(jsum)
call PDELSET(a,i,j,desca,b(ind))
else
! Since b is supposed to be symmetric but only contains
! true matrix elements in the lower triangular part
ind1=dble(i-1)*dble(n)+dble(j)-dble(isum)
call PDELSET(a,i,j,desca,b(ind1))
endif
isum=isum+i
enddo
jsum=jsum+j
enddo
call DESCINIT(descz,n,n,nb,nb,0,0,contxt,lnrowsa,info)
! ask pdsyevd to compute the entire eigendecomposition
lwork=-1
liwork=1
rtmp(:)=0.0d0
itmp(:)=0
call pdsyevd('V','U',n,a,1,1,desca,eigenval,z,1,1,descz,rtmp,lwork,itmp,liwork,info)
lwork=max(500000,2*int(rtmp(1))+1)
liwork=max(8*n,itmp(1)+1)
allocate(iwork(liwork),work(lwork))
call pdsyevd('V','U',n,a,1,1,desca,eigenval,z,1,1,descz,work,lwork,iwork,liwork,info)
! Print out the eigenvectors
call PDLAPRNTSJ(n,n,z,1,1,descz,0,0,'z',work,21)
call BLACS_GRIDEXIT(contxt)
call BLACS_EXIT(0)
end subroutine diaghscalapack
!------------------------------------------------------------------
Subroutine gridsetup(nproc,nprow,npcol)
!
! This subroutine factorizes the number of processors (nproc) into
! nprow and npcol, which are the sizes of the 2D processors mesh.
!
implicit none
integer nproc,nprow,npcol
integer sqrtnp,i
sqrtnp=int(sqrt(dble(nproc))+1)
do i=1,sqrtnp
if(mod(nproc,i).eq.0) npcol=i
enddo
nprow=nproc/npcol
return
end subroutine gridsetup
!-------------------------------------------------------------------
SUBROUTINE PDLAPRNTSJ( M, N, A, IA, JA, DESCA, IRPRNT, ICPRNT,
$ CMATNM, WORK, NOUT )
*
* -- ScaLAPACK tools routine (version 1.7) --
* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
* and University of California, Berkeley.
* May 1, 1997
*
* .. Scalar Arguments ..
INTEGER IA, ICPRNT, IRPRNT, JA, M, N, NOUT
* ..
* .. Array Arguments ..
CHARACTER*(*) CMATNM
INTEGER DESCA( * )
DOUBLE PRECISION A( * ), WORK( * )
* ..
*
* Purpose
* =======
*
* PDLAPRNT prints to the standard output a distributed matrix sub( A )
* denoting A(IA:IA+M-1,JA:JA+N-1). The local pieces are sent and
* printed by the process of coordinates (IRPRNT, ICPRNT).
*
* Notes
* =====
*
* Each global data object is described by an associated description
* vector. This vector stores the information required to establish
* the mapping between an object element and its corresponding process
* and memory location.
*
* Let A be a generic term for any 2D block cyclicly distributed array.
* Such a global array has an associated description vector DESCA.
* In the following comments, the character _ should be read as
* "of the global array".
*
* NOTATION STORED IN EXPLANATION
* --------------- -------------- --------------------------------------
* DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
* DTYPE_A = 1.
* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
* the BLACS process grid A is distribu-
* ted over. The context itself is glo-
* bal, but the handle (the integer
* value) may vary.
* M_A (global) DESCA( M_ ) The number of rows in the global
* array A.
* N_A (global) DESCA( N_ ) The number of columns in the global
* array A.
* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
* the rows of the array.
* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
* the columns of the array.
* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
* row of the array A is distributed.
* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
* first column of the array A is
* distributed.
* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
* array. LLD_A >= MAX(1,LOCr(M_A)).
*
* Let K be the number of rows or columns of a distributed matrix,
* and assume that its process grid has dimension p x q.
* LOCr( K ) denotes the number of elements of K that a process
* would receive if K were distributed over the p processes of its
* process column.
* Similarly, LOCc( K ) denotes the number of elements of K that a
* process would receive if K were distributed over the q processes of
* its process row.
* The values of LOCr() and LOCc() may be determined via a call to the
* ScaLAPACK tool function, NUMROC:
* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
* An upper bound for these quantities may be computed by:
* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
*
* Arguments
* =========
*
* M (global input) INTEGER
* The number of rows to be operated on i.e the number of rows
* of the distributed submatrix sub( A ). M >= 0.
*
* N (global input) INTEGER
* The number of columns to be operated on i.e the number of
* columns of the distributed submatrix sub( A ). N >= 0.
*
* A (local input) DOUBLE PRECISION pointer into the local memory to a
* local array of dimension (LLD_A, LOCc(JA+N-1) ) containing
* the local pieces of the distributed matrix sub( A ).
*
* IA (global input) INTEGER
* The row index in the global array A indicating the first
* row of sub( A ).
*
* JA (global input) INTEGER
* The column index in the global array A indicating the
* first column of sub( A ).
*
* DESCA (global and local input) INTEGER array of dimension DLEN_.
* The array descriptor for the distributed matrix A.
*
* IRPRNT (global input) INTEGER
* The row index of the printing process.
*
* ICPRNT (global input) INTEGER
* The column index of the printing process.
*
* CMATNM (global input) CHARACTER*(*)
* Identifier of the distributed matrix to be printed.
*
* NOUT (global input) INTEGER
* The unit number for output file. NOUT = 6, ouput to screen,
* NOUT = 0, output to stderr.
*
* WORK (local workspace) DOUBLE PRECISION
* Working array of minimum size equal to MB_A.
*
* =====================================================================
*
* .. Parameters ..
INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
$ LLD_, MB_, M_, NB_, N_, RSRC_
PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
$ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
$ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
* ..
* .. Local Scalars ..
INTEGER H, I, IACOL, IAROW, IB, ICTXT, ICURCOL,
$ ICURROW, II, IIA, IN, J, JB, JJ, JJA, JN, K,
$ LDA, MYCOL, MYROW, NPCOL, NPROW
* ..
* .. External Subroutines ..
EXTERNAL BLACS_BARRIER, BLACS_GRIDINFO, INFOG2L,
$ DGERV2D, DGESD2D
* ..
* .. External Functions ..
INTEGER ICEIL
EXTERNAL ICEIL
* ..
* .. Intrinsic Functions ..
INTRINSIC MIN
* ..
* .. Executable Statements ..
*
* Get grid parameters
*
ICTXT = DESCA( CTXT_ )
CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*
CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL,
$ IIA, JJA, IAROW, IACOL )
ICURROW = IAROW
ICURCOL = IACOL
II = IIA
JJ = JJA
LDA = DESCA( LLD_ )
*
* Handle the first block of column separately
*
JN = MIN( ICEIL( JA, DESCA( NB_ ) ) * DESCA( NB_ ), JA+N-1 )
JB = JN-JA+1
DO 60 H = 0, JB-1
IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 )
IB = IN-IA+1
IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN
IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN
DO 10 K = 0, IB-1
WRITE( NOUT, * )
$ IA+K, JA+H, A( II+K+(JJ+H-1)*LDA )
10 CONTINUE
END IF
ELSE
IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN
CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ), LDA,
$ IRPRNT, ICPRNT )
ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN
CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ),
$ ICURROW, ICURCOL )
DO 20 K = 1, IB
WRITE( NOUT, * )
$ IA+K-1, JA+H, WORK( K )
20 CONTINUE
END IF
END IF
IF( MYROW.EQ.ICURROW )
$ II = II + IB
ICURROW = MOD( ICURROW+1, NPROW )
CALL BLACS_BARRIER( ICTXT, 'All' )
*
* Loop over remaining block of rows
*
DO 50 I = IN+1, IA+M-1, DESCA( MB_ )
IB = MIN( DESCA( MB_ ), IA+M-I )
IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN
IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN
DO 30 K = 0, IB-1
WRITE( NOUT, * )
$ I+K, JA+H, A( II+K+(JJ+H-1)*LDA )
30 CONTINUE
END IF
ELSE
IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN
CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ),
$ LDA, IRPRNT, ICPRNT )
ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN
CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ),
$ ICURROW, ICURCOL )
DO 40 K = 1, IB
WRITE( NOUT, * )
$ I+K-1, JA+H, WORK( K )
40 CONTINUE
END IF
END IF
IF( MYROW.EQ.ICURROW )
$ II = II + IB
ICURROW = MOD( ICURROW+1, NPROW )
CALL BLACS_BARRIER( ICTXT, 'All' )
50 CONTINUE
*
II = IIA
ICURROW = IAROW
60 CONTINUE
*
IF( MYCOL.EQ.ICURCOL )
$ JJ = JJ + JB
ICURCOL = MOD( ICURCOL+1, NPCOL )
CALL BLACS_BARRIER( ICTXT, 'All' )
*
* Loop over remaining column blocks
*
DO 130 J = JN+1, JA+N-1, DESCA( NB_ )
JB = MIN( DESCA( NB_ ), JA+N-J )
DO 120 H = 0, JB-1
IN = MIN( ICEIL( IA, DESCA( MB_ ) ) * DESCA( MB_ ), IA+M-1 )
IB = IN-IA+1
IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN
IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN
DO 70 K = 0, IB-1
WRITE( NOUT, * )
$ IA+K, J+H, A( II+K+(JJ+H-1)*LDA )
70 CONTINUE
END IF
ELSE
IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN
CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ),
$ LDA, IRPRNT, ICPRNT )
ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN
CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ),
$ ICURROW, ICURCOL )
DO 80 K = 1, IB
WRITE( NOUT, * )
$ IA+K-1, J+H, WORK( K )
80 CONTINUE
END IF
END IF
IF( MYROW.EQ.ICURROW )
$ II = II + IB
ICURROW = MOD( ICURROW+1, NPROW )
CALL BLACS_BARRIER( ICTXT, 'All' )
*
* Loop over remaining block of rows
*
DO 110 I = IN+1, IA+M-1, DESCA( MB_ )
IB = MIN( DESCA( MB_ ), IA+M-I )
IF( ICURROW.EQ.IRPRNT .AND. ICURCOL.EQ.ICPRNT ) THEN
IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN
DO 90 K = 0, IB-1
WRITE( NOUT, * )
$ I+K, J+H, A( II+K+(JJ+H-1)*LDA )
90 CONTINUE
END IF
ELSE
IF( MYROW.EQ.ICURROW .AND. MYCOL.EQ.ICURCOL ) THEN
CALL DGESD2D( ICTXT, IB, 1, A( II+(JJ+H-1)*LDA ),
$ LDA, IRPRNT, ICPRNT )
ELSE IF( MYROW.EQ.IRPRNT .AND. MYCOL.EQ.ICPRNT ) THEN
CALL DGERV2D( ICTXT, IB, 1, WORK, DESCA( MB_ ),
$ ICURROW, ICURCOL )
DO 100 K = 1, IB
WRITE( NOUT, * )
$ I+K-1, J+H, WORK( K )
100 CONTINUE
END IF
END IF
IF( MYROW.EQ.ICURROW )
$ II = II + IB
ICURROW = MOD( ICURROW+1, NPROW )
CALL BLACS_BARRIER( ICTXT, 'All' )
110 CONTINUE
*
II = IIA
ICURROW = IAROW
120 CONTINUE
*
IF( MYCOL.EQ.ICURCOL )
$ JJ = JJ + JB
ICURCOL = MOD( ICURCOL+1, NPCOL )
CALL BLACS_BARRIER( ICTXT, 'All' )
*
130 CONTINUE
*
9999 FORMAT(A,'(',I6,',',I6,')=',D30.18)
*
RETURN
*
* End of PDLAPRNT
*
END

