I would like to use ScaLAPACK's parallel LU factorization routine 'pdgetrf'. At the moment I experience difficulties with the creation of matrix descriptor 'descinit' and as a result of that the program crashes with a memory corruption.
Short description of the example setup:
I create a 10x10 matrix A(MxN) and use 2x2 computation blocks MBxNB. The matrix is distributed amongst a 2x2 processor grid (npRow x npCol), i.e. each processor gets a 5x5 matrix block AA(RPP x CPP). The master process (0) allocates the matrix A and initializes it with consequtive numbers from 1 to 100 and sends 5x5 matrix blocks AA to all slave processes (1, 2, 3). The LU factorization routine 'pdgetrf' is called. The matrix blocks AA with the solution are send back to the master process and collected into a new matrix R. Please, correct me if my understanding of the ScaLAPACK logics is wrong and the usage of ScaLAPACK routines is inappropriate.
Source code of the example:
- Code: Select all
PROGRAM LU_FACT_PAR_MINI
IMPLICIT NONE
integer, parameter :: M = 10, N = 10, MB = 2, NB = 2
integer, parameter :: RPP = M/2, CPP = M/2 ! rows/columns per procesor
integer :: npRow = 2, npCol = 2
integer :: iCtxt, iRow, iCol, me, err
integer, parameter :: IA = 1, JA = 1
integer :: info, descA(9), iPiv(N)
double precision, allocatable :: A(:,:), AA(:,:), R(:,:)
integer :: i, j
! external functions
integer, external :: blacs_pnum
! initialize process grid
call sl_init(iCtxt, npRow, npCol)
call blacs_gridinfo(iCtxt, npRow, npCol, iRow, iCol)
! get process ID
me = blacs_pnum(iCtxt, iRow, iCol)
allocate(AA(RPP,CPP), stat=err);
call descinit(descA, M, N, MB, NB, 0, 0, iCtxt, RPP, info)
! master: initialize matrix, send matrix blocks to slaves
if (me == 0) then
allocate(A(M,N), stat=err)
do i = 1, M
do j = 1, N
A(i,j) = (i-1)*N+j
end do
end do
! copy own matrix block
AA = A(1:RPP,1:CPP)
! send matrix blocks to slaves
call dgesd2d(iCtxt, RPP, CPP, A(1:RPP,6:N), RPP, 0, 1)
call dgesd2d(iCtxt, RPP, CPP, A(6:M,1:CPP), RPP, 1, 0)
call dgesd2d(iCtxt, RPP, CPP, A(6:M,6:N), RPP, 1, 1)
! slave: receive matrix block
else
call dgerv2d(iCtxt, RPP, CPP, AA, RPP, 0, 0)
end if
call pdgetrf(M, N, AA, IA, JA, descA, iPiv, info)
if (me == 0) print *, "pdgetrf info:", info
! master: collect result blocks of LU factorization
if (me == 0) then
allocate(R(M,N), stat=err)
R = 0.0
! copy own result block
R(1:RPP,1:CPP) = AA
! receive result blocks from slaves
call dgerv2d(iCtxt, RPP, CPP, R(1:RPP,6:N), RPP, 0, 1)
call dgerv2d(iCtxt, RPP, CPP, R(6:M,1:CPP), RPP, 1, 0)
call dgerv2d(iCtxt, RPP, CPP, R(6:M,6:N), RPP, 1, 1)
! deallocate matrices
deallocate(A, R, stat=err)
! slave: send result block to master
else
call dgesd2d(iCtxt, RPP, CPP, AA, RPP, 0, 0)
end if
deallocate(AA, stat=err)
! finalize
call blacs_gridexit(iCtxt)
call blacs_exit(0)
END PROGRAM LU_FACT_PAR_MINI
Compilers:
- Code: Select all
MPICH Version: 3.2
MPICH Release date: Wed Nov 11 22:06:48 CST 2015
MPICH Device: ch3:nemesis
MPICH configure: --prefix=/opt/mpich-gnu --enable-fast=O3,ndebug --disable-error-checking --without-timing --without-mpit-pvars --enable-fortran=all --enable-romio
MPICH CC: gcc -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DNDEBUG -DNVALGRIND -O3
MPICH CXX: g++ -DNDEBUG -DNVALGRIND -O3
MPICH F77: gfortran -O3
MPICH FC: gfortran -O3
Compilation procedure:
- Code: Select all
mpif90 -g -O0 -fcheck=all -fbacktrace -fcheck=bounds -Wall -c lu_fact_par_mini.f90
mpif90 -g -O0 -fcheck=all -fbacktrace -fcheck=bounds -Wall -L /opt/scalapack/lib -L /opt/lapack/lib -L /opt/blas/lib -L ./ -o lu_fact_par_mini lu_fact_par_mini.o -lscalapack -llapack -lblas -lnagprint
Error message:
- Code: Select all
mpiexec -np 4 ./lu_fact_par_mini
{ 0, 0}: On entry to DESCINIT parameter number 9 had an illegal value
At line 60 of file lu_fact_par_mini.f90
Fortran runtime warning: An array temporary was created
At line 61 of file lu_fact_par_mini.f90
Fortran runtime warning: An array temporary was created
{ 0, 1}: On entry to DESCINIT parameter number 9 had an illegal value
At line 62 of file lu_fact_par_mini.f90
Fortran runtime warning: An array temporary was created
*** Error in `./lu_fact_par_mini': malloc(): memory corruption (fast): 0x0000000001a76220 ***
======= Backtrace: =========
/usr/lib/libc.so.6(+0x6f364)[0x7fd76869d364]
/usr/lib/libc.so.6(+0x74d96)[0x7fd7686a2d96]
/usr/lib/libc.so.6(+0x77344)[0x7fd7686a5344]
/usr/lib/libc.so.6(__libc_malloc+0x54)[0x7fd7686a6924]
/opt/mpich-gnu/lib/libmpi.so.12(MPID_Dataloop_alloc_and_copy+0x6a)[0x7fd769579ada]
/opt/mpich-gnu/lib/libmpi.so.12(MPID_Dataloop_create_contiguous+0x407)[0x7fd76957c977]
/opt/mpich-gnu/lib/libmpi.so.12(MPID_Dataloop_create_vector+0x417)[0x7fd76957ebf7]
/opt/mpich-gnu/lib/libmpi.so.12(MPID_Dataloop_create+0xbbd)[0x7fd76957aced]
/opt/mpich-gnu/lib/libmpi.so.12(MPID_Type_commit+0x94)[0x7fd769589004]
/opt/mpich-gnu/lib/libmpi.so.12(MPI_Type_commit+0x5d)[0x7fd7694cebbd]
./lu_fact_par_mini[0x405636]
./lu_fact_par_mini[0x40e4d6]
./lu_fact_par_mini[0x40cccc]
./lu_fact_par_mini[0x4047f6]
./lu_fact_par_mini[0x402942]
./lu_fact_par_mini[0x403ae8]
/usr/lib/libc.so.6(__libc_start_main+0xf0)[0x7fd76864e710]
./lu_fact_par_mini[0x4017e9]
======= Memory map: ========
00400000-00467000 r-xp 00000000 08:16 799033 /home/mabalenk/repository/git/plasma-exp/scalapack/lu_fact_par_mini
00667000-00668000 rw-p 00067000 08:16 799033 /home/mabalenk/repository/git/plasma-exp/scalapack/lu_fact_par_mini
01a54000-01a97000 rw-p 00000000 00:00 0 [heap]
7fd760000000-7fd760021000 rw-p 00000000 00:00 0
7fd760021000-7fd764000000 ---p 00000000 00:00 0
7fd766c95000-7fd766f36000 rw-p 00000000 00:00 0
7fd766f36000-7fd766f41000 r-xp 00000000 08:06 2624720 /usr/lib/libnss_files-2.23.so
7fd766f41000-7fd767140000 ---p 0000b000 08:06 2624720 /usr/lib/libnss_files-2.23.so
7fd767140000-7fd767141000 r--p 0000a000 08:06 2624720 /usr/lib/libnss_files-2.23.so
7fd767141000-7fd767142000 rw-p 0000b000 08:06 2624720 /usr/lib/libnss_files-2.23.so
7fd767142000-7fd767148000 rw-p 00000000 00:00 0
7fd767148000-7fd768209000 rw-s 00000000 00:13 15330110 /dev/shm/mpich_shar_tmpwOLvsY (deleted)
7fd768209000-7fd768221000 r-xp 00000000 08:06 2624624 /usr/lib/libpthread-2.23.so
7fd768221000-7fd768420000 ---p 00018000 08:06 2624624 /usr/lib/libpthread-2.23.so
7fd768420000-7fd768421000 r--p 00017000 08:06 2624624 /usr/lib/libpthread-2.23.so
7fd768421000-7fd768422000 rw-p 00018000 08:06 2624624 /usr/lib/libpthread-2.23.so
7fd768422000-7fd768426000 rw-p 00000000 00:00 0
7fd768426000-7fd76842d000 r-xp 00000000 08:06 2624728 /usr/lib/librt-2.23.so
7fd76842d000-7fd76862c000 ---p 00007000 08:06 2624728 /usr/lib/librt-2.23.so
7fd76862c000-7fd76862d000 r--p 00006000 08:06 2624728 /usr/lib/librt-2.23.so
7fd76862d000-7fd76862e000 rw-p 00007000 08:06 2624728 /usr/lib/librt-2.23.so
7fd76862e000-7fd7687c6000 r-xp 00000000 08:06 2624649 /usr/lib/libc-2.23.so
7fd7687c6000-7fd7689c5000 ---p 00198000 08:06 2624649 /usr/lib/libc-2.23.so
7fd7689c5000-7fd7689c9000 r--p 00197000 08:06 2624649 /usr/lib/libc-2.23.so
7fd7689c9000-7fd7689cb000 rw-p 0019b000 08:06 2624649 /usr/lib/libc-2.23.so
7fd7689cb000-7fd7689cf000 rw-p 00000000 00:00 0
7fd7689cf000-7fd768a0d000 r-xp 00000000 08:06 2624996 /usr/lib/libquadmath.so.0.0.0
7fd768a0d000-7fd768c0d000 ---p 0003e000 08:06 2624996 /usr/lib/libquadmath.so.0.0.0
7fd768c0d000-7fd768c0e000 rw-p 0003e000 08:06 2624996 /usr/lib/libquadmath.so.0.0.0
7fd768c0e000-7fd768c24000 r-xp 00000000 08:06 2624968 /usr/lib/libgcc_s.so.1
7fd768c24000-7fd768e23000 ---p 00016000 08:06 2624968 /usr/lib/libgcc_s.so.1
7fd768e23000-7fd768e24000 rw-p 00015000 08:06 2624968 /usr/lib/libgcc_s.so.1
7fd768e24000-7fd768f27000 r-xp 00000000 08:06 2624725 /usr/lib/libm-2.23.so
7fd768f27000-7fd769127000 ---p 00103000 08:06 2624725 /usr/lib/libm-2.23.so
7fd769127000-7fd769128000 r--p 00103000 08:06 2624725 /usr/lib/libm-2.23.so
7fd769128000-7fd769129000 rw-p 00104000 08:06 2624725 /usr/lib/libm-2.23.so
7fd769129000-7fd769251000 r-xp 00000000 08:06 2624977 /usr/lib/libgfortran.so.3.0.0
7fd769251000-7fd769451000 ---p 00128000 08:06 2624977 /usr/lib/libgfortran.so.3.0.0
7fd769451000-7fd769453000 rw-p 00128000 08:06 2624977 /usr/lib/libgfortran.so.3.0.0
7fd769453000-7fd769629000 r-xp 00000000 08:06 542411 /opt/mpich-gnu/lib/libmpi.so.12.1.0
7fd769629000-7fd769828000 ---p 001d6000 08:06 542411 /opt/mpich-gnu/lib/libmpi.so.12.1.0
7fd769828000-7fd76983a000 rw-p 001d5000 08:06 542411 /opt/mpich-gnu/lib/libmpi.so.12.1.0
7fd76983a000-7fd769874000 rw-p 00000000 00:00 0
7fd769874000-7fd7698aa000 r-xp 00000000 08:06 542415 /opt/mpich-gnu/lib/libmpifort.so.12.1.0
7fd7698aa000-7fd769aaa000 ---p 00036000 08:06 542415 /opt/mpich-gnu/lib/libmpifort.so.12.1.0
7fd769aaa000-7fd769aab000 rw-p 00036000 08:06 542415 /opt/mpich-gnu/lib/libmpifort.so.12.1.0
7fd769aab000-7fd769ace000 r-xp 00000000 08:06 2624648 /usr/lib/ld-2.23.so
7fd769adc000-7fd769c90000 rw-p 00000000 00:00 0
7fd769ccc000-7fd769cce000 rw-p 00000000 00:00 0
7fd769cce000-7fd769ccf000 r--p 00023000 08:06 2624648 /usr/lib/ld-2.23.so
7fd769ccf000-7fd769cd0000 rw-p 00024000 08:06 2624648 /usr/lib/ld-2.23.so
7fd769cd0000-7fd769cd1000 rw-p 00000000 00:00 0
7ffdaec99000-7ffdaecbc000 rw-p 00000000 00:00 0 [stack]
7ffdaecea000-7ffdaecec000 r--p 00000000 00:00 0 [vvar]
7ffdaecec000-7ffdaecee000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Program received signal SIGABRT: Process abort signal.
Backtrace for this error:

