using iso_c_binding to pass the pointer from Fortran. Are there any suggestions as to what could be causing the performance decrease?
When I run the testing_magma_zgetrf example, I get the performance you would expect: the GPU is faster.
I have attached a snippet of my code; it is a little messy from debugging.
Code: Select all
module Gpu_infc
use, intrinsic :: iso_c_binding
implicit none

!! Module-level GPU state.

! Fortran view of the page-locked host matrix. Gpu_infc_init associates it
! with the memory addressed by cptr_h_A via c_f_pointer. It starts
! disassociated so that associated(h_A) is well defined before that call.
complex(kind=8), dimension(:,:), pointer :: h_A => null()
! C address of the host buffer, filled in by cudaMallocHost in Gpu_infc_init.
type(C_PTR) :: cptr_h_A = C_NULL_PTR
! Device pointer stored as a 64-bit integer handle.
! NOTE(review): Gpu_infc_init has dummy arguments named d_A and ldda that
! shadow these module variables inside that routine.
integer(kind=8) :: d_A
! Presumably the leading dimension of the device matrix - TODO confirm.
integer(kind=8) :: ldda
! Size in bytes of one double-precision complex element.
integer(C_SIZE_T), parameter :: sizeof_complex = 16_C_SIZE_T
integer, parameter :: fp_kind = kind(0.0d0) ! Double precision
! Explicit C interfaces to the CUDA runtime routines used by this module.
interface
  ! cudaMallocHost: allocate page-locked host memory.
  !   buffer - receives the address of the allocation (C void**)
  !   size   - requested size in bytes
  !   flag   - cudaHostAlloc* flags (cudaHostAllocPortable=1, cudaHostAllocMapped=2)
  ! Returns the cudaError_t status (0 on success).
  !
  ! The C entry point named cudaMallocHost takes only (void**, size_t); the
  ! three-argument form exists solely as a C++ overload. Binding a
  ! three-argument interface to that symbol silently drops `flag`, so the
  ! interface is bound to cudaHostAlloc, the C function that accepts flags.
  ! The Fortran name is kept so existing callers are unaffected.
  integer (C_INT) function cudaMallocHost(buffer, size, flag) bind(C,name="cudaHostAlloc")
    use, intrinsic :: iso_c_binding, only: C_INT, C_PTR, C_SIZE_T
    implicit none
    type (C_PTR), intent(out) :: buffer
    integer (C_SIZE_T), value :: size
    integer (C_INT), value :: flag
  end function cudaMallocHost

  ! cudaFreeHost: release memory obtained from cudaMallocHost.
  ! Returns the cudaError_t status (0 on success).
  integer (C_INT) function cudaFreeHost(buffer) bind(C,name="cudaFreeHost")
    use, intrinsic :: iso_c_binding, only: C_INT, C_PTR
    implicit none
    type (C_PTR), value :: buffer
  end function cudaFreeHost

  ! cudaSetDeviceFlags: set device flags (e.g. cudaDeviceMapHost=8).
  ! The result is declared with kind C_INT so that it matches the C return
  ! type even when the default integer kind is changed by compiler options.
  integer (C_INT) function cudaSetDeviceFlags(flag) bind(C,name="cudaSetDeviceFlags")
    use, intrinsic :: iso_c_binding, only: C_INT
    implicit none
    integer (C_INT), value :: flag
  end function cudaSetDeviceFlags
end interface
!include 'mpif.h'
contains
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! Gpu_infc_init - Allocates gpu variables
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!> Initialize CUBLAS and allocate the page-locked host matrix h_A.
!!
!! Steps: call cublas_Init_, print the device list on rank 0, set the CUDA
!! device flags, allocate Np*Np*size_fp bytes of host memory through the
!! module's cudaMallocHost interface, and associate the module pointer h_A
!! with that memory as an Np x Np array.
!!
!! Arguments:
!!   d_A     - declared intent(out) but not assigned by this routine
!!             (NOTE(review): no device allocation happens here; confirm
!!             whether the caller expects a value).
!!   size_fp - size in bytes of one matrix element.
!!   ldda    - not modified by this routine.
!!
!! Np, rank, nproc and Errors_cublas are taken from the enclosing scope.
subroutine Gpu_infc_init(d_A,size_fp,ldda)
  integer :: cublas_Init_
  external :: cublas_Init_, printout_devices
  integer(kind=8), intent(in) :: size_fp
  integer(kind=8), intent(out) :: d_A
  integer(kind=8), intent(inout) :: ldda
  !complex(kind=8), dimension(:,:), intent(inout) :: h_A
  ! CUDA runtime flag values
  integer(C_INT), parameter :: cuda_device_map_host   = 8  ! cudaDeviceMapHost
  integer(C_INT), parameter :: cuda_host_alloc_mapped = 2  ! cudaHostAllocMapped
  character(len=100) :: var
  integer :: stat, Np_t
  integer(C_SIZE_T) :: n_bytes

  if (rank .eq. 0) write(*,*) ' ** Initializing GPU Cards'
  stat = cublas_Init_() ! Initialize GPUs
  write(var,*) 'cublas_init - rank',rank
  call Errors_cublas(stat,var)
  if (rank .eq. 0) then
    call printout_devices()
    write(*,*) ' Number of GPUs used per node:',nproc
  end if

  !! Allocate CPU host memory
  ! Report a failure here instead of silently overwriting the status with
  ! the result of the allocation below.
  stat = cudaSetDeviceFlags(cuda_device_map_host)
  if (stat /= 0) write(*,*) 'cudaSetDeviceFlags =',stat

  ! Convert every factor before multiplying so the byte count is computed
  ! in size_t arithmetic and Np*Np cannot overflow a narrower integer kind.
  n_bytes = int(Np, C_SIZE_T) * int(Np, C_SIZE_T) * int(size_fp, C_SIZE_T)
  stat = cudaMallocHost(cptr_h_A, n_bytes, cuda_host_alloc_mapped)
  write(*,*) 'cudaMallocHost =',stat

  if (stat == 0) then
    Np_t = Np ! c_f_pointer shape is given as default integers
    call c_f_pointer(cptr_h_A, h_A, [Np_t, Np_t])
  else
    ! Never build a Fortran view on top of a failed allocation.
    nullify(h_A)
  end if

  ! Build the label on every rank: Errors_cublas is called on every rank and
  ! would otherwise report an allocation failure under the stale
  ! 'cublas_init' label everywhere except rank 0.
  write(var,*) 'cublas_alloc - rank',rank
  call Errors_cublas(stat,var)
end subroutine Gpu_infc_init
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! Gpu_infc_LU2 - Times an LU factorization with MAGMA (magma_zgetrf) and with LAPACK (zgetrf_)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!> LU-factorize h_A in place with MAGMA and time it against LAPACK.
!!
!! On return h_A and ipiv_gpu hold the result of the magma_zgetrf call.
!! The LAPACK reference run (zgetrf_) works on a private snapshot of the
!! input matrix taken before the MAGMA call, so both libraries factorize
!! the same data and the reference run does not overwrite MAGMA's factors
!! or pivots. If the snapshot cannot be allocated the LAPACK timing is
!! skipped.
!!
!! Arguments:
!!   h_A      - matrix to factorize; overwritten by magma_zgetrf.
!!   lda      - leading dimension passed to magma_zgetrf for h_A.
!!   ldda     - not used by this routine.
!!   ipiv_gpu - pivot indices written by magma_zgetrf.
!!
!! The matrix order Np is taken from the enclosing scope.
subroutine Gpu_infc_LU2(h_A,lda,ldda,ipiv_gpu)
  integer :: magma_zgetrf
  real(kind=8) :: Mpi_Wtime
  external :: magma_zgetrf, Mpi_Wtime
  integer(kind=8), intent(in) :: lda, ldda
  integer(kind=4), dimension(:), intent(out) :: ipiv_gpu
  complex(kind=8), dimension(:,:), intent(inout) :: h_A
  ! Dimension arguments use the same 32-bit kind as ipiv_gpu and info so
  ! that every integer handed to the external routines has one width.
  ! NOTE(review): assumes 32-bit-integer (LP64) builds of MAGMA and LAPACK,
  ! which is what the kind=4 pivot array already implies - confirm.
  integer(kind=4) :: m, n, lda_t, ld_ref
  integer(kind=4) :: stat, info, info_ref
  integer :: alloc_stat
  complex(kind=8), allocatable :: a_ref(:,:)
  integer(kind=4), allocatable :: ipiv_ref(:)
  ! Assigned before use instead of initialized in the declaration, which
  ! would give the variables the SAVE attribute.
  real(kind=8) :: t_start, t_elapsed

  m = int(Np, kind=4)
  n = m
  lda_t = int(lda, kind=4)

  ! Snapshot the unfactorized matrix for the LAPACK reference run. This is
  ! done outside both timed regions.
  allocate(a_ref(size(h_A,1), size(h_A,2)), ipiv_ref(size(ipiv_gpu)), stat=alloc_stat)
  if (alloc_stat == 0) a_ref = h_A

  !!Magma call doesn't require external set matrix
  ! NOTE: without a prior warm-up call, this timing includes whatever
  ! one-time start-up cost the first MAGMA call incurs.
  t_start = MPI_Wtime()
  stat = magma_zgetrf(m, n, h_A, lda_t, ipiv_gpu, info) ! Compute on gpu
  t_elapsed = MPI_Wtime() - t_start
  write(*,*) 'stat2 = ',stat,info
  write(*,'(A,F12.6)') 'magma time: ',t_elapsed

  if (alloc_stat /= 0) then
    write(*,*) 'lapack time: skipped (could not allocate reference copy)'
    return
  end if

  ! The snapshot is a freshly allocated array, so its leading dimension is
  ! its own first extent.
  ld_ref = max(1, size(a_ref,1))
  t_start = MPI_Wtime()
  call zgetrf_(m, n, a_ref, ld_ref, ipiv_ref, info_ref)
  t_elapsed = MPI_Wtime() - t_start
  if (info_ref /= 0) write(*,*) 'lapack info = ',info_ref
  write(*,'(A,F12.6)') 'lapack time: ',t_elapsed
end subroutine Gpu_infc_LU2
Code: Select all
Matrix Size: N, N*M = 1501 2253001
stat1 = 0 0
stat2 = 0 0
magma time: 0.111086
info = 0
lapack time: 0.031947