MAGMA 2.9.0
Matrix Algebra for GPU and Multicore Architectures
Loading...
Searching...
No Matches

Functions

magma_int_t magma_cgetrf (magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
 CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_cgetrf2_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaFloatComplex_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv, magmaFloatComplex_ptr d_lAP[], magmaFloatComplex *W, magma_int_t ldw, magma_queue_t queues[][2], magma_int_t *info)
 CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
bool magma_cgetrf_gpu_recommend_cpu (magma_int_t m, magma_int_t n, magma_int_t nb)
 magma_cgetrf_gpu_recommend_cpu returns true if magma_cgetrf_gpu is going to use the CPU only for performing the LU factorization.
 
bool magma_cgetrf_native_recommend_notrans (magma_int_t m, magma_int_t n, magma_int_t nb)
 magma_cgetrf_native_recommend_notrans returns true if magma_cgetrf_native is going to perform the LU factorization without transposing the matrix.
 
magma_int_t magma_cgetrf_expert_gpu_work (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_mode_t mode, magma_int_t nb, magma_int_t recnb, void *host_work, magma_int_t *lwork_host, void *device_work, magma_int_t *lwork_device, magma_event_t events[2], magma_queue_t queues[2])
 CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_cgetrf_gpu_expert (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_int_t nb, magma_mode_t mode)
 magma_cgetrf_gpu_expert is similar to magma_cgetrf_expert_gpu_work except that all workspaces/queues are handled internally.
 
magma_int_t magma_cgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 magma_cgetrf_expert_gpu_work with mode = MagmaHybrid.
 
magma_int_t magma_cgetrf_native (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 magma_cgetrf_expert_gpu_work with mode = MagmaNative.
 
magma_int_t magma_cgetrf_m (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
 CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_cgetrf_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_dgetrf (magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
 DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_dgetrf2_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaDouble_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv, magmaDouble_ptr d_lAP[], double *W, magma_int_t ldw, magma_queue_t queues[][2], magma_int_t *info)
 DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
bool magma_dgetrf_gpu_recommend_cpu (magma_int_t m, magma_int_t n, magma_int_t nb)
 magma_dgetrf_gpu_recommend_cpu returns true if magma_dgetrf_gpu is going to use the CPU only for performing the LU factorization.
 
bool magma_dgetrf_native_recommend_notrans (magma_int_t m, magma_int_t n, magma_int_t nb)
 magma_dgetrf_native_recommend_notrans returns true if magma_dgetrf_native is going to perform the LU factorization without transposing the matrix.
 
magma_int_t magma_dgetrf_expert_gpu_work (magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_mode_t mode, magma_int_t nb, magma_int_t recnb, void *host_work, magma_int_t *lwork_host, void *device_work, magma_int_t *lwork_device, magma_event_t events[2], magma_queue_t queues[2])
 DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_dgetrf_gpu_expert (magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_int_t nb, magma_mode_t mode)
 magma_dgetrf_gpu_expert is similar to magma_dgetrf_expert_gpu_work except that all workspaces/queues are handled internally.
 
magma_int_t magma_dgetrf_gpu (magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 magma_dgetrf_expert_gpu_work with mode = MagmaHybrid.
 
magma_int_t magma_dgetrf_native (magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 magma_dgetrf_expert_gpu_work with mode = MagmaNative.
 
magma_int_t magma_dgetrf_m (magma_int_t ngpu, magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
 DGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_dgetrf_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDouble_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_sgetrf (magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
 SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_sgetrf2_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaFloat_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv, magmaFloat_ptr d_lAP[], float *W, magma_int_t ldw, magma_queue_t queues[][2], magma_int_t *info)
 SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
bool magma_sgetrf_gpu_recommend_cpu (magma_int_t m, magma_int_t n, magma_int_t nb)
 magma_sgetrf_gpu_recommend_cpu returns true if magma_sgetrf_gpu is going to use the CPU only for performing the LU factorization.
 
bool magma_sgetrf_native_recommend_notrans (magma_int_t m, magma_int_t n, magma_int_t nb)
 magma_sgetrf_native_recommend_notrans returns true if magma_sgetrf_native is going to perform the LU factorization without transposing the matrix.
 
magma_int_t magma_sgetrf_expert_gpu_work (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_mode_t mode, magma_int_t nb, magma_int_t recnb, void *host_work, magma_int_t *lwork_host, void *device_work, magma_int_t *lwork_device, magma_event_t events[2], magma_queue_t queues[2])
 SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_sgetrf_gpu_expert (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_int_t nb, magma_mode_t mode)
 magma_sgetrf_gpu_expert is similar to magma_sgetrf_expert_gpu_work except that all workspaces/queues are handled internally.
 
magma_int_t magma_sgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 magma_sgetrf_expert_gpu_work with mode = MagmaHybrid.
 
magma_int_t magma_sgetrf_native (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 magma_sgetrf_expert_gpu_work with mode = MagmaNative.
 
magma_int_t magma_sgetrf_m (magma_int_t ngpu, magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
 SGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_sgetrf_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloat_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_xhsgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_mp_type_t enable_tc, magma_mp_type_t mp_algo_type)
 XHSGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_hgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 HGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_xshgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_mp_type_t enable_tc, magma_mp_type_t mp_algo_type)
 XSHGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_htgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 HTGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_zgetrf (magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
 ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_zgetrf2_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaDoubleComplex_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv, magmaDoubleComplex_ptr d_lAP[], magmaDoubleComplex *W, magma_int_t ldw, magma_queue_t queues[][2], magma_int_t *info)
 ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
bool magma_zgetrf_gpu_recommend_cpu (magma_int_t m, magma_int_t n, magma_int_t nb)
 magma_zgetrf_gpu_recommend_cpu returns true if magma_zgetrf_gpu is going to use the CPU only for performing the LU factorization.
 
bool magma_zgetrf_native_recommend_notrans (magma_int_t m, magma_int_t n, magma_int_t nb)
 magma_zgetrf_native_recommend_notrans returns true if magma_zgetrf_native is going to perform the LU factorization without transposing the matrix.
 
magma_int_t magma_zgetrf_expert_gpu_work (magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_mode_t mode, magma_int_t nb, magma_int_t recnb, void *host_work, magma_int_t *lwork_host, void *device_work, magma_int_t *lwork_device, magma_event_t events[2], magma_queue_t queues[2])
 ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_zgetrf_gpu_expert (magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_int_t nb, magma_mode_t mode)
 magma_zgetrf_gpu_expert is similar to magma_zgetrf_expert_gpu_work except that all workspaces/queues are handled internally.
 
magma_int_t magma_zgetrf_gpu (magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 magma_zgetrf_expert_gpu_work with mode = MagmaHybrid.
 
magma_int_t magma_zgetrf_native (magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 magma_zgetrf_expert_gpu_work with mode = MagmaNative.
 
magma_int_t magma_zgetrf_m (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
 ZGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 
magma_int_t magma_zgetrf_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 

Detailed Description

Function Documentation

◆ magma_cgetrf()

magma_int_t magma_cgetrf ( magma_int_t m,
magma_int_t n,
magmaFloatComplex * A,
magma_int_t lda,
magma_int_t * ipiv,
magma_int_t * info )

CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

It uses 2 queues to overlap communication and computation.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]ACOMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]ldaINTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_cgetrf2_mgpu()

magma_int_t magma_cgetrf2_mgpu ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
magma_int_t nb,
magma_int_t offset,
magmaFloatComplex_ptr d_lAT[],
magma_int_t lddat,
magma_int_t * ipiv,
magmaFloatComplex_ptr d_lAP[],
magmaFloatComplex * W,
magma_int_t ldw,
magma_queue_t queues[][2],
magma_int_t * info )

CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm. It uses two buffers to send panels.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The block size used for the matrix distribution.
[in]offsetINTEGER The first row and column indices of the submatrix that this routine will factorize.
[in,out]d_lATCOMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lAT[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format (with the block size nb), and each local matrix is stored by row. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddatINTEGER The leading dimension of the array d_lAT[d]. LDDAT >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[in]d_lAPCOMPLEX array of pointers on the GPU, dimension (ngpu). d_lAP[d] is the workspace on d-th GPU. Each local workspace must be of size (3+ngpu)*nb*maxm, where maxm is m rounded up to a multiple of 32 and nb is the block size.
[in]WCOMPLEX array, dimension (ngpu*nb*maxm). It is used to store the panel on the CPU.
[in]ldwINTEGER The leading dimension of the workspace W.
[in]queuesmagma_queue_t queues[d] points to the queues for the d-th GPU to execute in. Each GPU requires two queues.
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_cgetrf_gpu_recommend_cpu()

bool magma_cgetrf_gpu_recommend_cpu ( magma_int_t m,
magma_int_t n,
magma_int_t nb )

magma_cgetrf_gpu_recommend_cpu returns true if magma_cgetrf_gpu is going to use the CPU only for performing the LU factorization.

This is often the case for relatively small matrices.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The blocking size used during the factorization. nb > 0.

◆ magma_cgetrf_native_recommend_notrans()

bool magma_cgetrf_native_recommend_notrans ( magma_int_t m,
magma_int_t n,
magma_int_t nb )

magma_cgetrf_native_recommend_notrans returns true if magma_cgetrf_native is going to perform the LU factorization without transposing the matrix.

This is often the case for relatively small matrices.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The blocking size used during the factorization. nb > 0.

◆ magma_cgetrf_expert_gpu_work()

magma_int_t magma_cgetrf_expert_gpu_work ( magma_int_t m,
magma_int_t n,
magmaFloatComplex_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info,
magma_mode_t mode,
magma_int_t nb,
magma_int_t recnb,
void * host_work,
magma_int_t * lwork_host,
void * device_work,
magma_int_t * lwork_device,
magma_event_t events[2],
magma_queue_t queues[2] )

CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

This is an expert API, exposing more controls to the end user.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]dACOMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array dA. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]modemagma_mode_t
  • = MagmaNative: Factorize dA using GPU only mode.
  • = MagmaHybrid: Factorize dA using Hybrid (CPU/GPU) mode.
[in]nbINTEGER The blocking size used during the factorization. nb > 0; Users with no specific preference of nb can call magma_get_cgetrf_nb() or magma_get_cgetrf_native_nb() to get the value of nb as determined by MAGMA's internal tuning.
[in]recnbINTEGER The blocking size used during the recursive panel factorization (0 < recnb <= nb); Users with no specific preference of recnb can set it to a fixed value of 32.
[in,out]host_workWorkspace, allocated on host (CPU) memory. For faster CPU-GPU communication, the user can allocate it as pinned memory using magma_malloc_pinned().
[in,out]lwork_hostINTEGER pointer The size of the workspace (host_work) in bytes
  • lwork_host[0] < 0: a workspace query is assumed, the routine calculates the required amount of workspace and returns it in lwork_host. The workspace itself is not referenced, and no factorization is performed.
  • lwork_host[0] >= 0: the routine assumes that the user has provided a workspace with the size in lwork_host.
Parameters
[in,out]device_workWorkspace, allocated on device (GPU) memory.
[in,out]lwork_deviceINTEGER pointer The size of the workspace (device_work) in bytes
  • lwork_device[0] < 0: a workspace query is assumed, the routine calculates the required amount of workspace and returns it in lwork_device. The workspace itself is not referenced, and no factorization is performed.
  • lwork_device[0] >= 0: the routine assumes that the user has provided a workspace with the size in lwork_device.
[in]eventsmagma_event_t array of size two
  • created/destroyed by the user outside the routine
  • Used to manage inter-stream dependencies
[in]queuesmagma_queue_t array of size two
  • created/destroyed by the user outside the routine
  • Used for concurrent kernel execution, if possible

◆ magma_cgetrf_gpu_expert()

magma_int_t magma_cgetrf_gpu_expert ( magma_int_t m,
magma_int_t n,
magmaFloatComplex_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info,
magma_int_t nb,
magma_mode_t mode )

magma_cgetrf_gpu_expert is similar to magma_cgetrf_expert_gpu_work except that all workspaces/queues are handled internally.

See also
magma_cgetrf_expert_gpu_work

◆ magma_cgetrf_gpu()

magma_int_t magma_cgetrf_gpu ( magma_int_t m,
magma_int_t n,
magmaFloatComplex_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

magma_cgetrf_expert_gpu_work with mode = MagmaHybrid.

Computation is hybrid, part on CPU (panels), part on GPU (matrix updates).

See also
magma_cgetrf_expert_gpu_work

◆ magma_cgetrf_native()

magma_int_t magma_cgetrf_native ( magma_int_t m,
magma_int_t n,
magmaFloatComplex_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

magma_cgetrf_expert_gpu_work with mode = MagmaNative.

Computation is done only on the GPU, not on the CPU.

See also
magma_cgetrf_expert_gpu_work

◆ magma_cgetrf_m()

magma_int_t magma_cgetrf_m ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
magmaFloatComplex * A,
magma_int_t lda,
magma_int_t * ipiv,
magma_int_t * info )

CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Note: The factorization of the big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]ACOMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]ldaINTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_cgetrf_mgpu()

magma_int_t magma_cgetrf_mgpu ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
magmaFloatComplex_ptr d_lA[],
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]d_lACOMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array d_lA. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_dgetrf()

magma_int_t magma_dgetrf ( magma_int_t m,
magma_int_t n,
double * A,
magma_int_t lda,
magma_int_t * ipiv,
magma_int_t * info )

DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

It uses 2 queues to overlap communication and computation.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]ADOUBLE PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]ldaINTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_dgetrf2_mgpu()

magma_int_t magma_dgetrf2_mgpu ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
magma_int_t nb,
magma_int_t offset,
magmaDouble_ptr d_lAT[],
magma_int_t lddat,
magma_int_t * ipiv,
magmaDouble_ptr d_lAP[],
double * W,
magma_int_t ldw,
magma_queue_t queues[][2],
magma_int_t * info )

DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm. It uses two buffers to send panels.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The block size used for the matrix distribution.
[in]offsetINTEGER The first row and column indices of the submatrix that this routine will factorize.
[in,out]d_lATDOUBLE PRECISION array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lAT[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format (with the block size nb), and each local matrix is stored by row. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddatINTEGER The leading dimension of the array d_lAT[d]. LDDAT >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[in]d_lAPDOUBLE PRECISION array of pointers on the GPU, dimension (ngpu). d_lAP[d] is the workspace on d-th GPU. Each local workspace must be of size (3+ngpu)*nb*maxm, where maxm is m rounded up to a multiple of 32 and nb is the block size.
[in]WDOUBLE PRECISION array, dimension (ngpu*nb*maxm). It is used to store the panel on the CPU.
[in]ldwINTEGER The leading dimension of the workspace W.
[in]queuesmagma_queue_t queues[d] points to the queues for the d-th GPU to execute in. Each GPU requires two queues.
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_dgetrf_gpu_recommend_cpu()

bool magma_dgetrf_gpu_recommend_cpu ( magma_int_t m,
magma_int_t n,
magma_int_t nb )

magma_dgetrf_gpu_recommend_cpu returns true if magma_dgetrf_gpu is going to use the CPU only for performing the LU factorization.

This is often the case for relatively small matrices.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The blocking size used during the factorization. nb > 0.

◆ magma_dgetrf_native_recommend_notrans()

bool magma_dgetrf_native_recommend_notrans ( magma_int_t m,
magma_int_t n,
magma_int_t nb )

magma_dgetrf_native_recommend_notrans returns true if magma_dgetrf_native is going to perform the LU factorization without transposing the matrix.

This is often the case for relatively small matrices.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The blocking size used during the factorization. nb > 0.

◆ magma_dgetrf_expert_gpu_work()

magma_int_t magma_dgetrf_expert_gpu_work ( magma_int_t m,
magma_int_t n,
magmaDouble_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info,
magma_mode_t mode,
magma_int_t nb,
magma_int_t recnb,
void * host_work,
magma_int_t * lwork_host,
void * device_work,
magma_int_t * lwork_device,
magma_event_t events[2],
magma_queue_t queues[2] )

DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

This is an expert API, exposing more controls to the end user.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]dADOUBLE PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array A. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]modemagma_mode_t
  • = MagmaNative: Factorize dA using GPU only mode.
  • = MagmaHybrid: Factorize dA using Hybrid (CPU/GPU) mode.
[in]nbINTEGER The blocking size used during the factorization. nb > 0; Users with no specific preference of nb can call magma_get_dgetrf_nb() or magma_get_dgetrf_native_nb() to get the value of nb as determined by MAGMA's internal tuning.
[in]recnbINTEGER The blocking size used during the recursive panel factorization (0 < recnb <= nb); Users with no specific preference of recnb can set it to a fixed value of 32.
[in,out]host_workWorkspace, allocated on host (CPU) memory. For faster CPU-GPU communication, user can allocate it as pinned memory using magma_malloc_pinned()
[in,out]lwork_hostINTEGER pointer The size of the workspace (host_work) in bytes
  • lwork_host[0] < 0: a workspace query is assumed, the routine calculates the required amount of workspace and returns it in lwork_host. The workspace itself is not referenced, and no factorization is performed.
  • lwork_host[0] >= 0: the routine assumes that the user has provided a workspace with the size in lwork_host.
Parameters
[in,out]device_workWorkspace, allocated on device (GPU) memory.
[in,out]lwork_deviceINTEGER pointer The size of the workspace (device_work) in bytes
  • lwork_device[0] < 0: a workspace query is assumed, the routine calculates the required amount of workspace and returns it in lwork_device. The workspace itself is not referenced, and no factorization is performed.
  • lwork_device[0] >= 0: the routine assumes that the user has provided a workspace with the size in lwork_device.
[in]eventsmagma_event_t array of size two
  • created/destroyed by the user outside the routine
  • Used to manage inter-stream dependencies
[in]queuesmagma_queue_t array of size two
  • created/destroyed by the user outside the routine
  • Used for concurrent kernel execution, if possible

◆ magma_dgetrf_gpu_expert()

magma_int_t magma_dgetrf_gpu_expert ( magma_int_t m,
magma_int_t n,
magmaDouble_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info,
magma_int_t nb,
magma_mode_t mode )

magma_dgetrf_gpu_expert is similar to magma_dgetrf_expert_gpu_work except that all workspaces/queues are handled internally

See also
magma_dgetrf_expert_gpu_work

◆ magma_dgetrf_gpu()

magma_int_t magma_dgetrf_gpu ( magma_int_t m,
magma_int_t n,
magmaDouble_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

magma_dgetrf_expert_gpu_work with mode = MagmaHybrid.

Computation is hybrid, part on CPU (panels), part on GPU (matrix updates).

See also
magma_dgetrf_expert_gpu_work

◆ magma_dgetrf_native()

magma_int_t magma_dgetrf_native ( magma_int_t m,
magma_int_t n,
magmaDouble_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

magma_dgetrf_expert_gpu_work with mode = MagmaNative.

Computation is done only on the GPU, not on the CPU.

See also
magma_dgetrf_expert_gpu_work

◆ magma_dgetrf_m()

magma_int_t magma_dgetrf_m ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
double * A,
magma_int_t lda,
magma_int_t * ipiv,
magma_int_t * info )

DGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Note: The factorization of each big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]ADOUBLE PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]ldaINTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_dgetrf_mgpu()

magma_int_t magma_dgetrf_mgpu ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
magmaDouble_ptr d_lA[],
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]d_lADOUBLE PRECISION array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array d_lA. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_sgetrf()

magma_int_t magma_sgetrf ( magma_int_t m,
magma_int_t n,
float * A,
magma_int_t lda,
magma_int_t * ipiv,
magma_int_t * info )

SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

It uses 2 queues to overlap communication and computation.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]AREAL array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]ldaINTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_sgetrf2_mgpu()

magma_int_t magma_sgetrf2_mgpu ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
magma_int_t nb,
magma_int_t offset,
magmaFloat_ptr d_lAT[],
magma_int_t lddat,
magma_int_t * ipiv,
magmaFloat_ptr d_lAP[],
float * W,
magma_int_t ldw,
magma_queue_t queues[][2],
magma_int_t * info )

SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm. Uses two buffers to send panels.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The block size used for the matrix distribution.
[in]offsetINTEGER The first row and column indices of the submatrix that this routine will factorize.
[in,out]d_lATREAL array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lAT[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format (with the block size nb), and each local matrix is stored by row. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddatINTEGER The leading dimension of the array d_lAT[d]. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[in]d_lAPREAL array of pointers on the GPU, dimension (ngpu). d_lAP[d] is the workspace on d-th GPU. Each local workspace must be of size (3+ngpu)*nb*maxm, where maxm is m rounded up to a multiple of 32 and nb is the block size.
[in]WREAL array, dimension (ngpu*nb*maxm). It is used to store panel on CPU.
[in]ldwINTEGER The leading dimension of the workspace w.
[in]queuesmagma_queue_t queues[d] points to the queues for the d-th GPU to execute in. Each GPU requires two queues.
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_sgetrf_gpu_recommend_cpu()

bool magma_sgetrf_gpu_recommend_cpu ( magma_int_t m,
magma_int_t n,
magma_int_t nb )

magma_sgetrf_gpu_recommend_cpu returns true if magma_sgetrf_gpu is going to use the CPU only for performing the LU factorization.

This is often the case for relatively small matrices.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The blocking size used during the factorization. nb > 0;

◆ magma_sgetrf_native_recommend_notrans()

bool magma_sgetrf_native_recommend_notrans ( magma_int_t m,
magma_int_t n,
magma_int_t nb )

magma_sgetrf_native_recommend_notrans returns true if magma_sgetrf_native is going to perform the LU factorization without transposing the matrix.

This is often the case for relatively small matrices.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The blocking size used during the factorization. nb > 0;

◆ magma_sgetrf_expert_gpu_work()

magma_int_t magma_sgetrf_expert_gpu_work ( magma_int_t m,
magma_int_t n,
magmaFloat_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info,
magma_mode_t mode,
magma_int_t nb,
magma_int_t recnb,
void * host_work,
magma_int_t * lwork_host,
void * device_work,
magma_int_t * lwork_device,
magma_event_t events[2],
magma_queue_t queues[2] )

SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

This is an expert API, exposing more controls to the end user.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]dAREAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array A. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]modemagma_mode_t
  • = MagmaNative: Factorize dA using GPU only mode.
  • = MagmaHybrid: Factorize dA using Hybrid (CPU/GPU) mode.
[in]nbINTEGER The blocking size used during the factorization. nb > 0; Users with no specific preference of nb can call magma_get_sgetrf_nb() or magma_get_sgetrf_native_nb() to get the value of nb as determined by MAGMA's internal tuning.
[in]recnbINTEGER The blocking size used during the recursive panel factorization (0 < recnb <= nb); Users with no specific preference of recnb can set it to a fixed value of 32.
[in,out]host_workWorkspace, allocated on host (CPU) memory. For faster CPU-GPU communication, user can allocate it as pinned memory using magma_malloc_pinned()
[in,out]lwork_hostINTEGER pointer The size of the workspace (host_work) in bytes
  • lwork_host[0] < 0: a workspace query is assumed, the routine calculates the required amount of workspace and returns it in lwork_host. The workspace itself is not referenced, and no factorization is performed.
  • lwork_host[0] >= 0: the routine assumes that the user has provided a workspace with the size in lwork_host.
Parameters
[in,out]device_workWorkspace, allocated on device (GPU) memory.
[in,out]lwork_deviceINTEGER pointer The size of the workspace (device_work) in bytes
  • lwork_device[0] < 0: a workspace query is assumed, the routine calculates the required amount of workspace and returns it in lwork_device. The workspace itself is not referenced, and no factorization is performed.
  • lwork_device[0] >= 0: the routine assumes that the user has provided a workspace with the size in lwork_device.
[in]eventsmagma_event_t array of size two
  • created/destroyed by the user outside the routine
  • Used to manage inter-stream dependencies
[in]queuesmagma_queue_t array of size two
  • created/destroyed by the user outside the routine
  • Used for concurrent kernel execution, if possible

◆ magma_sgetrf_gpu_expert()

magma_int_t magma_sgetrf_gpu_expert ( magma_int_t m,
magma_int_t n,
magmaFloat_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info,
magma_int_t nb,
magma_mode_t mode )

magma_sgetrf_gpu_expert is similar to magma_sgetrf_expert_gpu_work except that all workspaces/queues are handled internally

See also
magma_sgetrf_expert_gpu_work

◆ magma_sgetrf_gpu()

magma_int_t magma_sgetrf_gpu ( magma_int_t m,
magma_int_t n,
magmaFloat_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

magma_sgetrf_expert_gpu_work with mode = MagmaHybrid.

Computation is hybrid, part on CPU (panels), part on GPU (matrix updates).

See also
magma_sgetrf_expert_gpu_work

◆ magma_sgetrf_native()

magma_int_t magma_sgetrf_native ( magma_int_t m,
magma_int_t n,
magmaFloat_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

magma_sgetrf_expert_gpu_work with mode = MagmaNative.

Computation is done only on the GPU, not on the CPU.

See also
magma_sgetrf_expert_gpu_work

◆ magma_sgetrf_m()

magma_int_t magma_sgetrf_m ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
float * A,
magma_int_t lda,
magma_int_t * ipiv,
magma_int_t * info )

SGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Note: The factorization of each big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]AREAL array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]ldaINTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_sgetrf_mgpu()

magma_int_t magma_sgetrf_mgpu ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
magmaFloat_ptr d_lA[],
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]d_lAREAL array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array d_lA. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_xhsgetrf_gpu()

magma_int_t magma_xhsgetrf_gpu ( magma_int_t m,
magma_int_t n,
magmaFloat_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info,
magma_mp_type_t enable_tc,
magma_mp_type_t mp_algo_type )

XHSGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

It uses mixed precision FP32/FP16-w/o TensorCores factorization techniques.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]dAREAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array A. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]enable_tcMAGMA_MP_TYPE_T For internal and expert API use. Enables/disables tensor cores.
[in]mp_algo_typeMAGMA_MP_TYPE_T For internal and expert API use.

◆ magma_hgetrf_gpu()

magma_int_t magma_hgetrf_gpu ( magma_int_t m,
magma_int_t n,
magmaFloat_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

HGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

It uses mixed precision FP32/FP16 techniques.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]dAREAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array A. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

More details can be found in Azzam Haidar, Stanimire Tomov, Jack Dongarra, and Nicholas J. Higham. 2018. Harnessing GPU tensor cores for fast FP16 arithmetic to speed up mixed-precision iterative refinement solvers. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis (SC '18). IEEE Press, Piscataway, NJ, USA, Article 47, 11 pages.

◆ magma_xshgetrf_gpu()

magma_int_t magma_xshgetrf_gpu ( magma_int_t m,
magma_int_t n,
magmaFloat_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info,
magma_mp_type_t enable_tc,
magma_mp_type_t mp_algo_type )

XSHGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

It uses mixed precision FP32/FP16-w/o TensorCores factorization techniques.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]dAREAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array A. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]enable_tcMAGMA_MP_TYPE_T For internal and expert API use. Enables/disables tensor cores.
[in]mp_algo_typeMAGMA_MP_TYPE_T For internal and expert API use.

More details can be found in Azzam Haidar, Stanimire Tomov, Jack Dongarra, and Nicholas J. Higham. 2018. Harnessing GPU tensor cores for fast FP16 arithmetic to speed up mixed-precision iterative refinement solvers. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis (SC '18). IEEE Press, Piscataway, NJ, USA, Article 47, 11 pages.

◆ magma_htgetrf_gpu()

magma_int_t magma_htgetrf_gpu ( magma_int_t m,
magma_int_t n,
magmaFloat_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

HTGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

It uses mixed precision FP32/FP16-TensorCores factorization techniques.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]dAREAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array A. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_zgetrf()

magma_int_t magma_zgetrf ( magma_int_t m,
magma_int_t n,
magmaDoubleComplex * A,
magma_int_t lda,
magma_int_t * ipiv,
magma_int_t * info )

ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

It uses 2 queues to overlap communication and computation.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]ACOMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]ldaINTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_zgetrf2_mgpu()

magma_int_t magma_zgetrf2_mgpu ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
magma_int_t nb,
magma_int_t offset,
magmaDoubleComplex_ptr d_lAT[],
magma_int_t lddat,
magma_int_t * ipiv,
magmaDoubleComplex_ptr d_lAP[],
magmaDoubleComplex * W,
magma_int_t ldw,
magma_queue_t queues[][2],
magma_int_t * info )

ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm. Uses two buffers to send panels.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The block size used for the matrix distribution.
[in]offsetINTEGER The first row and column indices of the submatrix that this routine will factorize.
[in,out]d_lATCOMPLEX_16 array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lAT[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format (with the block size nb), and each local matrix is stored by row. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddatINTEGER The leading dimension of the array d_lAT[d]. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[in]d_lAPCOMPLEX_16 array of pointers on the GPU, dimension (ngpu). d_lAP[d] is the workspace on d-th GPU. Each local workspace must be of size (3+ngpu)*nb*maxm, where maxm is m rounded up to a multiple of 32 and nb is the block size.
[in]WCOMPLEX_16 array, dimension (ngpu*nb*maxm). It is used to store panel on CPU.
[in]ldwINTEGER The leading dimension of the workspace w.
[in]queuesmagma_queue_t queues[d] points to the queues for the d-th GPU to execute in. Each GPU requires two queues.
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_zgetrf_gpu_recommend_cpu()

bool magma_zgetrf_gpu_recommend_cpu ( magma_int_t m,
magma_int_t n,
magma_int_t nb )

magma_zgetrf_gpu_recommend_cpu returns true if magma_zgetrf_gpu is going to use the CPU only for performing the LU factorization.

This is often the case for relatively small matrices.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The blocking size used during the factorization. nb > 0.

◆ magma_zgetrf_native_recommend_notrans()

bool magma_zgetrf_native_recommend_notrans ( magma_int_t m,
magma_int_t n,
magma_int_t nb )

magma_zgetrf_native_recommend_notrans returns true if magma_zgetrf_native is going to perform the LU factorization without transposing the matrix.

This is often the case for relatively small matrices.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in]nbINTEGER The blocking size used during the factorization. nb > 0.

◆ magma_zgetrf_expert_gpu_work()

magma_int_t magma_zgetrf_expert_gpu_work ( magma_int_t m,
magma_int_t n,
magmaDoubleComplex_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info,
magma_mode_t mode,
magma_int_t nb,
magma_int_t recnb,
void * host_work,
magma_int_t * lwork_host,
void * device_work,
magma_int_t * lwork_device,
magma_event_t events[2],
magma_queue_t queues[2] )

ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

This is an expert API, exposing more controls to the end user.

Parameters
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]dACOMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array dA. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]modemagma_mode_t
  • = MagmaNative: Factorize dA using GPU only mode.
  • = MagmaHybrid: Factorize dA using Hybrid (CPU/GPU) mode.
[in]nbINTEGER The blocking size used during the factorization. nb > 0. Users with no specific preference for nb can call magma_get_zgetrf_nb() or magma_get_zgetrf_native_nb() to get the value of nb as determined by MAGMA's internal tuning.
[in]recnbINTEGER The blocking size used during the recursive panel factorization (0 < recnb <= nb). Users with no specific preference for recnb can set it to a fixed value of 32.
[in,out]host_workWorkspace, allocated on host (CPU) memory. For faster CPU-GPU communication, the user can allocate it as pinned memory using magma_malloc_pinned().
[in,out]lwork_hostINTEGER pointer The size of the workspace (host_work) in bytes
  • lwork_host[0] < 0: a workspace query is assumed, the routine calculates the required amount of workspace and returns it in lwork_host. The workspace itself is not referenced, and no factorization is performed.
  • lwork_host[0] >= 0: the routine assumes that the user has provided a workspace with the size in lwork_host.
Parameters
[in,out]device_workWorkspace, allocated on device (GPU) memory.
[in,out]lwork_deviceINTEGER pointer The size of the workspace (device_work) in bytes
  • lwork_device[0] < 0: a workspace query is assumed, the routine calculates the required amount of workspace and returns it in lwork_device. The workspace itself is not referenced, and no factorization is performed.
  • lwork_device[0] >= 0: the routine assumes that the user has provided a workspace with the size in lwork_device.
[in]eventsmagma_event_t array of size two
  • created/destroyed by the user outside the routine
  • Used to manage inter-stream dependencies
[in]queuesmagma_queue_t array of size two
  • created/destroyed by the user outside the routine
  • Used for concurrent kernel execution, if possible

◆ magma_zgetrf_gpu_expert()

magma_int_t magma_zgetrf_gpu_expert ( magma_int_t m,
magma_int_t n,
magmaDoubleComplex_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info,
magma_int_t nb,
magma_mode_t mode )

magma_zgetrf_gpu_expert is similar to magma_zgetrf_expert_gpu_work except that all workspaces/queues are handled internally.

See also
magma_zgetrf_expert_gpu_work

◆ magma_zgetrf_gpu()

magma_int_t magma_zgetrf_gpu ( magma_int_t m,
magma_int_t n,
magmaDoubleComplex_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

Equivalent to magma_zgetrf_expert_gpu_work with mode = MagmaHybrid.

Computation is hybrid, part on CPU (panels), part on GPU (matrix updates).

See also
magma_zgetrf_expert_gpu_work

◆ magma_zgetrf_native()

magma_int_t magma_zgetrf_native ( magma_int_t m,
magma_int_t n,
magmaDoubleComplex_ptr dA,
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

Equivalent to magma_zgetrf_expert_gpu_work with mode = MagmaNative.

Computation is done only on the GPU, not on the CPU.

See also
magma_zgetrf_expert_gpu_work

◆ magma_zgetrf_m()

magma_int_t magma_zgetrf_m ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
magmaDoubleComplex * A,
magma_int_t lda,
magma_int_t * ipiv,
magma_int_t * info )

ZGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Note: The factorization of the big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]ACOMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]ldaINTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

◆ magma_zgetrf_mgpu()

magma_int_t magma_zgetrf_mgpu ( magma_int_t ngpu,
magma_int_t m,
magma_int_t n,
magmaDoubleComplex_ptr d_lA[],
magma_int_t ldda,
magma_int_t * ipiv,
magma_int_t * info )

ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters
[in]ngpuINTEGER Number of GPUs to use. ngpu > 0.
[in]mINTEGER The number of rows of the matrix A. M >= 0.
[in]nINTEGER The number of columns of the matrix A. N >= 0.
[in,out]d_lACOMPLEX_16 array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]lddaINTEGER The leading dimension of the array d_lA. LDDA >= max(1,M).
[out]ipivINTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]infoINTEGER
  • = 0: successful exit
  • < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation.
  • > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.