MAGMA 1.6.3
Matrix Algebra for GPU and Multicore Architectures
Functions
magma_int_t | magma_dtrsm_m (magma_int_t ngpu, magma_side_t side, magma_uplo_t uplo, magma_trans_t transa, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, double *A, magma_int_t lda, double *B, magma_int_t ldb) |
DTRSM solves one of the matrix equations op( A )*X = alpha*B, or X*op( A ) = alpha*B, where alpha is a scalar, X and B are m by n matrices, A is a unit, or non-unit, upper or lower triangular matrix, and op( A ) is one of A, A**T, or A**H. More... | |
void | magma_dgemm (magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc) |
Perform matrix-matrix product, \( C = \alpha op(A) op(B) + \beta C \). More... | |
void | magma_dsymm (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc) |
Perform symmetric matrix-matrix product. More... | |
void | magma_dsyrk (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, double beta, magmaDouble_ptr dC, magma_int_t lddc) |
Perform symmetric rank-k update. More... | |
void | magma_dsyr2k (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc) |
Perform symmetric rank-2k update. More... | |
void | magma_dtrmm (magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb) |
Perform triangular matrix-matrix product. More... | |
void | magma_dtrsm (magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb) |
Solve triangular matrix-matrix system (multiple right-hand sides). More... | |
void | magma_dgemm_batched (magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, double const *const *dA_array, magma_int_t ldda, double const *const *dB_array, magma_int_t lddb, double beta, double **dC_array, magma_int_t lddc, magma_int_t batchCount, magma_queue_t queue, cublasHandle_t myhandle) |
DGEMM performs one of the matrix-matrix operations. More... | |
void | magmablas_dgemm_batched (magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, double const *const *dA_array, magma_int_t ldda, double const *const *dB_array, magma_int_t lddb, double beta, double **dC_array, magma_int_t lddc, magma_int_t batchCount, magma_queue_t queue) |
DGEMM performs one of the matrix-matrix operations. More... | |
void | magmablas_dgemm (magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc) |
DGEMM performs one of the matrix-matrix operations. More... | |
void | magmablas_dgemm_reduce (magma_int_t m, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc) |
DGEMM_REDUCE performs one of the matrix-matrix operations. More... | |
void | magmablas_dgemm_tesla (magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, const double *A, magma_int_t lda, const double *B, magma_int_t ldb, double beta, double *C, magma_int_t ldc) |
DGEMM performs one of the matrix-matrix operations. More... | |
__global__ void | dgemm_kernel_N_N_64_16_16_16_4 (double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta) |
Purpose: This routine computes C = alpha*A*B + beta*C. More... | |
__global__ void | dgemm_kernel_N_N_64_16_16_16_4_special (double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta) |
Purpose: This routine computes C = alpha*A*B + beta*C. More... | |
__global__ void | dgemm_kernel_N_T_64_16_4_16_4 (double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta) |
Purpose: This routine computes C = alpha*A*B^T + beta*C. More... | |
__global__ void | dgemm_kernel_T_N_32_32_8_8_8 (double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta) |
Purpose: This routine computes C = alpha*A^T*B + beta*C. More... | |
__global__ void | dgemm_kernel_T_T_64_16_16_16_4 (double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta) |
Purpose: This routine computes C = alpha*A^T*B^T + beta*C. More... | |
__global__ void | dgemm_kernel_T_T_64_16_16_16_4_special (double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta) |
Purpose: This routine computes C = alpha*A^T*B^T + beta*C. More... | |
void | magmablas_dsyr2k_mgpu2 (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, magmaDouble_ptr dA[], magma_int_t ldda, magma_int_t a_offset, magmaDouble_ptr dB[], magma_int_t lddb, magma_int_t b_offset, double beta, magmaDouble_ptr dC[], magma_int_t lddc, magma_int_t c_offset, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue) |
DSYR2K performs one of the symmetric rank 2k operations. More... | |
void | magmablas_dsyr2k_mgpu_spec (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, magmaDouble_ptr dA[], magma_int_t ldda, magma_int_t a_offset, magmaDouble_ptr dB[], magma_int_t lddb, magma_int_t b_offset, double beta, magmaDouble_ptr dC[], magma_int_t lddc, magma_int_t c_offset, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue) |
DSYR2K performs one of the symmetric rank 2k operations. More... | |
void | magma_dsyrk_batched (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, double const *const *dA_array, magma_int_t ldda, double beta, double **dC_array, magma_int_t lddc, magma_int_t batchCount, magma_queue_t queue) |
DSYRK performs one of the symmetric rank k operations. More... | |
void | magmablas_dsyrk_batched (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, double const *const *dA_array, magma_int_t ldda, double beta, double **dC_array, magma_int_t lddc, magma_int_t batchCount, magma_queue_t queue) |
DSYRK performs one of the symmetric rank k operations. More... | |
void | magmablas_dtrsm_outofplace (magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb, magmaDouble_ptr dX, magma_int_t lddx, magma_int_t flag, magmaDouble_ptr d_dinvA, magma_int_t dinvA_length) |
dtrsm_outofplace solves one of the matrix equations on the GPU. More... | |
void | magmablas_dtrsm_work (magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb, magmaDouble_ptr dX, magma_int_t lddx, magma_int_t flag, magmaDouble_ptr d_dinvA, magma_int_t dinvA_length) |
Similar to magmablas_dtrsm_outofplace, but copies result dX back to dB, as in classical dtrsm interface. More... | |
void | magmablas_dtrsm (magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb) |
Similar to magmablas_dtrsm_outofplace, but allocates dX and d_dinvA internally. More... | |
void | magmablas_dtrsm_outofplace_batched (magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, magma_int_t flag, magma_int_t m, magma_int_t n, double alpha, double **dA_array, magma_int_t ldda, double **dB_array, magma_int_t lddb, double **dX_array, magma_int_t lddx, double **dinvA_array, magma_int_t dinvA_length, double **dA_displ, double **dB_displ, double **dX_displ, double **dinvA_displ, magma_int_t resetozero, magma_int_t batchCount, magma_queue_t queue, cublasHandle_t myhandle) |
dtrsm_outofplace_batched solves one of the matrix equations on the GPU. More... | |
void | magmablas_dtrsm_work_batched (magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, magma_int_t flag, magma_int_t m, magma_int_t n, double alpha, double **dA_array, magma_int_t ldda, double **dB_array, magma_int_t lddb, double **dX_array, magma_int_t lddx, double **dinvA_array, magma_int_t dinvA_length, double **dA_displ, double **dB_displ, double **dX_displ, double **dinvA_displ, magma_int_t resetozero, magma_int_t batchCount, magma_queue_t queue, cublasHandle_t myhandle) |
void | magmablas_dtrsm_batched (magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, double **dA_array, magma_int_t ldda, double **dB_array, magma_int_t lddb, magma_int_t batchCount, magma_queue_t queue, cublasHandle_t myhandle) |
void | magmablas_dtrtri_diag_q (magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr d_dinvA, magma_queue_t queue) |
Inverts the NB x NB diagonal blocks of a triangular matrix. More... | |
void | magmablas_dtrtri_diag (magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr d_dinvA) |
void | magmablas_dtrtri_diag_batched (magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, double const *const *dA_array, magma_int_t ldda, double **dinvA_array, magma_int_t resetozero, magma_int_t batchCount, magma_queue_t queue) |
Inverts the NB x NB diagonal blocks of a triangular matrix. More... | |
__global__ void dgemm_kernel_N_N_64_16_16_16_4( double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta )
B is put into shared memory. Parameters used: blk_M=64, blk_N=16, blk_K=16, nthd_x=16, nthd_y=4.
This code should run for any matrix size. This kernel outperforms cuda-2.2 when m, n, k >= 512.
__global__ void dgemm_kernel_N_N_64_16_16_16_4_special( double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta )
B is put into shared memory. Parameters used: blk_M=64, blk_N=16, blk_K=16, nthd_x=16, nthd_y=4.
This kernel is for matrices whose dimensions are divisible by the corresponding blocking sizes.
__global__ void dgemm_kernel_N_T_64_16_4_16_4( double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta )
B is put into shared memory. Parameters used: blk_M=64, blk_N=16, blk_K=4, nthd_x=16, nthd_y=4.
This code should run for any matrix size.
__global__ void dgemm_kernel_T_N_32_32_8_8_8( double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta )
B is put into shared memory. Parameters used: blk_M=32, blk_N=32, blk_K=8, nthd_x=8, nthd_y=8.
This code should run for any matrix size.
__global__ void dgemm_kernel_T_T_64_16_16_16_4( double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta )
B is put into shared memory. Parameters used: blk_M=64, blk_N=16, blk_K=16, nthd_x=16, nthd_y=4.
This code should run for any matrix size. This kernel outperforms cuda-2.2 when m, n, k >= 512.
__global__ void dgemm_kernel_T_T_64_16_16_16_4_special( double *__restrict__ C, const double *__restrict__ A, const double *__restrict__ B, int m, int n, int k, int lda, int ldb, int ldc, double alpha, double beta )
B is put into shared memory. Parameters used: blk_M=64, blk_N=16, blk_K=16, nthd_x=16, nthd_y=4.
This kernel is for matrices whose dimensions are divisible by the corresponding blocking sizes.
void magma_dgemm( magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc )
Perform matrix-matrix product, \( C = \alpha op(A) op(B) + \beta C \).
[in] | transA | Operation op(A) to perform on matrix A. |
[in] | transB | Operation op(B) to perform on matrix B. |
[in] | m | Number of rows of C and op(A). m >= 0. |
[in] | n | Number of columns of C and op(B). n >= 0. |
[in] | k | Number of columns of op(A) and rows of op(B). k >= 0. |
[in] | alpha | Scalar \( \alpha \) |
[in] | dA | DOUBLE_PRECISION array on GPU device. If transA == MagmaNoTrans, the m-by-k matrix A of dimension (ldda,k), ldda >= max(1,m); otherwise, the k-by-m matrix A of dimension (ldda,m), ldda >= max(1,k). |
[in] | ldda | Leading dimension of dA. |
[in] | dB | DOUBLE_PRECISION array on GPU device. If transB == MagmaNoTrans, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k); otherwise, the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n). |
[in] | lddb | Leading dimension of dB. |
[in] | beta | Scalar \( \beta \) |
[in,out] | dC | DOUBLE_PRECISION array on GPU device. The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m). |
[in] | lddc | Leading dimension of dC. |
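As a usage illustration (a minimal sketch, not part of this documentation), the program below computes C = alpha*A*B + beta*C on the GPU with magma_dgemm using the signature above. It assumes the MAGMA 1.6 host interfaces without an explicit queue argument (magma_init, magma_dmalloc, magma_dmalloc_cpu, magma_dsetmatrix, magma_dgetmatrix); the sizes and names hA, dA, etc. are made up for the example.

    #include <stdio.h>
    #include "magma.h"

    int main( void )
    {
        magma_init();

        magma_int_t m = 256, n = 128, k = 64;
        magma_int_t ldda = m, lddb = k, lddc = m;   /* leading dimensions >= number of rows */
        double alpha = 1.0, beta = 0.0;

        /* Host matrices in column-major order; filled with ones for the check below. */
        double *hA, *hB, *hC;
        magma_dmalloc_cpu( &hA, m*k );
        magma_dmalloc_cpu( &hB, k*n );
        magma_dmalloc_cpu( &hC, m*n );
        for (magma_int_t i = 0; i < m*k; ++i) hA[i] = 1.0;
        for (magma_int_t i = 0; i < k*n; ++i) hB[i] = 1.0;

        /* Device copies. */
        magmaDouble_ptr dA, dB, dC;
        magma_dmalloc( &dA, ldda*k );
        magma_dmalloc( &dB, lddb*n );
        magma_dmalloc( &dC, lddc*n );
        magma_dsetmatrix( m, k, hA, m, dA, ldda );
        magma_dsetmatrix( k, n, hB, k, dB, lddb );

        /* C = alpha*op(A)*op(B) + beta*C with op(A) = A, op(B) = B.
           Since beta = 0, dC need not be initialized. */
        magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, n, k,
                     alpha, dA, ldda, dB, lddb, beta, dC, lddc );

        magma_dgetmatrix( m, n, dC, lddc, hC, m );
        printf( "C(0,0) = %.1f (expected %d)\n", hC[0], (int) k );

        magma_free( dA );  magma_free( dB );  magma_free( dC );
        magma_free_cpu( hA );  magma_free_cpu( hB );  magma_free_cpu( hC );
        magma_finalize();
        return 0;
    }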
void magma_dgemm_batched( magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, double const *const *dA_array, magma_int_t ldda, double const *const *dB_array, magma_int_t lddb, double beta, double **dC_array, magma_int_t lddc, magma_int_t batchCount, magma_queue_t queue, cublasHandle_t myhandle )
DGEMM performs one of the matrix-matrix operations.
C = alpha*op( A )*op( B ) + beta*C,
where op( X ) is one of
op( X ) = X or op( X ) = X**T or op( X ) = X**H,
alpha and beta are scalars, and A, B and C are matrices, with op( A ) an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
[in] | transA | magma_trans_t. On entry, transA specifies the form of op( A ) to be used in the matrix multiplication as follows: transA = MagmaNoTrans: op( A ) = A; transA = MagmaTrans: op( A ) = A**T; transA = MagmaConjTrans: op( A ) = A**H. |
[in] | transB | magma_trans_t. On entry, transB specifies the form of op( B ) to be used in the matrix multiplication as follows: transB = MagmaNoTrans: op( B ) = B; transB = MagmaTrans: op( B ) = B**T; transB = MagmaConjTrans: op( B ) = B**H. |
[in] | m | INTEGER. On entry, M specifies the number of rows of the matrix op( dA ) and of the matrix dC. M must be at least zero. |
[in] | n | INTEGER. On entry, N specifies the number of columns of the matrix op( dB ) and the number of columns of the matrix dC. N must be at least zero. |
[in] | k | INTEGER. On entry, K specifies the number of columns of the matrix op( dA ) and the number of rows of the matrix op( dB ). K must be at least zero. |
[in] | alpha | DOUBLE_PRECISION On entry, ALPHA specifies the scalar alpha. |
[in] | dA | DOUBLE_PRECISION array of DIMENSION ( LDA, ka ), where ka is k when transA = MagmaNoTrans, and is m otherwise. Before entry with transA = MagmaNoTrans, the leading m by k part of the array dA must contain the matrix dA, otherwise the leading k by m part of the array dA must contain the matrix dA. |
[in] | ldda | INTEGER. On entry, LDA specifies the first dimension of A as declared in the calling (sub) program. When transA = MagmaNoTrans then LDA must be at least max( 1, m ), otherwise LDA must be at least max( 1, k ). |
[in] | dB | DOUBLE_PRECISION array of DIMENSION ( LDB, kb ), where kb is n when transB = MagmaNoTrans, and is k otherwise. Before entry with transB = MagmaNoTrans, the leading k by n part of the array dB must contain the matrix dB, otherwise the leading n by k part of the array dB must contain the matrix dB. |
[in] | lddb | INTEGER. On entry, LDB specifies the first dimension of dB as declared in the calling (sub) program. When transB = MagmaNoTrans then LDB must be at least max( 1, k ), otherwise LDB must be at least max( 1, n ). |
[in] | beta | DOUBLE_PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then dC need not be set on input. |
[in,out] | dC | DOUBLE_PRECISION array of DIMENSION ( LDC, n ). Before entry, the leading m by n part of the array dC must contain the matrix dC, except when beta is zero, in which case dC need not be set on entry. On exit, the array dC is overwritten by the m by n matrix ( alpha*op( dA )*op( dB ) + beta*dC ). |
[in] | lddc | INTEGER. On entry, LDC specifies the first dimension of dC as declared in the calling (sub) program. LDC must be at least max( 1, m ). |
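A minimal sketch of how the batched interface might be driven (the helper function and its names are assumptions, not part of this page): the host arrays of per-matrix device pointers are copied to the GPU with the generic magma_setvector, then magma_dgemm_batched is called with the signature documented above. A valid magma_queue_t and cublasHandle_t are assumed to be created by the caller.

    #include "magma.h"
    #include <cublas_v2.h>

    /* Launch batchCount independent products C_i = alpha*A_i*B_i + beta*C_i.
       hA_ptrs/hB_ptrs/hC_ptrs are host arrays holding the device addresses of the
       individual matrices, which are already resident on the GPU. */
    void batched_gemm_sketch( magma_int_t m, magma_int_t n, magma_int_t k,
                              double alpha, double beta,
                              double **hA_ptrs, magma_int_t ldda,
                              double **hB_ptrs, magma_int_t lddb,
                              double **hC_ptrs, magma_int_t lddc,
                              magma_int_t batchCount,
                              magma_queue_t queue, cublasHandle_t handle )
    {
        double **dA_array, **dB_array, **dC_array;
        magma_malloc( (void**) &dA_array, batchCount * sizeof(double*) );
        magma_malloc( (void**) &dB_array, batchCount * sizeof(double*) );
        magma_malloc( (void**) &dC_array, batchCount * sizeof(double*) );

        /* Copy the arrays of device pointers to the GPU. */
        magma_setvector( batchCount, sizeof(double*), hA_ptrs, 1, dA_array, 1 );
        magma_setvector( batchCount, sizeof(double*), hB_ptrs, 1, dB_array, 1 );
        magma_setvector( batchCount, sizeof(double*), hC_ptrs, 1, dC_array, 1 );

        magma_dgemm_batched( MagmaNoTrans, MagmaNoTrans, m, n, k, alpha,
                             (double const *const *) dA_array, ldda,
                             (double const *const *) dB_array, lddb, beta,
                             dC_array, lddc, batchCount, queue, handle );

        magma_free( dA_array );
        magma_free( dB_array );
        magma_free( dC_array );
    }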
void magma_dsymm( magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc )
Perform symmetric matrix-matrix product.
\( C = \alpha A B + \beta C \) (side == MagmaLeft), or
\( C = \alpha B A + \beta C \) (side == MagmaRight),
where \( A \) is symmetric.
[in] | side | Whether A is on the left or right. |
[in] | uplo | Whether the upper or lower triangle of A is referenced. |
[in] | m | Number of rows of C. m >= 0. |
[in] | n | Number of columns of C. n >= 0. |
[in] | alpha | Scalar \( \alpha \) |
[in] | dA | DOUBLE_PRECISION array on GPU device. If side == MagmaLeft, the m-by-m symmetric matrix A of dimension (ldda,m), ldda >= max(1,m); otherwise, the n-by-n symmetric matrix A of dimension (ldda,n), ldda >= max(1,n). |
[in] | ldda | Leading dimension of dA. |
[in] | dB | DOUBLE_PRECISION array on GPU device. The m-by-n matrix B of dimension (lddb,n), lddb >= max(1,m). |
[in] | lddb | Leading dimension of dB. |
[in] | beta | Scalar \( \beta \) |
[in,out] | dC | DOUBLE_PRECISION array on GPU device. The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m). |
[in] | lddc | Leading dimension of dC. |
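A minimal sketch (illustrative names, not from this page) applying an m-by-m symmetric matrix stored in its lower triangle from the left; dA, dB, dC are assumed to be device arrays set up by the caller.

    #include "magma.h"

    /* C = A*B, where A is m-by-m symmetric with the lower triangle referenced. */
    void symm_left_lower( magma_int_t m, magma_int_t n,
                          magmaDouble_const_ptr dA, magma_int_t ldda,
                          magmaDouble_const_ptr dB, magma_int_t lddb,
                          magmaDouble_ptr dC, magma_int_t lddc )
    {
        magma_dsymm( MagmaLeft, MagmaLower, m, n,
                     1.0, dA, ldda, dB, lddb,
                     0.0, dC, lddc );
    }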
void magma_dsyr2k( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc )
Perform symmetric rank-2k update.
\( C = \alpha A B^T + \alpha B A^T + \beta C \) (trans == MagmaNoTrans), or
\( C = \alpha A^T B + \alpha B^T A + \beta C \) (trans == MagmaTrans),
where \( C \) is symmetric.
[in] | uplo | Whether the upper or lower triangle of C is referenced. |
[in] | trans | Operation to perform on A and B. |
[in] | n | Number of rows and columns of C. n >= 0. |
[in] | k | Number of columns of A and B (for MagmaNoTrans) or rows of A and B (for MagmaTrans). k >= 0. |
[in] | alpha | Scalar \( \alpha \) |
[in] | dA | DOUBLE_PRECISION array on GPU device. If trans == MagmaNoTrans, the n-by-k matrix A of dimension (ldda,k), ldda >= max(1,n); otherwise, the k-by-n matrix A of dimension (ldda,n), ldda >= max(1,k). |
[in] | ldda | Leading dimension of dA. |
[in] | dB | DOUBLE_PRECISION array on GPU device. If trans == MagmaNoTrans, the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n); otherwise, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k). |
[in] | lddb | Leading dimension of dB. |
[in] | beta | Scalar \( \beta \) |
[in,out] | dC | DOUBLE_PRECISION array on GPU device. The n-by-n symmetric matrix C of dimension (lddc,n), lddc >= max(1,n). |
[in] | lddc | Leading dimension of dC. |
void magma_dsyrk( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, double beta, magmaDouble_ptr dC, magma_int_t lddc )
Perform symmetric rank-k update.
\( C = \alpha A A^T + \beta C \) (trans == MagmaNoTrans), or
\( C = \alpha A^T A + \beta C \) (trans == MagmaTrans),
where \( C \) is symmetric.
[in] | uplo | Whether the upper or lower triangle of C is referenced. |
[in] | trans | Operation to perform on A. |
[in] | n | Number of rows and columns of C. n >= 0. |
[in] | k | Number of columns of A (for MagmaNoTrans) or rows of A (for MagmaTrans). k >= 0. |
[in] | alpha | Scalar \( \alpha \) |
[in] | dA | DOUBLE_PRECISION array on GPU device. If trans == MagmaNoTrans, the n-by-k matrix A of dimension (ldda,k), ldda >= max(1,n); otherwise, the k-by-n matrix A of dimension (ldda,n), ldda >= max(1,k). |
[in] | ldda | Leading dimension of dA. |
[in] | beta | Scalar \( \beta \) |
[in,out] | dC | DOUBLE_PRECISION array on GPU device. The n-by-n symmetric matrix C of dimension (lddc,n), lddc >= max(1,n). |
[in] | lddc | Leading dimension of dC. |
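A minimal sketch (illustrative names, not from this page) of the rank-k update C = -A*A^T + C on the lower triangle, the form used in a blocked Cholesky trailing-matrix update; dA and dC are assumed to be device arrays set up by the caller.

    #include "magma.h"

    /* Symmetric rank-k update of the lower triangle: C := -1*A*A^T + 1*C,
       with A an n-by-k panel and C the n-by-n trailing matrix. */
    void syrk_update_lower( magma_int_t n, magma_int_t k,
                            magmaDouble_const_ptr dA, magma_int_t ldda,
                            magmaDouble_ptr dC, magma_int_t lddc )
    {
        magma_dsyrk( MagmaLower, MagmaNoTrans, n, k,
                     -1.0, dA, ldda,
                      1.0, dC, lddc );
    }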
void magma_dsyrk_batched( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, double const *const *dA_array, magma_int_t ldda, double beta, double **dC_array, magma_int_t lddc, magma_int_t batchCount, magma_queue_t queue )
DSYRK performs one of the symmetric rank k operations.
C := alpha*A*A**H + beta*C,
or
C := alpha*A**H*A + beta*C,
where alpha and beta are real scalars, C is an n by n symmetric matrix and A is an n by k matrix in the first case and a k by n matrix in the second case.
[in] | uplo | CHARACTER*1. On entry, uplo specifies whether the upper or lower triangular part of the array C is to be referenced as follows: |
uplo = 'U' or 'u' Only the upper triangular part of C is to be referenced.
uplo = 'L' or 'l' Only the lower triangular part of C is to be referenced.
[in] | trans | CHARACTER*1. On entry, trans specifies the operation to be performed as follows: |
trans = 'N' or 'n' C := alpha*A*A**H + beta*C.
trans = 'C' or 'c' C := alpha*A**H*A + beta*C.
[in] | n | INTEGER. On entry, specifies the order of the matrix C. N must be at least zero. |
[in] | k | INTEGER. On entry with trans = 'N' or 'n', k specifies the number of columns of the matrix A, and on entry with trans = 'C' or 'c', k specifies the number of rows of the matrix A. K must be at least zero. |
[in] | alpha | DOUBLE PRECISION On entry, ALPHA specifies the scalar alpha. |
[in] | dA | DOUBLE_PRECISION array of DIMENSION ( ldda, ka ), where ka is k when trans = MagmaNoTrans, and is n otherwise. Before entry with trans = MagmaNoTrans, the leading n by k part of the array dA must contain the matrix dA, otherwise the leading k by n part of the array dA must contain the matrix dA. |
[in] | ldda | INTEGER. On entry, ldda specifies the first dimension of A as declared in the calling (sub) program. When trans = MagmaNoTrans then ldda must be at least max( 1, n ), otherwise ldda must be at least max( 1, k ). |
[in] | beta | DOUBLE PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then dC need not be set on input. |
[in,out] | dC | DOUBLE_PRECISION array of DIMENSION ( lddc, n ). Before entry with uplo = 'U' or 'u', the leading n by n upper triangular part of the array C must contain the upper triangular part of the symmetric matrix and the strictly lower triangular part of C is not referenced. On exit, the upper triangular part of the array C is overwritten by the upper triangular part of the updated matrix. Before entry with uplo = 'L' or 'l', the leading n by n lower triangular part of the array C must contain the lower triangular part of the symmetric matrix and the strictly upper triangular part of C is not referenced. On exit, the lower triangular part of the array C is overwritten by the lower triangular part of the updated matrix. Note that the imaginary parts of the diagonal elements need not be set, they are assumed to be zero, and on exit they are set to zero. |
[in] | lddc | INTEGER. On entry, lddc specifies the first dimension of dC as declared in the calling (sub) program. lddc must be at least max( 1, n ). |
void magma_dtrmm( magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb )
Perform triangular matrix-matrix product.
\( B = \alpha op(A) B \) (side == MagmaLeft), or
\( B = \alpha B op(A) \) (side == MagmaRight),
where \( A \) is triangular.
[in] | side | Whether A is on the left or right. |
[in] | uplo | Whether A is upper or lower triangular. |
[in] | trans | Operation to perform on A. |
[in] | diag | Whether the diagonal of A is assumed to be unit or non-unit. |
[in] | m | Number of rows of B. m >= 0. |
[in] | n | Number of columns of B. n >= 0. |
[in] | alpha | Scalar \( \alpha \) |
[in] | dA | DOUBLE_PRECISION array on GPU device. If side == MagmaLeft, the n-by-n triangular matrix A of dimension (ldda,n), ldda >= max(1,n); otherwise, the m-by-m triangular matrix A of dimension (ldda,m), ldda >= max(1,m). |
[in] | ldda | Leading dimension of dA. |
[in] | dB | DOUBLE_PRECISION array on GPU device. The m-by-n matrix B of dimension (lddb,n), lddb >= max(1,m). |
[in] | lddb | Leading dimension of dB. |
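A minimal sketch (illustrative names, not from this page) forming B = alpha*A*B with an upper triangular, non-unit A applied from the left; dA and dB are assumed to be device arrays set up by the caller, and dB is overwritten with the product.

    #include "magma.h"

    /* B := alpha * A * B, A m-by-m upper triangular with non-unit diagonal. */
    void trmm_left_upper( magma_int_t m, magma_int_t n, double alpha,
                          magmaDouble_const_ptr dA, magma_int_t ldda,
                          magmaDouble_ptr dB, magma_int_t lddb )
    {
        magma_dtrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                     m, n, alpha, dA, ldda, dB, lddb );
    }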
void magma_dtrsm( magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb )
Solve triangular matrix-matrix system (multiple right-hand sides).
\( op(A) X = \alpha B \) (side == MagmaLeft), or
\( X op(A) = \alpha B \) (side == MagmaRight),
where \( A \) is triangular.
[in] | side | Whether A is on the left or right. |
[in] | uplo | Whether A is upper or lower triangular. |
[in] | trans | Operation to perform on A. |
[in] | diag | Whether the diagonal of A is assumed to be unit or non-unit. |
[in] | m | Number of rows of B. m >= 0. |
[in] | n | Number of columns of B. n >= 0. |
[in] | alpha | Scalar \( \alpha \) |
[in] | dA | DOUBLE_PRECISION array on GPU device. If side == MagmaLeft, the m-by-m triangular matrix A of dimension (ldda,m), ldda >= max(1,m); otherwise, the n-by-n triangular matrix A of dimension (ldda,n), ldda >= max(1,n). |
[in] | ldda | Leading dimension of dA. |
[in,out] | dB | DOUBLE_PRECISION array on GPU device. On entry, m-by-n matrix B of dimension (lddb,n), lddb >= max(1,m). On exit, overwritten with the solution matrix X. |
[in] | lddb | Leading dimension of dB. |
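A minimal sketch (illustrative names, not from this page) of a forward substitution L*X = B with a lower triangular, non-unit factor; dA and dB are assumed to be device arrays set up by the caller, and dB is overwritten with the solution X.

    #include "magma.h"

    /* Solve L * X = B for X, where L is m-by-m lower triangular (non-unit diagonal)
       and B holds n right-hand sides. */
    void trsm_forward_solve( magma_int_t m, magma_int_t n,
                             magmaDouble_const_ptr dA, magma_int_t ldda,
                             magmaDouble_ptr dB, magma_int_t lddb )
    {
        magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                     m, n, 1.0, dA, ldda, dB, lddb );
    }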
magma_int_t magma_dtrsm_m( magma_int_t ngpu, magma_side_t side, magma_uplo_t uplo, magma_trans_t transa, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, double *A, magma_int_t lda, double *B, magma_int_t ldb )
DTRSM solves one of the matrix equations op( A )*X = alpha*B, or X*op( A ) = alpha*B, where alpha is a scalar, X and B are m by n matrices, A is a unit, or non-unit, upper or lower triangular matrix and op( A ) is one of.
op( A ) = A or op( A ) = A**T or op( A ) = A**H.
The matrix X is overwritten on B.
[in] | ngpu | INTEGER Number of GPUs to use. ngpu > 0. |
[in] | side | magma_side_t. On entry, SIDE specifies whether op( A ) appears on the left or right of X as follows: SIDE = MagmaLeft: op( A )*X = alpha*B; SIDE = MagmaRight: X*op( A ) = alpha*B. |
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the matrix A is an upper or lower triangular matrix as follows: UPLO = MagmaUpper: A is an upper triangular matrix; UPLO = MagmaLower: A is a lower triangular matrix. |
[in] | transa | magma_trans_t. On entry, TRANSA specifies the form of op( A ) to be used in the matrix multiplication as follows: TRANSA = MagmaNoTrans: op( A ) = A; TRANSA = MagmaTrans: op( A ) = A**T; TRANSA = MagmaConjTrans: op( A ) = A**H. |
[in] | diag | magma_diag_t. On entry, DIAG specifies whether or not A is unit triangular as follows: DIAG = MagmaUnit: A is assumed to be unit triangular; DIAG = MagmaNonUnit: A is not assumed to be unit triangular. |
[in] | m | INTEGER. On entry, M specifies the number of rows of B. M must be at least zero. |
[in] | n | INTEGER. On entry, N specifies the number of columns of B. N must be at least zero. |
[in] | alpha | DOUBLE_PRECISION. On entry, ALPHA specifies the scalar alpha. When alpha is zero then A is not referenced and B need not be set before entry. |
[in] | A | DOUBLE_PRECISION array of DIMENSION ( LDA, k ), where k is m when SIDE = MagmaLeft and is n when SIDE = MagmaRight. Before entry with UPLO = MagmaUpper, the leading k by k upper triangular part of the array A must contain the upper triangular matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading k by k lower triangular part of the array A must contain the lower triangular matrix and the strictly upper triangular part of A is not referenced. Note that when DIAG = MagmaUnit, the diagonal elements of A are not referenced either, but are assumed to be unity. |
[in] | lda | INTEGER. On entry, LDA specifies the first dimension of A as declared in the calling (sub) program. When SIDE = MagmaLeft then LDA >= max( 1, m ), when SIDE = MagmaRight then LDA >= max( 1, n ). |
[in,out] | B | DOUBLE_PRECISION array of DIMENSION ( LDB, n ). Before entry, the leading m by n part of the array B must contain the right-hand side matrix B, and on exit is overwritten by the solution matrix X. |
[in] | ldb | INTEGER. On entry, LDB specifies the first dimension of B as declared in the calling (sub) program. LDB must be at least max( 1, m ). |
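A minimal sketch (illustrative names, not from this page): since magma_dtrsm_m takes host-resident A and B, a multi-GPU solve of A*X = B can be driven directly from CPU memory.

    #include "magma.h"

    /* Multi-GPU solve A * X = B with A m-by-m lower triangular (non-unit) and B m-by-n,
       both in host memory; B is overwritten with X and the work is spread over ngpu GPUs. */
    void trsm_multi_gpu( magma_int_t ngpu, magma_int_t m, magma_int_t n,
                         double *A, magma_int_t lda,
                         double *B, magma_int_t ldb )
    {
        magma_dtrsm_m( ngpu, MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                       m, n, 1.0, A, lda, B, ldb );
    }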
void magmablas_dgemm( magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc )
DGEMM performs one of the matrix-matrix operations.
C = alpha*op( A )*op( B ) + beta*C,
where op( X ) is one of
op( X ) = X or op( X ) = X**T or op( X ) = X**H,
alpha and beta are scalars, and A, B and C are matrices, with op( A ) an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
[in] | transA | magma_trans_t. On entry, transA specifies the form of op( A ) to be used in the matrix multiplication as follows: transA = MagmaNoTrans: op( A ) = A; transA = MagmaTrans: op( A ) = A**T; transA = MagmaConjTrans: op( A ) = A**H. |
[in] | transB | magma_trans_t. On entry, transB specifies the form of op( B ) to be used in the matrix multiplication as follows: transB = MagmaNoTrans: op( B ) = B; transB = MagmaTrans: op( B ) = B**T; transB = MagmaConjTrans: op( B ) = B**H. |
[in] | m | INTEGER. On entry, M specifies the number of rows of the matrix op( dA ) and of the matrix dC. M must be at least zero. |
[in] | n | INTEGER. On entry, N specifies the number of columns of the matrix op( dB ) and the number of columns of the matrix dC. N must be at least zero. |
[in] | k | INTEGER. On entry, K specifies the number of columns of the matrix op( dA ) and the number of rows of the matrix op( dB ). K must be at least zero. |
[in] | alpha | DOUBLE_PRECISION On entry, ALPHA specifies the scalar alpha. |
[in] | dA | DOUBLE_PRECISION array of DIMENSION ( LDA, ka ), where ka is k when transA = MagmaNoTrans, and is m otherwise. Before entry with transA = MagmaNoTrans, the leading m by k part of the array dA must contain the matrix dA, otherwise the leading k by m part of the array dA must contain the matrix dA. |
[in] | ldda | INTEGER. On entry, LDA specifies the first dimension of A as declared in the calling (sub) program. When transA = MagmaNoTrans then LDA must be at least max( 1, m ), otherwise LDA must be at least max( 1, k ). |
[in] | dB | DOUBLE_PRECISION array of DIMENSION ( LDB, kb ), where kb is n when transB = MagmaNoTrans, and is k otherwise. Before entry with transB = MagmaNoTrans, the leading k by n part of the array dB must contain the matrix dB, otherwise the leading n by k part of the array dB must contain the matrix dB. |
[in] | lddb | INTEGER. On entry, LDB specifies the first dimension of dB as declared in the calling (sub) program. When transB = MagmaNoTrans then LDB must be at least max( 1, k ), otherwise LDB must be at least max( 1, n ). |
[in] | beta | DOUBLE_PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then dC need not be set on input. |
[in,out] | dC | DOUBLE_PRECISION array of DIMENSION ( LDC, n ). Before entry, the leading m by n part of the array dC must contain the matrix dC, except when beta is zero, in which case dC need not be set on entry. On exit, the array dC is overwritten by the m by n matrix ( alpha*op( dA )*op( dB ) + beta*dC ). |
[in] | lddc | INTEGER. On entry, LDC specifies the first dimension of dC as declared in the calling (sub) program. LDC must be at least max( 1, m ). |
void magmablas_dgemm_batched( magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, double const *const *dA_array, magma_int_t ldda, double const *const *dB_array, magma_int_t lddb, double beta, double **dC_array, magma_int_t lddc, magma_int_t batchCount, magma_queue_t queue )
DGEMM performs one of the matrix-matrix operations.
C = alpha*op( A )*op( B ) + beta*C,
where op( X ) is one of
op( X ) = X or op( X ) = X**T or op( X ) = X**H,
alpha and beta are scalars, and A, B and C are matrices, with op( A ) an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
[in] | transA | magma_trans_t. On entry, transA specifies the form of op( A ) to be used in the matrix multiplication as follows: transA = MagmaNoTrans: op( A ) = A; transA = MagmaTrans: op( A ) = A**T; transA = MagmaConjTrans: op( A ) = A**H. |
[in] | transB | magma_trans_t. On entry, transB specifies the form of op( B ) to be used in the matrix multiplication as follows: transB = MagmaNoTrans: op( B ) = B; transB = MagmaTrans: op( B ) = B**T; transB = MagmaConjTrans: op( B ) = B**H. |
[in] | m | INTEGER. On entry, M specifies the number of rows of the matrix op( dA ) and of the matrix dC. M must be at least zero. |
[in] | n | INTEGER. On entry, N specifies the number of columns of the matrix op( dB ) and the number of columns of the matrix dC. N must be at least zero. |
[in] | k | INTEGER. On entry, K specifies the number of columns of the matrix op( dA ) and the number of rows of the matrix op( dB ). K must be at least zero. |
[in] | alpha | DOUBLE_PRECISION On entry, ALPHA specifies the scalar alpha. |
[in] | dA | DOUBLE_PRECISION array of DIMENSION ( LDA, ka ), where ka is k when transA = MagmaNoTrans, and is m otherwise. Before entry with transA = MagmaNoTrans, the leading m by k part of the array dA must contain the matrix dA, otherwise the leading k by m part of the array dA must contain the matrix dA. |
[in] | ldda | INTEGER. On entry, LDA specifies the first dimension of A as declared in the calling (sub) program. When transA = MagmaNoTrans then LDA must be at least max( 1, m ), otherwise LDA must be at least max( 1, k ). |
[in] | dB | DOUBLE_PRECISION array of DIMENSION ( LDB, kb ), where kb is n when transB = MagmaNoTrans, and is k otherwise. Before entry with transB = MagmaNoTrans, the leading k by n part of the array dB must contain the matrix dB, otherwise the leading n by k part of the array dB must contain the matrix dB. |
[in] | lddb | INTEGER. On entry, LDB specifies the first dimension of dB as declared in the calling (sub) program. When transB = MagmaNoTrans then LDB must be at least max( 1, k ), otherwise LDB must be at least max( 1, n ). |
[in] | beta | DOUBLE_PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then dC need not be set on input. |
[in,out] | dC | DOUBLE_PRECISION array of DIMENSION ( LDC, n ). Before entry, the leading m by n part of the array dC must contain the matrix dC, except when beta is zero, in which case dC need not be set on entry. On exit, the array dC is overwritten by the m by n matrix ( alpha*op( dA )*op( dB ) + beta*dC ). |
[in] | lddc | INTEGER. On entry, LDC specifies the first dimension of dC as declared in the calling (sub) program. LDC must be at least max( 1, m ). |
void magmablas_dgemm_reduce( magma_int_t m, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc )
DGEMM_REDUCE performs one of the matrix-matrix operations.
C := alpha*A^T*B + beta*C,
where alpha and beta are scalars, and A, B and C are matrices, with A a k-by-m matrix, B a k-by-n matrix, and C an m-by-n matrix.
This routine is tuned for m, n << k. Typically, m and n are expected to be less than 128.
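A minimal sketch (illustrative names, not from this page) using the reduction kernel for a tall-skinny product C = A^T*B with m, n << k, the shape this routine is tuned for; dA, dB, dC are assumed to be device arrays set up by the caller.

    #include "magma.h"

    /* C := A^T * B, with A k-by-m and B k-by-n, where m and n are small (e.g. <= 128)
       and k is large, so the reduction-based kernel is appropriate. */
    void gram_block( magma_int_t m, magma_int_t n, magma_int_t k,
                     magmaDouble_const_ptr dA, magma_int_t ldda,
                     magmaDouble_const_ptr dB, magma_int_t lddb,
                     magmaDouble_ptr dC, magma_int_t lddc )
    {
        magmablas_dgemm_reduce( m, n, k,
                                1.0, dA, ldda, dB, lddb,
                                0.0, dC, lddc );
    }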
void magmablas_dgemm_tesla( magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, const double *A, magma_int_t lda, const double *B, magma_int_t ldb, double beta, double *C, magma_int_t ldc )
DGEMM performs one of the matrix-matrix operations.
C = alpha*op( A )*op( B ) + beta*C,
where op( X ) is one of
op( X ) = X or op( X ) = X**T,
alpha and beta are scalars, and A, B and C are matrices, with op( A ) an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
[in] | transA | magma_trans_t. On entry, transA specifies the form of op( A ) to be used in the matrix multiplication as follows: transA = MagmaNoTrans: op( A ) = A; transA = MagmaTrans: op( A ) = A**T. |
[in] | transB | magma_trans_t. On entry, transB specifies the form of op( B ) to be used in the matrix multiplication as follows: transB = MagmaNoTrans: op( B ) = B; transB = MagmaTrans: op( B ) = B**T. |
[in] | m | INTEGER. On entry, M specifies the number of rows of the matrix op( A ) and of the matrix C. M must be at least zero. |
[in] | n | INTEGER. On entry, N specifies the number of columns of the matrix op( B ) and the number of columns of the matrix C. N must be at least zero. |
[in] | k | INTEGER. On entry, K specifies the number of columns of the matrix op( A ) and the number of rows of the matrix op( B ). K must be at least zero. |
[in] | alpha | DOUBLE PRECISION. On entry, ALPHA specifies the scalar alpha. |
[in] | A | DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is k when transA = MagmaNoTrans, and is m otherwise. Before entry with transA = MagmaNoTrans, the leading m by k part of the array A must contain the matrix A, otherwise the leading k by m part of the array A must contain the matrix A. |
[in] | lda | INTEGER. On entry, LDA specifies the first dimension of A as declared in the calling (sub) program. When transA = MagmaNoTrans then LDA must be at least max( 1, m ), otherwise LDA must be at least max( 1, k ). |
[in] | B | DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is n when transB = MagmaNoTrans, and is k otherwise. Before entry with transB = MagmaNoTrans, the leading k by n part of the array B must contain the matrix B, otherwise the leading n by k part of the array B must contain the matrix B. |
[in] | ldb | INTEGER. On entry, LDB specifies the first dimension of B as declared in the calling (sub) program. When transB = MagmaNoTrans then LDB must be at least max( 1, k ), otherwise LDB must be at least max( 1, n ). |
[in] | beta | DOUBLE PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then C need not be set on input. |
[in,out] | C | DOUBLE PRECISION array of DIMENSION ( LDC, n ). Before entry, the leading m by n part of the array C must contain the matrix C, except when beta is zero, in which case C need not be set on entry. On exit, the array C is overwritten by the m by n matrix ( alpha*op( A )*op( B ) + beta*C ). |
[in] | ldc | INTEGER. On entry, LDC specifies the first dimension of C as declared in the calling (sub) program. LDC must be at least max( 1, m ). |
void magmablas_dsyr2k_mgpu2( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, magmaDouble_ptr dA[], magma_int_t ldda, magma_int_t a_offset, magmaDouble_ptr dB[], magma_int_t lddb, magma_int_t b_offset, double beta, magmaDouble_ptr dC[], magma_int_t lddc, magma_int_t c_offset, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue )
DSYR2K performs one of the symmetric rank 2k operations.
C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
or
C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
where alpha and beta are scalars with beta real, C is an n by n symmetric matrix and A and B are n by k matrices in the first case and k by n matrices in the second case.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array C is to be referenced as follows: UPLO = MagmaUpper: only the upper triangular part of C is to be referenced; UPLO = MagmaLower: only the lower triangular part of C is to be referenced. |
[in] | trans | magma_trans_t. On entry, TRANS specifies the operation to be performed as follows: TRANS = MagmaNoTrans: C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C; TRANS = MagmaTrans: C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C. |
[in] | n | INTEGER. On entry, N specifies the order of the matrix C. N must be at least zero. |
[in] | k | INTEGER. On entry with TRANS = MagmaNoTrans, K specifies the number of columns of the matrices A and B, and on entry with TRANS = MagmaTrans, K specifies the number of rows of the matrices A and B. K must be at least zero. |
[in] | alpha | DOUBLE PRECISION. On entry, ALPHA specifies the scalar alpha. |
[in] | dA | DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is k when TRANS = MagmaNoTrans, and is n otherwise. Before entry with TRANS = MagmaNoTrans, the leading n by k part of the array A must contain the matrix A, otherwise the leading k by n part of the array A must contain the matrix A. |
[TODO: describe distribution: duplicated on all GPUs.]
[in] | ldda | INTEGER. On entry, LDA specifies the first dimension of A as declared in the calling (sub) program. When TRANS = MagmaNoTrans then LDA must be at least max( 1, n ), otherwise LDA must be at least max( 1, k ). |
[in] | a_offset | INTEGER Row offset to start sub-matrix of dA. Uses dA(a_offset:a_offset+n, :). 0 <= a_offset < ldda. |
[in] | dB | DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is k when TRANS = MagmaNoTrans, and is n otherwise. Before entry with TRANS = MagmaNoTrans, the leading n by k part of the array B must contain the matrix B, otherwise the leading k by n part of the array B must contain the matrix B. |
[TODO: describe distribution: duplicated on all GPUs.]
[in] | lddb | INTEGER. On entry, LDB specifies the first dimension of B as declared in the calling (sub) program. When TRANS = MagmaNoTrans then LDB must be at least max( 1, n ), otherwise LDB must be at least max( 1, k ). |
[in] | b_offset | INTEGER Row offset to start sub-matrix of dB. Uses dB(b_offset:b_offset+n, :). 0 <= b_offset < lddb. |
[in] | beta | DOUBLE PRECISION. On entry, BETA specifies the scalar beta. |
[in,out] | dC | DOUBLE PRECISION array of DIMENSION ( LDC, n ). Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array C must contain the upper triangular part of the symmetric matrix and the strictly lower triangular part of C is not referenced. On exit, the upper triangular part of the array C is overwritten by the upper triangular part of the updated matrix. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array C must contain the lower triangular part of the symmetric matrix and the strictly upper triangular part of C is not referenced. On exit, the lower triangular part of the array C is overwritten by the lower triangular part of the updated matrix. Note that the imaginary parts of the diagonal elements need not be set, they are assumed to be zero, and on exit they are set to zero. [TODO: verify] |
[TODO: describe distribution: 1D column block-cyclic across GPUs.]
[in] | lddc | INTEGER. On entry, LDC specifies the first dimension of C as declared in the calling (sub) program. LDC must be at least max( 1, n ). |
[in] | c_offset | INTEGER. Row and column offset to start sub-matrix of dC. Uses dC(c_offset:c_offset+n, c_offset:c_offset+n). 0 <= c_offset < lddc. |
[in] | ngpu | INTEGER. Number of GPUs over which matrix C is distributed. |
[in] | nb | INTEGER. Block size used for distribution of C. |
[in] | queues | array of CUDA queues, of dimension ngpu by 20. Streams to use for running multiple GEMMs in parallel. Only up to nqueue queues are used on each GPU. |
[in] | nqueue | INTEGER. Number of queues to use on each device |
void magmablas_dsyr2k_mgpu_spec( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, magmaDouble_ptr dA[], magma_int_t ldda, magma_int_t a_offset, magmaDouble_ptr dB[], magma_int_t lddb, magma_int_t b_offset, double beta, magmaDouble_ptr dC[], magma_int_t lddc, magma_int_t c_offset, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue )
DSYR2K performs one of the symmetric rank 2k operations.
C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
or
C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
where alpha and beta are scalars with beta real, C is an n by n symmetric matrix and A and B are n by k matrices in the first case and k by n matrices in the second case.
This version assumes C has been symmetrized, so both upper and lower are stored, and it maintains the symmetry, doing twice the operations.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array C is to be referenced as follows: UPLO = MagmaUpper: only the upper triangular part of C is to be referenced; UPLO = MagmaLower: only the lower triangular part of C is to be referenced. |
[in] | trans | magma_trans_t. On entry, TRANS specifies the operation to be performed as follows: TRANS = MagmaNoTrans: C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C; TRANS = MagmaTrans: C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C. |
[in] | n | INTEGER. On entry, N specifies the order of the matrix C. N must be at least zero. |
[in] | k | INTEGER. On entry with TRANS = MagmaNoTrans, K specifies the number of columns of the matrices A and B, and on entry with TRANS = MagmaTrans, K specifies the number of rows of the matrices A and B. K must be at least zero. |
[in] | alpha | DOUBLE PRECISION. On entry, ALPHA specifies the scalar alpha. |
[in] | dA | DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is k when TRANS = MagmaNoTrans, and is n otherwise. Before entry with TRANS = MagmaNoTrans, the leading n by k part of the array A must contain the matrix A, otherwise the leading k by n part of the array A must contain the matrix A. |
[TODO: describe distribution: duplicated on all GPUs.]
[in] | ldda | INTEGER. On entry, LDA specifies the first dimension of A as declared in the calling (sub) program. When TRANS = MagmaNoTrans then LDA must be at least max( 1, n ), otherwise LDA must be at least max( 1, k ). |
[in] | a_offset | INTEGER Row offset to start sub-matrix of dA. Uses dA(a_offset:a_offset+n, :). 0 <= a_offset < ldda. |
[in] | dB | DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is k when TRANS = MagmaNoTrans, and is n otherwise. Before entry with TRANS = MagmaNoTrans, the leading n by k part of the array B must contain the matrix B, otherwise the leading k by n part of the array B must contain the matrix B. |
[TODO: describe distribution: duplicated on all GPUs.]
[in] | lddb | INTEGER. On entry, LDB specifies the first dimension of B as declared in the calling (sub) program. When TRANS = MagmaNoTrans then LDB must be at least max( 1, n ), otherwise LDB must be at least max( 1, k ). |
[in] | b_offset | INTEGER Row offset to start sub-matrix of dB. Uses dB(b_offset:b_offset+n, :). 0 <= b_offset < lddb. |
[in] | beta | DOUBLE PRECISION. On entry, BETA specifies the scalar beta. |
[in,out] | dC | DOUBLE PRECISION array of DIMENSION ( LDC, n ). Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array C must contain the upper triangular part of the symmetric matrix and the strictly lower triangular part of C is not referenced. On exit, the upper triangular part of the array C is overwritten by the upper triangular part of the updated matrix. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array C must contain the lower triangular part of the symmetric matrix and the strictly upper triangular part of C is not referenced. On exit, the lower triangular part of the array C is overwritten by the lower triangular part of the updated matrix. Note that the imaginary parts of the diagonal elements need not be set, they are assumed to be zero, and on exit they are set to zero. [TODO: verify] |
[TODO: describe distribution: 1D column block-cyclic across GPUs.]
[in] | lddc | INTEGER. On entry, LDC specifies the first dimension of C as declared in the calling (sub) program. LDC must be at least max( 1, n ). |
[in] | c_offset | INTEGER. Row and column offset to start sub-matrix of dC. Uses dC(c_offset:c_offset+n, c_offset:c_offset+n). 0 <= c_offset < lddc. |
[in] | ngpu | INTEGER. Number of GPUs over which matrix C is distributed. |
[in] | nb | INTEGER. Block size used for distribution of C. |
[in] | queues | array of CUDA queues, of dimension ngpu by 20. Streams to use for running multiple GEMMs in parallel. Only up to nqueue queues are used on each GPU. |
[in] | nqueue | INTEGER. Number of queues to use on each device |
void magmablas_dsyrk_batched( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, double const *const *dA_array, magma_int_t ldda, double beta, double **dC_array, magma_int_t lddc, magma_int_t batchCount, magma_queue_t queue )
DSYRK performs one of the symmetric rank k operations.
C := alpha*A*A**H + beta*C,
or
C := alpha*A**H*A + beta*C,
where alpha and beta are real scalars, C is an n by n symmetric matrix and A is an n by k matrix in the first case and a k by n matrix in the second case.
[in] | uplo | CHARACTER*1. On entry, uplo specifies whether the upper or lower triangular part of the array C is to be referenced as follows: |
uplo = 'U' or 'u' Only the upper triangular part of C is to be referenced.
uplo = 'L' or 'l' Only the lower triangular part of C is to be referenced.
[in] | trans | CHARACTER*1. On entry, trans specifies the operation to be performed as follows: |
trans = 'N' or 'n' C := alpha*A*A**H + beta*C.
trans = 'C' or 'c' C := alpha*A**H*A + beta*C.
[in] | n | INTEGER. On entry, specifies the order of the matrix C. N must be at least zero. |
[in] | k | INTEGER. On entry with trans = 'N' or 'n', k specifies the number of columns of the matrix A, and on entry with trans = 'C' or 'c', k specifies the number of rows of the matrix A. K must be at least zero. |
[in] | alpha | DOUBLE PRECISION On entry, ALPHA specifies the scalar alpha. |
[in] | dA | DOUBLE_PRECISION array of DIMENSION ( ldda, ka ), where ka is k when trans = MagmaNoTrans, and is n otherwise. Before entry with trans = MagmaNoTrans, the leading n by k part of the array dA must contain the matrix dA, otherwise the leading k by n part of the array dA must contain the matrix dA. |
[in] | ldda | INTEGER. On entry, ldda specifies the first dimension of A as declared in the calling (sub) program. When trans = MagmaNoTrans then ldda must be at least max( 1, n ), otherwise ldda must be at least max( 1, k ). |
[in] | beta | DOUBLE PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then dC need not be set on input. |
[in,out] | dC | DOUBLE_PRECISION array of DIMENSION ( lddc, n ). Before entry with uplo = 'U' or 'u', the leading n by n upper triangular part of the array C must contain the upper triangular part of the symmetric matrix and the strictly lower triangular part of C is not referenced. On exit, the upper triangular part of the array C is overwritten by the upper triangular part of the updated matrix. Before entry with uplo = 'L' or 'l', the leading n by n lower triangular part of the array C must contain the lower triangular part of the symmetric matrix and the strictly upper triangular part of C is not referenced. On exit, the lower triangular part of the array C is overwritten by the lower triangular part of the updated matrix. Note that the imaginary parts of the diagonal elements need not be set, they are assumed to be zero, and on exit they are set to zero. |
[in] | lddc | INTEGER. On entry, lddc specifies the first dimension of dC as declared in the calling (sub) program. lddc must be at least max( 1, n ). |
void magmablas_dtrsm( magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb )
Similar to magmablas_dtrsm_outofplace, but allocates dX and d_dinvA internally.
This makes it a synchronous call, whereas magmablas_dtrsm_outofplace and magmablas_dtrsm_work are asynchronous.
void magmablas_dtrsm_batched( magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, double **dA_array, magma_int_t ldda, double **dB_array, magma_int_t lddb, magma_int_t batchCount, magma_queue_t queue, cublasHandle_t myhandle )
void magmablas_dtrsm_outofplace( magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb, magmaDouble_ptr dX, magma_int_t lddx, magma_int_t flag, magmaDouble_ptr d_dinvA, magma_int_t dinvA_length )
dtrsm_outofplace solves one of the matrix equations on the GPU:
op(A)*X = alpha*B, or X*op(A) = alpha*B,
where alpha is a scalar, X and B are m by n matrices, A is a unit, or non-unit, upper or lower triangular matrix and op(A) is one of
op(A) = A, or op(A) = A^T, or op(A) = A^H.
The matrix X is output.
This is an asynchronous version of magmablas_dtrsm with flag, d_dinvA and dX workspaces as arguments.
[in] | side | magma_side_t. On entry, side specifies whether op(A) appears on the left or right of X as follows: side = MagmaLeft: op(A)*X = alpha*B; side = MagmaRight: X*op(A) = alpha*B. |
[in] | uplo | magma_uplo_t. On entry, uplo specifies whether the matrix A is an upper or lower triangular matrix as follows: uplo = MagmaUpper: A is an upper triangular matrix; uplo = MagmaLower: A is a lower triangular matrix. |
[in] | transA | magma_trans_t. On entry, transA specifies the form of op(A) to be used in the matrix multiplication as follows: transA = MagmaNoTrans: op(A) = A; transA = MagmaTrans: op(A) = A^T; transA = MagmaConjTrans: op(A) = A^H. |
[in] | diag | magma_diag_t. On entry, diag specifies whether or not A is unit triangular as follows: diag = MagmaUnit: A is assumed to be unit triangular; diag = MagmaNonUnit: A is not assumed to be unit triangular. |
[in] | m | INTEGER. On entry, m specifies the number of rows of B. m >= 0. |
[in] | n | INTEGER. On entry, n specifies the number of columns of B. n >= 0. |
[in] | alpha | DOUBLE_PRECISION. On entry, alpha specifies the scalar alpha. When alpha is zero then A is not referenced and B need not be set before entry. |
[in] | dA | DOUBLE_PRECISION array of dimension ( ldda, k ), where k is m when side = MagmaLeft and is n when side = MagmaRight. Before entry with uplo = MagmaUpper, the leading k by k upper triangular part of the array A must contain the upper triangular matrix and the strictly lower triangular part of A is not referenced. Before entry with uplo = MagmaLower, the leading k by k lower triangular part of the array A must contain the lower triangular matrix and the strictly upper triangular part of A is not referenced. Note that when diag = MagmaUnit, the diagonal elements of A are not referenced either, but are assumed to be unity. |
[in] | ldda | INTEGER. On entry, ldda specifies the first dimension of A. When side = MagmaLeft, ldda >= max( 1, m ), when side = MagmaRight, ldda >= max( 1, n ). |
[in] | dB | DOUBLE_PRECISION array of dimension ( lddb, n ). Before entry, the leading m by n part of the array B must contain the right-hand side matrix B. On exit, contents in the leading m by n part are destroyed. |
[in] | lddb | INTEGER. On entry, lddb specifies the first dimension of B. lddb >= max( 1, m ). |
[out] | dX | DOUBLE_PRECISION array of dimension ( lddx, n ). On exit, it contains the m by n solution matrix X. |
[in] | lddx | INTEGER. On entry, lddx specifies the first dimension of X. lddx >= max( 1, m ). |
[in] | flag | BOOLEAN. If flag is true, invert diagonal blocks. If flag is false, assume diagonal blocks (stored in d_dinvA) are already inverted. |
d_dinvA | (workspace) on device. If side == MagmaLeft, d_dinvA must be of size dinvA_length >= ceil(m/NB)*NB*NB; if side == MagmaRight, d_dinvA must be of size dinvA_length >= ceil(n/NB)*NB*NB, where NB = 128. |
[in] | dinvA_length | INTEGER. On entry, dinvA_length specifies the size of d_dinvA. |
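Putting the workspace rules above together, a hedged calling sketch (error checking omitted; the helper name is an assumption, while magma_dmalloc and magma_free are standard MAGMA allocation helpers) might look as follows:

    #include "magma.h"

    /* Left, lower, non-unit solve with explicit workspaces; the solution is
     * written to dX, and dB is used as scratch by the routine. */
    void dtrsm_outofplace_example( magma_int_t m, magma_int_t n, double alpha,
                                   magmaDouble_const_ptr dA, magma_int_t ldda,
                                   magmaDouble_ptr dB, magma_int_t lddb )
    {
        const magma_int_t NB = 128;
        magma_int_t lddx = (m > 0 ? m : 1);                       /* lddx >= max(1,m)        */
        magma_int_t dinvA_length = ((m + NB - 1)/NB)*NB*NB;       /* side = MagmaLeft sizing */

        magmaDouble_ptr dX, d_dinvA;
        magma_dmalloc( &dX,      lddx*n );
        magma_dmalloc( &d_dinvA, dinvA_length );

        /* flag = 1: the diagonal blocks of dA are inverted into d_dinvA inside the call. */
        magmablas_dtrsm_outofplace( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                                    m, n, alpha, dA, ldda, dB, lddb,
                                    dX, lddx, 1, d_dinvA, dinvA_length );

        /* ... use the m-by-n solution in dX ... */

        magma_free( dX );
        magma_free( d_dinvA );
    }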
void magmablas_dtrsm_outofplace_batched | ( | magma_side_t | side, |
magma_uplo_t | uplo, | ||
magma_trans_t | transA, | ||
magma_diag_t | diag, | ||
magma_int_t | flag, | ||
magma_int_t | m, | ||
magma_int_t | n, | ||
double | alpha, | ||
double ** | dA_array, | ||
magma_int_t | ldda, | ||
double ** | dB_array, | ||
magma_int_t | lddb, | ||
double ** | dX_array, | ||
magma_int_t | lddx, | ||
double ** | dinvA_array, | ||
magma_int_t | dinvA_length, | ||
double ** | dA_displ, | ||
double ** | dB_displ, | ||
double ** | dX_displ, | ||
double ** | dinvA_displ, | ||
magma_int_t | resetozero, | ||
magma_int_t | batchCount, | ||
magma_queue_t | queue, | ||
cublasHandle_t | myhandle | ||
) |
dtrsm_outofplace_batched solves, for each matrix in the batch, one of the matrix equations on the GPU
op(A)*X = alpha*B, or X*op(A) = alpha*B,
where alpha is a scalar, X and B are m by n matrices, A is a unit or non-unit, upper or lower triangular matrix, and op(A) is one of
op(A) = A, or op(A) = A^T, or op(A) = A^H.
The matrix X is output.
This is an asynchronous, batched version of magmablas_dtrsm, with flag, d_dinvA, and dX workspaces as arguments.
[in] | side | magma_side_t. On entry, side specifies whether op(A) appears on the left or right of X as follows: side = MagmaLeft: op(A)*X = alpha*B; side = MagmaRight: X*op(A) = alpha*B. |
[in] | uplo | magma_uplo_t. On entry, uplo specifies whether the matrix A is an upper or lower triangular matrix as follows: uplo = MagmaUpper: A is an upper triangular matrix; uplo = MagmaLower: A is a lower triangular matrix. |
[in] | transA | magma_trans_t. On entry, transA specifies the form of op(A) to be used in the matrix multiplication as follows: transA = MagmaNoTrans: op(A) = A; transA = MagmaTrans: op(A) = A^T; transA = MagmaConjTrans: op(A) = A^H. |
[in] | diag | magma_diag_t. On entry, diag specifies whether or not A is unit triangular as follows: diag = MagmaUnit: A is assumed to be unit triangular; diag = MagmaNonUnit: A is not assumed to be unit triangular. |
[in] | m | INTEGER. On entry, m specifies the number of rows of B. m >= 0. |
[in] | n | INTEGER. On entry, n specifies the number of columns of B. n >= 0. |
[in] | alpha | DOUBLE_PRECISION. On entry, alpha specifies the scalar alpha. When alpha is zero then A is not referenced and B need not be set before entry. |
[in] | dA | DOUBLE_PRECISION array of dimension ( ldda, k ), where k is m when side = MagmaLeft and is n when side = MagmaRight. Before entry with uplo = MagmaUpper, the leading k by k upper triangular part of the array A must contain the upper triangular matrix and the strictly lower triangular part of A is not referenced. Before entry with uplo = MagmaLower, the leading k by k lower triangular part of the array A must contain the lower triangular matrix and the strictly upper triangular part of A is not referenced. Note that when diag = MagmaUnit, the diagonal elements of A are not referenced either, but are assumed to be unity. |
[in] | ldda | INTEGER. On entry, ldda specifies the first dimension of A. When side = MagmaLeft, ldda >= max( 1, m ), when side = MagmaRight, ldda >= max( 1, n ). |
[in] | dB | DOUBLE_PRECISION array of dimension ( lddb, n ). Before entry, the leading m by n part of the array B must contain the right-hand side matrix B. |
[in] | lddb | INTEGER. On entry, lddb specifies the first dimension of B. lddb >= max( 1, m ). |
[in] | flag | BOOLEAN. If flag is true, invert diagonal blocks. If flag is false, assume diagonal blocks (stored in d_dinvA) are already inverted. |
d_dinvA | (workspace) on device. If side == MagmaLeft, d_dinvA must be of size >= ceil(m/TRI_NB)*TRI_NB*TRI_NB; if side == MagmaRight, d_dinvA must be of size >= ceil(n/TRI_NB)*TRI_NB*TRI_NB, where TRI_NB = 128. |
[out] | dX | DOUBLE_PRECISION array of dimension ( lddx, n ). On exit, it contains the m by n solution matrix X. |
void magmablas_dtrsm_work | ( | magma_side_t | side, |
magma_uplo_t | uplo, | ||
magma_trans_t | transA, | ||
magma_diag_t | diag, | ||
magma_int_t | m, | ||
magma_int_t | n, | ||
double | alpha, | ||
magmaDouble_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaDouble_ptr | dB, | ||
magma_int_t | lddb, | ||
magmaDouble_ptr | dX, | ||
magma_int_t | lddx, | ||
magma_int_t | flag, | ||
magmaDouble_ptr | d_dinvA, | ||
magma_int_t | dinvA_length | ||
) |
Similar to magmablas_dtrsm_outofplace, but copies the result from dX back to dB, as in the classical dtrsm interface.
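Relative to the out-of-place sketch shown earlier (an illustrative assumption, reusing the same workspaces and variable names), the only change is the call itself; afterwards the solution is found in dB rather than dX:

        /* Drop-in replacement for the magmablas_dtrsm_outofplace call above:
         * dX and d_dinvA are still required as workspaces, but the solution
         * is copied back into dB, as in the classical dtrsm interface. */
        magmablas_dtrsm_work( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                              m, n, alpha, dA, ldda, dB, lddb,
                              dX, lddx, 1, d_dinvA, dinvA_length );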
void magmablas_dtrsm_work_batched | ( | magma_side_t | side, |
magma_uplo_t | uplo, | ||
magma_trans_t | transA, | ||
magma_diag_t | diag, | ||
magma_int_t | flag, | ||
magma_int_t | m, | ||
magma_int_t | n, | ||
double | alpha, | ||
double ** | dA_array, | ||
magma_int_t | ldda, | ||
double ** | dB_array, | ||
magma_int_t | lddb, | ||
double ** | dX_array, | ||
magma_int_t | lddx, | ||
double ** | dinvA_array, | ||
magma_int_t | dinvA_length, | ||
double ** | dA_displ, | ||
double ** | dB_displ, | ||
double ** | dX_displ, | ||
double ** | dinvA_displ, | ||
magma_int_t | resetozero, | ||
magma_int_t | batchCount, | ||
magma_queue_t | queue, | ||
cublasHandle_t | myhandle | ||
) |
void magmablas_dtrtri_diag | ( | magma_uplo_t | uplo, |
magma_diag_t | diag, | ||
magma_int_t | n, | ||
magmaDouble_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaDouble_ptr | d_dinvA | ||
) |
void magmablas_dtrtri_diag_batched | ( | magma_uplo_t | uplo, |
magma_diag_t | diag, | ||
magma_int_t | n, | ||
double const *const * | dA_array, | ||
magma_int_t | ldda, | ||
double ** | dinvA_array, | ||
magma_int_t | resetozero, | ||
magma_int_t | batchCount, | ||
magma_queue_t | queue | ||
) |
Inverts the NB x NB diagonal blocks of a triangular matrix; this routine is used in dtrsm.
Batched version of dtrtri_diag: for each matrix in the batch, it inverts the NB x NB diagonal blocks of A, and it adds the batchCount and queue arguments.
[in] | uplo | magma_uplo_t. On entry, uplo specifies whether the matrix A is an upper or lower triangular matrix as follows: uplo = MagmaUpper: A is an upper triangular matrix; uplo = MagmaLower: A is a lower triangular matrix. |
[in] | diag | magma_diag_t. On entry, diag specifies whether or not A is unit triangular as follows: diag = MagmaUnit: A is assumed to be unit triangular; diag = MagmaNonUnit: A is not assumed to be unit triangular. |
[in] | n | INTEGER. On entry, n specifies the order of the matrix A. N >= 0. |
[in] | dA_array | Array of pointers, one per matrix in the batch; each points to a DOUBLE_PRECISION array of dimension ( ldda, n ) holding the triangular matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = 'U', the diagonal elements of A are also not referenced and are assumed to be 1. |
[in] | ldda | INTEGER. The leading dimension of the array A. LDDA >= max(1,N). |
[out] | dinvA_array | Array of pointers, one per matrix in the batch; each points to a DOUBLE_PRECISION array of dimension (NB, ceil(n/NB)*NB), where NB = 128. On exit, each contains the inverses of the NB-by-NB diagonal blocks of the corresponding A. |
[in] | queue | magma_queue_t Queue to execute in. |
void magmablas_dtrtri_diag_q | ( | magma_uplo_t | uplo, |
magma_diag_t | diag, | ||
magma_int_t | n, | ||
magmaDouble_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaDouble_ptr | d_dinvA, | ||
magma_queue_t | queue | ||
) |
Inverts the NB x NB diagonal blocks of a triangular matrix A; this routine is used in dtrsm.
Same as dtrtri_diag, but adds a queue argument.
[in] | uplo | magma_uplo_t. On entry, uplo specifies whether the matrix A is an upper or lower triangular matrix as follows: uplo = MagmaUpper: A is an upper triangular matrix; uplo = MagmaLower: A is a lower triangular matrix. |
[in] | diag | magma_diag_t. On entry, diag specifies whether or not A is unit triangular as follows: diag = MagmaUnit: A is assumed to be unit triangular; diag = MagmaNonUnit: A is not assumed to be unit triangular. |
[in] | n | INTEGER. On entry, n specifies the order of the matrix A. N >= 0. |
[in] | dA | DOUBLE_PRECISION array of dimension ( ldda, n ) The triangular matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = 'U', the diagonal elements of A are also not referenced and are assumed to be 1. |
[in] | ldda | INTEGER. The leading dimension of the array A. LDDA >= max(1,N). |
[out] | d_dinvA | DOUBLE_PRECISION array of dimension (NB, ceil(n/NB)*NB), where NB = 128. On exit, contains inverses of the NB-by-NB diagonal blocks of A. |
[in] | queue | magma_queue_t Queue to execute in. |
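To illustrate how dtrtri_diag interacts with the flag and d_dinvA arguments of magmablas_dtrsm_outofplace, here is a hedged sketch (the function name and the repeated right-hand-side scenario are assumptions, not MAGMA code) that inverts the diagonal blocks once and reuses them across several solves with flag = 0:

    #include "magma.h"

    /* Pre-invert the 128 x 128 diagonal blocks of the m x m lower triangular
     * matrix dA once, then solve against several right-hand-side matrices,
     * passing flag = 0 so the inverted blocks in d_dinvA are reused. */
    void solve_repeated_rhs( magma_int_t m, magma_int_t n, magma_int_t nrhs_sets,
                             magmaDouble_const_ptr dA, magma_int_t ldda,
                             magmaDouble_ptr *dB_list, magma_int_t lddb )
    {
        const magma_int_t NB = 128;
        magma_int_t lddx = (m > 0 ? m : 1);
        magma_int_t dinvA_length = ((m + NB - 1)/NB)*NB*NB;

        magmaDouble_ptr dX, d_dinvA;
        magma_dmalloc( &dX,      lddx*n );
        magma_dmalloc( &d_dinvA, dinvA_length );

        /* Invert the NB x NB diagonal blocks of A once. */
        magmablas_dtrtri_diag( MagmaLower, MagmaNonUnit, m, dA, ldda, d_dinvA );

        for (magma_int_t i = 0; i < nrhs_sets; ++i) {
            /* flag = 0: skip the inversion and reuse the blocks in d_dinvA. */
            magmablas_dtrsm_outofplace( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                                        m, n, 1.0, dA, ldda, dB_list[i], lddb,
                                        dX, lddx, 0, d_dinvA, dinvA_length );
            /* ... consume the solution in dX before the next iteration ... */
        }

        magma_free( dX );
        magma_free( d_dinvA );
    }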