Functions
void	magmablas_cgetmatrix_transpose (magma_int_t m, magma_int_t n, magma_int_t nb, magmaFloatComplex_const_ptr dAT, magma_int_t ldda, magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dwork, magma_int_t lddw, magma_queue_t queues[2])
	Copy and transpose matrix dAT on GPU device to hA on CPU host. More...

void	magmablas_cgetmatrix_transpose_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magmaFloatComplex_const_ptr const dAT[], magma_int_t ldda, magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dwork[], magma_int_t lddw, magma_queue_t queues[][2])
	Copy and transpose matrix dAT, which is distributed row block cyclic over multiple GPUs, to hA on CPU host. More...

void	magmablas_dgetmatrix_transpose (magma_int_t m, magma_int_t n, magma_int_t nb, magmaDouble_const_ptr dAT, magma_int_t ldda, double *hA, magma_int_t lda, magmaDouble_ptr dwork, magma_int_t lddw, magma_queue_t queues[2])
	Copy and transpose matrix dAT on GPU device to hA on CPU host. More...

void	magmablas_dgetmatrix_transpose_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magmaDouble_const_ptr const dAT[], magma_int_t ldda, double *hA, magma_int_t lda, magmaDouble_ptr dwork[], magma_int_t lddw, magma_queue_t queues[][2])
	Copy and transpose matrix dAT, which is distributed row block cyclic over multiple GPUs, to hA on CPU host. More...

void	magmablas_sgetmatrix_transpose (magma_int_t m, magma_int_t n, magma_int_t nb, magmaFloat_const_ptr dAT, magma_int_t ldda, float *hA, magma_int_t lda, magmaFloat_ptr dwork, magma_int_t lddw, magma_queue_t queues[2])
	Copy and transpose matrix dAT on GPU device to hA on CPU host. More...

void	magmablas_sgetmatrix_transpose_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magmaFloat_const_ptr const dAT[], magma_int_t ldda, float *hA, magma_int_t lda, magmaFloat_ptr dwork[], magma_int_t lddw, magma_queue_t queues[][2])
	Copy and transpose matrix dAT, which is distributed row block cyclic over multiple GPUs, to hA on CPU host. More...

void	magmablas_zgetmatrix_transpose (magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex_const_ptr dAT, magma_int_t ldda, magmaDoubleComplex *hA, magma_int_t lda, magmaDoubleComplex_ptr dwork, magma_int_t lddw, magma_queue_t queues[2])
	Copy and transpose matrix dAT on GPU device to hA on CPU host. More...

void	magmablas_zgetmatrix_transpose_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex_const_ptr const dAT[], magma_int_t ldda, magmaDoubleComplex *hA, magma_int_t lda, magmaDoubleComplex_ptr dwork[], magma_int_t lddw, magma_queue_t queues[][2])
	Copy and transpose matrix dAT, which is distributed row block cyclic over multiple GPUs, to hA on CPU host. More...

Detailed Description

Function Documentation

void magmablas_cgetmatrix_transpose	(	magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magmaFloatComplex_const_ptr	dAT,
		magma_int_t	ldda,
		magmaFloatComplex *	hA,
		magma_int_t	lda,
		magmaFloatComplex_ptr	dwork,
		magma_int_t	lddw,
		magma_queue_t	queues[2]
	)

Copy and transpose matrix dAT on GPU device to hA on CPU host.

Parameters

[in]	m	Number of rows of output matrix hA. m >= 0.
[in]	n	Number of columns of output matrix hA. n >= 0.
[in]	nb	Block size. nb >= 0.
[in]	dAT	The n-by-m matrix A^T on the GPU, of dimension (ldda,m).
[in]	ldda	Leading dimension of matrix dAT. ldda >= n.
[out]	hA	The m-by-n matrix A on the CPU, of dimension (lda,n).
[in]	lda	Leading dimension of matrix hA. lda >= m.
[out]	dwork	Workspace on the GPU, of dimension (2*lddw*nb).
[in]	lddw	Leading dimension of dwork. lddw >= m.
[in]	queues	Array of two queues, to pipeline operation.

void magmablas_cgetmatrix_transpose_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magmaFloatComplex_const_ptr const	dAT[],
		magma_int_t	ldda,
		magmaFloatComplex *	hA,
		magma_int_t	lda,
		magmaFloatComplex_ptr	dwork[],
		magma_int_t	lddw,
		magma_queue_t	queues[][2]
	)

Copy and transpose matrix dAT, which is distributed row block cyclic over multiple GPUs, to hA on CPU host.

Parameters

[in]	ngpu	Number of GPUs over which dAT is distributed.
[in]	m	Number of rows of output matrix hA. m >= 0.
[in]	n	Number of columns of output matrix hA. n >= 0.
[in]	nb	Block size. nb >= 0.
[in]	dAT	Array of ngpu pointers, one per GPU, that store the distributed n-by-m matrix A^T on the GPUs, each of dimension (ldda,m).
[in]	ldda	Leading dimension of each matrix dAT on each GPU. ngpu*ldda >= n.
[out]	hA	The m-by-n matrix A on the CPU, of dimension (lda,n).
[in]	lda	Leading dimension of matrix hA. lda >= m.
[out]	dwork	Array of ngpu pointers, one per GPU, that store the workspaces on each GPU, each of dimension (2*lddw*nb).
[in]	lddw	Leading dimension of dwork. lddw >= m.
[in]	queues	2D array of dimension (ngpu,2), with two queues per GPU.

void magmablas_dgetmatrix_transpose	(	magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magmaDouble_const_ptr	dAT,
		magma_int_t	ldda,
		double *	hA,
		magma_int_t	lda,
		magmaDouble_ptr	dwork,
		magma_int_t	lddw,
		magma_queue_t	queues[2]
	)

Copy and transpose matrix dAT on GPU device to hA on CPU host.

Parameters

[in]	m	Number of rows of output matrix hA. m >= 0.
[in]	n	Number of columns of output matrix hA. n >= 0.
[in]	nb	Block size. nb >= 0.
[in]	dAT	The n-by-m matrix A^T on the GPU, of dimension (ldda,m).
[in]	ldda	Leading dimension of matrix dAT. ldda >= n.
[out]	hA	The m-by-n matrix A on the CPU, of dimension (lda,n).
[in]	lda	Leading dimension of matrix hA. lda >= m.
[out]	dwork	Workspace on the GPU, of dimension (2*lddw*nb).
[in]	lddw	Leading dimension of dwork. lddw >= m.
[in]	queues	Array of two queues, to pipeline operation.

void magmablas_dgetmatrix_transpose_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magmaDouble_const_ptr const	dAT[],
		magma_int_t	ldda,
		double *	hA,
		magma_int_t	lda,
		magmaDouble_ptr	dwork[],
		magma_int_t	lddw,
		magma_queue_t	queues[][2]
	)

Copy and transpose matrix dAT, which is distributed row block cyclic over multiple GPUs, to hA on CPU host.

Parameters

[in]	ngpu	Number of GPUs over which dAT is distributed.
[in]	m	Number of rows of output matrix hA. m >= 0.
[in]	n	Number of columns of output matrix hA. n >= 0.
[in]	nb	Block size. nb >= 0.
[in]	dAT	Array of ngpu pointers, one per GPU, that store the distributed n-by-m matrix A^T on the GPUs, each of dimension (ldda,m).
[in]	ldda	Leading dimension of each matrix dAT on each GPU. ngpu*ldda >= n.
[out]	hA	The m-by-n matrix A on the CPU, of dimension (lda,n).
[in]	lda	Leading dimension of matrix hA. lda >= m.
[out]	dwork	Array of ngpu pointers, one per GPU, that store the workspaces on each GPU, each of dimension (2*lddw*nb).
[in]	lddw	Leading dimension of dwork. lddw >= m.
[in]	queues	2D array of dimension (ngpu,2), with two queues per GPU.

void magmablas_sgetmatrix_transpose	(	magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magmaFloat_const_ptr	dAT,
		magma_int_t	ldda,
		float *	hA,
		magma_int_t	lda,
		magmaFloat_ptr	dwork,
		magma_int_t	lddw,
		magma_queue_t	queues[2]
	)

Copy and transpose matrix dAT on GPU device to hA on CPU host.

Parameters

[in]	m	Number of rows of output matrix hA. m >= 0.
[in]	n	Number of columns of output matrix hA. n >= 0.
[in]	nb	Block size. nb >= 0.
[in]	dAT	The n-by-m matrix A^T on the GPU, of dimension (ldda,m).
[in]	ldda	Leading dimension of matrix dAT. ldda >= n.
[out]	hA	The m-by-n matrix A on the CPU, of dimension (lda,n).
[in]	lda	Leading dimension of matrix hA. lda >= m.
[out]	dwork	Workspace on the GPU, of dimension (2*lddw*nb).
[in]	lddw	Leading dimension of dwork. lddw >= m.
[in]	queues	Array of two queues, to pipeline operation.

void magmablas_sgetmatrix_transpose_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magmaFloat_const_ptr const	dAT[],
		magma_int_t	ldda,
		float *	hA,
		magma_int_t	lda,
		magmaFloat_ptr	dwork[],
		magma_int_t	lddw,
		magma_queue_t	queues[][2]
	)

Copy and transpose matrix dAT, which is distributed row block cyclic over multiple GPUs, to hA on CPU host.

Parameters

[in]	ngpu	Number of GPUs over which dAT is distributed.
[in]	m	Number of rows of output matrix hA. m >= 0.
[in]	n	Number of columns of output matrix hA. n >= 0.
[in]	nb	Block size. nb >= 0.
[in]	dAT	Array of ngpu pointers, one per GPU, that store the distributed n-by-m matrix A^T on the GPUs, each of dimension (ldda,m).
[in]	ldda	Leading dimension of each matrix dAT on each GPU. ngpu*ldda >= n.
[out]	hA	The m-by-n matrix A on the CPU, of dimension (lda,n).
[in]	lda	Leading dimension of matrix hA. lda >= m.
[out]	dwork	Array of ngpu pointers, one per GPU, that store the workspaces on each GPU, each of dimension (2*lddw*nb).
[in]	lddw	Leading dimension of dwork. lddw >= m.
[in]	queues	2D array of dimension (ngpu,2), with two queues per GPU.

void magmablas_zgetmatrix_transpose	(	magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magmaDoubleComplex_const_ptr	dAT,
		magma_int_t	ldda,
		magmaDoubleComplex *	hA,
		magma_int_t	lda,
		magmaDoubleComplex_ptr	dwork,
		magma_int_t	lddw,
		magma_queue_t	queues[2]
	)

Copy and transpose matrix dAT on GPU device to hA on CPU host.

Parameters

[in]	m	Number of rows of output matrix hA. m >= 0.
[in]	n	Number of columns of output matrix hA. n >= 0.
[in]	nb	Block size. nb >= 0.
[in]	dAT	The n-by-m matrix A^T on the GPU, of dimension (ldda,m).
[in]	ldda	Leading dimension of matrix dAT. ldda >= n.
[out]	hA	The m-by-n matrix A on the CPU, of dimension (lda,n).
[in]	lda	Leading dimension of matrix hA. lda >= m.
[out]	dwork	Workspace on the GPU, of dimension (2*lddw*nb).
[in]	lddw	Leading dimension of dwork. lddw >= m.
[in]	queues	Array of two queues, to pipeline operation.

void magmablas_zgetmatrix_transpose_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magmaDoubleComplex_const_ptr const	dAT[],
		magma_int_t	ldda,
		magmaDoubleComplex *	hA,
		magma_int_t	lda,
		magmaDoubleComplex_ptr	dwork[],
		magma_int_t	lddw,
		magma_queue_t	queues[][2]
	)

Copy and transpose matrix dAT, which is distributed row block cyclic over multiple GPUs, to hA on CPU host.

Parameters

[in]	ngpu	Number of GPUs over which dAT is distributed.
[in]	m	Number of rows of output matrix hA. m >= 0.
[in]	n	Number of columns of output matrix hA. n >= 0.
[in]	nb	Block size. nb >= 0.
[in]	dAT	Array of ngpu pointers, one per GPU, that store the distributed n-by-m matrix A^T on the GPUs, each of dimension (ldda,m).
[in]	ldda	Leading dimension of each matrix dAT on each GPU. ngpu*ldda >= n.
[out]	hA	The m-by-n matrix A on the CPU, of dimension (lda,n).
[in]	lda	Leading dimension of matrix hA. lda >= m.
[out]	dwork	Array of ngpu pointers, one per GPU, that store the workspaces on each GPU, each of dimension (2*lddw*nb).
[in]	lddw	Leading dimension of dwork. lddw >= m.
[in]	queues	2D array of dimension (ngpu,2), with two queues per GPU.

Functions

Detailed Description

Function Documentation