![]() |
MAGMA 2.9.0
Matrix Algebra for GPU and Multicore Architectures
|
\(y = \alpha Ax + \beta y\) More...
Functions | |
void | magma_chemv (magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dy, magma_int_t incy, magma_queue_t queue) |
Perform Hermitian matrix-vector product, \( y = \alpha A x + \beta y, \) where \( A \) is Hermitian. | |
void | magma_zhemv (magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dx, magma_int_t incx, magmaDoubleComplex beta, magmaDoubleComplex_ptr dy, magma_int_t incy, magma_queue_t queue) |
Perform Hermitian matrix-vector product, \( y = \alpha A x + \beta y, \) where \( A \) is Hermitian. | |
magma_int_t | magmablas_chemv_work (magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dy, magma_int_t incy, magmaFloatComplex_ptr dwork, magma_int_t lwork, magma_queue_t queue) |
magmablas_chemv_work performs the matrix-vector operation: | |
magma_int_t | magmablas_chemv (magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dy, magma_int_t incy, magma_queue_t queue) |
magmablas_chemv performs the matrix-vector operation: | |
magma_int_t | magmablas_chemv_mgpu (magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr const d_lA[], magma_int_t ldda, magma_int_t offset, magmaFloatComplex const *x, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex *y, magma_int_t incy, magmaFloatComplex *hwork, magma_int_t lhwork, magmaFloatComplex_ptr dwork[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[]) |
magmablas_chemv_mgpu performs the matrix-vector operation: | |
magma_int_t | magmablas_chemv_mgpu_sync (magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr const d_lA[], magma_int_t ldda, magma_int_t offset, magmaFloatComplex const *x, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex *y, magma_int_t incy, magmaFloatComplex *hwork, magma_int_t lhwork, magmaFloatComplex_ptr dwork[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[]) |
Synchronizes and accumulates final chemv result. | |
magma_int_t | magmablas_dsymv_work (magma_uplo_t uplo, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dx, magma_int_t incx, double beta, magmaDouble_ptr dy, magma_int_t incy, magmaDouble_ptr dwork, magma_int_t lwork, magma_queue_t queue) |
magmablas_dsymv_work performs the matrix-vector operation: | |
magma_int_t | magmablas_dsymv (magma_uplo_t uplo, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dx, magma_int_t incx, double beta, magmaDouble_ptr dy, magma_int_t incy, magma_queue_t queue) |
magmablas_dsymv performs the matrix-vector operation: | |
magma_int_t | magmablas_dsymv_mgpu (magma_uplo_t uplo, magma_int_t n, double alpha, magmaDouble_const_ptr const d_lA[], magma_int_t ldda, magma_int_t offset, double const *x, magma_int_t incx, double beta, double *y, magma_int_t incy, double *hwork, magma_int_t lhwork, magmaDouble_ptr dwork[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[]) |
magmablas_dsymv_mgpu performs the matrix-vector operation: | |
magma_int_t | magmablas_dsymv_mgpu_sync (magma_uplo_t uplo, magma_int_t n, double alpha, magmaDouble_const_ptr const d_lA[], magma_int_t ldda, magma_int_t offset, double const *x, magma_int_t incx, double beta, double *y, magma_int_t incy, double *hwork, magma_int_t lhwork, magmaDouble_ptr dwork[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[]) |
Synchronizes and accumulates final dsymv result. | |
magma_int_t | magmablas_ssymv_work (magma_uplo_t uplo, magma_int_t n, float alpha, magmaFloat_const_ptr dA, magma_int_t ldda, magmaFloat_const_ptr dx, magma_int_t incx, float beta, magmaFloat_ptr dy, magma_int_t incy, magmaFloat_ptr dwork, magma_int_t lwork, magma_queue_t queue) |
magmablas_ssymv_work performs the matrix-vector operation: | |
magma_int_t | magmablas_ssymv (magma_uplo_t uplo, magma_int_t n, float alpha, magmaFloat_const_ptr dA, magma_int_t ldda, magmaFloat_const_ptr dx, magma_int_t incx, float beta, magmaFloat_ptr dy, magma_int_t incy, magma_queue_t queue) |
magmablas_ssymv performs the matrix-vector operation: | |
magma_int_t | magmablas_ssymv_mgpu (magma_uplo_t uplo, magma_int_t n, float alpha, magmaFloat_const_ptr const d_lA[], magma_int_t ldda, magma_int_t offset, float const *x, magma_int_t incx, float beta, float *y, magma_int_t incy, float *hwork, magma_int_t lhwork, magmaFloat_ptr dwork[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[]) |
magmablas_ssymv_mgpu performs the matrix-vector operation: | |
magma_int_t | magmablas_ssymv_mgpu_sync (magma_uplo_t uplo, magma_int_t n, float alpha, magmaFloat_const_ptr const d_lA[], magma_int_t ldda, magma_int_t offset, float const *x, magma_int_t incx, float beta, float *y, magma_int_t incy, float *hwork, magma_int_t lhwork, magmaFloat_ptr dwork[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[]) |
Synchronizes and accumulates final ssymv result. | |
magma_int_t | magmablas_zhemv_work (magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dx, magma_int_t incx, magmaDoubleComplex beta, magmaDoubleComplex_ptr dy, magma_int_t incy, magmaDoubleComplex_ptr dwork, magma_int_t lwork, magma_queue_t queue) |
magmablas_zhemv_work performs the matrix-vector operation: | |
magma_int_t | magmablas_zhemv (magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dx, magma_int_t incx, magmaDoubleComplex beta, magmaDoubleComplex_ptr dy, magma_int_t incy, magma_queue_t queue) |
magmablas_zhemv performs the matrix-vector operation: | |
magma_int_t | magmablas_zhemv_mgpu (magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr const d_lA[], magma_int_t ldda, magma_int_t offset, magmaDoubleComplex const *x, magma_int_t incx, magmaDoubleComplex beta, magmaDoubleComplex *y, magma_int_t incy, magmaDoubleComplex *hwork, magma_int_t lhwork, magmaDoubleComplex_ptr dwork[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[]) |
magmablas_zhemv_mgpu performs the matrix-vector operation: | |
magma_int_t | magmablas_zhemv_mgpu_sync (magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr const d_lA[], magma_int_t ldda, magma_int_t offset, magmaDoubleComplex const *x, magma_int_t incx, magmaDoubleComplex beta, magmaDoubleComplex *y, magma_int_t incy, magmaDoubleComplex *hwork, magma_int_t lhwork, magmaDoubleComplex_ptr dwork[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[]) |
Synchronizes and accumulates final zhemv result. | |
\(y = \alpha Ax + \beta y\)
void magma_chemv | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
magmaFloatComplex | alpha, | ||
magmaFloatComplex_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaFloatComplex_const_ptr | dx, | ||
magma_int_t | incx, | ||
magmaFloatComplex | beta, | ||
magmaFloatComplex_ptr | dy, | ||
magma_int_t | incy, | ||
magma_queue_t | queue ) |
Perform Hermitian matrix-vector product, \( y = \alpha A x + \beta y, \) where \( A \) is Hermitian.
[in] | uplo | Whether the upper or lower triangle of A is referenced. |
[in] | n | Number of rows and columns of A. n >= 0. |
[in] | alpha | Scalar \( \alpha \) |
[in] | dA | COMPLEX array of dimension (ldda,n), ldda >= max(1,n). The n-by-n matrix A, on GPU device. |
[in] | ldda | Leading dimension of dA. |
[in] | dx | COMPLEX array on GPU device. The n element vector x of dimension (1 + (n-1)*incx). |
[in] | incx | Stride between consecutive elements of dx. incx != 0. |
[in] | beta | Scalar \( \beta \) |
[in,out] | dy | COMPLEX array on GPU device. The n element vector y of dimension (1 + (n-1)*incy). |
[in] | incy | Stride between consecutive elements of dy. incy != 0. |
[in] | queue | magma_queue_t Queue to execute in. |
void magma_zhemv | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
magmaDoubleComplex | alpha, | ||
magmaDoubleComplex_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaDoubleComplex_const_ptr | dx, | ||
magma_int_t | incx, | ||
magmaDoubleComplex | beta, | ||
magmaDoubleComplex_ptr | dy, | ||
magma_int_t | incy, | ||
magma_queue_t | queue ) |
Perform Hermitian matrix-vector product, \( y = \alpha A x + \beta y, \) where \( A \) is Hermitian.
[in] | uplo | Whether the upper or lower triangle of A is referenced. |
[in] | n | Number of rows and columns of A. n >= 0. |
[in] | alpha | Scalar \( \alpha \) |
[in] | dA | COMPLEX_16 array of dimension (ldda,n), ldda >= max(1,n). The n-by-n matrix A, on GPU device. |
[in] | ldda | Leading dimension of dA. |
[in] | dx | COMPLEX_16 array on GPU device. The n element vector x of dimension (1 + (n-1)*incx). |
[in] | incx | Stride between consecutive elements of dx. incx != 0. |
[in] | beta | Scalar \( \beta \) |
[in,out] | dy | COMPLEX_16 array on GPU device. The n element vector y of dimension (1 + (n-1)*incy). |
[in] | incy | Stride between consecutive elements of dy. incy != 0. |
[in] | queue | magma_queue_t Queue to execute in. |
magma_int_t magmablas_chemv_work | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
magmaFloatComplex | alpha, | ||
magmaFloatComplex_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaFloatComplex_const_ptr | dx, | ||
magma_int_t | incx, | ||
magmaFloatComplex | beta, | ||
magmaFloatComplex_ptr | dy, | ||
magma_int_t | incy, | ||
magmaFloatComplex_ptr | dwork, | ||
magma_int_t | lwork, | ||
magma_queue_t | queue ) |
magmablas_chemv_work performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n Hermitian matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | COMPLEX. On entry, ALPHA specifies the scalar alpha. |
[in] | dA | COMPLEX array of DIMENSION ( LDDA, n ). Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the Hermitian matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the Hermitian matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n ). It is recommended that ldda is multiple of 16. Otherwise performance would be deteriorated as the memory accesses would not be fully coalescent. |
[in] | dx | COMPLEX array of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | COMPLEX. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | dy | COMPLEX array of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
[in] | dwork | (workspace) COMPLEX array on the GPU, dimension (MAX(1, LWORK)), |
[in] | lwork | INTEGER. The dimension of the array DWORK. LWORK >= LDDA * ceil( N / NB_X ), where NB_X = 64. |
[in] | queue | magma_queue_t. Queue to execute in. |
MAGMA implements chemv through two steps: 1) perform the multiplication in each thread block and put the intermediate value in dwork. 2) sum the intermediate values and store the final result in y.
magmablas_chemv_work requires users to provide a workspace, while magmablas_chemv is a wrapper routine allocating the workspace inside the routine and provides the same interface as cublas.
If users need to call chemv frequently, we suggest using magmablas_chemv_work instead of magmablas_chemv, as the overhead to allocate and free device memory in magmablas_chemv would hurt performance. Our tests show that this penalty is about 10 Gflop/s when the matrix size is around 10000.
magma_int_t magmablas_chemv | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
magmaFloatComplex | alpha, | ||
magmaFloatComplex_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaFloatComplex_const_ptr | dx, | ||
magma_int_t | incx, | ||
magmaFloatComplex | beta, | ||
magmaFloatComplex_ptr | dy, | ||
magma_int_t | incy, | ||
magma_queue_t | queue ) |
magmablas_chemv performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n Hermitian matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | COMPLEX. On entry, ALPHA specifies the scalar alpha. |
[in] | dA | COMPLEX array of DIMENSION ( LDDA, n ). Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the Hermitian matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the Hermitian matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n ). It is recommended that ldda is multiple of 16. Otherwise performance would be deteriorated as the memory accesses would not be fully coalescent. |
[in] | dx | COMPLEX array of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | COMPLEX. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | dy | COMPLEX array of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
[in] | queue | magma_queue_t Queue to execute in. |
magma_int_t magmablas_chemv_mgpu | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
magmaFloatComplex | alpha, | ||
magmaFloatComplex_const_ptr const | d_lA[], | ||
magma_int_t | ldda, | ||
magma_int_t | offset, | ||
magmaFloatComplex const * | x, | ||
magma_int_t | incx, | ||
magmaFloatComplex | beta, | ||
magmaFloatComplex * | y, | ||
magma_int_t | incy, | ||
magmaFloatComplex * | hwork, | ||
magma_int_t | lhwork, | ||
magmaFloatComplex_ptr | dwork[], | ||
magma_int_t | ldwork, | ||
magma_int_t | ngpu, | ||
magma_int_t | nb, | ||
magma_queue_t | queues[] ) |
magmablas_chemv_mgpu performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n Hermitian matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | COMPLEX. On entry, ALPHA specifies the scalar alpha. |
[in] | d_lA | Array of pointers, dimension (ngpu), to block-column distributed matrix A, with block size nb. d_lA[dev] is a COMPLEX array on GPU dev, of dimension (LDDA, nlocal), where { floor(n/nb/ngpu)*nb + nb if dev < floor(n/nb) % ngpu, nlocal = { floor(n/nb/ngpu)*nb + nnb if dev == floor(n/nb) % ngpu, { floor(n/nb/ngpu)*nb otherwise. Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the Hermitian matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the Hermitian matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | offset | INTEGER. Row & column offset to start of matrix A within the distributed d_lA structure. Note that N is the size of this multiply, excluding the offset, so the size of the original parent matrix is N+offset. Also, x and y do not have an offset. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n + offset ). It is recommended that ldda is multiple of 16. Otherwise performance would be deteriorated as the memory accesses would not be fully coalescent. |
[in] | x | COMPLEX array on the CPU (not the GPU), of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | COMPLEX. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | y | COMPLEX array on the CPU (not the GPU), of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
hwork | (workspace) COMPLEX array on the CPU, of dimension (lhwork). | |
[in] | lhwork | INTEGER. The dimension of the array hwork. lhwork >= ngpu*nb. |
dwork | (workspaces) Array of pointers, dimension (ngpu), to workspace on each GPU. dwork[dev] is a COMPLEX array on GPU dev, of dimension (ldwork). | |
[in] | ldwork | INTEGER. The dimension of each array dwork[dev]. ldwork >= ldda*( ceil((n + offset % nb) / nb) + 1 ). |
[in] | ngpu | INTEGER. The number of GPUs to use. |
[in] | nb | INTEGER. The block size used for distributing d_lA. Must be 64. |
[in] | queues | magma_queue_t array of dimension (ngpu). queues[dev] is an execution queue on GPU dev. |
magma_int_t magmablas_chemv_mgpu_sync | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
magmaFloatComplex | alpha, | ||
magmaFloatComplex_const_ptr const | d_lA[], | ||
magma_int_t | ldda, | ||
magma_int_t | offset, | ||
magmaFloatComplex const * | x, | ||
magma_int_t | incx, | ||
magmaFloatComplex | beta, | ||
magmaFloatComplex * | y, | ||
magma_int_t | incy, | ||
magmaFloatComplex * | hwork, | ||
magma_int_t | lhwork, | ||
magmaFloatComplex_ptr | dwork[], | ||
magma_int_t | ldwork, | ||
magma_int_t | ngpu, | ||
magma_int_t | nb, | ||
magma_queue_t | queues[] ) |
Synchronizes and accumulates final chemv result.
For convenience, the parameters are identical to magmablas_chemv_mgpu (though some are unused here).
magma_int_t magmablas_dsymv_work | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
double | alpha, | ||
magmaDouble_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaDouble_const_ptr | dx, | ||
magma_int_t | incx, | ||
double | beta, | ||
magmaDouble_ptr | dy, | ||
magma_int_t | incy, | ||
magmaDouble_ptr | dwork, | ||
magma_int_t | lwork, | ||
magma_queue_t | queue ) |
magmablas_dsymv_work performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n symmetric matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | DOUBLE PRECISION. On entry, ALPHA specifies the scalar alpha. |
[in] | dA | DOUBLE PRECISION array of DIMENSION ( LDDA, n ). Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the symmetric matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the symmetric matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n ). It is recommended that ldda is multiple of 16. Otherwise performance would be deteriorated as the memory accesses would not be fully coalescent. |
[in] | dx | DOUBLE PRECISION array of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | DOUBLE PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | dy | DOUBLE PRECISION array of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
[in] | dwork | (workspace) DOUBLE PRECISION array on the GPU, dimension (MAX(1, LWORK)), |
[in] | lwork | INTEGER. The dimension of the array DWORK. LWORK >= LDDA * ceil( N / NB_X ), where NB_X = 64. |
[in] | queue | magma_queue_t. Queue to execute in. |
MAGMA implements dsymv through two steps: 1) perform the multiplication in each thread block and put the intermediate value in dwork. 2) sum the intermediate values and store the final result in y.
magmablas_dsymv_work requires users to provide a workspace, while magmablas_dsymv is a wrapper routine allocating the workspace inside the routine and provides the same interface as cublas.
If users need to call dsymv frequently, we suggest using magmablas_dsymv_work instead of magmablas_dsymv, as the overhead to allocate and free device memory in magmablas_dsymv would hurt performance. Our tests show that this penalty is about 10 Gflop/s when the matrix size is around 10000.
magma_int_t magmablas_dsymv | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
double | alpha, | ||
magmaDouble_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaDouble_const_ptr | dx, | ||
magma_int_t | incx, | ||
double | beta, | ||
magmaDouble_ptr | dy, | ||
magma_int_t | incy, | ||
magma_queue_t | queue ) |
magmablas_dsymv performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n symmetric matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | DOUBLE PRECISION. On entry, ALPHA specifies the scalar alpha. |
[in] | dA | DOUBLE PRECISION array of DIMENSION ( LDDA, n ). Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the symmetric matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the symmetric matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n ). It is recommended that ldda is multiple of 16. Otherwise performance would be deteriorated as the memory accesses would not be fully coalescent. |
[in] | dx | DOUBLE PRECISION array of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | DOUBLE PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | dy | DOUBLE PRECISION array of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
[in] | queue | magma_queue_t Queue to execute in. |
magma_int_t magmablas_dsymv_mgpu | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
double | alpha, | ||
magmaDouble_const_ptr const | d_lA[], | ||
magma_int_t | ldda, | ||
magma_int_t | offset, | ||
double const * | x, | ||
magma_int_t | incx, | ||
double | beta, | ||
double * | y, | ||
magma_int_t | incy, | ||
double * | hwork, | ||
magma_int_t | lhwork, | ||
magmaDouble_ptr | dwork[], | ||
magma_int_t | ldwork, | ||
magma_int_t | ngpu, | ||
magma_int_t | nb, | ||
magma_queue_t | queues[] ) |
magmablas_dsymv_mgpu performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n symmetric matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | DOUBLE PRECISION. On entry, ALPHA specifies the scalar alpha. |
[in] | d_lA | Array of pointers, dimension (ngpu), to block-column distributed matrix A, with block size nb. d_lA[dev] is a DOUBLE PRECISION array on GPU dev, of dimension (LDDA, nlocal), where { floor(n/nb/ngpu)*nb + nb if dev < floor(n/nb) % ngpu, nlocal = { floor(n/nb/ngpu)*nb + nnb if dev == floor(n/nb) % ngpu, { floor(n/nb/ngpu)*nb otherwise. Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the symmetric matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the symmetric matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | offset | INTEGER. Row & column offset to start of matrix A within the distributed d_lA structure. Note that N is the size of this multiply, excluding the offset, so the size of the original parent matrix is N+offset. Also, x and y do not have an offset. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n + offset ). It is recommended that ldda is multiple of 16. Otherwise performance would be deteriorated as the memory accesses would not be fully coalescent. |
[in] | x | DOUBLE PRECISION array on the CPU (not the GPU), of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | DOUBLE PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | y | DOUBLE PRECISION array on the CPU (not the GPU), of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
hwork | (workspace) DOUBLE PRECISION array on the CPU, of dimension (lhwork). | |
[in] | lhwork | INTEGER. The dimension of the array hwork. lhwork >= ngpu*nb. |
dwork | (workspaces) Array of pointers, dimension (ngpu), to workspace on each GPU. dwork[dev] is a DOUBLE PRECISION array on GPU dev, of dimension (ldwork). | |
[in] | ldwork | INTEGER. The dimension of each array dwork[dev]. ldwork >= ldda*( ceil((n + offset % nb) / nb) + 1 ). |
[in] | ngpu | INTEGER. The number of GPUs to use. |
[in] | nb | INTEGER. The block size used for distributing d_lA. Must be 64. |
[in] | queues | magma_queue_t array of dimension (ngpu). queues[dev] is an execution queue on GPU dev. |
magma_int_t magmablas_dsymv_mgpu_sync | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
double | alpha, | ||
magmaDouble_const_ptr const | d_lA[], | ||
magma_int_t | ldda, | ||
magma_int_t | offset, | ||
double const * | x, | ||
magma_int_t | incx, | ||
double | beta, | ||
double * | y, | ||
magma_int_t | incy, | ||
double * | hwork, | ||
magma_int_t | lhwork, | ||
magmaDouble_ptr | dwork[], | ||
magma_int_t | ldwork, | ||
magma_int_t | ngpu, | ||
magma_int_t | nb, | ||
magma_queue_t | queues[] ) |
Synchronizes and accumulates final dsymv result.
For convenience, the parameters are identical to magmablas_dsymv_mgpu (though some are unused here).
magma_int_t magmablas_ssymv_work | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
float | alpha, | ||
magmaFloat_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaFloat_const_ptr | dx, | ||
magma_int_t | incx, | ||
float | beta, | ||
magmaFloat_ptr | dy, | ||
magma_int_t | incy, | ||
magmaFloat_ptr | dwork, | ||
magma_int_t | lwork, | ||
magma_queue_t | queue ) |
magmablas_ssymv_work performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n symmetric matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | REAL. On entry, ALPHA specifies the scalar alpha. |
[in] | dA | REAL array of DIMENSION ( LDDA, n ). Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the symmetric matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the symmetric matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n ). It is recommended that ldda be a multiple of 16; otherwise performance may deteriorate, as the memory accesses would not be fully coalesced. |
[in] | dx | REAL array of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | REAL. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | dy | REAL array of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
[in] | dwork | (workspace) REAL array on the GPU, dimension (MAX(1, LWORK)). |
[in] | lwork | INTEGER. The dimension of the array DWORK. LWORK >= LDDA * ceil( N / NB_X ), where NB_X = 64. |
[in] | queue | magma_queue_t. Queue to execute in. |
MAGMA implements ssymv through two steps: 1) perform the multiplication in each thread block and put the intermediate value in dwork. 2) sum the intermediate values and store the final result in y.
magmablas_ssymv_work requires users to provide a workspace, while magmablas_ssymv is a wrapper routine that allocates the workspace inside the routine and provides the same interface as cuBLAS.
If users need to call ssymv frequently, we suggest using magmablas_ssymv_work instead of magmablas_ssymv, as the overhead of allocating and freeing device memory in magmablas_ssymv would hurt performance. Our tests show that this penalty is about 10 Gflop/s when the matrix size is around 10000.
magma_int_t magmablas_ssymv | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
float | alpha, | ||
magmaFloat_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaFloat_const_ptr | dx, | ||
magma_int_t | incx, | ||
float | beta, | ||
magmaFloat_ptr | dy, | ||
magma_int_t | incy, | ||
magma_queue_t | queue ) |
magmablas_ssymv performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n symmetric matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | REAL. On entry, ALPHA specifies the scalar alpha. |
[in] | dA | REAL array of DIMENSION ( LDDA, n ). Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the symmetric matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the symmetric matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n ). It is recommended that ldda be a multiple of 16; otherwise performance may deteriorate, as the memory accesses would not be fully coalesced. |
[in] | dx | REAL array of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | REAL. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | dy | REAL array of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
[in] | queue | magma_queue_t Queue to execute in. |
magma_int_t magmablas_ssymv_mgpu | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
float | alpha, | ||
magmaFloat_const_ptr const | d_lA[], | ||
magma_int_t | ldda, | ||
magma_int_t | offset, | ||
float const * | x, | ||
magma_int_t | incx, | ||
float | beta, | ||
float * | y, | ||
magma_int_t | incy, | ||
float * | hwork, | ||
magma_int_t | lhwork, | ||
magmaFloat_ptr | dwork[], | ||
magma_int_t | ldwork, | ||
magma_int_t | ngpu, | ||
magma_int_t | nb, | ||
magma_queue_t | queues[] ) |
magmablas_ssymv_mgpu performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n symmetric matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | REAL. On entry, ALPHA specifies the scalar alpha. |
[in] | d_lA | Array of pointers, dimension (ngpu), to block-column distributed matrix A, with block size nb. d_lA[dev] is a REAL array on GPU dev, of dimension (LDDA, nlocal), where nlocal = floor(n/nb/ngpu)*nb + nb if dev < floor(n/nb) % ngpu; nlocal = floor(n/nb/ngpu)*nb + nnb if dev == floor(n/nb) % ngpu; nlocal = floor(n/nb/ngpu)*nb otherwise. Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the symmetric matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the symmetric matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | offset | INTEGER. Row & column offset to start of matrix A within the distributed d_lA structure. Note that N is the size of this multiply, excluding the offset, so the size of the original parent matrix is N+offset. Also, x and y do not have an offset. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n + offset ). It is recommended that ldda be a multiple of 16; otherwise performance may deteriorate, as the memory accesses would not be fully coalesced. |
[in] | x | REAL array on the CPU (not the GPU), of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | REAL. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | y | REAL array on the CPU (not the GPU), of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
hwork | (workspace) REAL array on the CPU, of dimension (lhwork). | |
[in] | lhwork | INTEGER. The dimension of the array hwork. lhwork >= ngpu*nb. |
dwork | (workspaces) Array of pointers, dimension (ngpu), to workspace on each GPU. dwork[dev] is a REAL array on GPU dev, of dimension (ldwork). | |
[in] | ldwork | INTEGER. The dimension of each array dwork[dev]. ldwork >= ldda*( ceil((n + offset % nb) / nb) + 1 ). |
[in] | ngpu | INTEGER. The number of GPUs to use. |
[in] | nb | INTEGER. The block size used for distributing d_lA. Must be 64. |
[in] | queues | magma_queue_t array of dimension (ngpu). queues[dev] is an execution queue on GPU dev. |
magma_int_t magmablas_ssymv_mgpu_sync | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
float | alpha, | ||
magmaFloat_const_ptr const | d_lA[], | ||
magma_int_t | ldda, | ||
magma_int_t | offset, | ||
float const * | x, | ||
magma_int_t | incx, | ||
float | beta, | ||
float * | y, | ||
magma_int_t | incy, | ||
float * | hwork, | ||
magma_int_t | lhwork, | ||
magmaFloat_ptr | dwork[], | ||
magma_int_t | ldwork, | ||
magma_int_t | ngpu, | ||
magma_int_t | nb, | ||
magma_queue_t | queues[] ) |
Synchronizes and accumulates the final ssymv result.
For convenience, the parameters are identical to magmablas_ssymv_mgpu (though some are unused here).
magma_int_t magmablas_zhemv_work | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
magmaDoubleComplex | alpha, | ||
magmaDoubleComplex_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaDoubleComplex_const_ptr | dx, | ||
magma_int_t | incx, | ||
magmaDoubleComplex | beta, | ||
magmaDoubleComplex_ptr | dy, | ||
magma_int_t | incy, | ||
magmaDoubleComplex_ptr | dwork, | ||
magma_int_t | lwork, | ||
magma_queue_t | queue ) |
magmablas_zhemv_work performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n Hermitian matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | COMPLEX_16. On entry, ALPHA specifies the scalar alpha. |
[in] | dA | COMPLEX_16 array of DIMENSION ( LDDA, n ). Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the Hermitian matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the Hermitian matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n ). It is recommended that ldda be a multiple of 16; otherwise performance may deteriorate, as the memory accesses would not be fully coalesced. |
[in] | dx | COMPLEX_16 array of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | COMPLEX_16. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | dy | COMPLEX_16 array of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
[in] | dwork | (workspace) COMPLEX_16 array on the GPU, dimension (MAX(1, LWORK)). |
[in] | lwork | INTEGER. The dimension of the array DWORK. LWORK >= LDDA * ceil( N / NB_X ), where NB_X = 64. |
[in] | queue | magma_queue_t. Queue to execute in. |
MAGMA implements zhemv through two steps: 1) perform the multiplication in each thread block and put the intermediate value in dwork. 2) sum the intermediate values and store the final result in y.
magmablas_zhemv_work requires users to provide a workspace, while magmablas_zhemv is a wrapper routine that allocates the workspace inside the routine and provides the same interface as cuBLAS.
If users need to call zhemv frequently, we suggest using magmablas_zhemv_work instead of magmablas_zhemv, as the overhead of allocating and freeing device memory in magmablas_zhemv would hurt performance. Our tests show that this penalty is about 10 Gflop/s when the matrix size is around 10000.
magma_int_t magmablas_zhemv | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
magmaDoubleComplex | alpha, | ||
magmaDoubleComplex_const_ptr | dA, | ||
magma_int_t | ldda, | ||
magmaDoubleComplex_const_ptr | dx, | ||
magma_int_t | incx, | ||
magmaDoubleComplex | beta, | ||
magmaDoubleComplex_ptr | dy, | ||
magma_int_t | incy, | ||
magma_queue_t | queue ) |
magmablas_zhemv performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n Hermitian matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | COMPLEX_16. On entry, ALPHA specifies the scalar alpha. |
[in] | dA | COMPLEX_16 array of DIMENSION ( LDDA, n ). Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the Hermitian matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the Hermitian matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n ). It is recommended that ldda be a multiple of 16; otherwise performance may deteriorate, as the memory accesses would not be fully coalesced. |
[in] | dx | COMPLEX_16 array of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | COMPLEX_16. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | dy | COMPLEX_16 array of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
[in] | queue | magma_queue_t Queue to execute in. |
magma_int_t magmablas_zhemv_mgpu | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
magmaDoubleComplex | alpha, | ||
magmaDoubleComplex_const_ptr const | d_lA[], | ||
magma_int_t | ldda, | ||
magma_int_t | offset, | ||
magmaDoubleComplex const * | x, | ||
magma_int_t | incx, | ||
magmaDoubleComplex | beta, | ||
magmaDoubleComplex * | y, | ||
magma_int_t | incy, | ||
magmaDoubleComplex * | hwork, | ||
magma_int_t | lhwork, | ||
magmaDoubleComplex_ptr | dwork[], | ||
magma_int_t | ldwork, | ||
magma_int_t | ngpu, | ||
magma_int_t | nb, | ||
magma_queue_t | queues[] ) |
magmablas_zhemv_mgpu performs the matrix-vector operation:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and A is an n by n Hermitian matrix.
[in] | uplo | magma_uplo_t. On entry, UPLO specifies whether the upper or lower triangular part of the array A is to be referenced as follows:
|
[in] | n | INTEGER. On entry, N specifies the order of the matrix A. N must be at least zero. |
[in] | alpha | COMPLEX_16. On entry, ALPHA specifies the scalar alpha. |
[in] | d_lA | Array of pointers, dimension (ngpu), to block-column distributed matrix A, with block size nb. d_lA[dev] is a COMPLEX_16 array on GPU dev, of dimension (LDDA, nlocal), where nlocal = floor(n/nb/ngpu)*nb + nb if dev < floor(n/nb) % ngpu; nlocal = floor(n/nb/ngpu)*nb + nnb if dev == floor(n/nb) % ngpu; nlocal = floor(n/nb/ngpu)*nb otherwise. Before entry with UPLO = MagmaUpper, the leading n by n upper triangular part of the array A must contain the upper triangular part of the Hermitian matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = MagmaLower, the leading n by n lower triangular part of the array A must contain the lower triangular part of the Hermitian matrix and the strictly upper triangular part of A is not referenced. Note that the imaginary parts of the diagonal elements need not be set and are assumed to be zero. |
[in] | offset | INTEGER. Row & column offset to start of matrix A within the distributed d_lA structure. Note that N is the size of this multiply, excluding the offset, so the size of the original parent matrix is N+offset. Also, x and y do not have an offset. |
[in] | ldda | INTEGER. On entry, LDDA specifies the first dimension of A as declared in the calling (sub) program. LDDA must be at least max( 1, n + offset ). It is recommended that ldda be a multiple of 16; otherwise performance may deteriorate, as the memory accesses would not be fully coalesced. |
[in] | x | COMPLEX_16 array on the CPU (not the GPU), of dimension at least ( 1 + ( n - 1 )*abs( INCX ) ). Before entry, the incremented array X must contain the n element vector x. |
[in] | incx | INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. |
[in] | beta | COMPLEX_16. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. |
[in,out] | y | COMPLEX_16 array on the CPU (not the GPU), of dimension at least ( 1 + ( n - 1 )*abs( INCY ) ). Before entry, the incremented array Y must contain the n element vector y. On exit, Y is overwritten by the updated vector y. |
[in] | incy | INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. |
hwork | (workspace) COMPLEX_16 array on the CPU, of dimension (lhwork). | |
[in] | lhwork | INTEGER. The dimension of the array hwork. lhwork >= ngpu*nb. |
dwork | (workspaces) Array of pointers, dimension (ngpu), to workspace on each GPU. dwork[dev] is a COMPLEX_16 array on GPU dev, of dimension (ldwork). | |
[in] | ldwork | INTEGER. The dimension of each array dwork[dev]. ldwork >= ldda*( ceil((n + offset % nb) / nb) + 1 ). |
[in] | ngpu | INTEGER. The number of GPUs to use. |
[in] | nb | INTEGER. The block size used for distributing d_lA. Must be 64. |
[in] | queues | magma_queue_t array of dimension (ngpu). queues[dev] is an execution queue on GPU dev. |
magma_int_t magmablas_zhemv_mgpu_sync | ( | magma_uplo_t | uplo, |
magma_int_t | n, | ||
magmaDoubleComplex | alpha, | ||
magmaDoubleComplex_const_ptr const | d_lA[], | ||
magma_int_t | ldda, | ||
magma_int_t | offset, | ||
magmaDoubleComplex const * | x, | ||
magma_int_t | incx, | ||
magmaDoubleComplex | beta, | ||
magmaDoubleComplex * | y, | ||
magma_int_t | incy, | ||
magmaDoubleComplex * | hwork, | ||
magma_int_t | lhwork, | ||
magmaDoubleComplex_ptr | dwork[], | ||
magma_int_t | ldwork, | ||
magma_int_t | ngpu, | ||
magma_int_t | nb, | ||
magma_queue_t | queues[] ) |
Synchronizes and accumulates the final zhemv result.
For convenience, the parameters are identical to magmablas_zhemv_mgpu (though some are unused here).