I am happy to hear you like the library.
We are preparing MAGMA version 0.2 to be released by November 14, along with the sources. The library itself is "high" level, in the sense we try to avoid directly coding in CUDA, just using CUDA BLAS. There are particular BLAS cases (e.g. gemm on rectangular matrices, triangular solvers etc) that are of importance for MAGMA's performance and we will include those in a MAGMA BLAS library (written in CUDA). MAGMA BLAS can be seen as a complement to CUBLAS for those cases of particular interest for MAGMA.
Below are two example routines - one MAGMA and one MAGMA BLAS.
Code: Select all
/*
-- MAGMA (version 0.1) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
June 2009
*/
#include "cuda_runtime_api.h"
#include "cublas.h"
#include "magma.h"
#include <stdio.h>
int
magma_dpotrf(char *uplo, int *n, double *a, int *lda, double *work, int *info)
{
/* -- MAGMA (version 0.1) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
June 2009
Purpose
=======
DPOTRF computes the Cholesky factorization of a real symmetric
positive definite matrix A.
The factorization has the form
A = U**T * U, if UPLO = 'U', or
A = L * L**T, if UPLO = 'L',
where U is an upper triangular matrix and L is lower triangular.
This is the block version of the algorithm, calling Level 3 BLAS.
Arguments
=========
UPLO (input) CHARACTER*1
= 'U': Upper triangle of A is stored;
= 'L': Lower triangle of A is stored.
N (input) INTEGER
The order of the matrix A. N >= 0.
A (input/output) DOUBLE array, dimension (LDA,N)
On entry, the symmetric matrix A. If UPLO = 'U', the leading
N-by-N upper triangular part of A contains the upper
triangular part of the matrix A, and the strictly lower
triangular part of A is not referenced. If UPLO = 'L', the
leading N-by-N lower triangular part of A contains the lower
triangular part of the matrix A, and the strictly upper
triangular part of A is not referenced.
On exit, if INFO = 0, the factor U or L from the Cholesky
factorization A = U**T*U or A = L*L**T.
Higher performance is achieved if A is in pinned memory, e.g.
allocated using cudaMallocHost.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
WORK (workspace) DOUBLE array on the GPU, dimension (N, N)
(size to be reduced in upcoming versions).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
> 0: if INFO = i, the leading minor of order i is not
positive definite, and the factorization could not be
completed.
=====================================================================
Test the input parameters.
Parameter adjustments */
#define a_ref(a_1,a_2) (a+(a_2)*a_dim1 + a_1)
#define da_ref(a_1,a_2) (work+(a_2)*ldda + a_1)
#define min(a,b) (((a)<(b))?(a):(b))
#define max(a,b) (((a)>(b))?(a):(b))
/* Table of constant values */
static int c__1 = 1;
static int c_n1 = -1;
static double c_b13 = -1.f;
static double c_b14 = 1.f;
/* System generated locals */
int a_dim1, a_offset, i__1, i__2, i__3, i__4, ldda;
/* Local variables */
static int j;
long int upper = lsame_(uplo, "U");
*info = 0;
if (! upper && ! lsame_(uplo, "L")) {
*info = -1;
} else if (*n < 0) {
*info = -2;
} else if (*lda < max(1,*n)) {
*info = -4;
}
if (*info != 0)
return 0;
static int jb;
static cudaStream_t stream[2];
cudaStreamCreate(&stream[0]);
cudaStreamCreate(&stream[1]);
cublasStatus status;
a_dim1 = *lda;
ldda = *n;
a_offset = 1 + a_dim1 * 1;
a -= a_offset;
work -= (1 + (*n));
int nb = magma_get_dpotrf_nb(*n);
if (nb <= 1 || nb >= *n) {
dpotrf_(uplo, n, a_ref(1, 1), lda, info);
} else {
/* Use blocked code. */
if (upper) {
/* Compute the Cholesky factorization A = U'*U. */
i__1 = *n;
i__2 = nb;
for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
/* Update and factorize the current diagonal block and test
for non-positive-definiteness. Computing MIN */
i__3 = nb, i__4 = *n - j + 1;
jb = min(i__3,i__4);
i__3 = j - 1;
cublasSetMatrix(jb, i__4, sizeof(double),
a_ref(j,j), *lda, da_ref(j,j), ldda);
cublasDsyrk('u', 't', jb, i__3, c_b13, da_ref(1,j),
ldda, c_b14, da_ref(j, j), ldda);
cudaMemcpy2DAsync( a_ref(1,j), (*lda)*sizeof(double),
da_ref(1,j), ldda *sizeof(double),
sizeof(double)*(j+jb-1), jb,
cudaMemcpyDeviceToHost,stream[1]);
if (j + jb <= *n) {
i__3 = *n - j - jb + 1;
i__4 = j - 1;
cublasDgemm('T', 'N', jb, i__3, i__4,
c_b13, da_ref(1, j), ldda, da_ref(1, j + jb), ldda,
c_b14, da_ref(j, j + jb), ldda);
}
cudaStreamSynchronize(stream[1]);
dpotrf_("Upper", &jb, a_ref(j,j), lda, info);
if (*info != 0) {
*info = *info + j - 1;
break;
}
cudaMemcpy2DAsync(da_ref(j,j), ldda * sizeof(double),
a_ref( j,j), (*lda)* sizeof(double),
sizeof(double)*jb, jb,
cudaMemcpyHostToDevice,stream[0]);
if (j + jb <= *n)
cublasDtrsm('L', 'U', 'T', 'N', jb, i__3,
c_b14, da_ref(j,j), ldda, da_ref(j, j+jb), ldda);
}
} else {
//=========================================================
// Compute the Cholesky factorization A = L*L'.
i__2 = *n;
i__1 = nb;
for (j = 1; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
// Update and factorize the current diagonal block and test
// for non-positive-definiteness. Computing MIN
i__3 = nb, i__4 = *n - j + 1;
jb = min(i__3,i__4);
i__3 = j - 1;
cublasSetMatrix(i__4, jb, sizeof(double),
a_ref(j,j), *lda, da_ref(j,j), ldda);
cublasDsyrk('l', 'n', jb, i__3, c_b13, da_ref(j,1), ldda,
c_b14, da_ref(j, j), ldda);
cudaMemcpy2DAsync( a_ref(j,1), (*lda)*sizeof(double),
da_ref(j,1), ldda *sizeof(double),
sizeof(double)*jb, j+jb-1,
cudaMemcpyDeviceToHost,stream[1]);
if (j + jb <= *n) {
i__3 = *n - j - jb + 1;
i__4 = j - 1;
cublasDgemm('N', 'T', i__3, jb, i__4,
c_b13, da_ref(j + jb, 1), ldda, da_ref(j, 1), ldda,
c_b14, da_ref(j + jb, j), ldda);
}
cudaStreamSynchronize(stream[1]);
dpotrf_("Lower", &jb, a_ref(j, j), lda, info);
if (*info != 0){
*info = *info + j - 1;
break;
}
cudaMemcpy2DAsync(da_ref(j,j), ldda * sizeof(double),
a_ref( j,j), (*lda)* sizeof(double),
sizeof(double)*jb, jb,
cudaMemcpyHostToDevice,stream[0]);
if (j + jb <= *n)
cublasDtrsm('R', 'L', 'T', 'N', i__3, jb, c_b14,
da_ref(j, j), ldda, da_ref(j + jb, j), ldda);
}
}
}
return 0;
/* End of MAGMA_DPOTRF */
} /* magma_dpotrf */
#undef a_ref
#undef da_ref
#undef min
#undef max
Code: Select all
/*
-- MAGMA (version 0.2) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
June 2009
*/
#include "cublas.h"
#include "magma.h"
#define num_threads 64
#define dgemv_bs 64
__global__ void
dgemv_kernel(int n, int m, int n1, double* A, int lda, double *x, double *y)
{
int ind = blockIdx.x*num_threads + threadIdx.x;
A += ind;
x += threadIdx.x;
double res = 0.f;
__shared__ double buff[dgemv_bs];
for(int i=0; i<n1; i += dgemv_bs ){
__syncthreads();
buff[threadIdx.x] = x[i];
__syncthreads();
#pragma unroll
for(int j=0; j < dgemv_bs ; j++){
res+=A[0]*buff[j];
A+=lda;
}
}
__syncthreads();
if (m>n1){
buff[threadIdx.x] = x[n1];
__syncthreads();
for(int j=0; j<(m-n1); j++){
res += A[0]*buff[j];
A+=lda;
}
}
if (ind<n)
y[ind] = res;
}
extern "C" void
magmablas_dgemv(int n, int m, double *A, int lda, double *x, double *z)
{
/* -- MAGMA (version 0.2) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
June 2009
Purpose
=======
This routine computes z = A x on the GPU.
N - (input) INTEGER.
On entry, N specifies the number of rows of the matrix A.
M - (input) INTEGER.
On entry, M specifies the number of columns of the matrix A
A - (input) DOUBLE PRECISION array of dimension ( LDA, m ) on the GPU.
LDA - (input) INTEGER.
LDA specifies the leading dimension of A.
X - (input) DOUBLE PRECISION array of dimension m.
Z - (output) DOUBLE PRECISION array of dimension m.
On exit Z = A X.
===================================================================== */
int blocks;
if (n % num_threads==0)
blocks = n/num_threads;
else
blocks = n/num_threads + 1;
dim3 grid(blocks, 1, 1);
dim3 threads(num_threads, 1, 1);
dgemv_kernel<<<grid, threads>>>(n, m, (m / dgemv_bs)*dgemv_bs,
A, lda, x, z);
}
#undef num_threads
#undef dgemv_bs