So I have to factorize first (with sgetrf_gpu).
I think everything runs fine until the call to
Code: Select all
cublasStrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb );
Code: Select all
/*
 * Solve the dense linear system A * x = b on the GPU using MAGMA's LU
 * factorization (magma_sgetrf_gpu) followed by the triangular solves
 * (magma_sgetrs_gpu).
 *
 * Parameters:
 *   h_A   host pointer to the size x size coefficient matrix, uploaded with
 *         leading dimension `size` (column-major is assumed, as is the
 *         LAPACK/MAGMA convention -- confirm against the caller). It is only
 *         read; the factorization overwrites the device copy, not h_A.
 *   size  order of the matrix and length of the right-hand side.
 *   h_B   host pointer to the right-hand side vector (size elements), only
 *         read.
 *
 * Returns:
 *   A newly allocated host vector of `size` floats holding the solution.
 *   The caller owns it and must release it with free()/TESTING_FREE.
 *   Returns NULL if the factorization or the solve reports a non-zero
 *   `info` (illegal argument or exactly singular factor).
 */
float* LU(float* h_A, int size, float* h_B){
    TESTING_CUDA_INIT();

    float *h_X = NULL;
    float *d_A, *d_B;
    magma_int_t *ipiv;
    /* Matrix size */
    magma_int_t M = size, N = size, lda = size, ldda = size, lddb = size;
    magma_int_t info = 0, min_mn;

    min_mn = min(M, N);

    /* Allocate only the buffers this routine owns. h_A and h_B belong to the
       caller and must NOT be re-allocated here, otherwise the input data is
       replaced by uninitialised memory. */
    TESTING_MALLOC( ipiv, magma_int_t, min_mn );
    TESTING_MALLOC( h_X, float, lda*1 );
    TESTING_DEVALLOC( d_A, float, ldda*N );
    TESTING_DEVALLOC( d_B, float, lddb*1 );

    printf("\n\n M N GPU GFlop/s\n");
    printf("==========================\n");

    /* ====================================================================
       Performs operation using MAGMA
       =================================================================== */
    /* Upload the caller's matrix and right-hand side to the device. */
    cublasSetMatrix( M, N, sizeof(float), h_A, lda, d_A, ldda );
    cublasSetMatrix( N, 1, sizeof(float), h_B, lda, d_B, lddb );

    magma_sgetrf_gpu( M, N, d_A, ldda, ipiv, &info );
    if (info != 0) {
        fprintf(stderr, "magma_sgetrf_gpu returned error %d\n", (int) info);
    } else {
        /* The right-hand side must be the DEVICE buffer d_B with its own
           leading dimension; handing the host pointer to the GPU solver is
           what makes the internal cublasStrsm call fail. */
        magma_sgetrs_gpu( 'N', N, 1, d_A, ldda, ipiv, d_B, lddb, &info );
        if (info != 0) {
            fprintf(stderr, "magma_sgetrs_gpu returned error %d\n", (int) info);
        } else {
            /* Download the solution, which overwrote d_B. */
            cublasGetMatrix( N, 1, sizeof(float), d_B, lddb, h_X, lda );
        }
    }

    /* Release everything that is not handed back to the caller. */
    TESTING_DEVFREE( d_A );
    TESTING_DEVFREE( d_B );
    TESTING_FREE( ipiv );
    if (info != 0) {
        TESTING_FREE( h_X );
        h_X = NULL;
    }

    /* Shutdown */
    TESTING_CUDA_FINALIZE();
    return h_X;
}