Hello Mark,
Here is my environment configuration:
Code: Select all
% MAGMA 2.0.0 compiled for CUDA capability >= 2.0, 64-bit magma_int_t, 64-bit pointer.
% CUDA runtime 7050, driver 8000. OpenMP threads 12.
% device 0: Quadro K6000, 901.5 MHz clock, 12287.8 MB memory, capability 3.5
% device 1: Quadro K6000, 901.5 MHz clock, 12287.8 MB memory, capability 3.5
% device 2: Quadro K6000, 901.5 MHz clock, 12287.8 MB memory, capability 3.5
I have managed to reproduce the problem using the testing_zgetrf tester application.
It requires small modifications to the main function. Below is the modified main function (modifications are marked with <---):
Code: Select all
/*
 * Tester entry point for the zgetrf (complex double LU factorization) test,
 * as modified by the poster to reproduce a crash.
 *
 * For every test size (opts.ntest) and every iteration (opts.niter) it:
 *   1. optionally runs lapackf77_zgetrf on the CPU (when opts.lapack is set),
 *   2. runs the MAGMA routine selected by opts.version (1, 2 or 3),
 *   3. prints the timing/performance line and, depending on opts.check,
 *      a residual or LU error compared against tol.
 *
 * Returns status, which is incremented once for every check whose error is
 * not below tol.
 *
 * The lines tagged "<--- MODIFIED LINE" are the poster's changes relative to
 * the stock tester: the buffer allocations, the matrix initialization before
 * the MAGMA call, and the matching frees are commented out, and both buffers
 * are set to nullptr instead.
 */
int main( int argc, char** argv)
{
TESTING_INIT();
real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
double error;
magmaDoubleComplex *h_A;
magma_int_t *ipiv;
magma_int_t M, N, n2, lda, info, min_mn;
magma_int_t status = 0;
magma_opts opts;
opts.parse_opts( argc, argv );
// Pass/fail threshold: the user-supplied tolerance scaled by dlamch("E").
double tol = opts.tolerance * lapackf77_dlamch("E");
printf("%% ngpu %d, version %d\n", (int) opts.ngpu, (int) opts.version );
// The last column header depends on which check is requested:
// opts.check == 2 reports a solve residual, anything else the LU error.
if ( opts.check == 2 ) {
printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n");
}
else {
printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |PA-LU|/(N*|A|)\n");
}
printf("%%========================================================================\n");
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
// Problem dimensions for this test; the matrix is stored with leading
// dimension M, so it occupies lda*N elements.
M = opts.msize[itest];
N = opts.nsize[itest];
min_mn = min(M, N);
lda = M;
n2 = lda*N;
gflops = FLOPS_ZGETRF( M, N ) / 1e9;
// Reproduction change: the two allocations below are disabled and the
// pointers are nulled instead. n2 is therefore computed but not used.
// NOTE(review): every later use of ipiv / h_A in this function receives a
// null pointer.
// TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); <------------- MODIFIED LINE
// TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, n2 ); <------------- MODIFIED LINE
ipiv = nullptr; <------------- MODIFIED LINE
h_A = nullptr; <------------- MODIFIED LINE
/* =====================================================================
Performs operation using LAPACK
=================================================================== */
// CPU reference run; this is the only place init_matrix is still called.
if ( opts.lapack ) {
init_matrix( opts, M, N, h_A, lda );
cpu_time = magma_wtime();
lapackf77_zgetrf( &M, &N, h_A, &lda, ipiv, &info );
cpu_time = magma_wtime() - cpu_time;
cpu_perf = gflops / cpu_time;
if (info != 0) {
printf("lapackf77_zgetrf returned error %d: %s.\n",
(int) info, magma_strerror( info ));
}
}
/* ====================================================================
Performs operation using MAGMA
=================================================================== */
// Reproduction change: the matrix is not (re)initialized before the MAGMA run.
// init_matrix( opts, M, N, h_A, lda ); <------------- MODIFIED LINE
if ( opts.version == 2 || opts.version == 3 ) {
// no pivoting versions, so set ipiv to identity
// NOTE(review): ipiv is nullptr at this point, so this loop writes through
// a null pointer whenever min_mn > 0.
for (magma_int_t i=0; i < min_mn; ++i ) {
ipiv[i] = i+1;
}
}
// Time only the factorization call chosen by opts.version.
// NOTE(review): for any version other than 1, 2 or 3 no routine is called
// here, so info is not written by this section — confirm parse_opts
// restricts the value.
gpu_time = magma_wtime();
if ( opts.version == 1 ) {
magma_zgetrf( M, N, h_A, lda, ipiv, &info );
}
else if ( opts.version == 2 ) {
magma_zgetrf_nopiv( M, N, h_A, lda, &info );
}
else if ( opts.version == 3 ) {
magma_zgetf2_nopiv( M, N, h_A, lda, &info );
}
gpu_time = magma_wtime() - gpu_time;
gpu_perf = gflops / gpu_time;
// The message names magma_zgetrf regardless of which version actually ran.
if (info != 0) {
printf("magma_zgetrf returned error %d: %s.\n",
(int) info, magma_strerror( info ));
}
/* =====================================================================
Check the factorization
=================================================================== */
// Performance columns; CPU figures are printed only if the LAPACK run happened.
if ( opts.lapack ) {
printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)",
(int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
}
else {
printf("%5d %5d --- ( --- ) %7.2f (%7.2f)",
(int) M, (int) N, gpu_perf, gpu_time );
}
// Accuracy column: check == 2 uses get_residual, any other non-zero check
// uses get_LU_error, and check == 0 prints a placeholder.
// status counts every result that is not strictly below tol.
if ( opts.check == 2 ) {
error = get_residual( opts, M, N, h_A, lda, ipiv );
printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed"));
status += ! (error < tol);
}
else if ( opts.check ) {
error = get_LU_error( opts, M, N, h_A, lda, ipiv );
printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed"));
status += ! (error < tol);
}
else {
printf(" --- \n");
}
// Reproduction change: nothing was allocated above, so nothing is freed.
// TESTING_FREE_CPU( ipiv ); <------------- MODIFIED LINE
// TESTING_FREE_PIN( h_A ); <------------- MODIFIED LINE
fflush( stdout );
}
// Blank line between size groups when several iterations were run per size.
if ( opts.niter > 1 ) {
printf( "\n" );
}
}
opts.cleanup();
TESTING_FINALIZE();
return status;
}
When I run the modified application in debug mode with the following arguments:
the application crashes on the call to
magma_zsetmatrix_async in the function
magmablas_zsetmatrix_transpose_mgpu. It seems that only 2 queues are created/allocated instead of 3 (the number of GPUs).
From your post I noticed that you use a 2-GPU environment, and I have managed to reproduce the problem in such an environment too. If you run the application with the following arguments:
the application will crash in the
magma_zgetrf2_mgpu function.
I think both cases are similar. Could you propose a workaround or a fix for this problem?
Konstantin