Hi,
I'm running the MAGMA test suite on multiple new CentOS 6 machines, each with two Nvidia Tesla GPUs, and I get several test failures (44) and errors (63). I ran with a tolerance of 100. I'm using MAGMA as an acceptance test for a new purchase and I'm not sure what to do about the errors and failures. From WWW searches, it appears MAGMA is "supposed" to fail on some of the numerical issues. How can I tell which failures are important, and which can be ignored?
Also, I have multiple servers that are identically configured and the numerical results differ between some nodes. Are the MAGMA tests non-deterministic?
Numerical Errors
-
hartwig anzt
- Posts: 90
- Joined: Tue Sep 02, 2014 5:44 pm
Re: Numerical Errors
Can you please be a bit more specific - which tests fail? which routines do you use? Are you talking about dense or sparse solvers? Which test matrices do you use?
Thanks, Hartwig
Thanks, Hartwig
Re: Numerical Errors
Thanks for your reply. Here are the STDOUT results:
./run_tests.py --blas --tol=100
testing_zgemm -l -NN -c ok
testing_zgemm -l -NC -c ok
testing_zgemm -l -CN -c ok
testing_zgemm -l -CC -c ok
testing_zgemv -c ok
testing_zgemv -T -c ok
testing_zgemv -C -c ok
testing_zhemv -L -c ok
testing_zhemv -U -c ok
testing_zherk -L -c ok
testing_zherk -L -C -c ok
testing_zherk -U -c ** 14 tests failed
testing_zherk -U -C -c ** 14 tests failed
testing_zher2k -L -c ** 3 tests failed
testing_zher2k -L -C -c ** 3 tests failed
testing_zher2k -U -c ** 3 tests failed
testing_zher2k -U -C -c ** 3 tests failed
testing_zsymv -L -c ok
testing_zsymv -U -c (disabled: upper not implemented)
testing_ztrmm -SL -L -DN -c ok
testing_ztrmm -SL -L -DU -c ok
testing_ztrmm -SL -L -C -DN -c ok
testing_ztrmm -SL -L -C -DU -c ok
testing_ztrmm -SL -U -DN -c ok
testing_ztrmm -SL -U -DU -c ok
testing_ztrmm -SL -U -C -DN -c ** 1 tests failed
testing_ztrmm -SL -U -C -DU -c ** 1 tests failed
testing_ztrmm -SR -L -DN -c ok
testing_ztrmm -SR -L -DU -c ok
testing_ztrmm -SR -L -C -DN -c ok
testing_ztrmm -SR -L -C -DU -c ok
testing_ztrmm -SR -U -DN -c ok
testing_ztrmm -SR -U -DU -c ok
testing_ztrmm -SR -U -C -DN -c ok
testing_ztrmm -SR -U -C -DU -c ok
testing_ztrmv -L -DN -c ok
testing_ztrmv -L -DU -c ok
testing_ztrmv -L -C -DN -c ok
testing_ztrmv -L -C -DU -c ok
testing_ztrmv -U -DN -c ok
testing_ztrmv -U -DU -c ok
testing_ztrmv -U -C -DN -c ok
testing_ztrmv -U -C -DU -c ok
testing_ztrsm -SL -L -DN -c ok
testing_ztrsm -SL -L -DU -c ok
testing_ztrsm -SL -L -C -DN -c ok
testing_ztrsm -SL -L -C -DU -c ok
testing_ztrsm -SL -U -DN -c ok
testing_ztrsm -SL -U -DU -c ok
testing_ztrsm -SL -U -C -DN -c ok
testing_ztrsm -SL -U -C -DU -c ok
testing_ztrsm -SR -L -DN -c ok
testing_ztrsm -SR -L -DU -c ok
testing_ztrsm -SR -L -C -DN -c ok
testing_ztrsm -SR -L -C -DU -c ok
testing_ztrsm -SR -U -DN -c ok
testing_ztrsm -SR -U -DU -c ok
testing_ztrsm -SR -U -C -DN -c ok
testing_ztrsm -SR -U -C -DU -c ok
testing_ztrsv -L -DN -c ok
testing_ztrsv -L -DU -c ok
testing_ztrsv -L -C -DN -c ok
testing_ztrsv -L -C -DU -c ok
testing_ztrsv -U -DN -c ok
testing_ztrsv -U -DU -c ok
testing_ztrsv -U -C -DN -c ok
testing_ztrsv -U -C -DU -c ok
testing_ztrtri_diag -L -c ok
testing_ztrtri_diag -U -c ok
testing_zhemm_mgpu -L -c ok
testing_zhemm_mgpu -U -c ok
testing_zhemv_mgpu -L -c ** 63 errors
testing_zhemv_mgpu -U -c ** 63 errors
testing_zher2k_mgpu -L -c ok
testing_zher2k_mgpu -U -c ok
testing_blas_z -c (disabled: takes long time; cublas only)
testing_cblas_z -c ** 2 tests failed ** exit with signal 11
****************************************************************************************************
summary
****************************************************************************************************
7642 tests in 75 commands passed
44 tests failed accuracy test
127 errors detected (crashes, CUDA errors, etc.)
routines with failures:
testing_cblas_z -c
testing_zhemv_mgpu -L -c
testing_zhemv_mgpu -U -c
testing_zher2k -L -C -c
testing_zher2k -L -c
testing_zher2k -U -C -c
testing_zher2k -U -c
testing_zherk -U -C -c
testing_zherk -U -c
testing_ztrmm -SL -U -C -DN -c
testing_ztrmm -SL -U -C -DU -c
./run_tests.py --blas --tol=100
testing_zgemm -l -NN -c ok
testing_zgemm -l -NC -c ok
testing_zgemm -l -CN -c ok
testing_zgemm -l -CC -c ok
testing_zgemv -c ok
testing_zgemv -T -c ok
testing_zgemv -C -c ok
testing_zhemv -L -c ok
testing_zhemv -U -c ok
testing_zherk -L -c ok
testing_zherk -L -C -c ok
testing_zherk -U -c ** 14 tests failed
testing_zherk -U -C -c ** 14 tests failed
testing_zher2k -L -c ** 3 tests failed
testing_zher2k -L -C -c ** 3 tests failed
testing_zher2k -U -c ** 3 tests failed
testing_zher2k -U -C -c ** 3 tests failed
testing_zsymv -L -c ok
testing_zsymv -U -c (disabled: upper not implemented)
testing_ztrmm -SL -L -DN -c ok
testing_ztrmm -SL -L -DU -c ok
testing_ztrmm -SL -L -C -DN -c ok
testing_ztrmm -SL -L -C -DU -c ok
testing_ztrmm -SL -U -DN -c ok
testing_ztrmm -SL -U -DU -c ok
testing_ztrmm -SL -U -C -DN -c ** 1 tests failed
testing_ztrmm -SL -U -C -DU -c ** 1 tests failed
testing_ztrmm -SR -L -DN -c ok
testing_ztrmm -SR -L -DU -c ok
testing_ztrmm -SR -L -C -DN -c ok
testing_ztrmm -SR -L -C -DU -c ok
testing_ztrmm -SR -U -DN -c ok
testing_ztrmm -SR -U -DU -c ok
testing_ztrmm -SR -U -C -DN -c ok
testing_ztrmm -SR -U -C -DU -c ok
testing_ztrmv -L -DN -c ok
testing_ztrmv -L -DU -c ok
testing_ztrmv -L -C -DN -c ok
testing_ztrmv -L -C -DU -c ok
testing_ztrmv -U -DN -c ok
testing_ztrmv -U -DU -c ok
testing_ztrmv -U -C -DN -c ok
testing_ztrmv -U -C -DU -c ok
testing_ztrsm -SL -L -DN -c ok
testing_ztrsm -SL -L -DU -c ok
testing_ztrsm -SL -L -C -DN -c ok
testing_ztrsm -SL -L -C -DU -c ok
testing_ztrsm -SL -U -DN -c ok
testing_ztrsm -SL -U -DU -c ok
testing_ztrsm -SL -U -C -DN -c ok
testing_ztrsm -SL -U -C -DU -c ok
testing_ztrsm -SR -L -DN -c ok
testing_ztrsm -SR -L -DU -c ok
testing_ztrsm -SR -L -C -DN -c ok
testing_ztrsm -SR -L -C -DU -c ok
testing_ztrsm -SR -U -DN -c ok
testing_ztrsm -SR -U -DU -c ok
testing_ztrsm -SR -U -C -DN -c ok
testing_ztrsm -SR -U -C -DU -c ok
testing_ztrsv -L -DN -c ok
testing_ztrsv -L -DU -c ok
testing_ztrsv -L -C -DN -c ok
testing_ztrsv -L -C -DU -c ok
testing_ztrsv -U -DN -c ok
testing_ztrsv -U -DU -c ok
testing_ztrsv -U -C -DN -c ok
testing_ztrsv -U -C -DU -c ok
testing_ztrtri_diag -L -c ok
testing_ztrtri_diag -U -c ok
testing_zhemm_mgpu -L -c ok
testing_zhemm_mgpu -U -c ok
testing_zhemv_mgpu -L -c ** 63 errors
testing_zhemv_mgpu -U -c ** 63 errors
testing_zher2k_mgpu -L -c ok
testing_zher2k_mgpu -U -c ok
testing_blas_z -c (disabled: takes long time; cublas only)
testing_cblas_z -c ** 2 tests failed ** exit with signal 11
****************************************************************************************************
summary
****************************************************************************************************
7642 tests in 75 commands passed
44 tests failed accuracy test
127 errors detected (crashes, CUDA errors, etc.)
routines with failures:
testing_cblas_z -c
testing_zhemv_mgpu -L -c
testing_zhemv_mgpu -U -c
testing_zher2k -L -C -c
testing_zher2k -L -c
testing_zher2k -U -C -c
testing_zher2k -U -c
testing_zherk -U -C -c
testing_zherk -U -c
testing_ztrmm -SL -U -C -DN -c
testing_ztrmm -SL -U -C -DU -c
Re: Numerical Errors
Thanks for providing this output. It would be helpful to have some specifics about your system:
- MAGMA version
- OS
- CUDA version, BLAS & LAPACK libraries (MKL, ACML, etc.)
- compiler (gcc, icc, etc.)
- your make.inc file
and the output of the testers that failed (see grep output below). Some of this information is in the tester output. There are some failures with errors around 1e-14 that are just a little higher than the double-precision tolerance (3.33e-15, or around 1e-6 in single-precision), and so are negligible. By contrast, an error around 1e-3 would indicate a real problem.
Below are the errors we observe (with K20c GPU, CUDA 6.5, MKL 11.1.2) for these problems. As you can see, none of these is significant. MAGMA no longer uses zdotu / zdotc routines, precisely because they crash on various systems due to bugs or interface differences in CPU BLAS libraries, not bugs in MAGMA. (I think in the future those zdot tests will be disabled to avoid confusion.)
As for numerical results differing, in most cases MAGMA is deterministic. The exceptions are randomized algorithms like zgesv_rbt (random butterfly transformation). However, the underlying CPU BLAS libraries (e.g., MKL) are not always deterministic. Using magma_[isdcz]malloc_cpu/magma_free_cpu instead of malloc/free helps to alleviate this issue by aligning memory consistently.
If you can, please include specific test cases where MAGMA differs between nodes.
-mark
- MAGMA version
- OS
- CUDA version, BLAS & LAPACK libraries (MKL, ACML, etc.)
- compiler (gcc, icc, etc.)
- your make.inc file
and the output of the testers that failed (see grep output below). Some of this information is in the tester output. There are some failures with errors around 1e-14 that are just a little higher than the double-precision tolerance (3.33e-15, or around 1e-6 in single-precision), and so are negligible. By contrast, an error around 1e-3 would indicate a real problem.
Below are the errors we observe (with K20c GPU, CUDA 6.5, MKL 11.1.2) for these problems. As you can see, none of these is significant. MAGMA no longer uses zdotu / zdotc routines, precisely because they crash on various systems due to bugs or interface differences in CPU BLAS libraries, not bugs in MAGMA. (I think in the future those zdot tests will be disabled to avoid confusion.)
Code: Select all
prompt> ./run_tests.py testing_zherk testing_zher2k testing_ztrmm testing_zhemv_mgpu testing_cblas_z --tol 100 > ! output.txt
prompt> grep -E '^./testing|failed' output.txt | grep -B 1 failed
./testing_zher2k -L -c --tol 100 --range 1:20:1 -N 30 -N 31 -N 32 -N 33 -N 34 -N 62 -N 63 -N 64 -N 65 -N 66 -N 94 -N 95 -N 96 -N 97 -N 98 -N 126 -N 127 -N 128 -N 129 -N 130 -N 254 -N 255 -N 256 -N 257 -N 258 -N 510 -N 511 -N 512 -N 513 -N 514 --range 100:900:100 --range 1000:4000:1000
2000 2000 962.78 ( 66.47) 62.59 (1022.51) 1.40e-14 failed
3000 3000 1001.79 ( 215.62) 96.85 (2230.23) 2.15e-14 failed
4000 4000 1021.63 ( 501.16) 130.94 (3910.17) 2.25e-14 failed
** 3 tests failed
./testing_zher2k -L -C -c --tol 100 --range 1:20:1 -N 30 -N 31 -N 32 -N 33 -N 34 -N 62 -N 63 -N 64 -N 65 -N 66 -N 94 -N 95 -N 96 -N 97 -N 98 -N 126 -N 127 -N 128 -N 129 -N 130 -N 254 -N 255 -N 256 -N 257 -N 258 -N 510 -N 511 -N 512 -N 513 -N 514 --range 100:900:100 --range 1000:4000:1000
2000 2000 926.75 ( 69.06) 121.67 ( 526.02) 1.50e-14 failed
3000 3000 971.80 ( 222.27) 128.57 (1680.06) 2.03e-14 failed
4000 4000 988.39 ( 518.01) 132.25 (3871.33) 2.19e-14 failed
** 3 tests failed
./testing_zher2k -U -c --tol 100 --range 1:20:1 -N 30 -N 31 -N 32 -N 33 -N 34 -N 62 -N 63 -N 64 -N 65 -N 66 -N 94 -N 95 -N 96 -N 97 -N 98 -N 126 -N 127 -N 128 -N 129 -N 130 -N 254 -N 255 -N 256 -N 257 -N 258 -N 510 -N 511 -N 512 -N 513 -N 514 --range 100:900:100 --range 1000:4000:1000
2000 2000 940.75 ( 68.03) 93.28 ( 686.14) 1.41e-14 failed
3000 3000 1002.52 ( 215.46) 126.54 (1707.00) 2.18e-14 failed
4000 4000 1008.84 ( 507.51) 131.88 (3882.43) 2.26e-14 failed
** 3 tests failed
./testing_zher2k -U -C -c --tol 100 --range 1:20:1 -N 30 -N 31 -N 32 -N 33 -N 34 -N 62 -N 63 -N 64 -N 65 -N 66 -N 94 -N 95 -N 96 -N 97 -N 98 -N 126 -N 127 -N 128 -N 129 -N 130 -N 254 -N 255 -N 256 -N 257 -N 258 -N 510 -N 511 -N 512 -N 513 -N 514 --range 100:900:100 --range 1000:4000:1000
2000 2000 909.75 ( 70.35) 121.11 ( 528.46) 1.53e-14 failed
3000 3000 971.62 ( 222.31) 128.14 (1685.68) 2.05e-14 failed
4000 4000 978.02 ( 523.51) 133.04 (3848.54) 2.21e-14 failed
** 3 tests failed
--
./testing_ztrmm -SL -U -C -DN -c --tol 100 --range 1:20:1 -N 30 -N 31 -N 32 -N 33 -N 34 -N 62 -N 63 -N 64 -N 65 -N 66 -N 94 -N 95 -N 96 -N 97 -N 98 -N 126 -N 127 -N 128 -N 129 -N 130 -N 254 -N 255 -N 256 -N 257 -N 258 -N 510 -N 511 -N 512 -N 513 -N 514 --range 100:900:100 --range 1000:4000:1000 -N 2,1 -N 3,1 -N 4,2 -N 20,19 -N 20,10 -N 20,2 -N 20,1 -N 200,199 -N 200,100 -N 200,20 -N 200,10 -N 200,1 -N 600,599 -N 600,300 -N 600,60 -N 600,30 -N 600,10 -N 600,1 -N 2000,1999 -N 2000,1000 -N 2000,200 -N 2000,100 -N 2000,10 -N 2000,1 -N 1,2 -N 1,3 -N 2,4 -N 19,20 -N 10,20 -N 2,20 -N 1,20 -N 199,200 -N 100,200 -N 20,200 -N 10,200 -N 1,200 -N 599,600 -N 300,600 -N 60,600 -N 30,600 -N 10,600 -N 1,600 -N 1999,2000 -N 1000,2000 -N 200,2000 -N 100,2000 -N 10,2000 -N 1,2000
4000 4000 233.92 (1094.51) 143.36 (1785.90) 1.15e-14 failed
** 1 tests failed
./testing_ztrmm -SL -U -C -DU -c --tol 100 --range 1:20:1 -N 30 -N 31 -N 32 -N 33 -N 34 -N 62 -N 63 -N 64 -N 65 -N 66 -N 94 -N 95 -N 96 -N 97 -N 98 -N 126 -N 127 -N 128 -N 129 -N 130 -N 254 -N 255 -N 256 -N 257 -N 258 -N 510 -N 511 -N 512 -N 513 -N 514 --range 100:900:100 --range 1000:4000:1000 -N 2,1 -N 3,1 -N 4,2 -N 20,19 -N 20,10 -N 20,2 -N 20,1 -N 200,199 -N 200,100 -N 200,20 -N 200,10 -N 200,1 -N 600,599 -N 600,300 -N 600,60 -N 600,30 -N 600,10 -N 600,1 -N 2000,1999 -N 2000,1000 -N 2000,200 -N 2000,100 -N 2000,10 -N 2000,1 -N 1,2 -N 1,3 -N 2,4 -N 19,20 -N 10,20 -N 2,20 -N 1,20 -N 199,200 -N 100,200 -N 20,200 -N 10,200 -N 1,200 -N 599,600 -N 300,600 -N 60,600 -N 30,600 -N 10,600 -N 1,600 -N 1999,2000 -N 1000,2000 -N 200,2000 -N 100,2000 -N 10,2000 -N 1,2000
4000 4000 236.30 (1083.52) 142.91 (1791.53) 1.15e-14 failed
** 1 tests failed
--
./testing_cblas_z -c --tol 100 --range 1:20:1 -N 30 -N 31 -N 32 -N 33 -N 34 -N 62 -N 63 -N 64 -N 65 -N 66 -N 94 -N 95 -N 96 -N 97 -N 98 -N 126 -N 127 -N 128 -N 129 -N 130 -N 254 -N 255 -N 256 -N 257 -N 258 -N 510 -N 511 -N 512 -N 513 -N 514 --range 100:900:100 --range 1000:4000:1000
zdotc 0 -nan failed
zdotu 0 -nan failed
zdotc 0 -nan failed
zdotu 0 -nan failed
** 4 tests failed ** exit with signal 7
18 tests failed accuracy test
If you can, please include specific test cases where MAGMA differs between nodes.
-mark