/// ex01-matrix-add.cu #include #include #include "util.hh" //------------------------------------------------------------------------------ /// GPU kernel: adds two m-by-n matrices: C = A + B. /// Each thread-block adds mb-by-nb elements. /// Each thread adds one element, C(i,j) = A(i,j) + B(i,j). /// template __global__ void matrix_add_kernel( int m, int n, T const* __restrict__ A, T const* __restrict__ B, T* __restrict__ C, int ld ) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; if (i < m && j < n) { C[ i + j*ld ] = A[ i + j*ld ] + B[ i + j*ld ]; } } //------------------------------------------------------------------------------ /// CPU driver: adds two m-by-n matrices: C = A + B. /// ld is leading dimension, i.e., column stride, for all of A, B, C. /// Launches kernel on GPU. /// template void matrix_add( int m, int n, T const* A, T const* B, T* C, int ld, cudaStream_t stream, int mb, int nb, int verbose ) { // ceil( m / mb ) x ceil( n / nb ) blocks, mb x nb threads each. dim3 threads( mb, nb ); dim3 blocks( ceildiv( m, threads.x ), ceildiv( n, threads.y ) ); if (verbose) { printf( "%% m %d, n %d, matrix_add<<< %d x %d x %d blocks, %d x %d x %d threads each >>>\n", m, n, blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z ); } matrix_add_kernel<<< blocks, threads, 0, stream >>>( m, n, A, B, C, ld ); throw_error( cudaGetLastError() ); } //------------------------------------------------------------------------------ template void test( int m, int n, int align, int mb, int nb, int verbose ) { // Align columns, e.g., to 128-byte GPU cache line. int alignT = align / sizeof(T); alignT = std::max( alignT, 1 ); int ld = ceildiv( m, alignT ) * alignT; // Allocate and initialize matrices on CPU host. std::vector hA( ld * n ), hB( ld * n ), hC( ld * n ); rand_matrix( m, n, hA.data(), ld ); rand_matrix( m, n, hB.data(), ld ); rand_matrix( m, n, hC.data(), ld ); if (verbose >= 2) { print_matrix( "A", m, n, hA.data(), ld ); print_matrix( "B", m, n, hB.data(), ld ); print_matrix( "C", m, n, hC.data(), ld ); } // Allocate matrices on GPU device and copy from host to device. // todo: Better error handling; this leaks GPU memory on error. T *dA = nullptr, *dB = nullptr, *dC = nullptr; size_t size = ld * n * sizeof(T); throw_error( cudaMalloc( &dA, size ) ); throw_error( cudaMalloc( &dB, size ) ); throw_error( cudaMalloc( &dC, size ) ); throw_error( cudaMemcpy( dA, hA.data(), size, cudaMemcpyDefault )); throw_error( cudaMemcpy( dB, hB.data(), size, cudaMemcpyDefault )); throw_error( cudaMemcpy( dC, hC.data(), size, cudaMemcpyDefault )); cudaStream_t stream = nullptr; throw_error( cudaStreamCreate( &stream ) ); throw_error( cudaStreamSynchronize( stream ) ); double time = get_wtime(); //-------------------- // Add matrices. matrix_add( m, n, dA, dB, dC, ld, stream, mb, nb, verbose ); throw_error( cudaStreamSynchronize( stream ) ); time = get_wtime() - time; // Print results. GB/s based on it reads A, B; writes C. double gbytes = 3 * m * n * sizeof(T) * 1e-9 / time; printf( "m %4d, n %4d, ld %4d, time %10.6f, GB/s %10.6f\n", m, n, ld, time, gbytes ); // Copy hC = dC (device to host). throw_error( cudaMemcpy( hC.data(), dC, size, cudaMemcpyDefault )); throw_error( cudaStreamDestroy( stream ) ); throw_error( cudaFree( dA ) ); throw_error( cudaFree( dB ) ); throw_error( cudaFree( dC ) ); if (verbose >= 2) { print_matrix( "Cout", m, n, hC.data(), ld ); printf( "%% In Matlab, check norm( A + B - Cout ) is ~ 1e-04,\n" "%% since matrices are printed to 4 decimal places.\n" ); } } //------------------------------------------------------------------------------ int main( int argc, char** argv ) { int m = 8000; int n = 8000; int mb = 8; int nb = 8; int align = 4; // Minimal column alignment by default. int verbose = 0; int repeat = 5; // Rudimentary argument parsing. for (int i = 1; i < argc; ++i) { std::string arg( argv[i] ); if (arg == "-m" && i+1 < argc) { m = strtol( argv[++i], nullptr, 0 ); } else if (arg == "-n" && i+1 < argc) { n = strtol( argv[++i], nullptr, 0 ); } else if (arg == "-align" && i+1 < argc) { align = strtol( argv[++i], nullptr, 0 ); } else if (arg == "-mb" && i+1 < argc) { mb = strtol( argv[++i], nullptr, 0 ); } else if (arg == "-nb" && i+1 < argc) { nb = strtol( argv[++i], nullptr, 0 ); } else if (arg == "-v") { verbose += 1; } else if (arg == "-repeat" && i+1 < argc) { repeat = strtol( argv[++i], nullptr, 0 ); } else { printf( "Unknown argument: %s\n", argv[i] ); } } printf( "m %4d, n %4d, align %3d bytes, mb %3d, nb %3d\n", m, n, align, mb, nb ); try { for (int i = 0; i < repeat; ++i) { test( m, n, align, mb, nb, verbose ); } } catch (std::exception const& ex) { fprintf( stderr, "Error: %s\n", ex.what() ); } return 0; }