#include #include #include "util.hh" const int gpu_kernel_nb = 64; //------------------------------------------------------------------------------ /// Fills an m-by-n matrix with random uniform [-1, 1] entries. /// Template type can be float or double. /// template void init_matrix( int m, int n, T* A, int ld ) { for (int j = 0; j < n; ++j) { for (int i = 0; i < m; ++i) { A[ i + j*ld ] = i + j; } } } //------------------------------------------------------------------------------ template T cpu_norm_one( int m, int n, T* A, int lda ) { T result = 0; for (int j = 0; j < n; ++j) { T colsum = 0; for (int i = 0; i < m; ++i) { colsum += std::abs( A[ i + j*lda ] ); } result = std::max( result, colsum ); } return result; } //------------------------------------------------------------------------------ /// GPU kernel. /// template __global__ void gpu_norm_one_kernel( int m, int n, T const* A, int ld, T* sums ) { int j = blockIdx.x * blockDim.x + threadIdx.x; if (j < n) { // Shift to j-th col. T const* Aj = &A[ j*ld ]; // Thread sums down col i. T sum = 0; for (int i = 0; i < m; ++i) { sum += abs( Aj[ i ] ); } // Save thread's result. sums[ j ] = sum; } } //------------------------------------------------------------------------------ /// CPU driver launches kernels on GPU. /// template void gpu_norm_one( int m, int n, T const* A, int ld, T* sums, cudaStream_t stream ) { // ceil( m / nb ) blocks, nb threads each. int blocks = ceildiv( m, gpu_kernel_nb ); gpu_norm_one_kernel<<< blocks, gpu_kernel_nb, 0, stream >>> ( m, n, A, ld, sums ); throw_error( cudaGetLastError() ); } //------------------------------------------------------------------------------ template void test( int m, int n ) { size_t size = m * n * sizeof(T); T *hA, *hB, *dA, *dB, *dworkA, *dworkB; throw_error( cudaMallocHost( &hA, size ) ); throw_error( cudaMallocHost( &hB, size ) ); throw_error( cudaMalloc( &dA, size ) ); throw_error( cudaMalloc( &dB, size ) ); throw_error( cudaMalloc( &dworkA, n * sizeof(T) ) ); throw_error( cudaMalloc( &dworkB, n * sizeof(T) ) ); cudaStream_t comm_stream; cudaStream_t compute_stream; throw_error( cudaStreamCreate( &comm_stream ) ); throw_error( cudaStreamCreate( &compute_stream ) ); // Fill in A. nvtxRangePush( "init_matrix( A )" ); init_matrix( m, n, hA, m ); nvtxRangePop(); // Copy A to device. throw_error( cudaMemcpyAsync( dA, hA, size, cudaMemcpyDefault, comm_stream ) ); // Meanwhile, fill in B. nvtxRangePush( "init_matrix( B )" ); init_matrix( m, n, hB, m ); nvtxRangePop(); // Wait for A to copy, then compute on it. throw_error( cudaStreamSynchronize( comm_stream ) ); gpu_norm_one( m, n, dA, m, dworkA, compute_stream ); // Meanwhile, copy B to device. throw_error( cudaMemcpyAsync( dB, hB, size, cudaMemcpyDefault, comm_stream ) ); // Meanwhile, do some CPU computation. nvtxRangePush( "cpu_norm_one( A )" ); T normA = cpu_norm_one( m, n, hA, m ); nvtxRangePop(); // Wait for B to copy, then compute on it (after A). throw_error( cudaStreamSynchronize( comm_stream ) ); gpu_norm_one( m, n, dB, m, dworkB, compute_stream ); // Meanwhile, do some CPU computation. nvtxRangePush( "cpu_norm_one( B )" ); T normB = cpu_norm_one( m, n, hB, m ); nvtxRangePop(); // Wait for computation on B. throw_error( cudaStreamSynchronize( compute_stream ) ); throw_error( cudaStreamDestroy( comm_stream ) ); throw_error( cudaStreamDestroy( compute_stream ) ); throw_error( cudaFreeHost( hA ) ); throw_error( cudaFreeHost( hB ) ); throw_error( cudaFree( dA ) ); throw_error( cudaFree( dB ) ); throw_error( cudaFree( dworkA ) ); throw_error( cudaFree( dworkB ) ); printf( "normA %.2f, normB %.2f\n", normA, normB ); } //------------------------------------------------------------------------------ int main( int argc, char** argv ) { int m = 1000; int n = 1000; int verbose = 0; int repeat = 5; // Rudimentary argument parsing. for (int i = 1; i < argc; ++i) { std::string arg( argv[i] ); if (arg == "-m" && i+1 < argc) { m = strtol( argv[++i], nullptr, 0 ); } else if (arg == "-n" && i+1 < argc) { n = strtol( argv[++i], nullptr, 0 ); } else if (arg == "-v") { verbose += 1; } else if (arg == "-repeat" && i+1 < argc) { repeat = strtol( argv[++i], nullptr, 0 ); } else { printf( "Unknown argument: %s\n", argv[i] ); } } printf( "m %4d, n %4d\n", m, n ); try { for (int i = 0; i < repeat; ++i) { test( m, n ); } } catch (std::exception const& ex) { fprintf( stderr, "Error: %s\n", ex.what() ); } }