/// ex01-matrix-norm.cu
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <string>
#include <vector>

#include "util.hh"

//==============================================================================
// One norm

const int matrix_norm_one_nb = 64;

//------------------------------------------------------------------------------
/// GPU kernel: computes column sums:
///     sums[j] = sum_{i = 0...m} | Aij |.
///
/// Each thread-block computes one entry, sums[j].
/// Threads compute partial sums, then sum-reduce to get sums[j].
///
template <typename T>
__global__ void matrix_norm_one_kernel_v1(
    int m, int n,
    T const* __restrict__ A, int ld,
    T* __restrict__ sums )
{
    const int nb = matrix_norm_one_nb;
    assert( nb == blockDim.x );

    // Partial column sums, one per thread in thread block.
    __shared__ T s_partial_sums[ nb ];

    // Shift to j-th column.
    int j = blockIdx.x;
    T const* Aj = &A[ j*ld ];

    // Thread tid sums:
    // s_partial_sums[ tid ] = A(tid, j) + A(tid + nb, j) + ... + A(tid + k*nb, j).
    int tid = threadIdx.x;
    s_partial_sums[ tid ] = 0;
    for (int i = tid; i < m; i += nb) {
        s_partial_sums[ tid ] += abs( Aj[ i ] );
    }

    // Parallel binary tree sum reduction; result in s_partial_sums[ 0 ].
    // Assumes nb is a power of two.
    int kb = nb / 2;
    while (kb > 0) {
        __syncthreads();
        if (tid < kb)
            s_partial_sums[ tid ] += s_partial_sums[ tid + kb ];
        kb /= 2;
    }

    // Save thread block's result.
    if (tid == 0) {
        sums[ j ] = s_partial_sums[ 0 ];
    }
}

//------------------------------------------------------------------------------
/// Variant of v1: each thread sums a contiguous chunk of rows,
/// instead of striding by nb.
///
template <typename T>
__global__ void matrix_norm_one_kernel_v1b(
    int m, int n,
    T const* __restrict__ A, int ld,
    T* __restrict__ sums )
{
    const int nb = matrix_norm_one_nb;
    assert( nb == blockDim.x );

    // Partial column sums, one per thread in thread block.
    __shared__ T s_partial_sums[ nb ];

    // Shift to j-th column.
    int j = blockIdx.x;
    T const* Aj = &A[ j*ld ];

    // Thread tid sums its chunk of rows:
    // s_partial_sums[ tid ] = A(begin, j) + A(begin + 1, j) + ... + A(end - 1, j).
    int tid = threadIdx.x;
    s_partial_sums[ tid ] = 0;
    int chunk = (m + nb - 1) / nb;  // ceildiv( m, nb )
    int begin = tid*chunk;
    int end   = (tid + 1)*chunk;
    if (end > m)
        end = m;  // end = min( end, m )
    for (int i = begin; i < end; ++i) {
        s_partial_sums[ tid ] += abs( Aj[ i ] );
    }

    // Parallel binary tree sum reduction; result in s_partial_sums[ 0 ].
    int kb = nb / 2;
    while (kb > 0) {
        __syncthreads();
        if (tid < kb)
            s_partial_sums[ tid ] += s_partial_sums[ tid + kb ];
        kb /= 2;
    }

    // Save thread block's result.
    if (tid == 0) {
        sums[ j ] = s_partial_sums[ 0 ];
    }
}

//------------------------------------------------------------------------------
/// GPU kernel: computes column sums:
///     sums[j] = sum_{i = 0...m} | Aij |.
///
/// Each thread-block computes nb entries, sums[ k..k+nb ], k = blockIdx*nb.
/// Each thread sums one column.
///
template <typename T>
__global__ void matrix_norm_one_kernel_v2(
    int m, int n,
    T const* __restrict__ A, int ld,
    T* __restrict__ sums )
{
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j < n) {
        // Shift to j-th column.
        T const* Aj = &A[ j*ld ];

        // Thread sums down column j.
        T sum = 0;
        for (int i = 0; i < m; ++i) {
            sum += abs( Aj[ i ] );
        }

        // Save thread's result.
        sums[ j ] = sum;
    }
}
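//------------------------------------------------------------------------------
/// A possible warp-shuffle variant of the v1 reduction: a sketch only, not
/// launched by the driver below. It assumes nb is a power of two and a
/// multiple of warpSize (32); the kernel name and full_mask are illustrative
/// additions, not part of the original exercise.
///
template <typename T>
__global__ void matrix_norm_one_kernel_v1_shfl(
    int m, int n,
    T const* __restrict__ A, int ld,
    T* __restrict__ sums )
{
    const int nb = matrix_norm_one_nb;
    assert( nb == blockDim.x );

    // One shared-memory slot per warp for the per-warp partial sums.
    __shared__ T s_warp_sums[ nb / 32 ];

    // Shift to j-th column; each thread strides down it, as in v1.
    int j = blockIdx.x;
    T const* Aj = &A[ j*ld ];

    int tid = threadIdx.x;
    T sum = 0;
    for (int i = tid; i < m; i += nb) {
        sum += abs( Aj[ i ] );
    }

    // Reduce within each warp using register shuffles; no __syncthreads()
    // or shared memory is needed at this stage.
    const unsigned full_mask = 0xffffffffu;
    for (int offset = warpSize/2; offset > 0; offset /= 2) {
        sum += __shfl_down_sync( full_mask, sum, offset );
    }

    // Lane 0 of each warp publishes its warp's sum.
    int lane = tid % warpSize;
    int warp = tid / warpSize;
    if (lane == 0)
        s_warp_sums[ warp ] = sum;
    __syncthreads();

    // First warp reduces the per-warp sums (nb/warpSize <= 32 values).
    if (warp == 0) {
        sum = (lane < nb/warpSize) ? s_warp_sums[ lane ] : T( 0 );
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            sum += __shfl_down_sync( full_mask, sum, offset );
        }
        if (lane == 0)
            sums[ j ] = sum;
    }
}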
//------------------------------------------------------------------------------
/// CPU driver computes one norm of matrix, maximum column sum:
///     max_{j = 0...n} sum_{i = 0...m} | Aij |.
///
/// CPU driver launches kernels on GPU.
///
template <typename T>
T matrix_norm_one(
    int m, int n, T const* A, int ld, cudaStream_t stream,
    int version, int verbose )
{
    // Workspace for n column sums.
    thrust::device_vector<T> sums_vec( n );
    T* sums = thrust::raw_pointer_cast( sums_vec.data() );

    // Compute column sums.
    switch (version) {
        case 1: {
            // n blocks, one per column; nb threads each.
            int blocks = n;
            if (verbose) {
                printf( "m %d, n %d, matrix_norm_one_kernel_v1b<<< %d blocks, %d threads each >>>\n",
                        m, n, blocks, matrix_norm_one_nb );
            }
            matrix_norm_one_kernel_v1b<<< blocks, matrix_norm_one_nb, 0, stream >>>
                ( m, n, A, ld, sums );
            break;
        }
        case 2: {
            // ceil( n / nb ) blocks, nb threads each; one thread per column.
            int blocks = ceildiv( n, matrix_norm_one_nb );
            if (verbose) {
                printf( "m %d, n %d, matrix_norm_one_kernel_v2<<< %d blocks, %d threads each >>>\n",
                        m, n, blocks, matrix_norm_one_nb );
            }
            matrix_norm_one_kernel_v2<<< blocks, matrix_norm_one_nb, 0, stream >>>
                ( m, n, A, ld, sums );
            break;
        }
        default:
            throw std::exception();
    }
    throw_error( cudaGetLastError() );

    // Get max column sum.
    T result = thrust::reduce(
        sums_vec.begin(), sums_vec.end(), T( 0 ), thrust::maximum<T>() );
    return result;
}

//==============================================================================
template <typename T>
void test( int m, int n, int align, int verbose )
{
    // Align columns, e.g., to 128-byte GPU cache line.
    int alignT = align / sizeof(T);
    alignT = std::max( alignT, 1 );
    int ld = ceildiv( m, alignT ) * alignT;

    // Allocate and initialize matrix on CPU host.
    std::vector<T> hA( ld * n );
    rand_matrix( m, n, hA.data(), ld );
    if (verbose >= 2) {
        print_matrix( "A", m, n, hA.data(), ld );
    }

    // Allocate matrix on GPU device and copy from host to device.
    thrust::device_vector<T> dA = hA;

    // Get raw pointer to pass to CUDA kernel.
    T* dA_ptr = thrust::raw_pointer_cast( dA.data() );

    // todo: Better error handling; this leaks GPU memory on error.
    cudaStream_t stream = nullptr;
    throw_error( cudaStreamCreate( &stream ) );

    for (int v = 1; v <= 2; ++v) {
        throw_error( cudaStreamSynchronize( stream ) );
        double time = get_wtime();

        //--------------------
        // Get matrix norm.
        T norm = matrix_norm_one( m, n, dA_ptr, ld, stream, v, verbose );

        throw_error( cudaStreamSynchronize( stream ) );
        time = get_wtime() - time;

        // Print results. GB/s assumes A is read once.
        double gbytes = m * n * sizeof(T) * 1e-9 / time;
        printf( "m %4d, n %4d, time %10.6f, GB/s %10.6f, norm = %.4e, version %d\n",
                m, n, time, gbytes, norm, v );
    }
    printf( "\n" );

    throw_error( cudaStreamDestroy( stream ) );
}

//------------------------------------------------------------------------------
int main( int argc, char** argv )
{
    int m = 8000;
    int n = 8000;
    int align = 4;  // Minimal column alignment by default.
    int verbose = 0;
    int repeat = 5;

    // Rudimentary argument parsing.
    for (int i = 1; i < argc; ++i) {
        std::string arg( argv[i] );
        if (arg == "-m" && i+1 < argc) {
            m = strtol( argv[++i], nullptr, 0 );
        }
        else if (arg == "-n" && i+1 < argc) {
            n = strtol( argv[++i], nullptr, 0 );
        }
        else if (arg == "-align" && i+1 < argc) {
            align = strtol( argv[++i], nullptr, 0 );
        }
        else if (arg == "-v") {
            verbose += 1;
        }
        else if (arg == "-repeat" && i+1 < argc) {
            repeat = strtol( argv[++i], nullptr, 0 );
        }
        else {
            printf( "Unknown argument: %s\n", argv[i] );
        }
    }
    printf( "m %4d, n %4d, align %3d bytes\n", m, n, align );

    try {
        for (int i = 0; i < repeat; ++i) {
            test< double >( m, n, align, verbose );
        }
    }
    catch (std::exception const& ex) {
        fprintf( stderr, "Error: %s\n", ex.what() );
    }
    return 0;
}
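//------------------------------------------------------------------------------
// Build/run sketch. The flags below are assumptions, not from the original:
// pick -arch to match your GPU, and util.hh must be on the include path.
//
//     nvcc -O2 -std=c++14 -arch=sm_70 ex01-matrix-norm.cu -o ex01-matrix-norm
//     ./ex01-matrix-norm -m 8000 -n 8000 -align 128 -repeat 5 -v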