MAGMA 2.9.0
Matrix Algebra for GPU and Multicore Architectures
Functions

template<typename T, typename ID>
__device__ void magma_getidmax_n (int n, int i, T *x, ID *ind)
    Same as magma_getidmax(), but takes n as a runtime argument instead of a compile-time template parameter.

template<int n, typename T>
__device__ void magma_max_reduce (int i, T *x)
    Does max reduction of n-element array x, leaving total in x[0].

template<typename T>
__device__ void magma_max_reduce_n (int n, int i, T *x)
    Same as magma_max_reduce(), but takes n as a runtime argument instead of a compile-time template parameter.

template<typename T>
__host__ __device__ T max_nan (T x, T y)
    Max that propagates nan consistently: max_nan( 1, nan ) = nan, max_nan( nan, 1 ) = nan.

template<int n, typename T>
__device__ void magma_max_nan_reduce (int i, T *x)
    Same as magma_max_reduce(), but propagates nan values.

template<typename T>
__device__ void magma_max_nan_reduce_n (int n, int i, T *x)
    Same as magma_max_nan_reduce(), but takes n as a runtime argument instead of a compile-time template parameter.

template<typename T>
__global__ void magma_max_nan_kernel (int n, T *x)
    Max reduction, for an arbitrary-size vector.

template<int n, typename T>
__device__ void magma_sum_reduce (int i, T *x)
    Does sum reduction of n-element array x, leaving total in x[0].

template<typename T>
__global__ void magma_sum_reduce_kernel (int n, T *x)
    Sum reduction, for an arbitrary-size vector.

template<typename T>
__device__ void magma_sum_reduce_n (int n, int i, T *x)
    Same as magma_sum_reduce(), but takes n as a runtime argument instead of a compile-time template parameter.

template<int m, int n, typename T>
__device__ void magma_sum_reduce_2d (int i, int j, T x[m][n])
    Does sum reduction of each column of M x N array x, leaving totals in x[0][j] = sum( x[0:m-1][j] ), for 0 <= j < n.

template<int m0, int m1, int m2, typename T>
__device__ void magma_sum_reduce_3d (int i, int j, int k, T x[m0][m1][m2])
    Does sum reduction of each "column" of M0 x M1 x M2 array x, leaving totals in x[0][j][k] = sum( x[0:m0-1][j][k] ), for 0 <= j < m1, 0 <= k < m2.
Function Documentation

template<int n, typename T>
__device__ void magma_max_reduce ( int i, T * x )
Does max reduction of n-element array x, leaving total in x[0].
Contents of x are destroyed in the process. With k threads, it can reduce an array of up to 2*k elements. Assumes the number of threads is <= 1024 (the maximum number of threads up to CUDA capability 3.0). Having n as a template parameter lets the compiler evaluate some conditions at compile time. Calls __syncthreads before and after the reduction.
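As an illustration of the calling pattern only (not taken from the MAGMA sources; BLK, the kernel, and the variable names are assumptions), a single block could compute the maximum absolute value of a vector like this:

    // Hedged usage sketch: launch with one block of BLK threads. Each thread
    // accumulates a partial maximum of |x[j]| in shared memory, then
    // magma_max_reduce leaves the block maximum in smax[0].
    // Assumes the MAGMA header providing magma_max_reduce is included.
    #define BLK 128

    __global__ void block_absmax_sketch( int n, const double* x, double* result )
    {
        __shared__ double smax[ BLK ];
        int i = threadIdx.x;
        double m = 0;
        for (int j = i; j < n; j += BLK)
            m = fmax( m, fabs( x[j] ) );
        smax[i] = m;                          // every thread fills its slot
        magma_max_reduce< BLK >( i, smax );   // block-wide max into smax[0]
        if (i == 0)
            *result = smax[0];
    }

Here BLK <= 1024 satisfies the thread-count assumption, and the shared array has exactly BLK elements, well within the 2*BLK limit.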
template<typename T>
__host__ __device__ T max_nan ( T x, T y )  [inline]
Max that propagates nan consistently: max_nan( 1, nan ) = nan, max_nan( nan, 1 ) = nan.
For x = nan, y = 1: nan < y is false, so it yields x (nan).
For x = 1, y = nan: x < nan is false, which would yield x, but isnan(nan) is true, so it yields y (nan).
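The two cases above reduce to a single conditional. A minimal sketch of such a nan-propagating max (not necessarily the MAGMA definition; the name max_nan_sketch is a placeholder, and a floating-point T with isnan available is assumed):

    template<typename T>
    __host__ __device__ static inline T max_nan_sketch( T x, T y )
    {
        // If y is nan, or x < y, return y; otherwise return x.
        // x = nan, y = 1:  isnan(1) false, nan < 1 false  -> returns x (nan)
        // x = 1, y = nan:  isnan(nan) true                -> returns y (nan)
        return (isnan( y ) || x < y) ? y : x;
    }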
template<int n, typename T>
__device__ void magma_max_nan_reduce ( int i, T * x )
Same as magma_max_reduce(), but propagates nan values.
Does max reduction of n-element array x, leaving total in x[0]. Contents of x are destroyed in the process. With k threads, it can reduce an array of up to 2*k elements. Assumes the number of threads is <= 1024 (the maximum number of threads up to CUDA capability 3.0). Having n as a template parameter lets the compiler evaluate some conditions at compile time. Calls __syncthreads before and after the reduction.
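For illustration only (this is not the MAGMA implementation), the block-level tree reduction that the description above outlines, including the __syncthreads calls and the k-threads-reduce-2*k-elements property, could be sketched as follows, using the max_nan documented above:

    template<int n, typename T>
    __device__ void max_nan_reduce_sketch( int i, T* x )
    {
        __syncthreads();
        // Each step folds the upper part of the active range onto the lower
        // part; the first fold already lets k threads handle up to 2*k elements.
        for (int len = n, half = (n + 1) / 2; len > 1;
             len = half, half = (half + 1) / 2)
        {
            if (i < half && i + half < len)
                x[i] = max_nan( x[i], x[i + half] );
            __syncthreads();
        }
    }

Having n as a compile-time template parameter, as the documentation notes, lets the compiler resolve such a loop (or an unrolled equivalent) at compile time.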
template<typename T>
__global__ void magma_max_nan_kernel ( int n, T * x )
Max reduction, for an arbitrary-size vector.
Leaves max(x) in x[0]. Uses only one thread block of 512 threads, so it is not efficient for very large vectors.
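A host-side launch consistent with the single 512-thread block noted above might look like the following sketch (d_x and n are placeholders; stream handling is omitted):

    // Hedged usage sketch: reduces the length-n device vector d_x in place,
    // leaving max(d_x), with nan propagation, in d_x[0].
    magma_max_nan_kernel<double><<< 1, 512 >>>( n, d_x );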
template<int n, typename T>
__device__ void magma_sum_reduce ( int i, T * x )
Does sum reduction of n-element array x, leaving total in x[0].
Contents of x are destroyed in the process. With k threads, it can reduce an array of up to 2*k elements. Assumes the number of threads is <= 1024 (the maximum number of threads up to CUDA capability 3.0). Having n as a template parameter lets the compiler evaluate some conditions at compile time. Calls __syncthreads before and after the reduction.
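As a usage sketch (again not from the MAGMA sources; BLK and the kernel name are assumptions), a single block could accumulate a sum the same way the max example above does:

    // Hedged usage sketch: launch with one block of BLK threads. Each thread
    // accumulates a partial sum in shared memory; magma_sum_reduce then leaves
    // the block total in ssum[0].
    #define BLK 256

    __global__ void block_sum_sketch( int n, const double* x, double* result )
    {
        __shared__ double ssum[ BLK ];
        int i = threadIdx.x;
        double s = 0;
        for (int j = i; j < n; j += BLK)
            s += x[j];
        ssum[i] = s;
        magma_sum_reduce< BLK >( i, ssum );
        if (i == 0)
            *result = ssum[0];
    }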
template<typename T>
__global__ void magma_sum_reduce_kernel ( int n, T * x )
Sum reduction, for an arbitrary-size vector.
Leaves sum(x) in x[0]. Uses only one thread block of 512 threads, so it is not efficient for very large vectors.
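Because it runs as a single 512-thread block, such a kernel is most natural as the finishing step after a first pass has produced one partial sum per block. A hedged sketch of that second step (d_partial and num_blocks are placeholders; the first pass is not shown):

    // Hedged usage sketch: d_partial holds num_blocks partial sums computed by
    // an earlier kernel; this single-block launch leaves the grand total in
    // d_partial[0].
    magma_sum_reduce_kernel<double><<< 1, 512 >>>( num_blocks, d_partial );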
template<int m, int n, typename T>
__device__ void magma_sum_reduce_2d ( int i, int j, T x[m][n] )
Does sum reduction of each column of M x N array x, leaving totals in x[0][j] = sum( x[0:m-1][j] ), for 0 <= j < n.
Contents of x are destroyed in the process. Calls __syncthreads before and after the reduction.
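As a usage sketch (dimensions, the kernel, and variable names are assumptions), a DIMX x DIMY thread block could sum a DIMX x DIMY tile along its first index:

    // Hedged usage sketch: launch with a dim3(DIMX, DIMY) thread block. Each
    // thread stores one element; magma_sum_reduce_2d leaves the column totals
    // in sdata[0][j].
    #define DIMX 32
    #define DIMY 4

    __global__ void tile_colsum_sketch( const double* A, int lda, double* colsum )
    {
        __shared__ double sdata[ DIMX ][ DIMY ];
        int i = threadIdx.x;
        int j = threadIdx.y;
        sdata[i][j] = A[ i + j*lda ];
        magma_sum_reduce_2d< DIMX, DIMY >( i, j, sdata );
        if (i == 0)
            colsum[j] = sdata[0][j];
    }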
template<int m0, int m1, int m2, typename T>
__device__ void magma_sum_reduce_3d ( int i, int j, int k, T x[m0][m1][m2] )
Does sum reduction of each "column" of M0 x M1 x M2 array x, leaving totals in x[0][j][k] = sum( x[0:m0-1][j][k] ), for 0 <= j < m1, 0 <= k < m2.
Contents of x are destroyed in the process. Calls __syncthreads before and after the reduction.
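Analogously (sizes and names again being assumptions), a dim3(M0, M1, M2) thread block can reduce a small 3D shared array along its first index:

    // Hedged usage sketch: each thread stores one value; after the call, the
    // totals over the first index are in s[0][j][k].
    #define M0 16
    #define M1 4
    #define M2 4

    __global__ void tile_sum_3d_sketch( const double* x, double* out )
    {
        __shared__ double s[ M0 ][ M1 ][ M2 ];
        int i = threadIdx.x, j = threadIdx.y, k = threadIdx.z;
        s[i][j][k] = x[ i + M0*(j + M1*k) ];
        magma_sum_reduce_3d< M0, M1, M2 >( i, j, k, s );
        if (i == 0)
            out[ j + M1*k ] = s[0][j][k];
    }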