MAGMA 2.9.0
Matrix Algebra for GPU and Multicore Architectures

Functions

template<typename T , typename ID >
__device__ void magma_getidmax_n (int n, int i, T *x, ID *ind)
 Same as magma_getidmax(), but takes n as a runtime argument instead of a compile-time template parameter.
 
template<int n, typename T >
__device__ void magma_max_reduce (int i, T *x)
 Does max reduction of n-element array x, leaving total in x[0].
 
template<typename T >
__device__ void magma_max_reduce_n (int n, int i, T *x)
 Same as magma_max_reduce(), but takes n as a runtime argument instead of a compile-time template parameter.
 
template<typename T >
__host__ __device__ T max_nan (T x, T y)
 max that propagates NaN consistently: max_nan( 1, nan ) = nan; max_nan( nan, 1 ) = nan.
 
template<int n, typename T >
__device__ void magma_max_nan_reduce (int i, T *x)
 Same as magma_max_reduce(), but propagates NaN values.
 
template<typename T >
__device__ void magma_max_nan_reduce_n (int n, int i, T *x)
 Same as magma_max_nan_reduce(), but takes n as a runtime argument instead of a compile-time template parameter.
 
template<typename T >
__global__ void magma_max_nan_kernel (int n, T *x)
 Max reduction for a vector of arbitrary size.
 
template<int n, typename T >
__device__ void magma_sum_reduce (int i, T *x)
 Does sum reduction of n-element array x, leaving total in x[0].
 
template<typename T >
__global__ void magma_sum_reduce_kernel (int n, T *x)
 Sum reduction for a vector of arbitrary size.
 
template<typename T >
__device__ void magma_sum_reduce_n (int n, int i, T *x)
 Same as magma_sum_reduce(), but takes n as a runtime argument instead of a compile-time template parameter.
 
template<int m, int n, typename T >
__device__ void magma_sum_reduce_2d (int i, int j, T x[m][n])
 Does sum reduction of each column of M x N array x, leaving totals in x[0][j] = sum( x[0:m-1][j] ), for 0 <= j < n.
 
template<int m0, int m1, int m2, typename T >
__device__ void magma_sum_reduce_3d (int i, int j, int k, T x[m0][m1][m2])
 Does sum reduction of each "column" of M0 x M1 x M2 array x, leaving totals in x[0][j][k] = sum( x[0:m0-1][j][k] ), for 0 <= j < m1, 0 <= k < m2.
 

Function Documentation

◆ magma_max_reduce()

template<int n, typename T >
__device__ void magma_max_reduce (int i, T *x)

Does max reduction of n-element array x, leaving total in x[0].

Contents of x are destroyed in the process. With k threads, it can reduce an array of up to 2*k elements. Assumes the number of threads is <= 1024 (the maximum number of threads per block up to CUDA compute capability 3.0). Having n as a template parameter allows the compiler to evaluate some conditions at compile time. Calls __syncthreads before & after reduction.
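
The following is a minimal, hypothetical sketch (not the MAGMA source) of a reduction with this structure; the name sketch_max_reduce is invented for illustration. Because n is a compile-time constant, the compiler removes the fold levels that cannot apply, and with k threads the first active level handles at most 2*k elements.

    // Hypothetical sketch of a tree max reduction with compile-time n.
    // i is the calling thread's index; x is a shared-memory array of n elements.
    // Each level folds the upper half of the array onto the lower half; levels
    // where n is too small are eliminated at compile time.
    template< int n, typename T >
    __device__ void sketch_max_reduce( int i, T* x )
    {
        __syncthreads();
        if ( n > 1024 ) { if ( i < 1024 && i + 1024 < n ) { x[i] = (x[i] > x[i+1024]) ? x[i] : x[i+1024]; }  __syncthreads(); }
        if ( n >  512 ) { if ( i <  512 && i +  512 < n ) { x[i] = (x[i] > x[i+ 512]) ? x[i] : x[i+ 512]; }  __syncthreads(); }
        if ( n >  256 ) { if ( i <  256 && i +  256 < n ) { x[i] = (x[i] > x[i+ 256]) ? x[i] : x[i+ 256]; }  __syncthreads(); }
        if ( n >  128 ) { if ( i <  128 && i +  128 < n ) { x[i] = (x[i] > x[i+ 128]) ? x[i] : x[i+ 128]; }  __syncthreads(); }
        if ( n >   64 ) { if ( i <   64 && i +   64 < n ) { x[i] = (x[i] > x[i+  64]) ? x[i] : x[i+  64]; }  __syncthreads(); }
        if ( n >   32 ) { if ( i <   32 && i +   32 < n ) { x[i] = (x[i] > x[i+  32]) ? x[i] : x[i+  32]; }  __syncthreads(); }
        if ( n >   16 ) { if ( i <   16 && i +   16 < n ) { x[i] = (x[i] > x[i+  16]) ? x[i] : x[i+  16]; }  __syncthreads(); }
        if ( n >    8 ) { if ( i <    8 && i +    8 < n ) { x[i] = (x[i] > x[i+   8]) ? x[i] : x[i+   8]; }  __syncthreads(); }
        if ( n >    4 ) { if ( i <    4 && i +    4 < n ) { x[i] = (x[i] > x[i+   4]) ? x[i] : x[i+   4]; }  __syncthreads(); }
        if ( n >    2 ) { if ( i <    2 && i +    2 < n ) { x[i] = (x[i] > x[i+   2]) ? x[i] : x[i+   2]; }  __syncthreads(); }
        if ( n >    1 ) { if ( i <    1 && i +    1 < n ) { x[i] = (x[i] > x[i+   1]) ? x[i] : x[i+   1]; }  __syncthreads(); }
    }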

◆ max_nan()

template<typename T >
__host__ __device__ T max_nan (T x, T y)  [inline]

max that propagates NaN consistently: max_nan( 1, nan ) = nan; max_nan( nan, 1 ) = nan.

For x=nan, y=1: nan < y is false, yields x (nan)

For x=1, y=nan: x < nan is false, would yield x, but isnan(nan) is true, yields y (nan)
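
A minimal sketch of this case analysis (not the MAGMA source; the name sketch_max_nan is invented, and the self-comparison y != y is used in place of isnan so the sketch needs no headers):

    // NaN-propagating max: if either argument is NaN, the result is NaN.
    // (y != y) is true exactly when y is NaN.
    template< typename T >
    __host__ __device__ inline T sketch_max_nan( T x, T y )
    {
        // x = NaN, y = 1:  (x < y) is false, so x (NaN) is returned.
        // x = 1, y = NaN:  (y != y) is true, so y (NaN) is returned.
        return ( y != y || x < y ) ? y : x;
    }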

◆ magma_max_nan_reduce()

template<int n, typename T >
__device__ void magma_max_nan_reduce (int i, T *x)

Same as magma_max_reduce(), but propagates NaN values.

Does max reduction of n-element array x, leaving total in x[0]. Contents of x are destroyed in the process. With k threads, it can reduce an array of up to 2*k elements. Assumes the number of threads is <= 1024 (the maximum number of threads per block up to CUDA compute capability 3.0). Having n as a template parameter allows the compiler to evaluate some conditions at compile time. Calls __syncthreads before & after reduction.
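
As a hedged usage sketch (not taken from MAGMA), a caller typically stages values in shared memory and then invokes the block-wide reduction. The kernel name, the block-size parameter NB, and the padding strategy below are assumptions for illustration.

    // Hypothetical kernel: NB threads load one element each into shared memory,
    // then the NaN-aware reduction leaves the maximum in sdata[0].
    // Assumes 1 <= n <= NB and a launch of exactly NB threads in one block.
    // Requires the MAGMA header that defines magma_max_nan_reduce.
    template< int NB, typename T >
    __global__ void sketch_blockmax_kernel( int n, const T* dx, T* dresult )
    {
        __shared__ T sdata[ NB ];
        int i = threadIdx.x;
        // Pad unused slots with dx[0] so they cannot change the maximum.
        sdata[i] = ( i < n ) ? dx[i] : dx[0];
        // magma_max_nan_reduce calls __syncthreads before and after the reduction.
        magma_max_nan_reduce< NB >( i, sdata );
        if ( i == 0 ) {
            *dresult = sdata[0];
        }
    }

A launch such as sketch_blockmax_kernel< 256 ><<< 1, 256 >>>( n, dx, dresult ) would then leave the NaN-aware maximum in *dresult.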

◆ magma_max_nan_kernel()

template<typename T >
__global__ void magma_max_nan_kernel (int n, T *x)

Max reduction for a vector of arbitrary size.

Leaves max(x) in x[0]. Uses only one thread block of 512 threads, so it is not efficient for very large vectors.
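
A hedged host-side sketch of launching this kernel, assuming the caller supplies the single 512-thread block described above (error checking and streams omitted; the helper name is invented):

    #include <cuda_runtime.h>
    // Requires the MAGMA header that declares magma_max_nan_kernel.

    // Hypothetical helper: reduce a device array of n doubles in place,
    // leaving max(dx[0..n-1]), with NaNs propagated, in dx[0].
    void sketch_device_max( int n, double* dx )
    {
        // One block of 512 threads, matching the kernel's description.
        magma_max_nan_kernel<<< 1, 512 >>>( n, dx );
        cudaDeviceSynchronize();
    }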

◆ magma_sum_reduce()

template<int n, typename T >
__device__ void magma_sum_reduce (int i, T *x)

Does sum reduction of n-element array x, leaving total in x[0].

Contents of x are destroyed in the process. With k threads, it can reduce an array of up to 2*k elements. Assumes the number of threads is <= 1024 (the maximum number of threads per block up to CUDA compute capability 3.0). Having n as a template parameter allows the compiler to evaluate some conditions at compile time. Calls __syncthreads before & after reduction.
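
A hedged usage sketch (not from MAGMA) for real-valued T: each thread accumulates a strided partial sum, and magma_sum_reduce then combines the NB partials so the block total lands in sdata[0]. The kernel name and block-size parameter NB are assumptions for illustration.

    // Hypothetical kernel: NB threads cooperatively sum an n-element device array.
    // Requires the MAGMA header that defines magma_sum_reduce.
    template< int NB, typename T >
    __global__ void sketch_blocksum_kernel( int n, const T* dx, T* dresult )
    {
        __shared__ T sdata[ NB ];
        int i = threadIdx.x;
        // Strided partial sum: thread i accumulates dx[i], dx[i+NB], dx[i+2*NB], ...
        T s = 0;
        for ( int j = i; j < n; j += NB ) {
            s += dx[j];
        }
        sdata[i] = s;
        // magma_sum_reduce calls __syncthreads before and after the reduction.
        magma_sum_reduce< NB >( i, sdata );
        if ( i == 0 ) {
            *dresult = sdata[0];
        }
    }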

◆ magma_sum_reduce_kernel()

template<typename T >
__global__ void magma_sum_reduce_kernel (int n, T *x)

Sum reduction for a vector of arbitrary size.

Leaves sum(x) in x[0]. Uses only one thread block of 512 threads, so it is not efficient for very large vectors.

◆ magma_sum_reduce_2d()

template<int m, int n, typename T >
__device__ void magma_sum_reduce_2d (int i, int j, T x[m][n])

Does sum reduction of each column of M x N array x, leaving totals in x[0][j] = sum( x[0:m-1][j] ), for 0 <= j < n.

Contents of x are destroyed in the process. Calls __syncthreads before & after reduction.
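
A hypothetical sketch of this column-wise reduction (not the MAGMA source; the name is invented). Thread (i, j) helps fold column j, and because m is a compile-time constant the stride loop can be resolved at compile time:

    // Hypothetical column-wise sum reduction: each level folds the upper rows
    // of every column onto the lower rows; the per-column total ends up in x[0][j].
    template< int m, int n, typename T >
    __device__ void sketch_sum_reduce_2d( int i, int j, T x[m][n] )
    {
        __syncthreads();
        #pragma unroll
        for ( int s = 1024; s > 0; s >>= 1 ) {
            if ( m > s ) {                        // dead levels drop out at compile time
                if ( i < s && i + s < m ) {
                    x[i][j] += x[i+s][j];
                }
                __syncthreads();                  // branch is uniform, so this is safe
            }
        }
    }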

◆ magma_sum_reduce_3d()

template<int m0, int m1, int m2, typename T >
__device__ void magma_sum_reduce_3d (int i, int j, int k, T x[m0][m1][m2])

Does sum reduction of each "column" of M0 x M1 x M2 array x, leaving totals in x[0][j][k] = sum( x[0:m0-1][j][k] ), for 0 <= j < m1, 0 <= k < m2.

Contents of x are destroyed in the process. Calls __syncthreads before & after reduction.