MAGMA 2.9.0
Matrix Algebra for GPU and Multicore Architectures
Functions

template<typename T, typename ID>
__device__ void magma_getidmax_n (int n, int i, T *x, ID *ind)
    Same as magma_getidmax(), but takes n as a runtime argument instead of a compile-time template parameter.

template<int n, typename T>
__device__ void magma_max_reduce (int i, T *x)
    Does max reduction of n-element array x, leaving total in x[0].

template<typename T>
__device__ void magma_max_reduce_n (int n, int i, T *x)
    Same as magma_max_reduce(), but takes n as a runtime argument instead of a compile-time template parameter.

template<typename T>
__host__ __device__ T max_nan (T x, T y)
    Max that propagates nan consistently: max_nan( 1, nan ) = nan, max_nan( nan, 1 ) = nan.

template<int n, typename T>
__device__ void magma_max_nan_reduce (int i, T *x)
    Same as magma_max_reduce(), but propagates nan values.

template<typename T>
__device__ void magma_max_nan_reduce_n (int n, int i, T *x)
    Same as magma_max_nan_reduce(), but takes n as a runtime argument instead of a compile-time template parameter.

template<typename T>
__global__ void magma_max_nan_kernel (int n, T *x)
    Max reduction, for an arbitrary-size vector.

template<int n, typename T>
__device__ void magma_sum_reduce (int i, T *x)
    Does sum reduction of n-element array x, leaving total in x[0].

template<typename T>
__global__ void magma_sum_reduce_kernel (int n, T *x)
    Sum reduction, for an arbitrary-size vector.

template<typename T>
__device__ void magma_sum_reduce_n (int n, int i, T *x)
    Same as magma_sum_reduce(), but takes n as a runtime argument instead of a compile-time template parameter.

template<int m, int n, typename T>
__device__ void magma_sum_reduce_2d (int i, int j, T x[m][n])
    Does sum reduction of each column of M x N array x, leaving totals in x[0][j] = sum( x[0:m-1][j] ), for 0 <= j < n.

template<int m0, int m1, int m2, typename T>
__device__ void magma_sum_reduce_3d (int i, int j, int k, T x[m0][m1][m2])
    Does sum reduction of each "column" of M0 x M1 x M2 array x, leaving totals in x[0][j][k] = sum( x[0:m0-1][j][k] ), for 0 <= j < m1, 0 <= k < m2.
Function Documentation

template<int n, typename T>
__device__ void magma_max_reduce ( int i, T * x )
Does max reduction of n-element array x, leaving total in x[0].
Contents of x are destroyed in the process. With k threads, it can reduce an array of up to 2*k elements. Assumes the number of threads is <= 1024 (the maximum number of threads up to CUDA capability 3.0). Having n as a template parameter lets the compiler evaluate some conditions at compile time. Calls __syncthreads before and after the reduction.
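As an illustration of the calling pattern only (not taken from the MAGMA sources; BLK, the kernel, and the variable names are assumptions), a single block could compute the maximum absolute value of a vector like this:

    // Hedged usage sketch: launch with one block of BLK threads. Each thread
    // accumulates a partial maximum of |x[j]| in shared memory, then
    // magma_max_reduce leaves the block maximum in smax[0].
    // Assumes the MAGMA header providing magma_max_reduce is included.
    #define BLK 128

    __global__ void block_absmax_sketch( int n, const double* x, double* result )
    {
        __shared__ double smax[ BLK ];
        int i = threadIdx.x;
        double m = 0;
        for (int j = i; j < n; j += BLK)
            m = fmax( m, fabs( x[j] ) );
        smax[i] = m;                          // every thread fills its slot
        magma_max_reduce< BLK >( i, smax );   // block-wide max into smax[0]
        if (i == 0)
            *result = smax[0];
    }

Here BLK <= 1024 satisfies the thread-count assumption, and the shared array has exactly BLK elements, well within the 2*BLK limit.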
template<typename T>
__host__ __device__ T max_nan ( T x, T y )  [inline]
Max that propagates nan consistently: max_nan( 1, nan ) = nan, max_nan( nan, 1 ) = nan.
For x = nan, y = 1: nan < y is false, so it yields x (nan).
For x = 1, y = nan: x < nan is false, which would yield x, but isnan(nan) is true, so it yields y (nan).
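The two cases above reduce to a single conditional. A minimal sketch of such a nan-propagating max (not necessarily the MAGMA definition; the name max_nan_sketch is a placeholder, and a floating-point T with isnan available is assumed):

    template<typename T>
    __host__ __device__ static inline T max_nan_sketch( T x, T y )
    {
        // If y is nan, or x < y, return y; otherwise return x.
        // x = nan, y = 1:  isnan(1) false, nan < 1 false  -> returns x (nan)
        // x = 1, y = nan:  isnan(nan) true                -> returns y (nan)
        return (isnan( y ) || x < y) ? y : x;
    }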
template<int n, typename T>
__device__ void magma_max_nan_reduce ( int i, T * x )
Same as magma_max_reduce(), but propagates nan values.
Does max reduction of n-element array x, leaving total in x[0]. Contents of x are destroyed in the process. With k threads, it can reduce an array of up to 2*k elements. Assumes the number of threads is <= 1024 (the maximum number of threads up to CUDA capability 3.0). Having n as a template parameter lets the compiler evaluate some conditions at compile time. Calls __syncthreads before and after the reduction.
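For illustration only (this is not the MAGMA implementation), the block-level tree reduction that the description above outlines, including the __syncthreads calls and the k-threads-reduce-2*k-elements property, could be sketched as follows, using the max_nan documented above:

    template<int n, typename T>
    __device__ void max_nan_reduce_sketch( int i, T* x )
    {
        __syncthreads();
        // Each step folds the upper part of the active range onto the lower
        // part; the first fold already lets k threads handle up to 2*k elements.
        for (int len = n, half = (n + 1) / 2; len > 1;
             len = half, half = (half + 1) / 2)
        {
            if (i < half && i + half < len)
                x[i] = max_nan( x[i], x[i + half] );
            __syncthreads();
        }
    }

Having n as a compile-time template parameter, as the documentation notes, lets the compiler resolve such a loop (or an unrolled equivalent) at compile time.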
template<typename T>
__global__ void magma_max_nan_kernel ( int n, T * x )
Max reduction, for an arbitrary-size vector.
Leaves max(x) in x[0]. Uses only one thread block of 512 threads, so it is not efficient for very large vectors.
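A host-side launch consistent with the single 512-thread block noted above might look like the following sketch (d_x and n are placeholders; stream handling is omitted):

    // Hedged usage sketch: reduces the length-n device vector d_x in place,
    // leaving max(d_x), with nan propagation, in d_x[0].
    magma_max_nan_kernel<double><<< 1, 512 >>>( n, d_x );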
template<int n, typename T>
__device__ void magma_sum_reduce ( int i, T * x )
Does sum reduction of n-element array x, leaving total in x[0].
Contents of x are destroyed in the process. With k threads, it can reduce an array of up to 2*k elements. Assumes the number of threads is <= 1024 (the maximum number of threads up to CUDA capability 3.0). Having n as a template parameter lets the compiler evaluate some conditions at compile time. Calls __syncthreads before and after the reduction.
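As a usage sketch (again not from the MAGMA sources; BLK and the kernel name are assumptions), a single block could accumulate a sum the same way the max example above does:

    // Hedged usage sketch: launch with one block of BLK threads. Each thread
    // accumulates a partial sum in shared memory; magma_sum_reduce then leaves
    // the block total in ssum[0].
    #define BLK 256

    __global__ void block_sum_sketch( int n, const double* x, double* result )
    {
        __shared__ double ssum[ BLK ];
        int i = threadIdx.x;
        double s = 0;
        for (int j = i; j < n; j += BLK)
            s += x[j];
        ssum[i] = s;
        magma_sum_reduce< BLK >( i, ssum );
        if (i == 0)
            *result = ssum[0];
    }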
template<typename T>
__global__ void magma_sum_reduce_kernel ( int n, T * x )
Sum reduction, for an arbitrary-size vector.
Leaves sum(x) in x[0]. Uses only one thread block of 512 threads, so it is not efficient for very large vectors.
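Because it runs as a single 512-thread block, such a kernel is most natural as the finishing step after a first pass has produced one partial sum per block. A hedged sketch of that second step (d_partial and num_blocks are placeholders; the first pass is not shown):

    // Hedged usage sketch: d_partial holds num_blocks partial sums computed by
    // an earlier kernel; this single-block launch leaves the grand total in
    // d_partial[0].
    magma_sum_reduce_kernel<double><<< 1, 512 >>>( num_blocks, d_partial );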
template<int m, int n, typename T>
__device__ void magma_sum_reduce_2d ( int i, int j, T x[m][n] )
Does sum reduction of each column of M x N array x, leaving totals in x[0][j] = sum( x[0:m-1][j] ), for 0 <= j < n.
Contents of x are destroyed in the process. Calls __syncthreads before and after the reduction.
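As a usage sketch (dimensions, the kernel, and variable names are assumptions), a DIMX x DIMY thread block could sum a DIMX x DIMY tile along its first index:

    // Hedged usage sketch: launch with a dim3(DIMX, DIMY) thread block. Each
    // thread stores one element; magma_sum_reduce_2d leaves the column totals
    // in sdata[0][j].
    #define DIMX 32
    #define DIMY 4

    __global__ void tile_colsum_sketch( const double* A, int lda, double* colsum )
    {
        __shared__ double sdata[ DIMX ][ DIMY ];
        int i = threadIdx.x;
        int j = threadIdx.y;
        sdata[i][j] = A[ i + j*lda ];
        magma_sum_reduce_2d< DIMX, DIMY >( i, j, sdata );
        if (i == 0)
            colsum[j] = sdata[0][j];
    }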
template<int m0, int m1, int m2, typename T>
__device__ void magma_sum_reduce_3d ( int i, int j, int k, T x[m0][m1][m2] )
Does sum reduction of each "column" of M0 x M1 x M2 array x, leaving totals in x[0][j][k] = sum( x[0:m0-1][j][k] ), for 0 <= j < m1, 0 <= k < m2.
Contents of x are destroyed in the process. Calls __syncthreads before and after the reduction.
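Analogously (sizes and names again being assumptions), a dim3(M0, M1, M2) thread block can reduce a small 3D shared array along its first index:

    // Hedged usage sketch: each thread stores one value; after the call, the
    // totals over the first index are in s[0][j][k].
    #define M0 16
    #define M1 4
    #define M2 4

    __global__ void tile_sum_3d_sketch( const double* x, double* out )
    {
        __shared__ double s[ M0 ][ M1 ][ M2 ];
        int i = threadIdx.x, j = threadIdx.y, k = threadIdx.z;
        s[i][j][k] = x[ i + M0*(j + M1*k) ];
        magma_sum_reduce_3d< M0, M1, M2 >( i, j, k, s );
        if (i == 0)
            out[ j + M1*k ] = s[0][j][k];
    }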