4#define _GW_CALL(call) \
6 cudaError_t _status = (call); \
7 if (_status != cudaSuccess) { \
8 fprintf(stderr, "%s: %d: " #call "\n", __FILE__, __LINE__); \
13__global__
void VecAdd(
const int* A,
const int* B,
int* C,
int N)
15 int i = blockDim.x * blockIdx.x + threadIdx.x;
21__global__
void VecSub(
const int* A,
const int* B,
int* C,
int N)
23 int i = blockDim.x * blockIdx.x + threadIdx.x;
30 for (
int i=0;
i< n;
i++)
34static void cleanUp(
int *h_A,
int *h_B,
int *h_C,
int *h_D,
int *d_A,
int *d_B,
int *d_C,
int *d_D)
59 size_t size =
N *
sizeof(
int);
60 int threadsPerBlock = 0;
61 int blocksPerGrid = 0;
62 int *h_A, *h_B, *h_C, *h_D;
63 int *d_A, *d_B, *d_C, *d_D;
68 h_A = (
int*)malloc(size);
69 h_B = (
int*)malloc(size);
70 h_C = (
int*)malloc(size);
71 h_D = (
int*)malloc(size);
72 if (h_A == NULL || h_B == NULL || h_C == NULL || h_D == NULL) {
73 fprintf(
stderr,
"Allocating input vectors failed.\n");
83 _GW_CALL(cudaMalloc((
void**)&d_A, size));
84 _GW_CALL(cudaMalloc((
void**)&d_B, size));
85 _GW_CALL(cudaMalloc((
void**)&d_C, size));
86 _GW_CALL(cudaMalloc((
void**)&d_D, size));
89 _GW_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
90 _GW_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));
93 threadsPerBlock = 256;
94 blocksPerGrid = (
N + threadsPerBlock - 1) / threadsPerBlock;
95 if (!
quiet) fprintf(
stderr,
"Launching kernel on device %d: blocks %d, thread/block %d\n",
96 device, blocksPerGrid, threadsPerBlock);
98 VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C,
N);
100 VecSub<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_D,
N);
104 _GW_CALL(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
105 _GW_CALL(cudaMemcpy(h_D, d_D, size, cudaMemcpyDeviceToHost));
106 if (!
quiet) fprintf(
stderr,
"Kernel launch complete and mem copied back from device %d\n", device);
108 for (
i = 0;
i <
N; ++
i) {
109 sum = h_A[
i] + h_B[
i];
110 diff = h_A[
i] - h_B[
i];
111 if (h_C[
i] != sum || h_D[
i] != diff) {
112 fprintf(
stderr,
"error: result verification failed\n");
117 cleanUp(h_A, h_B, h_C, h_D, d_A, d_B, d_C, d_D);
cudaError_t CUDARTAPI cudaGetDevice(int *dest)
static void VectorAddSubtract(int N, int quiet)
static void initVec(int *vec, int n)
static void cleanUp(int *h_A, int *h_B, int *h_C, int *h_D, int *d_A, int *d_B, int *d_C, int *d_D)
__global__ void VecAdd(const int *A, const int *B, int *C, int N)
__global__ void VecSub(const int *A, const int *B, int *C, int N)