{
    /*
     * Host driver: allocates host and device buffers, initializes the
     * inputs, launches the VecAdd and VecSub kernels, copies the results
     * back, and verifies them against a CPU reference.  Exits with -1 on
     * allocation failure or verification mismatch.
     *
     * Relies on file-scope helpers initVec/cleanUp/_GW_CALL and on
     * outer-scope variables N, quiet, i, sum, diff.
     */
    size_t size = N * sizeof(int);
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;
    int *h_A, *h_B, *h_C, *h_D;
    int *d_A, *d_B, *d_C, *d_D;
    int device;

    /* Host allocations. */
    h_A = (int*)malloc(size);
    h_B = (int*)malloc(size);
    h_C = (int*)malloc(size);
    h_D = (int*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL || h_D == NULL) {
        fprintf(stderr, "Allocating input vectors failed.\n");
        /* FIX: previously fell through and dereferenced the NULL pointers. */
        exit(-1);
    }

    /* FIX: h_A/h_B were read by the verification loop below but never
     * initialized.  initVec (defined in this file) is assumed to fill a
     * vector with test values — NOTE(review): confirm its semantics. */
    initVec(h_A, N);
    initVec(h_B, N);
    memset(h_C, 0, size);
    memset(h_D, 0, size);

    /* FIX: 'device' was printed in the log messages below without ever
     * being set; query the active device explicitly. */
    _GW_CALL(cudaGetDevice(&device));

    /* Device allocations. */
    _GW_CALL(cudaMalloc((void**)&d_A, size));
    _GW_CALL(cudaMalloc((void**)&d_B, size));
    _GW_CALL(cudaMalloc((void**)&d_C, size));
    _GW_CALL(cudaMalloc((void**)&d_D, size));

    /* Copy the input vectors to the device. */
    _GW_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    _GW_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    /* Ceil-divide so a final partial block covers the tail when N is not
     * a multiple of threadsPerBlock. */
    threadsPerBlock = 256;
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    if (!quiet)
        fprintf(stderr,
                "Launching kernel on device %d: blocks %d, thread/block %d\n",
                device, blocksPerGrid, threadsPerBlock);

    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    /* FIX: kernel launches return no status directly; launch-configuration
     * errors must be fetched via cudaGetLastError after each launch. */
    _GW_CALL(cudaGetLastError());
    VecSub<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_D, N);
    _GW_CALL(cudaGetLastError());

    /* Blocking copies back; these also synchronize with the kernels, so
     * any asynchronous execution error surfaces here through _GW_CALL. */
    _GW_CALL(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
    _GW_CALL(cudaMemcpy(h_D, d_D, size, cudaMemcpyDeviceToHost));
    if (!quiet)
        fprintf(stderr,
                "Kernel launch complete and mem copied back from device %d\n",
                device);

    /* CPU reference check: element-wise sum and difference. */
    for (i = 0; i < N; ++i) {
        sum = h_A[i] + h_B[i];
        diff = h_A[i] - h_B[i];
        if (h_C[i] != sum || h_D[i] != diff) {
            fprintf(stderr, "error: result verification failed\n");
            exit(-1);
        }
    }

    cleanUp(h_A, h_B, h_C, h_D, d_A, d_B, d_C, d_D);
}
cudaError_t CUDARTAPI cudaGetDevice(int *dest)
static void initVec(int *vec, int n)
static void cleanUp(int *h_A, int *h_B, int *h_C, int *h_D, int *d_A, int *d_B, int *d_C, int *d_D)