PAPI 7.1.0.0
gpu_work.h
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>

/* Report (but do not abort on) CUDA runtime errors. The do/while(0)
 * wrapper lets the macro be used like a single statement. */
#define _GW_CALL(call) \
do { \
    cudaError_t _status = (call); \
    if (_status != cudaSuccess) { \
        fprintf(stderr, "%s: %d: " #call " failed: %s\n", __FILE__, __LINE__, \
                cudaGetErrorString(_status)); \
    } \
} while (0)

// Device code: element-wise vector addition, C[i] = A[i] + B[i]
__global__ void VecAdd(const int* A, const int* B, int* C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

// Device code: element-wise vector subtraction, C[i] = A[i] - B[i]
__global__ void VecSub(const int* A, const int* B, int* C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] - B[i];
}

// Fill a host vector with the values 0..n-1
static void initVec(int *vec, int n)
{
    for (int i = 0; i < n; i++)
        vec[i] = i;
}

static void cleanUp(int *h_A, int *h_B, int *h_C, int *h_D, int *d_A, int *d_B, int *d_C, int *d_D)
{
    if (d_A)
        _GW_CALL(cudaFree(d_A));
    if (d_B)
        _GW_CALL(cudaFree(d_B));
    if (d_C)
        _GW_CALL(cudaFree(d_C));
    if (d_D)
        _GW_CALL(cudaFree(d_D));

    // Free host memory
    if (h_A)
        free(h_A);
    if (h_B)
        free(h_B);
    if (h_C)
        free(h_C);
    if (h_D)
        free(h_D);
}

static void VectorAddSubtract(int N, int quiet)
{
    if (N == 0) N = 50000;
    size_t size = N * sizeof(int);
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;
    int *h_A, *h_B, *h_C, *h_D;
    int *d_A = NULL, *d_B = NULL, *d_C = NULL, *d_D = NULL;
    int i, sum, diff;
    int device;
    _GW_CALL(cudaGetDevice(&device));

    // Allocate input vectors h_A and h_B in host memory
    h_A = (int*)malloc(size);
    h_B = (int*)malloc(size);
    h_C = (int*)malloc(size);
    h_D = (int*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL || h_D == NULL) {
        fprintf(stderr, "Allocating input vectors failed.\n");
        cleanUp(h_A, h_B, h_C, h_D, d_A, d_B, d_C, d_D);
        return;
    }

    // Initialize input vectors
    initVec(h_A, N);
    initVec(h_B, N);
    memset(h_C, 0, size);
    memset(h_D, 0, size);

    // Allocate vectors in device memory
    _GW_CALL(cudaMalloc((void**)&d_A, size));
    _GW_CALL(cudaMalloc((void**)&d_B, size));
    _GW_CALL(cudaMalloc((void**)&d_C, size));
    _GW_CALL(cudaMalloc((void**)&d_D, size));

    // Copy vectors from host memory to device memory
    _GW_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    _GW_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Invoke kernels: one thread per element, rounding the grid size up
    threadsPerBlock = 256;
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    if (!quiet) fprintf(stderr, "Launching kernel on device %d: blocks %d, threads/block %d\n",
                        device, blocksPerGrid, threadsPerBlock);

    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

    VecSub<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_D, N);

    // Copy results from device memory to host memory
    // h_C holds the sums, h_D the differences
    _GW_CALL(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
    _GW_CALL(cudaMemcpy(h_D, d_D, size, cudaMemcpyDeviceToHost));
    if (!quiet) fprintf(stderr, "Kernel launch complete and mem copied back from device %d\n", device);

    // Verify results
    for (i = 0; i < N; ++i) {
        sum = h_A[i] + h_B[i];
        diff = h_A[i] - h_B[i];
        if (h_C[i] != sum || h_D[i] != diff) {
            fprintf(stderr, "error: result verification failed\n");
            exit(-1);
        }
    }

    cleanUp(h_A, h_B, h_C, h_D, d_A, d_B, d_C, d_D);
}
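
A minimal driver for this header might look like the following sketch (a hypothetical example, not part of the PAPI sources); it assumes the header sits next to a main.cu that is compiled with nvcc, e.g. nvcc main.cu -o gpu_work_test.

// main.cu -- hypothetical test driver for gpu_work.h (illustration only)
#include "gpu_work.h"

int main(int argc, char **argv)
{
    int quiet = (argc > 1);        // passing any extra argument suppresses progress output
    VectorAddSubtract(0, quiet);   // N == 0 selects the default length of 50000 elements
    fprintf(stderr, "Result verification passed.\n");
    return 0;
}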