31 prt_assert(packet != NULL,
"malloc failed");
36 packet->location = PRT_LOCATION_HOST;
41 packet->data = malloc(size);
42 prt_assert(packet->data != NULL,
"malloc failed");
72 prt_assert(packet != NULL,
"malloc failed");
77 packet->location = PRT_LOCATION_DEVICE;
78 packet->device_rank = vdp->device->rank;
79 packet->devmem = vdp->device->vsa->devmem[packet->device_rank];
84 packet->data =
gpu_malloc(packet->devmem, size);
85 prt_assert(packet->data != NULL,
"gpu_malloc failed");
108 prt_assert(packet->location == PRT_LOCATION_HOST,
"wrong packet location");
114 packet->data = realloc(packet->data, size);
115 prt_assert(packet->data != NULL,
"realloc failed");
129 int num_refs = __sync_sub_and_fetch(&packet->num_refs, 1);
130 prt_assert(num_refs >= 0,
"negative number of data references");
150 int num_refs = __sync_sub_and_fetch(&packet->num_refs, 1);
151 prt_assert(num_refs >= 0,
"negative number of data references");
154 cudaError_t error = cudaSetDevice(packet->device_rank);
155 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
157 int retval =
gpu_free(packet->devmem, packet->data);
158 prt_assert(retval == 0,
"gpu_free failed");
177 cudaError_t error = cudaSetDevice(channel->dst_vdp->device->rank);
178 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
182 src_packet->size, NULL, channel->dst_vdp);
187 dst_packet->data, src_packet->data, src_packet->size,
188 cudaMemcpyHostToDevice, channel->in_stream);
194 __sync_fetch_and_add(&channel->proxy->num_callbacks, 1);
195 cudaStreamAddCallback(
210 cudaError_t error = cudaSetDevice(src_packet->device_rank);
211 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
219 dst_packet->data, src_packet->data, src_packet->size,
220 cudaMemcpyDeviceToHost, channel->out_stream);
226 __sync_fetch_and_add(&channel->proxy->num_callbacks, 1);
227 cudaStreamAddCallback(
241 cudaError_t error = cudaSetDevice(src_packet->device_rank);
242 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
249 cudaMemcpyAsync(dst_packet->data, src_packet->data, src_packet->size,
250 cudaMemcpyDeviceToHost, channel->out_stream);
256 src_packet, dst_packet, channel, PRT_HOST_TO_DEVICE, -1);
257 __sync_fetch_and_add(&channel->proxy->num_callbacks, 1);
258 cudaStreamAddCallback(
274 cudaError_t error = cudaSetDevice(src_packet->device_rank);
275 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
279 src_packet->size, NULL, channel->dst_vdp);
284 dst_packet->data, channel->dst_vdp->device->rank,
285 src_packet->data, src_packet->device_rank,
286 src_packet->size, channel->out_stream);
292 __sync_fetch_and_add(&channel->proxy->num_callbacks, 1);
293 cudaStreamAddCallback(
310 cudaError_t error = cudaSetDevice(src_packet->device_rank);
311 prt_assert(error == cudaSuccess, cudaGetErrorString(error));
318 cudaMemcpyAsync(dst_packet->data, src_packet->data, src_packet->size,
319 cudaMemcpyDeviceToHost, channel->out_stream);
325 src_packet, dst_packet, channel, PRT_DEVICE_MPI_FROM_HOST, agent);
326 __sync_fetch_and_add(&channel->proxy->num_callbacks, 1);
327 cudaStreamAddCallback(