PULSAR  2.0.0
Parallel Ultra-Light Systolic Array Runtime
 All Data Structures Files Functions Typedefs Enumerations Macros Groups
prt_packet.c
Go to the documentation of this file.
1 
11 #include "prt_packet.h"
12 
14 
/*
 * Create a new host-side packet of `size` bytes.
 *
 * If `data` is NULL, a fresh host buffer is malloc'd and owned by the
 * packet; otherwise the caller's pointer is adopted as the payload
 * (no copy is made).  The packet starts with a reference count of 1
 * and location PRT_LOCATION_HOST.  Aborts via prt_assert on allocation
 * failure.
 *
 * NOTE(review): original line 49 (between the "Register memory usage"
 * comment and the return) is absent from this listing — presumably an
 * svg_trace_memory_host(size) call; confirm against the full source.
 */
27 prt_packet_t *prt_packet_new_host(size_t size, void *data)
28 {
29  // Allocate the packet.
30  prt_packet_t *packet = (prt_packet_t*)malloc(sizeof(prt_packet_t));
31  prt_assert(packet != NULL, "malloc failed");
32 
33  // Initialize the packet.
34  packet->size = size;
35  packet->num_refs = 1;
36  packet->location = PRT_LOCATION_HOST;
37 
38  // IF the data pointer is NULL.
39  if (data == NULL) {
40  // Allocate host memory (packet owns this buffer).
41  packet->data = malloc(size);
42  prt_assert(packet->data != NULL, "malloc failed");
43  }
44  else {
45  // Assign the pointer (adopt caller-provided payload, no copy).
46  packet->data = data;
47  }
48  // Register memory usage and return.
50  return packet;
51 }
52 
54 
/*
 * Create a new device-side packet of `size` bytes on the device owned
 * by the given virtual data processor (VDP).
 *
 * If `data` is NULL, device memory is allocated from the device's
 * memory pool via gpu_malloc; otherwise the caller's device pointer is
 * adopted as the payload (no copy).  The packet starts with a
 * reference count of 1, location PRT_LOCATION_DEVICE, and caches the
 * device rank and the devmem pool it was allocated from.  Aborts via
 * prt_assert on allocation failure.
 *
 * NOTE(review): original line 92 (between the "Register memory usage"
 * comment and the return) is absent from this listing — presumably an
 * svg_trace_memory_device(size) call; confirm against the full source.
 */
68 prt_packet_t *prt_packet_new_device(size_t size, void *data, prt_vdp_t *vdp)
69 {
70  // Allocate the packet.
71  prt_packet_t *packet = (prt_packet_t*)malloc(sizeof(prt_packet_t));
72  prt_assert(packet != NULL, "malloc failed");
73 
74  // Initialize the packet.
75  packet->size = size;
76  packet->num_refs = 1;
77  packet->location = PRT_LOCATION_DEVICE;
78  packet->device_rank = vdp->device->rank;
79  packet->devmem = vdp->device->vsa->devmem[packet->device_rank];
80 
81  // IF the data pointer is NULL.
82  if (data == NULL) {
83  // Allocate device memory from the device's pool.
84  packet->data = gpu_malloc(packet->devmem, size);
85  prt_assert(packet->data != NULL, "gpu_malloc failed");
86  }
87  else {
88  // Assign the pointer (adopt caller-provided device payload).
89  packet->data = data;
90  }
91  // Register memory usage and return.
93  return packet;
94 }
95 
97 
105 void prt_packet_resize_host(prt_packet_t *packet, size_t size)
106 {
107  // Check packet location.
108  prt_assert(packet->location == PRT_LOCATION_HOST, "wrong packet location");
109 
110  // Register memory usage.
111  svg_trace_memory_host(-packet->size+size);
112 
113  // Reallocate the data buffer.
114  packet->data = realloc(packet->data, size);
115  prt_assert(packet->data != NULL, "realloc failed");
116  packet->size = size;
117 }
118 
/*
 * Reference-release for a host-resident packet.  The function signature
 * (original lines 121-127) is absent from this listing — presumably
 * prt_packet_release_host(prt_packet_t *packet); confirm in the full
 * source.
 *
 * Atomically decrements the packet's reference count (GCC __sync
 * builtin, full barrier).  When the count reaches zero, records the
 * freed bytes in the host memory trace and frees both the payload and
 * the packet struct.  Asserts the count never goes negative
 * (over-release).
 */
120 
128 {
129  int num_refs = __sync_sub_and_fetch(&packet->num_refs, 1);
130  prt_assert(num_refs >= 0, "negative number of data references");
131  if (num_refs == 0) {
132  // Register memory usage.
133  svg_trace_memory_host(-packet->size);
134  // Free the payload and the packet.
135  free(packet->data);
136  free(packet);
137  }
138 }
139 
/*
 * Reference-release for a device-resident packet.  The function
 * signature (original lines 140-148) is absent from this listing —
 * presumably prt_packet_release_device(prt_packet_t *packet); confirm
 * in the full source.
 *
 * Atomically decrements the reference count; when it reaches zero,
 * binds the calling thread to the packet's device, returns the payload
 * to the device memory pool via gpu_free, records the freed bytes in
 * the device memory trace, and frees the packet struct.
 */
141 
149 {
150  int num_refs = __sync_sub_and_fetch(&packet->num_refs, 1);
151  prt_assert(num_refs >= 0, "negative number of data references");
152  if (num_refs == 0) {
153  // Set the device so gpu_free runs in the right CUDA context.
154  cudaError_t error = cudaSetDevice(packet->device_rank);
155  prt_assert(error == cudaSuccess, cudaGetErrorString(error));
156  // Free the payload.
157  int retval = gpu_free(packet->devmem, packet->data);
158  prt_assert(retval == 0, "gpu_free failed");
159  // Register memory usage.
160  svg_trace_memory_device(-packet->size);
161  // Free the packet.
162  free(packet);
163  }
164 }
165 
/*
 * Host-to-device transfer of a packet through a channel.  The function
 * signature (original lines 166-174) is absent from this listing —
 * judging by the body it takes (prt_packet_t *src_packet,
 * prt_channel_t *channel); confirm name and parameters in the full
 * source.
 *
 * Binds the thread to the destination device, allocates a new device
 * packet of the same size on the channel's destination VDP, enqueues
 * an async H2D copy on the channel's input stream (bracketed by DMA
 * trace markers), then registers a stream callback that will finish
 * the transfer; the proxy's outstanding-callback counter is bumped
 * atomically before the callback is added.
 *
 * NOTE(review): the cudaMemcpyAsync and cudaStreamAddCallback return
 * codes are not checked here, unlike cudaSetDevice — confirm this is
 * intentional.  cudaStreamAddCallback is also deprecated in favor of
 * cudaLaunchHostFunc in newer CUDA releases.
 */
167 
175 {
176  // Set device to the destination device.
177  cudaError_t error = cudaSetDevice(channel->dst_vdp->device->rank);
178  prt_assert(error == cudaSuccess, cudaGetErrorString(error));
179 
180  // Create a new device packet.
181  prt_packet_t *dst_packet = prt_packet_new_device(
182  src_packet->size, NULL, channel->dst_vdp);
183 
184  // Put the copy in the channel stream.
185  svg_trace_start_dma(channel->in_stream);
186  cudaMemcpyAsync(
187  dst_packet->data, src_packet->data, src_packet->size,
188  cudaMemcpyHostToDevice, channel->in_stream);
189  svg_trace_stop_dma(channel->in_stream, Silver);
190 
191  // Set up the callback to put the new device packet in the channel.
192  prt_callback_finish_t *callback =
193  prt_callback_finish_new(src_packet, dst_packet, channel);
194  __sync_fetch_and_add(&channel->proxy->num_callbacks, 1);
195  cudaStreamAddCallback(
196  channel->in_stream, prt_callback_finish_handler, (void*)callback, 0);
197 }
198 
/*
 * Device-to-host transfer of a packet through a channel.  The function
 * signature (original lines 199-207) is absent from this listing —
 * judging by the body it takes (prt_packet_t *src_packet,
 * prt_channel_t *channel); confirm in the full source.
 *
 * Binds the thread to the source packet's device, allocates a new host
 * packet of the same size, enqueues an async D2H copy on the channel's
 * output stream (bracketed by DMA trace markers), then registers the
 * finish callback after atomically bumping the proxy's
 * outstanding-callback counter.
 *
 * NOTE(review): cudaMemcpyAsync / cudaStreamAddCallback return codes
 * are unchecked — confirm this is intentional.
 */
200 
208 {
209  // Set device to the source device.
210  cudaError_t error = cudaSetDevice(src_packet->device_rank);
211  prt_assert(error == cudaSuccess, cudaGetErrorString(error));
212 
213  // Create new host packet.
214  prt_packet_t *dst_packet = prt_packet_new_host(src_packet->size, NULL);
215 
216  // Put the copy in the channel stream.
217  svg_trace_start_dma(channel->out_stream);
218  cudaMemcpyAsync(
219  dst_packet->data, src_packet->data, src_packet->size,
220  cudaMemcpyDeviceToHost, channel->out_stream);
221  svg_trace_stop_dma(channel->out_stream, Silver);
222 
223  // Set up the callback to put the new host packet in the channel.
224  prt_callback_finish_t *callback =
225  prt_callback_finish_new(src_packet, dst_packet, channel);
226  __sync_fetch_and_add(&channel->proxy->num_callbacks, 1);
227  cudaStreamAddCallback(
228  channel->out_stream, prt_callback_finish_handler, (void*)callback, 0);
229 }
230 
/*
 * First leg of a device-to-device transfer staged through host memory
 * (used, presumably, when direct peer access is unavailable — confirm).
 * The function signature (original lines 231-238) is absent from this
 * listing; confirm name and parameters in the full source.
 *
 * Binds the thread to the source device, stages the payload into a new
 * host packet via an async D2H copy on the channel's output stream,
 * then registers a queue callback that will schedule the follow-up
 * host-to-device leg (PRT_HOST_TO_DEVICE, agent -1).
 *
 * NOTE(review): original line 255 — the callback constructor call,
 * presumably `prt_callback_queue_new(` — is missing from this listing,
 * so the assignment below is textually incomplete here; confirm
 * against the full source.
 */
232 
239 {
240  // Set device to the source device.
241  cudaError_t error = cudaSetDevice(src_packet->device_rank);
242  prt_assert(error == cudaSuccess, cudaGetErrorString(error));
243 
244  // Create new host packet (staging buffer).
245  prt_packet_t *dst_packet = prt_packet_new_host(src_packet->size, NULL);
246 
247  // Put the copy in the channel stream.
248  svg_trace_start_dma(channel->out_stream);
249  cudaMemcpyAsync(dst_packet->data, src_packet->data, src_packet->size,
250  cudaMemcpyDeviceToHost, channel->out_stream);
251  svg_trace_stop_dma(channel->out_stream, Silver);
252 
253  // Set up the callback to queue the host to device transfer.
254  prt_callback_queue_t *callback =
256  src_packet, dst_packet, channel, PRT_HOST_TO_DEVICE, -1);
257  __sync_fetch_and_add(&channel->proxy->num_callbacks, 1);
258  cudaStreamAddCallback(
259  channel->out_stream, prt_callback_queue_handler, (void*)callback, 0);
260 }
261 
/*
 * Direct device-to-device (peer) transfer of a packet through a
 * channel.  The function name (original line 270) is absent from this
 * listing; only the second signature line is visible — confirm the
 * name in the full source.
 *
 * Binds the thread to the source device, allocates a new device packet
 * on the channel's destination VDP, enqueues cudaMemcpyPeerAsync from
 * the source device to the destination device on the channel's output
 * stream (bracketed by DMA trace markers), then registers the finish
 * callback after atomically bumping the proxy's callback counter.
 *
 * NOTE(review): cudaMemcpyPeerAsync / cudaStreamAddCallback return
 * codes are unchecked — confirm this is intentional.
 */
263 
271  prt_packet_t *src_packet, prt_channel_t *channel)
272 {
273  // Set device to the source device.
274  cudaError_t error = cudaSetDevice(src_packet->device_rank);
275  prt_assert(error == cudaSuccess, cudaGetErrorString(error));
276 
277  // Create a new device packet on the destination VDP.
278  prt_packet_t *dst_packet = prt_packet_new_device(
279  src_packet->size, NULL, channel->dst_vdp);
280 
281  // Put the peer copy in the channel stream.
282  svg_trace_start_dma(channel->out_stream);
283  cudaMemcpyPeerAsync(
284  dst_packet->data, channel->dst_vdp->device->rank,
285  src_packet->data, src_packet->device_rank,
286  src_packet->size, channel->out_stream);
287  svg_trace_stop_dma(channel->out_stream, Silver);
288 
289  // Set up the callback to put the new device packet in the channel.
290  prt_callback_finish_t *callback =
291  prt_callback_finish_new(src_packet, dst_packet, channel);
292  __sync_fetch_and_add(&channel->proxy->num_callbacks, 1);
293  cudaStreamAddCallback(
294  channel->out_stream, prt_callback_finish_handler, (void*)callback, 0);
295 }
296 
/*
 * First leg of a device-to-remote transfer: stage the device payload
 * into host memory, then queue an MPI send from the host.  The
 * function name (original lines 297-306) is absent from this listing;
 * only the parameter line is visible — confirm the name in the full
 * source.  `agent` is forwarded to the queue callback; its semantics
 * (presumably the MPI destination/agent id) are not visible here.
 *
 * Binds the thread to the source device, stages the payload into a new
 * host packet via an async D2H copy on the channel's output stream,
 * then registers a queue callback with action PRT_DEVICE_MPI_FROM_HOST
 * after atomically bumping the proxy's callback counter.
 *
 * NOTE(review): original line 324 — the callback constructor call,
 * presumably `prt_callback_queue_new(` — is missing from this listing,
 * so the assignment below is textually incomplete here; confirm
 * against the full source.
 */
298 
307  prt_packet_t *src_packet, prt_channel_t *channel, int agent)
308 {
309  // Set device to the source device.
310  cudaError_t error = cudaSetDevice(src_packet->device_rank);
311  prt_assert(error == cudaSuccess, cudaGetErrorString(error));
312 
313  // Create new host packet (staging buffer).
314  prt_packet_t *dst_packet = prt_packet_new_host(src_packet->size, NULL);
315 
316  // Put the copy in the channel stream.
317  svg_trace_start_dma(channel->out_stream);
318  cudaMemcpyAsync(dst_packet->data, src_packet->data, src_packet->size,
319  cudaMemcpyDeviceToHost, channel->out_stream);
320  svg_trace_stop_dma(channel->out_stream, Silver);
321 
322  // Set up the callback to queue send from host.
323  prt_callback_queue_t *callback =
325  src_packet, dst_packet, channel, PRT_DEVICE_MPI_FROM_HOST, agent);
326  __sync_fetch_and_add(&channel->proxy->num_callbacks, 1);
327  cudaStreamAddCallback(
328  channel->out_stream, prt_callback_queue_handler, (void*)callback, 0);
329 }