/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2017 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      University of Houston. All rights reserved.
 * Copyright (c) 2013-2017 Los Alamos National Security, LLC. All Rights
 *                         reserved.
 * Copyright (c) 2015-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2017      IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"

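/*
 * All of the routines in this file implement the semantics of MPI_Allgatherv:
 * every rank contributes a (possibly different sized) block, and every rank
 * ends up with all blocks placed in rbuf according to rcounts[] / rdispls[].
 * As an illustrative, caller-level sketch only (hypothetical buffer names,
 * not part of this file), the operation is typically used as:
 *
 *     int mine = rank + 1;                       // this rank's element count
 *     MPI_Allgather(&mine, 1, MPI_INT, rcounts, 1, MPI_INT, comm);
 *     rdispls[0] = 0;
 *     for (int r = 1; r < size; r++)
 *         rdispls[r] = rdispls[r - 1] + rcounts[r - 1];
 *     MPI_Allgatherv(sendbuf, mine, MPI_INT,
 *                    recvbuf, rcounts, rdispls, MPI_INT, comm);
 *
 * The algorithms below differ only in how that exchange is scheduled.
 */
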
/*
 * ompi_coll_base_allgatherv_intra_bruck
 *
 * Function:     allgatherv using O(log(N)) steps.
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Variation of the all-to-all algorithm described by Bruck et al. in
 *               "Efficient Algorithms for All-to-all Communications
 *               in Multiport Message-Passing Systems".
 * Note:         Unlike the allgather implementation, we rely on indexed
 *               datatypes to select the appropriate buffers, so the only
 *               additional memory requirement is for the creation of
 *               temporary datatypes.
 * Example on 7 nodes (memory layout need not be in order):
 *   Initial set up:
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [ ]    [6]
 *   Step 0: send message to (rank - 2^0), receive message from (rank + 2^0)
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [0]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [2]    [2]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [4]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [6]    [6]
 *   Step 1: send message to (rank - 2^1), receive message from (rank + 2^1).
 *           The message contains all blocks from (rank) .. (rank + 2^1 - 1),
 *           with wrap around.
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [0]    [0]    [0]
 *         [1]    [1]    [ ]    [ ]    [ ]    [1]    [1]
 *         [2]    [2]    [2]    [ ]    [ ]    [ ]    [2]
 *         [3]    [3]    [3]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [4]    [4]    [4]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [5]    [5]    [5]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [6]    [6]    [6]    [6]
 *   Step 2: send message to (rank - 2^2), receive message from (rank + 2^2).
 *           The message size is "all remaining blocks".
 *    #     0      1      2      3      4      5      6
 *         [0]    [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]    [5]
 *         [6]    [6]    [6]    [6]    [6]    [6]    [6]
 */
int ompi_coll_base_allgatherv_intra_bruck(const void *sbuf, int scount,
                                          struct ompi_datatype_t *sdtype,
                                          void *rbuf, const int *rcounts,
                                          const int *rdispls,
                                          struct ompi_datatype_t *rdtype,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module)
{
    int line = -1, err = 0, rank, size, sendto, recvfrom, distance, blockcount, i;
    int *new_rcounts = NULL, *new_rdispls = NULL, *new_scounts = NULL, *new_sdispls = NULL;
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;
    struct ompi_datatype_t *new_rdtype, *new_sdtype;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgatherv_intra_bruck rank %d", rank));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to
         block rank of the receive buffer.
    */
    tmprecv = (char*) rbuf + (ptrdiff_t)rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
                                   tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Communication step:
       At every step i, rank r:
       - doubles the distance,
       - sends a message of blockcount blocks,
         (rbuf[rank] .. rbuf[rank + blockcount - 1]), to rank (r - distance),
       - receives a message of blockcount blocks,
         (rbuf[r + distance] .. rbuf[r + distance + blockcount - 1]), from
         rank (r + distance),
       - blockcount doubles until the last step, when only the remaining data
         is exchanged.
    */
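    /* Illustrative trace (comment only, not executed), assuming size = 7:
     *   distance = 1 -> blockcount = 1
     *   distance = 2 -> blockcount = 2
     *   distance = 4 -> blockcount = 7 - 4 = 3   (final, partial step)
     * The blocks are never packed; they are described to the datatype engine.
     * For example, for rank r at distance = 2 the send datatype built below
     * covers blocks r and (r + 1) % size:
     *
     *     new_scounts[0] = rcounts[r];              new_sdispls[0] = rdispls[r];
     *     new_scounts[1] = rcounts[(r + 1) % size]; new_sdispls[1] = rdispls[(r + 1) % size];
     *     ompi_datatype_create_indexed(2, new_scounts, new_sdispls,
     *                                  rdtype, &new_sdtype);
     */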
    blockcount = 1;
    tmpsend = (char*) rbuf;

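    /* A single allocation of 4*size ints provides all four scratch arrays:
       new_rdispls, new_scounts and new_sdispls are carved out of the same
       block below, so freeing new_rcounts releases everything. */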
    new_rcounts = (int*) calloc(4*size, sizeof(int));
    if (NULL == new_rcounts) { err = -1; line = __LINE__; goto err_hndl; }
    new_rdispls = new_rcounts + size;
    new_scounts = new_rdispls + size;
    new_sdispls = new_scounts + size;

    for (distance = 1; distance < size; distance<<=1) {

        recvfrom = (rank + distance) % size;
        sendto = (rank - distance + size) % size;

        if (distance <= (size >> 1)) {
            blockcount = distance;
        } else {
            blockcount = size - distance;
        }

        /* create send and receive datatypes */
        for (i = 0; i < blockcount; i++) {
            const int tmp_srank = (rank + i) % size;
            const int tmp_rrank = (recvfrom + i) % size;
            new_scounts[i] = rcounts[tmp_srank];
            new_sdispls[i] = rdispls[tmp_srank];
            new_rcounts[i] = rcounts[tmp_rrank];
            new_rdispls[i] = rdispls[tmp_rrank];
        }
        err = ompi_datatype_create_indexed(blockcount, new_scounts, new_sdispls,
                                           rdtype, &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        err = ompi_datatype_create_indexed(blockcount, new_rcounts, new_rdispls,
                                           rdtype, &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        err = ompi_datatype_commit(&new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        err = ompi_datatype_commit(&new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(rbuf, 1, new_sdtype, sendto,
                                      MCA_COLL_BASE_TAG_ALLGATHERV,
                                      rbuf, 1, new_rdtype, recvfrom,
                                      MCA_COLL_BASE_TAG_ALLGATHERV,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        ompi_datatype_destroy(&new_sdtype);
        ompi_datatype_destroy(&new_rdtype);
    }

    free(new_rcounts);

    return OMPI_SUCCESS;

 err_hndl:
    if (NULL != new_rcounts) free(new_rcounts);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}


/*
 * ompi_coll_base_allgatherv_intra_ring
 *
 * Function:     allgatherv using O(N) steps.
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Ring algorithm for allgatherv.
 *               At every step i, rank r receives a message from rank (r - 1)
 *               containing data from rank (r - i - 1) and sends a message to
 *               rank (r + 1) containing data from rank (r - i), with wrap
 *               around.
 * Memory requirements:
 *               No additional memory requirements.
 *
 */
int ompi_coll_base_allgatherv_intra_ring(const void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void* rbuf, const int *rcounts, const int *rdisps,
                                         struct ompi_datatype_t *rdtype,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int line = -1, rank, size, sendto, recvfrom, i, recvdatafrom, senddatafrom, err = 0;
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgatherv_intra_ring rank %d", rank));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to
         the appropriate block of the receive buffer.
    */
    tmprecv = (char*) rbuf + (ptrdiff_t)rdisps[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
                                   tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Communication step:
       At every step i: 0 .. (P-1), rank r:
       - receives a message from rank [(r - 1 + size) % size] containing data
         from rank [(r - i - 1 + size) % size],
       - sends a message to rank [(r + 1) % size] containing data from rank
         [(r - i + size) % size].
    */
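    /* Illustrative trace (comment only, not executed), assuming size = 4 and
     * rank r = 2, so sendto = 3 and recvfrom = 1:
     *   i = 0: send block 2 to rank 3, receive block 1 from rank 1
     *   i = 1: send block 1 to rank 3, receive block 0 from rank 1
     *   i = 2: send block 0 to rank 3, receive block 3 from rank 1
     * i.e. each rank keeps the same two neighbors and simply forwards the
     * block it received in the previous step.
     */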
    sendto = (rank + 1) % size;
    recvfrom = (rank - 1 + size) % size;

    for (i = 0; i < size - 1; i++) {
        recvdatafrom = (rank - i - 1 + size) % size;
        senddatafrom = (rank - i + size) % size;

        tmprecv = (char*)rbuf + (ptrdiff_t)rdisps[recvdatafrom] * rext;
        tmpsend = (char*)rbuf + (ptrdiff_t)rdisps[senddatafrom] * rext;

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(tmpsend, rcounts[senddatafrom], rdtype,
                                      sendto, MCA_COLL_BASE_TAG_ALLGATHERV,
                                      tmprecv, rcounts[recvdatafrom], rdtype,
                                      recvfrom, MCA_COLL_BASE_TAG_ALLGATHERV,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}

/*
 * ompi_coll_base_allgatherv_intra_neighborexchange
 *
 * Function:     allgatherv using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather, adapted for
 *               allgatherv.  Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *               Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005.
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring
 *               algorithm.
 *
 * Example on 6 nodes:
 *   Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
int
ompi_coll_base_allgatherv_intra_neighborexchange(const void *sbuf, int scount,
                                                 struct ompi_datatype_t *sdtype,
                                                 void* rbuf, const int *rcounts, const int *rdispls,
                                                 struct ompi_datatype_t *rdtype,
                                                 struct ompi_communicator_t *comm,
                                                 mca_coll_base_module_t *module)
{
    int line = -1, rank, size, i, even_rank, err = 0;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
    int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;
    struct ompi_datatype_t *new_rdtype, *new_sdtype;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    if (size % 2) {
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "coll:base:allgatherv_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                     size));
        return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
                                                    rbuf, rcounts,
                                                    rdispls, rdtype,
                                                    comm, module);
    }

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgatherv_intra_neighborexchange rank %d", rank));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to
         the appropriate block of the receive buffer.
    */
    tmprecv = (char*) rbuf + (ptrdiff_t)rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
                                   tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Determine neighbors, the order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }
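    /* Illustrative values (comment only, not executed), assuming size = 6:
     *   rank 2 (even): neighbor = {3, 1}, recv_data_from = {2, 2}, offsets = {+2, -2}
     *   rank 3 (odd):  neighbor = {2, 4}, recv_data_from = {2, 2}, offsets = {-2, +2}
     * Ranks 2 and 3 first exchange their own blocks, then alternate between
     * their two neighbors, each time forwarding the pair of blocks received
     * in the previous step.
     */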

    /* Communication loop:
       - The first step is special: exchange a single block with neighbor[0].
       - Remaining steps:
         update recv_data_from according to the offset, and
         exchange two blocks with the appropriate neighbor;
         the send location becomes the previous receive location.
       Note that we need to create indexed datatypes to send and receive these
       blocks properly.
    */
    tmprecv = (char*)rbuf + (ptrdiff_t)rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + (ptrdiff_t)rdispls[rank] * rext;
    err = ompi_coll_base_sendrecv(tmpsend, rcounts[rank], rdtype,
                                  neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
                                  tmprecv, rcounts[neighbor[0]], rdtype,
                                  neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
                                  comm, MPI_STATUS_IGNORE, rank);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Determine the initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }

    for (i = 1; i < (size / 2); i++) {
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from + 1).
           We are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity] + 1).
        */
        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = ompi_datatype_create_indexed(2, new_scounts, new_sdispls, rdtype,
                                           &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        err = ompi_datatype_commit(&new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = ompi_datatype_create_indexed(2, new_rcounts, new_rdispls, rdtype,
                                           &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        err = ompi_datatype_commit(&new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                                      MCA_COLL_BASE_TAG_ALLGATHERV,
                                      tmprecv, 1, new_rdtype, neighbor[i_parity],
                                      MCA_COLL_BASE_TAG_ALLGATHERV,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        send_data_from = recv_data_from[i_parity];

        ompi_datatype_destroy(&new_sdtype);
        ompi_datatype_destroy(&new_rdtype);
    }

    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}


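/*
 * ompi_coll_base_allgatherv_intra_two_procs
 *
 * Function:     allgatherv for two processes using a single exchange.
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code (MPI_ERR_UNSUPPORTED_OPERATION if
 *               the communicator does not contain exactly two processes)
 */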
int ompi_coll_base_allgatherv_intra_two_procs(const void *sbuf, int scount,
                                              struct ompi_datatype_t *sdtype,
                                              void* rbuf, const int *rcounts,
                                              const int *rdispls,
                                              struct ompi_datatype_t *rdtype,
                                              struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    int line = -1, err = 0, rank, remote;
    char *tmpsend = NULL, *tmprecv = NULL;
    ptrdiff_t rext, lb;

    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_allgatherv_intra_two_procs rank %d", rank));

    if (2 != ompi_comm_size(comm)) {
        return MPI_ERR_UNSUPPORTED_OPERATION;
    }

    err = ompi_datatype_get_extent (rdtype, &lb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Exchange data:
       - compute the source and destination,
       - send/receive the data.
    */
    remote = rank ^ 0x1;

    tmpsend = (char*)sbuf;
    if (MPI_IN_PLACE == sbuf) {
        tmpsend = (char*)rbuf + (ptrdiff_t)rdispls[rank] * rext;
        scount = rcounts[rank];
        sdtype = rdtype;
    }
    tmprecv = (char*)rbuf + (ptrdiff_t)rdispls[remote] * rext;

    err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote,
                                  MCA_COLL_BASE_TAG_ALLGATHERV,
                                  tmprecv, rcounts[remote], rdtype, remote,
                                  MCA_COLL_BASE_TAG_ALLGATHERV,
                                  comm, MPI_STATUS_IGNORE, rank);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Place our own data in the correct location, if necessary */
    if (MPI_IN_PLACE != sbuf) {
        err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
                                   (char*)rbuf + (ptrdiff_t)rdispls[rank] * rext,
                                   rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}


/*
 * Linear functions are copied from the BASIC coll module.  They do not
 * segment the message and are straightforward implementations, but for a
 * small number of nodes and/or small data sizes they are just as fast as
 * the base/tree-based segmenting operations, and as such may be selected by
 * the decision functions.  They are copied into this module because of the
 * way modules are selected in V1; in V2 this will be handled differently and
 * the code will not have to be duplicated.
 * JPG, following the examples from other coll_base implementations.  Dec06.
 */

/* copied function (with appropriate renaming) starts here */

/*
 * allgatherv_intra_basic
 *
 * Function:    - allgatherv using other MPI collectives:
 *                gatherv + bcast (from basic module).
 * Accepts:     - same as MPI_Allgatherv()
 * Returns:     - MPI_SUCCESS or error code
 */
int
ompi_coll_base_allgatherv_intra_basic_default(const void *sbuf, int scount,
                                              struct ompi_datatype_t *sdtype,
                                              void *rbuf, const int *rcounts,
                                              const int *disps,
                                              struct ompi_datatype_t *rdtype,
                                              struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    int size, rank, err;
    MPI_Aint extent, lb;
    char *send_buf = NULL;
    struct ompi_datatype_t *newtype, *send_type;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    /*
     * We don't have a root process defined.  Arbitrarily assign the root
     * to the process with rank 0 (OMPI convention).
     */

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_allgatherv_intra_basic_default rank %d",
                 rank));

    if (MPI_IN_PLACE == sbuf) {
        ompi_datatype_get_extent(rdtype, &lb, &extent);
        send_type = rdtype;
        send_buf = (char*)rbuf;
        send_buf += ((ptrdiff_t)disps[rank] * extent);
        scount = rcounts[rank];
    } else {
        send_buf = (char*)sbuf;
        send_type = sdtype;
    }

    err = comm->c_coll->coll_gatherv(send_buf,
                                     scount, send_type, rbuf,
                                     rcounts, disps, rdtype, 0,
                                     comm, comm->c_coll->coll_gatherv_module);
    if (MPI_SUCCESS != err) {
        return err;
    }
    /*
     * We now have all the data in the root's rbuf.  We need to
     * broadcast the data out to the other processes.
     *
     * We need to define a datatype that captures the different vectors
     * from each process.  MPI_TYPE_INDEXED with params
     * size, rcounts, disps, rdtype, newtype
     * should do the trick.
     * Use the underlying ddt functions to create and commit the
     * new datatype on each process, then broadcast and destroy the
     * datatype.
     */
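    /* Illustrative sketch (comment only, not executed), assuming size = 3,
     * rcounts = {2, 1, 3} and disps = {0, 2, 3}: the indexed datatype built
     * below describes three blocks of rdtype elements at those displacements,
     * so a single
     *
     *     coll_bcast(rbuf, 1, newtype, 0, comm, ...)
     *
     * moves exactly the gathered (and possibly non-contiguous) regions of
     * rbuf from the root to every other process.
     */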

    err = ompi_datatype_create_indexed(size, rcounts, disps, rdtype, &newtype);
    if (MPI_SUCCESS != err) {
        return err;
    }

    err = ompi_datatype_commit(&newtype);
    if (MPI_SUCCESS != err) {
        return err;
    }

    err = comm->c_coll->coll_bcast(rbuf, 1, newtype, 0, comm,
                                   comm->c_coll->coll_bcast_module);

    ompi_datatype_destroy(&newtype);

    if (MPI_SUCCESS != err) {
        return err;
    }

    return MPI_SUCCESS;
}

/* copied function (with appropriate renaming) ends here */