/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2017 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      University of Houston. All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All Rights
 *                         reserved.
 * Copyright (c) 2014-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"

/*
 * ompi_coll_base_allgather_intra_bruck
 *
 * Function:     allgather using O(log(N)) steps.
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Variation of the all-to-all algorithm described by Bruck et al. in
 *               "Efficient Algorithms for All-to-all Communications
 *               in Multiport Message-Passing Systems".
 * Memory requirements:  non-zero ranks require a shift buffer to perform the
 *               final step of the algorithm.
 *
 * Example on 6 nodes:
 *   Initialization: everyone has its own buffer at location 0 in rbuf.
 *                   This means that if the user specified MPI_IN_PLACE for sendbuf,
 *                   we must copy our block from recvbuf to the beginning!
 *    #     0      1      2      3      4      5
 *         [0]    [1]    [2]    [3]    [4]    [5]
 *   Step 0: send message to (rank - 2^0), receive message from (rank + 2^0)
 *    #     0      1      2      3      4      5
 *         [0]    [1]    [2]    [3]    [4]    [5]
 *         [1]    [2]    [3]    [4]    [5]    [0]
 *   Step 1: send message to (rank - 2^1), receive message from (rank + 2^1);
 *           the message contains all blocks from location 0 to 2^1 * block size
 *    #     0      1      2      3      4      5
 *         [0]    [1]    [2]    [3]    [4]    [5]
 *         [1]    [2]    [3]    [4]    [5]    [0]
 *         [2]    [3]    [4]    [5]    [0]    [1]
 *         [3]    [4]    [5]    [0]    [1]    [2]
 *   Step 2: send message to (rank - 2^2), receive message from (rank + 2^2);
 *           the message size is "all remaining blocks"
 *    #     0      1      2      3      4      5
 *         [0]    [1]    [2]    [3]    [4]    [5]
 *         [1]    [2]    [3]    [4]    [5]    [0]
 *         [2]    [3]    [4]    [5]    [0]    [1]
 *         [3]    [4]    [5]    [0]    [1]    [2]
 *         [4]    [5]    [0]    [1]    [2]    [3]
 *         [5]    [0]    [1]    [2]    [3]    [4]
 *   Finalization: Do a local shift to get the data into the correct place
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
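/*
 * Minimal calling sketch (illustrative only, not part of the implementation):
 * the coll framework normally selects and invokes this routine behind
 * MPI_Allgather, but it can also be called directly given an already set-up
 * communicator and its base module, e.g.
 *
 *     int block[4];
 *     int *gathered = malloc(4 * sizeof(int) * ompi_comm_size(comm));
 *     err = ompi_coll_base_allgather_intra_bruck(block, 4, MPI_INT,
 *                                                gathered, 4, MPI_INT,
 *                                                comm, module);
 */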
int ompi_coll_base_allgather_intra_bruck(const void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void* rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int line = -1, rank, size, sendto, recvfrom, distance, blockcount, err = 0;
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgather_intra_bruck rank %d", rank));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to block 0 of
         the receive buffer, else
       - if rank r != 0, copy the r-th block from the receive buffer to block 0.
    */
    tmprecv = (char*) rbuf;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    } else if (0 != rank) {  /* non-root with MPI_IN_PLACE */
        tmpsend = ((char*)rbuf) + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
        err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmprecv, tmpsend);
        if (err < 0) { line = __LINE__; goto err_hndl; }
    }

    /* Communication step:
       At every step i, rank r:
       - doubles the distance,
       - sends a message that starts at the beginning of rbuf and has size
         (blockcount * rcount) to rank (r - distance),
       - receives a message of size (blockcount * rcount) from rank (r + distance)
         at location (rbuf + distance * rcount * rext),
       - blockcount doubles until the last step, when only the remaining data is
         exchanged.
    */
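    /*
     * Worked trace of the loop below for the 6-node example above, seen from
     * rank 0 (illustrative only):
     *   distance 1: sendto = 5, recvfrom = 1, blockcount = 1
     *   distance 2: sendto = 4, recvfrom = 2, blockcount = 2
     *   distance 4: sendto = 2, recvfrom = 4, blockcount = 2 (remaining blocks)
     * so rank 0 receives 1 + 2 + 2 = 5 = size - 1 blocks in ceil(log2(6)) = 3 steps.
     */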
    blockcount = 1;
    tmpsend = (char*) rbuf;
    for (distance = 1; distance < size; distance<<=1) {

        recvfrom = (rank + distance) % size;
        sendto = (rank - distance + size) % size;

        tmprecv = tmpsend + (ptrdiff_t)distance * (ptrdiff_t)rcount * rext;

        if (distance <= (size >> 1)) {
            blockcount = distance;
        } else {
            blockcount = size - distance;
        }

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(tmpsend, blockcount * rcount, rdtype,
                                      sendto, MCA_COLL_BASE_TAG_ALLGATHER,
                                      tmprecv, blockcount * rcount, rdtype,
                                      recvfrom, MCA_COLL_BASE_TAG_ALLGATHER,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    }

    /* Finalization step:
       On all nodes except 0, data needs to be shifted locally:
       - create a temporary shift buffer,
         see the discussion in coll_basic_reduce.c about the size and beginning
         of the temporary buffer,
       - copy blocks [0 .. (size - rank - 1)] from rbuf to the shift buffer,
       - move blocks [(size - rank) .. (size - 1)] from rbuf to the beginning of rbuf,
       - copy the blocks from the shift buffer back to rbuf, starting at block [rank].
    */
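    /*
     * Worked example of the shift for size = 6, rank = 2, where rbuf currently
     * holds [2][3][4][5][0][1] (illustrative only):
     *   1. copy blocks 0..3 (the data of ranks 2..5) into the shift buffer,
     *   2. move blocks 4..5 (the data of ranks 0..1) to the beginning of rbuf,
     *   3. copy the shift buffer back starting at block 2,
     * leaving rbuf = [0][1][2][3][4][5].
     */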
    if (0 != rank) {
        char *free_buf = NULL, *shift_buf = NULL;
        ptrdiff_t span, gap = 0;

        span = opal_datatype_span(&rdtype->super, (int64_t)(size - rank) * rcount, &gap);

        free_buf = (char*)calloc(span, sizeof(char));
        if (NULL == free_buf) {
            line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl;
        }
        shift_buf = free_buf - gap;

        /* 1. copy blocks [0 .. (size - rank - 1)] from rbuf to the shift buffer */
        err = ompi_datatype_copy_content_same_ddt(rdtype, ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount),
                                                  shift_buf, rbuf);
        if (err < 0) { line = __LINE__; goto err_hndl; }

        /* 2. move blocks [(size - rank) .. (size - 1)] from rbuf to the beginning of rbuf */
        tmpsend = (char*) rbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount * rext;
        err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount,
                                                  rbuf, tmpsend);
        if (err < 0) { line = __LINE__; goto err_hndl; }

        /* 3. copy the blocks from the shift buffer back to rbuf, starting at block [rank] */
        tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
        err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount,
                                                  tmprecv, shift_buf);
        if (err < 0) { line = __LINE__; goto err_hndl; }

        free(free_buf);
    }

    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    return err;
}

/*
 * ompi_coll_base_allgather_intra_recursivedoubling
 *
 * Function:     allgather using O(log(N)) steps.
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Recursive doubling algorithm for the MPI_Allgather implementation.
 *               This algorithm is used in MPICH-2 for small and medium-sized
 *               messages on a power-of-two number of processes.
 *
 * Limitation:   The current implementation only works on a power-of-two number
 *               of processes.
 *               If this algorithm is invoked on a non-power-of-two number of
 *               processes, the Bruck algorithm is invoked instead.
 *
 * Memory requirements:
 *               No additional memory requirements beyond the user-supplied buffers.
 *
 * Example on 4 nodes:
 *   Initialization: everyone has its own buffer at location rank in rbuf
 *    #     0      1      2      3
 *         [0]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]
 *         [ ]    [ ]    [ ]    [3]
 *   Step 0: exchange data with (rank ^ 2^0)
 *    #     0      1      2      3
 *         [0]    [0]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]
 *         [ ]    [ ]    [3]    [3]
 *   Step 1: exchange data with (rank ^ 2^1) (if you can)
 *    #     0      1      2      3
 *         [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]
 *
 * TODO: Modify the algorithm so it works with any number of nodes.
 *       We could use the same approach as MPICH-2:
 *       - using a recursive-halving algorithm, determine at the end of each
 *         step whether there are nodes that did not exchange their data in
 *         that step, and send them the appropriate messages.
 */
int
ompi_coll_base_allgather_intra_recursivedoubling(const void *sbuf, int scount,
                                                 struct ompi_datatype_t *sdtype,
                                                 void* rbuf, int rcount,
                                                 struct ompi_datatype_t *rdtype,
                                                 struct ompi_communicator_t *comm,
                                                 mca_coll_base_module_t *module)
{
    int line = -1, rank, size, pow2size, err;
    int remote, distance, sendblocklocation;
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    pow2size = opal_next_poweroftwo (size);
    pow2size >>=1;

    /* The current implementation only handles a power-of-two number of processes.
       If the function was called on a non-power-of-two number of processes,
       print a warning and call the Bruck allgather algorithm with the same
       parameters.
    */
    if (pow2size != size) {
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "coll:base:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm",
                     size));

        return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
                                                    rbuf, rcount, rdtype,
                                                    comm, module);
    }

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgather_intra_recursivedoubling rank %d, size %d",
                 rank, size));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to
         block rank of the receive buffer.
    */
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    }

    /* Communication step:
       At every step i, rank r:
       - exchanges a message with rank remote = (r ^ 2^i).
    */
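    /*
     * Worked trace for size = 8, rank = 5 (illustrative only):
     *   distance 1: remote = 4, send 1 block at offset 5, receive 1 block at offset 4
     *   distance 2: remote = 7, send 2 blocks at offset 4, receive 2 blocks at offset 6
     *   distance 4: remote = 1, send 4 blocks at offset 4, receive 4 blocks at offset 0
     * After log2(8) = 3 steps all blocks 0..7 are in place; sendblocklocation
     * tracks the lowest block this rank currently holds (5 -> 4 -> 4 -> 0).
     */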
    sendblocklocation = rank;
    for (distance = 0x1; distance < size; distance<<=1) {
        remote = rank ^ distance;

        if (rank < remote) {
            tmpsend = (char*)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext;
            tmprecv = (char*)rbuf + (ptrdiff_t)(sendblocklocation + distance) * (ptrdiff_t)rcount * rext;
        } else {
            tmpsend = (char*)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext;
            tmprecv = (char*)rbuf + (ptrdiff_t)(sendblocklocation - distance) * (ptrdiff_t)rcount * rext;
            sendblocklocation -= distance;
        }

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
                                      remote, MCA_COLL_BASE_TAG_ALLGATHER,
                                      tmprecv, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
                                      remote, MCA_COLL_BASE_TAG_ALLGATHER,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    }

    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    return err;
}


/*
 * ompi_coll_base_allgather_intra_ring
 *
 * Function:     allgather using O(N) steps.
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Ring algorithm for allgather.
 *               At every step i, rank r receives a message from rank (r - 1)
 *               containing data from rank (r - i - 1) and sends a message to
 *               rank (r + 1) containing data from rank (r - i), with wraparound.
 * Memory requirements:
 *               No additional memory requirements.
 *
 */
int ompi_coll_base_allgather_intra_ring(const void *sbuf, int scount,
                                        struct ompi_datatype_t *sdtype,
                                        void* rbuf, int rcount,
                                        struct ompi_datatype_t *rdtype,
                                        struct ompi_communicator_t *comm,
                                        mca_coll_base_module_t *module)
{
    int line = -1, rank, size, err, sendto, recvfrom, i, recvdatafrom, senddatafrom;
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgather_intra_ring rank %d", rank));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to the
         appropriate block of the receive buffer.
    */
    tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Communication step:
       At every step i = 0 .. (P - 2), rank r:
       - receives a message from rank [(r - 1 + size) % size] containing data
         from rank [(r - i - 1 + size) % size],
       - sends a message to rank [(r + 1) % size] containing data from rank
         [(r - i + size) % size].
    */
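    /*
     * Worked trace for size = 6, rank = 2 (illustrative only): sendto = 3,
     * recvfrom = 1, and over the 5 steps the (send, receive) block indices are
     * (2,1), (1,0), (0,5), (5,4), (4,3), i.e. each step forwards the block
     * that arrived in the previous step.
     */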
    sendto = (rank + 1) % size;
    recvfrom = (rank - 1 + size) % size;

    for (i = 0; i < size - 1; i++) {
        recvdatafrom = (rank - i - 1 + size) % size;
        senddatafrom = (rank - i + size) % size;

        tmprecv = (char*)rbuf + (ptrdiff_t)recvdatafrom * (ptrdiff_t)rcount * rext;
        tmpsend = (char*)rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)rcount * rext;

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, sendto,
                                      MCA_COLL_BASE_TAG_ALLGATHER,
                                      tmprecv, rcount, rdtype, recvfrom,
                                      MCA_COLL_BASE_TAG_ALLGATHER,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    }

    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    return err;
}

/*
 * ompi_coll_base_allgather_intra_neighborexchange
 *
 * Function:     allgather using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *               Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005.
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
int
ompi_coll_base_allgather_intra_neighborexchange(const void *sbuf, int scount,
                                                struct ompi_datatype_t *sdtype,
                                                void* rbuf, int rcount,
                                                struct ompi_datatype_t *rdtype,
                                                struct ompi_communicator_t *comm,
                                                mca_coll_base_module_t *module)
{
    int line = -1, rank, size, i, even_rank, err;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
    ptrdiff_t rlb, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    if (size % 2) {
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "coll:base:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                     size));
        return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
                                                   rbuf, rcount, rdtype,
                                                   comm, module);
    }

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allgather_intra_neighborexchange rank %d", rank));

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer to the
         appropriate block of the receive buffer.
    */
    tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Determine neighbors, the order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }

    /* Communication loop:
       - The first step is special: exchange a single block with neighbor[0].
       - Remaining steps:
         update recv_data_from according to the offset, and
         exchange two blocks with the appropriate neighbor.
         The send location becomes the previous receive location.
    */
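    /*
     * Worked trace for size = 6, rank = 1 (an odd rank; illustrative only):
     *   step 0: exchange block 1 for block 0 with neighbor[0] = 0
     *   step 1: send blocks 0-1 to / receive blocks 2-3 from neighbor[1] = 2
     *   step 2: send blocks 2-3 to / receive blocks 4-5 from neighbor[0] = 0
     * which matches column 1 of the step diagrams in the header comment above.
     */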
    tmprecv = (char*)rbuf + (ptrdiff_t)neighbor[0] * (ptrdiff_t)rcount * rext;
    tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
    /* Sendreceive */
    err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                                  MCA_COLL_BASE_TAG_ALLGATHER,
                                  tmprecv, rcount, rdtype, neighbor[0],
                                  MCA_COLL_BASE_TAG_ALLGATHER,
                                  comm, MPI_STATUS_IGNORE, rank);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Determine the initial sending location */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }

    for (i = 1; i < (size / 2); i++) {
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        tmprecv = (char*)rbuf + (ptrdiff_t)recv_data_from[i_parity] * (ptrdiff_t)rcount * rext;
        tmpsend = (char*)rbuf + (ptrdiff_t)send_data_from * (ptrdiff_t)rcount * rext;

        /* Sendreceive */
        err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
                                      neighbor[i_parity],
                                      MCA_COLL_BASE_TAG_ALLGATHER,
                                      tmprecv, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
                                      neighbor[i_parity],
                                      MCA_COLL_BASE_TAG_ALLGATHER,
                                      comm, MPI_STATUS_IGNORE, rank);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        send_data_from = recv_data_from[i_parity];
    }

    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    return err;
}

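/*
 * ompi_coll_base_allgather_intra_two_procs
 *
 * Function:     allgather for exactly two processes (a single exchange).
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS, MPI_ERR_UNSUPPORTED_OPERATION if the communicator
 *               size is not 2, or another error code.
 *
 * Description:  Each rank exchanges its block with the other rank
 *               (remote = rank ^ 1) and then copies its own block into place,
 *               unless MPI_IN_PLACE was used and the block is already there.
 */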
int ompi_coll_base_allgather_intra_two_procs(const void *sbuf, int scount,
                                             struct ompi_datatype_t *sdtype,
                                             void* rbuf, int rcount,
                                             struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    int line = -1, err, rank, remote;
    char *tmpsend = NULL, *tmprecv = NULL;
    ptrdiff_t rext, lb;

    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_allgather_intra_two_procs rank %d", rank));

    if (2 != ompi_comm_size(comm)) {
        return MPI_ERR_UNSUPPORTED_OPERATION;
    }

    err = ompi_datatype_get_extent (rdtype, &lb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Exchange data:
       - compute the source and destination,
       - send and receive the data.
    */
    remote = rank ^ 0x1;

    tmpsend = (char*)sbuf;
    if (MPI_IN_PLACE == sbuf) {
        tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
        scount = rcount;
        sdtype = rdtype;
    }
    tmprecv = (char*)rbuf + (ptrdiff_t)remote * (ptrdiff_t)rcount * rext;

    err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote,
                                  MCA_COLL_BASE_TAG_ALLGATHER,
                                  tmprecv, rcount, rdtype, remote,
                                  MCA_COLL_BASE_TAG_ALLGATHER,
                                  comm, MPI_STATUS_IGNORE, rank);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Place your data in the correct location if necessary */
    if (MPI_IN_PLACE != sbuf) {
        err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
                                   (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    return err;
}


/*
 * Linear functions are copied from the BASIC coll module. They do not
 * segment the message and are simple implementations, but for some small
 * numbers of nodes and/or small data sizes they are just as fast as the
 * base/tree-based segmenting operations, and as such may be selected by the
 * decision functions. These are copied into this module due to the way we
 * select modules in V1; i.e. in V2 we will handle this differently and so
 * will not have to duplicate code.
 * JPG following the examples from other coll_base implementations. Dec06.
 */

/* copied function (with appropriate renaming) starts here */

/*
 * allgather_intra_basic_linear
 *
 * Function:     - allgather using other MPI collectives
 * Accepts:      - same as MPI_Allgather()
 * Returns:      - MPI_SUCCESS or error code
 */
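/*
 * Conceptually (a sketch at the MPI level, illustrative only) this routine
 * is equivalent to:
 *
 *     MPI_Gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, 0, comm);
 *     MPI_Bcast(rbuf, rcount * size, rdtype, 0, comm);
 *
 * except that it goes through the communicator's selected gather/bcast
 * modules and guards the broadcast count against int overflow.
 */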
int
ompi_coll_base_allgather_intra_basic_linear(const void *sbuf, int scount,
                                            struct ompi_datatype_t *sdtype,
                                            void *rbuf,
                                            int rcount,
                                            struct ompi_datatype_t *rdtype,
                                            struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    int err;
    ptrdiff_t lb, extent;

    /* Handle MPI_IN_PLACE (see the explanation in reduce.c for how to
       allocate the temp buffer) -- note that rank 0 can use IN_PLACE
       natively, and we can just alias the right position in rbuf
       as sbuf and avoid using a temporary buffer if gather is
       implemented correctly */
    if (MPI_IN_PLACE == sbuf && 0 != ompi_comm_rank(comm)) {
        ompi_datatype_get_extent(rdtype, &lb, &extent);
        sbuf = ((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount);
        sdtype = rdtype;
        scount = rcount;
    }

    /* Gather and broadcast. */

    err = comm->c_coll->coll_gather(sbuf, scount, sdtype,
                                    rbuf, rcount, rdtype,
                                    0, comm, comm->c_coll->coll_gather_module);
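    /* If the total broadcast element count (rcount * size) does not fit in an
       int, broadcast rcount elements of a contiguous datatype spanning size
       blocks instead. For example (illustrative numbers only), rcount = 1 << 20
       MPI_INT elements on 4096 ranks gives 1 << 32 total elements, which
       exceeds INT_MAX. */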
    if (MPI_SUCCESS == err) {
        size_t length = (ptrdiff_t)rcount * ompi_comm_size(comm);
        if( length < (size_t)INT_MAX ) {
            err = comm->c_coll->coll_bcast(rbuf, (ptrdiff_t)rcount * ompi_comm_size(comm), rdtype,
                                           0, comm, comm->c_coll->coll_bcast_module);
        } else {
            ompi_datatype_t* temptype;
            ompi_datatype_create_contiguous(ompi_comm_size(comm), rdtype, &temptype);
            ompi_datatype_commit(&temptype);
            err = comm->c_coll->coll_bcast(rbuf, rcount, temptype,
                                           0, comm, comm->c_coll->coll_bcast_module);
            ompi_datatype_destroy(&temptype);
        }
    }

    /* All done */

    return err;
}

/* copied function (with appropriate renaming) ends here */