root/ompi/mca/coll/libnbc/nbc_ibcast.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. NBC_Bcast_args_compare
  2. nbc_bcast_init
  3. ompi_coll_libnbc_ibcast
  4. bcast_sched_binomial
  5. bcast_sched_linear
  6. bcast_sched_chain
  7. bcast_sched_knomial
  8. nbc_bcast_inter_init
  9. ompi_coll_libnbc_ibcast_inter
  10. ompi_coll_libnbc_bcast_init
  11. ompi_coll_libnbc_bcast_inter_init

   1 /* -*- Mode: C; c-basic-offset:2 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2006      The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2006      The Technical University of Chemnitz. All
   7  *                         rights reserved.
   8  * Copyright (c) 2014-2018 Research Organization for Information Science
   9  *                         and Technology (RIST).  All rights reserved.
  10  * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
  11  *                         reserved.
  12  * Copyright (c) 2016-2017 IBM Corporation.  All rights reserved.
  13  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
  14  * $COPYRIGHT$
  15  *
  16  * Additional copyrights may follow
  17  *
  18  * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
  19  *
  20  */
  21 #include "nbc_internal.h"
  22 
  23 static inline int bcast_sched_binomial(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count,
  24                                        MPI_Datatype datatype);
  25 static inline int bcast_sched_linear(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count,
  26                                      MPI_Datatype datatype);
  27 static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count,
  28                                     MPI_Datatype datatype, int fragsize, size_t size);
  29 static inline int bcast_sched_knomial(int rank, int comm_size, int root, NBC_Schedule *schedule, void *buf,
  30                                       int count, MPI_Datatype datatype, int knomial_radix);
  31 
  32 #ifdef NBC_CACHE_SCHEDULE
  33 /* tree comparison function for schedule cache */
  34 int NBC_Bcast_args_compare(NBC_Bcast_args *a, NBC_Bcast_args *b, void *param) {
  35   if ((a->buffer == b->buffer) &&
  36       (a->count == b->count) &&
  37       (a->datatype == b->datatype) &&
  38       (a->root == b->root) ) {
  39     return 0;
  40   }
  41 
  42   if( a->buffer < b->buffer ) {
  43     return -1;
  44   }
  45 
  46   return 1;
  47 }
  48 #endif
  49 
  50 static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int root,
  51                           struct ompi_communicator_t *comm, ompi_request_t ** request,
  52                           struct mca_coll_base_module_2_3_0_t *module, bool persistent)
  53 {
  54   int rank, p, res, segsize;
  55   size_t size;
  56   NBC_Schedule *schedule;
  57 #ifdef NBC_CACHE_SCHEDULE
  58   NBC_Bcast_args *args, *found, search;
  59 #endif
  60   enum { NBC_BCAST_LINEAR, NBC_BCAST_BINOMIAL, NBC_BCAST_CHAIN, NBC_BCAST_KNOMIAL } alg;
  61   ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
  62 
  63   rank = ompi_comm_rank (comm);
  64   p = ompi_comm_size (comm);
  65 
  66   if (1 == p) {
  67     return nbc_get_noop_request(persistent, request);
  68   }
  69 
  70   res = ompi_datatype_type_size(datatype, &size);
  71   if (MPI_SUCCESS != res) {
  72     NBC_Error("MPI Error in ompi_datatype_type_size() (%i)", res);
  73     return res;
  74   }
  75 
  76   segsize = 16384;
  77   /* algorithm selection */
  78   if (libnbc_ibcast_algorithm == 0) {
  79     if( libnbc_ibcast_skip_dt_decision ) {
  80       if (p <= 4) {
  81         alg = NBC_BCAST_LINEAR;
  82       }
  83       else {
  84         alg = NBC_BCAST_BINOMIAL;
  85       }
  86     }
  87     else {
  88       if (p <= 4) {
  89         alg = NBC_BCAST_LINEAR;
  90       } else if (size * count < 65536) {
  91         alg = NBC_BCAST_BINOMIAL;
  92       } else if (size * count < 524288) {
  93         alg = NBC_BCAST_CHAIN;
  94         segsize = 8192;
  95       } else {
  96         alg = NBC_BCAST_CHAIN;
  97         segsize = 32768;
  98       }
  99     }
 100   } else {
 101     /* user forced dynamic decision */
 102     if (libnbc_ibcast_algorithm == 1) {
 103       alg = NBC_BCAST_LINEAR;
 104     } else if (libnbc_ibcast_algorithm == 2) {
 105       alg = NBC_BCAST_BINOMIAL;
 106     } else if (libnbc_ibcast_algorithm == 3) {
 107       alg = NBC_BCAST_CHAIN;
 108     } else if (libnbc_ibcast_algorithm == 4 && libnbc_ibcast_knomial_radix > 1) {
 109       alg = NBC_BCAST_KNOMIAL;
 110     } else {
 111       alg = NBC_BCAST_LINEAR;
 112     }
 113   }
 114 
 115 #ifdef NBC_CACHE_SCHEDULE
 116   /* search schedule in communicator specific tree */
 117   search.buffer = buffer;
 118   search.count = count;
 119   search.datatype = datatype;
 120   search.root = root;
 121   found = (NBC_Bcast_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_BCAST], &search);
 122   if (NULL == found) {
 123 #endif
 124     schedule = OBJ_NEW(NBC_Schedule);
 125     if (OPAL_UNLIKELY(NULL == schedule)) {
 126       return OMPI_ERR_OUT_OF_RESOURCE;
 127     }
 128 
 129     switch(alg) {
 130       case NBC_BCAST_LINEAR:
 131         res = bcast_sched_linear(rank, p, root, schedule, buffer, count, datatype);
 132         break;
 133       case NBC_BCAST_BINOMIAL:
 134         res = bcast_sched_binomial(rank, p, root, schedule, buffer, count, datatype);
 135         break;
 136       case NBC_BCAST_CHAIN:
 137         res = bcast_sched_chain(rank, p, root, schedule, buffer, count, datatype, segsize, size);
 138         break;
 139       case NBC_BCAST_KNOMIAL:
 140         res = bcast_sched_knomial(rank, p, root, schedule, buffer, count, datatype, libnbc_ibcast_knomial_radix);
 141         break;
 142     }
 143 
 144     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 145       OBJ_RELEASE(schedule);
 146       return res;
 147     }
 148 
 149     res = NBC_Sched_commit (schedule);
 150     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 151       OBJ_RELEASE(schedule);
 152       return res;
 153     }
 154 
 155 #ifdef NBC_CACHE_SCHEDULE
 156     /* save schedule to tree */
 157     args = (NBC_Bcast_args *) malloc (sizeof (args));
 158     if (NULL != args) {
 159       args->buffer = buffer;
 160       args->count = count;
 161       args->datatype = datatype;
 162       args->root = root;
 163       args->schedule = schedule;
 164       res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_BCAST], args, args, 0);
 165       if (0 == res) {
 166         OBJ_RETAIN (schedule);
 167 
 168         /* increase number of elements for A2A */
 169         if (++libnbc_module->NBC_Dict_size[NBC_BCAST] > NBC_SCHED_DICT_UPPER) {
 170           NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_BCAST],
 171                                    &libnbc_module->NBC_Dict_size[NBC_BCAST]);
 172         }
 173       } else {
 174         NBC_Error("error in dict_insert() (%i)", res);
 175         free (args);
 176       }
 177     }
 178   } else {
 179     /* found schedule */
 180     schedule = found->schedule;
 181     OBJ_RETAIN(schedule);
 182   }
 183 #endif
 184 
 185   res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, NULL);
 186   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 187     OBJ_RELEASE(schedule);
 188     return res;
 189   }
 190 
 191   return OMPI_SUCCESS;
 192 }
 193 
 194 int ompi_coll_libnbc_ibcast(void *buffer, int count, MPI_Datatype datatype, int root,
 195                             struct ompi_communicator_t *comm, ompi_request_t ** request,
 196                             struct mca_coll_base_module_2_3_0_t *module)
 197 {
 198     int res = nbc_bcast_init(buffer, count, datatype, root,
 199                              comm, request, module, false);
 200     if (OPAL_LIKELY(OMPI_SUCCESS != res)) {
 201         return res;
 202     }
 203     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 204     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 205         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 206         *request = &ompi_request_null.request;
 207         return res;
 208     }
 209 
 210     return OMPI_SUCCESS;
 211 }
 212 
 213 /* better binomial bcast
 214  * working principle:
 215  * - each node gets a virtual rank vrank
 216  * - the 'root' node get vrank 0
 217  * - node 0 gets the vrank of the 'root'
 218  * - all other ranks stay identical (they do not matter)
 219  *
 220  * Algorithm:
 221  * - each node with vrank > 2^r and vrank < 2^r+1 receives from node
 222  *   vrank - 2^r (vrank=1 receives from 0, vrank 0 receives never)
 223  * - each node sends each round r to node vrank + 2^r
 224  * - a node stops to send if 2^r > commsize
 225  */
 226 #define RANK2VRANK(rank, vrank, root) \
 227 { \
 228   vrank = rank; \
 229   if (rank == 0) vrank = root; \
 230   if (rank == root) vrank = 0; \
 231 }
 232 #define VRANK2RANK(rank, vrank, root) \
 233 { \
 234   rank = vrank; \
 235   if (vrank == 0) rank = root; \
 236   if (vrank == root) rank = 0; \
 237 }
 238 static inline int bcast_sched_binomial(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype) {
 239   int maxr, vrank, peer, res;
 240 
 241   maxr = (int)ceil((log((double)p)/LOG2));
 242 
 243   RANK2VRANK(rank, vrank, root);
 244 
 245   /* receive from the right hosts  */
 246   if (vrank != 0) {
 247     for (int r = 0 ; r < maxr ; ++r) {
 248       if ((vrank >= (1 << r)) && (vrank < (1 << (r + 1)))) {
 249         VRANK2RANK(peer, vrank - (1 << r), root);
 250         res = NBC_Sched_recv (buffer, false, count, datatype, peer, schedule, false);
 251         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 252           return res;
 253         }
 254       }
 255     }
 256 
 257     res = NBC_Sched_barrier (schedule);
 258     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 259       return res;
 260     }
 261   }
 262 
 263   /* now send to the right hosts */
 264   for (int r = 0 ; r < maxr ; ++r) {
 265     if (((vrank + (1 << r) < p) && (vrank < (1 << r))) || (vrank == 0)) {
 266       VRANK2RANK(peer, vrank + (1 << r), root);
 267       res = NBC_Sched_send (buffer, false, count, datatype, peer, schedule, false);
 268       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 269         return res;
 270       }
 271     }
 272   }
 273 
 274   return OMPI_SUCCESS;
 275 }
 276 
 277 /* simple linear MPI_Ibcast */
 278 static inline int bcast_sched_linear(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype) {
 279   int res;
 280 
 281   /* send to all others */
 282   if(rank == root) {
 283     for (int peer = 0 ; peer < p ; ++peer) {
 284       if (peer != root) {
 285         /* send msg to peer */
 286         res = NBC_Sched_send (buffer, false, count, datatype, peer, schedule, false);
 287         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 288           return res;
 289         }
 290       }
 291     }
 292   } else {
 293     /* recv msg from root */
 294     res = NBC_Sched_recv (buffer, false, count, datatype, root, schedule, false);
 295     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 296       return res;
 297     }
 298   }
 299 
 300   return OMPI_SUCCESS;
 301 }
 302 
 303 /* simple chained MPI_Ibcast */
 304 static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype, int fragsize, size_t size) {
 305   int res, vrank, rpeer, speer, numfrag, fragcount, thiscount;
 306   MPI_Aint ext;
 307   char *buf;
 308 
 309   RANK2VRANK(rank, vrank, root);
 310   VRANK2RANK(rpeer, vrank-1, root);
 311   VRANK2RANK(speer, vrank+1, root);
 312   res = ompi_datatype_type_extent(datatype, &ext);
 313   if (MPI_SUCCESS != res) {
 314     NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
 315     return res;
 316   }
 317 
 318   if (count == 0) {
 319     return OMPI_SUCCESS;
 320   }
 321 
 322   numfrag = count * size/fragsize;
 323   if ((count * size) % fragsize != 0) {
 324     numfrag++;
 325   }
 326 
 327   fragcount = count/numfrag;
 328 
 329   for (int fragnum = 0 ; fragnum < numfrag ; ++fragnum) {
 330     buf = (char *) buffer + fragnum * fragcount * ext;
 331     thiscount = fragcount;
 332     if (fragnum == numfrag-1) {
 333       /* last fragment may not be full */
 334       thiscount = count - fragcount * fragnum;
 335     }
 336 
 337     /* root does not receive */
 338     if (vrank != 0) {
 339       res = NBC_Sched_recv (buf, false, thiscount, datatype, rpeer, schedule, true);
 340       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 341         return res;
 342       }
 343     }
 344 
 345     /* last rank does not send */
 346     if (vrank != p-1) {
 347       res = NBC_Sched_send (buf, false, thiscount, datatype, speer, schedule, false);
 348       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 349         return res;
 350       }
 351 
 352       /* this barrier here seems awaward but isn't!!!! */
 353       if (vrank == 0)  {
 354         res = NBC_Sched_barrier (schedule);
 355         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 356           return res;
 357         }
 358       }
 359     }
 360   }
 361 
 362   return OMPI_SUCCESS;
 363 }
 364 
 365 /*
 366  * bcast_sched_knomial:
 367  *
 368  * Description: an implementation of Ibcast using k-nomial tree algorithm
 369  *
 370  * Time: (radix - 1)O(log_{radix}(comm_size))
 371  * Schedule length (rounds): O(log(comm_size))
 372  */
 373 static inline int bcast_sched_knomial(
 374     int rank, int comm_size, int root, NBC_Schedule *schedule, void *buf,
 375     int count, MPI_Datatype datatype, int knomial_radix)
 376 {
 377     int res = OMPI_SUCCESS;
 378 
 379     /* Receive from parent */
 380     int vrank = (rank - root + comm_size) % comm_size;
 381     int mask = 0x1;
 382     while (mask < comm_size) {
 383         if (vrank % (knomial_radix * mask)) {
 384             int parent = vrank / (knomial_radix * mask) * (knomial_radix * mask);
 385             parent = (parent + root) % comm_size;
 386             res = NBC_Sched_recv(buf, false, count, datatype, parent, schedule, true);
 387             if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 388             break;
 389         }
 390         mask *= knomial_radix;
 391     }
 392     mask /= knomial_radix;
 393 
 394     /* Send data to all children */
 395     while (mask > 0) {
 396         for (int r = 1; r < knomial_radix; r++) {
 397             int child = vrank + mask * r;
 398             if (child < comm_size) {
 399                 child = (child + root) % comm_size;
 400                 res = NBC_Sched_send(buf, false, count, datatype, child, schedule, false);
 401                 if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; }
 402             }
 403         }
 404         mask /= knomial_radix;
 405     }
 406 
 407 cleanup_and_return:
 408     return res;
 409 }
 410 
 411 static int nbc_bcast_inter_init(void *buffer, int count, MPI_Datatype datatype, int root,
 412                                 struct ompi_communicator_t *comm, ompi_request_t ** request,
 413                                 struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
 414   int res;
 415   NBC_Schedule *schedule;
 416   ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
 417 
 418   schedule = OBJ_NEW(NBC_Schedule);
 419   if (OPAL_UNLIKELY(NULL == schedule)) {
 420     return OMPI_ERR_OUT_OF_RESOURCE;
 421   }
 422 
 423   if (root != MPI_PROC_NULL) {
 424     /* send to all others */
 425     if (root == MPI_ROOT) {
 426       int remsize;
 427 
 428       remsize = ompi_comm_remote_size (comm);
 429 
 430       for (int peer = 0 ; peer < remsize ; ++peer) {
 431         /* send msg to peer */
 432         res = NBC_Sched_send (buffer, false, count, datatype, peer, schedule, false);
 433         if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 434           OBJ_RELEASE(schedule);
 435           return res;
 436         }
 437       }
 438     } else {
 439       /* recv msg from root */
 440       res = NBC_Sched_recv (buffer, false, count, datatype, root, schedule, false);
 441       if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 442         OBJ_RELEASE(schedule);
 443         return res;
 444       }
 445     }
 446   }
 447 
 448   res = NBC_Sched_commit (schedule);
 449   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 450     OBJ_RELEASE(schedule);
 451     return res;
 452   }
 453 
 454   res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, NULL);
 455   if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 456     OBJ_RELEASE(schedule);
 457     return res;
 458   }
 459 
 460   return OMPI_SUCCESS;
 461 }
 462 
 463 int ompi_coll_libnbc_ibcast_inter(void *buffer, int count, MPI_Datatype datatype, int root,
 464                                   struct ompi_communicator_t *comm, ompi_request_t ** request,
 465                                   struct mca_coll_base_module_2_3_0_t *module) {
 466     int res = nbc_bcast_inter_init(buffer, count, datatype, root,
 467                                    comm, request, module, false);
 468     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 469         return res;
 470     }
 471   
 472     res = NBC_Start(*(ompi_coll_libnbc_request_t **)request);
 473     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 474         NBC_Return_handle (*(ompi_coll_libnbc_request_t **)request);
 475         *request = &ompi_request_null.request;
 476         return res;
 477     }
 478 
 479     return OMPI_SUCCESS;
 480 }
 481 
 482 int ompi_coll_libnbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int root,
 483                                 struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 484                                 struct mca_coll_base_module_2_3_0_t *module) {
 485     int res = nbc_bcast_init(buffer, count, datatype, root,
 486                              comm, request, module, true);
 487     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 488         return res;
 489     }
 490 
 491     return OMPI_SUCCESS;
 492 }
 493 
 494 int ompi_coll_libnbc_bcast_inter_init(void *buffer, int count, MPI_Datatype datatype, int root,
 495                                       struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request,
 496                                       struct mca_coll_base_module_2_3_0_t *module) {
 497     int res = nbc_bcast_inter_init(buffer, count, datatype, root,
 498                                    comm, request, module, true);
 499     if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
 500         return res;
 501     }
 502 
 503     return OMPI_SUCCESS;
 504 }

/* [<][>][^][v][top][bottom][index][help] */