This source file includes the following definitions.
- init
- finalize
- delete_route
- update_route
- get_route
- route_lost
- route_is_defined
- set_lifeline
- radix_tree
- update_routing_plan
- get_routing_list
- num_routes
- radix_ft_event
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 #include "orte_config.h"
  18 #include "orte/constants.h"
  19 
  20 #include <stddef.h>
  21 
  22 #include "opal/dss/dss.h"
  23 #include "opal/class/opal_hash_table.h"
  24 #include "opal/class/opal_bitmap.h"
  25 #include "opal/runtime/opal_progress.h"
  26 #include "opal/util/output.h"
  27 
  28 #include "orte/mca/errmgr/errmgr.h"
  29 #include "orte/mca/ess/ess.h"
  30 #include "orte/mca/rml/rml.h"
  31 #include "orte/mca/rml/rml_types.h"
  32 #include "orte/util/name_fns.h"
  33 #include "orte/runtime/orte_globals.h"
  34 #include "orte/runtime/orte_wait.h"
  35 #include "orte/runtime/runtime.h"
  36 #include "orte/runtime/data_type_support/orte_dt_support.h"
  37 
  38 #include "orte/mca/rml/base/rml_contact.h"
  39 
  40 #include "orte/mca/routed/base/base.h"
  41 #include "routed_radix.h"
  42 
  43 
  44 static int init(void);
  45 static int finalize(void);
  46 static int delete_route(orte_process_name_t *proc);
  47 static int update_route(orte_process_name_t *target,
  48                         orte_process_name_t *route);
  49 static orte_process_name_t get_route(orte_process_name_t *target);
  50 static int route_lost(const orte_process_name_t *route);
  51 static bool route_is_defined(const orte_process_name_t *target);
  52 static void update_routing_plan(void);
  53 static void get_routing_list(opal_list_t *coll);
  54 static int set_lifeline(orte_process_name_t *proc);
  55 static size_t num_routes(void);
  56 
  57 #if OPAL_ENABLE_FT_CR == 1
  58 static int radix_ft_event(int state);
  59 #endif
  60 
  61 orte_routed_module_t orte_routed_radix_module = {
  62     .initialize = init,
  63     .finalize = finalize,
  64     .delete_route = delete_route,
  65     .update_route = update_route,
  66     .get_route = get_route,
  67     .route_lost = route_lost,
  68     .route_is_defined = route_is_defined,
  69     .set_lifeline = set_lifeline,
  70     .update_routing_plan = update_routing_plan,
  71     .get_routing_list = get_routing_list,
  72     .num_routes = num_routes,
  73 #if OPAL_ENABLE_FT_CR == 1
  74     .ft_event = radix_ft_event
  75 #else
  76     NULL
  77 #endif
  78 };
  79 
  80 
  81 static orte_process_name_t      *lifeline=NULL;
  82 static orte_process_name_t      local_lifeline;
  83 static int                      num_children;
  84 static opal_list_t              my_children;
  85 static bool                     hnp_direct=true;
  86 
  87 static int init(void)
  88 {
  89     lifeline = NULL;
  90 
  91     if (ORTE_PROC_IS_DAEMON) {
  92         
  93         if (orte_static_ports) {
  94             lifeline = ORTE_PROC_MY_PARENT;
  95         } else {
  96             
  97             lifeline = ORTE_PROC_MY_HNP;
  98         }
  99         ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid;
 100     } else if (ORTE_PROC_IS_APP) {
 101         
 102 
 103         if (NULL == orte_process_info.my_daemon_uri) {
 104             return ORTE_ERR_TAKE_NEXT_OPTION;
 105         }
 106         
 107         lifeline = ORTE_PROC_MY_DAEMON;
 108         orte_routing_is_enabled = true;
 109     }
 110 
 111     
 112     OBJ_CONSTRUCT(&my_children, opal_list_t);
 113     num_children = 0;
 114 
 115     return ORTE_SUCCESS;
 116 }
 117 
 118 static int finalize(void)
 119 {
 120     opal_list_item_t *item;
 121 
 122     lifeline = NULL;
 123 
 124     
 125     while (NULL != (item = opal_list_remove_first(&my_children))) {
 126         OBJ_RELEASE(item);
 127     }
 128     OBJ_DESTRUCT(&my_children);
 129     num_children = 0;
 130 
 131     return ORTE_SUCCESS;
 132 }
 133 
 134 static int delete_route(orte_process_name_t *proc)
 135 {
 136     if (proc->jobid == ORTE_JOBID_INVALID ||
 137         proc->vpid == ORTE_VPID_INVALID) {
 138         return ORTE_ERR_BAD_PARAM;
 139     }
 140 
 141     
 142 
 143 
 144     if (!ORTE_PROC_IS_HNP && !ORTE_PROC_IS_DAEMON &&
 145         !ORTE_PROC_IS_TOOL) {
 146         return ORTE_SUCCESS;
 147     }
 148 
 149     OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
 150                          "%s routed_radix_delete_route for %s",
 151                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 152                          ORTE_NAME_PRINT(proc)));
 153 
 154 
 155     
 156 
 157 
 158 
 159 
 160     return ORTE_SUCCESS;
 161 }
 162 
 163 static int update_route(orte_process_name_t *target,
 164                         orte_process_name_t *route)
 165 {
 166     if (target->jobid == ORTE_JOBID_INVALID ||
 167         target->vpid == ORTE_VPID_INVALID) {
 168         return ORTE_ERR_BAD_PARAM;
 169     }
 170 
 171     
 172 
 173 
 174     if (ORTE_PROC_IS_APP) {
 175         return ORTE_SUCCESS;
 176     }
 177 
 178     OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
 179                          "%s routed_radix_update: %s --> %s",
 180                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 181                          ORTE_NAME_PRINT(target),
 182                          ORTE_NAME_PRINT(route)));
 183 
 184 
 185     
 186 
 187 
 188 
 189     if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) &&
 190         OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, route)) {
 191         hnp_direct = false;
 192         return ORTE_SUCCESS;
 193     }
 194 
 195     return ORTE_SUCCESS;
 196 }
 197 
 198 
 199 static orte_process_name_t get_route(orte_process_name_t *target)
 200 {
 201     orte_process_name_t *ret, daemon;
 202     opal_list_item_t *item;
 203     orte_routed_tree_t *child;
 204 
 205     if (!orte_routing_is_enabled) {
 206         ret = target;
 207         goto found;
 208     }
 209 
 210     
 211     daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
 212     daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;
 213 
 214     if (target->jobid == ORTE_JOBID_INVALID ||
 215         target->vpid == ORTE_VPID_INVALID) {
 216         ret = ORTE_NAME_INVALID;
 217         goto found;
 218     }
 219 
 220     
 221     if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
 222         ret = target;
 223         goto found;
 224     }
 225 
 226     
 227 
 228 
 229     if (ORTE_PROC_IS_TOOL) {
 230         if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
 231             ret = target;
 232             goto found;
 233         } else {
 234             ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid);
 235             ret = &daemon;
 236             goto found;
 237         }
 238     }
 239 
 240     
 241     
 242 
 243 
 244     if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
 245         if (!hnp_direct || orte_static_ports) {
 246             OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 247                                  "%s routing to the HNP through my parent %s",
 248                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 249                                  ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
 250             ret = ORTE_PROC_MY_PARENT;
 251             goto found;
 252         } else {
 253             OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 254                                  "%s routing direct to the HNP",
 255                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 256             ret = ORTE_PROC_MY_HNP;
 257             goto found;
 258         }
 259     }
 260 
 261     
 262     if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_PARENT, target)) {
 263         ret = ORTE_PROC_MY_PARENT;
 264         goto found;
 265     }
 266 
 267     
 268 
 269     if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
 270         ret = target;
 271         goto found;
 272     }
 273 
 274     daemon.jobid = ORTE_PROC_MY_NAME->jobid;
 275     
 276     if (ORTE_PROC_MY_NAME->jobid == target->jobid) {
 277         
 278         daemon.vpid = target->vpid;
 279     } else {
 280         if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
 281             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 282             ret = ORTE_NAME_INVALID;
 283             goto found;
 284         }
 285     }
 286 
 287     
 288     if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
 289         ret = target;
 290         goto found;
 291     } else {
 292         
 293         for (item = opal_list_get_first(&my_children);
 294              item != opal_list_get_end(&my_children);
 295              item = opal_list_get_next(item)) {
 296             child = (orte_routed_tree_t*)item;
 297             if (child->vpid == daemon.vpid) {
 298                 
 299                 ret = &daemon;
 300                 goto found;
 301             }
 302             
 303             if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
 304                 
 305                 daemon.vpid = child->vpid;
 306                 ret = &daemon;
 307                 goto found;
 308             }
 309         }
 310     }
 311 
 312     
 313 
 314 
 315     daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
 316 
 317     ret = &daemon;
 318 
 319 found:
 320     OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
 321                          "%s routed_radix_get(%s) --> %s",
 322                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 323                          ORTE_NAME_PRINT(target),
 324                          ORTE_NAME_PRINT(ret)));
 325 
 326     return *ret;
 327 }
 328 
 329 static int route_lost(const orte_process_name_t *route)
 330 {
 331     opal_list_item_t *item;
 332     orte_routed_tree_t *child;
 333 
 334     OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 335                          "%s route to %s lost",
 336                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 337                          ORTE_NAME_PRINT(route)));
 338 
 339     
 340 
 341 
 342 
 343 
 344     if (!orte_finalizing &&
 345         NULL != lifeline &&
 346         OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
 347         OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 348                              "%s routed:radix: Connection to lifeline %s lost",
 349                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 350                              ORTE_NAME_PRINT(lifeline)));
 351         return ORTE_ERR_FATAL;
 352     }
 353 
 354     
 355 
 356 
 357     if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) &&
 358         route->jobid == ORTE_PROC_MY_NAME->jobid) {
 359         for (item = opal_list_get_first(&my_children);
 360              item != opal_list_get_end(&my_children);
 361              item = opal_list_get_next(item)) {
 362             child = (orte_routed_tree_t*)item;
 363             if (child->vpid == route->vpid) {
 364                 opal_list_remove_item(&my_children, item);
 365                 OBJ_RELEASE(item);
 366                 return ORTE_SUCCESS;
 367             }
 368         }
 369     }
 370 
 371     
 372     return ORTE_SUCCESS;
 373 }
 374 
 375 static bool route_is_defined(const orte_process_name_t *target)
 376 {
 377     
 378     if (ORTE_VPID_INVALID == orte_get_proc_daemon_vpid((orte_process_name_t*)target)) {
 379         return false;
 380     }
 381 
 382     return true;
 383 }
 384 
 385 static int set_lifeline(orte_process_name_t *proc)
 386 {
 387     
 388 
 389 
 390     local_lifeline.jobid = proc->jobid;
 391     local_lifeline.vpid = proc->vpid;
 392     lifeline = &local_lifeline;
 393 
 394     return ORTE_SUCCESS;
 395 }
 396 
 397 static void radix_tree(int rank, int *num_children,
 398                        opal_list_t *children, opal_bitmap_t *relatives)
 399 {
 400     int i, peer, Sum, NInLevel;
 401     orte_routed_tree_t *child;
 402     opal_bitmap_t *relations;
 403 
 404     
 405     Sum=1;
 406     NInLevel=1;
 407 
 408     while ( Sum < (rank+1) ) {
 409         NInLevel *= mca_routed_radix_component.radix;
 410         Sum += NInLevel;
 411     }
 412 
 413     
 414     peer = rank + NInLevel;
 415     for (i = 0; i < mca_routed_radix_component.radix; i++) {
 416         if (peer < (int)orte_process_info.num_procs) {
 417             child = OBJ_NEW(orte_routed_tree_t);
 418             child->vpid = peer;
 419             if (NULL != children) {
 420                 
 421                 opal_list_append(children, &child->super);
 422                 (*num_children)++;
 423                 
 424                 opal_bitmap_init(&child->relatives, orte_process_info.num_procs);
 425                 
 426                 relations = &child->relatives;
 427             } else {
 428                 
 429                 if (OPAL_SUCCESS != opal_bitmap_set_bit(relatives, peer)) {
 430                     opal_output(0, "%s Error: could not set relations bit!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
 431                 }
 432                 
 433                 relations = relatives;
 434                 OBJ_RELEASE(child);
 435             }
 436             
 437             radix_tree(peer, NULL, NULL, relations);
 438         }
 439         peer += NInLevel;
 440     }
 441 }
 442 
 443 static void update_routing_plan(void)
 444 {
 445     orte_routed_tree_t *child;
 446     int j;
 447     opal_list_item_t *item;
 448     int Level,Sum,NInLevel,Ii;
 449     int NInPrevLevel;
 450 
 451     
 452 
 453 
 454     if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) {
 455         return;
 456     }
 457 
 458     
 459     while (NULL != (item = opal_list_remove_first(&my_children))) {
 460         OBJ_RELEASE(item);
 461     }
 462     num_children = 0;
 463 
 464     
 465     Ii =  ORTE_PROC_MY_NAME->vpid;
 466     Level=0;
 467     Sum=1;
 468     NInLevel=1;
 469 
 470     while ( Sum < (Ii+1) ) {
 471         Level++;
 472         NInLevel *= mca_routed_radix_component.radix;
 473         Sum += NInLevel;
 474     }
 475     Sum -= NInLevel;
 476 
 477     NInPrevLevel = NInLevel/mca_routed_radix_component.radix;
 478 
 479     if( 0 == Ii ) {
 480         ORTE_PROC_MY_PARENT->vpid = -1;
 481     }  else {
 482         ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel;
 483         ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel);
 484     }
 485 
 486     
 487 
 488 
 489     radix_tree(Ii, &num_children, &my_children, NULL);
 490 
 491     if (0 < opal_output_get_verbosity(orte_routed_base_framework.framework_output)) {
 492         opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children);
 493         for (item = opal_list_get_first(&my_children);
 494              item != opal_list_get_end(&my_children);
 495              item = opal_list_get_next(item)) {
 496             child = (orte_routed_tree_t*)item;
 497             opal_output(0, "%s: \tchild %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), child->vpid);
 498             for (j=0; j < (int)orte_process_info.num_procs; j++) {
 499                 if (opal_bitmap_is_set_bit(&child->relatives, j)) {
 500                     opal_output(0, "%s: \t\trelation %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
 501                 }
 502             }
 503         }
 504     }
 505 }
 506 
 507 static void get_routing_list(opal_list_t *coll)
 508 {
 509     
 510 
 511 
 512     if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) {
 513         return;
 514     }
 515 
 516     orte_routed_base_xcast_routing(coll, &my_children);
 517 }
 518 
 519 static size_t num_routes(void)
 520 {
 521     return opal_list_get_size(&my_children);
 522 }
 523 
 524 #if OPAL_ENABLE_FT_CR == 1
 525 static int radix_ft_event(int state)
 526 {
 527     int ret, exit_status = ORTE_SUCCESS;
 528 
 529     
 530     if(OPAL_CRS_CHECKPOINT == state) {
 531     }
 532     
 533     else if (OPAL_CRS_CONTINUE == state ) {
 534     }
 535     else if (OPAL_CRS_TERM == state ) {
 536         
 537     }
 538     else {
 539         
 540     }
 541 
 542  cleanup:
 543     return exit_status;
 544 }
 545 #endif