root/orte/mca/routed/direct/routed_direct.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. init
  2. finalize
  3. delete_route
  4. update_route
  5. get_route
  6. route_lost
  7. route_is_defined
  8. set_lifeline
  9. update_routing_plan
  10. get_routing_list
  11. num_routes
  12. direct_ft_event

   1 /*
   2  * Copyright (c) 2007-2011 Los Alamos National Security, LLC.
   3  *                         All rights reserved.
   4  * Copyright (c) 2004-2011 The University of Tennessee and The University
   5  *                         of Tennessee Research Foundation.  All rights
   6  *                         reserved.
   7  * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
   8  * $COPYRIGHT$
   9  *
  10  * Additional copyrights may follow
  11  *
  12  * $HEADER$
  13  */
  14 
  15 #include "orte_config.h"
  16 #include "orte/constants.h"
  17 
  18 #include "opal/dss/dss.h"
  19 #include "opal/util/output.h"
  20 
  21 #include "orte/mca/errmgr/errmgr.h"
  22 #include "orte/mca/rml/rml.h"
  23 #include "orte/util/name_fns.h"
  24 #include "orte/util/proc_info.h"
  25 #include "orte/runtime/orte_globals.h"
  26 #include "orte/runtime/data_type_support/orte_dt_support.h"
  27 #include "orte/runtime/orte_wait.h"
  28 
  29 #include "orte/mca/rml/base/rml_contact.h"
  30 
  31 #include "orte/mca/routed/base/base.h"
  32 #include "routed_direct.h"
  33 
/*
 * Forward declarations of the module entry points. Each of these is
 * installed into the orte_routed_direct_module function table below.
 */
static int init(void);
static int finalize(void);
static int delete_route(orte_process_name_t *proc);
static int update_route(orte_process_name_t *target,
                        orte_process_name_t *route);
static orte_process_name_t get_route(orte_process_name_t *target);
static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static void update_routing_plan(void);
static void get_routing_list(opal_list_t *coll);
static int set_lifeline(orte_process_name_t *proc);
static size_t num_routes(void);

/* the fault-tolerance event hook only exists in FT/CR-enabled builds */
#if OPAL_ENABLE_FT_CR == 1
static int direct_ft_event(int state);
#endif
  50 
/*
 * Function table exported by the "direct" routed module. Every slot
 * points at one of the static functions defined in this file.
 *
 * NOTE(review): when OPAL_ENABLE_FT_CR is not 1, the trailing entry is a
 * positional NULL, which initializes whichever member follows num_routes
 * in orte_routed_module_t (presumably ft_event - confirm against the
 * struct declaration).
 */
orte_routed_module_t orte_routed_direct_module = {
    .initialize = init,
    .finalize = finalize,
    .delete_route = delete_route,
    .update_route = update_route,
    .get_route = get_route,
    .route_lost = route_lost,
    .route_is_defined = route_is_defined,
    .set_lifeline = set_lifeline,
    .update_routing_plan = update_routing_plan,
    .get_routing_list = get_routing_list,
    .num_routes = num_routes,
#if OPAL_ENABLE_FT_CR == 1
    .ft_event = direct_ft_event
#else
    NULL
#endif
};
  69 
/* private copy of the name handed to set_lifeline() */
static orte_process_name_t mylifeline;
/* current lifeline, or NULL when none is set; assigned by init() and
 * set_lifeline(), consulted by route_lost() */
static orte_process_name_t *lifeline = NULL;
/* list of orte_routed_tree_t entries; constructed in init() and
 * repopulated by update_routing_plan() when running as the HNP */
static opal_list_t my_children;
  73 
  74 static int init(void)
  75 {
  76     lifeline = NULL;
  77 
  78     if (ORTE_PROC_IS_DAEMON) {
  79         ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid;
  80         /* if we are using static ports, set my lifeline to point at my parent */
  81         if (orte_static_ports) {
  82             /* we will have been given our parent's vpid by MCA param */
  83             lifeline = ORTE_PROC_MY_PARENT;
  84         } else {
  85             /* set our lifeline to the HNP - we will abort if that connection is lost */
  86             lifeline = ORTE_PROC_MY_HNP;
  87             ORTE_PROC_MY_PARENT->vpid = 0;
  88         }
  89     } else if (ORTE_PROC_IS_APP) {
  90         /* if we don't have a designated daemon, just
  91          * disqualify ourselves */
  92         if (NULL == orte_process_info.my_daemon_uri) {
  93             return ORTE_ERR_TAKE_NEXT_OPTION;
  94         }
  95         /* set our lifeline to the local daemon - we will abort if this connection is lost */
  96         lifeline = ORTE_PROC_MY_DAEMON;
  97         orte_routing_is_enabled = true;
  98     }
  99 
 100     /* setup the list of children */
 101     OBJ_CONSTRUCT(&my_children, opal_list_t);
 102 
 103     return ORTE_SUCCESS;
 104 }
 105 
 106 static int finalize(void)
 107 {
 108     OPAL_LIST_DESTRUCT(&my_children);
 109     return ORTE_SUCCESS;
 110 }
 111 
 112 static int delete_route(orte_process_name_t *proc)
 113 {
 114     OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
 115                          "%s routed_direct_delete_route for %s",
 116                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 117                          ORTE_NAME_PRINT(proc)));
 118 
 119     /*There is nothing to do here */
 120 
 121     return ORTE_SUCCESS;
 122 }
 123 
 124 static int update_route(orte_process_name_t *target,
 125                         orte_process_name_t *route)
 126 {
 127     OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
 128                          "%s routed_direct_update: %s --> %s",
 129                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 130                          ORTE_NAME_PRINT(target),
 131                          ORTE_NAME_PRINT(route)));
 132 
 133     /*There is nothing to do here */
 134 
 135     return ORTE_SUCCESS;
 136 }
 137 
 138 
 139 static orte_process_name_t get_route(orte_process_name_t *target)
 140 {
 141     orte_process_name_t *ret, daemon;
 142 
 143     if (target->jobid == ORTE_JOBID_INVALID ||
 144         target->vpid == ORTE_VPID_INVALID) {
 145         ret = ORTE_NAME_INVALID;
 146         goto found;
 147     }
 148 
 149     /* initialize */
 150     daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
 151     daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;
 152 
 153     if (ORTE_PROC_IS_APP) {
 154         /* if I am an application, AND I have knowledge of
 155          * my daemon (i.e., a daemon launched me), then I
 156          * always route thru the daemon */
 157         if (NULL != orte_process_info.my_daemon_uri) {
 158             ret = ORTE_PROC_MY_DAEMON;
 159         } else {
 160             /* I was direct launched and do not have
 161              * a daemon, so I have to route direct */
 162             ret = target;
 163         }
 164         goto found;
 165     }
 166 
 167     /* if I am a tool, the route is direct if target is in
 168      * my own job family, and to the target's HNP if not
 169      */
 170     if (ORTE_PROC_IS_TOOL) {
 171         if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
 172             ret = target;
 173             goto found;
 174         } else {
 175             ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid);
 176             ret = &daemon;
 177             goto found;
 178         }
 179     }
 180 
 181     /******     HNP AND DAEMONS ONLY     ******/
 182     if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
 183         OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 184                     "%s routing direct to the HNP",
 185                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 186         ret = ORTE_PROC_MY_HNP;
 187         goto found;
 188     }
 189 
 190     daemon.jobid = ORTE_PROC_MY_NAME->jobid;
 191     /* find out what daemon hosts this proc */
 192     if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
 193         ret = ORTE_NAME_INVALID;
 194         goto found;
 195     }
 196 
 197     /* if the daemon is me, then send direct to the target! */
 198     if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
 199         ret = target;
 200         goto found;
 201     }
 202 
 203     /* else route to this daemon directly */
 204     ret = &daemon;
 205 
 206  found:
 207     OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 208                          "%s routed_direct_get(%s) --> %s",
 209                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 210                          ORTE_NAME_PRINT(target),
 211                          ORTE_NAME_PRINT(ret)));
 212 
 213     return *ret;
 214 }
 215 
 216 static int route_lost(const orte_process_name_t *route)
 217 {
 218     opal_list_item_t *item;
 219     orte_routed_tree_t *child;
 220 
 221     OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 222                          "%s route to %s lost",
 223                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 224                          ORTE_NAME_PRINT(route)));
 225 
 226     /* if we lose the connection to the lifeline and we are NOT already,
 227      * in finalize, tell the OOB to abort.
 228      * NOTE: we cannot call abort from here as the OOB needs to first
 229      * release a thread-lock - otherwise, we will hang!!
 230      */
 231     if (!orte_finalizing &&
 232         NULL != lifeline &&
 233         OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
 234         OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 235                              "%s routed:direct: Connection to lifeline %s lost",
 236                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 237                              ORTE_NAME_PRINT(lifeline)));
 238         return ORTE_ERR_FATAL;
 239     }
 240 
 241     /* if we are the HNP, and the route is a daemon,
 242      * see if it is one of our children - if so, remove it
 243      */
 244     if (ORTE_PROC_IS_HNP &&
 245         route->jobid == ORTE_PROC_MY_NAME->jobid) {
 246         for (item = opal_list_get_first(&my_children);
 247              item != opal_list_get_end(&my_children);
 248              item = opal_list_get_next(item)) {
 249             child = (orte_routed_tree_t*)item;
 250             if (child->vpid == route->vpid) {
 251                 opal_list_remove_item(&my_children, item);
 252                 OBJ_RELEASE(item);
 253                 return ORTE_SUCCESS;
 254             }
 255         }
 256     }
 257 
 258     /* we don't care about this one, so return success */
 259     return ORTE_SUCCESS;
 260 }
 261 
 262 
 263 static bool route_is_defined(const orte_process_name_t *target)
 264 {
 265     /* all routes are defined */
 266     return true;
 267 }
 268 
 269 static int set_lifeline(orte_process_name_t *proc)
 270 {
 271     OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 272                          "%s routed:direct: set lifeline to %s",
 273                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 274                          ORTE_NAME_PRINT(proc)));
 275     mylifeline = *proc;
 276     lifeline = &mylifeline;
 277     return ORTE_SUCCESS;
 278 }
 279 
 280 static void update_routing_plan(void)
 281 {
 282     orte_routed_tree_t *child;
 283     int32_t i;
 284     orte_job_t *jdata;
 285     orte_proc_t *proc;
 286 
 287     OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 288                          "%s routed:direct: update routing plan",
 289                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 290 
 291     if (!ORTE_PROC_IS_HNP) {
 292         /* nothing to do */
 293         return;
 294     }
 295 
 296     /* clear the current list */
 297     OPAL_LIST_DESTRUCT(&my_children);
 298     OBJ_CONSTRUCT(&my_children, opal_list_t);
 299 
 300     /* HNP is directly connected to each daemon */
 301     if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
 302         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 303         return;
 304     }
 305     for (i=1; i < jdata->procs->size; i++) {
 306         if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
 307             continue;
 308         }
 309         child = OBJ_NEW(orte_routed_tree_t);
 310         child->vpid = proc->name.vpid;
 311         opal_list_append(&my_children, &child->super);
 312     }
 313 
 314     return;
 315 }
 316 
 317 static void get_routing_list(opal_list_t *coll)
 318 {
 319 
 320     OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
 321                          "%s routed:direct: get routing list",
 322                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 323 
 324     /* if I am anything other than daemons and the HNP, this
 325      * is a meaningless command as I am not allowed to route
 326      */
 327     if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) {
 328         return;
 329     }
 330 
 331     orte_routed_base_xcast_routing(coll, &my_children);
 332 }
 333 
 334 static size_t num_routes(void)
 335 {
 336     if (!ORTE_PROC_IS_HNP) {
 337         return 0;
 338     }
 339     return opal_list_get_size(&my_children);
 340 }
 341 
 342 #if OPAL_ENABLE_FT_CR == 1
 343 static int direct_ft_event(int state)
 344 {
 345     int ret, exit_status = ORTE_SUCCESS;
 346 
 347     /******** Checkpoint Prep ********/
 348     if(OPAL_CRS_CHECKPOINT == state) {
 349     }
 350     /******** Continue Recovery ********/
 351     else if (OPAL_CRS_CONTINUE == state ) {
 352     }
 353     else if (OPAL_CRS_TERM == state ) {
 354         /* Nothing */
 355     }
 356     else {
 357         /* Error state = Nothing */
 358     }
 359 
 360  cleanup:
 361     return exit_status;
 362 }
 363 #endif

/* [<][>][^][v][top][bottom][index][help] */