root/orte/mca/state/novm/state_novm.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. init
  2. finalize
  3. allocation_complete
  4. map_complete
  5. vm_ready

   1 /*
   2  * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
   3  *                         All rights reserved.
   4  * Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
   5  * $COPYRIGHT$
   6  *
   7  * Additional copyrights may follow
   8  *
   9  * $HEADER$
  10  */
  11 
  12 #include "orte_config.h"
  13 
  14 #include <sys/types.h>
  15 #ifdef HAVE_UNISTD_H
  16 #include <unistd.h>
  17 #endif  /* HAVE_UNISTD_H */
  18 #include <string.h>
  19 
  20 #include "opal/util/output.h"
  21 
  22 #include "orte/mca/errmgr/errmgr.h"
  23 #include "orte/mca/iof/iof.h"
  24 #include "orte/mca/plm/base/base.h"
  25 #include "orte/mca/ras/base/base.h"
  26 #include "orte/mca/rmaps/base/base.h"
  27 #include "orte/mca/routed/routed.h"
  28 #include "orte/util/session_dir.h"
  29 #include "orte/util/threads.h"
  30 #include "orte/runtime/orte_quit.h"
  31 
  32 #include "orte/mca/state/state.h"
  33 #include "orte/mca/state/base/base.h"
  34 #include "orte/mca/state/base/state_private.h"
  35 #include "state_novm.h"
  36 
  37 /*
  38  * Module functions: Global
  39  */
  40 static int init(void);
  41 static int finalize(void);
  42 
  43 /******************
  44  * NOVM module - just uses base functions after
  45  * initializing the proc state machine. Job state
  46  * machine is unused by application procs at this
  47  * time.
  48  ******************/
  49 orte_state_base_module_t orte_state_novm_module = {
  50     init,
  51     finalize,
  52     orte_state_base_activate_job_state,
  53     orte_state_base_add_job_state,
  54     orte_state_base_set_job_state_callback,
  55     orte_state_base_set_job_state_priority,
  56     orte_state_base_remove_job_state,
  57     orte_state_base_activate_proc_state,
  58     orte_state_base_add_proc_state,
  59     orte_state_base_set_proc_state_callback,
  60     orte_state_base_set_proc_state_priority,
  61     orte_state_base_remove_proc_state
  62 };
  63 
  64 static void allocation_complete(int fd, short args, void *cbdata);
  65 static void map_complete(int fd, short args, void *cbdata);
  66 static void vm_ready(int fd, short args, void *cbdata);
  67 
  68 /* defined state machine sequence for no VM - individual
  69  * plm's must add a state for launching daemons
  70  */
  71 static orte_job_state_t launch_states[] = {
  72     ORTE_JOB_STATE_INIT,
  73     ORTE_JOB_STATE_INIT_COMPLETE,
  74     ORTE_JOB_STATE_ALLOCATE,
  75     ORTE_JOB_STATE_ALLOCATION_COMPLETE,
  76     ORTE_JOB_STATE_DAEMONS_LAUNCHED,
  77     ORTE_JOB_STATE_DAEMONS_REPORTED,
  78     ORTE_JOB_STATE_VM_READY,
  79     ORTE_JOB_STATE_MAP,
  80     ORTE_JOB_STATE_MAP_COMPLETE,
  81     ORTE_JOB_STATE_SYSTEM_PREP,
  82     ORTE_JOB_STATE_LAUNCH_APPS,
  83     ORTE_JOB_STATE_SEND_LAUNCH_MSG,
  84     ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
  85     ORTE_JOB_STATE_RUNNING,
  86     ORTE_JOB_STATE_REGISTERED,
  87     /* termination states */
  88     ORTE_JOB_STATE_TERMINATED,
  89     ORTE_JOB_STATE_NOTIFY_COMPLETED,
  90     ORTE_JOB_STATE_ALL_JOBS_COMPLETE,
  91     ORTE_JOB_STATE_DAEMONS_TERMINATED
  92 };
  93 static orte_state_cbfunc_t launch_callbacks[] = {
  94     orte_plm_base_setup_job,
  95     orte_plm_base_setup_job_complete,
  96     orte_ras_base_allocate,
  97     allocation_complete,
  98     orte_plm_base_daemons_launched,
  99     orte_plm_base_daemons_reported,
 100     vm_ready,
 101     orte_rmaps_base_map_job,
 102     map_complete,
 103     orte_plm_base_complete_setup,
 104     orte_plm_base_launch_apps,
 105     orte_plm_base_send_launch_msg,
 106     orte_state_base_local_launch_complete,
 107     orte_plm_base_post_launch,
 108     orte_plm_base_registered,
 109     orte_state_base_check_all_complete,
 110     orte_state_base_cleanup_job,
 111     orte_quit,
 112     orte_quit
 113 };
 114 
 115 static orte_proc_state_t proc_states[] = {
 116     ORTE_PROC_STATE_RUNNING,
 117     ORTE_PROC_STATE_REGISTERED,
 118     ORTE_PROC_STATE_IOF_COMPLETE,
 119     ORTE_PROC_STATE_WAITPID_FIRED,
 120     ORTE_PROC_STATE_TERMINATED
 121 };
 122 static orte_state_cbfunc_t proc_callbacks[] = {
 123     orte_state_base_track_procs,
 124     orte_state_base_track_procs,
 125     orte_state_base_track_procs,
 126     orte_state_base_track_procs,
 127     orte_state_base_track_procs
 128 };
 129 
 130 /************************
 131  * API Definitions
 132  ************************/
 133 static int init(void)
 134 {
 135     int i, rc;
 136     int num_states;
 137 
 138     /* setup the state machines */
 139     OBJ_CONSTRUCT(&orte_job_states, opal_list_t);
 140     OBJ_CONSTRUCT(&orte_proc_states, opal_list_t);
 141 
 142     /* setup the job state machine */
 143     num_states = sizeof(launch_states) / sizeof(orte_job_state_t);
 144     for (i=0; i < num_states; i++) {
 145         if (ORTE_SUCCESS != (rc = orte_state.add_job_state(launch_states[i],
 146                                                            launch_callbacks[i],
 147                                                            ORTE_SYS_PRI))) {
 148             ORTE_ERROR_LOG(rc);
 149         }
 150     }
 151     /* add a default error response */
 152     if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT,
 153                                                        orte_quit, ORTE_ERROR_PRI))) {
 154         ORTE_ERROR_LOG(rc);
 155     }
 156     /* add callback to report progress, if requested */
 157     if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_REPORT_PROGRESS,
 158                                                        orte_state_base_report_progress, ORTE_ERROR_PRI))) {
 159         ORTE_ERROR_LOG(rc);
 160     }
 161     if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) {
 162         orte_state_base_print_job_state_machine();
 163     }
 164 
 165     /* populate the proc state machine to allow us to
 166      * track proc lifecycle changes
 167      */
 168     num_states = sizeof(proc_states) / sizeof(orte_proc_state_t);
 169     for (i=0; i < num_states; i++) {
 170         if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i],
 171                                                             proc_callbacks[i],
 172                                                             ORTE_SYS_PRI))) {
 173             ORTE_ERROR_LOG(rc);
 174         }
 175     }
 176     if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) {
 177         orte_state_base_print_proc_state_machine();
 178     }
 179 
 180     return ORTE_SUCCESS;
 181 }
 182 
 183 static int finalize(void)
 184 {
 185     opal_list_item_t *item;
 186 
 187     /* cleanup the proc state machine */
 188     while (NULL != (item = opal_list_remove_first(&orte_proc_states))) {
 189         OBJ_RELEASE(item);
 190     }
 191     OBJ_DESTRUCT(&orte_proc_states);
 192 
 193     return ORTE_SUCCESS;
 194 }
 195 
 196 /* after we allocate, we need to map the processes
 197  * so we know what nodes will be used
 198  */
 199 static void allocation_complete(int fd, short args, void *cbdata)
 200 {
 201     orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
 202     orte_job_t *jdata;
 203     orte_job_t *daemons;
 204     orte_topology_t *t;
 205     orte_node_t *node;
 206     int i;
 207 
 208     ORTE_ACQUIRE_OBJECT(caddy);
 209     jdata = state->jdata;
 210 
 211     jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
 212 
 213     /* get the daemon job object */
 214     if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
 215         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 216         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 217         goto done;
 218     }
 219     /* mark that we are not using a VM */
 220     orte_set_attribute(&daemons->attributes, ORTE_JOB_NO_VM, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
 221 
 222     /* ensure that all nodes point to our topology - we
 223      * cannot support hetero nodes with this state machine
 224      */
 225     t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
 226     for (i=1; i < orte_node_pool->size; i++) {
 227         if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
 228             continue;
 229         }
 230         node->topology = t;
 231     }
 232     if (!orte_managed_allocation) {
 233         if (NULL != orte_set_slots &&
 234             0 != strncmp(orte_set_slots, "none", strlen(orte_set_slots))) {
 235             for (i=0; i < orte_node_pool->size; i++) {
 236                 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
 237                     continue;
 238                 }
 239                 if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
 240                     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 241                                          "%s plm:base:setting slots for node %s by %s",
 242                                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, orte_set_slots));
 243                     orte_plm_base_set_slots(node);
 244                 }
 245             }
 246         }
 247     }
 248 
 249     /* move to the map stage */
 250     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
 251 
 252  done:
 253     /* cleanup */
 254     OBJ_RELEASE(state);
 255 }
 256 
 257 /* after we map, we are ready to launch the daemons */
 258 static void map_complete(int fd, short args, void *cbdata)
 259 {
 260     orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
 261     orte_job_t *jdata;
 262 
 263     ORTE_ACQUIRE_OBJECT(caddy);
 264     jdata = state->jdata;
 265 
 266     jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
 267     /* move to the map stage */
 268     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
 269 
 270     /* cleanup */
 271     OBJ_RELEASE(state);
 272 }
 273 
 274 static void vm_ready(int fd, short args, void *cbdata)
 275 {
 276     orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
 277     orte_job_t *jdata;
 278 
 279     ORTE_ACQUIRE_OBJECT(caddy);
 280     jdata = state->jdata;
 281 
 282     /* now that the daemons are launched, we are ready
 283      * to roll
 284      */
 285     jdata->state = ORTE_JOB_STATE_VM_READY;
 286     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP);
 287 
 288     OBJ_RELEASE(state);
 289 }

/* [<][>][^][v][top][bottom][index][help] */