This source file includes the following definitions.
- plm_lsf_init
- plm_lsf_launch_job
- launch_daemons
- plm_lsf_terminate_orteds
- plm_lsf_signal_job
- plm_lsf_finalize
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 
  29 
  30 
  31 
  32 #include "orte_config.h"
  33 #include "orte/constants.h"
  34 #include "orte/types.h"
  35 
  36 #include <sys/types.h>
  37 #ifdef HAVE_UNISTD_H
  38 #include <unistd.h>
  39 #endif
  40 #include <signal.h>
  41 #include <stdlib.h>
  42 #ifdef HAVE_SYS_TYPES_H
  43 #include <sys/types.h>
  44 #endif
  45 #ifdef HAVE_SYS_TIME_H
  46 #include <sys/time.h>
  47 #endif
  48 #ifdef HAVE_SYS_STAT_H
  49 #include <sys/stat.h>
  50 #endif
  51 #ifdef HAVE_FCNTL_H
  52 #include <fcntl.h>
  53 #endif
  54 
  55 #define SR1_PJOBS
  56 #include <lsf/lsbatch.h>
  57 
  58 #include "opal/mca/base/base.h"
  59 #include "opal/mca/installdirs/installdirs.h"
  60 #include "opal/util/argv.h"
  61 #include "opal/util/output.h"
  62 #include "opal/util/opal_environ.h"
  63 
  64 #include "orte/util/show_help.h"
  65 #include "orte/runtime/orte_globals.h"
  66 #include "orte/runtime/orte_wait.h"
  67 #include "orte/mca/errmgr/errmgr.h"
  68 #include "orte/mca/rmaps/rmaps.h"
  69 #include "orte/mca/state/state.h"
  70 #include "orte/util/threads.h"
  71 
  72 #include "orte/mca/plm/plm.h"
  73 #include "orte/mca/plm/base/base.h"
  74 #include "orte/mca/plm/base/plm_private.h"
  75 #include "plm_lsf.h"
  76 
  77 
  78 
  79 
  80 
/*
 * Forward declarations of the module entry points defined later in this
 * file. They are needed because the orte_plm_lsf_module table refers to
 * these functions before their definitions appear.
 */
static int plm_lsf_init(void);
static int plm_lsf_launch_job(orte_job_t *jdata);
static int plm_lsf_terminate_orteds(void);
static int plm_lsf_signal_job(orte_jobid_t jobid, int32_t signal);
static int plm_lsf_finalize(void);
  86 
  87 
  88 
  89 
  90 
/*
 * Function table exported for the LSF PLM module.
 *
 * The initializer is positional: every entry must stay in the field order
 * of orte_plm_base_module_t, whose declaration is not in this file.
 * The plm_lsf_* entries are defined in this file; the orte_plm_base_*
 * entries are presumably the shared PLM base implementations (confirm
 * against the PLM base headers).
 */
orte_plm_base_module_t orte_plm_lsf_module = {
    plm_lsf_init,
    orte_plm_base_set_hnp_name,
    plm_lsf_launch_job,
    NULL,   /* no handler is supplied for this slot */
    orte_plm_base_orted_terminate_job,
    plm_lsf_terminate_orteds,
    orte_plm_base_orted_kill_local_procs,
    plm_lsf_signal_job,
    plm_lsf_finalize
};
 102 
/*
 * Handler registered by plm_lsf_init() for the
 * ORTE_JOB_STATE_LAUNCH_DAEMONS job state; cbdata is cast to an
 * orte_state_caddy_t in the definition.
 */
static void launch_daemons(int fd, short args, void *cbdata);
 104 
 105 
 106 
 107 
 108 int plm_lsf_init(void)
 109 {
 110     int rc;
 111 
 112     if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
 113         ORTE_ERROR_LOG(rc);
 114     }
 115 
 116     if (orte_do_not_launch) {
 117         
 118         orte_plm_globals.daemon_nodes_assigned_at_launch = true;
 119     } else {
 120         
 121 
 122 
 123 
 124 
 125 
 126         orte_plm_globals.daemon_nodes_assigned_at_launch = false;
 127     }
 128 
 129     
 130     if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS,
 131                                                        launch_daemons, ORTE_SYS_PRI))) {
 132         ORTE_ERROR_LOG(rc);
 133         return rc;
 134     }
 135 
 136     return rc;
 137 }
 138 
 139 
 140 
 141 
 142 
 143 static int plm_lsf_launch_job(orte_job_t *jdata)
 144 {
 145     if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
 146         
 147         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
 148     } else {
 149         
 150         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT);
 151     }
 152     return ORTE_SUCCESS;
 153 }
 154 
 155 static void launch_daemons(int fd, short args, void *cbdata)
 156 {
 157     orte_job_map_t *map;
 158     size_t num_nodes;
 159     char *param;
 160     char **argv = NULL;
 161     int argc;
 162     int rc;
 163     char** env = NULL;
 164     char **nodelist_argv;
 165     int nodelist_argc;
 166     char *vpid_string;
 167     int i;
 168     char *cur_prefix;
 169     int proc_vpid_index = 0;
 170     bool failed_launch = true;
 171     orte_app_context_t *app;
 172     orte_node_t *node;
 173     orte_std_cntr_t nnode;
 174     orte_job_t *daemons;
 175     orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
 176     orte_job_t *jdata;
 177 
 178     ORTE_ACQUIRE_OBJECT(state);
 179     jdata  = state->jdata;
 180 
 181     
 182     daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
 183     if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
 184         ORTE_ERROR_LOG(rc);
 185         goto cleanup;
 186     }
 187 
 188     
 189 
 190 
 191 
 192     if (orte_do_not_launch) {
 193         
 194 
 195 
 196 
 197         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 198         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
 199         OBJ_RELEASE(state);
 200         return;
 201     }
 202 
 203     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 204                          "%s plm:lsf: launching vm",
 205                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 206 
 207 
 208     
 209     if (NULL == (map = daemons->map)) {
 210         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 211         rc = ORTE_ERR_NOT_FOUND;
 212         goto cleanup;
 213     }
 214 
 215     num_nodes = map->num_new_daemons;
 216     if (0 == num_nodes) {
 217         
 218 
 219 
 220 
 221         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 222                              "%s plm:lsf: no new daemons to launch",
 223                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 224         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 225         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
 226         OBJ_RELEASE(state);
 227         return;
 228     }
 229 
 230     
 231     nodelist_argv = NULL;
 232     nodelist_argc = 0;
 233 
 234     for (nnode=0; nnode < map->nodes->size; nnode++) {
 235         if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
 236             continue;
 237         }
 238         
 239 
 240 
 241         if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
 242             continue;
 243         }
 244 
 245         
 246 
 247 
 248         opal_argv_append(&nodelist_argc, &nodelist_argv, node->name);
 249     }
 250 
 251     
 252 
 253 
 254     argv = NULL;
 255     argc = 0;
 256 
 257     
 258 
 259 
 260 
 261     
 262     orte_plm_base_setup_orted_cmd(&argc, &argv);
 263 
 264 
 265     
 266     orte_plm_base_orted_append_basic_args(&argc, &argv,
 267                                           "lsf",
 268                                           &proc_vpid_index);
 269 
 270     
 271 
 272 
 273     rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start);
 274     if (ORTE_SUCCESS != rc) {
 275         opal_output(0, "plm_lsf: unable to get daemon vpid as string");
 276         goto cleanup;
 277     }
 278     free(argv[proc_vpid_index]);
 279     argv[proc_vpid_index] = strdup(vpid_string);
 280     free(vpid_string);
 281 
 282     
 283     mca_base_cmd_line_wrap_args(argv);
 284 
 285     if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
 286         param = opal_argv_join(argv, ' ');
 287         if (NULL != param) {
 288             opal_output(0, "plm:lsf: final top-level argv:");
 289             opal_output(0, "plm:lsf:     %s", param);
 290             free(param);
 291         }
 292     }
 293 
 294     
 295 
 296 
 297 
 298 
 299 
 300     cur_prefix = NULL;
 301     for (i=0; i < jdata->apps->size; i++) {
 302         char *app_prefix_dir=NULL;
 303         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
 304             continue;
 305         }
 306         if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING) &&
 307             NULL != app_prefix_dir) {
 308             
 309 
 310             if (NULL != cur_prefix &&
 311                 0 != strcmp (cur_prefix, app_prefix_dir)) {
 312                 orte_show_help("help-plm-lsf.txt", "multiple-prefixes",
 313                                true, cur_prefix, app_prefix_dir);
 314                 rc = ORTE_ERR_FAILED_TO_START;
 315                 goto cleanup;
 316             }
 317 
 318             
 319 
 320             if (NULL == cur_prefix) {
 321                 cur_prefix = strdup(app_prefix_dir);
 322                 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 323                                      "%s plm:lsf: Set prefix:%s",
 324                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cur_prefix));
 325             }
 326             free(app_prefix_dir);
 327         }
 328     }
 329 
 330     
 331     env = opal_argv_copy(orte_launch_environ);
 332 
 333     
 334 
 335 
 336 
 337 
 338     orte_wait_disable();
 339 
 340     
 341 
 342 
 343 
 344 
 345 
 346     if ( (rc = lsb_launch(nodelist_argv, argv, LSF_DJOB_REPLACE_ENV | LSF_DJOB_NOWAIT, env)) < 0) {
 347         ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
 348         char *flattened_nodelist = NULL;
 349         flattened_nodelist = opal_argv_join(nodelist_argv, '\n');
 350         orte_show_help("help-plm-lsf.txt", "lsb_launch-failed",
 351                        true, rc, lsberrno, lsb_sysmsg(),
 352                        opal_argv_count(nodelist_argv), flattened_nodelist);
 353         free(flattened_nodelist);
 354         rc = ORTE_ERR_FAILED_TO_START;
 355         orte_wait_enable();  
 356         goto cleanup;
 357     }
 358     orte_wait_enable();  
 359 
 360     
 361     state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 362     daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 363 
 364     
 365     failed_launch = false;
 366 
 367  cleanup:
 368     if (NULL != argv) {
 369         opal_argv_free(argv);
 370     }
 371     if (NULL != env) {
 372         opal_argv_free(env);
 373     }
 374 
 375     
 376     OBJ_RELEASE(state);
 377 
 378     
 379     if (failed_launch) {
 380         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 381     }
 382 }
 383 
 384 
 385 
 386 
 387 
 388 static int plm_lsf_terminate_orteds(void)
 389 {
 390     int rc;
 391 
 392     if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
 393         ORTE_ERROR_LOG(rc);
 394     }
 395 
 396     return rc;
 397 }
 398 
 399 
 400 
 401 
 402 
 403 static int plm_lsf_signal_job(orte_jobid_t jobid, int32_t signal)
 404 {
 405     int rc;
 406 
 407     
 408     if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
 409         ORTE_ERROR_LOG(rc);
 410     }
 411     return rc;
 412 }
 413 
 414 
 415 static int plm_lsf_finalize(void)
 416 {
 417     int rc;
 418 
 419     
 420     if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
 421         ORTE_ERROR_LOG(rc);
 422     }
 423 
 424     return ORTE_SUCCESS;
 425 }