root/orte/mca/plm/slurm/plm_slurm_module.c

DEFINITIONS

This source file includes the following definitions:
  1. plm_slurm_init
  2. plm_slurm_launch_job
  3. launch_daemons
  4. plm_slurm_terminate_orteds
  5. plm_slurm_signal_job
  6. plm_slurm_finalize
  7. srun_wait_cb
  8. plm_slurm_start_proc

   1 /*
   2  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2006 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2006-2019 Cisco Systems, Inc.  All rights reserved
  13  * Copyright (c) 2007-2015 Los Alamos National Security, LLC.  All rights
  14  *                         reserved.
  15  * Copyright (c) 2014-2018 Intel, Inc.  All rights reserved.
  16  * $COPYRIGHT$
  17  *
  18  * Additional copyrights may follow
  19  *
  20  * $HEADER$
  21  *
  22  * These symbols are in a file by themselves to provide nice linker
  23  * semantics.  Since linkers generally pull in symbols by object
  24  * files, keeping these symbols as the only symbols in this file
  25  * prevents utility programs such as "ompi_info" from having to import
  26  * entire components just to query their version and parameters.
  27  */
  28 
  29 #include "orte_config.h"
  30 #include "orte/runtime/orte_globals.h"
  31 
  32 #include <string.h>
  33 #include <sys/types.h>
  34 #ifdef HAVE_UNISTD_H
  35 #include <unistd.h>
  36 #endif
  37 #include <signal.h>
  38 #include <stdlib.h>
  39 #ifdef HAVE_SYS_TYPES_H
  40 #include <sys/types.h>
  41 #endif
  42 #ifdef HAVE_SYS_TIME_H
  43 #include <sys/time.h>
  44 #endif
  45 #ifdef HAVE_SYS_STAT_H
  46 #include <sys/stat.h>
  47 #endif
  48 #ifdef HAVE_FCNTL_H
  49 #include <fcntl.h>
  50 #endif
  51 
  52 #include "opal/mca/base/base.h"
  53 #include "opal/mca/installdirs/installdirs.h"
  54 #include "opal/util/argv.h"
  55 #include "opal/util/output.h"
  56 #include "opal/util/opal_environ.h"
  57 #include "opal/util/path.h"
  58 #include "opal/util/basename.h"
  59 
  60 #include "orte/constants.h"
  61 #include "orte/types.h"
  62 #include "orte/util/show_help.h"
  63 #include "orte/util/name_fns.h"
  64 #include "orte/util/threads.h"
  65 #include "orte/runtime/orte_globals.h"
  66 #include "orte/runtime/orte_wait.h"
  67 #include "orte/runtime/orte_quit.h"
  68 #include "orte/mca/errmgr/errmgr.h"
  69 #include "orte/mca/rmaps/base/base.h"
  70 #include "orte/mca/state/state.h"
  71 
  72 #include "orte/orted/orted.h"
  73 
  74 #include "orte/mca/plm/plm.h"
  75 #include "orte/mca/plm/base/plm_private.h"
  76 #include "plm_slurm.h"
  77 
  78 
  79 /*
  80  * Local functions
  81  */
  82 static int plm_slurm_init(void);
  83 static int plm_slurm_launch_job(orte_job_t *jdata);
  84 static int plm_slurm_terminate_orteds(void);
  85 static int plm_slurm_signal_job(orte_jobid_t jobid, int32_t signal);
  86 static int plm_slurm_finalize(void);
  87 
  88 static int plm_slurm_start_proc(int argc, char **argv, char **env,
  89                                 char *prefix);
  90 
  91 
  92 /*
  93  * Global variable
  94  */
  95 orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
  96     plm_slurm_init,
  97     orte_plm_base_set_hnp_name,
  98     plm_slurm_launch_job,
  99     NULL,
 100     orte_plm_base_orted_terminate_job,
 101     plm_slurm_terminate_orteds,
 102     orte_plm_base_orted_kill_local_procs,
 103     plm_slurm_signal_job,
 104     plm_slurm_finalize
 105 };
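      /* Orientation note: the NULL slot above is the remote-spawn entry of
       * orte_plm_base_module_1_0_0_t (see orte/mca/plm/plm.h) - this component
       * does no tree-based spawning, since every daemon is launched centrally
       * through a single srun invocation in launch_daemons() below.  The
       * orte_plm_base_* entries are shared implementations from the PLM base
       * framework.
       */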
 106 
 107 /*
 108  * Local variables
 109  */
 110 static pid_t primary_srun_pid = 0;
 111 static bool primary_pid_set = false;
 112 static void launch_daemons(int fd, short args, void *cbdata);
 113 
 114 /**
  115  * Init the module
 116  */
 117 static int plm_slurm_init(void)
 118 {
 119     int rc;
 120 
 121     if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
 122         ORTE_ERROR_LOG(rc);
 123         return rc;
 124     }
 125 
 126     /* if we don't want to launch (e.g., someone just wants
 127      * to test the mappers), then we assign vpids at "launch"
 128      * so the mapper has something to work with
 129      */
 130     if (orte_do_not_launch) {
 131         orte_plm_globals.daemon_nodes_assigned_at_launch = true;
 132     } else {
 133         /* we do NOT assign daemons to nodes at launch - we will
 134          * determine that mapping when the daemon
 135          * calls back. This is required because slurm does
 136          * its own mapping of proc-to-node, and we cannot know
 137          * in advance which daemon will wind up on which node
 138          */
 139         orte_plm_globals.daemon_nodes_assigned_at_launch = false;
 140     }
 141 
 142     /* point to our launch command */
 143     if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS,
 144                                                        launch_daemons, ORTE_SYS_PRI))) {
 145         ORTE_ERROR_LOG(rc);
 146         return rc;
 147     }
 148 
 149     return rc;
 150 }
 151 
  152 /* When working in the launch path (launch_daemons() below), ALWAYS jump
  153  * to "cleanup" if you encounter an error so that orterun will be woken
  154  * up and the job can cleanly terminate
 155  */
 156 static int plm_slurm_launch_job(orte_job_t *jdata)
 157 {
 158     if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
 159         /* this is a restart situation - skip to the mapping stage */
 160         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
 161     } else {
 162         /* new job - set it up */
 163         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT);
 164     }
 165     return ORTE_SUCCESS;
 166 }
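      /* Flow note: activating the INIT (or MAP) state above only kicks the
       * ORTE state machine; the actual srun launch happens later in
       * launch_daemons(), which plm_slurm_init() registered as the callback
       * for the ORTE_JOB_STATE_LAUNCH_DAEMONS state.
       */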
 167 
 168 static void launch_daemons(int fd, short args, void *cbdata)
 169 {
 170     orte_app_context_t *app;
 171     orte_node_t *node;
 172     orte_std_cntr_t n;
 173     orte_job_map_t *map;
 174     char *jobid_string = NULL;
 175     char *param;
 176     char **argv = NULL;
 177     int argc;
 178     int rc;
 179     char *tmp;
 180     char** env = NULL;
 181     char *nodelist_flat;
 182     char **nodelist_argv;
 183     char *name_string;
 184     char **custom_strings;
 185     int num_args, i;
 186     char *cur_prefix;
 187     int proc_vpid_index;
 188     bool failed_launch=true;
 189     orte_job_t *daemons;
 190     orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
 191 
 192     ORTE_ACQUIRE_OBJECT(state);
 193 
 194     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 195                          "%s plm:slurm: LAUNCH DAEMONS CALLED",
 196                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 197 
 198     /* if we are launching debugger daemons, then just go
 199      * do it - no new daemons will be launched
 200      */
 201     if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
 202         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 203         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
 204         OBJ_RELEASE(state);
 205         return;
 206     }
 207 
 208     /* start by setting up the virtual machine */
 209     daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
 210     if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
 211         ORTE_ERROR_LOG(rc);
 212         goto cleanup;
 213     }
 214 
  215     /* if we don't want to launch, then don't attempt to
 216      * launch the daemons - the user really wants to just
 217      * look at the proposed process map
 218      */
 219     if (orte_do_not_launch) {
 220         /* set the state to indicate the daemons reported - this
 221          * will trigger the daemons_reported event and cause the
 222          * job to move to the following step
 223          */
 224         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 225         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
 226         OBJ_RELEASE(state);
 227         return;
 228     }
 229 
 230     /* Get the map for this job */
 231     if (NULL == (map = daemons->map)) {
 232         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 233         rc = ORTE_ERR_NOT_FOUND;
 234         goto cleanup;
 235     }
 236 
 237     if (0 == map->num_new_daemons) {
 238         /* set the state to indicate the daemons reported - this
 239          * will trigger the daemons_reported event and cause the
 240          * job to move to the following step
 241          */
 242         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 243                              "%s plm:slurm: no new daemons to launch",
 244                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 245         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 246         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
 247         OBJ_RELEASE(state);
 248         return;
 249     }
 250 
 251     /* need integer value for command line parameter */
 252     opal_asprintf(&jobid_string, "%lu", (unsigned long) daemons->jobid);
 253 
 254     /*
 255      * start building argv array
 256      */
 257     argv = NULL;
 258     argc = 0;
 259 
 260     /*
 261      * SLURM srun OPTIONS
 262      */
 263 
 264     /* add the srun command */
 265     opal_argv_append(&argc, &argv, "srun");
 266 
 267     /* start one orted on each node */
 268     opal_argv_append(&argc, &argv, "--ntasks-per-node=1");
 269 
 270     if (!orte_enable_recovery) {
 271         /* kill the job if any orteds die */
 272         opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
 273     }
 274 
 275 #if SLURM_CRAY_ENV
 276     /*
 277      * If in a SLURM/Cray env. make sure that Cray PMI is not pulled in,
 278      * neither as a constructor run when orteds start, nor selected
 279      * when pmix components are registered
 280      */
 281 
 282     opal_setenv("PMI_NO_PREINITIALIZE", "1", false, &orte_launch_environ);
 283     opal_setenv("PMI_NO_FORK", "1", false, &orte_launch_environ);
 284     opal_setenv("OMPI_NO_USE_CRAY_PMI", "1", false, &orte_launch_environ);
 285 #endif
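          /* Note: these variables land in orte_launch_environ, which is
           * copied into the env array further below and ultimately handed
           * to srun via execve() in plm_slurm_start_proc().
           */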
 286 
 287     /* Append user defined arguments to srun */
 288     if ( NULL != mca_plm_slurm_component.custom_args ) {
 289         custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' ');
 290         num_args       = opal_argv_count(custom_strings);
 291         for (i = 0; i < num_args; ++i) {
 292             opal_argv_append(&argc, &argv, custom_strings[i]);
 293         }
 294         opal_argv_free(custom_strings);
 295     }
 296 
 297     /* create nodelist */
 298     nodelist_argv = NULL;
 299 
 300     for (n=0; n < map->nodes->size; n++ ) {
 301         if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
 302             continue;
 303         }
 304         /* if the daemon already exists on this node, then
 305          * don't include it
 306          */
 307         if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
 308             continue;
 309         }
 310 
 311         /* otherwise, add it to the list of nodes upon which
 312          * we need to launch a daemon
 313          */
 314         opal_argv_append_nosize(&nodelist_argv, node->name);
 315     }
 316     if (0 == opal_argv_count(nodelist_argv)) {
 317         orte_show_help("help-plm-slurm.txt", "no-hosts-in-list", true);
 318         rc = ORTE_ERR_FAILED_TO_START;
 319         goto cleanup;
 320     }
 321     nodelist_flat = opal_argv_join(nodelist_argv, ',');
 322     opal_argv_free(nodelist_argv);
 323 
 324     /* if we are using all allocated nodes, then srun doesn't
 325      * require any further arguments
 326      */
 327     if (map->num_new_daemons < orte_num_allocated_nodes) {
 328         opal_asprintf(&tmp, "--nodes=%lu", (unsigned long)map->num_new_daemons);
 329         opal_argv_append(&argc, &argv, tmp);
 330         free(tmp);
 331 
 332         opal_asprintf(&tmp, "--nodelist=%s", nodelist_flat);
 333         opal_argv_append(&argc, &argv, tmp);
 334         free(tmp);
 335     }
 336 
 337     /* tell srun how many tasks to run */
 338     opal_asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons);
 339     opal_argv_append(&argc, &argv, tmp);
 340     free(tmp);
 341 
 342     OPAL_OUTPUT_VERBOSE((2, orte_plm_base_framework.framework_output,
 343                          "%s plm:slurm: launching on nodes %s",
 344                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));
 345     free(nodelist_flat);
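          /* Illustration only (hypothetical node names): launching two new
           * daemons on a subset of the allocation, the srun portion of argv
           * built above resembles
           *     srun --ntasks-per-node=1 --kill-on-bad-exit \
           *          --nodes=2 --nodelist=node01,node02 --ntasks=2
           * with any user-supplied custom_args spliced in after
           * --kill-on-bad-exit; the orted command and its options are
           * appended next.
           */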
 346 
 347     /*
 348      * ORTED OPTIONS
 349      */
 350 
 351     /* add the daemon command (as specified by user) */
 352     orte_plm_base_setup_orted_cmd(&argc, &argv);
 353 
 354     /* Add basic orted command line options, including debug flags */
 355     orte_plm_base_orted_append_basic_args(&argc, &argv,
 356                                           "slurm", &proc_vpid_index);
 357 
 358     /* tell the new daemons the base of the name list so they can compute
 359      * their own name on the other end
 360      */
 361     rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
 362     if (ORTE_SUCCESS != rc) {
 363         opal_output(0, "plm_slurm: unable to get daemon vpid as string");
 364         goto cleanup;
 365     }
 366 
 367     free(argv[proc_vpid_index]);
 368     argv[proc_vpid_index] = strdup(name_string);
 369     free(name_string);
 370 
 371     /* Copy the prefix-directory specified in the
 372        corresponding app_context.  If there are multiple,
  373        different prefixes in the app context, complain (i.e., only
  374        allow one --prefix option for the entire slurm run -- we
  375        don't support different --prefix values for different nodes in
 376        the SLURM plm) */
 377     cur_prefix = NULL;
 378     for (n=0; n < state->jdata->apps->size; n++) {
 379         char * app_prefix_dir;
 380         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, n))) {
 381             continue;
 382         }
 383         app_prefix_dir = NULL;
 384         orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
  385         /* Check for already set cur_prefix -- if different,
 386            complain */
 387         if (NULL != app_prefix_dir) {
 388             if (NULL != cur_prefix &&
 389                 0 != strcmp (cur_prefix, app_prefix_dir)) {
 390                 orte_show_help("help-plm-slurm.txt", "multiple-prefixes",
 391                                true, cur_prefix, app_prefix_dir);
 392                 goto cleanup;
 393             }
 394 
  395             /* If not yet set, copy it; if already set, then it's the
 396              * same anyway
 397              */
 398             if (NULL == cur_prefix) {
 399                 cur_prefix = strdup(app_prefix_dir);
 400                 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 401                                      "%s plm:slurm: Set prefix:%s",
 402                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 403                                      cur_prefix));
 404             }
 405             free(app_prefix_dir);
 406         }
 407     }
 408 
 409     /* protect the args in case someone has a script wrapper around srun */
 410     mca_base_cmd_line_wrap_args(argv);
 411 
 412     /* setup environment */
 413     env = opal_argv_copy(orte_launch_environ);
 414 
 415     /* ensure the orteds are not bound to a single processor,
 416      * just in case the TaskAffinity option is set by default.
 417      * This will *not* release the orteds from any cpu-set
  418      * constraint, but will ensure they don't get
 419      * bound to only one processor
 420      *
 421      * NOTE: We used to pass --cpu_bind=none on the command line.  But
 422      * SLURM 19 changed this to --cpu-bind.  There is no easy way to
 423      * test at run time which of these two parameters is used (see
 424      * https://github.com/open-mpi/ompi/pull/6654).  There was
 425      * discussion of using --test-only to see which one works, but
 426      * --test-only is only effective if you're not already inside a
 427      * SLURM allocation.  Instead, set the env var SLURM_CPU_BIND to
 428      * "none", which should do the same thing as --cpu*bind=none.
 429      */
 430     opal_setenv("SLURM_CPU_BIND", "none", true, &env);
 431 
 432     if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
 433         param = opal_argv_join(argv, ' ');
 434         opal_output(orte_plm_base_framework.framework_output,
 435                     "%s plm:slurm: final top-level argv:\n\t%s",
 436                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 437                     (NULL == param) ? "NULL" : param);
 438         if (NULL != param) free(param);
 439     }
 440 
 441     /* exec the daemon(s) */
 442     if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
 443         ORTE_ERROR_LOG(rc);
 444         goto cleanup;
 445     }
 446 
 447     /* indicate that the daemons for this job were launched */
 448     state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 449     daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 450 
 451     /* flag that launch was successful, so far as we currently know */
 452     failed_launch = false;
 453 
 454  cleanup:
 455     if (NULL != argv) {
 456         opal_argv_free(argv);
 457     }
 458     if (NULL != env) {
 459         opal_argv_free(env);
 460     }
 461 
 462     if(NULL != jobid_string) {
 463         free(jobid_string);
 464     }
 465 
 466     /* cleanup the caddy */
 467     OBJ_RELEASE(state);
 468 
 469     /* check for failed launch - if so, force terminate */
 470     if (failed_launch) {
 471         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 472     }
 473 }
 474 
 475 
 476 /**
  477  * Terminate the orteds for a given job
 478  */
 479 static int plm_slurm_terminate_orteds(void)
 480 {
 481     int rc=ORTE_SUCCESS;
 482     orte_job_t *jdata;
 483 
 484     /* check to see if the primary pid is set. If not, this indicates
 485      * that we never launched any additional daemons, so we cannot
  486      * wait for a waitpid to fire and tell us it's okay to
 487      * exit. Instead, we simply trigger an exit for ourselves
 488      */
 489     if (primary_pid_set) {
 490         if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
 491             ORTE_ERROR_LOG(rc);
 492         }
 493     } else {
 494         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 495                              "%s plm:slurm: primary daemons complete!",
 496                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 497         jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
 498         /* need to set the #terminated value to avoid an incorrect error msg */
 499         jdata->num_terminated = jdata->num_procs;
 500         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 501     }
 502 
 503     return rc;
 504 }
 505 
 506 
 507 /**
  508  * Signal all the processes in a given job by having the orteds signal their local procs
 509  */
 510 static int plm_slurm_signal_job(orte_jobid_t jobid, int32_t signal)
 511 {
 512     int rc = ORTE_SUCCESS;
 513 
 514     /* order them to pass this signal to their local procs */
 515     if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
 516         ORTE_ERROR_LOG(rc);
 517     }
 518 
 519     return rc;
 520 }
 521 
 522 
 523 static int plm_slurm_finalize(void)
 524 {
 525     int rc;
 526 
 527     /* cleanup any pending recvs */
 528     if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
 529         ORTE_ERROR_LOG(rc);
 530     }
 531 
 532     return ORTE_SUCCESS;
 533 }
 534 
 535 
 536 static void srun_wait_cb(int sd, short fd, void *cbdata){
 537     orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
 538     orte_proc_t *proc = t2->child;
 539     orte_job_t *jdata;
 540 
 541     /* According to the SLURM folks, srun always returns the highest exit
 542      code of our remote processes. Thus, a non-zero exit status doesn't
 543      necessarily mean that srun failed - it could be that an orted returned
 544      a non-zero exit status. Of course, that means the orted failed(!), so
 545      the end result is the same - the job didn't start.
 546 
 547      As a result, we really can't do much with the exit status itself - it
 548      could be something in errno (if srun itself failed), or it could be
 549      something returned by an orted, or it could be something returned by
 550      the OS (e.g., couldn't find the orted binary). Somebody is welcome
 551      to sort out all the options and pretty-print a better error message. For
 552      now, though, the only thing that really matters is that
 553      srun failed. Report the error and make sure that orterun
 554      wakes up - otherwise, do nothing!
 555 
 556      Unfortunately, the pid returned here is the srun pid, not the pid of
 557      the proc that actually died! So, to avoid confusion, just use -1 as the
 558      pid so nobody thinks this is real
 559      */
 560 
 561     jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
 562 
 563     /* abort only if the status returned is non-zero - i.e., if
  564      * the orteds exited with an error
 565      */
 566     if (0 != proc->exit_code) {
 567         /* an orted must have died unexpectedly - report
 568          * that the daemon has failed so we exit
 569          */
 570         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 571                              "%s plm:slurm: srun returned non-zero exit status (%d) from launching the per-node daemon",
 572                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 573                              proc->exit_code));
 574         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
 575     } else {
 576         /* otherwise, check to see if this is the primary pid */
 577         if (primary_srun_pid == proc->pid) {
 578             /* in this case, we just want to fire the proper trigger so
 579              * mpirun can exit
 580              */
 581             OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 582                                  "%s plm:slurm: primary daemons complete!",
 583                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 584             /* need to set the #terminated value to avoid an incorrect error msg */
 585             jdata->num_terminated = jdata->num_procs;
 586             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 587         }
 588     }
 589 
 590     /* done with this dummy */
 591     OBJ_RELEASE(t2);
 592 }
 593 
 594 
 595 static int plm_slurm_start_proc(int argc, char **argv, char **env,
 596                                 char *prefix)
 597 {
 598     int fd;
 599     int srun_pid;
 600     char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
 601     orte_proc_t *dummy;
 602 
 603     if (NULL == exec_argv) {
 604         orte_show_help("help-plm-slurm.txt", "no-srun", true);
 605         return ORTE_ERR_SILENT;
 606     }
 607 
 608     srun_pid = fork();
 609     if (-1 == srun_pid) {
 610         ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
 611         free(exec_argv);
 612         return ORTE_ERR_SYS_LIMITS_CHILDREN;
 613     }
 614     /* if this is the primary launch - i.e., not a comm_spawn of a
 615      * child job - then save the pid
 616      */
 617     if (0 < srun_pid && !primary_pid_set) {
 618         primary_srun_pid = srun_pid;
 619         primary_pid_set = true;
 620     }
 621 
 622     /* setup a dummy proc object to track the srun */
 623     dummy = OBJ_NEW(orte_proc_t);
 624     dummy->pid = srun_pid;
 625     /* be sure to mark it as alive so we don't instantly fire */
 626     ORTE_FLAG_SET(dummy, ORTE_PROC_FLAG_ALIVE);
 627     /* setup the waitpid so we can find out if srun succeeds! */
 628     orte_wait_cb(dummy, srun_wait_cb, orte_event_base, NULL);
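          /* The callback fires from the ORTE event loop when the forked srun
           * exits: srun_wait_cb() above aborts the job if srun returned a
           * non-zero status, while a clean exit of the primary srun is what
           * triggers the DAEMONS_TERMINATED state.
           */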
 629 
 630     if (0 == srun_pid) {  /* child */
 631         char *bin_base = NULL, *lib_base = NULL;
 632 
 633         /* Figure out the basenames for the libdir and bindir.  There
 634            is a lengthy comment about this in plm_rsh_module.c
 635            explaining all the rationale for how / why we're doing
 636            this. */
 637 
 638         lib_base = opal_basename(opal_install_dirs.libdir);
 639         bin_base = opal_basename(opal_install_dirs.bindir);
 640 
 641         /* If we have a prefix, then modify the PATH and
 642            LD_LIBRARY_PATH environment variables.  */
 643         if (NULL != prefix) {
 644             char *oldenv, *newenv;
 645 
 646             /* Reset PATH */
 647             oldenv = getenv("PATH");
 648             if (NULL != oldenv) {
 649                 opal_asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
 650             } else {
 651                 opal_asprintf(&newenv, "%s/%s", prefix, bin_base);
 652             }
 653             opal_setenv("PATH", newenv, true, &env);
 654             OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 655                                  "%s plm:slurm: reset PATH: %s",
 656                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 657                                  newenv));
 658             free(newenv);
 659 
 660             /* Reset LD_LIBRARY_PATH */
 661             oldenv = getenv("LD_LIBRARY_PATH");
 662             if (NULL != oldenv) {
 663                 opal_asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
 664             } else {
 665                 opal_asprintf(&newenv, "%s/%s", prefix, lib_base);
 666             }
 667             opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
 668             OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 669                                  "%s plm:slurm: reset LD_LIBRARY_PATH: %s",
 670                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 671                                  newenv));
 672             free(newenv);
 673         }
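              /* Worked example (hypothetical values): with prefix
               * "/opt/openmpi" and the usual bin_base "bin" / lib_base "lib",
               * the env array handed to execve() below now carries
               * PATH="/opt/openmpi/bin:<previous PATH>" and
               * LD_LIBRARY_PATH="/opt/openmpi/lib:<previous LD_LIBRARY_PATH>".
               */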
 674 
 675         fd = open("/dev/null", O_CREAT|O_RDWR|O_TRUNC, 0666);
 676         if (fd >= 0) {
 677             dup2(fd, 0);
 678             /* When not in debug mode and --debug-daemons was not passed,
 679              * tie stdout/stderr to dev null so we don't see messages from orted
 680              * EXCEPT if the user has requested that we leave sessions attached
 681              */
  682             if (0 >= opal_output_get_verbosity(orte_plm_base_framework.framework_output) &&
 683                 !orte_debug_daemons_flag && !orte_leave_session_attached) {
 684                 dup2(fd,1);
 685                 dup2(fd,2);
 686             }
 687 
 688             /* Don't leave the extra fd to /dev/null open */
 689             if (fd > 2) {
 690                 close(fd);
 691             }
 692         }
 693 
 694         /* get the srun process out of orterun's process group so that
 695            signals sent from the shell (like those resulting from
  696            ctrl-c) don't get sent to srun */
 697         setpgid(0, 0);
 698 
 699         execve(exec_argv, argv, env);
 700 
 701         opal_output(0, "plm:slurm:start_proc: exec failed");
 702         /* don't return - need to exit - returning would be bad -
 703            we're not in the calling process anymore */
 704         exit(1);
 705     } else {  /* parent */
 706         /* just in case, make sure that the srun process is not in our
 707            process group any more.  Stevens says always do this on both
 708            sides of the fork... */
 709         setpgid(srun_pid, srun_pid);
 710 
 711         free(exec_argv);
 712     }
 713 
 714     return ORTE_SUCCESS;
 715 }
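      /* Selection note: this PLM is normally chosen automatically when
       * mpirun/orterun runs inside a SLURM allocation; it can also be
       * requested explicitly, e.g. "mpirun --mca plm slurm ...".
       */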
