root/orte/mca/plm/alps/plm_alps_module.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. plm_alps_init
  2. plm_alps_launch_job
  3. launch_daemons
  4. plm_alps_terminate_orteds
  5. plm_alps_signal_job
  6. plm_alps_finalize
  7. alps_wait_cb
  8. plm_alps_start_proc

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2006 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006-2011 Cisco Systems, Inc.  All rights reserved.
  14  * Copyright (c) 2007-2015 Los Alamos National Security, LLC.  All rights
  15  *                         reserved.
  16  * Copyright (c) 2014-2018 Intel, Inc.  All rights reserved.
  17  * Copyright (c) 2017      Research Organization for Information Science
  18  *                         and Technology (RIST). All rights reserved.
  19  * $COPYRIGHT$
  20  *
  21  * Additional copyrights may follow
  22  *
  23  * $HEADER$
  24  *
  25  * These symbols are in a file by themselves to provide nice linker
  26  * semantics.  Since linkers generally pull in symbols by object
  27  * files, keeping these symbols as the only symbols in this file
  28  * prevents utility programs such as "ompi_info" from having to import
  29  * entire components just to query their version and parameters.
  30  */
  31 
  32 #include "orte_config.h"
  33 #include "orte/constants.h"
  34 #include "orte/types.h"
  35 
  36 #include <sys/types.h>
  37 #ifdef HAVE_UNISTD_H
  38 #include <unistd.h>
  39 #endif
  40 #include <signal.h>
  41 #include <stdlib.h>
  42 #include <string.h>
  43 #ifdef HAVE_SYS_TYPES_H
  44 #include <sys/types.h>
  45 #endif
  46 #ifdef HAVE_SYS_TIME_H
  47 #include <sys/time.h>
  48 #endif
  49 #ifdef HAVE_SYS_STAT_H
  50 #include <sys/stat.h>
  51 #endif
  52 #ifdef HAVE_FCNTL_H
  53 #include <fcntl.h>
  54 #endif
  55 
  56 #include "opal/mca/base/base.h"
  57 #include "opal/mca/installdirs/installdirs.h"
  58 #include "opal/util/argv.h"
  59 #include "opal/util/output.h"
  60 #include "opal/util/opal_environ.h"
  61 #include "opal/util/path.h"
  62 #include "opal/util/basename.h"
  63 
  64 #include "orte/runtime/orte_globals.h"
  65 #include "orte/util/name_fns.h"
  66 #include "orte/util/show_help.h"
  67 #include "orte/util/threads.h"
  68 #include "orte/runtime/orte_wait.h"
  69 #include "orte/mca/errmgr/errmgr.h"
  70 #include "orte/mca/rmaps/rmaps.h"
  71 #include "orte/mca/state/state.h"
  72 
  73 #include "orte/mca/plm/plm.h"
  74 #include "orte/mca/plm/base/base.h"
  75 #include "orte/mca/plm/base/plm_private.h"
  76 #include "plm_alps.h"
  77 
  78 
  79 /*
  80  * Local functions
  81  */
  82 static int plm_alps_init(void);
  83 static int plm_alps_launch_job(orte_job_t *jdata);
  84 static int plm_alps_terminate_orteds(void);
  85 static int plm_alps_signal_job(orte_jobid_t jobid, int32_t signal);
  86 static int plm_alps_finalize(void);
  87 
  88 static int plm_alps_start_proc(int argc, char **argv, char **env,
  89                                 char *prefix);
  90 
  91 
/*
 * Global variable
 */
/* The ALPS PLM module's function table.  Slots mixing alps-specific
 * entry points with base (shared) implementations where alps needs no
 * special handling.
 * NOTE(review): slot roles below are inferred from the assigned
 * function names - confirm against orte_plm_base_module_t. */
orte_plm_base_module_t orte_plm_alps_module = {
    plm_alps_init,                          /* init */
    orte_plm_base_set_hnp_name,             /* set HNP name (base impl) */
    plm_alps_launch_job,                    /* spawn/launch a job */
    NULL,                                   /* remote spawn - not supported */
    orte_plm_base_orted_terminate_job,      /* terminate job (base impl) */
    plm_alps_terminate_orteds,              /* terminate daemons */
    orte_plm_base_orted_kill_local_procs,   /* kill local procs (base impl) */
    plm_alps_signal_job,                    /* signal job */
    plm_alps_finalize                       /* finalize */
};

/*
 * Local variables
 */
/* proc object tracking the locally forked "aprun" process (see
 * plm_alps_start_proc); NULL until a launch has been attempted */
static orte_proc_t *alpsrun = NULL;
/* set false once launch_daemons succeeds; read by alps_wait_cb to
 * distinguish failed-to-start from a post-launch abort */
static bool failed_launch;
static void launch_daemons(int fd, short args, void *cbdata);
 114 
 115 /**
 116 * Init the module
 117  */
 118 static int plm_alps_init(void)
 119 {
 120     int rc;
 121 
 122     if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
 123         ORTE_ERROR_LOG(rc);
 124         return rc;
 125     }
 126 
 127     if (orte_do_not_launch) {
 128         /* must map daemons since we won't be launching them */
 129         orte_plm_globals.daemon_nodes_assigned_at_launch = true;
 130     } else {
 131         /* we do NOT assign daemons to nodes at launch - we will
 132          * determine that mapping when the daemon
 133          * calls back. This is required because alps does
 134          * its own mapping of proc-to-node, and we cannot know
 135          * in advance which daemon will wind up on which node
 136          */
 137         orte_plm_globals.daemon_nodes_assigned_at_launch = false;
 138     }
 139 
 140     /* point to our launch command */
 141     if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS,
 142                                                        launch_daemons, ORTE_SYS_PRI))) {
 143         ORTE_ERROR_LOG(rc);
 144         return rc;
 145     }
 146 
 147     return rc;
 148 }
 149 
 150 
 151 /* When working in this function, ALWAYS jump to "cleanup" if
 152  * you encounter an error so that orterun will be woken up and
 153  * the job can cleanly terminate
 154  */
 155 static int plm_alps_launch_job(orte_job_t *jdata)
 156 {
 157 
 158     if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
 159         /* this is a restart situation - skip to the mapping stage */
 160         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
 161     } else {
 162         /* new job - set it up */
 163         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT);
 164     }
 165     return ORTE_SUCCESS;
 166 }
 167 
 168 static void launch_daemons(int fd, short args, void *cbdata)
 169 {
 170     orte_job_map_t *map;
 171     char *jobid_string = NULL;
 172     char *param;
 173     char **argv = NULL;
 174     int argc;
 175     int rc;
 176     char *tmp;
 177     char** env = NULL;
 178     char *nodelist_flat;
 179     char **nodelist_argv;
 180     int nodelist_argc;
 181     char *vpid_string;
 182     char **custom_strings;
 183     int num_args, i;
 184     char *cur_prefix;
 185     int proc_vpid_index;
 186     orte_app_context_t *app;
 187     orte_node_t *node;
 188     orte_std_cntr_t nnode;
 189     orte_job_t *daemons;
 190     orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
 191     char *ltmp;
 192 
 193     ORTE_ACQUIRE_OBJECT(state);
 194 
 195     /* if we are launching debugger daemons, then just go
 196      * do it - no new daemons will be launched
 197      */
 198     if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
 199         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 200         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
 201         OBJ_RELEASE(state);
 202         return;
 203     }
 204 
 205     /* start by setting up the virtual machine */
 206     daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
 207     if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
 208         ORTE_ERROR_LOG(rc);
 209         goto cleanup;
 210     }
 211 
 212    /* if we don't want to launch, then don't attempt to
 213      * launch the daemons - the user really wants to just
 214      * look at the proposed process map
 215      */
 216     if (orte_do_not_launch) {
 217         /* set the state to indicate the daemons reported - this
 218          * will trigger the daemons_reported event and cause the
 219          * job to move to the following step
 220          */
 221         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 222         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
 223         OBJ_RELEASE(state);
 224         return;
 225     }
 226 
 227     /* Get the map for this job */
 228     if (NULL == (map = daemons->map)) {
 229         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 230         rc = ORTE_ERR_NOT_FOUND;
 231         goto cleanup;
 232     }
 233 
 234     if (0 == map->num_new_daemons) {
 235         /* set the state to indicate the daemons reported - this
 236          * will trigger the daemons_reported event and cause the
 237          * job to move to the following step
 238          */
 239         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 240                              "%s plm:alps: no new daemons to launch",
 241                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 242         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 243         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
 244         OBJ_RELEASE(state);
 245         return;
 246     }
 247 
 248     /* need integer value for command line parameter */
 249     orte_util_convert_jobid_to_string(&jobid_string, daemons->jobid);
 250 
 251     /*
 252      * start building argv array
 253      */
 254     argv = NULL;
 255     argc = 0;
 256 
 257     /*
 258      * ALPS aprun  OPTIONS
 259      */
 260 
 261     /* add the aprun command */
 262     opal_argv_append(&argc, &argv, mca_plm_alps_component.aprun_cmd);
 263 
 264     /* Append user defined arguments to aprun */
 265     if ( NULL != mca_plm_alps_component.custom_args ) {
 266         custom_strings = opal_argv_split(mca_plm_alps_component.custom_args, ' ');
 267         num_args       = opal_argv_count(custom_strings);
 268         for (i = 0; i < num_args; ++i) {
 269             opal_argv_append(&argc, &argv, custom_strings[i]);
 270         }
 271         opal_argv_free(custom_strings);
 272     }
 273 
 274     /* number of processors needed */
 275     opal_argv_append(&argc, &argv, "-n");
 276     opal_asprintf(&tmp, "%lu", (unsigned long) map->num_new_daemons);
 277     opal_argv_append(&argc, &argv, tmp);
 278     free(tmp);
 279     opal_argv_append(&argc, &argv, "-N");
 280     opal_argv_append(&argc, &argv, "1");
 281     opal_argv_append(&argc, &argv, "-cc");
 282     opal_argv_append(&argc, &argv, "none");
 283     /*
 284      * stuff below is necessary in the event that we've sadly configured Open MPI with --disable-dlopen,
 285      * which results in the orted's being linked against all kinds of unnecessary cray libraries, including
 286      * the cray pmi, which has a ctor that cause bad things if run when using mpirun/orted based launch.
 287      *
 288      * Code below adds env. variables for aprun to forward which suppresses the action of the Cray PMI ctor.
 289      */
 290     opal_argv_append(&argc, &argv, "-e");
 291     opal_argv_append(&argc, &argv, "PMI_NO_PREINITIALIZE=1");
 292     opal_argv_append(&argc, &argv, "-e");
 293     opal_argv_append(&argc, &argv, "PMI_NO_FORK=1");
 294     opal_argv_append(&argc, &argv, "-e");
 295     opal_argv_append(&argc, &argv, "OMPI_NO_USE_CRAY_PMI=1");
 296 
 297     /* if we are using all allocated nodes, then alps
 298      * doesn't need a nodelist, or if running without a batch scheduler
 299      */
 300     if ((map->num_new_daemons < orte_num_allocated_nodes) || (orte_num_allocated_nodes == 0)) {
 301         /* create nodelist */
 302         nodelist_argv = NULL;
 303         nodelist_argc = 0;
 304 
 305         for (nnode=0; nnode < map->nodes->size; nnode++) {
 306             if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
 307                 continue;
 308             }
 309 
 310             /* if the daemon already exists on this node, then
 311              * don't include it
 312              */
 313             if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
 314                 continue;
 315             }
 316 
 317             /* otherwise, add it to the list of nodes upon which
 318              * we need to launch a daemon
 319              */
 320             opal_argv_append(&nodelist_argc, &nodelist_argv, node->name);
 321         }
 322         if (0 == opal_argv_count(nodelist_argv)) {
 323             orte_show_help("help-plm-alps.txt", "no-hosts-in-list", true);
 324             rc = ORTE_ERR_FAILED_TO_START;
 325             goto cleanup;
 326         }
 327         nodelist_flat = opal_argv_join(nodelist_argv, ',');
 328         opal_argv_free(nodelist_argv);
 329 
 330         opal_argv_append(&argc, &argv, "-L");
 331         opal_argv_append(&argc, &argv, nodelist_flat);
 332         free(nodelist_flat);
 333     }
 334 
 335 
 336     /*
 337      * ORTED OPTIONS
 338      */
 339 
 340     /* add the daemon command (as specified by user) */
 341     orte_plm_base_setup_orted_cmd(&argc, &argv);
 342 
 343     /* Add basic orted command line options, including debug flags */
 344     orte_plm_base_orted_append_basic_args(&argc, &argv,
 345                                           NULL,
 346                                           &proc_vpid_index);
 347 
 348     /* tell the new daemons the base of the name list so they can compute
 349      * their own name on the other end
 350      */
 351     rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start);
 352     if (ORTE_SUCCESS != rc) {
 353         opal_output(0, "plm_alps: unable to create process name");
 354         goto cleanup;
 355     }
 356 
 357     free(argv[proc_vpid_index]);
 358     argv[proc_vpid_index] = strdup(vpid_string);
 359     free(vpid_string);
 360 
 361     if (mca_plm_alps_component.debug) {
 362         param = opal_argv_join(argv, ' ');
 363         if (NULL != param) {
 364             opal_output(0, "plm:alps: final top-level argv:");
 365             opal_output(0, "plm:alps:     %s", param);
 366             free(param);
 367         }
 368     }
 369 
 370     /* Copy the prefix-directory specified in the
 371        corresponding app_context.  If there are multiple,
 372        different prefix's in the app context, complain (i.e., only
 373        allow one --prefix option for the entire alps run -- we
 374        don't support different --prefix'es for different nodes in
 375        the ALPS plm) */
 376     cur_prefix = NULL;
 377     for (i=0; i < state->jdata->apps->size; i++) {
 378         char *app_prefix_dir = NULL;
 379         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, i))) {
 380             continue;
 381         }
 382         orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
 383         /* Check for already set cur_prefix_dir -- if different,
 384            complain */
 385         if (NULL != app_prefix_dir) {
 386             if (NULL != cur_prefix &&
 387                 0 != strcmp (cur_prefix, app_prefix_dir)) {
 388                 orte_show_help("help-plm-alps.txt", "multiple-prefixes",
 389                                true, cur_prefix, app_prefix_dir);
 390                 goto cleanup;
 391             }
 392 
 393             /* If not yet set, copy it; iff set, then it's the
 394                same anyway */
 395             if (NULL == cur_prefix) {
 396                 cur_prefix = strdup(app_prefix_dir);
 397                 if (mca_plm_alps_component.debug) {
 398                     opal_output (0, "plm:alps: Set prefix:%s",
 399                                  cur_prefix);
 400                 }
 401             }
 402             free(app_prefix_dir);
 403         }
 404     }
 405 
 406     /* protect the args in case someone has a script wrapper around aprun */
 407     mca_base_cmd_line_wrap_args(argv);
 408 
 409     /* setup environment */
 410     env = opal_argv_copy(orte_launch_environ);
 411 
 412     if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
 413         param = opal_argv_join(argv, ' ');
 414         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 415                              "%s plm:alps: final top-level argv:\n\t%s",
 416                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 417                              (NULL == param) ? "NULL" : param));
 418         if (NULL != param) free(param);
 419     }
 420 
 421     /* exec the daemon(s) */
 422     if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
 423         ORTE_ERROR_LOG(rc);
 424         goto cleanup;
 425     }
 426 
 427     /* indicate that the daemons for this job were launched */
 428     state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 429     daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
 430 
 431     /* flag that launch was successful, so far as we currently know */
 432     failed_launch = false;
 433 
 434  cleanup:
 435     if (NULL != argv) {
 436         opal_argv_free(argv);
 437     }
 438     if (NULL != env) {
 439         opal_argv_free(env);
 440     }
 441 
 442     if(NULL != jobid_string) {
 443         free(jobid_string);
 444     }
 445 
 446     /* cleanup the caddy */
 447     OBJ_RELEASE(state);
 448 
 449     /* check for failed launch - if so, force terminate */
 450     if (failed_launch) {
 451         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 452     }
 453 }
 454 
 455 
 456 
 457 /**
 458 * Terminate the orteds for a given job
 459  */
 460 static int plm_alps_terminate_orteds(void)
 461 {
 462     int rc;
 463     orte_job_t *jdata;
 464 
 465     OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
 466                             "%s plm:alps: terminating orteds",
 467                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 468 
 469     /* deregister the waitpid callback to ensure we don't make it look like
 470      * alps failed when it didn't. Since the alps may have already completed,
 471      * do NOT ERROR_LOG any return code to avoid confusing, duplicate error
 472      * messages
 473      */
 474     if (NULL != alpsrun) {
 475         orte_wait_cb_cancel(alpsrun);
 476     }
 477 
 478     /* now tell them to die */
 479     if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
 480         ORTE_ERROR_LOG(rc);
 481     }
 482 
 483     jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
 484     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
 485 
 486     OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
 487                             "%s plm:alps: terminated orteds",
 488                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 489     return rc;
 490 }
 491 
 492 
 493 /**
 494  * Signal all the processes in the child alps by sending the signal directly to it
 495  */
 496 static int plm_alps_signal_job(orte_jobid_t jobid, int32_t signal)
 497 {
 498     if (NULL != alpsrun && 0 != alpsrun->pid) {
 499         kill(alpsrun->pid, (int)signal);
 500    }
 501     return ORTE_SUCCESS;
 502 }
 503 
 504 
 505 static int plm_alps_finalize(void)
 506 {
 507     int rc;
 508 
 509     if (NULL != alpsrun) {
 510         OBJ_RELEASE(alpsrun);
 511     }
 512 
 513     /* cleanup any pending recvs */
 514     if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
 515         ORTE_ERROR_LOG(rc);
 516     }
 517 
 518     return ORTE_SUCCESS;
 519 }
 520 
 521 
 522 static void alps_wait_cb(int sd, short args, void *cbdata) {
 523     orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
 524     orte_proc_t *proc = t2->child;
 525     orte_job_t *jdata;
 526 
 527     /* According to the ALPS folks, alps always returns the highest exit
 528        code of our remote processes. Thus, a non-zero exit status doesn't
 529        necessarily mean that alps failed - it could be that an orted returned
 530        a non-zero exit status. Of course, that means the orted failed(!), so
 531        the end result is the same - the job didn't start.
 532 
 533        As a result, we really can't do much with the exit status itself - it
 534        could be something in errno (if alps itself failed), or it could be
 535        something returned by an orted, or it could be something returned by
 536        the OS (e.g., couldn't find the orted binary). Somebody is welcome
 537        to sort out all the options and pretty-print a better error message. For
 538        now, though, the only thing that really matters is that
 539        alps failed. Report the error and make sure that orterun
 540        wakes up - otherwise, do nothing!
 541     */
 542     jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
 543 
 544     if (0 != proc->exit_code) {
 545         if (failed_launch) {
 546             /* report that the daemon has failed so we break out of the daemon
 547              * callback receive and exit
 548              */
 549             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
 550         } else {
 551             /* an orted must have died unexpectedly after launch - report
 552              * that the daemon has failed so we exit
 553              */
 554             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
 555         }
 556     }
 557     OBJ_RELEASE(t2);
 558 }
 559 
 560 
 561 static int plm_alps_start_proc(int argc, char **argv, char **env,
 562                                 char *prefix)
 563 {
 564     int fd;
 565     pid_t alps_pid;
 566     char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
 567 
 568     if (NULL == exec_argv) {
 569         return ORTE_ERR_NOT_FOUND;
 570     }
 571 
 572     alps_pid = fork();
 573     if (-1 == alps_pid) {
 574         ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
 575         return ORTE_ERR_SYS_LIMITS_CHILDREN;
 576     }
 577 
 578     alpsrun = OBJ_NEW(orte_proc_t);
 579     alpsrun->pid = alps_pid;
 580     /* be sure to mark it as alive so we don't instantly fire */
 581     ORTE_FLAG_SET(alpsrun, ORTE_PROC_FLAG_ALIVE);
 582     /* setup the waitpid so we can find out if alps succeeds! */
 583     orte_wait_cb(alpsrun, alps_wait_cb, orte_event_base, NULL);
 584 
 585     if (0 == alps_pid) {  /* child */
 586         char *bin_base = NULL, *lib_base = NULL;
 587 
 588         /* Figure out the basenames for the libdir and bindir.  There
 589            is a lengthy comment about this in plm_rsh_module.c
 590            explaining all the rationale for how / why we're doing
 591            this. */
 592 
 593         lib_base = opal_basename(opal_install_dirs.libdir);
 594         bin_base = opal_basename(opal_install_dirs.bindir);
 595 
 596         /* If we have a prefix, then modify the PATH and
 597            LD_LIBRARY_PATH environment variables.  */
 598         if (NULL != prefix) {
 599             char *oldenv, *newenv;
 600 
 601             /* Reset PATH */
 602             oldenv = getenv("PATH");
 603             if (NULL != oldenv) {
 604                 opal_asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
 605             } else {
 606                 opal_asprintf(&newenv, "%s/%s", prefix, bin_base);
 607             }
 608             opal_setenv("PATH", newenv, true, &env);
 609             if (mca_plm_alps_component.debug) {
 610                 opal_output(0, "plm:alps: reset PATH: %s", newenv);
 611             }
 612             free(newenv);
 613 
 614             /* Reset LD_LIBRARY_PATH */
 615             oldenv = getenv("LD_LIBRARY_PATH");
 616             if (NULL != oldenv) {
 617                 opal_asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
 618             } else {
 619                 opal_asprintf(&newenv, "%s/%s", prefix, lib_base);
 620             }
 621             opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
 622             if (mca_plm_alps_component.debug) {
 623                 opal_output(0, "plm:alps: reset LD_LIBRARY_PATH: %s",
 624                             newenv);
 625             }
 626             free(newenv);
 627         }
 628 
 629         fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666);
 630         if(fd > 0) {
 631             dup2(fd, 0);
 632         }
 633 
 634         /* When not in debug mode and --debug-daemons was not passed,
 635          * tie stdout/stderr to dev null so we don't see messages from orted */
 636         if (0 == mca_plm_alps_component.debug && !orte_debug_daemons_flag) {
 637             if (fd >= 0) {
 638                 if (fd != 1) {
 639                     dup2(fd,1);
 640                 }
 641                 if (fd != 2) {
 642                     dup2(fd,2);
 643                 }
 644             }
 645         }
 646 
 647         if (fd > 2) {
 648             close(fd);
 649         }
 650 
 651         /* get the alps process out of orterun's process group so that
 652            signals sent from the shell (like those resulting from
 653            cntl-c) don't get sent to alps */
 654         setpgid(0, 0);
 655 
 656 
 657         execve(exec_argv, argv, env);
 658 
 659         opal_output(0, "plm:alps:start_proc: exec failed");
 660         /* don't return - need to exit - returning would be bad -
 661            we're not in the calling process anymore */
 662         exit(1);
 663     } else {  /* parent */
 664         /* just in case, make sure that the alps process is not in our
 665         process group any more.  Stevens says always do this on both
 666         sides of the fork... */
 667         setpgid(alps_pid, alps_pid);
 668 
 669         free(exec_argv);
 670     }
 671 
 672     return ORTE_SUCCESS;
 673 }

/* [<][>][^][v][top][bottom][index][help] */