root/orte/tools/orterun/orterun.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. launched
  2. completed
  3. orterun

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2008 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006-2017 Cisco Systems, Inc.  All rights reserved
  14  * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
  15  * Copyright (c) 2007-2017 Los Alamos National Security, LLC.  All rights
  16  *                         reserved.
  17  * Copyright (c) 2013-2018 Intel, Inc.  All rights reserved.
  18  * Copyright (c) 2015      Research Organization for Information Science
  19  *                         and Technology (RIST). All rights reserved.
  20  * $COPYRIGHT$
  21  *
  22  * Additional copyrights may follow
  23  *
  24  * $HEADER$
  25  */
  26 
  27 #include "orte_config.h"
  28 #include "orte/constants.h"
  29 
  30 #include <string.h>
  31 #include <stdio.h>
  32 #include <stdlib.h>
  33 #ifdef HAVE_STRINGS_H
  34 #include <strings.h>
  35 #endif  /* HAVE_STRINGS_H */
  36 #ifdef HAVE_UNISTD_H
  37 #include <unistd.h>
  38 #endif
  39 #ifdef HAVE_SYS_PARAM_H
  40 #include <sys/param.h>
  41 #endif
  42 #include <errno.h>
  43 #include <signal.h>
  44 #include <ctype.h>
  45 #ifdef HAVE_SYS_TYPES_H
  46 #include <sys/types.h>
  47 #endif  /* HAVE_SYS_TYPES_H */
  48 #ifdef HAVE_SYS_WAIT_H
  49 #include <sys/wait.h>
  50 #endif  /* HAVE_SYS_WAIT_H */
  51 #ifdef HAVE_SYS_TIME_H
  52 #include <sys/time.h>
  53 #endif  /* HAVE_SYS_TIME_H */
  54 #include <fcntl.h>
  55 #ifdef HAVE_SYS_STAT_H
  56 #include <sys/stat.h>
  57 #endif
  58 
  59 #include "opal/mca/event/event.h"
  60 #include "opal/mca/installdirs/installdirs.h"
  61 #include "opal/mca/hwloc/base/base.h"
  62 #include "opal/mca/base/base.h"
  63 #include "opal/util/argv.h"
  64 #include "opal/util/output.h"
  65 #include "opal/util/basename.h"
  66 #include "opal/util/cmd_line.h"
  67 #include "opal/util/opal_environ.h"
  68 #include "opal/util/opal_getcwd.h"
  69 #include "opal/util/show_help.h"
  70 #include "opal/util/fd.h"
  71 #include "opal/sys/atomic.h"
  72 #if OPAL_ENABLE_FT_CR == 1
  73 #include "opal/runtime/opal_cr.h"
  74 #endif
  75 
  76 #include "opal/version.h"
  77 #include "opal/runtime/opal.h"
  78 #include "opal/runtime/opal_info_support.h"
  79 #include "opal/util/os_path.h"
  80 #include "opal/util/path.h"
  81 #include "opal/class/opal_pointer_array.h"
  82 #include "opal/dss/dss.h"
  83 
  84 #include "orte/mca/odls/odls.h"
  85 #include "orte/mca/rml/rml.h"
  86 #include "orte/mca/state/state.h"
  87 #include "orte/util/proc_info.h"
  88 #include "orte/util/session_dir.h"
  89 #include "orte/util/show_help.h"
  90 #include "orte/util/threads.h"
  91 
  92 #include "orte/runtime/runtime.h"
  93 #include "orte/runtime/orte_globals.h"
  94 #include "orte/runtime/orte_wait.h"
  95 #include "orte/runtime/orte_locks.h"
  96 #include "orte/runtime/orte_quit.h"
  97 
  98 /* ensure I can behave like a daemon */
  99 #include "orte/orted/orted.h"
 100 #include "orte/orted/orted_submit.h"
 101 #include "orterun.h"
 102 
 103 /* local type */
 104  typedef struct {
 105      int status;
 106      volatile bool active;
 107      orte_job_t *jdata;
 108  } orte_submit_status_t;
 109 
 110 
 111 static void launched(int index, orte_job_t *jdata, int ret, void *cbdata)
 112 {
 113     orte_submit_status_t *launchst = (orte_submit_status_t*)cbdata;
 114     launchst->status = ret;
 115     ORTE_UPDATE_EXIT_STATUS(ret);
 116     OBJ_RETAIN(jdata);
 117     launchst->jdata = jdata;
 118     launchst->active = false;
 119 }
 120 static void completed(int index, orte_job_t *jdata, int ret, void *cbdata)
 121 {
 122     orte_submit_status_t *completest = (orte_submit_status_t*)cbdata;
 123     completest->status = ret;
 124     ORTE_UPDATE_EXIT_STATUS(ret);
 125     OBJ_RETAIN(jdata);
 126     completest->jdata = jdata;
 127     completest->active = false;
 128 }
 129 
 130 int orterun(int argc, char *argv[])
 131 {
 132     orte_submit_status_t launchst, completest;
 133 
 134     /* orte_submit_init() will also check if the user is running as
 135        root (and may issue a warning/exit). */
 136     if (ORTE_SUCCESS != orte_submit_init(argc, argv, NULL)) {
 137         exit(1);
 138     }
 139 
 140     /* check if we are running as root - if we are, then only allow
 141      * us to proceed if the allow-run-as-root flag was given. Otherwise,
 142      * exit with a giant warning flag
 143      */
 144     if (0 == geteuid() && !orte_cmd_options.run_as_root) {
 145         fprintf(stderr, "--------------------------------------------------------------------------\n");
 146         if (NULL != orte_cmd_options.help) {
 147             fprintf(stderr, "%s cannot provide the help message when run as root.\n", orte_basename);
 148         } else {
 149             /* show_help is not yet available, so print an error manually */
 150             fprintf(stderr, "%s has detected an attempt to run as root.\n", orte_basename);
 151         }
 152         fprintf(stderr, "Running at root is *strongly* discouraged as any mistake (e.g., in\n");
 153         fprintf(stderr, "defining TMPDIR) or bug can result in catastrophic damage to the OS\n");
 154         fprintf(stderr, "file system, leaving your system in an unusable state.\n\n");
 155         fprintf(stderr, "You can override this protection by adding the --allow-run-as-root\n");
 156         fprintf(stderr, "option to your cmd line. However, we reiterate our strong advice\n");
 157         fprintf(stderr, "against doing so - please do so at your own risk.\n");
 158         fprintf(stderr, "--------------------------------------------------------------------------\n");
 159         exit(1);
 160     }
 161 
 162     /* setup to listen for commands sent specifically to me, even though I would probably
 163      * be the one sending them! Unfortunately, since I am a participating daemon,
 164      * there are times I need to send a command to "all daemons", and that means *I* have
 165      * to receive it too
 166      */
 167     orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
 168                             ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
 169 
 170     /* if the user just wants us to terminate a DVM, then do so */
 171     if (orte_cmd_options.terminate_dvm) {
 172         if (ORTE_ERR_OP_IN_PROGRESS != orte_submit_halt()) {
 173             ORTE_UPDATE_EXIT_STATUS(1);
 174             goto DONE;
 175         }
 176         while (orte_event_base_active) {
 177            opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
 178         }
 179         /* we are terminated when the DVM master shuts down, thereby
 180          * closing our connection to them. This looks like an error,
 181          * but is not - so correct our exit status here */
 182         orte_exit_status = 0;
 183         goto DONE;
 184     } else {
 185         /* spawn the job and its daemons */
 186         memset(&launchst, 0, sizeof(launchst));
 187         memset(&completest, 0, sizeof(completest));
 188         launchst.active = true;
 189         completest.active = true;
 190         if (ORTE_SUCCESS != orte_submit_job(argv, NULL,
 191                                             launched, &launchst,
 192                                             completed, &completest)) {
 193             ORTE_UPDATE_EXIT_STATUS(1);
 194             goto DONE;
 195         }
 196     }
 197 
 198     // wait for response and unpack the status, jobid
 199     while (orte_event_base_active && launchst.active) {
 200         opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
 201     }
 202     ORTE_ACQUIRE_OBJECT(orte_event_base_active);
 203     if (orte_debug_flag) {
 204         opal_output(0, "Job %s has launched",
 205                    (NULL == launchst.jdata) ? "UNKNOWN" : ORTE_JOBID_PRINT(launchst.jdata->jobid));
 206     }
 207     if (!orte_event_base_active || ORTE_SUCCESS != launchst.status) {
 208         goto DONE;
 209     }
 210 
 211     while (orte_event_base_active && completest.active) {
 212         opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
 213     }
 214     ORTE_ACQUIRE_OBJECT(orte_event_base_active);
 215 
 216     if (ORTE_PROC_IS_HNP) {
 217         /* ensure all local procs are dead */
 218         orte_odls.kill_local_procs(NULL);
 219     }
 220 
 221  DONE:
 222     /* cleanup and leave */
 223     orte_submit_finalize();
 224     orte_finalize();
 225     orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
 226     /* cleanup the process info */
 227     orte_proc_info_finalize();
 228 
 229     if (orte_debug_flag) {
 230         fprintf(stderr, "exiting with status %d\n", orte_exit_status);
 231     }
 232     exit(orte_exit_status);
 233 }

/* [<][>][^][v][top][bottom][index][help] */