root/orte/tools/orte-server/orte-server.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. main
  2. shutdown_callback

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2006 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2007-2013 Cisco Systems, Inc.  All rights reserved.
  14  * Copyright (c) 2007-2016 Los Alamos National Security, LLC.  All rights
  15  *                         reserved.
  16  * Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
  17  * $COPYRIGHT$
  18  *
  19  * Additional copyrights may follow
  20  *
  21  * $HEADER$
  22  */
  23 
  24 #include "orte_config.h"
  25 #include "orte/constants.h"
  26 
  27 #include <string.h>
  28 
  29 #include <stdio.h>
  30 #include <ctype.h>
  31 #ifdef HAVE_UNISTD_H
  32 #include <unistd.h>
  33 #endif
  34 #ifdef HAVE_NETDB_H
  35 #include <netdb.h>
  36 #endif
  37 #ifdef HAVE_SYS_PARAM_H
  38 #include <sys/param.h>
  39 #endif
  40 #include <fcntl.h>
  41 #include <errno.h>
  42 #include <signal.h>
  43 
  44 
  45 #include "opal/mca/event/event.h"
  46 #include "opal/mca/base/base.h"
  47 #include "opal/util/cmd_line.h"
  48 #include "opal/util/output.h"
  49 #include "opal/util/show_help.h"
  50 #include "opal/util/daemon_init.h"
  51 #include "opal/runtime/opal.h"
  52 #include "opal/runtime/opal_cr.h"
  53 
  54 
  55 #include "orte/util/name_fns.h"
  56 #include "orte/util/proc_info.h"
  57 #include "orte/util/threads.h"
  58 #include "orte/mca/errmgr/errmgr.h"
  59 #include "orte/mca/oob/base/base.h"
  60 #include "orte/mca/rml/rml.h"
  61 #include "orte/orted/orted.h"
  62 
  63 #include "orte/runtime/runtime.h"
  64 #include "orte/runtime/orte_globals.h"
  65 #include "orte/runtime/orte_data_server.h"
  66 
  67 /*
  68  * Globals
  69  */
  70 
  71 static opal_event_t term_handler;
  72 static opal_event_t int_handler;
  73 
  74 static void shutdown_callback(int fd, short flags, void *arg);
  75 
  76 static bool help=false;
  77 static bool debug=false;
  78 static bool no_daemonize=false;
  79 static char *report_uri=NULL;
  80 
  81 /*
  82  * define the context table for obtaining parameters
  83  */
  84 opal_cmd_line_init_t orte_server_cmd_line_opts[] = {
  85     /* Various "obvious" options */
  86     { NULL, 'h', NULL, "help", 0,
  87       &help, OPAL_CMD_LINE_TYPE_BOOL,
  88       "This help message" },
  89 
  90     { NULL, 'd', NULL, "debug", 0,
  91       &debug, OPAL_CMD_LINE_TYPE_BOOL,
  92         "Debug the Open MPI server" },
  93 
  94     { "orte_no_daemonize", '\0', NULL, "no-daemonize", 0,
  95       &no_daemonize, OPAL_CMD_LINE_TYPE_BOOL,
  96       "Don't daemonize into the background" },
  97 
  98     { NULL, 'r', NULL, "report-uri", 1,
  99       &report_uri, OPAL_CMD_LINE_TYPE_STRING,
 100       "Report the server's uri on stdout [-], stderr [+], or a file [anything else]"},
 101 
 102     /* End of list */
 103     { NULL, '\0', NULL, NULL, 0,
 104       NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
 105 };
 106 
 107 int main(int argc, char *argv[])
 108 {
 109     int ret = 0;
 110     opal_cmd_line_t *cmd_line = NULL;
 111     char *rml_uri;
 112 #if OPAL_ENABLE_FT_CR == 1
 113     char * tmp_env_var = NULL;
 114 #endif
 115 
 116     /* init enough of opal to process cmd lines */
 117     if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
 118         fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
 119         exit(1);
 120     }
 121 
 122     /* setup to check common command line options that just report and die */
 123     cmd_line = OBJ_NEW(opal_cmd_line_t);
 124     opal_cmd_line_create(cmd_line, orte_server_cmd_line_opts);
 125     mca_base_cmd_line_setup(cmd_line);
 126     if (OPAL_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, false,
 127                                                    argc, argv))) {
 128         if (OPAL_ERR_SILENT != ret) {
 129             fprintf(stderr, "%s: command line error (%s)\n", argv[0],
 130                     opal_strerror(ret));
 131         }
 132         return 1;
 133     }
 134 
 135     /* check for help request */
 136     if (help) {
 137         char *str, *args = NULL;
 138         args = opal_cmd_line_get_usage_msg(cmd_line);
 139         str = opal_show_help_string("help-orte-server.txt",
 140                                     "orteserver:usage", false,
 141                                     argv[0], args);
 142         if (NULL != str) {
 143             printf("%s", str);
 144             free(str);
 145         }
 146         free(args);
 147         /* If we show the help message, that should be all we do */
 148         return 0;
 149     }
 150 
 151     /*
 152      * Since this process can now handle MCA/GMCA parameters, make sure to
 153      * process them.
 154      */
 155     mca_base_cmd_line_process_args(cmd_line, &environ, &environ);
 156 
 157     /* if debug is set, then set orte_debug_flag so that the data server
 158      * code will output
 159      */
 160     if (debug) {
 161         putenv(OPAL_MCA_PREFIX"orte_debug=1");
 162     }
 163 
 164     /* detach from controlling terminal
 165      * otherwise, remain attached so output can get to us
 166      */
 167     if(debug == false &&
 168        no_daemonize == false) {
 169         opal_daemon_init(NULL);
 170     }
 171 
 172 #if OPAL_ENABLE_FT_CR == 1
 173     /* Disable the checkpoint notification routine for this
 174      * tool. As we will never need to checkpoint this tool.
 175      * Note: This must happen before opal_init().
 176      */
 177     opal_cr_set_enabled(false);
 178 
 179     /* Select the none component, since we don't actually use a checkpointer */
 180     (void) mca_base_var_env_name("crs", &tmp_env_var);
 181     opal_setenv(tmp_env_var,
 182                 "none",
 183                 true, &environ);
 184     free(tmp_env_var);
 185     tmp_env_var = NULL;
 186 
 187     /* Mark as a tool program */
 188     (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
 189     opal_setenv(tmp_env_var,
 190                 "1",
 191                 true, &environ);
 192     free(tmp_env_var);
 193 #endif
 194 
 195     /* don't want session directories */
 196     orte_create_session_dirs = false;
 197 
 198     /* Perform the standard init, but flag that we are an HNP */
 199     if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
 200         fprintf(stderr, "orte-server: failed to initialize -- aborting\n");
 201         exit(1);
 202     }
 203 
 204     /* report out our URI, if we were requested to do so, using syntax
 205      * proposed in an email thread by Jeff Squyres
 206      */
 207     if (NULL != report_uri) {
 208         orte_oob_base_get_addr(&rml_uri);
 209         if (0 == strcmp(report_uri, "-")) {
 210             /* if '-', then output to stdout */
 211             printf("%s\n", rml_uri);
 212         } else if (0 == strcmp(report_uri, "+")) {
 213             /* if '+', output to stderr */
 214             fprintf(stderr, "%s\n", rml_uri);
 215         } else {
 216             /* treat it as a filename and output into it */
 217             FILE *fp;
 218             fp = fopen(report_uri, "w");
 219             if (NULL == fp) {
 220                 fprintf(stderr, "orte-server: failed to open designated file %s -- aborting\n", report_uri);
 221                 orte_finalize();
 222                 exit(1);
 223             }
 224             fprintf(fp, "%s\n", rml_uri);
 225             fclose(fp);
 226         }
 227         free(rml_uri);
 228     }
 229 
 230     /* setup the data server to listen for commands */
 231     if (ORTE_SUCCESS != (ret = orte_data_server_init())) {
 232         fprintf(stderr, "orte-server: failed to start data server -- aborting\n");
 233         orte_finalize();
 234         exit(1);
 235     }
 236 
 237     /* setup to listen for commands sent specifically to me */
 238     orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
 239                             ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
 240 
 241     /* Set signal handlers to catch kill signals so we can properly clean up
 242      * after ourselves.
 243      */
 244     opal_event_set(orte_event_base, &term_handler, SIGTERM, OPAL_EV_SIGNAL,
 245                    shutdown_callback, NULL);
 246     opal_event_add(&term_handler, NULL);
 247     opal_event_set(orte_event_base, &int_handler, SIGINT, OPAL_EV_SIGNAL,
 248                    shutdown_callback, NULL);
 249     opal_event_add(&int_handler, NULL);
 250 
 251     /* We actually do *not* want the server to voluntarily yield() the
 252        processor more than necessary.  The server already blocks when
 253        it is doing nothing, so it doesn't use any more CPU cycles than
 254        it should; but when it *is* doing something, we do not want it
 255        to be unnecessarily delayed because it voluntarily yielded the
 256        processor in the middle of its work.
 257 
 258        For example: when a message arrives at the server, we want the
 259        OS to wake up the server in a timely fashion (which most OS's
 260        seem good about doing) and then we want the server to process
 261        the message as fast as possible.  If the server yields and lets
 262        aggressive MPI applications get the processor back, it may be a
 263        long time before the OS schedules the server to run again
 264        (particularly if there is no IO event to wake it up).  Hence,
 265        publish and lookup (for example) may be significantly delayed
 266        before being delivered to MPI processes, which can be
 267        problematic in some scenarios (e.g., COMM_SPAWN). */
 268     opal_progress_set_yield_when_idle(false);
 269 
 270     /* Change the default behavior of libevent such that we want to
 271        continually block rather than blocking for the default timeout
 272        and then looping around the progress engine again.  There
 273        should be nothing in the server that cannot block in libevent
 274        until "something" happens (i.e., there's no need to keep
 275        cycling through progress because the only things that should
 276        happen will happen in libevent).  This is a minor optimization,
 277        but what the heck... :-) */
 278     opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);
 279 
 280     if (debug) {
 281         opal_output(0, "%s orte-server: up and running!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
 282     }
 283 
 284     /* wait to hear we are done */
 285     while (orte_event_base_active) {
 286         opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
 287     }
 288     ORTE_ACQUIRE_OBJECT(orte_event_base_active);
 289 
 290     /* should never get here, but if we do... */
 291 
 292     /* Finalize and clean up ourselves */
 293     orte_finalize();
 294     return orte_exit_status;
 295 }
 296 
 297 static void shutdown_callback(int fd, short flags, void *arg)
 298 {
 299     if (debug) {
 300         opal_output(0, "%s orte-server: finalizing", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
 301     }
 302 
 303     /* Finalize and clean up ourselves */
 304     orte_finalize();
 305     exit(orte_exit_status);
 306 }

/* [<][>][^][v][top][bottom][index][help] */