root/orte/mca/odls/base/odls_base_frame.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. orte_odls_base_register
  2. orte_odls_base_harvest_threads
  3. orte_odls_base_start_threads
  4. orte_odls_base_close
  5. orte_odls_base_open
  6. launch_local_const
  7. launch_local_dest
  8. sccon
  9. scdes

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2011 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2010-2011 Oracle and/or its affiliates.  All rights reserved.
  13  * Copyright (c) 2011-2017 Cisco Systems, Inc.  All rights reserved
  14  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
  15  *                         All rights reserved.
  16  * Copyright (c) 2014-2017 Research Organization for Information Science
  17  *                         and Technology (RIST). All rights reserved.
  18  * Copyright (c) 2017-2018 Intel, Inc.  All rights reserved.
  19  * $COPYRIGHT$
  20  *
  21  * Additional copyrights may follow
  22  *
  23  * $HEADER$
  24  */
  25 
  26 
  27 #include "orte_config.h"
  28 #include "orte/constants.h"
  29 
  30 #include <string.h>
  31 #include <signal.h>
  32 
  33 #include "opal/class/opal_ring_buffer.h"
  34 #include "orte/mca/mca.h"
  35 #include "opal/mca/base/base.h"
  36 #include "opal/mca/hwloc/hwloc-internal.h"
  37 #include "opal/runtime/opal_progress_threads.h"
  38 #include "opal/util/output.h"
  39 #include "opal/util/path.h"
  40 #include "opal/util/argv.h"
  41 #include "opal/util/printf.h"
  42 
  43 #include "orte/mca/errmgr/errmgr.h"
  44 #include "orte/mca/ess/ess.h"
  45 #include "orte/mca/plm/plm_types.h"
  46 #include "orte/runtime/orte_globals.h"
  47 #include "orte/util/name_fns.h"
  48 #include "orte/util/parse_options.h"
  49 #include "orte/util/show_help.h"
  50 #include "orte/util/threads.h"
  51 
  52 #include "orte/mca/odls/base/odls_private.h"
  53 #include "orte/mca/odls/base/base.h"
  54 
  55 
  56 /*
  57  * The following file was created by configure.  It contains extern
  58  * statements and the definition of an array of pointers to each
  59  * component's public mca_base_component_t struct.
  60  */
  61 
  62 #include "orte/mca/odls/base/static-components.h"
  63 
  64 /*
  65  * Instantiate globals: the global orte_odls module struct, zero-initialized
      * here.  Nothing in this file assigns to it afterwards.
  66  */
  67 orte_odls_base_module_t orte_odls = {0};
  68 
  69 /*
  70  * Framework global variables, zero-initialized here.  Within this file the
      * MCA tunables receive their defaults in orte_odls_base_register(), the
      * lock and the xterm state are set up in orte_odls_base_open(), and
      * ev_bases / ev_threads / next_base are managed by
      * orte_odls_base_start_threads() and orte_odls_base_harvest_threads().
  71  */
  72 orte_odls_globals_t orte_odls_globals = {0};
  73 
  74 static int orte_odls_base_register(mca_base_register_flag_t flags)
  75 {
  76     orte_odls_globals.timeout_before_sigkill = 1;
  77     (void) mca_base_var_register("orte", "odls", "base", "sigkill_timeout",
  78                                  "Time to wait for a process to die after issuing a kill signal to it",
  79                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
  80                                  OPAL_INFO_LVL_9,
  81                                  MCA_BASE_VAR_SCOPE_READONLY,
  82                                  &orte_odls_globals.timeout_before_sigkill);
  83 
  84     orte_odls_globals.max_threads = 4;
  85     (void) mca_base_var_register("orte", "odls", "base", "max_threads",
  86                                  "Maximum number of threads to use for spawning local procs",
  87                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
  88                                  OPAL_INFO_LVL_9,
  89                                  MCA_BASE_VAR_SCOPE_READONLY,
  90                                  &orte_odls_globals.max_threads);
  91 
  92     orte_odls_globals.num_threads = -1;
  93     (void) mca_base_var_register("orte", "odls", "base", "num_threads",
  94                                  "Specific number of threads to use for spawning local procs",
  95                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
  96                                  OPAL_INFO_LVL_9,
  97                                  MCA_BASE_VAR_SCOPE_READONLY,
  98                                  &orte_odls_globals.num_threads);
  99 
 100     orte_odls_globals.cutoff = 32;
 101     (void) mca_base_var_register("orte", "odls", "base", "cutoff",
 102                                  "Minimum number of local procs before using thread pool for spawn",
 103                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 104                                  OPAL_INFO_LVL_9,
 105                                  MCA_BASE_VAR_SCOPE_READONLY,
 106                                  &orte_odls_globals.cutoff);
 107 
 108     orte_odls_globals.signal_direct_children_only = false;
 109     (void) mca_base_var_register("orte", "odls", "base", "signal_direct_children_only",
 110                                  "Whether to restrict signals (e.g., SIGTERM) to direct children, or "
 111                                  "to apply them as well to any children spawned by those processes",
 112                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
 113                                  OPAL_INFO_LVL_9,
 114                                  MCA_BASE_VAR_SCOPE_READONLY,
 115                                  &orte_odls_globals.signal_direct_children_only);
 116 
 117     return ORTE_SUCCESS;
 118 }
 119 
 120 void orte_odls_base_harvest_threads(void)
 121 {
 122     int i;
 123 
 124     ORTE_ACQUIRE_THREAD(&orte_odls_globals.lock);
 125     if (0 < orte_odls_globals.num_threads) {
 126         /* stop the progress threads */
 127         if (NULL != orte_odls_globals.ev_threads) {
 128             for (i=0; NULL != orte_odls_globals.ev_threads[i]; i++) {
 129                 opal_progress_thread_finalize(orte_odls_globals.ev_threads[i]);
 130             }
 131         }
 132         free(orte_odls_globals.ev_bases);
 133         orte_odls_globals.ev_bases = (opal_event_base_t**)malloc(sizeof(opal_event_base_t*));
 134         /* use the default event base */
 135         orte_odls_globals.ev_bases[0] = orte_event_base;
 136         orte_odls_globals.num_threads = 0;
 137         if (NULL != orte_odls_globals.ev_threads) {
 138             opal_argv_free(orte_odls_globals.ev_threads);
 139             orte_odls_globals.ev_threads = NULL;
 140         }
 141     }
 142     ORTE_RELEASE_THREAD(&orte_odls_globals.lock);
 143 }
 144 
 145 void orte_odls_base_start_threads(orte_job_t *jdata)
 146 {
 147     int i;
 148     char *tmp;
 149 
 150     ORTE_ACQUIRE_THREAD(&orte_odls_globals.lock);
 151     /* only do this once */
 152     if (NULL != orte_odls_globals.ev_threads) {
 153         ORTE_RELEASE_THREAD(&orte_odls_globals.lock);
 154         return;
 155     }
 156 
 157     /* setup the pool of worker threads */
 158     orte_odls_globals.ev_threads = NULL;
 159     orte_odls_globals.next_base = 0;
 160     if (-1 == orte_odls_globals.num_threads) {
 161         if ((int)jdata->num_local_procs < orte_odls_globals.cutoff) {
 162             /* do not use any dedicated odls thread */
 163             orte_odls_globals.num_threads = 0;
 164         } else {
 165             /* user didn't specify anything, so default to some fraction of
 166              * the number of local procs, capping it at the max num threads
 167              * parameter value. */
 168             orte_odls_globals.num_threads = jdata->num_local_procs / 8;
 169             if (0 == orte_odls_globals.num_threads) {
 170                 orte_odls_globals.num_threads = 1;
 171             } else if (orte_odls_globals.max_threads < orte_odls_globals.num_threads) {
 172                 orte_odls_globals.num_threads = orte_odls_globals.max_threads;
 173             }
 174         }
 175     }
 176     if (0 == orte_odls_globals.num_threads) {
 177         orte_odls_globals.ev_bases = (opal_event_base_t**)malloc(sizeof(opal_event_base_t*));
 178         /* use the default event base */
 179         orte_odls_globals.ev_bases[0] = orte_event_base;
 180     } else {
 181         orte_odls_globals.ev_bases =
 182             (opal_event_base_t**)malloc(orte_odls_globals.num_threads * sizeof(opal_event_base_t*));
 183         for (i=0; i < orte_odls_globals.num_threads; i++) {
 184             opal_asprintf(&tmp, "ORTE-ODLS-%d", i);
 185             orte_odls_globals.ev_bases[i] = opal_progress_thread_init(tmp);
 186             opal_argv_append_nosize(&orte_odls_globals.ev_threads, tmp);
 187             free(tmp);
 188         }
 189     }
 190     ORTE_RELEASE_THREAD(&orte_odls_globals.lock);
 191 }
 192 
 193 static int orte_odls_base_close(void)
 194 {
 195     int i;
 196     orte_proc_t *proc;
 197     opal_list_item_t *item;
 198 
 199     /* cleanup ODLS globals */
 200     while (NULL != (item = opal_list_remove_first(&orte_odls_globals.xterm_ranks))) {
 201         OBJ_RELEASE(item);
 202     }
 203     OBJ_DESTRUCT(&orte_odls_globals.xterm_ranks);
 204 
 205     /* cleanup the global list of local children and job data */
 206     for (i=0; i < orte_local_children->size; i++) {
 207         if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
 208             OBJ_RELEASE(proc);
 209         }
 210     }
 211     OBJ_RELEASE(orte_local_children);
 212 
 213     orte_odls_base_harvest_threads();
 214 
 215     ORTE_DESTRUCT_LOCK(&orte_odls_globals.lock);
 216 
 217     return mca_base_framework_components_close(&orte_odls_base_framework, NULL);
 218 }
 219 
 220 /**
 221  * Function for finding and opening either all MCA components, or the one
 222  * that was specifically requested via a MCA parameter.
 223  */
 224 static int orte_odls_base_open(mca_base_open_flag_t flags)
 225 {
 226     char **ranks=NULL, *tmp;
 227     int rc, i, rank;
 228     orte_namelist_t *nm;
 229     bool xterm_hold;
 230     sigset_t unblock;
 231 
 232     ORTE_CONSTRUCT_LOCK(&orte_odls_globals.lock);
 233     orte_odls_globals.lock.active = false;   // start with nobody having the thread
 234 
 235     /* initialize the global array of local children */
 236     orte_local_children = OBJ_NEW(opal_pointer_array_t);
 237     if (OPAL_SUCCESS != (rc = opal_pointer_array_init(orte_local_children,
 238                                                       1,
 239                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
 240                                                       1))) {
 241         ORTE_ERROR_LOG(rc);
 242         return rc;
 243     }
 244 
 245     /* initialize ODLS globals */
 246     OBJ_CONSTRUCT(&orte_odls_globals.xterm_ranks, opal_list_t);
 247     orte_odls_globals.xtermcmd = NULL;
 248 
 249     /* ensure that SIGCHLD is unblocked as we need to capture it */
 250     if (0 != sigemptyset(&unblock)) {
 251         return ORTE_ERROR;
 252     }
 253     if (0 != sigaddset(&unblock, SIGCHLD)) {
 254         return ORTE_ERROR;
 255     }
 256     if (0 != sigprocmask(SIG_UNBLOCK, &unblock, NULL)) {
 257         return ORTE_ERR_NOT_SUPPORTED;
 258     }
 259 
 260     /* check if the user requested that we display output in xterms */
 261     if (NULL != orte_xterm) {
 262         /* construct a list of ranks to be displayed */
 263         xterm_hold = false;
 264         orte_util_parse_range_options(orte_xterm, &ranks);
 265         for (i=0; i < opal_argv_count(ranks); i++) {
 266             if (0 == strcmp(ranks[i], "BANG")) {
 267                 xterm_hold = true;
 268                 continue;
 269             }
 270             nm = OBJ_NEW(orte_namelist_t);
 271             rank = strtol(ranks[i], NULL, 10);
 272             if (-1 == rank) {
 273                 /* wildcard */
 274                 nm->name.vpid = ORTE_VPID_WILDCARD;
 275             } else if (rank < 0) {
 276                 /* error out on bozo case */
 277                 orte_show_help("help-orte-odls-base.txt",
 278                                "orte-odls-base:xterm-neg-rank",
 279                                true, rank);
 280                 return ORTE_ERROR;
 281             } else {
 282                 /* we can't check here if the rank is out of
 283                  * range as we don't yet know how many ranks
 284                  * will be in the job - we'll check later
 285                  */
 286                 nm->name.vpid = rank;
 287             }
 288             opal_list_append(&orte_odls_globals.xterm_ranks, &nm->super);
 289         }
 290         opal_argv_free(ranks);
 291         /* construct the xtermcmd */
 292         orte_odls_globals.xtermcmd = NULL;
 293         tmp = opal_find_absolute_path("xterm");
 294         if (NULL == tmp) {
 295             return ORTE_ERROR;
 296         }
 297         opal_argv_append_nosize(&orte_odls_globals.xtermcmd, tmp);
 298         free(tmp);
 299         opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-T");
 300         opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "save");
 301         if (xterm_hold) {
 302             opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-hold");
 303         }
 304         opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e");
 305     }
 306 
 307      /* Open up all available components */
 308     return mca_base_framework_components_open(&orte_odls_base_framework, flags);
 309 }
 310 
 /* Declare the framework: project "orte", framework "odls", its description
  * string, the register/open/close functions defined above in this file, the
  * static component list pulled in from static-components.h, and a trailing
  * 0 argument.  Presumably this is what defines the orte_odls_base_framework
  * object used by the open and close functions - TODO confirm against the
  * macro definition. */
 311 MCA_BASE_FRAMEWORK_DECLARE(orte, odls, "ORTE Daemon Launch Subsystem",
 312                            orte_odls_base_register, orte_odls_base_open, orte_odls_base_close,
 313                            mca_odls_base_static_components, 0);
 314 
 315 static void launch_local_const(orte_odls_launch_local_t *ptr)
 316 {
 317     ptr->ev = opal_event_alloc();
 318     ptr->job = ORTE_JOBID_INVALID;
 319     ptr->fork_local = NULL;
 320     ptr->retries = 0;
 321 }
 322 static void launch_local_dest(orte_odls_launch_local_t *ptr)
 323 {
 324     opal_event_free(ptr->ev);
 325 }
 326 OBJ_CLASS_INSTANCE(orte_odls_launch_local_t,
 327                    opal_object_t,
 328                    launch_local_const,
 329                    launch_local_dest);
 330 
 331 static void sccon(orte_odls_spawn_caddy_t *p)
 332 {
 333     memset(&p->opts, 0, sizeof(orte_iof_base_io_conf_t));
 334     p->cmd = NULL;
 335     p->wdir = NULL;
 336     p->argv = NULL;
 337     p->env = NULL;
 338 }
 339 static void scdes(orte_odls_spawn_caddy_t *p)
 340 {
 341     if (NULL != p->cmd) {
 342         free(p->cmd);
 343     }
 344     if (NULL != p->wdir) {
 345         free(p->wdir);
 346     }
 347     if (NULL != p->argv) {
 348         opal_argv_free(p->argv);
 349     }
 350     if (NULL != p->env) {
 351         opal_argv_free(p->env);
 352     }
 353 }
 354 OBJ_CLASS_INSTANCE(orte_odls_spawn_caddy_t,
 355                    opal_object_t,
 356                    sccon, scdes);

/* [<][>][^][v][top][bottom][index][help] */