root/orte/mca/schizo/slurm/schizo_slurm.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. check_launch_environment
  2. get_remaining_time
  3. finalize

   1 /*
   2  * Copyright (c) 2016-2018 Intel, Inc.  All rights reserved.
   3  * Copyright (c) 2016      Mellanox Technologies Ltd.  All rights reserved.
   4  * $COPYRIGHT$
   5  *
   6  * Additional copyrights may follow
   7  *
   8  * $HEADER$
   9  *
  10  */
  11 
  12 #include "orte_config.h"
  13 #include "orte/types.h"
  14 #include "opal/types.h"
  15 
  16 #ifdef HAVE_UNISTD_H
  17 #include <unistd.h>
  18 #endif
  19 #include <ctype.h>
  20 
  21 #include "opal/util/argv.h"
  22 #include "opal/util/basename.h"
  23 #include "opal/util/opal_environ.h"
  24 
  25 #include "orte/runtime/orte_globals.h"
  26 #include "orte/util/name_fns.h"
  27 #include "orte/mca/schizo/base/base.h"
  28 
  29 #include "schizo_slurm.h"
  30 
  31 static orte_schizo_launch_environ_t check_launch_environment(void);
  32 static int get_remaining_time(uint32_t *timeleft);
  33 static void finalize(void);
  34 
  35 orte_schizo_base_module_t orte_schizo_slurm_module = {
  36     .check_launch_environment = check_launch_environment,
  37     .get_remaining_time = get_remaining_time,
  38     .finalize = finalize
  39 };
  40 
  41 static char **pushed_envs = NULL;
  42 static char **pushed_vals = NULL;
  43 static orte_schizo_launch_environ_t myenv;
  44 static bool myenvdefined = false;
  45 
  46 static orte_schizo_launch_environ_t check_launch_environment(void)
  47 {
  48     char *bind, *list, *ptr;
  49     int i;
  50 
  51     if (myenvdefined) {
  52         return myenv;
  53     }
  54     myenvdefined = true;
  55 
  56     /* we were only selected because SLURM was detected
  57      * and we are an app, so no need to further check
  58      * that here. Instead, see if we were direct launched
  59      * vs launched via mpirun */
  60     if (NULL != orte_process_info.my_daemon_uri) {
  61         /* nope */
  62         myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
  63         opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
  64         opal_argv_append_nosize(&pushed_vals, "pmi");
  65         /* mark that we are native */
  66         opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
  67         opal_argv_append_nosize(&pushed_vals, "NATIVE");
  68         goto setup;
  69     }
  70 
  71     /* see if we are in a SLURM allocation */
  72     if (NULL == getenv("SLURM_NODELIST")) {
  73         /* nope */
  74         myenv = ORTE_SCHIZO_UNDETERMINED;
  75         return myenv;
  76     }
  77 
  78     /* mark that we are in SLURM */
  79     opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
  80     opal_argv_append_nosize(&pushed_vals, "SLURM");
  81 
  82     /* we are in an allocation, but were we direct launched
  83      * or are we a singleton? */
  84     if (NULL == getenv("SLURM_STEP_ID")) {
  85         /* not in a job step - ensure we select the
  86          * correct things */
  87         opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
  88         opal_argv_append_nosize(&pushed_vals, "singleton");
  89         myenv = ORTE_SCHIZO_MANAGED_SINGLETON;
  90         goto setup;
  91     }
  92     myenv = ORTE_SCHIZO_DIRECT_LAUNCHED;
  93     opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
  94     opal_argv_append_nosize(&pushed_vals, "pmi");
  95 
  96     /* if we are direct launched by SLURM, then we want
  97      * to ensure that we do not override their binding
  98      * options, so set that envar */
  99     if (NULL != (bind = getenv("SLURM_CPU_BIND_TYPE"))) {
 100         if (0 == strcmp(bind, "none")) {
 101             opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"hwloc_base_binding_policy");
 102             opal_argv_append_nosize(&pushed_vals, "none");
 103             /* indicate we are externally bound so we won't try to do it ourselves */
 104             opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"orte_externally_bound");
 105             opal_argv_append_nosize(&pushed_vals, "1");
 106         } else if (bind == strstr(bind, "mask_cpu")) {
 107             /* if the bind list is all F's, then the
 108              * user didn't specify anything */
 109             if (NULL != (list = getenv("SLURM_CPU_BIND_LIST")) &&
 110                 NULL != (ptr = strchr(list, 'x'))) {
 111                 ++ptr;  // step over the 'x'
 112                 for (i=0; '\0' != *ptr; ptr++) {
 113                     if ('F' != *ptr) {
 114                         /* indicate we are externally bound */
 115                         opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"orte_externally_bound");
 116                         opal_argv_append_nosize(&pushed_vals, "1");
 117                         break;
 118                     }
 119                 }
 120             }
 121         }
 122     }
 123 
 124   setup:
 125       opal_output_verbose(1, orte_schizo_base_framework.framework_output,
 126                           "schizo:slurm DECLARED AS %s", orte_schizo_base_print_env(myenv));
 127     if (NULL != pushed_envs) {
 128         for (i=0; NULL != pushed_envs[i]; i++) {
 129             opal_setenv(pushed_envs[i], pushed_vals[i], true, &environ);
 130         }
 131     }
 132     return myenv;
 133 }
 134 
 135 static int get_remaining_time(uint32_t *timeleft)
 136 {
 137     char output[256], *cmd, *jobid, **res;
 138     FILE *fp;
 139     uint32_t tleft;
 140     size_t cnt;
 141 
 142     /* set the default */
 143     *timeleft = UINT32_MAX;
 144 
 145     if (NULL == (jobid = getenv("SLURM_JOBID"))) {
 146         return ORTE_ERR_TAKE_NEXT_OPTION;
 147     }
 148     if (0 > opal_asprintf(&cmd, "squeue -h -j %s -o %%L", jobid)) {
 149         return ORTE_ERR_OUT_OF_RESOURCE;
 150     }
 151     fp = popen(cmd, "r");
 152     if (NULL == fp) {
 153         free(cmd);
 154         return ORTE_ERR_FILE_OPEN_FAILURE;
 155     }
 156     if (NULL == fgets(output, 256, fp)) {
 157         free(cmd);
 158         pclose(fp);
 159         return ORTE_ERR_FILE_READ_FAILURE;
 160     }
 161     free(cmd);
 162     pclose(fp);
 163     /* the output is returned in a colon-delimited set of fields */
 164     res = opal_argv_split(output, ':');
 165     cnt =  opal_argv_count(res);
 166     tleft = strtol(res[cnt-1], NULL, 10); // has to be at least one field
 167     /* the next field would be minutes */
 168     if (1 < cnt) {
 169         tleft += 60 * strtol(res[cnt-2], NULL, 10);
 170     }
 171     /* next field would be hours */
 172     if (2 < cnt) {
 173         tleft += 3600 * strtol(res[cnt-3], NULL, 10);
 174     }
 175     /* next field is days */
 176     if (3 < cnt) {
 177         tleft += 24*3600 * strtol(res[cnt-4], NULL, 10);
 178     }
 179     /* if there are more fields than that, then it is infinite */
 180     if (4 < cnt) {
 181         tleft = UINT32_MAX;
 182     }
 183     opal_argv_free(res);
 184 
 185     *timeleft = tleft;
 186     return ORTE_SUCCESS;
 187 }
 188 
 189 static void finalize(void)
 190 {
 191     int i;
 192 
 193     if (NULL != pushed_envs) {
 194         for (i=0; NULL != pushed_envs[i]; i++) {
 195             opal_unsetenv(pushed_envs[i], &environ);
 196         }
 197         opal_argv_free(pushed_envs);
 198         opal_argv_free(pushed_vals);
 199     }
 200 }

/* [<][>][^][v][top][bottom][index][help] */