root/orte/mca/ras/alps/ras_alps_component.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. prep_job_id
  2. get_res_id
  3. ras_alps_register
  4. ras_alps_open
  5. orte_ras_alps_component_query
  6. orte_ras_alps_get_appinfo_attempts

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2005 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2008      UT-Battelle, LLC
  14  * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
  15  *                         reserved.
  16  * Copyright (c) 2018      Intel, Inc.  All rights reserved.
  17  * $COPYRIGHT$
  18  *
  19  * Additional copyrights may follow
  20  *
  21  * $HEADER$
  22  */
  23 
  24 #include "orte_config.h"
  25 
  26 #include "opal/mca/base/base.h"
  27 #include "opal/util/output.h"
  28 #include "orte/constants.h"
  29 #include "orte/util/proc_info.h"
  30 #include "ras_alps.h"
  31 
  32 #include <ctype.h>
  33 
  34 /* Local variables */
  35 static int param_priority;
  36 static int ras_alps_read_attempts;
  37 
  38 /* Local functions */
  39 static int ras_alps_register(void);
  40 static int ras_alps_open(void);
  41 static int orte_ras_alps_component_query(mca_base_module_t **module,
  42                                          int *priority);
  43 unsigned long int orte_ras_alps_res_id = 0UL;
  44 char *ras_alps_apstat_cmd = NULL;
  45 
  46 orte_ras_base_component_t mca_ras_alps_component = {
  47     /* First, the mca_base_component_t struct containing meta information about
  48      * the component itself
  49      * */
  50     .base_version = {
  51         ORTE_RAS_BASE_VERSION_2_0_0,
  52 
  53         /* Component name and version */
  54         .mca_component_name = "alps",
  55         MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
  56                               ORTE_RELEASE_VERSION),
  57 
  58         /* Component open and close functions */
  59         .mca_open_component = ras_alps_open,
  60         .mca_query_component = orte_ras_alps_component_query,
  61         .mca_register_component_params = ras_alps_register,
  62     },
  63     .base_data = {
  64         /* The component is checkpoint ready */
  65         MCA_BASE_METADATA_PARAM_CHECKPOINT
  66     },
  67 };
  68 
  69 /* simple function used to strip off characters on and after a period. NULL
  70  * will be returned upon failure.  Otherwise, a "prepped" string will be
  71  * returned.  The caller is responsible for freeing returned resources.
  72  * for example: if jid is 138295.sdb, then 138295 will be returned.
  73  */
  74 static char *
  75 prep_job_id(const char *jid)
  76 {
  77     char *tmp = strdup(jid);
  78     char *tmp2 = NULL;
  79 
  80     if (NULL == tmp) {
  81         /* out of resources */
  82         return NULL;
  83     }
  84     if (NULL != (tmp2 = strchr(tmp, '.'))) {
  85         *tmp2 = '\0';
  86     }
  87     return tmp;
  88 }
  89 
  90 /* this function replicates some of the id setting functionality found in
  91  * ras-alps-command.sh. we wanted the ability to just "mpirun" the application
  92  * without having to set an environment variable
  93  */
  94 static unsigned long int
  95 get_res_id(void)
  96 {
  97     char *apstat_cmd;
  98     char *id = NULL;
  99     char read_buf[512];
 100     FILE *apstat_fp = NULL;
 101     /* zero is considered to be an invalid res id */
 102     unsigned long jid = 0;
 103     int ret;
 104 
 105     if (NULL != (id = getenv("BATCH_PARTITION_ID"))) {
 106         return strtoul(id, NULL, 10);
 107     }
 108     if (NULL != (id = getenv("PBS_JOBID"))) {
 109         char *prepped_jid = prep_job_id(id);
 110         if (NULL == prepped_jid) {
 111             /* out of resources */
 112             return 0;
 113         }
 114 
 115         ret = opal_asprintf (&apstat_cmd, "%s -r", ras_alps_apstat_cmd);
 116         if (0 > ret) {
 117             return 0;
 118         }
 119 
 120         apstat_fp = popen(apstat_cmd, "r");
 121         free (apstat_cmd);
 122         if (NULL == apstat_fp) {
 123             /* popen failure */
 124             free(prepped_jid);
 125             return 0;
 126         }
 127         while (NULL != fgets(read_buf, 512, apstat_fp)) {
 128             /* does this line have the id that we care about? */
 129             if (NULL != strstr(read_buf, prepped_jid)) {
 130         /* the line is going to be in the form of something like:
 131         A 1450   571783 batch:138309     XT    80 - -   2000 conf,claim
 132          */
 133                 char *t = read_buf;
 134                 for (t = read_buf; !isdigit(*t) && *t; ++t) {
 135                     jid = strtoul(t, NULL, 10);
 136                 }
 137                 /* if we are here, then jid should be, given the example above,
 138                  * 1450 */
 139                 break;
 140             }
 141         }
 142         fclose(apstat_fp);
 143         free(prepped_jid);
 144     }
 145     return jid;
 146 }
 147 
 148 static int
 149 ras_alps_register(void)
 150 {
 151     param_priority = 75;
 152     (void) mca_base_component_var_register (&mca_ras_alps_component.base_version,
 153                                             "priority", "Priority of the alps ras component",
 154                                             MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 155                                             OPAL_INFO_LVL_9,
 156                                             MCA_BASE_VAR_SCOPE_READONLY,
 157                                             &param_priority);
 158 
 159     ras_alps_read_attempts = 10;
 160     (void) mca_base_component_var_register (&mca_ras_alps_component.base_version,
 161                                             "appinfo_read_attempts",
 162                                             "Maximum number of attempts to read ALPS "
 163                                             "appinfo file", MCA_BASE_VAR_TYPE_INT,
 164                                             NULL, 0, 0, OPAL_INFO_LVL_9,
 165                                             MCA_BASE_VAR_SCOPE_READONLY, &ras_alps_read_attempts);
 166 
 167     ras_alps_apstat_cmd = "apstat";         /* by default apstat is in a user's path on a Cray XE/XC if
 168                                                alps is the site's job launcher  */
 169     (void) mca_base_component_var_register (&mca_ras_alps_component.base_version,
 170                                             "apstat_cmd", "Location of the apstat command",
 171                                             MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_6,
 172                                             MCA_BASE_VAR_SCOPE_READONLY, &ras_alps_apstat_cmd);
 173 
 174     return ORTE_SUCCESS;
 175 }
 176 
 177 static int
 178 ras_alps_open(void)
 179 {
 180     return ORTE_SUCCESS;
 181 }
 182 
 183 static int
 184 orte_ras_alps_component_query(mca_base_module_t **module,
 185                               int *priority)
 186 {
 187     char *jid_str = NULL;
 188     /* default to an invalid value */
 189     orte_ras_alps_res_id = 0;
 190 
 191     /* if we are not an HNP, then we must not be selected */
 192     if (!ORTE_PROC_IS_HNP) {
 193         *module = NULL;
 194         return ORTE_ERROR;
 195     }
 196 
 197     /* Are we running under a ALPS job? */
 198     /* BASIL_RESERVATION_ID is the equivalent of OMPI_ALPS_RESID
 199      * on some systems
 200      */
 201     if ((NULL == (jid_str = getenv("OMPI_ALPS_RESID"))) &&
 202         (NULL == (jid_str = getenv("BASIL_RESERVATION_ID")))) {
 203             orte_ras_alps_res_id = get_res_id();
 204     }
 205     else {
 206         orte_ras_alps_res_id = strtoul(jid_str, NULL, 10);
 207     }
 208     if (0 != orte_ras_alps_res_id) {
 209         *priority = param_priority;
 210         opal_output_verbose(2, orte_ras_base_framework.framework_output,
 211                              "ras:alps: available for selection");
 212         *module = (mca_base_module_t *) &orte_ras_alps_module;
 213         return ORTE_SUCCESS;
 214     }
 215 
 216     /* Sadly, no */
 217 
 218     opal_output(orte_ras_base_framework.framework_output,
 219                 "ras:alps: NOT available for selection -- "
 220                 "OMPI_ALPS_RESID or BASIL_RESERVATION_ID not set?");
 221     *module = NULL;
 222     return ORTE_ERROR;
 223 }
 224 
 225 int
 226 orte_ras_alps_get_appinfo_attempts(int *attempts)
 227 {
 228     *attempts = ras_alps_read_attempts;
 229     opal_output_verbose(2, orte_ras_base_framework.framework_output,
 230                          "ras:alps:orte_ras_alps_get_appinfo_attempts: %d",
 231                          *attempts);
 232     return ORTE_SUCCESS;
 233 }

/* [<][>][^][v][top][bottom][index][help] */