root/orte/mca/ras/tm/ras_tm_module.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. allocate
  2. finalize
  3. discover
  4. tm_getline

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2005 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
  13  * Copyright (c) 2014      Intel, Inc.  All rights reserved.
  14  * Copyright (c) 2016      IBM Corporation.  All rights reserved.
  15  * $COPYRIGHT$
  16  *
  17  * Additional copyrights may follow
  18  *
  19  * $HEADER$
  20  */
  21 #include "orte_config.h"
  22 #include "orte/constants.h"
  23 #include "orte/types.h"
  24 
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
  28 
  29 #include "orte/util/show_help.h"
  30 #include "opal/util/os_path.h"
  31 #include "opal/util/net.h"
  32 
  33 #include "orte/mca/errmgr/errmgr.h"
  34 #include "orte/runtime/orte_globals.h"
  35 #include "orte/util/name_fns.h"
  36 
  37 #include "orte/mca/ras/base/ras_private.h"
  38 #include "ras_tm.h"
  39 
  40 
  41 /*
  42  * Local functions
  43  */
  44 static int allocate(orte_job_t *jdata, opal_list_t *nodes);
  45 static int finalize(void);
  46 
  47 static int discover(opal_list_t* nodelist, char *pbs_jobid);
  48 static char *tm_getline(FILE *fp);
  49 
  50 #define TM_FILE_MAX_LINE_LENGTH 512
  51 
  52 static char *filename;
  53 
  54 /*
  55  * Global variable
  56  */
  57 orte_ras_base_module_t orte_ras_tm_module = {
  58     NULL,
  59     allocate,
  60     NULL,
  61     finalize
  62 };
  63 
  64 
  65 /**
  66  * Discover available (pre-allocated) nodes and report
  67  * them back to the caller.
  68  *
  69  */
  70 static int allocate(orte_job_t *jdata, opal_list_t *nodes)
  71 {
  72     int ret;
  73     char *pbs_jobid;
  74 
  75     /* get our PBS jobid from the environment */
  76     if (NULL == (pbs_jobid = getenv("PBS_JOBID"))) {
  77         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
  78         return ORTE_ERR_NOT_FOUND;
  79     }
  80 
  81     /* save that value in the global job ident string for
  82      * later use in any error reporting
  83      */
  84     orte_job_ident = strdup(pbs_jobid);
  85 
  86     if (ORTE_SUCCESS != (ret = discover(nodes, pbs_jobid))) {
  87         ORTE_ERROR_LOG(ret);
  88         return ret;
  89     }
  90 
  91     /* in the TM world, if we didn't find anything, then this
  92      * is an unrecoverable error - report it
  93      */
  94     if (opal_list_is_empty(nodes)) {
  95         orte_show_help("help-ras-tm.txt", "no-nodes-found", true, filename);
  96         return ORTE_ERR_NOT_FOUND;
  97     }
  98 
  99     /* All done */
 100     return ORTE_SUCCESS;
 101 }
 102 
 103 /*
 104  * There's really nothing to do here
 105  */
 106 static int finalize(void)
 107 {
 108     OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
 109                          "%s ras:tm:finalize: success (nothing to do)",
 110                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 111     return ORTE_SUCCESS;
 112 }
 113 
 114 
 115 /**
 116  * Discover the available resources.  Obtain directly from TM (and
 117  * therefore have no need to validate) -- ignore hostfile or any other
 118  * user-specified parameters.
 119  *
 120  *  - validate any nodes specified via hostfile/commandline
 121  *  - check for additional nodes that have already been allocated
 122  */
 123 
 124 static int discover(opal_list_t* nodelist, char *pbs_jobid)
 125 {
 126     int32_t nodeid;
 127     orte_node_t *node;
 128     opal_list_item_t* item;
 129     FILE *fp;
 130     char *hostname, *cppn;
 131     int ppn;
 132     char *ptr;
 133 
 134     /* Ignore anything that the user already specified -- we're
 135        getting nodes only from TM. */
 136 
 137     /* TM "nodes" may actually correspond to PBS "VCPUs", which means
 138        there may be multiple "TM nodes" that correspond to the same
 139        physical node.  This doesn't really affect what we're doing
 140        here (we actually ignore the fact that they're duplicates --
 141        slightly inefficient, but no big deal); just mentioned for
 142        completeness... */
 143 
 144     /* if we are in SMP mode, then read the environment to get the
 145      * number of cpus for each node read in the file
 146      */
 147     if (mca_ras_tm_component.smp_mode) {
 148         if (NULL == (cppn = getenv("PBS_PPN"))) {
 149             orte_show_help("help-ras-tm.txt", "smp-error", true);
 150             return ORTE_ERR_NOT_FOUND;
 151         }
 152         ppn = strtol(cppn, NULL, 10);
 153     } else {
 154         ppn = 1;
 155     }
 156 
 157     /* setup the full path to the PBS file */
 158     filename = opal_os_path(false, mca_ras_tm_component.nodefile_dir,
 159                             pbs_jobid, NULL);
 160     fp = fopen(filename, "r");
 161     if (NULL == fp) {
 162         ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
 163         free(filename);
 164         return ORTE_ERR_FILE_OPEN_FAILURE;
 165     }
 166 
 167     /* Iterate through all the nodes and make an entry for each.  TM
 168        node ID's will never be duplicated, but they may end up
 169        resolving to the same hostname (i.e., vcpu's on a single
 170        host). */
 171 
 172     nodeid=0;
 173     while (NULL != (hostname = tm_getline(fp))) {
 174         if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hostname) ) {
 175             if (NULL != (ptr = strchr(hostname, '.'))) {
 176                 *ptr = '\0';
 177             }
 178         }
 179 
 180         OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
 181                              "%s ras:tm:allocate:discover: got hostname %s",
 182                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));
 183 
 184         /* Remember that TM may list the same node more than once.  So
 185            we have to check for duplicates. */
 186 
 187         for (item = opal_list_get_first(nodelist);
 188              opal_list_get_end(nodelist) != item;
 189              item = opal_list_get_next(item)) {
 190             node = (orte_node_t*) item;
 191             if (0 == strcmp(node->name, hostname)) {
 192                 if (mca_ras_tm_component.smp_mode) {
 193                     /* this cannot happen in smp mode */
 194                     orte_show_help("help-ras-tm.txt", "smp-multi", true);
 195                     return ORTE_ERR_BAD_PARAM;
 196                 }
 197                 ++node->slots;
 198 
 199                 OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
 200                                      "%s ras:tm:allocate:discover: found -- bumped slots to %d",
 201                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->slots));
 202 
 203                 break;
 204             }
 205         }
 206 
 207         /* Did we find it? */
 208 
 209         if (opal_list_get_end(nodelist) == item) {
 210 
 211             /* Nope -- didn't find it, so add a new item to the list */
 212 
 213             OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
 214                                  "%s ras:tm:allocate:discover: not found -- added to list",
 215                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 216 
 217             node = OBJ_NEW(orte_node_t);
 218             node->name = hostname;
 219             orte_set_attribute(&node->attributes, ORTE_NODE_LAUNCH_ID, ORTE_ATTR_LOCAL, &nodeid, OPAL_INT32);
 220             node->slots_inuse = 0;
 221             node->slots_max = 0;
 222             node->slots = ppn;
 223             node->state = ORTE_NODE_STATE_UP;
 224             opal_list_append(nodelist, &node->super);
 225         } else {
 226 
 227             /* Yes, so we need to free the hostname that came back */
 228             free(hostname);
 229         }
 230 
 231         /* up the nodeid */
 232         nodeid++;
 233     }
 234     fclose(fp);
 235 
 236     return ORTE_SUCCESS;
 237 }
 238 
 239 static char *tm_getline(FILE *fp)
 240 {
 241     char *ret, *buff;
 242     char input[TM_FILE_MAX_LINE_LENGTH];
 243 
 244     ret = fgets(input, TM_FILE_MAX_LINE_LENGTH, fp);
 245     if (NULL != ret) {
 246         input[strlen(input)-1] = '\0';  /* remove newline */
 247         buff = strdup(input);
 248         return buff;
 249     }
 250 
 251     return NULL;
 252 }
 253 

/* [<][>][^][v][top][bottom][index][help] */