root/orte/mca/ras/lsf/ras_lsf_module.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. allocate
  2. finalize

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2005 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2007-2017 Cisco Systems, Inc.  All rights reserved
  13  * Copyright (c) 2014      Intel, Inc. All rights reserved
  14  * Copyright (c) 2016      IBM Corporation.  All rights reserved.
  15  * $COPYRIGHT$
  16  *
  17  * Additional copyrights may follow
  18  *
  19  * $HEADER$
  20  */
  21 #include "orte_config.h"
  22 #include "orte/constants.h"
  23 
  24 #include <errno.h>
  25 #include <unistd.h>
  26 #include <string.h>
  27 #include <sys/types.h>
  28 #include <sys/stat.h>
  29 
  30 #define SR1_PJOBS
  31 #include <lsf/lsbatch.h>
  32 
  33 #include "opal/util/argv.h"
  34 #include "opal/util/net.h"
  35 #include "opal/mca/hwloc/hwloc-internal.h"
  36 
  37 #include "orte/mca/rmaps/rmaps_types.h"
  38 #include "orte/mca/errmgr/errmgr.h"
  39 #include "orte/runtime/orte_globals.h"
  40 #include "orte/util/show_help.h"
  41 
  42 #include "orte/mca/ras/base/ras_private.h"
  43 #include "orte/mca/ras/base/base.h"
  44 #include "ras_lsf.h"
  45 
  46 
/*
 * Local functions
 *
 * Both are file-private and are exposed to callers only through the
 * function pointers stored in orte_ras_lsf_module.
 */
static int allocate(orte_job_t *jdata, opal_list_t *nodes);
static int finalize(void);
  52 
  53 
/*
 * Global variable
 *
 * Function table for the LSF resource-allocation module.  The
 * initializer is positional, so the order of the entries must match
 * the field order of orte_ras_base_module_t.
 * NOTE(review): the two NULL slots are presumably optional hooks this
 * module does not implement - confirm against the definition of
 * orte_ras_base_module_t.
 */
orte_ras_base_module_t orte_ras_lsf_module = {
    NULL,
    allocate,
    NULL,
    finalize
};
  63 
  64 
  65 static int allocate(orte_job_t *jdata, opal_list_t *nodes)
  66 {
  67     char **nodelist;
  68     orte_node_t *node;
  69     int i, num_nodes;
  70     char *affinity_file;
  71     struct stat buf;
  72     char *ptr;
  73 
  74     /* get the list of allocated nodes */
  75     if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
  76         orte_show_help("help-ras-lsf.txt", "nodelist-failed", true);
  77         return ORTE_ERR_NOT_AVAILABLE;
  78     }
  79 
  80     node = NULL;
  81 
  82     /* step through the list */
  83     for (i = 0; i < num_nodes; i++) {
  84         if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(nodelist[i]) ) {
  85             if (NULL != (ptr = strchr(nodelist[i], '.'))) {
  86                 *ptr = '\0';
  87             }
  88         }
  89 
  90         /* is this a repeat of the current node? */
  91         if (NULL != node && 0 == strcmp(nodelist[i], node->name)) {
  92             /* it is a repeat - just bump the slot count */
  93             ++node->slots;
  94             opal_output_verbose(10, orte_ras_base_framework.framework_output,
  95                                 "ras/lsf: +++ Node (%s) [slots=%d]", node->name, node->slots);
  96             continue;
  97         }
  98 
  99         /* not a repeat - create a node entry for it */
 100         node = OBJ_NEW(orte_node_t);
 101         node->name = strdup(nodelist[i]);
 102         node->slots_inuse = 0;
 103         node->slots_max = 0;
 104         node->slots = 1;
 105         node->state = ORTE_NODE_STATE_UP;
 106         opal_list_append(nodes, &node->super);
 107 
 108         opal_output_verbose(10, orte_ras_base_framework.framework_output,
 109                             "ras/lsf: New Node (%s) [slots=%d]", node->name, node->slots);
 110     }
 111 
 112     /* release the nodelist from lsf */
 113     opal_argv_free(nodelist);
 114 
 115     /* check for an affinity file */
 116     if (NULL != (affinity_file = getenv("LSB_AFFINITY_HOSTFILE"))) {
 117         /* check to see if the file is empty - if it is,
 118          * then affinity wasn't actually set for this job */
 119         if (0 != stat(affinity_file, &buf)) {
 120             orte_show_help("help-ras-lsf.txt", "affinity-file-not-found", true, affinity_file);
 121             return ORTE_ERR_SILENT;
 122         }
 123         if (0 == buf.st_size) {
 124             /* no affinity, so just return */
 125             return ORTE_SUCCESS;
 126         }
 127         /* the affinity file sequentially lists rank locations, with
 128          * cpusets given as physical cpu-ids. Setup the job object
 129          * so it knows to process this accordingly */
 130         if (NULL == jdata->map) {
 131             jdata->map = OBJ_NEW(orte_job_map_t);
 132         }
 133         ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_SEQ);
 134         jdata->map->req_mapper = strdup("seq"); // need sequential mapper
 135         /* tell the sequential mapper that all cpusets are to be treated as "physical" */
 136         orte_set_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, true, NULL, OPAL_BOOL);
 137         /* LSF provides its info as hwthreads, so set the hwthread-as-cpus flag */
 138         opal_hwloc_use_hwthreads_as_cpus = true;
 139         /* don't override something provided by the user, but default to bind-to hwthread */
 140         if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
 141             OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
 142         }
 143         /*
 144          * Do not set the hostfile attribute on each app_context since that
 145          * would confuse the sequential mapper when it tries to assign bindings
 146          * when running an MPMD job.
 147          * Instead just overwrite the orte_default_hostfile so it will be
 148          * general for all of the app_contexts.
 149          */
 150         if( NULL != orte_default_hostfile ) {
 151             free(orte_default_hostfile);
 152             orte_default_hostfile = NULL;
 153         }
 154         orte_default_hostfile = strdup(affinity_file);
 155         opal_output_verbose(10, orte_ras_base_framework.framework_output,
 156                             "ras/lsf: Set default_hostfile to %s",orte_default_hostfile);
 157 
 158         return ORTE_SUCCESS;
 159     }
 160 
 161     return ORTE_SUCCESS;
 162 }
 163 
 164 static int finalize(void)
 165 {
 166     return ORTE_SUCCESS;
 167 }

/* [<][>][^][v][top][bottom][index][help] */