root/orte/mca/ras/base/ras_base_node.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. orte_ras_base_node_insert

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2008 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2011-2017 Los Alamos National Security, LLC.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2014-2018 Intel, Inc.  All rights reserved.
  15  * Copyright (c) 2015-2018 Research Organization for Information Science
  16  *                         and Technology (RIST). All rights reserved.
  17  * $COPYRIGHT$
  18  *
  19  * Additional copyrights may follow
  20  *
  21  * $HEADER$
  22  */
  23 
  24 #include "orte_config.h"
  25 #include "orte/constants.h"
  26 
  27 #include <string.h>
  28 
  29 #include "opal/util/argv.h"
  30 #include "opal/util/if.h"
  31 
  32 #include "orte/mca/errmgr/errmgr.h"
  33 #include "orte/mca/rmaps/base/base.h"
  34 #include "orte/util/name_fns.h"
  35 #include "orte/runtime/orte_globals.h"
  36 
  37 #include "orte/mca/ras/base/ras_private.h"
  38 
  39 /*
  40  * Add the specified node definitions to the global data store
  41  * NOTE: this removes all items from the list!
  42  */
  43 int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
  44 {
  45     opal_list_item_t* item;
  46     orte_std_cntr_t num_nodes;
  47     int rc, i;
  48     orte_node_t *node, *hnp_node, *nptr;
  49     char *ptr;
  50     bool hnp_alone = true, skiphnp = false;
  51     orte_attribute_t *kv;
  52     char **alias=NULL, **nalias;
  53     orte_proc_t *daemon;
  54     orte_job_t *djob;
  55 
  56     /* get the number of nodes */
  57     num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes);
  58     if (0 == num_nodes) {
  59         return ORTE_SUCCESS;  /* nothing to do */
  60     }
  61 
  62     OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
  63                          "%s ras:base:node_insert inserting %ld nodes",
  64                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  65                          (long)num_nodes));
  66 
  67     /* mark the job as being a large-cluster sim if that was requested */
  68     if (1 < orte_ras_base.multiplier) {
  69         orte_set_attribute(&jdata->attributes, ORTE_JOB_MULTI_DAEMON_SIM,
  70                            ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
  71     }
  72 
  73     /* set the size of the global array - this helps minimize time
  74      * spent doing realloc's
  75      */
  76     if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(orte_node_pool, num_nodes * orte_ras_base.multiplier))) {
  77         ORTE_ERROR_LOG(rc);
  78         return rc;
  79     }
  80 
  81     /* if we are not launching, get the daemon job */
  82     djob = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
  83 
  84     /* get the hnp node's info */
  85     hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
  86 
  87     if ((orte_ras_base.launch_orted_on_hn == true) &&
  88         (orte_managed_allocation)) {
  89         if (NULL != hnp_node) {
  90             OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
  91                 if (orte_ifislocal(node->name)) {
  92                     orte_hnp_is_allocated = true;
  93                     break;
  94                 }
  95             }
  96             if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) &
  97                 ORTE_MAPPING_NO_USE_LOCAL)) {
  98                 hnp_node->name = strdup("mpirun");
  99                 skiphnp = true;
 100                 ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
 101             }
 102         }
 103     }
 104 
 105     /* cycle through the list */
 106     while (NULL != (item = opal_list_remove_first(nodes))) {
 107         node = (orte_node_t*)item;
 108 
 109         /* the HNP had to already enter its node on the array - that entry is in the
 110          * first position since it is the first one entered. We need to check to see
 111          * if this node is the same as the HNP's node so we don't double-enter it
 112          */
 113         if (!skiphnp && NULL != hnp_node && orte_ifislocal(node->name)) {
 114             OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 115                                  "%s ras:base:node_insert updating HNP [%s] info to %ld slots",
 116                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 117                                  node->name,
 118                                  (long)node->slots));
 119 
 120             /* flag that hnp has been allocated */
 121             orte_hnp_is_allocated = true;
 122             /* update the total slots in the job */
 123             orte_ras_base.total_slots_alloc += node->slots;
 124             /* copy the allocation data to that node's info */
 125             hnp_node->slots = node->slots;
 126             hnp_node->slots_max = node->slots_max;
 127             /* copy across any attributes */
 128             OPAL_LIST_FOREACH(kv, &node->attributes, orte_attribute_t) {
 129                 orte_set_attribute(&node->attributes, kv->key, ORTE_ATTR_LOCAL, &kv->data, kv->type);
 130             }
 131             if (orte_managed_allocation || ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
 132                 /* the slots are always treated as sacred
 133                  * in managed allocations
 134                  */
 135                 ORTE_FLAG_SET(hnp_node, ORTE_NODE_FLAG_SLOTS_GIVEN);
 136             } else {
 137                 ORTE_FLAG_UNSET(hnp_node, ORTE_NODE_FLAG_SLOTS_GIVEN);
 138             }
 139             /* use the local name for our node - don't trust what
 140              * we got from an RM. If requested, store the resolved
 141              * nodename info
 142              */
 143             if (orte_show_resolved_nodenames) {
 144                 /* if the node name is different, store it as an alias */
 145                 if (0 != strcmp(node->name, hnp_node->name)) {
 146                     /* get any current list of aliases */
 147                     ptr = NULL;
 148                     orte_get_attribute(&hnp_node->attributes, ORTE_NODE_ALIAS, (void**)&ptr, OPAL_STRING);
 149                     if (NULL != ptr) {
 150                         alias = opal_argv_split(ptr, ',');
 151                         free(ptr);
 152                     }
 153                     /* add to list of aliases for this node - only add if unique */
 154                     opal_argv_append_unique_nosize(&alias, node->name, false);
 155                 }
 156                 if (orte_get_attribute(&node->attributes, ORTE_NODE_ALIAS, (void**)&ptr, OPAL_STRING)) {
 157                     nalias = opal_argv_split(ptr, ',');
 158                     /* now copy over any aliases that are unique */
 159                     for (i=0; NULL != nalias[i]; i++) {
 160                         opal_argv_append_unique_nosize(&alias, nalias[i], false);
 161                     }
 162                     opal_argv_free(nalias);
 163                 }
 164                 /* and store the result */
 165                 if (0 < opal_argv_count(alias)) {
 166                     ptr = opal_argv_join(alias, ',');
 167                     orte_set_attribute(&hnp_node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, ptr, OPAL_STRING);
 168                     free(ptr);
 169                 }
 170                 opal_argv_free(alias);
 171             }
 172             /* don't keep duplicate copy */
 173             OBJ_RELEASE(node);
 174             /* create copies, if required */
 175             for (i=1; i < orte_ras_base.multiplier; i++) {
 176                 opal_dss.copy((void**)&node, hnp_node, ORTE_NODE);
 177                 ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
 178                 node->index = opal_pointer_array_add(orte_node_pool, node);
 179             }
 180         } else {
 181             /* insert the object onto the orte_nodes global array */
 182             OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 183                                  "%s ras:base:node_insert node %s slots %d",
 184                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 185                                  (NULL == node->name) ? "NULL" : node->name,
 186                                  node->slots));
 187             if (orte_managed_allocation) {
 188                 /* the slots are always treated as sacred
 189                  * in managed allocations
 190                  */
 191                 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
 192             }
 193             /* insert it into the array */
 194             node->index = opal_pointer_array_add(orte_node_pool, (void*)node);
 195             if (ORTE_SUCCESS > (rc = node->index)) {
 196                 ORTE_ERROR_LOG(rc);
 197                 return rc;
 198             }
 199             if (orte_do_not_launch) {
 200                 /* create a daemon for this node since we won't be launching
 201                  * and the mapper needs to see a daemon - this is used solely
 202                  * for testing the mappers */
 203                 daemon = OBJ_NEW(orte_proc_t);
 204                 daemon->name.jobid = ORTE_PROC_MY_NAME->jobid;
 205                 daemon->name.vpid = node->index;
 206                 daemon->state = ORTE_PROC_STATE_RUNNING;
 207                 OBJ_RETAIN(node);
 208                 daemon->node = node;
 209                 opal_pointer_array_set_item(djob->procs, daemon->name.vpid, daemon);
 210                 djob->num_procs++;
 211                 OBJ_RETAIN(daemon);
 212                 node->daemon = daemon;
 213             }
 214             /* update the total slots in the job */
 215             orte_ras_base.total_slots_alloc += node->slots;
 216             /* check if we have fqdn names in the allocation */
 217             if (NULL != strchr(node->name, '.')) {
 218                 orte_have_fqdn_allocation = true;
 219             }
 220             /* indicate the HNP is not alone */
 221             hnp_alone = false;
 222             for (i=1; i < orte_ras_base.multiplier; i++) {
 223                 opal_dss.copy((void**)&nptr, node, ORTE_NODE);
 224                 nptr->index = opal_pointer_array_add(orte_node_pool, nptr);
 225             }
 226        }
 227     }
 228 
 229     /* if we didn't find any fqdn names in the allocation, then
 230      * ensure we don't have any domain info in the node record
 231      * for the hnp
 232      */
 233     if (NULL != hnp_node && !orte_have_fqdn_allocation && !hnp_alone) {
 234         if (NULL != (ptr = strchr(hnp_node->name, '.'))) {
 235             *ptr = '\0';
 236         }
 237     }
 238 
 239     return ORTE_SUCCESS;
 240 }

/* [<][>][^][v][top][bottom][index][help] */