This source file includes the following definitions.
- orte_rmaps_rf_map
- orte_rmaps_rank_file_parse
- orte_rmaps_rank_file_parse_string_or_int
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 
  29 #include "orte_config.h"
  30 #include "orte/constants.h"
  31 #include "orte/types.h"
  32 
  33 #include <errno.h>
  34 #ifdef HAVE_UNISTD_H
  35 #include <unistd.h>
  36 #endif  
  37 #include <string.h>
  38 
  39 #include "opal/util/argv.h"
  40 #include "opal/util/if.h"
  41 #include "opal/util/net.h"
  42 #include "opal/class/opal_pointer_array.h"
  43 #include "opal/mca/hwloc/base/base.h"
  44 
  45 #include "orte/mca/errmgr/errmgr.h"
  46 #include "orte/mca/ess/ess.h"
  47 #include "orte/util/show_help.h"
  48 #include "orte/mca/rmaps/base/rmaps_private.h"
  49 #include "orte/mca/rmaps/base/base.h"
  50 #include "orte/mca/rmaps/rank_file/rmaps_rank_file.h"
  51 #include "orte/mca/rmaps/rank_file/rmaps_rank_file_lex.h"
  52 #include "orte/runtime/orte_globals.h"
  53 
  54 static int orte_rmaps_rf_map(orte_job_t *jdata);
  55 
  56 orte_rmaps_base_module_t orte_rmaps_rank_file_module = {
  57     .map_job = orte_rmaps_rf_map
  58 };
  59 
  60 
  61 static int orte_rmaps_rank_file_parse(const char *);
  62 static char *orte_rmaps_rank_file_parse_string_or_int(void);
  63 static const char *orte_rmaps_rank_file_name_cur = NULL;
  64 char *orte_rmaps_rank_file_slot_list = NULL;
  65 
  66 
  67 
  68 
  69 static opal_pointer_array_t rankmap;
  70 static int num_ranks=0;
  71 
  72 
  73 
  74 
  75 static int orte_rmaps_rf_map(orte_job_t *jdata)
  76 {
  77     orte_job_map_t *map;
  78     orte_app_context_t *app=NULL;
  79     orte_std_cntr_t i, k;
  80     opal_list_t node_list;
  81     opal_list_item_t *item;
  82     orte_node_t *node, *nd, *root_node;
  83     orte_vpid_t rank, vpid_start;
  84     orte_std_cntr_t num_slots;
  85     orte_rmaps_rank_file_map_t *rfmap;
  86     orte_std_cntr_t relative_index, tmp_cnt;
  87     int rc;
  88     orte_proc_t *proc;
  89     mca_base_component_t *c = &mca_rmaps_rank_file_component.super.base_version;
  90     char *slots;
  91     bool initial_map=true;
  92     opal_hwloc_resource_type_t rtype;
  93 
  94     
  95     if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
  96         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
  97                             "mca:rmaps:rf: job %s being restarted - rank_file cannot map",
  98                             ORTE_JOBID_PRINT(jdata->jobid));
  99         return ORTE_ERR_TAKE_NEXT_OPTION;
 100     }
 101     if (NULL != jdata->map->req_mapper &&
 102         0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
 103         
 104         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 105                             "mca:rmaps:rf: job %s not using rank_file mapper",
 106                             ORTE_JOBID_PRINT(jdata->jobid));
 107         return ORTE_ERR_TAKE_NEXT_OPTION;
 108     }
 109     if (ORTE_MAPPING_BYUSER != ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping)) {
 110         
 111         return ORTE_ERR_TAKE_NEXT_OPTION;
 112     }
 113     if (OPAL_BIND_ORDERED_REQUESTED(jdata->map->binding)) {
 114         
 115         return ORTE_ERR_TAKE_NEXT_OPTION;
 116     }
 117     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 118                         "mca:rmaps:rank_file: mapping job %s",
 119                         ORTE_JOBID_PRINT(jdata->jobid));
 120 
 121     
 122     if (NULL != jdata->map->last_mapper) {
 123         free(jdata->map->last_mapper);
 124     }
 125     jdata->map->last_mapper = strdup(c->mca_component_name);
 126 
 127     
 128     map = jdata->map;
 129 
 130     
 131     if (mca_rmaps_rank_file_component.physical) {
 132         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 133                             "mca:rmaps:rank_file: using PHYSICAL processors");
 134         rtype = OPAL_HWLOC_PHYSICAL;
 135     } else {
 136         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 137                             "mca:rmaps:rank_file: using LOGICAL processors");
 138         rtype = OPAL_HWLOC_LOGICAL;
 139     }
 140 
 141     
 142     OBJ_CONSTRUCT(&node_list, opal_list_t);
 143 
 144     
 145     if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
 146         rc = ORTE_ERR_SILENT;
 147         goto error;
 148     }
 149 
 150     
 151 
 152     
 153 
 154 
 155 
 156     if (0 == app->num_procs && 1 < jdata->num_apps) {
 157         orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np",
 158                        true, jdata->num_apps, NULL);
 159         rc = ORTE_ERR_SILENT;
 160         goto error;
 161     }
 162 
 163     
 164 
 165     
 166     vpid_start = 0;
 167     jdata->num_procs = 0;
 168     OBJ_CONSTRUCT(&rankmap, opal_pointer_array_t);
 169 
 170     
 171     if ( NULL != orte_rankfile ) {
 172         if ( ORTE_SUCCESS != (rc = orte_rmaps_rank_file_parse(orte_rankfile))) {
 173             ORTE_ERROR_LOG(rc);
 174             goto error;
 175         }
 176     }
 177 
 178     
 179     for(i=0; i < jdata->apps->size; i++) {
 180         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
 181             continue;
 182         }
 183 
 184         
 185 
 186 
 187 
 188         if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
 189                                                                   map->mapping, initial_map, false))) {
 190             ORTE_ERROR_LOG(rc);
 191             goto error;
 192         }
 193         
 194         initial_map = false;
 195 
 196         
 197         if (0 == app->num_procs) {
 198             if (NULL != orte_rankfile) {
 199                 
 200 
 201 
 202                 app->num_procs = num_ranks;
 203             } else {
 204                 
 205                 app->num_procs = num_slots;
 206             }
 207         }
 208         for (k=0; k < app->num_procs; k++) {
 209             rank = vpid_start + k;
 210             
 211             if (NULL == (rfmap = (orte_rmaps_rank_file_map_t*)opal_pointer_array_get_item(&rankmap, rank))) {
 212                 
 213                 if (NULL != opal_hwloc_base_cpu_list) {
 214                     slots = opal_hwloc_base_cpu_list;
 215                     
 216                     node = NULL;
 217                     OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) {
 218                         
 219 
 220                         if (nd->slots <= (int)nd->num_procs) {
 221                             continue;
 222                         }
 223                         
 224                         node = nd;
 225                         break;
 226                     }
 227                     if (NULL == node) {
 228                         
 229                         k = UINT32_MAX;
 230                         OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) {
 231                             if (nd->num_procs < (orte_vpid_t)k) {
 232                                 k = nd->num_procs;
 233                                 node = nd;
 234                             }
 235                         }
 236                     }
 237                     
 238                     if (NULL == node) {
 239                         rc = ORTE_ERR_OUT_OF_RESOURCE;
 240                         goto error;
 241                     }
 242                 } else {
 243                     
 244                     orte_show_help("help-rmaps_rank_file.txt", "missing-rank", true, rank, orte_rankfile);
 245                     rc = ORTE_ERR_SILENT;
 246                     goto error;
 247                 }
 248             } else {
 249                 if (0 == strlen(rfmap->slot_list)) {
 250                     
 251                     orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name);
 252                     rc = ORTE_ERR_SILENT;
 253                     goto error;
 254                 }
 255                 slots = rfmap->slot_list;
 256                 
 257                 node = NULL;
 258                 OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) {
 259                     if (NULL != rfmap->node_name &&
 260                         0 == strcmp(nd->name, rfmap->node_name)) {
 261                         node = nd;
 262                         break;
 263                     } else if (NULL != rfmap->node_name &&
 264                                (('+' == rfmap->node_name[0]) &&
 265                                 (('n' == rfmap->node_name[1]) ||
 266                                  ('N' == rfmap->node_name[1])))) {
 267 
 268                         relative_index=atoi(strtok(rfmap->node_name,"+n"));
 269                         if ( relative_index >= (int)opal_list_get_size (&node_list) || ( 0 > relative_index)){
 270                             orte_show_help("help-rmaps_rank_file.txt","bad-index", true,rfmap->node_name);
 271                             ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 272                             return ORTE_ERR_BAD_PARAM;
 273                         }
 274                         root_node = (orte_node_t*) opal_list_get_first(&node_list);
 275                         for(tmp_cnt=0; tmp_cnt<relative_index; tmp_cnt++) {
 276                             root_node = (orte_node_t*) opal_list_get_next(root_node);
 277                         }
 278                         node = root_node;
 279                         break;
 280                     }
 281                 }
 282             }
 283             if (NULL == node) {
 284                 orte_show_help("help-rmaps_rank_file.txt","bad-host", true, rfmap->node_name);
 285                 rc = ORTE_ERR_SILENT;
 286                 goto error;
 287             }
 288             
 289             if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
 290                 OBJ_RETAIN(node);
 291                 opal_pointer_array_add(map->nodes, node);
 292                 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
 293                 ++(jdata->map->num_nodes);
 294             }
 295             if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, i))) {
 296                 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
 297                 rc = ORTE_ERR_OUT_OF_RESOURCE;
 298                 goto error;
 299             }
 300             if ((node->slots < (int)node->num_procs) ||
 301                 (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
 302                 if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
 303                     orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 304                                    true, node->num_procs, app->app);
 305                     ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 306                     rc = ORTE_ERR_SILENT;
 307                     goto error;
 308                 }
 309                 
 310 
 311 
 312                 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
 313                 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
 314             }
 315             
 316             proc->name.vpid = rank;
 317 
 318             if (NULL != slots) {
 319                 
 320                 hwloc_cpuset_t bitmap;
 321                 char *cpu_bitmap;
 322                 if (NULL == node->topology || NULL == node->topology->topo) {
 323                     
 324 
 325 
 326                     orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology", true, node->name);
 327                     rc = ORTE_ERR_SILENT;
 328                     goto error;
 329                 }
 330                 bitmap = hwloc_bitmap_alloc();
 331                 
 332                 if (ORTE_SUCCESS != (rc = opal_hwloc_base_cpu_list_parse(slots, node->topology->topo, rtype, bitmap))) {
 333                     ORTE_ERROR_LOG(rc);
 334                     hwloc_bitmap_free(bitmap);
 335                     goto error;
 336                 }
 337                 
 338 
 339 
 340 
 341                 
 342                 hwloc_bitmap_list_asprintf(&cpu_bitmap, bitmap);
 343                 orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
 344                 
 345                 free(cpu_bitmap);
 346                 hwloc_bitmap_free(bitmap);
 347             }
 348 
 349             
 350             if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
 351                                                                   proc->name.vpid, proc))) {
 352                 ORTE_ERROR_LOG(rc);
 353                 return rc;
 354             }
 355             jdata->num_procs++;
 356         }
 357         
 358         vpid_start += app->num_procs;
 359         
 360 
 361 
 362         while (NULL != (item = opal_list_remove_first(&node_list))) {
 363             OBJ_RELEASE(item);
 364         }
 365         OBJ_DESTRUCT(&node_list);
 366         OBJ_CONSTRUCT(&node_list, opal_list_t);
 367     }
 368     OBJ_DESTRUCT(&node_list);
 369 
 370     
 371     for (i=0; i < rankmap.size; i++) {
 372         if (NULL != (rfmap = opal_pointer_array_get_item(&rankmap, i))) {
 373             OBJ_RELEASE(rfmap);
 374         }
 375     }
 376     OBJ_DESTRUCT(&rankmap);
 377     
 378     orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
 379 
 380     return rc;
 381 
 382  error:
 383     OPAL_LIST_DESTRUCT(&node_list);
 384 
 385     return rc;
 386 }
 387 
 388 static int orte_rmaps_rank_file_parse(const char *rankfile)
 389 {
 390     int token;
 391     int rc = ORTE_SUCCESS;
 392     int cnt;
 393     char* node_name = NULL;
 394     char** argv;
 395     char buff[64];
 396     char* value;
 397     int rank=-1;
 398     int i;
 399     orte_node_t *hnp_node;
 400     orte_rmaps_rank_file_map_t *rfmap=NULL;
 401     opal_pointer_array_t *assigned_ranks_array;
 402     char tmp_rank_assignment[64];
 403 
 404     
 405     assigned_ranks_array = OBJ_NEW(opal_pointer_array_t);
 406 
 407     
 408     hnp_node = (orte_node_t*)(orte_node_pool->addr[0]);
 409 
 410     orte_rmaps_rank_file_name_cur = rankfile;
 411     orte_rmaps_rank_file_done = false;
 412     orte_rmaps_rank_file_in = fopen(rankfile, "r");
 413 
 414     if (NULL == orte_rmaps_rank_file_in) {
 415         orte_show_help("help-rmaps_rank_file.txt", "no-rankfile", true, rankfile);
 416         rc = OPAL_ERR_NOT_FOUND;
 417         ORTE_ERROR_LOG(rc);
 418         goto unlock;
 419     }
 420 
 421     while (!orte_rmaps_rank_file_done) {
 422         token = orte_rmaps_rank_file_lex();
 423 
 424         switch (token) {
 425             case ORTE_RANKFILE_ERROR:
 426                 orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
 427                 rc = ORTE_ERR_BAD_PARAM;
 428                 ORTE_ERROR_LOG(rc);
 429                 goto unlock;
 430                 break;
 431             case ORTE_RANKFILE_QUOTED_STRING:
 432                 orte_show_help("help-rmaps_rank_file.txt", "not-supported-rankfile", true, "QUOTED_STRING", rankfile);
 433                 rc = ORTE_ERR_BAD_PARAM;
 434                 ORTE_ERROR_LOG(rc);
 435                 goto unlock;
 436             case ORTE_RANKFILE_NEWLINE:
 437                 rank = -1;
 438                 if (NULL != node_name) {
 439                     free(node_name);
 440                 }
 441                 node_name = NULL;
 442                 rfmap = NULL;
 443                 break;
 444             case ORTE_RANKFILE_RANK:
 445                 token = orte_rmaps_rank_file_lex();
 446                 if (ORTE_RANKFILE_INT == token) {
 447                     rank = orte_rmaps_rank_file_value.ival;
 448                     rfmap = OBJ_NEW(orte_rmaps_rank_file_map_t);
 449                     opal_pointer_array_set_item(&rankmap, rank, rfmap);
 450                     num_ranks++;  
 451                 } else {
 452                     orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
 453                     rc = ORTE_ERR_BAD_PARAM;
 454                     ORTE_ERROR_LOG(rc);
 455                     goto unlock;
 456                 }
 457                 break;
 458             case ORTE_RANKFILE_USERNAME:
 459                 orte_show_help("help-rmaps_rank_file.txt", "not-supported-rankfile", true, "USERNAME", rankfile);
 460                 rc = ORTE_ERR_BAD_PARAM;
 461                 ORTE_ERROR_LOG(rc);
 462                 goto unlock;
 463                 break;
 464             case ORTE_RANKFILE_EQUAL:
 465                 if (rank < 0) {
 466                     orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
 467                     rc = ORTE_ERR_BAD_PARAM;
 468                     ORTE_ERROR_LOG(rc);
 469                     goto unlock;
 470                 }
 471                 token = orte_rmaps_rank_file_lex();
 472                 switch (token) {
 473                     case ORTE_RANKFILE_HOSTNAME:
 474                     case ORTE_RANKFILE_IPV4:
 475                     case ORTE_RANKFILE_IPV6:
 476                     case ORTE_RANKFILE_STRING:
 477                     case ORTE_RANKFILE_INT:
 478                     case ORTE_RANKFILE_RELATIVE:
 479                         if(ORTE_RANKFILE_INT == token) {
 480                             sprintf(buff,"%d", orte_rmaps_rank_file_value.ival);
 481                             value = buff;
 482                         } else {
 483                             value = orte_rmaps_rank_file_value.sval;
 484                         }
 485                         argv = opal_argv_split (value, '@');
 486                         cnt = opal_argv_count (argv);
 487                         if (NULL != node_name) {
 488                             free(node_name);
 489                         }
 490                         if (1 == cnt) {
 491                             node_name = strdup(argv[0]);
 492                         } else if (2 == cnt) {
 493                             node_name = strdup(argv[1]);
 494                         } else {
 495                             orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
 496                             rc = ORTE_ERR_BAD_PARAM;
 497                             ORTE_ERROR_LOG(rc);
 498                             opal_argv_free(argv);
 499                             node_name = NULL;
 500                             goto unlock;
 501                         }
 502                         opal_argv_free (argv);
 503 
 504                         
 505                         if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(node_name) ) {
 506                             char *ptr;
 507                             if (NULL != (ptr = strchr(node_name, '.'))) {
 508                                 *ptr = '\0';
 509                             }
 510                         }
 511 
 512                         
 513                         if (NULL == rfmap) {
 514                             orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
 515                             rc = ORTE_ERR_BAD_PARAM;
 516                             ORTE_ERROR_LOG(rc);
 517                             goto unlock;
 518                         }
 519                         
 520                         if (orte_ifislocal(node_name)) {
 521                             rfmap->node_name = strdup(hnp_node->name);
 522                         } else {
 523                             rfmap->node_name = strdup(node_name);
 524                         }
 525                 }
 526                 break;
 527             case ORTE_RANKFILE_SLOT:
 528                 if (NULL == node_name || rank < 0 ||
 529                     NULL == (value = orte_rmaps_rank_file_parse_string_or_int())) {
 530                     orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
 531                     rc = ORTE_ERR_BAD_PARAM;
 532                     ORTE_ERROR_LOG(rc);
 533                     goto unlock;
 534                 }
 535 
 536                 
 537                 if (NULL != opal_pointer_array_get_item(assigned_ranks_array, rank)) {
 538                     orte_show_help("help-rmaps_rank_file.txt", "bad-assign", true, rank,
 539                                    opal_pointer_array_get_item(assigned_ranks_array, rank), rankfile);
 540                     rc = ORTE_ERR_BAD_PARAM;
 541                     free(value);
 542                     goto unlock;
 543                 } else {
 544                     
 545                     sprintf(tmp_rank_assignment, "%s slot=%s", node_name, value);
 546                     opal_pointer_array_set_item(assigned_ranks_array, 0, tmp_rank_assignment);
 547                 }
 548 
 549                 
 550                 if (NULL == rfmap) {
 551                     orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
 552                     rc = ORTE_ERR_BAD_PARAM;
 553                     ORTE_ERROR_LOG(rc);
 554                     free(value);
 555                     goto unlock;
 556                 }
 557                 for (i=0; i < 64 && '\0' != value[i]; i++) {
 558                     rfmap->slot_list[i] = value[i];
 559                 }
 560                 free(value);
 561                 break;
 562         }
 563     }
 564     fclose(orte_rmaps_rank_file_in);
 565     orte_rmaps_rank_file_lex_destroy ();
 566 
 567 unlock:
 568     if (NULL != node_name) {
 569         free(node_name);
 570     }
 571     OBJ_RELEASE(assigned_ranks_array);
 572     orte_rmaps_rank_file_name_cur = NULL;
 573     return rc;
 574 }
 575 
 576 
 577 static char *orte_rmaps_rank_file_parse_string_or_int(void)
 578 {
 579     int rc;
 580     char tmp_str[64];
 581 
 582     if (ORTE_RANKFILE_EQUAL != orte_rmaps_rank_file_lex()){
 583         return NULL;
 584     }
 585 
 586     rc = orte_rmaps_rank_file_lex();
 587     switch (rc) {
 588         case ORTE_RANKFILE_STRING:
 589             return strdup(orte_rmaps_rank_file_value.sval);
 590         case ORTE_RANKFILE_INT:
 591             sprintf(tmp_str,"%d",orte_rmaps_rank_file_value.ival);
 592             return strdup(tmp_str);
 593         default:
 594             return NULL;
 595 
 596     }
 597 
 598 }