root/orte/mca/ras/slurm/ras_slurm_module.c


DEFINITIONS

This source file includes the following definitions.
  1. jtrk_cons
  2. jtrk_des
  3. init
  4. orte_ras_slurm_allocate
  5. deallocate
  6. orte_ras_slurm_finalize
  7. orte_ras_slurm_discover
  8. orte_ras_slurm_parse_ranges
  9. orte_ras_slurm_parse_range
  10. timeout
  11. recv_data
  12. dyn_allocate
  13. parse_alloc_msg
  14. get_node_list
  15. read_ip_port

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011-2017 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2013-2018 Intel, Inc.  All rights reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2016      IBM Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"

#include <netdb.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/socket.h>
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>

#include "opal/util/argv.h"
#include "opal/util/net.h"
#include "opal/util/output.h"
#include "opal/opal_socket_errno.h"

#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/state/state.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"

#include "orte/mca/ras/base/ras_private.h"
#include "ras_slurm.h"

#define ORTE_SLURM_DYN_MAX_SIZE 256

/*
 * API functions
 */
static int init(void);
static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes);
static void deallocate(orte_job_t *jdata,
                       orte_app_context_t *app);
static int orte_ras_slurm_finalize(void);

/*
 * RAS slurm module
 */
orte_ras_base_module_t orte_ras_slurm_module = {
    init,
    orte_ras_slurm_allocate,
    deallocate,
    orte_ras_slurm_finalize
};

/* Local functions */
static int orte_ras_slurm_discover(char *regexp, char* tasks_per_node,
                                   opal_list_t *nodelist);
static int orte_ras_slurm_parse_ranges(char *base, char *ranges, char ***nodelist);
static int orte_ras_slurm_parse_range(char *base, char *range, char ***nodelist);

static int dyn_allocate(orte_job_t *jdata);
static char* get_node_list(orte_app_context_t *app);
static int parse_alloc_msg(char *msg, int *idx, int *sjob,
                           char **nodelist, char **tpn);

static void recv_data(int fd, short args, void *cbdata);
static void timeout(int fd, short args, void *cbdata);
static int read_ip_port(char *filename, char **ip, uint16_t *port);

/* define structs for tracking dynamic allocations */
typedef struct {
    opal_object_t super;
    int sjob;
} local_apptracker_t;
OBJ_CLASS_INSTANCE(local_apptracker_t,
                   opal_object_t,
                   NULL, NULL);

typedef struct {
    opal_list_item_t super;
    char *cmd;
    opal_event_t timeout_ev;
    orte_jobid_t jobid;
    opal_pointer_array_t apps;
    int napps;
} local_jobtracker_t;
static void jtrk_cons(local_jobtracker_t *ptr)
{
    ptr->cmd = NULL;
    OBJ_CONSTRUCT(&ptr->apps, opal_pointer_array_t);
    opal_pointer_array_init(&ptr->apps, 1, INT_MAX, 1);
    ptr->napps = 0;
}
static void jtrk_des(local_jobtracker_t *ptr)
{
    int i;
    local_apptracker_t *ap;

    if (NULL != ptr->cmd) {
        free(ptr->cmd);
    }
    for (i=0; i < ptr->apps.size; i++) {
        if (NULL != (ap = (local_apptracker_t*)opal_pointer_array_get_item(&ptr->apps, i))) {
            OBJ_RELEASE(ap);
        }
    }
    OBJ_DESTRUCT(&ptr->apps);
}
OBJ_CLASS_INSTANCE(local_jobtracker_t,
                   opal_list_item_t,
                   jtrk_cons, jtrk_des);

/* local vars */
static int socket_fd;
static opal_list_t jobs;
static opal_event_t recv_ev;

/* init the module */
static int init(void)
{
    char *slurm_host=NULL;
    uint16_t port=0;
    struct sockaddr_in address;
    int flags;
    struct hostent *h;

    if (mca_ras_slurm_component.dyn_alloc_enabled) {
        if (NULL == mca_ras_slurm_component.config_file) {
            orte_show_help("help-ras-slurm.txt", "dyn-alloc-no-config", true);
            return ORTE_ERR_SILENT;
        }
        /* setup the socket */
        if (ORTE_SUCCESS != read_ip_port(mca_ras_slurm_component.config_file,
                                         &slurm_host, &port) ||
            NULL == slurm_host || 0 == port) {
            if (NULL != slurm_host) {
                free(slurm_host);
            }
            return ORTE_ERR_SILENT;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_ras_base_framework.framework_output,
                             "ras:slurm got [ ip = %s, port = %u ] from %s\n",
                             slurm_host, port, mca_ras_slurm_component.config_file));

        /* obtain a socket for our use */
        if ((socket_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(slurm_host);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* connect to the Slurm dynamic allocation port */
        bzero(&address, sizeof(address));
        address.sin_family = AF_INET;
        if (!opal_net_isaddr(slurm_host)) {
            /* if the ControlMachine was not specified as an IP address,
             * we need to resolve it here
             */
            if (NULL == (h = gethostbyname(slurm_host))) {
                /* could not resolve it */
                orte_show_help("help-ras-slurm.txt", "host-not-resolved",
                               true, slurm_host);
                free(slurm_host);
                return ORTE_ERR_SILENT;
            }
            free(slurm_host);
            slurm_host = strdup(inet_ntoa(*(struct in_addr*)h->h_addr_list[0]));
        }
        address.sin_addr.s_addr = inet_addr(slurm_host);
        address.sin_port = htons(port);
        if (connect(socket_fd, (struct sockaddr*)&address, sizeof(address)) < 0) {
            orte_show_help("help-ras-slurm.txt", "connection-failed",
                           true, slurm_host, (int)port);
            free(slurm_host);
            return ORTE_ERR_SILENT;
        }
        free(slurm_host);

        /* set socket up to be non-blocking */
        if ((flags = fcntl(socket_fd, F_GETFL, 0)) < 0) {
            opal_output(0, "ras:slurm:dyn: fcntl(F_GETFL) failed: %s (%d)",
                        strerror(opal_socket_errno), opal_socket_errno);
            return ORTE_ERROR;
        } else {
            flags |= O_NONBLOCK;
            if (fcntl(socket_fd, F_SETFL, flags) < 0) {
                opal_output(0, "ras:slurm:dyn: fcntl(F_SETFL) failed: %s (%d)",
                            strerror(opal_socket_errno), opal_socket_errno);
                return ORTE_ERROR;
            }
        }

        /* setup to recv data */
        opal_event_set(orte_event_base, &recv_ev, socket_fd,
                       OPAL_EV_READ, recv_data, NULL);
        opal_event_add(&recv_ev, 0);

        /* initialize the list of jobs for tracking dynamic allocations */
        OBJ_CONSTRUCT(&jobs, opal_list_t);
    }
    return ORTE_SUCCESS;
}

/**
 * Discover available (pre-allocated) nodes.  Allocate the
 * requested number of nodes/process slots to the job.
 *
 */
static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
{
    int ret, cpus_per_task;
    char *slurm_node_str, *regexp;
    char *tasks_per_node, *node_tasks;
    char *tmp;
    char *slurm_jobid;

    if (NULL == (slurm_jobid = getenv("SLURM_JOBID"))) {
        /* we are not in a slurm allocation - see if dyn alloc
         * is enabled
         */
        if (!mca_ras_slurm_component.dyn_alloc_enabled) {
            /* nope - nothing we can do */
            opal_output_verbose(2, orte_ras_base_framework.framework_output,
                                "%s ras:slurm: no prior allocation and dynamic alloc disabled",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    } else {
        /* save this value in the global job ident string for
         * later use in any error reporting
         */
        orte_job_ident = strdup(slurm_jobid);
    }

    slurm_node_str = getenv("SLURM_NODELIST");
    if (NULL == slurm_node_str) {
        /* see if dynamic allocation is enabled */
        if (mca_ras_slurm_component.dyn_alloc_enabled) {
            /* attempt to get the allocation - dyn_allocate will return
             * ORTE_ERR_ALLOCATION_PENDING if it succeeds in sending
             * the allocation request
             */
            ret = dyn_allocate(jdata);
            /* return to the layer above in ras/base/ras_base_allocate.c
             * to wait for the (libevent) response event
             */
            return ret;
        }
        orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                       "SLURM_NODELIST");
        return ORTE_ERR_NOT_FOUND;
    }
    regexp = strdup(slurm_node_str);
    if (NULL == regexp) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (mca_ras_slurm_component.use_all) {
        /* this is an oddball case required for debug situations where
         * a tool is started that will then call mpirun. In this case,
         * Slurm will assign only 1 task per node to the tool, but
         * we want mpirun to use the entire allocation. They don't give
         * us a specific variable for this purpose, so we have to fudge
         * a bit - but this is a special edge case, and we'll live with it */
        tasks_per_node = getenv("SLURM_JOB_CPUS_PER_NODE");
        if (NULL == tasks_per_node) {
            /* couldn't find any version - abort */
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                           "SLURM_JOB_CPUS_PER_NODE");
            free(regexp);
            return ORTE_ERR_NOT_FOUND;
        }
        node_tasks = strdup(tasks_per_node);
        if (NULL == node_tasks) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(regexp);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        cpus_per_task = 1;
    } else {
        /* get the number of process slots we were assigned on each node */
        tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
        if (NULL == tasks_per_node) {
            /* couldn't find any version - abort */
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                           "SLURM_TASKS_PER_NODE");
            free(regexp);
            return ORTE_ERR_NOT_FOUND;
        }
        node_tasks = strdup(tasks_per_node);
        if (NULL == node_tasks) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(regexp);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* get the number of CPUs per task that the user provided to slurm */
        tmp = getenv("SLURM_CPUS_PER_TASK");
        if (NULL != tmp) {
            cpus_per_task = atoi(tmp);
            if (0 >= cpus_per_task) {
                opal_output(0, "ras:slurm:allocate: Got bad value from SLURM_CPUS_PER_TASK. "
                            "Variable was: %s\n", tmp);
                ORTE_ERROR_LOG(ORTE_ERROR);
                free(node_tasks);
                free(regexp);
                return ORTE_ERROR;
            }
        } else {
            cpus_per_task = 1;
        }
    }

    ret = orte_ras_slurm_discover(regexp, node_tasks, nodes);
    free(regexp);
    free(node_tasks);
    if (ORTE_SUCCESS != ret) {
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s ras:slurm:allocate: discover failed!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ret;
    }
    /* record the number of allocated nodes */
    orte_num_allocated_nodes = opal_list_get_size(nodes);

    /* All done */

    OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                         "%s ras:slurm:allocate: success",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return ORTE_SUCCESS;
}
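
/* For illustration only (all values hypothetical): inside a two-node
 * allocation, the environment consumed above might contain
 *
 *   SLURM_JOBID=1234
 *   SLURM_NODELIST=node[001-002]
 *   SLURM_TASKS_PER_NODE=4(x2)
 *
 * which orte_ras_slurm_discover() expands into node001 and node002
 * with 4 slots each.
 */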

static void deallocate(orte_job_t *jdata,
                       orte_app_context_t *app)
{
}

static int orte_ras_slurm_finalize(void)
{
    opal_list_item_t *item;

    if (mca_ras_slurm_component.dyn_alloc_enabled) {
        /* delete the recv event */
        opal_event_del(&recv_ev);
        while (NULL != (item = opal_list_remove_first(&jobs))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&jobs);
        /* close the socket */
        shutdown(socket_fd, 2);
        close(socket_fd);
    }
    return ORTE_SUCCESS;
}

/**
 * Discover the available resources.
 *
 * In order to fully support slurm, we need to be able to handle
 * node regexp/task_per_node strings such as:
 * foo,bar    5,3
 * foo        5
 * foo[2-10,12,99-105],bar,foobar[3-11] 2(x10),5,100(x16)
 *
 * @param *regexp A node regular expression from SLURM (i.e. SLURM_NODELIST)
 * @param *tasks_per_node A tasks per node expression from SLURM
 *                        (i.e. SLURM_TASKS_PER_NODE)
 * @param *nodelist A list which has already been constructed to return
 *                  the found nodes in
 */
static int orte_ras_slurm_discover(char *regexp, char *tasks_per_node,
                                   opal_list_t* nodelist)
{
    int i, j, len, ret, count, reps, num_nodes;
    char *base, **names = NULL;
    char *begptr, *endptr, *orig;
    int *slots;
    bool found_range = false;
    bool more_to_come = false;
    char *ptr;

    orig = base = strdup(regexp);
    if (NULL == base) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                         "%s ras:slurm:allocate:discover: checking nodelist: %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         regexp));

    do {
        /* Find the base */
        len = strlen(base);
        for (i = 0; i <= len; ++i) {
            if (base[i] == '[') {
                /* we found a range. this gets dealt with below */
                base[i] = '\0';
                found_range = true;
                break;
            }
            if (base[i] == ',') {
                /* we found a singleton node, and there are more to come */
                base[i] = '\0';
                found_range = false;
                more_to_come = true;
                break;
            }
            if (base[i] == '\0') {
                /* we found a singleton node */
                found_range = false;
                more_to_come = false;
                break;
            }
        }
        if (i == 0) {
            /* we found a special character at the beginning of the string */
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value",
                           1, regexp, tasks_per_node, "SLURM_NODELIST");
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            free(orig);
            return ORTE_ERR_BAD_PARAM;
        }

        if (found_range) {
            /* If we found a range, now find the end of the range */
            for (j = i; j < len; ++j) {
                if (base[j] == ']') {
                    base[j] = '\0';
                    break;
                }
            }
            if (j >= len) {
                /* we didn't find the end of the range */
                orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value",
                               1, regexp, tasks_per_node, "SLURM_NODELIST");
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                free(orig);
                return ORTE_ERR_BAD_PARAM;
            }

            ret = orte_ras_slurm_parse_ranges(base, base + i + 1, &names);
            if (ORTE_SUCCESS != ret) {
                orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value",
                               1, regexp, tasks_per_node, "SLURM_NODELIST");
                ORTE_ERROR_LOG(ret);
                free(orig);
                return ret;
            }
            if (base[j + 1] == ',') {
                more_to_come = true;
                base = &base[j + 2];
            } else {
                more_to_come = false;
            }
        } else {
            /* If we didn't find a range, just add the node */

            OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                                 "%s ras:slurm:allocate:discover: found node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 base));

            if (ORTE_SUCCESS != (ret = opal_argv_append_nosize(&names, base))) {
                ORTE_ERROR_LOG(ret);
                free(orig);
                return ret;
            }
            /* set base equal to the (possible) next base to look at */
            base = &base[i + 1];
        }
    } while (more_to_come);

    free(orig);

    num_nodes = opal_argv_count(names);

    /* Find the number of slots per node */

    slots = malloc(sizeof(int) * num_nodes);
    if (NULL == slots) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    memset(slots, 0, sizeof(int) * num_nodes);

    orig = begptr = strdup(tasks_per_node);
    if (NULL == begptr) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(slots);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    j = 0;
    while (begptr) {
        count = strtol(begptr, &endptr, 10);
        if ((endptr[0] == '(') && (endptr[1] == 'x')) {
            reps = strtol((endptr+2), &endptr, 10);
            if (endptr[0] == ')') {
                endptr++;
            }
        } else {
            reps = 1;
        }

        /**
         * TBP: it seems like it would be an error to have more slot
         * descriptions than nodes. Turns out that this is valid, and SLURM
         * will return such a thing. For instance, if I did:
         * srun -A -N 30 -w odin001
         * I would get SLURM_NODELIST=odin001 SLURM_TASKS_PER_NODE=4(x30)
         * That is, I am allocated 30 nodes, but since I only requested
         * one specific node, that's what is in the nodelist.
         * I'm not sure this is what users would expect, but I think it is
         * more of a SLURM issue than an orte issue. Since SLURM is OK with
         * it, I'm OK with it.
         */
        for (i = 0; i < reps && j < num_nodes; i++) {
            slots[j++] = count;
        }

        if (*endptr == ',') {
            begptr = endptr + 1;
        } else if (*endptr == '\0' || j >= num_nodes) {
            break;
        } else {
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1,
                           regexp, tasks_per_node, "SLURM_TASKS_PER_NODE");
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            free(slots);
            free(orig);
            return ORTE_ERR_BAD_PARAM;
        }
    }

    free(orig);

    /* Convert the argv of node names to a list of node_t's */

    for (i = 0; NULL != names && NULL != names[i]; ++i) {
        orte_node_t *node;

        if (!orte_keep_fqdn_hostnames && !opal_net_isaddr(names[i])) {
            if (NULL != (ptr = strchr(names[i], '.'))) {
                *ptr = '\0';
            }
        }

        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s ras:slurm:allocate:discover: adding node %s (%d slot%s)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             names[i], slots[i], (1 == slots[i]) ? "" : "s"));

        node = OBJ_NEW(orte_node_t);
        if (NULL == node) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(slots);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        node->name = strdup(names[i]);
        node->state = ORTE_NODE_STATE_UP;
        node->slots_inuse = 0;
        node->slots_max = 0;
        node->slots = slots[i];
        opal_list_append(nodelist, &node->super);
    }
    free(slots);
    opal_argv_free(names);

    /* All done */
    return ret;
}


/*
 * Parse one or more ranges in a set
 *
 * @param base     The base text of the node name
 * @param *ranges  A pointer to a range. This can contain multiple ranges
 *                 (i.e. "1-3,10" or "5" or "9,0100-0130,250")
 * @param ***names An argv array to add the newly discovered nodes to
 */
static int orte_ras_slurm_parse_ranges(char *base, char *ranges, char ***names)
{
    int i, len, ret;
    char *start, *orig;

    /* Look for commas, the separator between ranges */

    len = strlen(ranges);
    for (orig = start = ranges, i = 0; i < len; ++i) {
        if (',' == ranges[i]) {
            ranges[i] = '\0';
            ret = orte_ras_slurm_parse_range(base, start, names);
            if (ORTE_SUCCESS != ret) {
                ORTE_ERROR_LOG(ret);
                return ret;
            }
            start = ranges + i + 1;
        }
    }

    /* Pick up the last range, if it exists */

    if (start < orig + len) {

        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s ras:slurm:allocate:discover: parse range %s (2)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             start));

        ret = orte_ras_slurm_parse_range(base, start, names);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }

    /* All done */
    return ORTE_SUCCESS;
}

/*
 * Parse a single range in a set and add the full names of the nodes
 * found to the names argv
 *
 * @param base     The base text of the node name
 * @param *range   A pointer to a single range (i.e. "1-3" or "5")
 * @param ***names An argv array to add the newly discovered nodes to
 */
static int orte_ras_slurm_parse_range(char *base, char *range, char ***names)
{
    char *str, temp1[BUFSIZ];
    size_t i, j, start, end;
    size_t base_len, len, num_len;
    size_t num_str_len;
    bool found;
    int ret;

    len = strlen(range);
    base_len = strlen(base);
    /* Silence compiler warnings; start and end are always assigned
       properly, below */
    start = end = 0;

    /* Look for the beginning of the first number */

    for (found = false, i = 0; i < len; ++i) {
        if (isdigit((int) range[i])) {
            if (!found) {
                start = atoi(range + i);
                found = true;
                break;
            }
        }
    }
    if (!found) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* Look for the end of the first number */

    for (found = false, num_str_len = 0; i < len; ++i, ++num_str_len) {
        if (!isdigit((int) range[i])) {
            break;
        }
    }

    /* Was there no range, just a single number? */

    if (i >= len) {
        end = start;
        found = true;
    }

    /* Nope, there was a range.  Look for the beginning of the second
       number */

    else {
        for (; i < len; ++i) {
            if (isdigit((int) range[i])) {
                end = atoi(range + i);
                found = true;
                break;
            }
        }
    }
    if (!found) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* Make strings for all values in the range */

    len = base_len + num_str_len + 32;
    str = malloc(len);
    if (NULL == str) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    strcpy(str, base);
    for (i = start; i <= end; ++i) {
        str[base_len] = '\0';
        snprintf(temp1, BUFSIZ - 1, "%lu", (unsigned long) i);

        /* Do we need zero padding? */

        if ((num_len = strlen(temp1)) < num_str_len) {
            for (j = base_len; j < base_len + (num_str_len - num_len); ++j) {
                str[j] = '0';
            }
            str[j] = '\0';
        }
        strcat(str, temp1);
        ret = opal_argv_append_nosize(names, str);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free(str);
            return ret;
        }
    }
    free(str);

    /* All done */
    return ORTE_SUCCESS;
}
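
/* Illustrative sketch of the two parsers above - not compiled, and the
 * input values are hypothetical. Note that parse_range() preserves the
 * zero padding of the first number in a range.
 */
#if 0
static void example_parse_ranges(void)
{
    char **names = NULL;
    char ranges[] = "0100-0102,250";  /* must be writable - it is split in place */

    if (ORTE_SUCCESS == orte_ras_slurm_parse_ranges("odin", ranges, &names)) {
        /* names now holds: odin0100, odin0101, odin0102, odin250 */
        opal_argv_free(names);
    }
}
#endif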

static void timeout(int fd, short args, void *cbdata)
{
    local_jobtracker_t *jtrk = (local_jobtracker_t*)cbdata;
    orte_job_t *jdata;

    orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-timeout", true);
    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s Timed out on dynamic allocation",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    /* indicate that we failed to receive an allocation */
    jdata = orte_get_job_data_object(jtrk->jobid);
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
}

static void recv_data(int fd, short args, void *cbdata)
{
    bool found;
    int i, rc;
    orte_node_t *nd, *nd2;
    opal_list_t nds, ndtmp;
    opal_list_item_t *item, *itm;
    char recv_msg[8192];
    int nbytes, idx, sjob;
    char **alloc, *nodelist, *tpn;
    local_jobtracker_t *ptr, *jtrk;
    local_apptracker_t *aptrk;
    orte_app_context_t *app;
    orte_jobid_t jobid;
    orte_job_t *jdata;
    char **dash_host = NULL;

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation - data recvd",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* read the data from the socket and put it in the
     * nodes field of op
     */
    memset(recv_msg, 0, sizeof(recv_msg));
    nbytes = read(fd, recv_msg, sizeof(recv_msg) - 1);

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation msg: %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg);

    /* check if we got something */
    if (0 == nbytes || 0 == strlen(recv_msg) || strstr(recv_msg, "failure") != NULL) {
        /* show an error here - basically, a "nothing was available"
         * message
         */
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true,
                       (0 == strlen(recv_msg)) ? "NO MSG" : recv_msg);
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        return;
    }

    /* break the message into its component parts, separated by colons */
    alloc = opal_argv_split(recv_msg, ':');

    /* the first section contains the ORTE jobid for this allocation */
    tpn = strchr(alloc[0], '=');
    orte_util_convert_string_to_jobid(&jobid, tpn+1);
    /* get the corresponding job object */
    jdata = orte_get_job_data_object(jobid);
    jtrk = NULL;
    /* find the associated tracking object */
    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = opal_list_get_next(item)) {
        ptr = (local_jobtracker_t*)item;
        if (ptr->jobid == jobid) {
            jtrk = ptr;
            break;
        }
    }
    if (NULL == jtrk) {
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, "NO JOB TRACKER");
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        opal_argv_free(alloc);
        return;
    }

    /* stop the timeout event */
    opal_event_del(&jtrk->timeout_ev);

    /* cycle across all the remaining parts - each is the allocation for
     * an app in this job
     */
    OBJ_CONSTRUCT(&nds, opal_list_t);
    OBJ_CONSTRUCT(&ndtmp, opal_list_t);
    idx = -1;
    sjob = -1;
    nodelist = NULL;
    tpn = NULL;
    for (i=1; NULL != alloc[i]; i++) {
        if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            if (NULL != nodelist) {
                free(nodelist);
            }
            if (NULL != tpn) {
                free(tpn);
            }
            return;
        }
        if (idx < 0) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        /* release the current dash_host as that contained the *desired* allocation */
        orte_remove_attribute(&app->attributes, ORTE_APP_DASH_HOST);
        /* track the Slurm jobid */
        if (NULL == (aptrk = (local_apptracker_t*)opal_pointer_array_get_item(&jtrk->apps, idx))) {
            aptrk = OBJ_NEW(local_apptracker_t);
            opal_pointer_array_set_item(&jtrk->apps, idx, aptrk);
        }
        aptrk->sjob = sjob;
        /* since the nodelist/tpn may contain regular expressions, parse them */
        if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) {
            ORTE_ERROR_LOG(rc);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        /* transfer the discovered nodes to our node list, and construct
         * the new dash_host entry to match what was allocated
         */
        while (NULL != (item = opal_list_remove_first(&ndtmp))) {
            nd = (orte_node_t*)item;
            opal_argv_append_nosize(&dash_host, nd->name);
            /* check for duplicates */
            found = false;
            for (itm = opal_list_get_first(&nds);
                 itm != opal_list_get_end(&nds);
                 itm = opal_list_get_next(itm)) {
                nd2 = (orte_node_t*)itm;
                if (0 == strcmp(nd->name, nd2->name)) {
                    found = true;
                    nd2->slots += nd->slots;
                    OBJ_RELEASE(item);
                    break;
                }
            }
            if (!found) {
                /* append the new node to our list */
                opal_list_append(&nds, item);
            }
        }
        /* cleanup */
        free(nodelist);
        free(tpn);
    }
    /* cleanup */
    opal_argv_free(alloc);
    OBJ_DESTRUCT(&ndtmp);
    if (NULL != dash_host) {
        tpn = opal_argv_join(dash_host, ',');
        for (idx=0; idx < jdata->apps->size; idx++) {
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
                orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
                opal_argv_free(dash_host);
                free(tpn);
                return;
            }
            orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, (void*)tpn, OPAL_STRING);
        }
        opal_argv_free(dash_host);
        free(tpn);
    }

    if (opal_list_is_empty(&nds)) {
        /* if we get here, then we were able to contact slurm,
         * which means we are in an actively managed cluster.
         * However, slurm indicated that nothing is currently
         * available that meets our requirements. This is a fatal
         * situation - we do NOT have the option of running on
         * user-specified hosts as the cluster is managed.
         */
        OBJ_DESTRUCT(&nds);
        orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        /* do not fall through - the list has been destructed */
        return;
    }

    /* store the found nodes */
    if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nds, jdata))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&nds);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }
    OBJ_DESTRUCT(&nds);

    /* default to no-oversubscribe-allowed for managed systems */
    if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
    }
    /* flag that the allocation is managed */
    orte_managed_allocation = true;
    /* move the job along */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);
    /* all done */
    return;
}

/* we cannot use the RML to communicate with SLURM as it doesn't
 * understand our internal protocol, so we have to do a bare-bones
 * exchange based on sockets
 */
static int dyn_allocate(orte_job_t *jdata)
{
    char *cmd_str, **cmd=NULL, *tmp, *jstring;
    char *node_list;
    orte_app_context_t *app;
    int i;
    struct timeval tv;
    local_jobtracker_t *jtrk;
    int64_t i64, *i64ptr;

    if (NULL == mca_ras_slurm_component.config_file) {
        opal_output(0, "Cannot perform dynamic allocation as no Slurm configuration file provided");
        return ORTE_ERR_NOT_FOUND;
    }

    /* track this request */
    jtrk = OBJ_NEW(local_jobtracker_t);
    jtrk->jobid = jdata->jobid;
    opal_list_append(&jobs, &jtrk->super);

    /* construct the command - note that the jdata structure contains
     * a field for the minimum number of nodes required for the job.
     * The node list can be constructed from the union of all the nodes
     * contained in the dash_host field of the app_contexts. So you'll
     * need to do a little work to build the command. We don't currently
     * have a field in the jdata structure for "mandatory" vs "optional"
     * allocations, so we'll have to add that someday. Likewise, you may
     * want to provide a param to adjust the timeout value
     */
    /* construct the cmd string */
    opal_argv_append_nosize(&cmd, "allocate");
    /* add the jobid */
    orte_util_convert_jobid_to_string(&jstring, jdata->jobid);
    opal_asprintf(&tmp, "jobid=%s", jstring);
    opal_argv_append_nosize(&cmd, tmp);
    free(tmp);
    free(jstring);
    /* if we want the allocation for all apps in one shot,
     * then tell slurm
     *
     * RHC: we don't currently have the ability to handle
     * rolling allocations in the rest of the code base
     */
#if 0
    if (!mca_ras_slurm_component.rolling_alloc) {
        opal_argv_append_nosize(&cmd, "return=all");
    }
#else
    opal_argv_append_nosize(&cmd, "return=all");
#endif

    /* pass the timeout */
    opal_asprintf(&tmp, "timeout=%d", mca_ras_slurm_component.timeout);
    opal_argv_append_nosize(&cmd, tmp);
    free(tmp);

    /* for each app, add its allocation request info */
    i64ptr = &i64;
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* add the app id, preceded by a colon separator */
        opal_asprintf(&tmp, ": app=%d", (int)app->idx);
        opal_argv_append_nosize(&cmd, tmp);
        free(tmp);
        /* add the number of process "slots" we need */
        opal_asprintf(&tmp, "np=%d", app->num_procs);
        opal_argv_append_nosize(&cmd, tmp);
        free(tmp);
        /* if we were given a minimum number of nodes, pass it along */
        if (orte_get_attribute(&app->attributes, ORTE_APP_MIN_NODES, (void**)&i64ptr, OPAL_INT64)) {
            opal_asprintf(&tmp, "N=%ld", (long int)i64);
            opal_argv_append_nosize(&cmd, tmp);
            free(tmp);
        }
        /* add the list of nodes, if one was given, ensuring
         * that each node only appears once
         */
        node_list = get_node_list(app);
        if (NULL != node_list) {
            opal_asprintf(&tmp, "node_list=%s", node_list);
            opal_argv_append_nosize(&cmd, tmp);
            free(node_list);
            free(tmp);
        }
        /* add the mandatory/optional flag */
        if (orte_get_attribute(&app->attributes, ORTE_APP_MANDATORY, NULL, OPAL_BOOL)) {
            opal_argv_append_nosize(&cmd, "flag=mandatory");
        } else {
            opal_argv_append_nosize(&cmd, "flag=optional");
        }
    }

    /* assemble it into the final cmd to be sent, and keep a copy on the
     * tracker so error messages can echo what was requested */
    cmd_str = opal_argv_join(cmd, ' ');
    opal_argv_free(cmd);
    jtrk->cmd = strdup(cmd_str);

    /* start a timer - if the response to our request doesn't appear
     * in the defined time, then we will error out as Slurm isn't
     * responding to us
     */
    opal_event_evtimer_set(orte_event_base, &jtrk->timeout_ev, timeout, jtrk);
    tv.tv_sec = mca_ras_slurm_component.timeout * 2;
    tv.tv_usec = 0;
    opal_event_evtimer_add(&jtrk->timeout_ev, &tv);

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s slurm:dynalloc cmd_str = %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        cmd_str);

    if (send(socket_fd, cmd_str, strlen(cmd_str)+1, 0) < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
    }
    free(cmd_str);

    /* we cannot wait here for a response as we
     * are already in an event. So return a value
     * that indicates we are waiting for an
     * allocation so the base functions know
     * that they shouldn't progress the job
     */
    return ORTE_ERR_ALLOCATION_PENDING;
}
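
/* For illustration only: with a single two-process app and no node list,
 * the command assembled above would resemble
 *
 *   allocate jobid=<jobid-string> return=all timeout=30 : app=0 np=2 flag=optional
 *
 * where <jobid-string> is whatever orte_util_convert_jobid_to_string()
 * produced and the timeout value is hypothetical.
 */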

static int parse_alloc_msg(char *msg, int *idx, int *sjob,
                           char **nodelist, char **tpn)
{
    char *tmp;
    char *p_str;
    char *pos;
    int found=0;

    if (msg == NULL || strlen(msg) == 0) {
        return ORTE_ERR_BAD_PARAM;
    }

    tmp = strdup(msg);
    p_str = strtok(tmp, " ");
    while (p_str) {
        if (NULL != strstr(p_str, "slurm_jobid")) {
            pos = strchr(p_str, '=');
            *sjob = strtol(pos+1, NULL, 10);
            found++;
        } else if (NULL != strstr(p_str, "allocated_node_list")) {
            pos = strchr(p_str, '=');
            *nodelist = strdup(pos+1);
            found++;
        } else if (NULL != strstr(p_str, "tasks_per_node")) {
            pos = strchr(p_str, '=');
            *tpn = strdup(pos+1);
            found++;
        } else if (NULL != strstr(p_str, "app")) {
            pos = strchr(p_str, '=');
            *idx = strtol(pos+1, NULL, 10);
            found++;
        }
        p_str = strtok(NULL, " ");
    }
    free(tmp);

    if (4 != found) {
        return ORTE_ERR_NOT_FOUND;
    }
    return ORTE_SUCCESS;
}
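
/* Hypothetical example of one colon-separated segment as parsed above:
 *
 *   "app=0 slurm_jobid=1234 allocated_node_list=node[001-004] tasks_per_node=2(x4)"
 *
 * which yields idx=0, sjob=1234, nodelist="node[001-004]", and tpn="2(x4)".
 */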

static char* get_node_list(orte_app_context_t *app)
{
    int j;
    char **total_host = NULL;
    char *nodes;
    char **dash_host, *dh;

    if (!orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&dh, OPAL_STRING)) {
        return NULL;
    }
    dash_host = opal_argv_split(dh, ',');
    free(dh);
    for (j=0; NULL != dash_host[j]; j++) {
        opal_argv_append_unique_nosize(&total_host, dash_host[j], false);
    }
    opal_argv_free(dash_host);
    if (NULL == total_host) {
        return NULL;
    }

    nodes = opal_argv_join(total_host, ',');
    opal_argv_free(total_host);
    return nodes;
}
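
/* e.g. (hypothetical hosts) a dash_host attribute of "a,b,a,c" comes back
 * from get_node_list() as "a,b,c" - duplicates dropped, order preserved.
 */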

static int read_ip_port(char *filename, char **ip, uint16_t *port)
{
    FILE *fp;
    char line[ORTE_SLURM_DYN_MAX_SIZE];
    char *pos;
    bool found_port = false;
    bool found_ip = false;

    if (NULL == (fp = fopen(filename, "r"))) {
        orte_show_help("help-ras-slurm.txt", "config-file-not-found", true, filename);
        return ORTE_ERR_SILENT;
    }

    memset(line, 0, ORTE_SLURM_DYN_MAX_SIZE);
    while (NULL != fgets(line, ORTE_SLURM_DYN_MAX_SIZE, fp) &&
           (!found_ip || !found_port)) {
        if (0 == strlen(line)) {
            continue;
        }
        line[strlen(line)-1] = '\0';
        if (0 == strncmp(line, "JobSubmitDynAllocPort", strlen("JobSubmitDynAllocPort"))) {
            pos = strstr(line, "=") + 1;
            *port = strtol(pos, NULL, 10);
            found_port = true;
        } else if (0 == strncmp(line, "ControlMachine", strlen("ControlMachine"))) {
            pos = strstr(line, "=") + 1;
            *ip = strdup(pos);
            found_ip = true;
        }
        memset(line, 0, ORTE_SLURM_DYN_MAX_SIZE);
    }

    fclose(fp);
    if (!found_ip) {
        opal_output(0, "The IP address or name of the Slurm control machine was not provided");
        return ORTE_ERR_NOT_FOUND;
    }
    if (!found_port) {
        opal_output(0, "The IP port of the Slurm dynamic allocation service was not provided");
        return ORTE_ERR_NOT_FOUND;
    }

    return ORTE_SUCCESS;
}
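
/* A minimal configuration file accepted by read_ip_port() might contain
 * (values hypothetical):
 *
 *   ControlMachine=headnode
 *   JobSubmitDynAllocPort=7890
 *
 * ControlMachine may be a hostname (resolved in init()) or an IP address.
 */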
