root/orte/mca/ras/gridengine/ras_gridengine_module.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. orte_ras_gridengine_allocate
  2. get_slot_count
  3. orte_ras_gridengine_finalize

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2005 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2006-2010 Oracle and/or its affiliates.  All rights reserved
  13  * Copyright (c) 2016      IBM Corporation.  All rights reserved.
  14  * $COPYRIGHT$
  15  *
  16  * Additional copyrights may follow
  17  *
  18  * $HEADER$
  19  */
  20 /**
  21  * @file:
  22  * Resource Allocation for Grid Engine
  23  */
#include "orte_config.h"
#include "orte/constants.h"

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "opal/util/output.h"
#include "opal/util/net.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/ras/base/ras_private.h"
#include "orte/mca/ras/gridengine/ras_gridengine.h"
  38 
  39 /*
  40  * Local functions
  41  */
  42 static int orte_ras_gridengine_allocate(orte_job_t *jdata, opal_list_t *nodes);
  43 static int orte_ras_gridengine_finalize(void);
  44 #if 0
  45 static int get_slot_count(char* node_name, int* slot_cnt);
  46 #endif
  47 
  48 /*
  49  * Global variable
  50  */
  51 orte_ras_base_module_t orte_ras_gridengine_module = {
  52     NULL,
  53     orte_ras_gridengine_allocate,
  54     NULL,
  55     orte_ras_gridengine_finalize
  56 };
  57 
  58 /**
  59  *  Discover available (pre-allocated) nodes. Allocate the
  60  *  requested number of nodes/process slots to the job.
  61  *
  62  */
  63 static int orte_ras_gridengine_allocate(orte_job_t *jdata, opal_list_t *nodelist)
  64 {
  65     char *pe_hostfile = getenv("PE_HOSTFILE");
  66     char *job_id = getenv("JOB_ID");
  67     char buf[1024], *tok, *num, *queue, *arch, *ptr, *tmp;
  68     int rc;
  69     FILE *fp;
  70     orte_node_t *node;
  71     opal_list_item_t *item;
  72     bool found;
  73 
  74     /* show the Grid Engine's JOB_ID */
  75     if (mca_ras_gridengine_component.show_jobid ||
  76         mca_ras_gridengine_component.verbose != -1) {
  77         opal_output(0, "ras:gridengine: JOB_ID: %s", job_id);
  78     }
  79 
  80     /* check the PE_HOSTFILE before continuing on */
  81     if (!(fp = fopen(pe_hostfile, "r"))) {
  82         orte_show_help("help-ras-gridengine.txt", "cannot-read-pe-hostfile",
  83             true, pe_hostfile, strerror(errno));
  84         rc = ORTE_ERROR;
  85         ORTE_ERROR_LOG(rc);
  86         goto cleanup;
  87     }
  88 
  89     /* parse the pe_hostfile for hostname, slots, etc, then compare the
  90      * current node with a list of hosts in the nodelist, if the current
  91      * node is not found in nodelist, add it in */
  92     opal_output(mca_ras_gridengine_component.verbose,
  93                 "ras:gridengine: PE_HOSTFILE: %s", pe_hostfile);
  94 
  95     while (fgets(buf, sizeof(buf), fp)) {
  96         ptr = strtok_r(buf, " \n", &tok);
  97         num = strtok_r(NULL, " \n", &tok);
  98         queue = strtok_r(NULL, " \n", &tok);
  99         arch = strtok_r(NULL, " \n", &tok);
 100 
 101         if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(ptr) ) {
 102             if (NULL != (tmp = strchr(ptr, '.'))) {
 103                 *tmp = '\0';
 104             }
 105         }
 106 
 107         /* see if we already have this node */
 108         found = false;
 109         for (item = opal_list_get_first(nodelist);
 110              item != opal_list_get_end(nodelist);
 111              item = opal_list_get_next(item)) {
 112             node = (orte_node_t*)item;
 113             if (0 == strcmp(ptr, node->name)) {
 114                 /* just add the slots */
 115                 node->slots += (int)strtol(num, (char **)NULL, 10);
 116                 found = true;
 117                 break;
 118             }
 119         }
 120         if (!found) {
 121             /* create a new node entry */
 122             node = OBJ_NEW(orte_node_t);
 123             if (NULL == node) {
 124                 fclose(fp);
 125                 return ORTE_ERR_OUT_OF_RESOURCE;
 126             }
 127             node->name = strdup(ptr);
 128             node->state = ORTE_NODE_STATE_UP;
 129             node->slots_inuse = 0;
 130             node->slots_max = 0;
 131             node->slots = (int)strtol(num, (char **)NULL, 10);
 132             opal_output(mca_ras_gridengine_component.verbose,
 133                         "ras:gridengine: %s: PE_HOSTFILE shows slots=%d",
 134                         node->name, node->slots);
 135             opal_list_append(nodelist, &node->super);
 136         }
 137     } /* finished reading the $PE_HOSTFILE */
 138 
 139 cleanup:
 140     fclose(fp);
 141 
 142     /* in gridengine, if we didn't find anything, then something
 143      * is wrong. The user may not have indicated this was a parallel
 144      * job, or may not have an allocation at all. In any case, this
 145      * is considered an unrecoverable error and we need to report it
 146      */
 147     if (opal_list_is_empty(nodelist)) {
 148         orte_show_help("help-ras-gridengine.txt", "no-nodes-found", true);
 149         return ORTE_ERR_NOT_FOUND;
 150     }
 151 
 152     return ORTE_SUCCESS;
 153 
 154 }
 155 
 156 #if 0
 157 /**
 158  * This function is not used currently, but may be used eventually.
 159  * Parse the PE_HOSTFILE to determine the number of process
 160  * slots/processors available on the node.
 161  */
 162 static int get_slot_count(char* node_name, int* slot_cnt)
 163 {
 164     char buf[1024], *tok, *name, *num, *queue, *arch;
 165     char *pe_hostfile = getenv("PE_HOSTFILE");
 166     FILE *fp;
 167 
 168     /* check the PE_HOSTFILE before continuing on */
 169     if (!(fp = fopen(pe_hostfile, "r"))) {
 170         orte_show_help("help-ras-gridengine.txt", "cannot-read-pe-hostfile",
 171             true, pe_hostfile, strerror(errno));
 172         ORTE_ERROR_LOG(ORTE_ERROR);
 173         return(ORTE_ERROR);
 174     }
 175 
 176     while (fgets(buf, sizeof(buf), fp)) {
 177         name = strtok_r(buf, " \n", &tok);
 178         num = strtok_r(NULL, " \n", &tok);
 179         queue = strtok_r(NULL, " \n", &tok);
 180         arch = strtok_r(NULL, " \n", &tok);
 181 
 182         if(strcmp(node_name,name) == 0) {
 183             *slot_cnt = (int) strtol(num, (char **)NULL, 10);
 184             opal_output(mca_ras_gridengine_component.verbose,
 185                 "ras:gridengine: %s: PE_HOSTFILE shows slots=%d",
 186                 node_name, *slot_cnt);
 187             fclose(fp);
 188             return ORTE_SUCCESS;
 189         }
 190     }
 191 
 192     /* when there is no match */
 193     fclose(fp);
 194     return ORTE_ERROR;
 195 }
 196 #endif
 197 
 198 /**
 199  * finalize
 200  */
 201 static int orte_ras_gridengine_finalize(void)
 202 {
 203     /* Nothing to do */
 204     opal_output(mca_ras_gridengine_component.verbose,
 205         "ras:gridengine:finalize: success (nothing to do)");
 206     return ORTE_SUCCESS;
 207 }

/* [<][>][^][v][top][bottom][index][help] */