root/orte/mca/ess/alps/ess_alps_utils.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. orte_ess_alps_get_first_rank_on_node
  2. orte_ess_alps_sync_start
  3. orte_ess_alps_sync_complete

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2011 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
  13  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
  14  *                         All rights reserved.
  15  * Copyright (c) 2018      Intel, Inc.  All rights reserved.
  16  * $COPYRIGHT$
  17  *
  18  * Additional copyrights may follow
  19  *
  20  * $HEADER$
  21  *
  22  */
  23 
  24 #include "orte_config.h"
  25 #include "orte/constants.h"
  26 
  27 #include "orte/util/show_help.h"
  28 #include "opal/util/argv.h"
  29 
  30 #include "orte/util/proc_info.h"
  31 #include "orte/mca/errmgr/base/base.h"
  32 #include "orte/util/name_fns.h"
  33 #include "orte/runtime/orte_globals.h"
  34 
  35 #include "orte/mca/ess/ess.h"
  36 #include "orte/mca/ess/base/base.h"
  37 #include "orte/mca/ess/alps/ess_alps.h"
  38 
  39 /*
  40  * use the Alps placement file to obtain
  41  * the global rank of the "first" local rank
  42  * on the node.
  43  */
  44 
  45 int
  46 orte_ess_alps_get_first_rank_on_node(int *first_rank)
  47 {
  48     int alps_status = 0;
  49     uint64_t apid;
  50     size_t alps_count;
  51     int ret = ORTE_SUCCESS;
  52     int lli_ret = 0, place_ret;
  53     alpsAppLayout_t orted_layout;
  54 
  55     if (first_rank == NULL) {
  56         ret = ORTE_ERR_BAD_PARAM;
  57         goto fn_exit;
  58     }
  59 
  60     /*
  61      * First get our apid
  62      */
  63 
  64     lli_ret = alps_app_lli_lock();
  65     if (0 != ret) {
  66         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
  67                              "%s ess:alps: alps_app_lli_lock returned %d",
  68                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
  69         ret = ORTE_ERR_FILE_WRITE_FAILURE;
  70         goto fn_exit;
  71     }
  72 
  73     lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_APID, NULL, 0);
  74     if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
  75         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
  76                              "%s ess:alps: alps_app_lli_put_request - APID returned %d",
  77                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
  78         ret = ORTE_ERR_FILE_WRITE_FAILURE;
  79         goto fn_exit_w_lock;
  80     }
  81 
  82     lli_ret = alps_app_lli_get_response (&alps_status, &alps_count);
  83     if (ALPS_APP_LLI_ALPS_STAT_OK != alps_status) {
  84         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
  85                              "%s ess:alps: alps_app_lli_get_response returned %d",
  86                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), alps_status));
  87         ret = ORTE_ERR_FILE_READ_FAILURE;
  88         goto fn_exit_w_lock;
  89     }
  90 
  91     lli_ret = alps_app_lli_get_response_bytes (&apid, sizeof(apid));
  92     if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
  93         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
  94                              "%s ess:alps: alps_app_lli_get_response_bytes returned %d",
  95                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
  96         ret = ORTE_ERR_FILE_READ_FAILURE;
  97         goto fn_exit_w_lock;
  98     }
  99 
 100     place_ret = alps_get_placement_info(apid,
 101                                         &orted_layout,
 102                                         NULL,
 103                                         NULL,
 104                                         NULL,
 105                                         NULL,
 106                                         NULL,
 107                                         NULL,
 108                                         NULL,
 109                                         NULL,
 110                                         NULL);
 111     if (1 != place_ret) {
 112         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
 113                              "%s ess:alps: alps_get_placement_info returned %d (%s)",
 114                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), place_ret, strerror(errno)));
 115         ret = ORTE_ERROR;
 116         goto fn_exit;
 117     }
 118 
 119     OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
 120                            "%s ess:alps: alps_get_placement_info returned %d first pe on node is %d",
 121                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), place_ret, orted_layout.firstPe));
 122     *first_rank = orted_layout.firstPe;
 123 
 124    fn_exit_w_lock:
 125     lli_ret = alps_app_lli_unlock();
 126     if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
 127         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
 128                              "%s ess:alps: alps_app_lli_unlock returned %d",
 129                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
 130         ret = ORTE_ERR_FILE_WRITE_FAILURE;
 131     }
 132 
 133    fn_exit:
 134     return ret;
 135 }
 136 
 137 /*
 138  * Function to check in with apshepherd to say we are a parallel application
 139  */
 140 int
 141 orte_ess_alps_sync_start(void)
 142 {
 143     int ret = ORTE_SUCCESS;
 144     int lli_ret = 0;
 145     int alps_status = 0;
 146     size_t alps_count;
 147 
 148     lli_ret = alps_app_lli_lock();
 149     if (0 != ret) {
 150         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
 151                              "%s ess:alps: alps_app_lli_lock returned %d",
 152                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
 153         ret = ORTE_ERR_FILE_WRITE_FAILURE;
 154         goto fn_exit;
 155     }
 156 
 157     lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_START, NULL, 0);
 158     if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
 159         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
 160                              "%s ess:alps: alps_app_lli_put_request returned %d",
 161                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
 162         ret = ORTE_ERR_FILE_WRITE_FAILURE;
 163         goto fn_exit_w_lock;
 164     }
 165 
 166     lli_ret = alps_app_lli_get_response (&alps_status, &alps_count);
 167     if (ALPS_APP_LLI_ALPS_STAT_OK != alps_status) {
 168         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
 169                              "%s ess:alps: alps_app_lli_get_response returned %d",
 170                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), alps_status));
 171         ret = ORTE_ERR_FILE_READ_FAILURE;
 172         goto fn_exit_w_lock;
 173     }
 174 
 175    fn_exit_w_lock:
 176     lli_ret = alps_app_lli_unlock();
 177     if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
 178         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
 179                              "%s ess:alps: alps_app_lli_unlock returned %d",
 180                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
 181         ret = ORTE_ERR_FILE_WRITE_FAILURE;
 182     }
 183 
 184    fn_exit:
 185     return ret;
 186 }
 187 
 188 /*
 189  * Function to check in with apshepherd to say we are a parallel application
 190  */
 191 
 192 int
 193 orte_ess_alps_sync_complete(void)
 194 {
 195     int ret = ORTE_SUCCESS;
 196     int lli_ret = 0;
 197     int alps_status = 0;
 198     size_t alps_count;
 199 
 200     lli_ret = alps_app_lli_lock();
 201     if (0 != ret) {
 202         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
 203                              "%s ess:alps: alps_app_lli_lock returned %d",
 204                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
 205         ret = ORTE_ERR_FILE_WRITE_FAILURE;
 206         goto fn_exit;
 207     }
 208 
 209     lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_EXITING, NULL, 0);
 210     if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
 211         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
 212                              "%s ess:alps: alps_app_lli_put_request returned %d",
 213                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
 214         ret = ORTE_ERR_FILE_WRITE_FAILURE;
 215         goto fn_exit_w_lock;
 216     }
 217 
 218     lli_ret = alps_app_lli_get_response (&alps_status, &alps_count);
 219     if (ALPS_APP_LLI_ALPS_STAT_OK != alps_status) {
 220         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
 221                              "%s ess:alps: alps_app_lli_get_response returned %d",
 222                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), alps_status));
 223         ret = ORTE_ERR_FILE_READ_FAILURE;
 224         goto fn_exit_w_lock;
 225     }
 226 
 227    fn_exit_w_lock:
 228     lli_ret = alps_app_lli_unlock();
 229     if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
 230         OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output,
 231                              "%s ess:alps: alps_app_lli_unlock returned %d",
 232                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret));
 233         ret = ORTE_ERR_FILE_WRITE_FAILURE;
 234     }
 235 
 236    fn_exit:
 237     return ret;
 238 }
 239 
 240 

/* [<][>][^][v][top][bottom][index][help] */