root/ompi/mca/mtl/psm/mtl_psm_component.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ompi_mtl_psm_component_register
  2. ompi_mtl_psm_component_open
  3. ompi_mtl_psm_component_query
  4. ompi_mtl_psm_component_close
  5. get_num_total_procs
  6. get_num_local_procs
  7. get_local_rank
  8. ompi_mtl_psm_component_init

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2005 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
  14  * Copyright (c) 2012-2015 Los Alamos National Security, LLC.
  15  *                         All rights reserved.
  16  * Copyright (c) 2014      Intel Corporation. All rights reserved.
  17  * $COPYRIGHT$
  18  *
  19  * Additional copyrights may follow
  20  *
  21  * $HEADER$
  22  */
  23 
  24 #include "ompi_config.h"
  25 
  26 #include "opal/mca/event/event.h"
  27 #include "opal/util/output.h"
  28 #include "opal/util/show_help.h"
  29 #include "ompi/proc/proc.h"
  30 
  31 #include "mtl_psm.h"
  32 #include "mtl_psm_types.h"
  33 #include "mtl_psm_request.h"
  34 
  35 #include "psm.h"
  36 
  37 #include <sys/types.h>
  38 #include <sys/stat.h>
  39 #include <unistd.h>
  40 #include <glob.h>
  41 
  42 static int param_priority;
  43 
  44 static int ompi_mtl_psm_component_open(void);
  45 static int ompi_mtl_psm_component_close(void);
  46 static int ompi_mtl_psm_component_query(mca_base_module_t **module, int *priority);
  47 static int ompi_mtl_psm_component_register(void);
  48 
  49 static mca_mtl_base_module_t* ompi_mtl_psm_component_init( bool enable_progress_threads,
  50                                                           bool enable_mpi_threads );
  51 
  52 mca_mtl_psm_component_t mca_mtl_psm_component = {
  53 
  54     {
  55         /* First, the mca_base_component_t struct containing meta
  56          * information about the component itself */
  57 
  58         .mtl_version = {
  59             MCA_MTL_BASE_VERSION_2_0_0,
  60 
  61             .mca_component_name = "psm",
  62             MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
  63                                   OMPI_RELEASE_VERSION),
  64             .mca_open_component = ompi_mtl_psm_component_open,
  65             .mca_close_component = ompi_mtl_psm_component_close,
  66             .mca_query_component = ompi_mtl_psm_component_query,
  67             .mca_register_component_params = ompi_mtl_psm_component_register,
  68         },
  69         .mtl_data = {
  70             /* The component is not checkpoint ready */
  71             MCA_BASE_METADATA_PARAM_NONE
  72         },
  73 
  74         .mtl_init = ompi_mtl_psm_component_init,
  75     }
  76 };
  77 
  78 #if PSM_VERNO >= 0x010d
  79 static mca_base_var_enum_value_t path_query_values[] = {
  80     {PSM_PATH_RES_NONE, "none"},
  81     {PSM_PATH_RES_OPP, "opp"},
  82     {0, NULL}
  83 };
  84 #endif
  85 
  86 static int
  87 ompi_mtl_psm_component_register(void)
  88 {
  89 #if PSM_VERNO >= 0x010d
  90     mca_base_var_enum_t *new_enum;
  91 #endif
  92 
  93 
  94     /* set priority high enough to beat ob1's default */
  95     param_priority = 30;
  96     (void) mca_base_component_var_register (&mca_mtl_psm_component.super.mtl_version,
  97                                             "priority", "Priority of the PSM MTL component",
  98                                             MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
  99                                             OPAL_INFO_LVL_9,
 100                                             MCA_BASE_VAR_SCOPE_READONLY,
 101                                             &param_priority);
 102 
 103     ompi_mtl_psm.connect_timeout = 180;
 104     (void) mca_base_component_var_register(&mca_mtl_psm_component.super.mtl_version,
 105                                            "connect_timeout",
 106                                            "PSM connection timeout value in seconds",
 107                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 108                                            OPAL_INFO_LVL_9,
 109                                            MCA_BASE_VAR_SCOPE_READONLY,
 110                                            &ompi_mtl_psm.connect_timeout);
 111 
 112     ompi_mtl_psm.debug_level = 1;
 113     (void) mca_base_component_var_register(&mca_mtl_psm_component.super.mtl_version,
 114                                            "debug", "PSM debug level",
 115                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 116                                            OPAL_INFO_LVL_9,
 117                                            MCA_BASE_VAR_SCOPE_READONLY,
 118                                            &ompi_mtl_psm.debug_level);
 119 
 120     ompi_mtl_psm.ib_unit = -1;
 121     (void) mca_base_component_var_register(&mca_mtl_psm_component.super.mtl_version,
 122                                            "ib_unit", "Truescale unit to use",
 123                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 124                                            OPAL_INFO_LVL_9,
 125                                            MCA_BASE_VAR_SCOPE_READONLY,
 126                                            &ompi_mtl_psm.ib_unit);
 127 
 128     ompi_mtl_psm.ib_port = 0;
 129     (void) mca_base_component_var_register(&mca_mtl_psm_component.super.mtl_version,
 130                                            "ib_port", "Truescale port on unit to use",
 131                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 132                                            OPAL_INFO_LVL_9,
 133                                            MCA_BASE_VAR_SCOPE_READONLY,
 134                                            &ompi_mtl_psm.ib_port);
 135 
 136     ompi_mtl_psm.ib_service_level = 0;
 137     (void) mca_base_component_var_register(&mca_mtl_psm_component.super.mtl_version,
 138                                            "ib_service_level", "Infiniband service level"
 139                                            "(0 <= SL <= 15)", MCA_BASE_VAR_TYPE_INT,
 140                                            NULL, 0, 0, OPAL_INFO_LVL_9,
 141                                            MCA_BASE_VAR_SCOPE_READONLY,
 142                                            &ompi_mtl_psm.ib_service_level);
 143 
 144     ompi_mtl_psm.ib_pkey = 0x7fffUL;
 145     (void) mca_base_component_var_register(&mca_mtl_psm_component.super.mtl_version,
 146                                            "ib_pkey", "Infiniband partition key",
 147                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 148                                            OPAL_INFO_LVL_9,
 149                                            MCA_BASE_VAR_SCOPE_READONLY,
 150                                            &ompi_mtl_psm.ib_pkey);
 151 
 152 #if PSM_VERNO >= 0x010d
 153     ompi_mtl_psm.ib_service_id = 0x1000117500000000ull;
 154     (void) mca_base_component_var_register(&mca_mtl_psm_component.super.mtl_version,
 155                                            "ib_service_id",
 156                                            "Infiniband service ID to use for application (default is 0)",
 157                                            MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, NULL, 0, 0,
 158                                            OPAL_INFO_LVL_9,
 159                                            MCA_BASE_VAR_SCOPE_READONLY,
 160                                            &ompi_mtl_psm.ib_service_id);
 161 
 162     ompi_mtl_psm.path_res_type = PSM_PATH_RES_NONE;
 163     mca_base_var_enum_create("mtl_psm_path_query", path_query_values, &new_enum);
 164     (void) mca_base_component_var_register(&mca_mtl_psm_component.super.mtl_version,
 165                                           "path_query",
 166                                           "Path record query mechanisms",
 167                                           MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
 168                                           OPAL_INFO_LVL_9,
 169                                           MCA_BASE_VAR_SCOPE_READONLY,
 170                                           &ompi_mtl_psm.path_res_type);
 171     OBJ_RELEASE(new_enum);
 172 #endif
 173 
 174     return OMPI_SUCCESS;
 175 }
 176 
 177 static int
 178 ompi_mtl_psm_component_open(void)
 179 {
 180   struct stat st;
 181 
 182     if (ompi_mtl_psm.ib_service_level < 0)  {
 183       ompi_mtl_psm.ib_service_level = 0;
 184     } else if (ompi_mtl_psm.ib_service_level > 15) {
 185       ompi_mtl_psm.ib_service_level = 15;
 186     }
 187 
 188   /* Component available only if Truescale hardware is present */
 189   if (0 != stat("/dev/ipath", &st)) {
 190     return OPAL_ERR_NOT_AVAILABLE;
 191   }
 192 
 193   /* Component available only if at least one qib port is ACTIVE */
 194   bool foundOnlineQibPort = false;
 195   size_t i;
 196   char portState[128];
 197   FILE *devFile;
 198   glob_t globbuf;
 199   globbuf.gl_offs = 0;
 200   if (glob("/sys/class/infiniband/qib*/ports/*/state",
 201         GLOB_DOOFFS, NULL, &globbuf) != 0) {
 202     return OPAL_ERR_NOT_AVAILABLE;
 203   }
 204 
 205   for (i=0;i < globbuf.gl_pathc; i++) {
 206     devFile = fopen(globbuf.gl_pathv[i], "r");
 207     fgets(portState, sizeof(portState), devFile);
 208     fclose(devFile);
 209 
 210     if (strstr(portState, "ACTIVE") != NULL) {
 211       /* Found at least one ACTIVE port */
 212       foundOnlineQibPort = true;
 213       break;
 214     }
 215   }
 216 
 217   globfree(&globbuf);
 218 
 219   if (!foundOnlineQibPort) {
 220     return OPAL_ERR_NOT_AVAILABLE;
 221   }
 222 
 223   return OMPI_SUCCESS;
 224 }
 225 
 226 static int
 227 ompi_mtl_psm_component_query(mca_base_module_t **module, int *priority)
 228 {
 229     /*
 230      * if we get here it means that PSM is available so give high priority
 231      */
 232 
 233     *priority = param_priority;
 234     *module = (mca_base_module_t *)&ompi_mtl_psm.super;
 235     return OMPI_SUCCESS;
 236 }
 237 
 238 
 239 static int
 240 ompi_mtl_psm_component_close(void)
 241 {
 242     return OMPI_SUCCESS;
 243 }
 244 
 245 static int
 246 get_num_total_procs(int *out_ntp)
 247 {
 248     *out_ntp = (int)ompi_process_info.num_procs;
 249     return OMPI_SUCCESS;
 250 }
 251 
 252 static int
 253 get_num_local_procs(int *out_nlp)
 254 {
 255     /* num_local_peers does not include us in
 256      * its calculation, so adjust for that */
 257     *out_nlp = (int)(1 + ompi_process_info.num_local_peers);
 258     return OMPI_SUCCESS;
 259 }
 260 
 261 static int
 262 get_local_rank(int *out_rank)
 263 {
 264     ompi_node_rank_t my_node_rank;
 265 
 266     *out_rank = 0;
 267 
 268     if (OMPI_NODE_RANK_INVALID == (my_node_rank =
 269         ompi_process_info.my_node_rank)) {
 270         return OMPI_ERROR;
 271     }
 272     *out_rank = (int)my_node_rank;
 273     return OMPI_SUCCESS;
 274 }
 275 
 276 static mca_mtl_base_module_t *
 277 ompi_mtl_psm_component_init(bool enable_progress_threads,
 278                             bool enable_mpi_threads)
 279 {
 280     psm_error_t err;
 281     int verno_major = PSM_VERNO_MAJOR;
 282     int verno_minor = PSM_VERNO_MINOR;
 283     int local_rank = -1, num_local_procs = 0;
 284     int num_total_procs = 0;
 285 
 286     /* Compute the total number of processes on this host and our local rank
 287      * on that node. We need to provide PSM with these values so it can
 288      * allocate hardware contexts appropriately across processes.
 289      */
 290     if (OMPI_SUCCESS != get_num_local_procs(&num_local_procs)) {
 291         opal_output(0, "Cannot determine number of local processes. "
 292                     "Cannot continue.\n");
 293         return NULL;
 294     }
 295     if (OMPI_SUCCESS != get_local_rank(&local_rank)) {
 296         opal_output(0, "Cannot determine local rank. Cannot continue.\n");
 297         return NULL;
 298     }
 299     if (OMPI_SUCCESS != get_num_total_procs(&num_total_procs)) {
 300         opal_output(0, "Cannot determine total number of processes. "
 301                     "Cannot continue.\n");
 302         return NULL;
 303     }
 304 
 305 
 306 #if PSM_VERNO >= 0x010c
 307     /* Set infinipath debug level */
 308     err = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG,
 309                      (const void*) &ompi_mtl_psm.debug_level,
 310                      sizeof(unsigned));
 311     if (err) {
 312       /* Non fatal error. Can continue */
 313       opal_show_help("help-mtl-psm.txt",
 314                      "psm init", false,
 315                      psm_error_get_string(err));
 316     }
 317 #endif
 318 
 319     if (getenv("PSM_DEVICES") == NULL) {
 320         /* Only allow for shm and ipath devices in 2.0 and earlier releases
 321          * (unless the user overrides the setting).
 322          */
 323         if (PSM_VERNO >= 0x0104) {
 324             if (num_local_procs == num_total_procs) {
 325                 setenv("PSM_DEVICES", "self,shm", 0);
 326             } else {
 327                 setenv("PSM_DEVICES", "self,shm,ipath", 0);
 328             }
 329         }
 330         else {
 331             if (num_local_procs == num_total_procs) {
 332                 setenv("PSM_DEVICES", "shm", 0);
 333             } else {
 334                 setenv("PSM_DEVICES", "shm,ipath", 0);
 335             }
 336         }
 337     }
 338 
 339     err = psm_init(&verno_major, &verno_minor);
 340     if (err) {
 341       opal_show_help("help-mtl-psm.txt",
 342                      "psm init", true,
 343                      psm_error_get_string(err));
 344       return NULL;
 345     }
 346 
 347     /* Complete PSM initialization */
 348     ompi_mtl_psm_module_init(local_rank, num_local_procs);
 349 
 350     ompi_mtl_psm.super.mtl_request_size =
 351       sizeof(mca_mtl_psm_request_t) -
 352       sizeof(struct mca_mtl_request_t);
 353 
 354     /* don't register the err handler until we know we will be active */
 355     err = psm_error_register_handler(NULL /* no ep */,
 356                                      PSM_ERRHANDLER_NOP);
 357     if (err) {
 358         opal_output(0, "Error in psm_error_register_handler (error %s)\n",
 359                     psm_error_get_string(err));
 360         return NULL;
 361     }
 362 
 363     return &ompi_mtl_psm.super;
 364 }
 365 

/* [<][>][^][v][top][bottom][index][help] */