root/ompi/mca/mtl/psm2/mtl_psm2_component.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ompi_mtl_psm2_set_shadow_env
  2. ompi_mtl_psm2_register_shadow_env
  3. get_num_total_procs
  4. get_num_local_procs
  5. ompi_mtl_psm2_component_register
  6. ompi_mtl_psm2_component_open
  7. ompi_mtl_psm2_component_query
  8. ompi_mtl_psm2_component_close
  9. get_local_rank
  10. ompi_mtl_psm2_component_init

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2005 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
  14  * Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights
  15  *                         reserved.
  16  * Copyright (c) 2013-2017 Intel, Inc. All rights reserved
  17  * Copyright (c) 2018      Research Organization for Information Science
  18  *                         and Technology (RIST). All rights reserved.
  19  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  20  * $COPYRIGHT$
  21  *
  22  * Additional copyrights may follow
  23  *
  24  * $HEADER$
  25  */
  26 
  27 #include "ompi_config.h"
  28 
  29 #include "opal/mca/event/event.h"
  30 #include "opal/util/output.h"
  31 #include "opal/util/show_help.h"
  32 #include "opal/util/opal_environ.h"
  33 #include "opal/util/printf.h"
  34 #include "ompi/proc/proc.h"
  35 
  36 #include "mtl_psm2.h"
  37 #include "mtl_psm2_types.h"
  38 #include "mtl_psm2_request.h"
  39 
  40 #include "psm2.h"
  41 
  42 #include <sys/types.h>
  43 #include <sys/stat.h>
  44 #include <unistd.h>
  45 #include <glob.h>
  46 
  47 static int param_priority;
  48 /* MPI_THREAD_MULTIPLE_SUPPORT */
  49 opal_mutex_t mtl_psm2_mq_mutex = OPAL_MUTEX_STATIC_INIT;
  50 
  51 #if OPAL_CUDA_SUPPORT
  52 static bool cuda_envvar_set = false;
  53 #endif
  54 
  55 static int ompi_mtl_psm2_component_open(void);
  56 static int ompi_mtl_psm2_component_close(void);
  57 static int ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority);
  58 static int ompi_mtl_psm2_component_register(void);
  59 
  60 static mca_mtl_base_module_t* ompi_mtl_psm2_component_init( bool enable_progress_threads,
  61                                                           bool enable_mpi_threads );
  62 
  63 mca_mtl_psm2_component_t mca_mtl_psm2_component = {
  64 
  65     {
  66         /* First, the mca_base_component_t struct containing meta
  67          * information about the component itself */
  68 
  69         .mtl_version = {
  70             MCA_MTL_BASE_VERSION_2_0_0,
  71 
  72             .mca_component_name = "psm2",
  73             MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
  74                                   OMPI_RELEASE_VERSION),
  75             .mca_open_component = ompi_mtl_psm2_component_open,
  76             .mca_close_component = ompi_mtl_psm2_component_close,
  77             .mca_query_component = ompi_mtl_psm2_component_query,
  78             .mca_register_component_params = ompi_mtl_psm2_component_register,
  79         },
  80         .mtl_data = {
  81             /* The component is not checkpoint ready */
  82             MCA_BASE_METADATA_PARAM_NONE
  83         },
  84 
  85         .mtl_init = ompi_mtl_psm2_component_init,
  86     }
  87 };
  88 
  89 struct ompi_mtl_psm2_shadow_variable {
  90     int variable_type;
  91     void *storage;
  92     mca_base_var_storage_t default_value;
  93     const char *env_name;
  94     mca_base_var_info_lvl_t info_level;
  95     const char *mca_name;
  96     const char *description;
  97     mca_base_var_flag_t flags;
  98 };
  99 
 100 struct ompi_mtl_psm2_shadow_variable ompi_mtl_psm2_shadow_variables[] = {
 101     {MCA_BASE_VAR_TYPE_STRING, &ompi_mtl_psm2.psm2_devices, {.stringval = "self,shm,hfi"}, "PSM2_DEVICES", OPAL_INFO_LVL_3,
 102      "devices",
 103      "Comma-delimited list of PSM2 devices. Valid values: self, shm, hfi (default: self,shm,hfi. Reduced to self,shm in single node jobs)",0},
 104     {MCA_BASE_VAR_TYPE_STRING, &ompi_mtl_psm2.psm2_memory, {.stringval = "normal"}, "PSM2_MEMORY", OPAL_INFO_LVL_9,
 105      "memory_model", "PSM2 memory usage mode. Valid values: min, normal, large (default: normal)", 0},
 106     {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_sendreqs_max, {.ulval = 0}, "PSM2_MQ_SENDREQS_MAX", OPAL_INFO_LVL_3,
 107      "mq_sendreqs_max", "PSM2 maximum number of isend requests in flight (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
 108     {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_recvreqs_max, {.ulval = 0}, "PSM2_MQ_RECVREQS_MAX", OPAL_INFO_LVL_3,
 109      "mq_recvreqs_max", "PSM2 maximum number of irecv requests in flight (default:  unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
 110     {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_hfi_threshold, {.ulval = 0}, "PSM2_MQ_RNDV_HFI_THRESH", OPAL_INFO_LVL_3,
 111      "hfi_eager_limit", "PSM2 eager to rendezvous threshold (default: unset, let libpsm2 use its defaults)", MCA_BASE_VAR_FLAG_DEF_UNSET},
 112     {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_shm_threshold, {.ulval = 0}, "PSM2_MQ_RNDV_SHM_THRESH", OPAL_INFO_LVL_3,
 113      "shm_eager_limit", "PSM2 shared memory eager to rendezvous threshold (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
 114     {MCA_BASE_VAR_TYPE_BOOL, &ompi_mtl_psm2.psm2_recvthread, {.boolval = true}, "PSM2_RCVTHREAD", OPAL_INFO_LVL_3,
 115      "use_receive_thread", "Use PSM2 progress thread (default: true)"},
 116     {MCA_BASE_VAR_TYPE_BOOL, &ompi_mtl_psm2.psm2_shared_contexts, {.boolval = true}, "PSM2_SHAREDCONTEXTS", OPAL_INFO_LVL_6,
 117      "use_shared_contexts", "Share PSM contexts between MPI processes (default: true)"},
 118     {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_max_contexts_per_job, {.ulval = 0}, "PSM2_MAX_CONTEXTS_PER_JOB", OPAL_INFO_LVL_9,
 119      "max_contexts_per_job", "Maximum number of contexts available on a node (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
 120     {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_tracemask, {.ulval = 1}, "PSM2_TRACEMASK", OPAL_INFO_LVL_9,
 121      "trace_mask", "PSM2 tracemask value. See PSM2 documentation for accepted values in 0x (default: 1)"},
 122     {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_opa_sl, {.ulval = 0}, "HFI_SL", OPAL_INFO_LVL_9,
 123      "opa_service_level", "HFI Service Level (default: unset, let libpsm2 use its defaults)", MCA_BASE_VAR_FLAG_DEF_UNSET},
 124     {-1},
 125 };
 126 
 127 static void ompi_mtl_psm2_set_shadow_env (struct ompi_mtl_psm2_shadow_variable *variable)
 128 {
 129     mca_base_var_storage_t *storage = variable->storage;
 130     char *env_value;
 131     int ret = 0;
 132     int var_index = 0;
 133     const mca_base_var_t *mca_base_var;
 134 
 135     var_index = mca_base_var_find("ompi", "mtl", "psm2", variable->mca_name);
 136     ret = mca_base_var_get (var_index,&mca_base_var);
 137     /* Something is fundamentally broken if registered variables are
 138      * not found */
 139     if (OPAL_SUCCESS != ret) {
 140         fprintf (stderr, "ERROR setting PSM2 environment variable: %s\n", variable->env_name);
 141         return;
 142     }
 143 
 144     /** Skip setting variables for which the default behavior is "unset" */
 145     if ((mca_base_var->mbv_flags & MCA_BASE_VAR_FLAG_DEF_UNSET) &&
 146         (MCA_BASE_VAR_SOURCE_DEFAULT == mca_base_var->mbv_source)){
 147         return ;
 148     }
 149 
 150     switch (variable->variable_type) {
 151     case MCA_BASE_VAR_TYPE_BOOL:
 152         ret = opal_asprintf (&env_value, "%s=%d", variable->env_name, storage->boolval ? 1 : 0);
 153         break;
 154     case MCA_BASE_VAR_TYPE_UNSIGNED_LONG:
 155         if (0 == strcmp (variable->env_name, "PSM2_TRACEMASK")) {
 156             /* PSM2 documentation shows the tracemask as a hexidecimal number. to be consitent
 157              * use hexidecimal here. */
 158             ret = opal_asprintf (&env_value, "%s=0x%lx", variable->env_name, storage->ulval);
 159         } else {
 160             ret = opal_asprintf (&env_value, "%s=%lu", variable->env_name, storage->ulval);
 161         }
 162         break;
 163     case MCA_BASE_VAR_TYPE_STRING:
 164         ret = opal_asprintf (&env_value, "%s=%s", variable->env_name, storage->stringval);
 165         break;
 166     }
 167 
 168     if (0 > ret) {
 169         fprintf (stderr, "ERROR setting PSM2 environment variable: %s\n", variable->env_name);
 170     } else {
 171         putenv (env_value);
 172     }
 173 }
 174 
 175 static void ompi_mtl_psm2_register_shadow_env (struct ompi_mtl_psm2_shadow_variable *variable)
 176 {
 177     mca_base_var_storage_t *storage = variable->storage;
 178     char *env_value;
 179 
 180     env_value = getenv (variable->env_name);
 181     switch (variable->variable_type) {
 182     case MCA_BASE_VAR_TYPE_BOOL:
 183         if (env_value) {
 184             int tmp;
 185             (void) mca_base_var_enum_bool.value_from_string (&mca_base_var_enum_bool, env_value, &tmp);
 186             storage->boolval = !!tmp;
 187         } else {
 188             storage->boolval = variable->default_value.boolval;
 189         }
 190         break;
 191     case MCA_BASE_VAR_TYPE_UNSIGNED_LONG:
 192         if (env_value) {
 193             storage->ulval = strtol (env_value, NULL, 0);
 194         } else {
 195             storage->ulval = variable->default_value.ulval;
 196         }
 197         break;
 198     case MCA_BASE_VAR_TYPE_STRING:
 199         if (env_value) {
 200             storage->stringval = env_value;
 201         } else {
 202             storage->stringval = variable->default_value.stringval;
 203         }
 204         break;
 205     }
 206 
 207     (void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version, variable->mca_name, variable->description,
 208                                             variable->variable_type, NULL, 0, variable->flags, variable->info_level, MCA_BASE_VAR_SCOPE_READONLY,
 209                                             variable->storage);
 210 }
 211 
 212 static int
 213 get_num_total_procs(int *out_ntp)
 214 {
 215   *out_ntp = (int)ompi_process_info.num_procs;
 216   return OMPI_SUCCESS;
 217 }
 218 
 219 static int
 220 get_num_local_procs(int *out_nlp)
 221 {
 222     /* num_local_peers does not include us in
 223      * its calculation, so adjust for that */
 224     *out_nlp = (int)(1 + ompi_process_info.num_local_peers);
 225     return OMPI_SUCCESS;
 226 }
 227 
 228 static int
 229 ompi_mtl_psm2_component_register(void)
 230 {
 231     int num_local_procs, num_total_procs;
 232 
 233     ompi_mtl_psm2.connect_timeout = 180;
 234     (void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version,
 235                                            "connect_timeout",
 236                                            "PSM2 connection timeout value in seconds",
 237                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 238                                            OPAL_INFO_LVL_9,
 239                                            MCA_BASE_VAR_SCOPE_READONLY,
 240                                            &ompi_mtl_psm2.connect_timeout);
 241 
 242 
 243     (void) get_num_local_procs(&num_local_procs);
 244     (void) get_num_total_procs(&num_total_procs);
 245 
 246     /* set priority high enough to beat ob1's default (also set higher than psm) */
 247     if ((num_local_procs == num_total_procs) && (1 < num_total_procs)) {
 248         /* Disable hfi if all processes are local. However, if running only one
 249          * process assume it is ompi_info or this is most likely going to spawn, for
 250          * which all PSM2 devices are needed */
 251         setenv("PSM2_DEVICES", "self,shm", 0);
 252     }
 253 
 254     param_priority = 40;
 255     (void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
 256                                             "priority", "Priority of the PSM2 MTL component",
 257                                             MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
 258                                             OPAL_INFO_LVL_9,
 259                                             MCA_BASE_VAR_SCOPE_READONLY,
 260                                             &param_priority);
 261 
 262     for (int i = 0 ; ompi_mtl_psm2_shadow_variables[i].variable_type >= 0 ; ++i) {
 263         ompi_mtl_psm2_register_shadow_env (ompi_mtl_psm2_shadow_variables + i);
 264     }
 265 
 266     ompi_mtl_psm2_register_pvars();
 267 
 268     return OMPI_SUCCESS;
 269 }
 270 
 271 static int
 272 ompi_mtl_psm2_component_open(void)
 273 {
 274   int res;
 275   glob_t globbuf = {0};
 276 
 277   /* Component available only if Omni-Path hardware is present */
 278   res = glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf);
 279   if (globbuf.gl_pathc > 0 || GLOB_NOMATCH==res) {
 280       globfree(&globbuf);
 281   }
 282   if (0 != res) {
 283       res = glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf);
 284       if (globbuf.gl_pathc > 0) {
 285           globfree(&globbuf);
 286       }
 287       if (0 != res) {
 288           return OPAL_ERR_NOT_AVAILABLE;
 289       }
 290   }
 291 
 292   /* Component available only if at least one hfi1 port is ACTIVE */
 293   bool foundOnlineHfi1Port = false;
 294   size_t i;
 295   char portState[128];
 296   FILE *devFile;
 297   if (glob("/sys/class/infiniband/hfi1_*/ports/*/state",
 298         GLOB_DOOFFS, NULL, &globbuf) != 0) {
 299     return OPAL_ERR_NOT_AVAILABLE;
 300   }
 301 
 302   for (i=0;i < globbuf.gl_pathc; i++) {
 303     devFile = fopen(globbuf.gl_pathv[i], "r");
 304     fgets(portState, sizeof(portState), devFile);
 305     fclose(devFile);
 306 
 307     if (strstr(portState, "ACTIVE") != NULL) {
 308       /* Found at least one ACTIVE port */
 309       foundOnlineHfi1Port = true;
 310       break;
 311     }
 312   }
 313 
 314   globfree(&globbuf);
 315 
 316   if (!foundOnlineHfi1Port) {
 317     return OPAL_ERR_NOT_AVAILABLE;
 318   }
 319 
 320   return OMPI_SUCCESS;
 321 }
 322 
 323 static int
 324 ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority)
 325 {
 326     /*
 327      * if we get here it means that PSM2 is available so give high priority
 328      */
 329 
 330     *priority = param_priority;
 331     *module = (mca_base_module_t *)&ompi_mtl_psm2.super;
 332     return OMPI_SUCCESS;
 333 }
 334 
 335 static int
 336 ompi_mtl_psm2_component_close(void)
 337 {
 338 #if OPAL_CUDA_SUPPORT
 339     if (cuda_envvar_set) {
 340         opal_unsetenv("PSM2_CUDA", &environ);
 341     }
 342 #endif
 343     return OMPI_SUCCESS;
 344 }
 345 
 346 static int
 347 get_local_rank(int *out_rank)
 348 {
 349     ompi_node_rank_t my_node_rank;
 350 
 351     *out_rank = 0;
 352 
 353     if (OMPI_NODE_RANK_INVALID == (my_node_rank =
 354         ompi_process_info.my_node_rank)) {
 355         return OMPI_ERROR;
 356     }
 357     *out_rank = (int)my_node_rank;
 358     return OMPI_SUCCESS;
 359 }
 360 
 361 static mca_mtl_base_module_t *
 362 ompi_mtl_psm2_component_init(bool enable_progress_threads,
 363                             bool enable_mpi_threads)
 364 {
 365     psm2_error_t        err;
 366     int verno_major = PSM2_VERNO_MAJOR;
 367     int verno_minor = PSM2_VERNO_MINOR;
 368     int local_rank = -1, num_local_procs = 0;
 369 #if OPAL_CUDA_SUPPORT
 370     int ret;
 371     char *cuda_env;
 372     glob_t globbuf = {0};
 373 #endif
 374 
 375     /* Compute the total number of processes on this host and our local rank
 376      * on that node. We need to provide PSM2 with these values so it can
 377      * allocate hardware contexts appropriately across processes.
 378      */
 379     if (OMPI_SUCCESS != get_num_local_procs(&num_local_procs)) {
 380         opal_output(0, "Cannot determine number of local processes. "
 381                     "Cannot continue.\n");
 382         return NULL;
 383     }
 384     if (OMPI_SUCCESS != get_local_rank(&local_rank)) {
 385         opal_output(0, "Cannot determine local rank. Cannot continue.\n");
 386         return NULL;
 387     }
 388 
 389     err = psm2_error_register_handler(NULL /* no ep */,
 390                                      PSM2_ERRHANDLER_NOP);
 391     if (err) {
 392         opal_output(0, "Error in psm2_error_register_handler (error %s)\n",
 393                     psm2_error_get_string(err));
 394         return NULL;
 395     }
 396 
 397     for (int i = 0 ; ompi_mtl_psm2_shadow_variables[i].variable_type >= 0 ; ++i) {
 398         ompi_mtl_psm2_set_shadow_env (ompi_mtl_psm2_shadow_variables + i);
 399     }
 400 
 401 #if OPAL_CUDA_SUPPORT
 402     /*
 403      * If using CUDA enabled Open MPI, the user likely intends to
 404      * run with CUDA buffers. So, force-set the envvar here if user failed
 405      * to set it.
 406      */
 407     ret = glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &globbuf);
 408     if (globbuf.gl_pathc > 0) {
 409         globfree(&globbuf);
 410     }
 411 
 412     cuda_env = getenv("PSM2_CUDA");
 413     if (!cuda_env && (0 == ret)) {
 414         opal_show_help("help-mtl-psm2.txt",
 415                        "no psm2 cuda env", true,
 416                        ompi_process_info.nodename);
 417         opal_setenv("PSM2_CUDA", "1", false, &environ);
 418         cuda_envvar_set = true;
 419     }
 420 #endif
 421 
 422     err = psm2_init(&verno_major, &verno_minor);
 423     if (err) {
 424       opal_show_help("help-mtl-psm2.txt",
 425                      "psm2 init", true,
 426                      psm2_error_get_string(err));
 427       return NULL;
 428     }
 429 
 430     /* Complete PSM2 initialization */
 431     ompi_mtl_psm2_module_init(local_rank, num_local_procs);
 432 
 433     ompi_mtl_psm2.super.mtl_request_size =
 434       sizeof(mca_mtl_psm2_request_t) -
 435       sizeof(struct mca_mtl_request_t);
 436 
 437     return &ompi_mtl_psm2.super;
 438 }

/* [<][>][^][v][top][bottom][index][help] */