This source file includes the following definitions.
- ompi_mtl_psm2_set_shadow_env
- ompi_mtl_psm2_register_shadow_env
- get_num_total_procs
- get_num_local_procs
- ompi_mtl_psm2_component_register
- ompi_mtl_psm2_component_open
- ompi_mtl_psm2_component_query
- ompi_mtl_psm2_component_close
- get_local_rank
- ompi_mtl_psm2_component_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 #include "ompi_config.h"
28
29 #include "opal/mca/event/event.h"
30 #include "opal/util/output.h"
31 #include "opal/util/show_help.h"
32 #include "opal/util/opal_environ.h"
33 #include "opal/util/printf.h"
34 #include "ompi/proc/proc.h"
35
36 #include "mtl_psm2.h"
37 #include "mtl_psm2_types.h"
38 #include "mtl_psm2_request.h"
39
40 #include "psm2.h"
41
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <unistd.h>
45 #include <glob.h>
46
47 static int param_priority;
48
49 opal_mutex_t mtl_psm2_mq_mutex = OPAL_MUTEX_STATIC_INIT;
50
51 #if OPAL_CUDA_SUPPORT
52 static bool cuda_envvar_set = false;
53 #endif
54
55 static int ompi_mtl_psm2_component_open(void);
56 static int ompi_mtl_psm2_component_close(void);
57 static int ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority);
58 static int ompi_mtl_psm2_component_register(void);
59
60 static mca_mtl_base_module_t* ompi_mtl_psm2_component_init( bool enable_progress_threads,
61 bool enable_mpi_threads );
62
63 mca_mtl_psm2_component_t mca_mtl_psm2_component = {
64
65 {
66
67
68
69 .mtl_version = {
70 MCA_MTL_BASE_VERSION_2_0_0,
71
72 .mca_component_name = "psm2",
73 MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
74 OMPI_RELEASE_VERSION),
75 .mca_open_component = ompi_mtl_psm2_component_open,
76 .mca_close_component = ompi_mtl_psm2_component_close,
77 .mca_query_component = ompi_mtl_psm2_component_query,
78 .mca_register_component_params = ompi_mtl_psm2_component_register,
79 },
80 .mtl_data = {
81
82 MCA_BASE_METADATA_PARAM_NONE
83 },
84
85 .mtl_init = ompi_mtl_psm2_component_init,
86 }
87 };
88
89 struct ompi_mtl_psm2_shadow_variable {
90 int variable_type;
91 void *storage;
92 mca_base_var_storage_t default_value;
93 const char *env_name;
94 mca_base_var_info_lvl_t info_level;
95 const char *mca_name;
96 const char *description;
97 mca_base_var_flag_t flags;
98 };
99
100 struct ompi_mtl_psm2_shadow_variable ompi_mtl_psm2_shadow_variables[] = {
101 {MCA_BASE_VAR_TYPE_STRING, &ompi_mtl_psm2.psm2_devices, {.stringval = "self,shm,hfi"}, "PSM2_DEVICES", OPAL_INFO_LVL_3,
102 "devices",
103 "Comma-delimited list of PSM2 devices. Valid values: self, shm, hfi (default: self,shm,hfi. Reduced to self,shm in single node jobs)",0},
104 {MCA_BASE_VAR_TYPE_STRING, &ompi_mtl_psm2.psm2_memory, {.stringval = "normal"}, "PSM2_MEMORY", OPAL_INFO_LVL_9,
105 "memory_model", "PSM2 memory usage mode. Valid values: min, normal, large (default: normal)", 0},
106 {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_sendreqs_max, {.ulval = 0}, "PSM2_MQ_SENDREQS_MAX", OPAL_INFO_LVL_3,
107 "mq_sendreqs_max", "PSM2 maximum number of isend requests in flight (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
108 {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_recvreqs_max, {.ulval = 0}, "PSM2_MQ_RECVREQS_MAX", OPAL_INFO_LVL_3,
109 "mq_recvreqs_max", "PSM2 maximum number of irecv requests in flight (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
110 {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_hfi_threshold, {.ulval = 0}, "PSM2_MQ_RNDV_HFI_THRESH", OPAL_INFO_LVL_3,
111 "hfi_eager_limit", "PSM2 eager to rendezvous threshold (default: unset, let libpsm2 use its defaults)", MCA_BASE_VAR_FLAG_DEF_UNSET},
112 {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_shm_threshold, {.ulval = 0}, "PSM2_MQ_RNDV_SHM_THRESH", OPAL_INFO_LVL_3,
113 "shm_eager_limit", "PSM2 shared memory eager to rendezvous threshold (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
114 {MCA_BASE_VAR_TYPE_BOOL, &ompi_mtl_psm2.psm2_recvthread, {.boolval = true}, "PSM2_RCVTHREAD", OPAL_INFO_LVL_3,
115 "use_receive_thread", "Use PSM2 progress thread (default: true)"},
116 {MCA_BASE_VAR_TYPE_BOOL, &ompi_mtl_psm2.psm2_shared_contexts, {.boolval = true}, "PSM2_SHAREDCONTEXTS", OPAL_INFO_LVL_6,
117 "use_shared_contexts", "Share PSM contexts between MPI processes (default: true)"},
118 {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_max_contexts_per_job, {.ulval = 0}, "PSM2_MAX_CONTEXTS_PER_JOB", OPAL_INFO_LVL_9,
119 "max_contexts_per_job", "Maximum number of contexts available on a node (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
120 {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_tracemask, {.ulval = 1}, "PSM2_TRACEMASK", OPAL_INFO_LVL_9,
121 "trace_mask", "PSM2 tracemask value. See PSM2 documentation for accepted values in 0x (default: 1)"},
122 {MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_opa_sl, {.ulval = 0}, "HFI_SL", OPAL_INFO_LVL_9,
123 "opa_service_level", "HFI Service Level (default: unset, let libpsm2 use its defaults)", MCA_BASE_VAR_FLAG_DEF_UNSET},
124 {-1},
125 };
126
127 static void ompi_mtl_psm2_set_shadow_env (struct ompi_mtl_psm2_shadow_variable *variable)
128 {
129 mca_base_var_storage_t *storage = variable->storage;
130 char *env_value;
131 int ret = 0;
132 int var_index = 0;
133 const mca_base_var_t *mca_base_var;
134
135 var_index = mca_base_var_find("ompi", "mtl", "psm2", variable->mca_name);
136 ret = mca_base_var_get (var_index,&mca_base_var);
137
138
139 if (OPAL_SUCCESS != ret) {
140 fprintf (stderr, "ERROR setting PSM2 environment variable: %s\n", variable->env_name);
141 return;
142 }
143
144
145 if ((mca_base_var->mbv_flags & MCA_BASE_VAR_FLAG_DEF_UNSET) &&
146 (MCA_BASE_VAR_SOURCE_DEFAULT == mca_base_var->mbv_source)){
147 return ;
148 }
149
150 switch (variable->variable_type) {
151 case MCA_BASE_VAR_TYPE_BOOL:
152 ret = opal_asprintf (&env_value, "%s=%d", variable->env_name, storage->boolval ? 1 : 0);
153 break;
154 case MCA_BASE_VAR_TYPE_UNSIGNED_LONG:
155 if (0 == strcmp (variable->env_name, "PSM2_TRACEMASK")) {
156
157
158 ret = opal_asprintf (&env_value, "%s=0x%lx", variable->env_name, storage->ulval);
159 } else {
160 ret = opal_asprintf (&env_value, "%s=%lu", variable->env_name, storage->ulval);
161 }
162 break;
163 case MCA_BASE_VAR_TYPE_STRING:
164 ret = opal_asprintf (&env_value, "%s=%s", variable->env_name, storage->stringval);
165 break;
166 }
167
168 if (0 > ret) {
169 fprintf (stderr, "ERROR setting PSM2 environment variable: %s\n", variable->env_name);
170 } else {
171 putenv (env_value);
172 }
173 }
174
175 static void ompi_mtl_psm2_register_shadow_env (struct ompi_mtl_psm2_shadow_variable *variable)
176 {
177 mca_base_var_storage_t *storage = variable->storage;
178 char *env_value;
179
180 env_value = getenv (variable->env_name);
181 switch (variable->variable_type) {
182 case MCA_BASE_VAR_TYPE_BOOL:
183 if (env_value) {
184 int tmp;
185 (void) mca_base_var_enum_bool.value_from_string (&mca_base_var_enum_bool, env_value, &tmp);
186 storage->boolval = !!tmp;
187 } else {
188 storage->boolval = variable->default_value.boolval;
189 }
190 break;
191 case MCA_BASE_VAR_TYPE_UNSIGNED_LONG:
192 if (env_value) {
193 storage->ulval = strtol (env_value, NULL, 0);
194 } else {
195 storage->ulval = variable->default_value.ulval;
196 }
197 break;
198 case MCA_BASE_VAR_TYPE_STRING:
199 if (env_value) {
200 storage->stringval = env_value;
201 } else {
202 storage->stringval = variable->default_value.stringval;
203 }
204 break;
205 }
206
207 (void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version, variable->mca_name, variable->description,
208 variable->variable_type, NULL, 0, variable->flags, variable->info_level, MCA_BASE_VAR_SCOPE_READONLY,
209 variable->storage);
210 }
211
212 static int
213 get_num_total_procs(int *out_ntp)
214 {
215 *out_ntp = (int)ompi_process_info.num_procs;
216 return OMPI_SUCCESS;
217 }
218
219 static int
220 get_num_local_procs(int *out_nlp)
221 {
222
223
224 *out_nlp = (int)(1 + ompi_process_info.num_local_peers);
225 return OMPI_SUCCESS;
226 }
227
228 static int
229 ompi_mtl_psm2_component_register(void)
230 {
231 int num_local_procs, num_total_procs;
232
233 ompi_mtl_psm2.connect_timeout = 180;
234 (void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version,
235 "connect_timeout",
236 "PSM2 connection timeout value in seconds",
237 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
238 OPAL_INFO_LVL_9,
239 MCA_BASE_VAR_SCOPE_READONLY,
240 &ompi_mtl_psm2.connect_timeout);
241
242
243 (void) get_num_local_procs(&num_local_procs);
244 (void) get_num_total_procs(&num_total_procs);
245
246
247 if ((num_local_procs == num_total_procs) && (1 < num_total_procs)) {
248
249
250
251 setenv("PSM2_DEVICES", "self,shm", 0);
252 }
253
254 param_priority = 40;
255 (void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
256 "priority", "Priority of the PSM2 MTL component",
257 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
258 OPAL_INFO_LVL_9,
259 MCA_BASE_VAR_SCOPE_READONLY,
260 ¶m_priority);
261
262 for (int i = 0 ; ompi_mtl_psm2_shadow_variables[i].variable_type >= 0 ; ++i) {
263 ompi_mtl_psm2_register_shadow_env (ompi_mtl_psm2_shadow_variables + i);
264 }
265
266 ompi_mtl_psm2_register_pvars();
267
268 return OMPI_SUCCESS;
269 }
270
271 static int
272 ompi_mtl_psm2_component_open(void)
273 {
274 int res;
275 glob_t globbuf = {0};
276
277
278 res = glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf);
279 if (globbuf.gl_pathc > 0 || GLOB_NOMATCH==res) {
280 globfree(&globbuf);
281 }
282 if (0 != res) {
283 res = glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf);
284 if (globbuf.gl_pathc > 0) {
285 globfree(&globbuf);
286 }
287 if (0 != res) {
288 return OPAL_ERR_NOT_AVAILABLE;
289 }
290 }
291
292
293 bool foundOnlineHfi1Port = false;
294 size_t i;
295 char portState[128];
296 FILE *devFile;
297 if (glob("/sys/class/infiniband/hfi1_*/ports/*/state",
298 GLOB_DOOFFS, NULL, &globbuf) != 0) {
299 return OPAL_ERR_NOT_AVAILABLE;
300 }
301
302 for (i=0;i < globbuf.gl_pathc; i++) {
303 devFile = fopen(globbuf.gl_pathv[i], "r");
304 fgets(portState, sizeof(portState), devFile);
305 fclose(devFile);
306
307 if (strstr(portState, "ACTIVE") != NULL) {
308
309 foundOnlineHfi1Port = true;
310 break;
311 }
312 }
313
314 globfree(&globbuf);
315
316 if (!foundOnlineHfi1Port) {
317 return OPAL_ERR_NOT_AVAILABLE;
318 }
319
320 return OMPI_SUCCESS;
321 }
322
323 static int
324 ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority)
325 {
326
327
328
329
330 *priority = param_priority;
331 *module = (mca_base_module_t *)&ompi_mtl_psm2.super;
332 return OMPI_SUCCESS;
333 }
334
335 static int
336 ompi_mtl_psm2_component_close(void)
337 {
338 #if OPAL_CUDA_SUPPORT
339 if (cuda_envvar_set) {
340 opal_unsetenv("PSM2_CUDA", &environ);
341 }
342 #endif
343 return OMPI_SUCCESS;
344 }
345
346 static int
347 get_local_rank(int *out_rank)
348 {
349 ompi_node_rank_t my_node_rank;
350
351 *out_rank = 0;
352
353 if (OMPI_NODE_RANK_INVALID == (my_node_rank =
354 ompi_process_info.my_node_rank)) {
355 return OMPI_ERROR;
356 }
357 *out_rank = (int)my_node_rank;
358 return OMPI_SUCCESS;
359 }
360
361 static mca_mtl_base_module_t *
362 ompi_mtl_psm2_component_init(bool enable_progress_threads,
363 bool enable_mpi_threads)
364 {
365 psm2_error_t err;
366 int verno_major = PSM2_VERNO_MAJOR;
367 int verno_minor = PSM2_VERNO_MINOR;
368 int local_rank = -1, num_local_procs = 0;
369 #if OPAL_CUDA_SUPPORT
370 int ret;
371 char *cuda_env;
372 glob_t globbuf = {0};
373 #endif
374
375
376
377
378
379 if (OMPI_SUCCESS != get_num_local_procs(&num_local_procs)) {
380 opal_output(0, "Cannot determine number of local processes. "
381 "Cannot continue.\n");
382 return NULL;
383 }
384 if (OMPI_SUCCESS != get_local_rank(&local_rank)) {
385 opal_output(0, "Cannot determine local rank. Cannot continue.\n");
386 return NULL;
387 }
388
389 err = psm2_error_register_handler(NULL ,
390 PSM2_ERRHANDLER_NOP);
391 if (err) {
392 opal_output(0, "Error in psm2_error_register_handler (error %s)\n",
393 psm2_error_get_string(err));
394 return NULL;
395 }
396
397 for (int i = 0 ; ompi_mtl_psm2_shadow_variables[i].variable_type >= 0 ; ++i) {
398 ompi_mtl_psm2_set_shadow_env (ompi_mtl_psm2_shadow_variables + i);
399 }
400
401 #if OPAL_CUDA_SUPPORT
402
403
404
405
406
407 ret = glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &globbuf);
408 if (globbuf.gl_pathc > 0) {
409 globfree(&globbuf);
410 }
411
412 cuda_env = getenv("PSM2_CUDA");
413 if (!cuda_env && (0 == ret)) {
414 opal_show_help("help-mtl-psm2.txt",
415 "no psm2 cuda env", true,
416 ompi_process_info.nodename);
417 opal_setenv("PSM2_CUDA", "1", false, &environ);
418 cuda_envvar_set = true;
419 }
420 #endif
421
422 err = psm2_init(&verno_major, &verno_minor);
423 if (err) {
424 opal_show_help("help-mtl-psm2.txt",
425 "psm2 init", true,
426 psm2_error_get_string(err));
427 return NULL;
428 }
429
430
431 ompi_mtl_psm2_module_init(local_rank, num_local_procs);
432
433 ompi_mtl_psm2.super.mtl_request_size =
434 sizeof(mca_mtl_psm2_request_t) -
435 sizeof(struct mca_mtl_request_t);
436
437 return &ompi_mtl_psm2.super;
438 }