This source file includes the following definitions.
- buffer_cleanup
- get_print_name_buffer
- ompi_pmix_print_jobids
- ompi_pmix_print_vpids
- ompi_pmix_print_name
- ompi_rte_compare_name_fields
- ompi_rte_convert_string_to_process_name
- ompi_rte_convert_process_name_to_string
- ompi_pmix_convert_string_to_jobid
- ompi_pmix_snprintf_jobid
- _process_name_print_for_opal
- _jobid_print_for_opal
- _vpid_print_for_opal
- _process_name_compare
- _convert_string_to_process_name
- _convert_process_name_to_string
- _convert_string_to_jobid
- ompi_rte_init
- check_file
- ompi_rte_finalize
- ompi_rte_abort
- ompi_rte_abort_peers
- _release_fn
- _register_fn
- ompi_rte_wait_for_debugger
- ompi_rte_connect_accept_support
- pre_condition_transports_print
- _setup_job_session_dir
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 #include "ompi_config.h"
18 #include "ompi/constants.h"
19
20 #include <string.h>
21 #include <stdio.h>
22 #include <ctype.h>
23 #ifdef HAVE_SYS_TYPES_H
24 #include <sys/types.h>
25 #endif
26 #include <sys/stat.h>
27 #ifdef HAVE_UNISTD_H
28 #include <unistd.h>
29 #endif
30 #ifdef HAVE_DIRENT_H
31 #include <dirent.h>
32 #endif
33 #ifdef HAVE_PWD_H
34 #include <pwd.h>
35 #endif
36
37 #include "opal/dss/dss.h"
38 #include "opal/util/argv.h"
39 #include "opal/util/error.h"
40 #include "opal/util/opal_getcwd.h"
41 #include "opal/util/os_path.h"
42 #include "opal/util/os_dirpath.h"
43 #include "opal/util/printf.h"
44 #include "opal/util/proc.h"
45 #include "opal/util/show_help.h"
46 #include "opal/util/string_copy.h"
47 #include "opal/mca/hwloc/base/base.h"
48 #include "opal/mca/pmix/base/base.h"
49 #include "opal/threads/threads.h"
50 #include "opal/threads/tsd.h"
51 #include "opal/class/opal_list.h"
52 #include "opal/dss/dss.h"
53
54 #include "ompi/mca/rte/base/base.h"
55 #include "ompi/mca/rte/rte.h"
56 #include "ompi/debuggers/debuggers.h"
57 #include "ompi/proc/proc.h"
58 #include "ompi/runtime/params.h"
59 #include "ompi/communicator/communicator.h"
60
61
/* Flag consulted by ompi_rte_wait_for_debugger(); a value of 1 makes that
 * routine behave as if a debugger is attached.
 * NOTE(review): presumably written by an external debugger - confirm. */
volatile int MPIR_being_debugged = 0;

extern ompi_rte_component_t mca_rte_pmix_component;


/* Sentinel process names: the wildcard and invalid values that the name
 * comparison and string conversion routines in this file test against. */
opal_process_name_t pmix_name_wildcard = {UINT32_MAX-1, UINT32_MAX-1};
opal_process_name_t pmix_name_invalid = {UINT32_MAX, UINT32_MAX};
hwloc_cpuset_t ompi_proc_applied_binding = NULL;
/* Per-process information filled in by ompi_rte_init(). */
pmix_process_info_t pmix_process_info = {0};
/* Set to true by ompi_rte_init() when SLURM_CPU_BIND_TYPE is in the environment. */
bool pmix_proc_is_bound = false;

/* Never modified in this file; read by ompi_rte_wait_for_debugger(). */
static bool pmix_in_parallel_debugger = false;
/* Record which environment variables ompi_rte_init() added, so that
 * ompi_rte_finalize() only unsets the ones it created. */
static bool added_transport_keys = false;
static bool added_num_procs = false;
static bool added_app_ctx = false;
static char* pre_condition_transports_print(uint64_t *unique_key);
static int _setup_job_session_dir(char **sdir);

/* Characters/strings used in the "<jobid>.<vpid>" string form of a name. */
#define OPAL_SCHEMA_DELIMITER_CHAR '.'
#define OPAL_SCHEMA_WILDCARD_CHAR '*'
#define OPAL_SCHEMA_WILDCARD_STRING "*"
#define OPAL_SCHEMA_INVALID_CHAR '$'
#define OPAL_SCHEMA_INVALID_STRING "$"

/* Size limit passed to snprintf for each print buffer, and the number of
 * buffers in one rotating set. */
#define OPAL_PRINT_NAME_ARGS_MAX_SIZE 50
#define OPAL_PRINT_NAME_ARG_NUM_BUFS 16

/* True once print_args_tsd_key has been created. */
static bool fns_init=false;
/* TSD key under which each caller's buffer set is stored. */
static opal_tsd_key_t print_args_tsd_key;
/* Returned by the print routines when no buffer set is available. */
static char* opal_print_args_null = "NULL";
/* A rotating set of print buffers; cntr is the index of the next one to use. */
typedef struct {
    char *buffers[OPAL_PRINT_NAME_ARG_NUM_BUFS];
    int cntr;
} opal_print_args_buffers_t;
96
97 static void
98 buffer_cleanup(void *value)
99 {
100 int i;
101 opal_print_args_buffers_t *ptr;
102
103 if (NULL != value) {
104 ptr = (opal_print_args_buffers_t*)value;
105 for (i=0; i < OPAL_PRINT_NAME_ARG_NUM_BUFS; i++) {
106 free(ptr->buffers[i]);
107 }
108 free (ptr);
109 }
110 }
111
112 static opal_print_args_buffers_t*
113 get_print_name_buffer(void)
114 {
115 opal_print_args_buffers_t *ptr;
116 int ret, i;
117
118 if (!fns_init) {
119
120 if (OPAL_SUCCESS != (ret = opal_tsd_key_create(&print_args_tsd_key, buffer_cleanup))) {
121 OPAL_ERROR_LOG(ret);
122 return NULL;
123 }
124 fns_init = true;
125 }
126
127 ret = opal_tsd_getspecific(print_args_tsd_key, (void**)&ptr);
128 if (OPAL_SUCCESS != ret) return NULL;
129
130 if (NULL == ptr) {
131 ptr = (opal_print_args_buffers_t*)malloc(sizeof(opal_print_args_buffers_t));
132 for (i=0; i < OPAL_PRINT_NAME_ARG_NUM_BUFS; i++) {
133 ptr->buffers[i] = (char *) malloc((OPAL_PRINT_NAME_ARGS_MAX_SIZE+1) * sizeof(char));
134 }
135 ptr->cntr = 0;
136 ret = opal_tsd_setspecific(print_args_tsd_key, (void*)ptr);
137 }
138
139 return (opal_print_args_buffers_t*) ptr;
140 }
141
142 static char* ompi_pmix_print_jobids(const opal_jobid_t job)
143 {
144 opal_print_args_buffers_t *ptr;
145 unsigned long tmp1, tmp2;
146
147 ptr = get_print_name_buffer();
148
149 if (NULL == ptr) {
150 OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
151 return opal_print_args_null;
152 }
153
154
155 if (OPAL_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) {
156 ptr->cntr = 0;
157 }
158
159 if (OPAL_JOBID_INVALID == job) {
160 snprintf(ptr->buffers[ptr->cntr++], OPAL_PRINT_NAME_ARGS_MAX_SIZE, "[INVALID]");
161 } else if (OPAL_JOBID_WILDCARD == job) {
162 snprintf(ptr->buffers[ptr->cntr++], OPAL_PRINT_NAME_ARGS_MAX_SIZE, "[WILDCARD]");
163 } else {
164 tmp1 = OMPI_JOB_FAMILY((unsigned long)job);
165 tmp2 = OMPI_LOCAL_JOBID((unsigned long)job);
166 snprintf(ptr->buffers[ptr->cntr++],
167 OPAL_PRINT_NAME_ARGS_MAX_SIZE,
168 "[%lu,%lu]", tmp1, tmp2);
169 }
170 return ptr->buffers[ptr->cntr-1];
171 }
172
173 static char* ompi_pmix_print_vpids(const opal_vpid_t vpid)
174 {
175 opal_print_args_buffers_t *ptr;
176
177 ptr = get_print_name_buffer();
178
179 if (NULL == ptr) {
180 OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
181 return opal_print_args_null;
182 }
183
184
185 if (OPAL_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) {
186 ptr->cntr = 0;
187 }
188
189 if (OPAL_VPID_INVALID == vpid) {
190 snprintf(ptr->buffers[ptr->cntr++], OPAL_PRINT_NAME_ARGS_MAX_SIZE, "INVALID");
191 } else if (OPAL_VPID_WILDCARD == vpid) {
192 snprintf(ptr->buffers[ptr->cntr++], OPAL_PRINT_NAME_ARGS_MAX_SIZE, "WILDCARD");
193 } else {
194 snprintf(ptr->buffers[ptr->cntr++],
195 OPAL_PRINT_NAME_ARGS_MAX_SIZE,
196 "%ld", (long)vpid);
197 }
198 return ptr->buffers[ptr->cntr-1];
199 }
200
201 char* ompi_pmix_print_name(const ompi_process_name_t *name)
202 {
203 opal_print_args_buffers_t *ptr;
204 char *job, *vpid;
205
206
207 if (NULL == name) {
208
209 ptr = get_print_name_buffer();
210 if (NULL == ptr) {
211 OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
212 return opal_print_args_null;
213 }
214
215 if (OPAL_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) {
216 ptr->cntr = 0;
217 }
218 snprintf(ptr->buffers[ptr->cntr++], OPAL_PRINT_NAME_ARGS_MAX_SIZE, "[NO-NAME]");
219 return ptr->buffers[ptr->cntr-1];
220 }
221
222
223
224
225
226
227 job = ompi_pmix_print_jobids(name->jobid);
228 vpid = ompi_pmix_print_vpids(name->vpid);
229
230
231 ptr = get_print_name_buffer();
232
233 if (NULL == ptr) {
234 OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
235 return opal_print_args_null;
236 }
237
238
239 if (OPAL_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) {
240 ptr->cntr = 0;
241 }
242
243 snprintf(ptr->buffers[ptr->cntr++],
244 OPAL_PRINT_NAME_ARGS_MAX_SIZE,
245 "[%s,%s]", job, vpid);
246
247 return ptr->buffers[ptr->cntr-1];
248 }
249
250 int ompi_rte_compare_name_fields(ompi_rte_cmp_bitmask_t fields,
251 const opal_process_name_t* name1,
252 const opal_process_name_t* name2)
253 {
254
255 if (NULL == name1 && NULL == name2) {
256 return OPAL_EQUAL;
257 } else if (NULL == name1) {
258 return OPAL_VALUE2_GREATER;
259 } else if (NULL == name2) {
260 return OPAL_VALUE1_GREATER;
261 }
262
263
264
265
266
267
268
269
270
271
272 if (OMPI_RTE_CMP_JOBID & fields) {
273 if (OMPI_RTE_CMP_WILD & fields &&
274 (pmix_name_wildcard.jobid == name1->jobid ||
275 pmix_name_wildcard.jobid == name2->jobid)) {
276 goto check_vpid;
277 }
278 if (name1->jobid < name2->jobid) {
279 return OPAL_VALUE2_GREATER;
280 } else if (name1->jobid > name2->jobid) {
281 return OPAL_VALUE1_GREATER;
282 }
283 }
284
285
286
287
288 check_vpid:
289 if (OMPI_RTE_CMP_VPID & fields) {
290 if (OMPI_RTE_CMP_WILD & fields &&
291 (pmix_name_wildcard.vpid == name1->vpid ||
292 pmix_name_wildcard.vpid == name2->vpid)) {
293 return OPAL_EQUAL;
294 }
295 if (name1->vpid < name2->vpid) {
296 return OPAL_VALUE2_GREATER;
297 } else if (name1->vpid > name2->vpid) {
298 return OPAL_VALUE1_GREATER;
299 }
300 }
301
302
303
304
305
306
307 return OPAL_EQUAL;
308 }
309
310 int ompi_rte_convert_string_to_process_name(opal_process_name_t *name,
311 const char* name_string)
312 {
313 char *temp, *token;
314 opal_jobid_t job;
315 opal_vpid_t vpid;
316 int return_code=OPAL_SUCCESS;
317
318
319 name->jobid = pmix_name_invalid.jobid;
320 name->vpid = pmix_name_invalid.vpid;
321
322
323 if (NULL == name_string) {
324 OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
325 return OPAL_ERR_BAD_PARAM;
326 }
327
328 temp = strdup(name_string);
329 token = strchr(temp, OPAL_SCHEMA_DELIMITER_CHAR);
330
331
332 if (NULL == token) {
333 OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
334 free(temp);
335 return OPAL_ERR_BAD_PARAM;
336 }
337 *token = '\0';
338 token++;
339
340
341
342
343 if (0 == strcmp(temp, OPAL_SCHEMA_WILDCARD_STRING)) {
344 job = pmix_name_wildcard.jobid;
345 } else if (0 == strcmp(temp, OPAL_SCHEMA_INVALID_STRING)) {
346 job = pmix_name_invalid.jobid;
347 } else {
348 job = strtoul(temp, NULL, 10);
349 }
350
351
352
353
354 if (0 == strcmp(token, OPAL_SCHEMA_WILDCARD_STRING)) {
355 vpid = pmix_name_wildcard.vpid;
356 } else if (0 == strcmp(token, OPAL_SCHEMA_INVALID_STRING)) {
357 vpid = pmix_name_invalid.vpid;
358 } else {
359 vpid = strtoul(token, NULL, 10);
360 }
361
362 name->jobid = job;
363 name->vpid = vpid;
364
365 free(temp);
366
367 return return_code;
368 }
369
370 int ompi_rte_convert_process_name_to_string(char** name_string,
371 const opal_process_name_t *name)
372 {
373 char *tmp, *tmp2;
374
375 if (NULL == name) {
376 OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
377 return OPAL_ERR_BAD_PARAM;
378 }
379
380
381
382
383
384 if (pmix_name_wildcard.jobid == name->jobid) {
385 opal_asprintf(&tmp, "%s", OPAL_SCHEMA_WILDCARD_STRING);
386 } else if (pmix_name_invalid.jobid == name->jobid) {
387 opal_asprintf(&tmp, "%s", OPAL_SCHEMA_INVALID_STRING);
388 } else {
389 opal_asprintf(&tmp, "%lu", (unsigned long)name->jobid);
390 }
391
392 if (pmix_name_wildcard.vpid == name->vpid) {
393 opal_asprintf(&tmp2, "%s%c%s", tmp, OPAL_SCHEMA_DELIMITER_CHAR, OPAL_SCHEMA_WILDCARD_STRING);
394 } else if (pmix_name_invalid.vpid == name->vpid) {
395 opal_asprintf(&tmp2, "%s%c%s", tmp, OPAL_SCHEMA_DELIMITER_CHAR, OPAL_SCHEMA_INVALID_STRING);
396 } else {
397 opal_asprintf(&tmp2, "%s%c%lu", tmp, OPAL_SCHEMA_DELIMITER_CHAR, (unsigned long)name->vpid);
398 }
399
400 opal_asprintf(name_string, "%s", tmp2);
401
402 free(tmp);
403 free(tmp2);
404
405 return OPAL_SUCCESS;
406 }
407
408 static int ompi_pmix_convert_string_to_jobid(opal_jobid_t *jobid, const char* jobidstring)
409 {
410 if (NULL == jobidstring) {
411 OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
412 *jobid = OPAL_JOBID_INVALID;
413 return OPAL_ERR_BAD_PARAM;
414 }
415
416
417 if (0 == strcmp(OPAL_SCHEMA_WILDCARD_STRING, jobidstring)) {
418 *jobid = OPAL_JOBID_WILDCARD;
419 return OPAL_SUCCESS;
420 }
421
422
423 if (0 == strcmp(OPAL_SCHEMA_INVALID_STRING, jobidstring)) {
424 *jobid = OPAL_JOBID_INVALID;
425 return OPAL_SUCCESS;
426 }
427
428 *jobid = strtoul(jobidstring, NULL, 10);
429
430 return OPAL_SUCCESS;
431 }
432
433 static int ompi_pmix_snprintf_jobid(char *jobid_string, size_t size, const opal_jobid_t jobid)
434 {
435 int rc;
436
437
438 if (OPAL_JOBID_WILDCARD == jobid) {
439 (void)opal_string_copy(jobid_string,
440 OPAL_SCHEMA_WILDCARD_STRING, size);
441 } else {
442 rc = snprintf(jobid_string, size, "%ld", (long) jobid);
443 if (0 > rc) {
444 return OPAL_ERROR;
445 }
446 }
447
448 return OPAL_SUCCESS;
449 }
450
451
452
453
454
455
456
457 static char*
458 _process_name_print_for_opal(const opal_process_name_t procname)
459 {
460 ompi_process_name_t* rte_name = (ompi_process_name_t*)&procname;
461 return ompi_pmix_print_name(rte_name);
462 }
463
464 static char*
465 _jobid_print_for_opal(const opal_jobid_t jobid)
466 {
467 return ompi_pmix_print_jobids(jobid);
468 }
469
470 static char*
471 _vpid_print_for_opal(const opal_vpid_t vpid)
472 {
473 return ompi_pmix_print_vpids(vpid);
474 }
475
476 static int
477 _process_name_compare(const opal_process_name_t p1, const opal_process_name_t p2)
478 {
479 return ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, &p1, &p2);
480 }
481
482 static int _convert_string_to_process_name(opal_process_name_t *name,
483 const char* name_string)
484 {
485 return ompi_rte_convert_string_to_process_name(name, name_string);
486 }
487
488 static int _convert_process_name_to_string(char** name_string,
489 const opal_process_name_t *name)
490 {
491 return ompi_rte_convert_process_name_to_string(name_string, name);
492 }
493
494 static int
495 _convert_string_to_jobid(opal_jobid_t *jobid, const char *jobid_string)
496 {
497 return ompi_pmix_convert_string_to_jobid(jobid, jobid_string);
498 }
499
500 int ompi_rte_init(int *pargc, char ***pargv)
501 {
502 int ret;
503 char *error = NULL;
504 opal_process_name_t pname;
505 opal_proc_t *myproc;
506 int u32, *u32ptr;
507 uint16_t u16, *u16ptr;
508 char **peers=NULL;
509 char *envar, *ev1, *ev2;
510 opal_value_t *kv;
511 char *val;
512 size_t i;
513 uint64_t unique_key[2];
514 char *string_key;
515
516 u32ptr = &u32;
517 u16ptr = &u16;
518 memset(&pmix_process_info, 0, sizeof(pmix_process_info));
519
520
521 opal_process_name_print = _process_name_print_for_opal;
522 opal_vpid_print = _vpid_print_for_opal;
523 opal_jobid_print = _jobid_print_for_opal;
524 opal_compare_proc = _process_name_compare;
525 opal_convert_string_to_process_name = _convert_string_to_process_name;
526 opal_convert_process_name_to_string = _convert_process_name_to_string;
527 opal_snprintf_jobid = ompi_pmix_snprintf_jobid;
528 opal_convert_string_to_jobid = _convert_string_to_jobid;
529
530
531 if (OPAL_SUCCESS != (ret = opal_init(pargc, pargv))) {
532 error = "opal_init";
533 goto error;
534 }
535
536
537 if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
538 OPAL_ERROR_LOG(ret);
539
540 error = "pmix init";
541 goto error;
542 }
543 if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) {
544
545 error = "pmix init";
546 goto error;
547 }
548
549 opal_pmix_base_set_evbase(opal_sync_event_base);
550
551
552 if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
553
554
555
556 opal_show_help("help-ompi-rte-pmix.txt", "no-pmi", true);
557 return OPAL_ERR_SILENT;
558 }
559
560
561 pmix_process_info.my_name.jobid = OPAL_PROC_MY_NAME.jobid;
562 pmix_process_info.my_name.vpid = OPAL_PROC_MY_NAME.vpid;
563
564 myproc = opal_proc_local_get();
565 pmix_process_info.nodename = opal_get_proc_hostname(myproc);
566
567
568 OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK,
569 &pmix_process_info.my_name, &u16ptr, OPAL_UINT16);
570 if (OPAL_SUCCESS != ret) {
571 error = "getting local rank";
572 goto error;
573 }
574 pmix_process_info.my_local_rank = u16;
575
576
577 OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK,
578 &pmix_process_info.my_name, &u16ptr, OPAL_UINT16);
579 if (OPAL_SUCCESS != ret) {
580 error = "getting node rank";
581 goto error;
582 }
583 pmix_process_info.my_node_rank = u16;
584
585
586 pname.jobid = pmix_process_info.my_name.jobid;
587 pname.vpid = OPAL_VPID_WILDCARD;
588 OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_JOB_SIZE,
589 &pname, &u32ptr, OPAL_UINT32);
590 if (OPAL_SUCCESS != ret) {
591 error = "getting job size";
592 goto error;
593 }
594 pmix_process_info.num_procs = u32;
595
596
597
598
599 if (NULL == getenv(OPAL_MCA_PREFIX"opal_ess_num_procs")) {
600 opal_asprintf(&ev1, OPAL_MCA_PREFIX"opal_ess_num_procs=%d", pmix_process_info.num_procs);
601 putenv(ev1);
602 added_num_procs = true;
603 }
604 if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
605 opal_asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", pmix_process_info.num_procs);
606 putenv(ev2);
607 added_app_ctx = true;
608 }
609
610
611 OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM,
612 &pmix_process_info.my_name, &u32ptr, OPAL_UINT32);
613 if (OPAL_SUCCESS == ret) {
614 pmix_process_info.app_num = u32;
615 } else {
616 pmix_process_info.app_num = 0;
617 }
618
619
620
621 OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_SIZE,
622 &pname, &u32ptr, OPAL_UINT32);
623 if (OPAL_SUCCESS == ret) {
624 pmix_process_info.num_local_peers = u32 - 1;
625 } else {
626 pmix_process_info.num_local_peers = 0;
627 }
628
629
630
631
632
633 if (NULL == getenv(OPAL_MCA_PREFIX"opal_precondition_transports")) {
634 unique_key[0] = (pmix_process_info.my_name.jobid & 0xff00) >> 16;
635 unique_key[1] = pmix_process_info.my_name.jobid & 0x00ff;
636 if (NULL == (string_key = pre_condition_transports_print(unique_key))) {
637 OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
638 return OPAL_ERR_OUT_OF_RESOURCE;
639 }
640 opal_output_verbose(2, ompi_rte_base_framework.framework_output,
641 "%s transport key %s",
642 OPAL_NAME_PRINT(pmix_process_info.my_name), string_key);
643 opal_asprintf(&envar, OPAL_MCA_PREFIX"opal_precondition_transports=%s", string_key);
644 putenv(envar);
645 added_transport_keys = true;
646
647 free(string_key);
648 }
649
650
651 OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &pname, &val, OPAL_STRING);
652 if (OPAL_SUCCESS == ret && NULL != val) {
653 pmix_process_info.job_session_dir = val;
654 val = NULL;
655 } else {
656
657 ret = _setup_job_session_dir(&pmix_process_info.job_session_dir);
658 if (OPAL_SUCCESS != ret) {
659 error = "job session directory";
660 goto error;
661 }
662 }
663
664
665 if (0 < pmix_process_info.num_local_peers) {
666
667 if (pmix_process_info.num_local_peers < pmix_process_info.my_local_rank) {
668 ret = OPAL_ERR_BAD_PARAM;
669 error = "num local peers";
670 goto error;
671 }
672
673 OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
674 &pname, &val, OPAL_STRING);
675 if (OPAL_SUCCESS == ret && NULL != val) {
676 peers = opal_argv_split(val, ',');
677 free(val);
678 } else {
679 peers = NULL;
680 }
681 } else {
682 peers = NULL;
683 }
684
685
686 if (NULL != peers) {
687
688 val = NULL;
689 OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
690 &pmix_process_info.my_name, &val, OPAL_STRING);
691 if (OPAL_SUCCESS == ret && NULL != val) {
692 pmix_process_info.cpuset = val;
693 } else {
694 pmix_process_info.cpuset = NULL;
695 }
696 pname.jobid = pmix_process_info.my_name.jobid;
697 for (i=0; NULL != peers[i]; i++) {
698 pname.vpid = strtoul(peers[i], NULL, 10);
699 if (pname.vpid == pmix_process_info.my_name.vpid) {
700
701 u16 = OPAL_PROC_ALL_LOCAL;
702 } else {
703 val = NULL;
704 OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
705 &pname, &val, OPAL_STRING);
706 if (OPAL_SUCCESS == ret && NULL != val) {
707 u16 = opal_hwloc_compute_relative_locality(pmix_process_info.cpuset, val);
708 free(val);
709 } else {
710
711 u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
712 }
713 }
714 kv = OBJ_NEW(opal_value_t);
715 kv->key = strdup(OPAL_PMIX_LOCALITY);
716 kv->type = OPAL_UINT16;
717 OPAL_OUTPUT_VERBOSE((1, ompi_rte_base_framework.framework_output,
718 "%s locality: proc %s locality %s",
719 OPAL_NAME_PRINT(pmix_process_info.my_name),
720 OPAL_NAME_PRINT(pname), opal_hwloc_base_print_locality(u16)));
721 kv->data.uint16 = u16;
722 ret = opal_pmix.store_local(&pname, kv);
723 if (OPAL_SUCCESS != ret) {
724 error = "local store of locality";
725 opal_argv_free(peers);
726 if (NULL != pmix_process_info.cpuset) {
727 free(pmix_process_info.cpuset);
728 }
729 goto error;
730 }
731 OBJ_RELEASE(kv);
732 }
733 opal_argv_free(peers);
734 }
735
736
737 if (NULL != getenv("SLURM_CPU_BIND_TYPE")) {
738 pmix_proc_is_bound = true;
739 }
740
741
742
743
744
745 opal_process_info.job_session_dir = pmix_process_info.job_session_dir;
746 opal_process_info.proc_session_dir = pmix_process_info.proc_session_dir;
747 opal_process_info.num_local_peers = (int32_t)pmix_process_info.num_local_peers;
748 opal_process_info.my_local_rank = (int32_t)pmix_process_info.my_local_rank;
749 opal_process_info.cpuset = pmix_process_info.cpuset;
750
751
752
753
754 OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, pmix_process_info.nodename, OPAL_STRING);
755 if (OPAL_SUCCESS != ret) {
756 error = "db store hostname";
757 goto error;
758 }
759
760 return OPAL_SUCCESS;
761
762 error:
763 if (OPAL_ERR_SILENT != ret ) {
764 opal_show_help("help-ompi-rte-pmix.txt",
765 "internal-failure",
766 true, error, opal_strerror(ret), ret);
767 }
768 opal_finalize();
769 return ret;
770
771 }
772
/*
 * Callback handed to opal_os_dirpath_destroy() by ompi_rte_finalize().
 *
 * Returns false for entries whose name starts with "output-" and whose
 * size is non-zero, and true for everything else. An "output-" entry
 * that cannot be inspected (path construction or stat() failure) also
 * returns false.
 * NOTE(review): the true/false meaning (presumably "may be removed" /
 * "keep") is defined by opal_os_dirpath_destroy - confirm there.
 */
static bool check_file(const char *root, const char *path)
{
    struct stat st;
    char *fullpath;
    int stat_rc;

    /* only "output-" entries need a closer look */
    if (0 != strncmp(path, "output-", strlen("output-"))) {
        return true;
    }

    /* opal_os_path() takes a NULL-terminated list of path elements; the
     * earlier code also passed &fullpath as the first element, which is
     * not a string */
    fullpath = opal_os_path(false, root, path, NULL);
    if (NULL == fullpath) {
        return false;
    }

    stat_rc = stat(fullpath, &st);
    free(fullpath);
    if (0 != stat_rc) {
        /* st is not valid, so its size cannot be trusted */
        return false;
    }

    return (0 == st.st_size);
}
794
795 int ompi_rte_finalize(void)
796 {
797
798
799
800 if (added_transport_keys) {
801 unsetenv(OPAL_MCA_PREFIX"opal_precondition_transports");
802 }
803 if (added_num_procs) {
804 unsetenv(OPAL_MCA_PREFIX"opal_ess_num_procs");
805 }
806 if (added_app_ctx) {
807 unsetenv("OMPI_APP_CTX_NUM_PROCS");
808 }
809
810
811 if (NULL != opal_pmix.finalize) {
812 opal_pmix.finalize();
813 (void) mca_base_framework_close(&opal_pmix_base_framework);
814 }
815
816
817 if (NULL != pmix_process_info.job_session_dir) {
818 opal_os_dirpath_destroy(pmix_process_info.job_session_dir,
819 false, check_file);
820 free(pmix_process_info.job_session_dir);
821 }
822
823 free (pmix_process_info.cpuset);
824 pmix_process_info.cpuset = NULL;
825
826 return OMPI_SUCCESS;
827 }
828
829 void ompi_rte_abort(int error_code, char *fmt, ...)
830 {
831 va_list arglist;
832 char* buffer = NULL;
833 struct timespec tp = {0, 100000};
834
835
836 va_start(arglist, fmt);
837 if( NULL != fmt ) {
838 opal_vasprintf( &buffer, fmt, arglist );
839 }
840 va_end(arglist);
841
842
843 opal_pmix.abort(error_code, buffer, NULL);
844 if (NULL != buffer) {
845 free(buffer);
846 }
847
848
849
850 nanosleep(&tp, NULL);
851
852
853 _exit(error_code);
854 }
855
856 void ompi_rte_abort_peers(opal_process_name_t *procs,
857 int32_t num_procs,
858 int error_code)
859 {
860 return;
861 }
862
/* Event-handler reference stored by _register_fn() and passed to
 * deregister_evhandler() in ompi_rte_wait_for_debugger(). */
static size_t handler = SIZE_MAX;
/* Cleared by _register_fn() once registration has been reported. */
static bool debugger_register_active = true;
/* Cleared by _release_fn() once the release event has been delivered. */
static bool debugger_event_active = true;
866
867 static void _release_fn(int status,
868 const opal_process_name_t *source,
869 opal_list_t *info, opal_list_t *results,
870 opal_pmix_notification_complete_fn_t cbfunc,
871 void *cbdata)
872 {
873
874 if (NULL != cbfunc) {
875 cbfunc(OPAL_SUCCESS, NULL, NULL, NULL, cbdata);
876 }
877 debugger_event_active = false;
878 }
879
880 static void _register_fn(int status,
881 size_t evhandler_ref,
882 void *cbdata)
883 {
884 opal_list_t *codes = (opal_list_t*)cbdata;
885
886 handler = evhandler_ref;
887 OPAL_LIST_RELEASE(codes);
888 debugger_register_active = false;
889 }
890
891
892
893
894
895
896 void ompi_rte_wait_for_debugger(void)
897 {
898 int debugger;
899 opal_list_t *codes, directives;
900 opal_value_t *kv;
901 char *evar;
902 int time;
903
904
905 debugger = pmix_in_parallel_debugger;
906
907 if (1 == MPIR_being_debugged) {
908 debugger = 1;
909 }
910
911 if (!debugger && NULL == getenv("PMIX_TEST_DEBUGGER_ATTACH")) {
912
913 return;
914 }
915
916
917
918
919 ompi_debugger_setup_dlls();
920
921 if (NULL != (evar = getenv("PMIX_TEST_DEBUGGER_SLEEP"))) {
922 time = strtol(evar, NULL, 10);
923 sleep(time);
924 return;
925 }
926
927
928 codes = OBJ_NEW(opal_list_t);
929 kv = OBJ_NEW(opal_value_t);
930 kv->key = strdup("errorcode");
931 kv->type = OPAL_INT;
932 kv->data.integer = OPAL_ERR_DEBUGGER_RELEASE;
933 opal_list_append(codes, &kv->super);
934
935 OBJ_CONSTRUCT(&directives, opal_list_t);
936 kv = OBJ_NEW(opal_value_t);
937 kv->key = strdup(OPAL_PMIX_EVENT_HDLR_NAME);
938 kv->type = OPAL_STRING;
939 kv->data.string = strdup("MPI-DEBUGGER-ATTACH");
940 opal_list_append(&directives, &kv->super);
941
942 opal_pmix.register_evhandler(codes, &directives, _release_fn, _register_fn, codes);
943
944 OMPI_WAIT_FOR_COMPLETION(debugger_register_active);
945 OPAL_LIST_DESTRUCT(&directives);
946
947
948 OMPI_WAIT_FOR_COMPLETION(debugger_event_active);
949
950
951 opal_pmix.deregister_evhandler(handler, NULL, NULL);
952 }
953
/*
 * Report whether connect/accept is supported for the given port.
 * The port argument is not examined; the answer is always true.
 */
bool ompi_rte_connect_accept_support(const char *port)
{
    (void)port;

    return true;
}
959
/*
 * Helper for pre_condition_transports_print(): append one 64-bit key to
 * dst as zero-padded hex, one unsigned-int-sized word at a time in
 * memory order. A word that is zero is first replaced by a fixed
 * non-zero pattern, and the (possibly modified) words are written back
 * to *key. `used` is the current string length; the new length is returned.
 */
static size_t pct_append_key_words(char *dst, size_t dst_len, size_t used,
                                   uint64_t *key)
{
    unsigned int words[sizeof(uint64_t) / sizeof(unsigned int)];
    const int hex_width = (int)(sizeof(unsigned int)) * 2;
    size_t i, j;

    /* copy instead of casting the pointer: no type-punned access */
    memcpy(words, key, sizeof(words));

    for (i = 0 ; i < sizeof(words) / sizeof(words[0]) ; ++i) {
        if (0 == words[i]) {
            /* never emit an all-zero word */
            for (j=0; j < sizeof(unsigned int); j++) {
                words[i] |= (unsigned int)(j << j);
            }
        }
        /* "%0*x" takes the field width as an argument, so no format
         * string has to be built on the heap */
        snprintf(dst + used, dst_len - used, "%0*x", hex_width, words[i]);
        used = strlen(dst);
    }

    /* callers see the substituted words, as before */
    memcpy(key, words, sizeof(words));

    return used;
}

/*
 * Render a two-element 64-bit key as "<hex of key 0>-<hex of key 1>".
 *
 * Zero words inside unique_key are overwritten in place with a non-zero
 * pattern before printing. Returns a newly allocated string the caller
 * must free, or NULL if the allocation fails.
 */
static char* pre_condition_transports_print(uint64_t *unique_key)
{
    size_t string_key_len, written_len;
    char *string_key = NULL;

    /* two hex digits per byte for both keys, the dash, and the terminator */
    string_key_len = (sizeof(uint64_t) * 2) * 2 + strlen("-") + 1;
    string_key = (char*) malloc(string_key_len);
    if (NULL == string_key) {
        return NULL;
    }

    string_key[0] = '\0';
    written_len = 0;

    /* first key */
    written_len = pct_append_key_words(string_key, string_key_len,
                                       written_len, &unique_key[0]);

    /* separator */
    snprintf(string_key + written_len, string_key_len - written_len, "-");
    written_len = strlen(string_key);

    /* second key */
    (void) pct_append_key_words(string_key, string_key_len,
                                written_len, &unique_key[1]);

    return string_key;
}
1025
1026 static int _setup_job_session_dir(char **sdir)
1027 {
1028 char *tmpdir;
1029
1030 uid_t uid = geteuid();
1031
1032 if( NULL == (tmpdir = getenv("TMPDIR")) )
1033 if( NULL == (tmpdir = getenv("TEMP")) )
1034 if( NULL == (tmpdir = getenv("TMP")) )
1035 tmpdir = "/tmp";
1036
1037 if (0 > opal_asprintf(&pmix_process_info.job_session_dir,
1038 "%s/ompi.%s.%lu/jf.0/%u", tmpdir,
1039 pmix_process_info.nodename,
1040 (unsigned long)uid,
1041 pmix_process_info.my_name.jobid)) {
1042 pmix_process_info.job_session_dir = NULL;
1043 return OPAL_ERR_OUT_OF_RESOURCE;
1044 }
1045
1046 return OPAL_SUCCESS;
1047 }