This source file includes the following definitions.
- orte_plm_base_set_slots
- orte_plm_base_daemons_reported
- orte_plm_base_allocation_complete
- orte_plm_base_daemons_launched
- files_ready
- orte_plm_base_vm_ready
- orte_plm_base_mapping_complete
- orte_plm_base_setup_job
- orte_plm_base_setup_job_complete
- orte_plm_base_complete_setup
- timer_cb
- orte_plm_base_launch_apps
- orte_plm_base_send_launch_msg
- orte_plm_base_post_launch
- orte_plm_base_registered
- orte_plm_base_daemon_topology
- orte_plm_base_daemon_callback
- orte_plm_base_daemon_failed
- orte_plm_base_setup_orted_cmd
- orte_plm_base_orted_append_basic_args
- orte_plm_base_setup_virtual_machine
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 #include "orte_config.h"
29 #include "orte/constants.h"
30
31 #ifdef HAVE_SYS_WAIT_H
32 #include <sys/wait.h>
33 #endif
34 #ifdef HAVE_SYS_TIME_H
35 #include <sys/time.h>
36 #endif
37 #include <ctype.h>
38
39 #include "opal/hash_string.h"
40 #include "opal/util/argv.h"
41 #include "opal/util/opal_environ.h"
42 #include "opal/util/printf.h"
43 #include "opal/class/opal_pointer_array.h"
44 #include "opal/dss/dss.h"
45 #include "opal/mca/hwloc/hwloc-internal.h"
46 #include "opal/mca/pmix/pmix.h"
47 #include "opal/mca/compress/compress.h"
48
49 #include "orte/util/dash_host/dash_host.h"
50 #include "orte/util/nidmap.h"
51 #include "orte/util/session_dir.h"
52 #include "orte/util/show_help.h"
53 #include "orte/mca/errmgr/errmgr.h"
54 #include "orte/mca/ess/ess.h"
55 #include "orte/mca/iof/base/base.h"
56 #include "orte/mca/odls/base/base.h"
57 #include "orte/mca/ras/base/base.h"
58 #include "orte/mca/rmaps/rmaps.h"
59 #include "orte/mca/rmaps/base/base.h"
60 #include "orte/mca/rml/rml.h"
61 #include "orte/mca/rml/rml_types.h"
62 #include "orte/mca/routed/routed.h"
63 #include "orte/mca/grpcomm/base/base.h"
64 #if OPAL_ENABLE_FT_CR == 1
65 #include "orte/mca/snapc/base/base.h"
66 #endif
67 #include "orte/mca/filem/filem.h"
68 #include "orte/mca/filem/base/base.h"
69 #include "orte/mca/grpcomm/base/base.h"
70 #include "orte/mca/rml/base/rml_contact.h"
71 #include "orte/mca/rtc/rtc.h"
72 #include "orte/runtime/orte_globals.h"
73 #include "orte/runtime/runtime.h"
74 #include "orte/runtime/orte_locks.h"
75 #include "orte/runtime/orte_quit.h"
76 #include "orte/util/name_fns.h"
77 #include "orte/util/pre_condition_transports.h"
78 #include "orte/util/proc_info.h"
79 #include "orte/util/threads.h"
80 #include "orte/mca/state/state.h"
81 #include "orte/mca/state/base/base.h"
82 #include "orte/util/hostfile/hostfile.h"
83 #include "orte/mca/odls/odls_types.h"
84
85 #include "orte/mca/plm/base/plm_private.h"
86 #include "orte/mca/plm/base/base.h"
87
88 void orte_plm_base_set_slots(orte_node_t *node)
89 {
90 if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
91 if (NULL != node->topology && NULL != node->topology->topo) {
92 node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
93 HWLOC_OBJ_CORE, 0,
94 OPAL_HWLOC_LOGICAL);
95 }
96 } else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
97 if (NULL != node->topology && NULL != node->topology->topo) {
98 if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
99 HWLOC_OBJ_SOCKET, 0,
100 OPAL_HWLOC_LOGICAL))) {
101
102
103 node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
104 HWLOC_OBJ_NODE, 0,
105 OPAL_HWLOC_LOGICAL);
106 }
107 }
108 } else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
109 if (NULL != node->topology && NULL != node->topology->topo) {
110 node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
111 HWLOC_OBJ_NODE, 0,
112 OPAL_HWLOC_LOGICAL);
113 }
114 } else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
115 if (NULL != node->topology && NULL != node->topology->topo) {
116 node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
117 HWLOC_OBJ_PU, 0,
118 OPAL_HWLOC_LOGICAL);
119 }
120 } else {
121
122 node->slots = strtol(orte_set_slots, NULL, 10);
123 }
124
125 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
126 }
127
128 void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
129 {
130 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
131 orte_topology_t *t;
132 orte_node_t *node;
133 int i, rc;
134 uint8_t u8;
135 opal_buffer_t buf;
136 orte_grpcomm_signature_t *sig;
137 orte_daemon_cmd_flag_t command = ORTE_DAEMON_PASS_NODE_INFO_CMD;
138
139 ORTE_ACQUIRE_OBJECT(caddy);
140
141
142
143 if (orte_do_not_launch) {
144 node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
145 t = node->topology;
146 for (i=1; i < orte_node_pool->size; i++) {
147 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
148 continue;
149 }
150 if (NULL == node->topology) {
151 node->topology = t;
152 }
153 }
154 }
155
156
157
158
159 if (!orte_managed_allocation) {
160 if (NULL != orte_set_slots &&
161 0 != strncmp(orte_set_slots, "none", strlen(orte_set_slots))) {
162 caddy->jdata->total_slots_alloc = 0;
163 for (i=0; i < orte_node_pool->size; i++) {
164 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
165 continue;
166 }
167 if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
168 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
169 "%s plm:base:setting slots for node %s by %s",
170 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, orte_set_slots));
171 orte_plm_base_set_slots(node);
172 }
173 caddy->jdata->total_slots_alloc += node->slots;
174 }
175 }
176 }
177
178 if (orte_display_allocation) {
179 orte_ras_base_display_alloc();
180 }
181
182 orte_routed.update_routing_plan();
183
184
185 OBJ_CONSTRUCT(&buf, opal_buffer_t);
186
187 if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_DAEMON_CMD))) {
188 ORTE_ERROR_LOG(rc);
189 OBJ_DESTRUCT(&buf);
190 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
191 OBJ_RELEASE(caddy);
192 return;
193 }
194
195
196
197
198
199 if (!orte_nidmap_communicated) {
200 u8 = 1;
201 if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &u8, 1, OPAL_UINT8))) {
202 ORTE_ERROR_LOG(rc);
203 OBJ_DESTRUCT(&buf);
204 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
205 OBJ_RELEASE(caddy);
206 return;
207 }
208 if (OPAL_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &buf))) {
209 ORTE_ERROR_LOG(rc);
210 OBJ_DESTRUCT(&buf);
211 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
212 OBJ_RELEASE(caddy);
213 return;
214 }
215 orte_nidmap_communicated = true;
216 } else {
217 u8 = 0;
218 if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &u8, 1, OPAL_UINT8))) {
219 ORTE_ERROR_LOG(rc);
220 OBJ_DESTRUCT(&buf);
221 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
222 OBJ_RELEASE(caddy);
223 return;
224 }
225 }
226
227
228
229
230 if (OPAL_SUCCESS != (rc = orte_util_pass_node_info(&buf))) {
231 ORTE_ERROR_LOG(rc);
232 OBJ_DESTRUCT(&buf);
233 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
234 OBJ_RELEASE(caddy);
235 return;
236 }
237
238
239 sig = OBJ_NEW(orte_grpcomm_signature_t);
240 sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
241 sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
242 sig->signature[0].vpid = ORTE_VPID_WILDCARD;
243 sig->sz = 1;
244 if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, &buf))) {
245 ORTE_ERROR_LOG(rc);
246 OBJ_RELEASE(sig);
247 OBJ_DESTRUCT(&buf);
248 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
249 OBJ_RELEASE(caddy);
250 return;
251 }
252 OBJ_DESTRUCT(&buf);
253
254 OBJ_RELEASE(sig);
255
256
257 caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
258 ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_VM_READY);
259
260
261 OBJ_RELEASE(caddy);
262 }
263
264 void orte_plm_base_allocation_complete(int fd, short args, void *cbdata)
265 {
266 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
267
268 ORTE_ACQUIRE_OBJECT(caddy);
269
270
271
272
273 if (orte_do_not_launch) {
274 caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
275 ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_MAP);
276 } else {
277
278 caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
279 ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
280 }
281
282
283 OBJ_RELEASE(caddy);
284 }
285
286 void orte_plm_base_daemons_launched(int fd, short args, void *cbdata)
287 {
288 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
289
290 ORTE_ACQUIRE_OBJECT(caddy);
291
292
293
294
295
296
297 OBJ_RELEASE(caddy);
298 }
299
300 static void files_ready(int status, void *cbdata)
301 {
302 orte_job_t *jdata = (orte_job_t*)cbdata;
303
304 if (ORTE_SUCCESS != status) {
305 ORTE_FORCED_TERMINATE(status);
306 } else {
307 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
308 }
309 }
310
311 void orte_plm_base_vm_ready(int fd, short args, void *cbdata)
312 {
313 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
314
315 ORTE_ACQUIRE_OBJECT(caddy);
316
317
318 caddy->jdata->state = ORTE_JOB_STATE_VM_READY;
319
320
321 if (ORTE_SUCCESS != orte_filem.preposition_files(caddy->jdata, files_ready, caddy->jdata)) {
322 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
323 }
324
325
326 OBJ_RELEASE(caddy);
327 }
328
329 void orte_plm_base_mapping_complete(int fd, short args, void *cbdata)
330 {
331 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
332
333 ORTE_ACQUIRE_OBJECT(caddy);
334
335
336 caddy->jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
337 ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_SYSTEM_PREP);
338
339
340 OBJ_RELEASE(caddy);
341 }
342
343
344 void orte_plm_base_setup_job(int fd, short args, void *cbdata)
345 {
346 int rc;
347 int i;
348 orte_app_context_t *app;
349 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
350 char *key;
351 orte_job_t *parent;
352 orte_process_name_t name, *nptr;
353
354 ORTE_ACQUIRE_OBJECT(caddy);
355
356 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
357 "%s plm:base:setup_job",
358 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
359
360 if (ORTE_JOB_STATE_INIT != caddy->job_state) {
361 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
362 OBJ_RELEASE(caddy);
363 return;
364 }
365
366 caddy->jdata->state = caddy->job_state;
367
368
369 if (ORTE_JOBID_INVALID == caddy->jdata->jobid) {
370 if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(caddy->jdata))) {
371 ORTE_ERROR_LOG(rc);
372 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
373 OBJ_RELEASE(caddy);
374 return;
375 }
376
377
378
379
380
381
382 opal_hash_table_set_value_uint32(orte_job_data, caddy->jdata->jobid, caddy->jdata);
383 }
384
385
386 if (!ORTE_FLAG_TEST(caddy->jdata, ORTE_JOB_FLAG_RECOVERABLE) &&
387 orte_enable_recovery) {
388 ORTE_FLAG_SET(caddy->jdata, ORTE_JOB_FLAG_RECOVERABLE);
389 }
390
391
392
393
394
395
396
397 nptr = &name;
398 if (orte_get_attribute(&caddy->jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&nptr, OPAL_NAME)) {
399
400 if (NULL == (parent = orte_get_job_data_object(name.jobid))) {
401 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
402 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
403 OBJ_RELEASE(caddy);
404 return;
405 }
406
407
408 key = NULL;
409 if (orte_get_attribute(&parent->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&key, OPAL_STRING) &&
410 NULL != key) {
411
412 orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_TRANSPORT_KEY, ORTE_ATTR_LOCAL, key, OPAL_STRING);
413
414 for (i=0; i < caddy->jdata->apps->size; i++) {
415 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
416 continue;
417 }
418 opal_setenv(OPAL_MCA_PREFIX"orte_precondition_transports", key, true, &app->env);
419 }
420 free(key);
421 } else {
422 if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata, NULL))) {
423 ORTE_ERROR_LOG(rc);
424 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
425 OBJ_RELEASE(caddy);
426 return;
427 }
428 }
429 } else {
430
431
432 if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata, NULL))) {
433 ORTE_ERROR_LOG(rc);
434 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
435 OBJ_RELEASE(caddy);
436 return;
437 }
438 }
439
440
441 for (i=0; i < caddy->jdata->apps->size; i++) {
442 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
443 continue;
444 }
445 if (!orte_get_attribute(&app->attributes, ORTE_APP_RECOV_DEF, NULL, OPAL_BOOL)) {
446 orte_set_attribute(&app->attributes, ORTE_APP_MAX_RESTARTS, ORTE_ATTR_LOCAL, &orte_max_restarts, OPAL_INT32);
447 }
448 }
449
450
451 ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_INIT_COMPLETE);
452
453
454 OBJ_RELEASE(caddy);
455 }
456
457 void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata)
458 {
459 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
460
461 ORTE_ACQUIRE_OBJECT(caddy);
462
463
464 ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
465 OBJ_RELEASE(caddy);
466 }
467
468 void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
469 {
470 orte_job_t *jdata, *jdatorted;
471 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
472 orte_node_t *node;
473 uint32_t h;
474 orte_vpid_t *vptr;
475 int i, rc;
476 char *serial_number;
477 orte_process_name_t requestor, *rptr;
478
479 ORTE_ACQUIRE_OBJECT(caddy);
480
481 opal_output_verbose(5, orte_plm_base_framework.framework_output,
482 "%s complete_setup on job %s",
483 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
484 ORTE_JOBID_PRINT(caddy->jdata->jobid));
485
486
487 if (ORTE_JOB_STATE_SYSTEM_PREP != caddy->job_state) {
488 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
489 OBJ_RELEASE(caddy);
490 return;
491 }
492
493 caddy->jdata->state = caddy->job_state;
494
495
496 if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
497 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
498 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
499 OBJ_RELEASE(caddy);
500 return;
501 }
502
503
504 jdata = caddy->jdata;
505
506
507
508
509
510
511
512
513
514 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FWDIO_TO_TOOL, NULL, OPAL_BOOL)) {
515
516 rptr = &requestor;
517 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&rptr, OPAL_NAME)) {
518 ORTE_IOF_PROXY_PULL(jdata, rptr);
519 } else {
520 ORTE_IOF_PROXY_PULL(jdata, &jdata->originator);
521 }
522
523
524 }
525
526
527
528
529
530
531
532 if (orte_coprocessors_detected) {
533
534 for (i=0; i < orte_node_pool->size; i++) {
535 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
536 continue;
537 }
538
539 serial_number = NULL;
540 if (!orte_get_attribute(&node->attributes, ORTE_NODE_SERIAL_NUMBER, (void**)&serial_number, OPAL_STRING)) {
541 continue;
542 }
543 if (NULL != serial_number) {
544
545
546
547 OPAL_HASH_STR(serial_number, h);
548 free(serial_number);
549 if (OPAL_SUCCESS != (rc = opal_hash_table_get_value_uint32(orte_coprocessors, h,
550 (void**)&vptr))) {
551 ORTE_ERROR_LOG(rc);
552 break;
553 }
554 orte_set_attribute(&node->attributes, ORTE_NODE_HOSTID, ORTE_ATTR_LOCAL, vptr, ORTE_VPID);
555 }
556 }
557 }
558
559 if (NULL != orte_coprocessors) {
560 OBJ_RELEASE(orte_coprocessors);
561 }
562
563
564 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_APPS);
565
566
567 OBJ_RELEASE(caddy);
568 }
569
570
571 static void timer_cb(int fd, short event, void *cbdata)
572 {
573 orte_job_t *jdata = (orte_job_t*)cbdata;
574 orte_timer_t *timer=NULL;
575
576 ORTE_ACQUIRE_OBJECT(jdata);
577
578
579 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
580
581
582 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT, (void**)&timer, OPAL_PTR)) {
583
584 OBJ_RELEASE(timer);
585 orte_remove_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT);
586 }
587 }
588
589 void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
590 {
591 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
592 orte_job_t *jdata;
593 orte_daemon_cmd_flag_t command;
594 int rc;
595
596 ORTE_ACQUIRE_OBJECT(caddy);
597
598
599 jdata = caddy->jdata;
600
601 if (ORTE_JOB_STATE_LAUNCH_APPS != caddy->job_state) {
602 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
603 OBJ_RELEASE(caddy);
604 return;
605 }
606
607 caddy->jdata->state = caddy->job_state;
608
609 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
610 "%s plm:base:launch_apps for job %s",
611 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
612 ORTE_JOBID_PRINT(jdata->jobid)));
613
614
615 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
616 command = ORTE_DAEMON_DVM_ADD_PROCS;
617 } else {
618 command = ORTE_DAEMON_ADD_LOCAL_PROCS;
619 }
620 if (ORTE_SUCCESS != (rc = opal_dss.pack(&jdata->launch_msg, &command, 1, ORTE_DAEMON_CMD))) {
621 ORTE_ERROR_LOG(rc);
622 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
623 OBJ_RELEASE(caddy);
624 return;
625 }
626
627
628 if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(&jdata->launch_msg, jdata->jobid))) {
629 ORTE_ERROR_LOG(rc);
630 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
631 }
632
633 OBJ_RELEASE(caddy);
634 return;
635 }
636
637 void orte_plm_base_send_launch_msg(int fd, short args, void *cbdata)
638 {
639 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
640 orte_timer_t *timer;
641 orte_grpcomm_signature_t *sig;
642 orte_job_t *jdata;
643 int rc;
644
645
646 jdata = caddy->jdata;
647
648 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
649 "%s plm:base:send launch msg for job %s",
650 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
651 ORTE_JOBID_PRINT(jdata->jobid)));
652
653
654 if (orte_do_not_launch) {
655 bool compressed;
656 uint8_t *cmpdata;
657 size_t cmplen;
658
659 compressed = opal_compress.compress_block((uint8_t*)jdata->launch_msg.base_ptr,
660 jdata->launch_msg.bytes_used,
661 &cmpdata, &cmplen);
662 if (compressed) {
663 opal_output(0, "LAUNCH MSG RAW SIZE: %d COMPRESSED SIZE: %d",
664 (int)jdata->launch_msg.bytes_used, (int)cmplen);
665 free(cmpdata);
666 } else {
667 opal_output(0, "LAUNCH MSG RAW SIZE: %d", (int)jdata->launch_msg.bytes_used);
668 }
669 orte_never_launched = true;
670 ORTE_FORCED_TERMINATE(0);
671 OBJ_RELEASE(caddy);
672 return;
673 }
674
675
676 sig = OBJ_NEW(orte_grpcomm_signature_t);
677 sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
678 sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
679 sig->signature[0].vpid = ORTE_VPID_WILDCARD;
680 sig->sz = 1;
681 if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, &jdata->launch_msg))) {
682 ORTE_ERROR_LOG(rc);
683 OBJ_RELEASE(sig);
684 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
685 OBJ_RELEASE(caddy);
686 return;
687 }
688 OBJ_DESTRUCT(&jdata->launch_msg);
689 OBJ_CONSTRUCT(&jdata->launch_msg, opal_buffer_t);
690
691 OBJ_RELEASE(sig);
692
693
694
695
696 caddy->jdata->num_daemons_reported++;
697
698
699
700
701 if (0 < orte_startup_timeout) {
702 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
703 "%s plm:base:launch defining timeout for job %s",
704 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
705 ORTE_JOBID_PRINT(jdata->jobid)));
706 timer = OBJ_NEW(orte_timer_t);
707 timer->payload = jdata;
708 opal_event_evtimer_set(orte_event_base,
709 timer->ev, timer_cb, jdata);
710 opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
711 timer->tv.tv_sec = orte_startup_timeout;
712 timer->tv.tv_usec = 0;
713 orte_set_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT, ORTE_ATTR_LOCAL, timer, OPAL_PTR);
714 ORTE_POST_OBJECT(timer);
715 opal_event_evtimer_add(timer->ev, &timer->tv);
716 }
717
718
719 OBJ_RELEASE(caddy);
720 }
721
722 void orte_plm_base_post_launch(int fd, short args, void *cbdata)
723 {
724 int32_t rc;
725 orte_job_t *jdata;
726 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
727 orte_process_name_t name;
728 orte_timer_t *timer=NULL;
729 int ret;
730 opal_buffer_t *answer;
731 int room, *rmptr;
732
733 ORTE_ACQUIRE_OBJECT(caddy);
734
735
736 jdata = caddy->jdata;
737
738
739 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT, (void**)&timer, OPAL_PTR)) {
740 opal_event_evtimer_del(timer->ev);
741 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
742 "%s plm:base:launch deleting timeout for job %s",
743 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
744 ORTE_JOBID_PRINT(jdata->jobid)));
745 OBJ_RELEASE(timer);
746 orte_remove_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT);
747 }
748
749 if (ORTE_JOB_STATE_RUNNING != caddy->job_state) {
750 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
751 OBJ_RELEASE(caddy);
752 return;
753 }
754
755 caddy->jdata->state = caddy->job_state;
756
757
758 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
759 "%s plm:base:launch wiring up iof for job %s",
760 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
761 ORTE_JOBID_PRINT(jdata->jobid)));
762
763
764 name.jobid = jdata->jobid;
765 name.vpid = jdata->stdin_target;
766
767 if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) {
768 ORTE_ERROR_LOG(rc);
769 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
770 OBJ_RELEASE(caddy);
771 return;
772 }
773
774
775 if (ORTE_JOBID_INVALID == jdata->originator.jobid) {
776 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
777 "%s plm:base:launch job %s is not a dynamic spawn",
778 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
779 ORTE_JOBID_PRINT(jdata->jobid)));
780 goto cleanup;
781 }
782
783
784 rc = ORTE_SUCCESS;
785 answer = OBJ_NEW(opal_buffer_t);
786
787 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
788 ORTE_ERROR_LOG(ret);
789 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
790 OBJ_RELEASE(caddy);
791 return;
792 }
793
794 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
795 ORTE_ERROR_LOG(ret);
796 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
797 OBJ_RELEASE(caddy);
798 return;
799 }
800
801 rmptr = &room;
802 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
803 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
804 ORTE_ERROR_LOG(ret);
805 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
806 OBJ_RELEASE(caddy);
807 return;
808 }
809 }
810 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
811 "%s plm:base:launch sending dyn release of job %s to %s",
812 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
813 ORTE_JOBID_PRINT(jdata->jobid),
814 ORTE_NAME_PRINT(&jdata->originator)));
815 if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
816 ORTE_RML_TAG_LAUNCH_RESP,
817 orte_rml_send_callback, NULL))) {
818 ORTE_ERROR_LOG(ret);
819 OBJ_RELEASE(answer);
820 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
821 OBJ_RELEASE(caddy);
822 return;
823 }
824
825 cleanup:
826
827 OBJ_RELEASE(caddy);
828 }
829
830 void orte_plm_base_registered(int fd, short args, void *cbdata)
831 {
832 orte_job_t *jdata;
833 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
834
835 ORTE_ACQUIRE_OBJECT(caddy);
836
837
838 jdata = caddy->jdata;
839
840 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
841 "%s plm:base:launch %s registered",
842 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
843 ORTE_JOBID_PRINT(jdata->jobid)));
844
845 if (ORTE_JOB_STATE_REGISTERED != caddy->job_state) {
846 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
847 "%s plm:base:launch job %s not registered - state %s",
848 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
849 ORTE_JOBID_PRINT(jdata->jobid),
850 orte_job_state_to_str(caddy->job_state)));
851 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
852 OBJ_RELEASE(caddy);
853 return;
854 }
855
856 jdata->state = caddy->job_state;
857
858
859 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
860 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
861 }
862
863 OBJ_RELEASE(caddy);
864 }
865
866
867 static bool orted_failed_launch;
868 static orte_job_t *jdatorted=NULL;
869
870
871 void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
872 opal_buffer_t *buffer,
873 orte_rml_tag_t tag, void *cbdata)
874 {
875 hwloc_topology_t topo;
876 int rc, idx;
877 char *sig, *coprocessors, **sns;
878 orte_proc_t *daemon=NULL;
879 orte_topology_t *t, *t2;
880 int i;
881 uint32_t h;
882 orte_job_t *jdata;
883 uint8_t flag;
884 size_t inlen, cmplen;
885 uint8_t *packed_data, *cmpdata;
886 opal_buffer_t datbuf, *data;
887
888 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
889 "%s plm:base:daemon_topology recvd for daemon %s",
890 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
891 ORTE_NAME_PRINT(sender)));
892
893
894 if (NULL == jdatorted) {
895 jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
896 }
897 if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, sender->vpid))) {
898 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
899 orted_failed_launch = true;
900 goto CLEANUP;
901 }
902 OBJ_CONSTRUCT(&datbuf, opal_buffer_t);
903
904 idx=1;
905 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &idx, OPAL_INT8))) {
906 ORTE_ERROR_LOG(rc);
907 orted_failed_launch = true;
908 goto CLEANUP;
909 }
910 if (flag) {
911
912 idx=1;
913 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &inlen, &idx, OPAL_SIZE))) {
914 ORTE_ERROR_LOG(rc);
915 orted_failed_launch = true;
916 goto CLEANUP;
917 }
918
919 idx=1;
920 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &cmplen, &idx, OPAL_SIZE))) {
921 ORTE_ERROR_LOG(rc);
922 orted_failed_launch = true;
923 goto CLEANUP;
924 }
925
926 packed_data = (uint8_t*)malloc(inlen);
927
928 idx = inlen;
929 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, packed_data, &idx, OPAL_UINT8))) {
930 ORTE_ERROR_LOG(rc);
931 orted_failed_launch = true;
932 goto CLEANUP;
933 }
934
935 if (opal_compress.decompress_block(&cmpdata, cmplen,
936 packed_data, inlen)) {
937
938 opal_dss.load(&datbuf, cmpdata, cmplen);
939 data = &datbuf;
940 } else {
941 data = buffer;
942 }
943 free(packed_data);
944 } else {
945 data = buffer;
946 }
947
948
949 idx=1;
950 if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &sig, &idx, OPAL_STRING))) {
951 ORTE_ERROR_LOG(rc);
952 orted_failed_launch = true;
953 goto CLEANUP;
954 }
955
956 t = NULL;
957 for (i=0; i < orte_node_topologies->size; i++) {
958 if (NULL == (t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, i))) {
959 continue;
960 }
961
962 if (0 == strcmp(sig, t2->sig)) {
963 t = t2;
964 break;
965 }
966 }
967 if (NULL == t) {
968
969 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
970 orted_failed_launch = true;
971 goto CLEANUP;
972 }
973
974
975 idx=1;
976 if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &topo, &idx, OPAL_HWLOC_TOPO))) {
977 ORTE_ERROR_LOG(rc);
978 orted_failed_launch = true;
979 goto CLEANUP;
980 }
981
982 t->topo = topo;
983
984
985 idx=1;
986 if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &coprocessors, &idx, OPAL_STRING))) {
987 ORTE_ERROR_LOG(rc);
988 orted_failed_launch = true;
989 goto CLEANUP;
990 }
991 if (NULL != coprocessors) {
992
993 if (NULL == orte_coprocessors) {
994 orte_coprocessors = OBJ_NEW(opal_hash_table_t);
995 opal_hash_table_init(orte_coprocessors, orte_process_info.num_procs);
996 }
997
998
999
1000 sns = opal_argv_split(coprocessors, ',');
1001 for (idx=0; NULL != sns[idx]; idx++) {
1002
1003 OPAL_HASH_STR(sns[idx], h);
1004
1005 opal_hash_table_set_value_uint32(orte_coprocessors, h, (void*)&daemon->name.vpid);
1006 }
1007 opal_argv_free(sns);
1008 free(coprocessors);
1009 orte_coprocessors_detected = true;
1010 }
1011
1012 idx=1;
1013 if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &coprocessors, &idx, OPAL_STRING))) {
1014 ORTE_ERROR_LOG(rc);
1015 orted_failed_launch = true;
1016 goto CLEANUP;
1017 }
1018 if (NULL != coprocessors) {
1019 if (orte_get_attribute(&daemon->node->attributes, ORTE_NODE_SERIAL_NUMBER, NULL, OPAL_STRING)) {
1020
1021
1022
1023 ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
1024 orted_failed_launch = true;
1025 free(coprocessors);
1026 goto CLEANUP;
1027 }
1028 orte_set_attribute(&daemon->node->attributes, ORTE_NODE_SERIAL_NUMBER, ORTE_ATTR_LOCAL, coprocessors, OPAL_STRING);
1029 free(coprocessors);
1030 orte_coprocessors_detected = true;
1031 }
1032
1033 CLEANUP:
1034 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1035 "%s plm:base:orted:report_topo launch %s for daemon %s",
1036 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1037 orted_failed_launch ? "failed" : "completed",
1038 ORTE_NAME_PRINT(sender)));
1039
1040 if (orted_failed_launch) {
1041 ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START);
1042 return;
1043 } else {
1044 jdatorted->num_reported++;
1045 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1046 "%s plm:base:orted_report_launch recvd %d of %d reported daemons",
1047 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1048 jdatorted->num_reported, jdatorted->num_procs));
1049 if (jdatorted->num_procs == jdatorted->num_reported) {
1050 bool dvm = true;
1051 uint32_t key;
1052 void *nptr;
1053 jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1054
1055
1056
1057 rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr);
1058 while (OPAL_SUCCESS == rc) {
1059 if (ORTE_PROC_MY_NAME->jobid != jdata->jobid) {
1060 dvm = false;
1061 if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) {
1062 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
1063 }
1064 }
1065 rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr);
1066 }
1067 if (dvm) {
1068
1069 ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_DAEMONS_REPORTED);
1070 }
1071 }
1072 }
1073 }
1074
1075 void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
1076 opal_buffer_t *buffer,
1077 orte_rml_tag_t tag, void *cbdata)
1078 {
1079 char *ptr;
1080 int rc, idx;
1081 orte_proc_t *daemon=NULL;
1082 orte_job_t *jdata;
1083 orte_process_name_t dname;
1084 opal_buffer_t *relay;
1085 char *sig;
1086 orte_topology_t *t;
1087 hwloc_topology_t topo;
1088 int i;
1089 bool found;
1090 orte_daemon_cmd_flag_t cmd;
1091 int32_t flag;
1092 opal_value_t *kv;
1093 char *myendian;
1094
1095
1096 if (NULL == jdatorted) {
1097 jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
1098 }
1099
1100
1101 t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
1102 if (NULL == t) {
1103
1104 myendian = "unknown";
1105 } else {
1106 myendian = strrchr(t->sig, ':');
1107 ++myendian;
1108 }
1109
1110
1111 idx = 1;
1112 while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) {
1113 char *nodename = NULL;
1114
1115 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1116 "%s plm:base:orted_report_launch from daemon %s",
1117 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1118 ORTE_NAME_PRINT(&dname)));
1119
1120
1121 if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, dname.vpid))) {
1122 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1123 orted_failed_launch = true;
1124 goto CLEANUP;
1125 }
1126 daemon->state = ORTE_PROC_STATE_RUNNING;
1127
1128 ORTE_FLAG_SET(daemon, ORTE_PROC_FLAG_ALIVE);
1129
1130
1131
1132 idx = 1;
1133 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &idx, OPAL_INT32))) {
1134 ORTE_ERROR_LOG(rc);
1135 orted_failed_launch = true;
1136 goto CLEANUP;
1137 }
1138 for (i=0; i < flag; i++) {
1139 idx = 1;
1140 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &kv, &idx, OPAL_VALUE))) {
1141 ORTE_ERROR_LOG(rc);
1142 orted_failed_launch = true;
1143 goto CLEANUP;
1144 }
1145
1146 opal_pmix.store_local(&dname, kv);
1147 OBJ_RELEASE(kv);
1148 }
1149
1150
1151 idx = 1;
1152 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &nodename, &idx, OPAL_STRING))) {
1153 ORTE_ERROR_LOG(rc);
1154 orted_failed_launch = true;
1155 goto CLEANUP;
1156 }
1157 if (!orte_have_fqdn_allocation) {
1158
1159 if (NULL != (ptr = strchr(nodename, '.'))) {
1160 *ptr = '\0';
1161 ptr = strdup(nodename);
1162 free(nodename);
1163 nodename = ptr;
1164 }
1165 }
1166
1167 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1168 "%s plm:base:orted_report_launch from daemon %s on node %s",
1169 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1170 ORTE_NAME_PRINT(&daemon->name), nodename));
1171
1172
1173 ORTE_FLAG_SET(daemon->node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
1174
1175 if (orte_retain_aliases) {
1176 char *alias, **atmp=NULL;
1177 uint8_t naliases, ni;
1178
1179
1180
1181
1182
1183
1184
1185 opal_argv_append_nosize(&atmp, nodename);
1186
1187 idx = 1;
1188 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &naliases, &idx, OPAL_UINT8))) {
1189 ORTE_ERROR_LOG(rc);
1190 orted_failed_launch = true;
1191 goto CLEANUP;
1192 }
1193 for (ni=0; ni < naliases; ni++) {
1194 idx = 1;
1195 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &alias, &idx, OPAL_STRING))) {
1196 ORTE_ERROR_LOG(rc);
1197 orted_failed_launch = true;
1198 goto CLEANUP;
1199 }
1200 opal_argv_append_nosize(&atmp, alias);
1201 free(alias);
1202 }
1203 if (0 < naliases) {
1204 alias = opal_argv_join(atmp, ',');
1205 orte_set_attribute(&daemon->node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, alias, OPAL_STRING);
1206 free(alias);
1207 }
1208 opal_argv_free(atmp);
1209 }
1210
1211
1212 idx=1;
1213 if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &idx, OPAL_STRING))) {
1214 ORTE_ERROR_LOG(rc);
1215 orted_failed_launch = true;
1216 goto CLEANUP;
1217 }
1218 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1219 "%s RECEIVED TOPOLOGY SIG %s FROM NODE %s",
1220 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), sig, nodename));
1221
1222
1223 topo = NULL;
1224 if (1 == dname.vpid) {
1225 uint8_t flag;
1226 size_t inlen, cmplen;
1227 uint8_t *packed_data, *cmpdata;
1228 opal_buffer_t datbuf, *data;
1229 OBJ_CONSTRUCT(&datbuf, opal_buffer_t);
1230
1231 idx=1;
1232 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &idx, OPAL_INT8))) {
1233 ORTE_ERROR_LOG(rc);
1234 orted_failed_launch = true;
1235 goto CLEANUP;
1236 }
1237 if (flag) {
1238
1239 idx=1;
1240 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &inlen, &idx, OPAL_SIZE))) {
1241 ORTE_ERROR_LOG(rc);
1242 orted_failed_launch = true;
1243 goto CLEANUP;
1244 }
1245
1246 idx=1;
1247 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &cmplen, &idx, OPAL_SIZE))) {
1248 ORTE_ERROR_LOG(rc);
1249 orted_failed_launch = true;
1250 goto CLEANUP;
1251 }
1252
1253 packed_data = (uint8_t*)malloc(inlen);
1254
1255 idx = inlen;
1256 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, packed_data, &idx, OPAL_UINT8))) {
1257 ORTE_ERROR_LOG(rc);
1258 orted_failed_launch = true;
1259 goto CLEANUP;
1260 }
1261
1262 if (opal_compress.decompress_block(&cmpdata, cmplen,
1263 packed_data, inlen)) {
1264
1265 opal_dss.load(&datbuf, cmpdata, cmplen);
1266 data = &datbuf;
1267 } else {
1268 data = buffer;
1269 }
1270 free(packed_data);
1271 } else {
1272 data = buffer;
1273 }
1274 idx=1;
1275 if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &topo, &idx, OPAL_HWLOC_TOPO))) {
1276 ORTE_ERROR_LOG(rc);
1277 orted_failed_launch = true;
1278 goto CLEANUP;
1279 }
1280 }
1281
1282
1283 found = false;
1284 for (i=0; i < orte_node_topologies->size; i++) {
1285 if (NULL == (t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, i))) {
1286 continue;
1287 }
1288
1289 if (0 == strcmp(sig, t->sig)) {
1290 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1291 "%s TOPOLOGY ALREADY RECORDED",
1292 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1293 found = true;
1294 daemon->node->topology = t;
1295 if (NULL != topo) {
1296 hwloc_topology_destroy(topo);
1297 }
1298 free(sig);
1299 break;
1300 }
1301 #if !OPAL_ENABLE_HETEROGENEOUS_SUPPORT
1302 else {
1303
1304 ptr = strrchr(sig, ':');
1305 ++ptr;
1306 if (0 != strcmp(ptr, myendian)) {
1307
1308
1309 orte_show_help("help-plm-base", "multi-endian", true,
1310 nodename, ptr, myendian);
1311 orted_failed_launch = true;
1312 if (NULL != topo) {
1313 hwloc_topology_destroy(topo);
1314 }
1315 goto CLEANUP;
1316 }
1317 }
1318 #endif
1319 }
1320
1321 if (!found) {
1322
1323 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1324 "%s NEW TOPOLOGY - ADDING",
1325 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1326 t = OBJ_NEW(orte_topology_t);
1327 t->sig = sig;
1328 t->index = opal_pointer_array_add(orte_node_topologies, t);
1329 daemon->node->topology = t;
1330 if (NULL != topo) {
1331 t->topo = topo;
1332 } else {
1333
1334 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1335 "%s REQUESTING TOPOLOGY FROM %s",
1336 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1337 ORTE_NAME_PRINT(&dname)));
1338
1339 relay = OBJ_NEW(opal_buffer_t);
1340 cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD;
1341 if (OPAL_SUCCESS != (rc = opal_dss.pack(relay, &cmd, 1, ORTE_DAEMON_CMD))) {
1342 ORTE_ERROR_LOG(rc);
1343 OBJ_RELEASE(relay);
1344 orted_failed_launch = true;
1345 goto CLEANUP;
1346 }
1347
1348 orte_rml.send_buffer_nb(&dname, relay,
1349 ORTE_RML_TAG_DAEMON,
1350 orte_rml_send_callback, NULL);
1351
1352
1353 if (NULL != nodename) {
1354 free(nodename);
1355 nodename = NULL;
1356 }
1357 idx = 1;
1358 continue;
1359 }
1360 }
1361
1362 CLEANUP:
1363 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1364 "%s plm:base:orted_report_launch %s for daemon %s at contact %s",
1365 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1366 orted_failed_launch ? "failed" : "completed",
1367 ORTE_NAME_PRINT(&dname),
1368 (NULL == daemon) ? "UNKNOWN" : daemon->rml_uri));
1369
1370 if (NULL != nodename) {
1371 free(nodename);
1372 nodename = NULL;
1373 }
1374
1375 if (orted_failed_launch) {
1376 ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START);
1377 return;
1378 } else {
1379 jdatorted->num_reported++;
1380 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1381 "%s plm:base:orted_report_launch job %s recvd %d of %d reported daemons",
1382 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1383 ORTE_JOBID_PRINT(jdatorted->jobid),
1384 jdatorted->num_reported, jdatorted->num_procs));
1385 if (jdatorted->num_procs == jdatorted->num_reported) {
1386 bool dvm = true;
1387 uint32_t key;
1388 void *nptr;
1389 jdatorted->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1390
1391
1392
1393 rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr);
1394 while (OPAL_SUCCESS == rc) {
1395 if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
1396 goto next;
1397 }
1398 dvm = false;
1399 if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) {
1400 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
1401 }
1402 next:
1403 rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr);
1404 }
1405 if (dvm) {
1406
1407 ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_DAEMONS_REPORTED);
1408 }
1409 }
1410 }
1411 idx = 1;
1412 }
1413 if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
1414 ORTE_ERROR_LOG(rc);
1415 ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START);
1416 }
1417 }
1418
1419 void orte_plm_base_daemon_failed(int st, orte_process_name_t* sender,
1420 opal_buffer_t *buffer,
1421 orte_rml_tag_t tag, void *cbdata)
1422 {
1423 int status, rc;
1424 int32_t n;
1425 orte_vpid_t vpid;
1426 orte_proc_t *daemon=NULL;
1427
1428
1429 if (NULL == jdatorted) {
1430 jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
1431 }
1432
1433
1434 n=1;
1435 if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &vpid, &n, ORTE_VPID))) {
1436 ORTE_ERROR_LOG(rc);
1437 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
1438 goto finish;
1439 }
1440
1441
1442 n=1;
1443 if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &status, &n, OPAL_INT))) {
1444 ORTE_ERROR_LOG(rc);
1445 status = ORTE_ERROR_DEFAULT_EXIT_CODE;
1446 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
1447 } else {
1448 ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
1449 }
1450
1451
1452 if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, vpid))) {
1453 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1454 goto finish;
1455 }
1456 daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
1457 daemon->exit_code = status;
1458
1459 finish:
1460 if (NULL == daemon) {
1461 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
1462 return;
1463 }
1464 ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
1465 }
1466
1467 int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
1468 {
1469 int i, loc;
1470 char **tmpv;
1471
1472
1473
1474
1475 loc = 0;
1476
1477 tmpv = opal_argv_split(orte_launch_agent, ' ');
1478 for (i = 0; NULL != tmpv && NULL != tmpv[i]; ++i) {
1479 if (0 == strcmp(tmpv[i], "orted")) {
1480 loc = i;
1481 }
1482 opal_argv_append(argc, argv, tmpv[i]);
1483 }
1484 opal_argv_free(tmpv);
1485
1486 return loc;
1487 }
1488
1489
1490
1491
1492
1493 int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
1494 char *ess,
1495 int *proc_vpid_index)
1496 {
1497 char *param = NULL;
1498 const char **tmp_value, **tmp_value2;
1499 int loc_id;
1500 char *tmp_force = NULL;
1501 int i, j, cnt, rc;
1502 orte_job_t *jdata;
1503 unsigned long num_procs;
1504 bool ignore;
1505
1506
1507 if (orte_debug_flag) {
1508 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1509 opal_argv_append(argc, argv, "orte_debug");
1510 opal_argv_append(argc, argv, "1");
1511 }
1512 if (orte_debug_daemons_flag) {
1513 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1514 opal_argv_append(argc, argv, "orte_debug_daemons");
1515 opal_argv_append(argc, argv, "1");
1516 }
1517 if (orte_debug_daemons_file_flag) {
1518 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1519 opal_argv_append(argc, argv, "orte_debug_daemons_file");
1520 opal_argv_append(argc, argv, "1");
1521 }
1522 if (orte_leave_session_attached) {
1523 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1524 opal_argv_append(argc, argv, "orte_leave_session_attached");
1525 opal_argv_append(argc, argv, "1");
1526 }
1527
1528 if (orted_spin_flag) {
1529 opal_argv_append(argc, argv, "--spin");
1530 }
1531
1532 if (opal_hwloc_report_bindings) {
1533 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1534 opal_argv_append(argc, argv, "orte_report_bindings");
1535 opal_argv_append(argc, argv, "1");
1536 }
1537
1538 if (orte_map_stddiag_to_stderr) {
1539 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1540 opal_argv_append(argc, argv, "orte_map_stddiag_to_stderr");
1541 opal_argv_append(argc, argv, "1");
1542 }
1543 else if (orte_map_stddiag_to_stdout) {
1544 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1545 opal_argv_append(argc, argv, "orte_map_stddiag_to_stdout");
1546 opal_argv_append(argc, argv, "1");
1547 }
1548
1549
1550 if (NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
1551 opal_argv_append(argc, argv, "--test-suicide");
1552 }
1553
1554
1555 if (NULL != ess) {
1556 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1557 opal_argv_append(argc, argv, "ess");
1558 opal_argv_append(argc, argv, ess);
1559 }
1560
1561
1562 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1563 opal_argv_append(argc, argv, "ess_base_jobid");
1564 if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(¶m, ORTE_PROC_MY_NAME->jobid))) {
1565 ORTE_ERROR_LOG(rc);
1566 return rc;
1567 }
1568 opal_argv_append(argc, argv, param);
1569 free(param);
1570
1571
1572 if (NULL != proc_vpid_index) {
1573 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1574 opal_argv_append(argc, argv, "ess_base_vpid");
1575 *proc_vpid_index = *argc;
1576 opal_argv_append(argc, argv, "<template>");
1577 }
1578
1579
1580 if (ORTE_PROC_IS_HNP) {
1581 jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
1582 num_procs = jdata->num_procs;
1583 } else {
1584 num_procs = orte_process_info.num_procs;
1585 }
1586 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1587 opal_argv_append(argc, argv, "ess_base_num_procs");
1588 opal_asprintf(¶m, "%lu", num_procs);
1589 opal_argv_append(argc, argv, param);
1590 free(param);
1591
1592
1593 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1594 opal_argv_append(argc, argv, "orte_hnp_uri");
1595 opal_argv_append(argc, argv, orte_process_info.my_hnp_uri);
1596
1597
1598 if (NULL != orte_xterm) {
1599 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1600 opal_argv_append(argc, argv, "orte_xterm");
1601 opal_argv_append(argc, argv, orte_xterm);
1602 }
1603
1604 loc_id = mca_base_var_find("opal", "mca", "base", "param_files");
1605 if (loc_id < 0) {
1606 rc = OPAL_ERR_NOT_FOUND;
1607 ORTE_ERROR_LOG(rc);
1608 return rc;
1609 }
1610 tmp_value = NULL;
1611 rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL);
1612 if (ORTE_SUCCESS != rc) {
1613 ORTE_ERROR_LOG(rc);
1614 return rc;
1615 }
1616 if (NULL != tmp_value && NULL != tmp_value[0]) {
1617 rc = strcmp(tmp_value[0], "none");
1618 } else {
1619 rc = 1;
1620 }
1621
1622 if (0 != rc) {
1623
1624
1625
1626
1627 tmp_value = NULL;
1628
1629 loc_id = mca_base_var_find("opal", "mca", "base", "envar_file_prefix");
1630 if (loc_id < 0) {
1631 rc = OPAL_ERR_NOT_FOUND;
1632 ORTE_ERROR_LOG(rc);
1633 return rc;
1634 }
1635 rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL);
1636 if (ORTE_SUCCESS != rc) {
1637 ORTE_ERROR_LOG(rc);
1638 return rc;
1639 }
1640 if( NULL != tmp_value && NULL != tmp_value[0] ) {
1641
1642
1643
1644 opal_argv_append(argc, argv, "-mca");
1645 opal_argv_append(argc, argv, "mca_base_envar_file_prefix");
1646 opal_argv_append(argc, argv, tmp_value[0]);
1647 }
1648
1649 tmp_value2 = NULL;
1650 loc_id = mca_base_var_find("opal", "mca", "base", "param_file_prefix");
1651 mca_base_var_get_value(loc_id, &tmp_value2, NULL, NULL);
1652 if( NULL != tmp_value2 && NULL != tmp_value2[0] ) {
1653
1654
1655
1656 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1657 opal_argv_append(argc, argv, "mca_base_param_file_prefix");
1658 opal_argv_append(argc, argv, tmp_value2[0]);
1659 orte_show_help("help-plm-base.txt", "deprecated-amca", true);
1660 }
1661
1662 if ((NULL != tmp_value && NULL != tmp_value[0])
1663 || (NULL != tmp_value2 && NULL != tmp_value2[0])) {
1664
1665 tmp_value = NULL;
1666 loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path");
1667 if (loc_id < 0) {
1668 ORTE_ERROR_LOG(rc);
1669 return rc;
1670 }
1671 rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL);
1672 if (ORTE_SUCCESS != rc) {
1673 ORTE_ERROR_LOG(rc);
1674 return rc;
1675 }
1676 if( NULL != tmp_value && NULL != tmp_value[0] ) {
1677 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1678 opal_argv_append(argc, argv, "mca_base_param_file_path");
1679 opal_argv_append(argc, argv, tmp_value[0]);
1680 }
1681
1682
1683 opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
1684 opal_argv_append(argc, argv, "mca_base_param_file_path_force");
1685
1686 tmp_value = NULL;
1687 loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path_force");
1688 if (loc_id < 0) {
1689 rc = OPAL_ERR_NOT_FOUND;
1690 ORTE_ERROR_LOG(rc);
1691 return rc;
1692 }
1693 rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL);
1694 if (OPAL_SUCCESS != rc) {
1695 ORTE_ERROR_LOG(rc);
1696 return rc;
1697 }
1698 if( NULL == tmp_value || NULL == tmp_value[0] ) {
1699
1700 tmp_force = (char *) malloc(sizeof(char) * OPAL_PATH_MAX);
1701 if (NULL == getcwd(tmp_force, OPAL_PATH_MAX)) {
1702 free(tmp_force);
1703 tmp_force = strdup("");
1704 }
1705
1706 opal_argv_append(argc, argv, tmp_force);
1707 free(tmp_force);
1708 } else {
1709 opal_argv_append(argc, argv, tmp_value[0]);
1710 }
1711 }
1712 }
1713
1714
1715
1716
1717
1718 if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
1719 cnt = opal_argv_count(orted_cmd_line);
1720 for (i=0; i < cnt; i+=3) {
1721
1722
1723
1724
1725
1726
1727
1728
1729 if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
1730 continue;
1731 }
1732
1733
1734
1735
1736
1737
1738 if (0 == strcmp(orted_cmd_line[i+1], "plm")) {
1739 continue;
1740 }
1741
1742 ignore = false;
1743 for (j=0; j < *argc; j++) {
1744 if (0 == strcmp((*argv)[j], orted_cmd_line[i+1])) {
1745 ignore = true;
1746 break;
1747 }
1748 }
1749 if (!ignore) {
1750
1751 opal_argv_append(argc, argv, orted_cmd_line[i]);
1752 opal_argv_append(argc, argv, orted_cmd_line[i+1]);
1753 opal_argv_append(argc, argv, orted_cmd_line[i+2]);
1754 }
1755 }
1756 }
1757
1758 return ORTE_SUCCESS;
1759 }
1760
1761 int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
1762 {
1763 orte_node_t *node, *nptr;
1764 orte_proc_t *proc, *pptr;
1765 orte_job_map_t *map=NULL;
1766 int rc, i;
1767 orte_job_t *daemons;
1768 opal_list_t nodes, tnodes;
1769 opal_list_item_t *item, *next;
1770 orte_app_context_t *app;
1771 bool one_filter = false;
1772 int num_nodes;
1773 bool default_hostfile_used;
1774 char *hosts = NULL;
1775 bool singleton=false;
1776 bool multi_sim = false;
1777
1778 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1779 "%s plm:base:setup_vm",
1780 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1781
1782 if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
1783 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1784 return ORTE_ERR_NOT_FOUND;
1785 }
1786 if (NULL == daemons->map) {
1787 daemons->map = OBJ_NEW(orte_job_map_t);
1788 }
1789 map = daemons->map;
1790
1791
1792
1793 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
1794
1795 daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1796 map->num_new_daemons = 0;
1797 return ORTE_SUCCESS;
1798 }
1799
1800
1801
1802
1803 if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
1804 if (0 == map->num_nodes) {
1805 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1806 "%s plm:base:setup_vm creating map",
1807 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1808
1809
1810
1811
1812
1813 node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
1814 opal_pointer_array_add(map->nodes, (void*)node);
1815 ++(map->num_nodes);
1816
1817 OBJ_RETAIN(node);
1818
1819 singleton = true;
1820 }
1821 OBJ_CONSTRUCT(&nodes, opal_list_t);
1822 for (i=1; i < orte_node_pool->size; i++) {
1823 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
1824 continue;
1825 }
1826
1827 if (!singleton && ORTE_NODE_STATE_ADDED != node->state) {
1828 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
1829 "%s plm_base:setup_vm NODE %s WAS NOT ADDED",
1830 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
1831 continue;
1832 }
1833 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
1834 "%s plm_base:setup_vm ADDING NODE %s",
1835 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
1836
1837
1838
1839 OBJ_RETAIN(node);
1840 opal_list_append(&nodes, &node->super);
1841
1842 node->state = ORTE_NODE_STATE_UP;
1843 }
1844 map->num_new_daemons = 0;
1845
1846
1847
1848 if (0 == opal_list_get_size(&nodes)) {
1849 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1850 "%s plm:base:setup_vm no new daemons required",
1851 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1852 OBJ_DESTRUCT(&nodes);
1853
1854 daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1855 ORTE_FLAG_UNSET(daemons, ORTE_JOB_FLAG_UPDATED);
1856 return ORTE_SUCCESS;
1857 }
1858
1859 goto process;
1860 }
1861
1862
1863
1864
1865
1866 multi_sim = orte_get_attribute(&jdata->attributes, ORTE_JOB_MULTI_DAEMON_SIM, NULL, OPAL_BOOL);
1867 if (orte_get_attribute(&daemons->attributes, ORTE_JOB_NO_VM, NULL, OPAL_BOOL) || multi_sim) {
1868 OBJ_CONSTRUCT(&nodes, opal_list_t);
1869
1870
1871
1872 for (i=1; i < orte_node_pool->size; i++) {
1873 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
1874 continue;
1875 }
1876
1877 if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
1878 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
1879 "NODE %s IS MARKED NO_USE", node->name));
1880
1881 node->state = ORTE_NODE_STATE_UP;
1882 continue;
1883 }
1884 if (ORTE_NODE_STATE_DOWN == node->state) {
1885 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
1886 "NODE %s IS MARKED DOWN", node->name));
1887 continue;
1888 }
1889 if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
1890 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
1891 "NODE %s IS MARKED NO_INCLUDE", node->name));
1892
1893 continue;
1894 }
1895 if (0 < node->num_procs || multi_sim) {
1896
1897
1898
1899 OBJ_RETAIN(node);
1900 opal_list_append(&nodes, &node->super);
1901 }
1902 }
1903 if (multi_sim) {
1904 goto process;
1905 }
1906
1907 if (0 == opal_list_get_size(&nodes)) {
1908
1909 node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
1910 if (0 < node->num_procs) {
1911 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1912 "%s plm:base:setup_vm only HNP in use",
1913 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1914 OBJ_DESTRUCT(&nodes);
1915 map->num_nodes = 1;
1916
1917 daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1918 return ORTE_SUCCESS;
1919 }
1920
1921
1922
1923 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
1924 return ORTE_ERR_FATAL;
1925 }
1926 goto process;
1927 }
1928
1929 if (0 == map->num_nodes) {
1930 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1931 "%s plm:base:setup_vm creating map",
1932 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1933
1934
1935
1936
1937
1938 node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
1939 opal_pointer_array_add(map->nodes, (void*)node);
1940 ++(map->num_nodes);
1941
1942 OBJ_RETAIN(node);
1943 }
1944
1945
1946
1947
1948 map->num_new_daemons = 0;
1949
1950
1951 OBJ_CONSTRUCT(&nodes, opal_list_t);
1952
1953
1954
1955
1956
1957
1958 if (!orte_managed_allocation) {
1959 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1960 "%s setup:vm: working unmanaged allocation",
1961 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1962 default_hostfile_used = false;
1963 OBJ_CONSTRUCT(&tnodes, opal_list_t);
1964 for (i=0; i < jdata->apps->size; i++) {
1965 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
1966 continue;
1967 }
1968
1969
1970
1971 hosts = NULL;
1972 if (!orte_soft_locations &&
1973 orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
1974 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1975 "%s using dash_host",
1976 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1977 if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&tnodes, hosts, false))) {
1978 ORTE_ERROR_LOG(rc);
1979 free(hosts);
1980 return rc;
1981 }
1982 free(hosts);
1983 } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
1984
1985 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1986 "%s using hostfile %s",
1987 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts));
1988 if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&tnodes, hosts))) {
1989 ORTE_ERROR_LOG(rc);
1990 free(hosts);
1991 return rc;
1992 }
1993 free(hosts);
1994 } else if (NULL != orte_rankfile) {
1995
1996 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1997 "%s using rankfile %s",
1998 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1999 orte_rankfile));
2000 if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&tnodes,
2001 orte_rankfile))) {
2002 ORTE_ERROR_LOG(rc);
2003 return rc;
2004 }
2005 } else if (NULL != orte_default_hostfile) {
2006 if (!default_hostfile_used) {
2007
2008 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2009 "%s using default hostfile %s",
2010 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2011 orte_default_hostfile));
2012 if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&tnodes,
2013 orte_default_hostfile))) {
2014 ORTE_ERROR_LOG(rc);
2015 return rc;
2016 }
2017
2018 default_hostfile_used = true;
2019 }
2020 }
2021 }
2022
2023
2024
2025
2026 while (NULL != (item = opal_list_remove_first(&tnodes))) {
2027 nptr = (orte_node_t*)item;
2028 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2029 "%s checking node %s",
2030 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2031 nptr->name));
2032 for (i=0; i < orte_node_pool->size; i++) {
2033 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
2034 continue;
2035 }
2036 if (0 != strcmp(node->name, nptr->name)) {
2037 continue;
2038 }
2039
2040
2041 if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
2042 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2043 "NODE %s IS MARKED NO_USE", node->name));
2044
2045 node->state = ORTE_NODE_STATE_UP;
2046 break;
2047 }
2048 if (ORTE_NODE_STATE_DOWN == node->state) {
2049 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2050 "NODE %s IS MARKED DOWN", node->name));
2051 break;
2052 }
2053 if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
2054 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2055 "NODE %s IS MARKED NO_INCLUDE", node->name));
2056 break;
2057 }
2058
2059 if (0 == node->index) {
2060 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2061 "%s ignoring myself",
2062 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
2063 break;
2064 }
2065
2066 OBJ_RETAIN(node);
2067 opal_list_append(&nodes, &node->super);
2068 }
2069 OBJ_RELEASE(nptr);
2070 }
2071 OPAL_LIST_DESTRUCT(&tnodes);
2072
2073
2074
2075
2076 if (0 == opal_list_get_size(&nodes)) {
2077 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2078 "%s plm:base:setup_vm only HNP in allocation",
2079 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
2080 OBJ_DESTRUCT(&nodes);
2081
2082 daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
2083 ORTE_FLAG_UNSET(daemons, ORTE_JOB_FLAG_UPDATED);
2084 return ORTE_SUCCESS;
2085 }
2086
2087 goto process;
2088 }
2089
2090
2091 for (i=1; i < orte_node_pool->size; i++) {
2092 if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
2093
2094 if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
2095 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2096 "NODE %s IS MARKED NO_USE", node->name));
2097
2098 node->state = ORTE_NODE_STATE_UP;
2099 continue;
2100 }
2101 if (ORTE_NODE_STATE_DOWN == node->state) {
2102 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2103 "NODE %s IS MARKED DOWN", node->name));
2104 continue;
2105 }
2106 if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
2107 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
2108 "NODE %s IS MARKED NO_INCLUDE", node->name));
2109
2110 continue;
2111 }
2112
2113
2114
2115 OBJ_RETAIN(node);
2116 opal_list_append(&nodes, &node->super);
2117
2118
2119
2120 ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
2121 }
2122 }
2123
2124
2125
2126
2127
2128 if (0 == opal_list_get_size(&nodes)) {
2129 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2130 "%s plm:base:setup_vm only HNP in allocation",
2131 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
2132
2133 OBJ_DESTRUCT(&nodes);
2134
2135 daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
2136 ORTE_FLAG_UNSET(daemons, ORTE_JOB_FLAG_UPDATED);
2137 return ORTE_SUCCESS;
2138 }
2139
2140
2141
2142
2143
2144
2145
2146 if (orte_hnp_is_allocated) {
2147 node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
2148 OBJ_RETAIN(node);
2149 opal_list_prepend(&nodes, &node->super);
2150 }
2151 for (i=0; i < jdata->apps->size; i++) {
2152 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
2153 continue;
2154 }
2155 if (ORTE_SUCCESS != (rc = orte_rmaps_base_filter_nodes(app, &nodes, false)) &&
2156 rc != ORTE_ERR_TAKE_NEXT_OPTION) {
2157 ORTE_ERROR_LOG(rc);
2158 return rc;
2159 }
2160 if (ORTE_SUCCESS == rc) {
2161
2162 one_filter = true;
2163 }
2164 }
2165
2166 if (one_filter) {
2167
2168
2169
2170 item = opal_list_get_first(&nodes);
2171 while (item != opal_list_get_end(&nodes)) {
2172 next = opal_list_get_next(item);
2173 node = (orte_node_t*)item;
2174 if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
2175 opal_list_remove_item(&nodes, item);
2176 OBJ_RELEASE(item);
2177 } else {
2178
2179
2180
2181 ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
2182 }
2183 item = next;
2184 }
2185 }
2186
2187
2188 if (0 < opal_list_get_size(&nodes)) {
2189 item = opal_list_get_first(&nodes);
2190 node = (orte_node_t*)item;
2191 if (0 == node->index) {
2192 opal_list_remove_item(&nodes, item);
2193 OBJ_RELEASE(item);
2194 }
2195 }
2196
2197
2198
2199
2200
2201 if (0 == opal_list_get_size(&nodes)) {
2202 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2203 "%s plm:base:setup_vm only HNP left",
2204 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
2205 OBJ_DESTRUCT(&nodes);
2206
2207 daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
2208 ORTE_FLAG_UNSET(daemons, ORTE_JOB_FLAG_UPDATED);
2209 return ORTE_SUCCESS;
2210 }
2211
2212 process:
2213
2214
2215
2216
2217
2218
2219 if (orte_hnp_is_allocated) {
2220 num_nodes = 1;
2221 } else {
2222 num_nodes = 0;
2223 }
2224 while (NULL != (item = opal_list_remove_first(&nodes))) {
2225
2226 if (0 < orte_max_vm_size && num_nodes == orte_max_vm_size) {
2227
2228 OBJ_RELEASE(item);
2229 break;
2230 }
2231 node = (orte_node_t*)item;
2232
2233 if (NULL != node->daemon) {
2234 num_nodes++;
2235
2236 OBJ_RELEASE(item);
2237 continue;
2238 }
2239
2240
2241
2242
2243 opal_pointer_array_add(map->nodes, (void*)node);
2244 ++(map->num_nodes);
2245 num_nodes++;
2246
2247 proc = OBJ_NEW(orte_proc_t);
2248 if (NULL == proc) {
2249 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
2250 return ORTE_ERR_OUT_OF_RESOURCE;
2251 }
2252 proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
2253 if (ORTE_VPID_MAX-1 <= daemons->num_procs) {
2254
2255 orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
2256 OBJ_RELEASE(proc);
2257 return ORTE_ERR_OUT_OF_RESOURCE;
2258 }
2259 proc->name.vpid = daemons->num_procs;
2260 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2261 "%s plm:base:setup_vm add new daemon %s",
2262 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2263 ORTE_NAME_PRINT(&proc->name)));
2264
2265 if (0 > (rc = opal_pointer_array_set_item(daemons->procs, proc->name.vpid, (void*)proc))) {
2266 ORTE_ERROR_LOG(rc);
2267 return rc;
2268 }
2269 ++daemons->num_procs;
2270 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
2271 "%s plm:base:setup_vm assigning new daemon %s to node %s",
2272 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2273 ORTE_NAME_PRINT(&proc->name),
2274 node->name));
2275
2276 node->daemon = proc;
2277 OBJ_RETAIN(proc);
2278
2279 proc->node = node;
2280 OBJ_RETAIN(node);
2281 if (orte_plm_globals.daemon_nodes_assigned_at_launch) {
2282 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_LOC_VERIFIED);
2283 } else {
2284 ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_LOC_VERIFIED);
2285 }
2286
2287 ++map->num_new_daemons;
2288
2289 if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
2290 map->daemon_vpid_start = proc->name.vpid;
2291 }
2292
2293 for (i=0; i < node->procs->size; i++) {
2294 if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
2295 pptr->parent = proc->name.vpid;
2296 }
2297 }
2298 }
2299
2300 if (orte_process_info.num_procs != daemons->num_procs) {
2301
2302
2303
2304
2305
2306
2307
2308 orte_process_info.num_procs = daemons->num_procs;
2309
2310 if (orte_process_info.max_procs < orte_process_info.num_procs) {
2311 orte_process_info.max_procs = orte_process_info.num_procs;
2312 }
2313
2314
2315
2316 orte_routed.update_routing_plan();
2317 }
2318
2319
2320 ORTE_FLAG_SET(daemons, ORTE_JOB_FLAG_UPDATED);
2321
2322
2323
2324 if (0 < map->num_new_daemons) {
2325 if (ORTE_SUCCESS != (rc = orte_set_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS,
2326 true, NULL, OPAL_BOOL))) {
2327 ORTE_ERROR_LOG(rc);
2328 return rc;
2329 }
2330 }
2331
2332 return ORTE_SUCCESS;
2333 }