This source file includes the following definitions:
- orte_ras_base_display_alloc
- orte_ras_base_allocate
- orte_ras_base_add_hosts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 #include "orte_config.h"
25
26 #include <string.h>
27
28 #include "orte/constants.h"
29 #include "orte/types.h"
30
31 #include "orte/mca/mca.h"
32 #include "opal/mca/base/base.h"
33 #include "opal/class/opal_list.h"
34 #include "opal/util/output.h"
35 #include "opal/util/printf.h"
36 #include "opal/dss/dss.h"
37 #include "opal/util/argv.h"
38 #include "opal/mca/if/if.h"
39
40 #include "orte/util/show_help.h"
41 #include "orte/mca/errmgr/errmgr.h"
42 #include "orte/mca/rmaps/base/base.h"
43 #include "orte/util/name_fns.h"
44 #include "orte/runtime/orte_globals.h"
45 #include "orte/runtime/orte_wait.h"
46 #include "orte/util/hostfile/hostfile.h"
47 #include "orte/util/dash_host/dash_host.h"
48 #include "orte/util/proc_info.h"
49 #include "orte/util/comm/comm.h"
50 #include "orte/util/error_strings.h"
51 #include "orte/util/threads.h"
52 #include "orte/mca/state/state.h"
53 #include "orte/runtime/orte_quit.h"
54
55 #include "orte/mca/ras/base/ras_private.h"
56
57
58 void orte_ras_base_display_alloc(void)
59 {
60 char *tmp=NULL, *tmp2, *tmp3;
61 int i, istart;
62 orte_node_t *alloc;
63
64 if (orte_xml_output) {
65 opal_asprintf(&tmp, "<allocation>\n");
66 } else {
67 opal_asprintf(&tmp, "\n====================== ALLOCATED NODES ======================\n");
68 }
69 if (orte_hnp_is_allocated) {
70 istart = 0;
71 } else {
72 istart = 1;
73 }
74 for (i=istart; i < orte_node_pool->size; i++) {
75 if (NULL == (alloc = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
76 continue;
77 }
78 if (orte_xml_output) {
79
80 opal_asprintf(&tmp2, "\t<host name=\"%s\" slots=\"%d\" max_slots=\"%d\" slots_inuse=\"%d\">\n",
81 (NULL == alloc->name) ? "UNKNOWN" : alloc->name,
82 (int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
83 } else {
84 opal_asprintf(&tmp2, "\t%s: flags=0x%02x slots=%d max_slots=%d slots_inuse=%d state=%s\n",
85 (NULL == alloc->name) ? "UNKNOWN" : alloc->name, alloc->flags,
86 (int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse,
87 orte_node_state_to_str(alloc->state));
88 }
89 if (NULL == tmp) {
90 tmp = tmp2;
91 } else {
92 opal_asprintf(&tmp3, "%s%s", tmp, tmp2);
93 free(tmp);
94 free(tmp2);
95 tmp = tmp3;
96 }
97 }
98 if (orte_xml_output) {
99 fprintf(orte_xml_fp, "%s</allocation>\n", tmp);
100 fflush(orte_xml_fp);
101 } else {
102 opal_output(orte_clean_output, "%s=================================================================\n", tmp);
103 }
104 free(tmp);
105 }
106
107
108
109
110
111 void orte_ras_base_allocate(int fd, short args, void *cbdata)
112 {
113 int rc;
114 orte_job_t *jdata;
115 opal_list_t nodes;
116 orte_node_t *node;
117 orte_std_cntr_t i;
118 orte_app_context_t *app;
119 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
120 char *hosts=NULL;
121
122 ORTE_ACQUIRE_OBJECT(caddy);
123
124 OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
125 "%s ras:base:allocate",
126 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
127
128
129 jdata = caddy->jdata;
130
131
132
133
134 if (orte_ras_base.allocation_read) {
135
136 OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
137 "%s ras:base:allocate allocation already read",
138 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
139 goto next_state;
140 }
141 orte_ras_base.allocation_read = true;
142
143
144
145
146
147
148
149
150
151
152
153 OBJ_CONSTRUCT(&nodes, opal_list_t);
154
155
156
157
158 if (NULL != orte_ras_base.active_module) {
159
160 if (ORTE_SUCCESS != (rc = orte_ras_base.active_module->allocate(jdata, &nodes))) {
161 if (ORTE_ERR_ALLOCATION_PENDING == rc) {
162
163 OBJ_DESTRUCT(&nodes);
164 OBJ_RELEASE(caddy);
165 return;
166 }
167 if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == rc) {
168
169
170
171
172 goto addlocal;
173 }
174 if (ORTE_ERR_TAKE_NEXT_OPTION == rc) {
175
176
177
178
179
180 if (orte_allocation_required) {
181
182 OBJ_DESTRUCT(&nodes);
183 orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
184 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
185 OBJ_RELEASE(caddy);
186 return;
187 } else {
188
189
190
191 goto addlocal;
192 }
193 }
194 ORTE_ERROR_LOG(rc);
195 OBJ_DESTRUCT(&nodes);
196 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
197 OBJ_RELEASE(caddy);
198 return;
199 }
200 }
201
202 if (!opal_list_is_empty(&nodes)) {
203
204 orte_managed_allocation = true;
205
206
207 opal_if_do_not_resolve = true;
208
209
210
211 if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
212 ORTE_ERROR_LOG(rc);
213 OBJ_DESTRUCT(&nodes);
214 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
215 OBJ_RELEASE(caddy);
216 return;
217 }
218 OBJ_DESTRUCT(&nodes);
219 goto DISPLAY;
220 } else if (orte_allocation_required) {
221
222
223
224 OBJ_DESTRUCT(&nodes);
225 orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
226 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
227 OBJ_RELEASE(caddy);
228 return;
229 }
230
231 OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
232 "%s ras:base:allocate nothing found in module - proceeding to hostfile",
233 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
234
235
236
237
238 if (NULL != orte_rankfile) {
239 OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
240 "%s ras:base:allocate parsing rankfile %s",
241 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
242 orte_rankfile));
243
244
245 if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
246 orte_rankfile))) {
247 OBJ_DESTRUCT(&nodes);
248 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
249 OBJ_RELEASE(caddy);
250 return;
251 }
252 }
253
254
255
256
257 if (!opal_list_is_empty(&nodes)) {
258
259
260
261 if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
262 ORTE_ERROR_LOG(rc);
263 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
264 OBJ_RELEASE(caddy);
265 return;
266 }
267
268 if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
269 ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
270 }
271
272 OBJ_DESTRUCT(&nodes);
273 goto DISPLAY;
274 }
275
276
277
278
279 for (i=0; i < jdata->apps->size; i++) {
280 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
281 continue;
282 }
283 if (!orte_soft_locations &&
284 orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
285
286
287
288
289
290
291
292 OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
293 "%s ras:base:allocate adding dash_hosts",
294 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
295 if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts, true))) {
296 free(hosts);
297 OBJ_DESTRUCT(&nodes);
298 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
299 OBJ_RELEASE(caddy);
300 return;
301 }
302 free(hosts);
303 }
304 }
305
306
307
308
309 if (!opal_list_is_empty(&nodes)) {
310
311
312
313 if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
314 ORTE_ERROR_LOG(rc);
315 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
316 OBJ_RELEASE(caddy);
317 return;
318 }
319
320 OBJ_DESTRUCT(&nodes);
321 goto DISPLAY;
322 }
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339 for (i=0; i < jdata->apps->size; i++) {
340 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
341 continue;
342 }
343 if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
344 OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
345 "%s ras:base:allocate adding hostfile %s",
346 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts));
347
348
349 if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, hosts))) {
350 free(hosts);
351 OBJ_DESTRUCT(&nodes);
352
353 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
354 OBJ_RELEASE(caddy);
355 return;
356 }
357 free(hosts);
358 }
359 }
360
361
362
363
364 if (!opal_list_is_empty(&nodes)) {
365
366
367
368 if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
369 ORTE_ERROR_LOG(rc);
370 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
371 OBJ_RELEASE(caddy);
372 return;
373 }
374
375 OBJ_DESTRUCT(&nodes);
376 goto DISPLAY;
377 }
378
379
380 if (NULL != orte_default_hostfile) {
381 OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
382 "%s ras:base:allocate parsing default hostfile %s",
383 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
384 orte_default_hostfile));
385
386
387 if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
388 orte_default_hostfile))) {
389 OBJ_DESTRUCT(&nodes);
390 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
391 OBJ_RELEASE(caddy);
392 return;
393 }
394 }
395
396
397
398
399 if (!opal_list_is_empty(&nodes)) {
400
401
402
403 if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
404 ORTE_ERROR_LOG(rc);
405 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
406 OBJ_RELEASE(caddy);
407 return;
408 }
409
410 OBJ_DESTRUCT(&nodes);
411 goto DISPLAY;
412 }
413
414 OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
415 "%s ras:base:allocate nothing found in hostfiles - inserting current node",
416 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
417
418 addlocal:
419
420
421
422 node = OBJ_NEW(orte_node_t);
423 if (NULL == node) {
424 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
425 OBJ_DESTRUCT(&nodes);
426 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
427 OBJ_RELEASE(caddy);
428 return;
429 }
430
431
432
433 node->name = strdup(orte_process_info.nodename);
434 node->state = ORTE_NODE_STATE_UP;
435 node->slots_inuse = 0;
436 node->slots_max = 0;
437 node->slots = 1;
438 opal_list_append(&nodes, &node->super);
439
440 orte_hnp_is_allocated = true;
441
442
443
444
445 if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
446 ORTE_ERROR_LOG(rc);
447 OBJ_DESTRUCT(&nodes);
448 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
449 OBJ_RELEASE(caddy);
450 return;
451 }
452 OBJ_DESTRUCT(&nodes);
453
454 DISPLAY:
455
456 if (4 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
457 orte_ras_base_display_alloc();
458 }
459
460 next_state:
461
462 if (orte_report_events) {
463 if (ORTE_SUCCESS != (rc = orte_util_comm_report_event(ORTE_COMM_EVENT_ALLOCATE))) {
464 ORTE_ERROR_LOG(rc);
465 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
466 OBJ_RELEASE(caddy);
467 }
468 }
469
470
471 jdata->total_slots_alloc = orte_ras_base.total_slots_alloc;
472
473
474 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);
475
476
477 OBJ_RELEASE(caddy);
478 }
479
480 int orte_ras_base_add_hosts(orte_job_t *jdata)
481 {
482 int rc;
483 opal_list_t nodes;
484 int i, n;
485 orte_app_context_t *app;
486 orte_node_t *node, *next, *nptr;
487 char *hosts;
488
489
490 OBJ_CONSTRUCT(&nodes, opal_list_t);
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505 for (i=0; i < jdata->apps->size; i++) {
506 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
507 continue;
508 }
509 if (orte_get_attribute(&app->attributes, ORTE_APP_ADD_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
510 OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
511 "%s ras:base:add_hosts checking add-hostfile %s",
512 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts));
513
514
515 if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, hosts))) {
516 ORTE_ERROR_LOG(rc);
517 OBJ_DESTRUCT(&nodes);
518 free(hosts);
519 return rc;
520 }
521
522 orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, (void**)hosts, OPAL_STRING);
523 orte_remove_attribute(&app->attributes, ORTE_APP_ADD_HOSTFILE);
524 free(hosts);
525 }
526 }
527
528
529
530
531
532
533
534
535
536
537 for (i=0; i < jdata->apps->size; i++) {
538 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
539 continue;
540 }
541 if (orte_get_attribute(&app->attributes, ORTE_APP_ADD_HOST, (void**)&hosts, OPAL_STRING)) {
542 opal_output_verbose(5, orte_ras_base_framework.framework_output,
543 "%s ras:base:add_hosts checking add-host %s",
544 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts);
545 if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts, true))) {
546 ORTE_ERROR_LOG(rc);
547 OBJ_DESTRUCT(&nodes);
548 free(hosts);
549 return rc;
550 }
551
552 orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, hosts, OPAL_STRING);
553 orte_remove_attribute(&app->attributes, ORTE_APP_ADD_HOST);
554 free(hosts);
555 }
556 }
557
558
559 if (!opal_list_is_empty(&nodes)) {
560
561
562
563 OPAL_LIST_FOREACH_SAFE(node, next, &nodes, orte_node_t) {
564 node->state = ORTE_NODE_STATE_ADDED;
565 for (n=0; n < orte_node_pool->size; n++) {
566 if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) {
567 continue;
568 }
569 if (0 == strcmp(node->name, nptr->name)) {
570 opal_list_remove_item(&nodes, &node->super);
571 OBJ_RELEASE(node);
572 break;
573 }
574 }
575 }
576 if (!opal_list_is_empty(&nodes)) {
577
578
579
580 if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
581 ORTE_ERROR_LOG(rc);
582 }
583
584 orte_nidmap_communicated = false;
585 }
586 }
587
588 OPAL_LIST_DESTRUCT(&nodes);
589
590
591 if (0 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
592 orte_ras_base_display_alloc();
593 }
594
595 return ORTE_SUCCESS;
596 }