This source file includes the following definitions.
- orte_rmaps_rf_map
- orte_rmaps_rank_file_parse
- orte_rmaps_rank_file_parse_string_or_int
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 #include "orte_config.h"
30 #include "orte/constants.h"
31 #include "orte/types.h"
32
33 #include <errno.h>
34 #ifdef HAVE_UNISTD_H
35 #include <unistd.h>
36 #endif
37 #include <string.h>
38
39 #include "opal/util/argv.h"
40 #include "opal/util/if.h"
41 #include "opal/util/net.h"
42 #include "opal/class/opal_pointer_array.h"
43 #include "opal/mca/hwloc/base/base.h"
44
45 #include "orte/mca/errmgr/errmgr.h"
46 #include "orte/mca/ess/ess.h"
47 #include "orte/util/show_help.h"
48 #include "orte/mca/rmaps/base/rmaps_private.h"
49 #include "orte/mca/rmaps/base/base.h"
50 #include "orte/mca/rmaps/rank_file/rmaps_rank_file.h"
51 #include "orte/mca/rmaps/rank_file/rmaps_rank_file_lex.h"
52 #include "orte/runtime/orte_globals.h"
53
54 static int orte_rmaps_rf_map(orte_job_t *jdata);
55
56 orte_rmaps_base_module_t orte_rmaps_rank_file_module = {
57 .map_job = orte_rmaps_rf_map
58 };
59
60
61 static int orte_rmaps_rank_file_parse(const char *);
62 static char *orte_rmaps_rank_file_parse_string_or_int(void);
63 static const char *orte_rmaps_rank_file_name_cur = NULL;
64 char *orte_rmaps_rank_file_slot_list = NULL;
65
66
67
68
69 static opal_pointer_array_t rankmap;
70 static int num_ranks=0;
71
72
73
74
75 static int orte_rmaps_rf_map(orte_job_t *jdata)
76 {
77 orte_job_map_t *map;
78 orte_app_context_t *app=NULL;
79 orte_std_cntr_t i, k;
80 opal_list_t node_list;
81 opal_list_item_t *item;
82 orte_node_t *node, *nd, *root_node;
83 orte_vpid_t rank, vpid_start;
84 orte_std_cntr_t num_slots;
85 orte_rmaps_rank_file_map_t *rfmap;
86 orte_std_cntr_t relative_index, tmp_cnt;
87 int rc;
88 orte_proc_t *proc;
89 mca_base_component_t *c = &mca_rmaps_rank_file_component.super.base_version;
90 char *slots;
91 bool initial_map=true;
92 opal_hwloc_resource_type_t rtype;
93
94
95 if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
96 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
97 "mca:rmaps:rf: job %s being restarted - rank_file cannot map",
98 ORTE_JOBID_PRINT(jdata->jobid));
99 return ORTE_ERR_TAKE_NEXT_OPTION;
100 }
101 if (NULL != jdata->map->req_mapper &&
102 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
103
104 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
105 "mca:rmaps:rf: job %s not using rank_file mapper",
106 ORTE_JOBID_PRINT(jdata->jobid));
107 return ORTE_ERR_TAKE_NEXT_OPTION;
108 }
109 if (ORTE_MAPPING_BYUSER != ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping)) {
110
111 return ORTE_ERR_TAKE_NEXT_OPTION;
112 }
113 if (OPAL_BIND_ORDERED_REQUESTED(jdata->map->binding)) {
114
115 return ORTE_ERR_TAKE_NEXT_OPTION;
116 }
117 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
118 "mca:rmaps:rank_file: mapping job %s",
119 ORTE_JOBID_PRINT(jdata->jobid));
120
121
122 if (NULL != jdata->map->last_mapper) {
123 free(jdata->map->last_mapper);
124 }
125 jdata->map->last_mapper = strdup(c->mca_component_name);
126
127
128 map = jdata->map;
129
130
131 if (mca_rmaps_rank_file_component.physical) {
132 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
133 "mca:rmaps:rank_file: using PHYSICAL processors");
134 rtype = OPAL_HWLOC_PHYSICAL;
135 } else {
136 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
137 "mca:rmaps:rank_file: using LOGICAL processors");
138 rtype = OPAL_HWLOC_LOGICAL;
139 }
140
141
142 OBJ_CONSTRUCT(&node_list, opal_list_t);
143
144
145 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
146 rc = ORTE_ERR_SILENT;
147 goto error;
148 }
149
150
151
152
153
154
155
156 if (0 == app->num_procs && 1 < jdata->num_apps) {
157 orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np",
158 true, jdata->num_apps, NULL);
159 rc = ORTE_ERR_SILENT;
160 goto error;
161 }
162
163
164
165
166 vpid_start = 0;
167 jdata->num_procs = 0;
168 OBJ_CONSTRUCT(&rankmap, opal_pointer_array_t);
169
170
171 if ( NULL != orte_rankfile ) {
172 if ( ORTE_SUCCESS != (rc = orte_rmaps_rank_file_parse(orte_rankfile))) {
173 ORTE_ERROR_LOG(rc);
174 goto error;
175 }
176 }
177
178
179 for(i=0; i < jdata->apps->size; i++) {
180 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
181 continue;
182 }
183
184
185
186
187
188 if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
189 map->mapping, initial_map, false))) {
190 ORTE_ERROR_LOG(rc);
191 goto error;
192 }
193
194 initial_map = false;
195
196
197 if (0 == app->num_procs) {
198 if (NULL != orte_rankfile) {
199
200
201
202 app->num_procs = num_ranks;
203 } else {
204
205 app->num_procs = num_slots;
206 }
207 }
208 for (k=0; k < app->num_procs; k++) {
209 rank = vpid_start + k;
210
211 if (NULL == (rfmap = (orte_rmaps_rank_file_map_t*)opal_pointer_array_get_item(&rankmap, rank))) {
212
213 if (NULL != opal_hwloc_base_cpu_list) {
214 slots = opal_hwloc_base_cpu_list;
215
216 node = NULL;
217 OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) {
218
219
220 if (nd->slots <= (int)nd->num_procs) {
221 continue;
222 }
223
224 node = nd;
225 break;
226 }
227 if (NULL == node) {
228
229 k = UINT32_MAX;
230 OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) {
231 if (nd->num_procs < (orte_vpid_t)k) {
232 k = nd->num_procs;
233 node = nd;
234 }
235 }
236 }
237
238 if (NULL == node) {
239 rc = ORTE_ERR_OUT_OF_RESOURCE;
240 goto error;
241 }
242 } else {
243
244 orte_show_help("help-rmaps_rank_file.txt", "missing-rank", true, rank, orte_rankfile);
245 rc = ORTE_ERR_SILENT;
246 goto error;
247 }
248 } else {
249 if (0 == strlen(rfmap->slot_list)) {
250
251 orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name);
252 rc = ORTE_ERR_SILENT;
253 goto error;
254 }
255 slots = rfmap->slot_list;
256
257 node = NULL;
258 OPAL_LIST_FOREACH(nd, &node_list, orte_node_t) {
259 if (NULL != rfmap->node_name &&
260 0 == strcmp(nd->name, rfmap->node_name)) {
261 node = nd;
262 break;
263 } else if (NULL != rfmap->node_name &&
264 (('+' == rfmap->node_name[0]) &&
265 (('n' == rfmap->node_name[1]) ||
266 ('N' == rfmap->node_name[1])))) {
267
268 relative_index=atoi(strtok(rfmap->node_name,"+n"));
269 if ( relative_index >= (int)opal_list_get_size (&node_list) || ( 0 > relative_index)){
270 orte_show_help("help-rmaps_rank_file.txt","bad-index", true,rfmap->node_name);
271 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
272 return ORTE_ERR_BAD_PARAM;
273 }
274 root_node = (orte_node_t*) opal_list_get_first(&node_list);
275 for(tmp_cnt=0; tmp_cnt<relative_index; tmp_cnt++) {
276 root_node = (orte_node_t*) opal_list_get_next(root_node);
277 }
278 node = root_node;
279 break;
280 }
281 }
282 }
283 if (NULL == node) {
284 orte_show_help("help-rmaps_rank_file.txt","bad-host", true, rfmap->node_name);
285 rc = ORTE_ERR_SILENT;
286 goto error;
287 }
288
289 if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
290 OBJ_RETAIN(node);
291 opal_pointer_array_add(map->nodes, node);
292 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
293 ++(jdata->map->num_nodes);
294 }
295 if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, i))) {
296 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
297 rc = ORTE_ERR_OUT_OF_RESOURCE;
298 goto error;
299 }
300 if ((node->slots < (int)node->num_procs) ||
301 (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
302 if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
303 orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
304 true, node->num_procs, app->app);
305 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
306 rc = ORTE_ERR_SILENT;
307 goto error;
308 }
309
310
311
312 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
313 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
314 }
315
316 proc->name.vpid = rank;
317
318 if (NULL != slots) {
319
320 hwloc_cpuset_t bitmap;
321 char *cpu_bitmap;
322 if (NULL == node->topology || NULL == node->topology->topo) {
323
324
325
326 orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology", true, node->name);
327 rc = ORTE_ERR_SILENT;
328 goto error;
329 }
330 bitmap = hwloc_bitmap_alloc();
331
332 if (ORTE_SUCCESS != (rc = opal_hwloc_base_cpu_list_parse(slots, node->topology->topo, rtype, bitmap))) {
333 ORTE_ERROR_LOG(rc);
334 hwloc_bitmap_free(bitmap);
335 goto error;
336 }
337
338
339
340
341
342 hwloc_bitmap_list_asprintf(&cpu_bitmap, bitmap);
343 orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
344
345 free(cpu_bitmap);
346 hwloc_bitmap_free(bitmap);
347 }
348
349
350 if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
351 proc->name.vpid, proc))) {
352 ORTE_ERROR_LOG(rc);
353 return rc;
354 }
355 jdata->num_procs++;
356 }
357
358 vpid_start += app->num_procs;
359
360
361
362 while (NULL != (item = opal_list_remove_first(&node_list))) {
363 OBJ_RELEASE(item);
364 }
365 OBJ_DESTRUCT(&node_list);
366 OBJ_CONSTRUCT(&node_list, opal_list_t);
367 }
368 OBJ_DESTRUCT(&node_list);
369
370
371 for (i=0; i < rankmap.size; i++) {
372 if (NULL != (rfmap = opal_pointer_array_get_item(&rankmap, i))) {
373 OBJ_RELEASE(rfmap);
374 }
375 }
376 OBJ_DESTRUCT(&rankmap);
377
378 orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
379
380 return rc;
381
382 error:
383 OPAL_LIST_DESTRUCT(&node_list);
384
385 return rc;
386 }
387
388 static int orte_rmaps_rank_file_parse(const char *rankfile)
389 {
390 int token;
391 int rc = ORTE_SUCCESS;
392 int cnt;
393 char* node_name = NULL;
394 char** argv;
395 char buff[64];
396 char* value;
397 int rank=-1;
398 int i;
399 orte_node_t *hnp_node;
400 orte_rmaps_rank_file_map_t *rfmap=NULL;
401 opal_pointer_array_t *assigned_ranks_array;
402 char tmp_rank_assignment[64];
403
404
405 assigned_ranks_array = OBJ_NEW(opal_pointer_array_t);
406
407
408 hnp_node = (orte_node_t*)(orte_node_pool->addr[0]);
409
410 orte_rmaps_rank_file_name_cur = rankfile;
411 orte_rmaps_rank_file_done = false;
412 orte_rmaps_rank_file_in = fopen(rankfile, "r");
413
414 if (NULL == orte_rmaps_rank_file_in) {
415 orte_show_help("help-rmaps_rank_file.txt", "no-rankfile", true, rankfile);
416 rc = OPAL_ERR_NOT_FOUND;
417 ORTE_ERROR_LOG(rc);
418 goto unlock;
419 }
420
421 while (!orte_rmaps_rank_file_done) {
422 token = orte_rmaps_rank_file_lex();
423
424 switch (token) {
425 case ORTE_RANKFILE_ERROR:
426 orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
427 rc = ORTE_ERR_BAD_PARAM;
428 ORTE_ERROR_LOG(rc);
429 goto unlock;
430 break;
431 case ORTE_RANKFILE_QUOTED_STRING:
432 orte_show_help("help-rmaps_rank_file.txt", "not-supported-rankfile", true, "QUOTED_STRING", rankfile);
433 rc = ORTE_ERR_BAD_PARAM;
434 ORTE_ERROR_LOG(rc);
435 goto unlock;
436 case ORTE_RANKFILE_NEWLINE:
437 rank = -1;
438 if (NULL != node_name) {
439 free(node_name);
440 }
441 node_name = NULL;
442 rfmap = NULL;
443 break;
444 case ORTE_RANKFILE_RANK:
445 token = orte_rmaps_rank_file_lex();
446 if (ORTE_RANKFILE_INT == token) {
447 rank = orte_rmaps_rank_file_value.ival;
448 rfmap = OBJ_NEW(orte_rmaps_rank_file_map_t);
449 opal_pointer_array_set_item(&rankmap, rank, rfmap);
450 num_ranks++;
451 } else {
452 orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
453 rc = ORTE_ERR_BAD_PARAM;
454 ORTE_ERROR_LOG(rc);
455 goto unlock;
456 }
457 break;
458 case ORTE_RANKFILE_USERNAME:
459 orte_show_help("help-rmaps_rank_file.txt", "not-supported-rankfile", true, "USERNAME", rankfile);
460 rc = ORTE_ERR_BAD_PARAM;
461 ORTE_ERROR_LOG(rc);
462 goto unlock;
463 break;
464 case ORTE_RANKFILE_EQUAL:
465 if (rank < 0) {
466 orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
467 rc = ORTE_ERR_BAD_PARAM;
468 ORTE_ERROR_LOG(rc);
469 goto unlock;
470 }
471 token = orte_rmaps_rank_file_lex();
472 switch (token) {
473 case ORTE_RANKFILE_HOSTNAME:
474 case ORTE_RANKFILE_IPV4:
475 case ORTE_RANKFILE_IPV6:
476 case ORTE_RANKFILE_STRING:
477 case ORTE_RANKFILE_INT:
478 case ORTE_RANKFILE_RELATIVE:
479 if(ORTE_RANKFILE_INT == token) {
480 sprintf(buff,"%d", orte_rmaps_rank_file_value.ival);
481 value = buff;
482 } else {
483 value = orte_rmaps_rank_file_value.sval;
484 }
485 argv = opal_argv_split (value, '@');
486 cnt = opal_argv_count (argv);
487 if (NULL != node_name) {
488 free(node_name);
489 }
490 if (1 == cnt) {
491 node_name = strdup(argv[0]);
492 } else if (2 == cnt) {
493 node_name = strdup(argv[1]);
494 } else {
495 orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
496 rc = ORTE_ERR_BAD_PARAM;
497 ORTE_ERROR_LOG(rc);
498 opal_argv_free(argv);
499 node_name = NULL;
500 goto unlock;
501 }
502 opal_argv_free (argv);
503
504
505 if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(node_name) ) {
506 char *ptr;
507 if (NULL != (ptr = strchr(node_name, '.'))) {
508 *ptr = '\0';
509 }
510 }
511
512
513 if (NULL == rfmap) {
514 orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
515 rc = ORTE_ERR_BAD_PARAM;
516 ORTE_ERROR_LOG(rc);
517 goto unlock;
518 }
519
520 if (orte_ifislocal(node_name)) {
521 rfmap->node_name = strdup(hnp_node->name);
522 } else {
523 rfmap->node_name = strdup(node_name);
524 }
525 }
526 break;
527 case ORTE_RANKFILE_SLOT:
528 if (NULL == node_name || rank < 0 ||
529 NULL == (value = orte_rmaps_rank_file_parse_string_or_int())) {
530 orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
531 rc = ORTE_ERR_BAD_PARAM;
532 ORTE_ERROR_LOG(rc);
533 goto unlock;
534 }
535
536
537 if (NULL != opal_pointer_array_get_item(assigned_ranks_array, rank)) {
538 orte_show_help("help-rmaps_rank_file.txt", "bad-assign", true, rank,
539 opal_pointer_array_get_item(assigned_ranks_array, rank), rankfile);
540 rc = ORTE_ERR_BAD_PARAM;
541 free(value);
542 goto unlock;
543 } else {
544
545 sprintf(tmp_rank_assignment, "%s slot=%s", node_name, value);
546 opal_pointer_array_set_item(assigned_ranks_array, 0, tmp_rank_assignment);
547 }
548
549
550 if (NULL == rfmap) {
551 orte_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
552 rc = ORTE_ERR_BAD_PARAM;
553 ORTE_ERROR_LOG(rc);
554 free(value);
555 goto unlock;
556 }
557 for (i=0; i < 64 && '\0' != value[i]; i++) {
558 rfmap->slot_list[i] = value[i];
559 }
560 free(value);
561 break;
562 }
563 }
564 fclose(orte_rmaps_rank_file_in);
565 orte_rmaps_rank_file_lex_destroy ();
566
567 unlock:
568 if (NULL != node_name) {
569 free(node_name);
570 }
571 OBJ_RELEASE(assigned_ranks_array);
572 orte_rmaps_rank_file_name_cur = NULL;
573 return rc;
574 }
575
576
577 static char *orte_rmaps_rank_file_parse_string_or_int(void)
578 {
579 int rc;
580 char tmp_str[64];
581
582 if (ORTE_RANKFILE_EQUAL != orte_rmaps_rank_file_lex()){
583 return NULL;
584 }
585
586 rc = orte_rmaps_rank_file_lex();
587 switch (rc) {
588 case ORTE_RANKFILE_STRING:
589 return strdup(orte_rmaps_rank_file_value.sval);
590 case ORTE_RANKFILE_INT:
591 sprintf(tmp_str,"%d",orte_rmaps_rank_file_value.ival);
592 return strdup(tmp_str);
593 default:
594 return NULL;
595
596 }
597
598 }