This source file includes the following definitions.
- mindist_map
- assign_locations
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 #include "orte_config.h"
28 #include "orte/constants.h"
29 #include "orte/types.h"
30
31 #include <errno.h>
32 #ifdef HAVE_UNISTD_H
33 #include <unistd.h>
34 #endif
35 #include <string.h>
36
37 #include "opal/mca/base/mca_base_var.h"
38
39 #include "orte/util/show_help.h"
40 #include "orte/mca/errmgr/errmgr.h"
41 #include "orte/util/error_strings.h"
42
43 #include "orte/mca/rmaps/base/rmaps_private.h"
44 #include "orte/mca/rmaps/base/base.h"
45 #include "orte/mca/rmaps/mindist/rmaps_mindist.h"
46
47 static int mindist_map(orte_job_t *jdata);
48 static int assign_locations(orte_job_t *jdata);
49
50 orte_rmaps_base_module_t orte_rmaps_mindist_module = {
51 .map_job = mindist_map,
52 .assign_locations = assign_locations
53 };
54
55
56
57
58 static int mindist_map(orte_job_t *jdata)
59 {
60 orte_app_context_t *app;
61 int i, j;
62 unsigned int k;
63 hwloc_obj_t obj = NULL;
64 opal_list_t node_list;
65 opal_list_t numa_list;
66 opal_list_item_t *item;
67 opal_list_item_t *numa_item;
68 opal_rmaps_numa_node_t *numa;
69 orte_node_t *node;
70 orte_proc_t *proc;
71 int nprocs_mapped;
72 int navg=0, nextra=0;
73 orte_std_cntr_t num_nodes, num_slots;
74 unsigned int npus, total_npus, num_procs_to_assign=0, required;
75 int rc;
76 mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
77 bool initial_map=true;
78 bool bynode = false;
79 float balance;
80 int extra_procs_to_assign=0, nxtra_nodes=0;
81 bool add_one=false;
82 bool oversubscribed=false;
83 int ret;
84
85
86
87
88 if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
89 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
90 "mca:rmaps:mindist: job %s is being restarted - mindist cannot map",
91 ORTE_JOBID_PRINT(jdata->jobid));
92 return ORTE_ERR_TAKE_NEXT_OPTION;
93 }
94 if (NULL != jdata->map->req_mapper &&
95 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
96
97 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
98 "mca:rmaps:mindist: job %s not using mindist mapper",
99 ORTE_JOBID_PRINT(jdata->jobid));
100 return ORTE_ERR_TAKE_NEXT_OPTION;
101 }
102 if (ORTE_MAPPING_BYDIST != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
103
104 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
105 "mca:rmaps:mindist: job %s not using mindist mapper",
106 ORTE_JOBID_PRINT(jdata->jobid));
107 return ORTE_ERR_TAKE_NEXT_OPTION;
108 }
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125 if (ORTE_MAPPING_SPAN & jdata->map->mapping) {
126
127 bynode = true;
128 } else {
129
130 bynode = false;
131 }
132
133 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
134 "mca:rmaps:mindist: mapping job %s",
135 ORTE_JOBID_PRINT(jdata->jobid));
136
137
138 if (NULL != jdata->map->last_mapper) {
139 free(jdata->map->last_mapper);
140 }
141 jdata->map->last_mapper = strdup(c->mca_component_name);
142
143
144 jdata->num_procs = 0;
145
146
147 for(i=0; i < jdata->apps->size; i++) {
148 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
149 continue;
150 }
151
152
153 OBJ_CONSTRUCT(&node_list, opal_list_t);
154
155
156
157
158
159 if (0 == app->num_procs && 1 < jdata->num_apps) {
160 orte_show_help("help-orte-rmaps-md.txt", "multi-apps-and-zero-np",
161 true, jdata->num_apps, NULL);
162 rc = ORTE_ERR_SILENT;
163 goto error;
164 }
165
166
167
168
169
170 if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
171 jdata->map->mapping, initial_map, false))) {
172 ORTE_ERROR_LOG(rc);
173 goto error;
174 }
175
176
177 if (num_slots < (int)app->num_procs) {
178 if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
179 orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
180 true, app->num_procs, app->app, orte_process_info.nodename);
181 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
182 return ORTE_ERR_SILENT;
183 }
184 oversubscribed = true;
185 }
186
187 num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
188
189 initial_map = false;
190
191
192 jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);
193
194 if (0 == app->num_procs) {
195
196 app->num_procs = num_slots;
197 }
198
199 nprocs_mapped = 0;
200 if (!num_nodes) {
201 rc = ORTE_ERR_SILENT;
202 goto error;
203 }
204 do {
205 if (bynode || (app->num_procs > num_slots)) {
206
207 bynode = true;
208
209 navg = ((int)app->num_procs - nprocs_mapped) / num_nodes;
210 nextra = app->num_procs - navg * num_nodes;
211 num_procs_to_assign = navg;
212 if (nextra > 0) {
213 num_procs_to_assign++;
214 }
215
216 balance = (float)(((int)app->num_procs - nprocs_mapped) - (navg * num_nodes)) / (float)num_nodes;
217 extra_procs_to_assign = (int)balance;
218 nxtra_nodes = 0;
219 add_one = false;
220 if (0 < (balance - (float)extra_procs_to_assign)) {
221
222 nxtra_nodes = ((int)app->num_procs - nprocs_mapped) - ((navg + extra_procs_to_assign) * num_nodes);
223
224
225
226 extra_procs_to_assign++;
227
228 add_one = true;
229 }
230 }
231
232 num_nodes = 0;
233
234 for (item = opal_list_get_first(&node_list);
235 item != opal_list_get_end(&node_list);
236 item = opal_list_get_next(item)) {
237 node = (orte_node_t*)item;
238
239 if (NULL == node->topology || NULL == node->topology->topo) {
240 orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology",
241 true, node->name);
242 rc = ORTE_ERR_SILENT;
243 goto error;
244 }
245
246
247
248 obj = hwloc_get_root_obj(node->topology->topo);
249 if (NULL == obj) {
250 orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology",
251 true, node->name);
252 rc = ORTE_ERR_SILENT;
253 goto error;
254 }
255
256 num_nodes++;
257
258
259 if (opal_hwloc_use_hwthreads_as_cpus) {
260 total_npus = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PU, 0, OPAL_HWLOC_AVAILABLE);
261 } else {
262 total_npus = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_CORE, 0, OPAL_HWLOC_AVAILABLE);
263 }
264
265 if (bynode) {
266 if (oversubscribed) {
267
268 if (add_one) {
269 if (0 == nxtra_nodes) {
270 --extra_procs_to_assign;
271 add_one = false;
272 } else {
273 --nxtra_nodes;
274 }
275 }
276
277 num_procs_to_assign = navg + extra_procs_to_assign;
278 }else if (node->slots <= node->slots_inuse) {
279
280 continue;
281 } else {
282
283
284
285
286
287
288
289
290 if (add_one) {
291 if (0 == nxtra_nodes) {
292 --extra_procs_to_assign;
293 add_one = false;
294 } else {
295 --nxtra_nodes;
296 }
297 }
298
299 if ((node->slots - node->slots_inuse) < (navg + extra_procs_to_assign)) {
300 num_procs_to_assign = node->slots - node->slots_inuse;
301
302 if (num_procs_to_assign == 0) {
303 continue;
304 }
305 } else {
306
307 num_procs_to_assign = navg + extra_procs_to_assign;
308 }
309 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
310 "mca:rmaps:mindist: %s node %s avg %d assign %d extra %d",
311 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
312 navg, num_procs_to_assign, extra_procs_to_assign);
313 }
314 } else {
315 num_procs_to_assign = ((int)app->num_procs - nprocs_mapped) > node->slots ?
316 node->slots : ((int)app->num_procs - nprocs_mapped);
317 }
318
319 if (bynode) {
320 if (total_npus < num_procs_to_assign) {
321
322 if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
323 orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
324 true, app->num_procs, app->app);
325 rc = ORTE_ERR_SILENT;
326 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
327 goto error;
328 } else {
329 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
330 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
331 }
332 }
333 }
334
335
336 opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE);
337 OBJ_CONSTRUCT(&numa_list, opal_list_t);
338 ret = opal_hwloc_get_sorted_numa_list(node->topology->topo, orte_rmaps_base.device, &numa_list);
339 if (ret > 1) {
340 orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:several-devices",
341 true, orte_rmaps_base.device, ret, node->name);
342 ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
343 rc = ORTE_ERR_TAKE_NEXT_OPTION;
344 goto error;
345 } else if (ret < 0) {
346 orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:device-not-found",
347 true, orte_rmaps_base.device, node->name);
348 ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
349 rc = ORTE_ERR_TAKE_NEXT_OPTION;
350 goto error;
351 }
352 if (opal_list_get_size(&numa_list) > 0) {
353 j = 0;
354 required = 0;
355 OPAL_LIST_FOREACH(numa, &numa_list, opal_rmaps_numa_node_t) {
356
357 if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, numa->index, OPAL_HWLOC_AVAILABLE))) {
358 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
359 return ORTE_ERR_NOT_FOUND;
360 }
361 npus = opal_hwloc_base_get_npus(node->topology->topo, obj);
362 if (bynode) {
363 required = num_procs_to_assign;
364 } else {
365 required = (num_procs_to_assign-j) > npus ? npus : (num_procs_to_assign-j);
366 }
367 for (k = 0; (k < required) && (nprocs_mapped < app->num_procs); k++) {
368 if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, i))) {
369 rc = ORTE_ERR_OUT_OF_RESOURCE;
370 goto error;
371 }
372 nprocs_mapped++;
373 j++;
374 orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
375 }
376 if ((nprocs_mapped == (int)app->num_procs) || ((int)num_procs_to_assign == j)) {
377 break;
378 }
379 }
380 if (0 != j) {
381
382 if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
383 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
384 OBJ_RETAIN(node);
385 jdata->map->num_nodes++;
386 opal_pointer_array_add(jdata->map->nodes, node);
387 }
388 opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
389 "mca:rmaps:mindist: assigned %d procs to node %s",
390 j, node->name);
391 }
392 } else {
393 if (hwloc_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_SOCKET) > 1) {
394
395 orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:no-pci-locality-info",
396 true, node->name);
397 }
398
399 ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
400 rc = ORTE_ERR_TAKE_NEXT_OPTION;
401 goto error;
402 }
403 while (NULL != (numa_item = opal_list_remove_first(&numa_list))) {
404 OBJ_RELEASE(numa_item);
405 }
406 OBJ_DESTRUCT(&numa_list);
407 if (bynode) {
408 nextra--;
409 if (nextra == 0) {
410 num_procs_to_assign--;
411 }
412 }
413 }
414 } while(bynode && nprocs_mapped < app->num_procs && 0 < num_nodes);
415
416
417
418
419
420 jdata->num_procs += app->num_procs;
421
422
423
424
425 while (NULL != (item = opal_list_remove_first(&node_list))) {
426 OBJ_RELEASE(item);
427 }
428 OBJ_DESTRUCT(&node_list);
429 }
430 free(orte_rmaps_base.device);
431
432 return ORTE_SUCCESS;
433
434 error:
435 while(NULL != (item = opal_list_remove_first(&node_list))) {
436 OBJ_RELEASE(item);
437 }
438 OBJ_DESTRUCT(&node_list);
439
440 return rc;
441 }
442
443 static int assign_locations(orte_job_t *jdata)
444 {
445 int j, k, m, n, npus;
446 orte_app_context_t *app;
447 orte_node_t *node;
448 orte_proc_t *proc;
449 hwloc_obj_t obj=NULL;
450 mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
451 int rc;
452 opal_list_t numa_list;
453 opal_rmaps_numa_node_t *numa;
454
455 if (NULL == jdata->map->last_mapper||
456 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) {
457
458 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
459 "mca:rmaps:mindist: job %s not using mindist mapper",
460 ORTE_JOBID_PRINT(jdata->jobid));
461 return ORTE_ERR_TAKE_NEXT_OPTION;
462 }
463
464 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
465 "mca:rmaps:mindist: assign locations for job %s",
466 ORTE_JOBID_PRINT(jdata->jobid));
467
468
469
470
471
472 for (n=0; n < jdata->apps->size; n++) {
473 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
474 continue;
475 }
476 for (m=0; m < jdata->map->nodes->size; m++) {
477 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
478 continue;
479 }
480 if (NULL == node->topology || NULL == node->topology->topo) {
481 orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
482 true, node->name);
483 return ORTE_ERR_SILENT;
484 }
485
486
487
488 opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE);
489 OBJ_CONSTRUCT(&numa_list, opal_list_t);
490 rc = opal_hwloc_get_sorted_numa_list(node->topology->topo, orte_rmaps_base.device, &numa_list);
491 if (rc > 1) {
492 orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:several-devices",
493 true, orte_rmaps_base.device, rc, node->name);
494 ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
495 OPAL_LIST_DESTRUCT(&numa_list);
496 return ORTE_ERR_TAKE_NEXT_OPTION;
497 } else if (rc < 0) {
498 orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:device-not-found",
499 true, orte_rmaps_base.device, node->name);
500 ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
501 OPAL_LIST_DESTRUCT(&numa_list);
502 return ORTE_ERR_TAKE_NEXT_OPTION;
503 }
504 j = 0;
505 OPAL_LIST_FOREACH(numa, &numa_list, opal_rmaps_numa_node_t) {
506
507 if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, numa->index, OPAL_HWLOC_AVAILABLE))) {
508 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
509 OPAL_LIST_DESTRUCT(&numa_list);
510 return ORTE_ERR_NOT_FOUND;
511 }
512 npus = opal_hwloc_base_get_npus(node->topology->topo, obj);
513
514
515 for (k = j; k < node->procs->size && 0 < npus; k++) {
516 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) {
517 continue;
518 }
519 if (proc->name.jobid != jdata->jobid) {
520 continue;
521 }
522 orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
523 opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
524 "mca:rmaps:mindist: assigning proc %d to numa %d", k, numa->index);
525 ++j;
526 --npus;
527 }
528 }
529 OPAL_LIST_DESTRUCT(&numa_list);
530 }
531 }
532
533 return ORTE_SUCCESS;
534 }