root/orte/mca/plm/base/plm_base_orted_cmds.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. failed_cmd
  2. orte_plm_base_orted_exit
  3. orte_plm_base_orted_terminate_job
  4. orte_plm_base_orted_kill_local_procs
  5. orte_plm_base_orted_signal_local_procs

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2011 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
  13  *                         All rights reserved.
  14  * Copyright (c) 2014      Intel, Inc.  All rights reserved.
  15  * $COPYRIGHT$
  16  *
  17  * Additional copyrights may follow
  18  *
  19  * $HEADER$
  20  *
  21  */
  22 
  23 #include "orte_config.h"
  24 #include "orte/constants.h"
  25 
  26 #include <string.h>
  27 #ifdef HAVE_SYS_TIME_H
  28 #include <sys/time.h>
  29 #endif
  30 
  31 
  32 #include "opal/dss/dss.h"
  33 #include "opal/mca/event/event.h"
  34 
  35 #include "orte/mca/odls/odls_types.h"
  36 #include "orte/mca/grpcomm/base/base.h"
  37 #include "orte/mca/errmgr/errmgr.h"
  38 #include "orte/mca/ess/ess.h"
  39 #include "orte/mca/rml/rml.h"
  40 #include "orte/mca/rml/rml_types.h"
  41 #include "orte/runtime/orte_globals.h"
  42 #include "orte/runtime/orte_wait.h"
  43 #include "orte/util/name_fns.h"
  44 #include "orte/util/proc_info.h"
  45 #include "orte/mca/state/state.h"
  46 #include "orte/runtime/orte_wait.h"
  47 #include "orte/orted/orted.h"
  48 
  49 #include "orte/mca/plm/base/base.h"
  50 #include "orte/mca/plm/base/plm_private.h"
  51 
  52 #if 0
  53 static void failed_cmd(int fd, short event, void *cbdata)
  54 {
  55     orte_timer_t *tm = (orte_timer_t*)cbdata;
  56 
  57     /* we get called if an abnormal term
  58      * don't complete in time - just force exit
  59      */
  60     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
  61                          "%s plm:base:orted_cmd command timed out",
  62                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
  63     OBJ_RELEASE(tm);
  64 /*
  65     ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
  66 */
  67 }
  68 #endif
  69 
  70 int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
  71 {
  72     int rc;
  73     opal_buffer_t *cmd;
  74     orte_daemon_cmd_flag_t cmmnd;
  75     orte_grpcomm_signature_t *sig;
  76 
  77     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
  78                          "%s plm:base:orted_cmd sending orted_exit commands",
  79                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
  80 
  81     /* flag that orteds are being terminated */
  82     orte_orteds_term_ordered = true;
  83     cmmnd = command;
  84 
  85     /* if we are terminating before launch, or abnormally
  86      * terminating, then the daemons may not be wired up
  87      * and therefore cannot depend on detecting their
  88      * routed children to determine termination
  89      */
  90     if (orte_abnormal_term_ordered ||
  91         orte_never_launched ||
  92         !orte_routing_is_enabled) {
  93         cmmnd = ORTE_DAEMON_HALT_VM_CMD;
  94     }
  95 
  96     /* send it express delivery! */
  97     cmd = OBJ_NEW(opal_buffer_t);
  98     /* pack the command */
  99     if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &cmmnd, 1, ORTE_DAEMON_CMD))) {
 100         ORTE_ERROR_LOG(rc);
 101         OBJ_RELEASE(cmd);
 102         return rc;
 103     }
 104     /* goes to all daemons */
 105     sig = OBJ_NEW(orte_grpcomm_signature_t);
 106     sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
 107     sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
 108     sig->signature[0].vpid = ORTE_VPID_WILDCARD;
 109     if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, cmd))) {
 110         ORTE_ERROR_LOG(rc);
 111     }
 112     OBJ_RELEASE(cmd);
 113     OBJ_RELEASE(sig);
 114 
 115 #if 0
 116     /* if we are abnormally ordering the termination, then
 117      * set a timeout in case it never finishes
 118      */
 119     if (orte_abnormal_term_ordered) {
 120         ORTE_DETECT_TIMEOUT(orte_process_info.num_procs, 100, 3, failed_cmd, NULL);
 121     }
 122 #endif
 123     return rc;
 124 }
 125 
 126 
 127 int orte_plm_base_orted_terminate_job(orte_jobid_t jobid)
 128 {
 129     opal_pointer_array_t procs;
 130     orte_proc_t proc;
 131     int rc;
 132 
 133     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 134                          "%s plm:base:orted_terminate job %s",
 135                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 136                          ORTE_JOBID_PRINT(jobid)));
 137 
 138     OBJ_CONSTRUCT(&procs, opal_pointer_array_t);
 139     opal_pointer_array_init(&procs, 1, 1, 1);
 140     OBJ_CONSTRUCT(&proc, orte_proc_t);
 141     proc.name.jobid = jobid;
 142     proc.name.vpid = ORTE_VPID_WILDCARD;
 143     opal_pointer_array_add(&procs, &proc);
 144     if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(&procs))) {
 145         ORTE_ERROR_LOG(rc);
 146     }
 147     OBJ_DESTRUCT(&procs);
 148     OBJ_DESTRUCT(&proc);
 149     return rc;
 150 }
 151 
 152 int orte_plm_base_orted_kill_local_procs(opal_pointer_array_t *procs)
 153 {
 154     int rc;
 155     opal_buffer_t *cmd;
 156     orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS;
 157     int v;
 158     orte_proc_t *proc;
 159     orte_grpcomm_signature_t *sig;
 160 
 161     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 162                          "%s plm:base:orted_cmd sending kill_local_procs cmds",
 163                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 164 
 165     cmd = OBJ_NEW(opal_buffer_t);
 166     /* pack the command */
 167     if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
 168         ORTE_ERROR_LOG(rc);
 169         OBJ_RELEASE(cmd);
 170         return rc;
 171     }
 172 
 173     /* pack the proc names */
 174     if (NULL != procs) {
 175         for (v=0; v < procs->size; v++) {
 176             if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(procs, v))) {
 177                 continue;
 178             }
 179             if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &(proc->name), 1, ORTE_NAME))) {
 180                 ORTE_ERROR_LOG(rc);
 181                 OBJ_RELEASE(cmd);
 182                 return rc;
 183             }
 184         }
 185     }
 186     /* goes to all daemons */
 187     sig = OBJ_NEW(orte_grpcomm_signature_t);
 188     sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
 189     sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
 190     sig->signature[0].vpid = ORTE_VPID_WILDCARD;
 191     if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, cmd))) {
 192         ORTE_ERROR_LOG(rc);
 193     }
 194     OBJ_RELEASE(cmd);
 195     OBJ_RELEASE(sig);
 196 
 197     /* we're done! */
 198     return rc;
 199 }
 200 
 201 
 202 int orte_plm_base_orted_signal_local_procs(orte_jobid_t job, int32_t signal)
 203 {
 204     int rc;
 205     opal_buffer_t cmd;
 206     orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS;
 207     orte_grpcomm_signature_t *sig;
 208 
 209     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
 210                          "%s plm:base:orted_cmd sending signal_local_procs cmds",
 211                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 212 
 213     OBJ_CONSTRUCT(&cmd, opal_buffer_t);
 214 
 215     /* pack the command */
 216     if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
 217         ORTE_ERROR_LOG(rc);
 218         OBJ_DESTRUCT(&cmd);
 219         return rc;
 220     }
 221 
 222     /* pack the jobid */
 223     if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &job, 1, ORTE_JOBID))) {
 224         ORTE_ERROR_LOG(rc);
 225         OBJ_DESTRUCT(&cmd);
 226         return rc;
 227     }
 228 
 229     /* pack the signal */
 230     if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &signal, 1, OPAL_INT32))) {
 231         ORTE_ERROR_LOG(rc);
 232         OBJ_DESTRUCT(&cmd);
 233         return rc;
 234     }
 235 
 236     /* goes to all daemons */
 237     sig = OBJ_NEW(orte_grpcomm_signature_t);
 238     sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
 239     sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
 240     sig->signature[0].vpid = ORTE_VPID_WILDCARD;
 241     if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, &cmd))) {
 242         ORTE_ERROR_LOG(rc);
 243     }
 244     OBJ_DESTRUCT(&cmd);
 245     OBJ_RELEASE(sig);
 246 
 247     /* we're done! */
 248     return ORTE_SUCCESS;
 249 }

/* [<][>][^][v][top][bottom][index][help] */