root/orte/orted/ft_tester.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. sample

   1 /*
   2  * Copyright (c) 2009-2011 Cisco Systems, Inc.  All rights reserved.
   3  * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
   4  *                         All rights reserved.
   5  * Copyright (c) 2014-2017 Intel, Inc.  All rights reserved.
   6  *
   7  * $COPYRIGHT$
   8  *
   9  * Additional copyrights may follow
  10  *
  11  * $HEADER$
  12  */
  13 
  14 #include "orte_config.h"
  15 #include "orte/constants.h"
  16 #include "orte/types.h"
  17 
  18 #include <errno.h>
  19 #ifdef HAVE_UNISTD_H
  20 #include <unistd.h>
  21 #endif  /* HAVE_UNISTD_H */
  22 #ifdef HAVE_STRING_H
  23 #include <string.h>
  24 #endif  /* HAVE_STRING_H */
  25 #include <stdio.h>
  26 #ifdef HAVE_SIGNAL_H
  27 #include <signal.h>
  28 #endif
  29 
  30 #include "opal_stdint.h"
  31 #include "opal/util/alfg.h"
  32 #include "opal/util/output.h"
  33 
  34 #include "orte/util/error_strings.h"
  35 #include "orte/util/name_fns.h"
  36 #include "orte/mca/errmgr/errmgr.h"
  37 #include "orte/runtime/orte_globals.h"
  38 
  39 #include "orte/mca/sensor/base/base.h"
  40 #include "orte/mca/sensor/base/sensor_private.h"
  41 #include "sensor_ft_tester.h"
  42 
  43 /* declare the API functions */
  44 static void sample(void);
  45 
  46 /* instantiate the module */
  47 orte_sensor_base_module_t orte_sensor_ft_tester_module = {
  48     NULL,
  49     NULL,
  50     NULL,
  51     NULL,
  52     sample,
  53     NULL
  54 };
  55 
  56 static void sample(void)
  57 {
  58     float prob;
  59     orte_proc_t *child;
  60     int i;
  61 
  62     OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
  63                          "%s sample:ft_tester considering killing something",
  64                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
  65 
  66     /* are we including ourselves? */
  67     if (ORTE_PROC_IS_DAEMON &&
  68         0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
  69         OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
  70                              "%s sample:ft_tester considering killing me!",
  71                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
  72         /* roll the dice */
  73         prob = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
  74         if (prob < mca_sensor_ft_tester_component.daemon_fail_prob) {
  75             /* commit suicide */
  76             OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
  77                                  "%s sample:ft_tester committing suicide",
  78                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
  79             orte_errmgr.abort(1, NULL);
  80             return;
  81         }
  82     }
  83 
  84     if (0 < mca_sensor_ft_tester_component.fail_prob) {
  85         /* see if we should kill a child */
  86         for (i=0; i < orte_local_children->size; i++) {
  87             if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
  88                 continue;
  89             }
  90             if (!child->alive || 0 == child->pid ||
  91                 ORTE_PROC_STATE_UNTERMINATED < child->state) {
  92                 OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
  93                                      "%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s",
  94                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  95                                      ORTE_NAME_PRINT(&child->name),
  96                                      child->alive ? "TRUE" : "FALSE",
  97                                      (unsigned long)child->pid, orte_proc_state_to_str(child->state)));
  98                 continue;
  99             }
 100             /* roll the dice */
 101             prob = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
 102             OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
 103                                  "%s sample:ft_tester child: %s dice: %f prob %f",
 104                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 105                                  ORTE_NAME_PRINT(&child->name),
 106                                  prob, mca_sensor_ft_tester_component.fail_prob));
 107             if (prob < mca_sensor_ft_tester_component.fail_prob) {
 108                 /* you shall die... */
 109                 OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
 110                                      "%s sample:ft_tester killing %s",
 111                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 112                                      ORTE_NAME_PRINT(&child->name)));
 113                 kill(child->pid, SIGTERM);
 114                 /* are we allowing multiple deaths */
 115                 if (!mca_sensor_ft_tester_component.multi_fail) {
 116                     break;
 117                 }
 118             }
 119         }
 120     }
 121 }

/* [<][>][^][v][top][bottom][index][help] */