root/opal/mca/btl/usnic/btl_usnic_stats.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. usnic_stats_reset
  2. opal_btl_usnic_print_stats
  3. usnic_stats_callback
  4. opal_btl_usnic_stats_init
  5. opal_btl_usnic_stats_finalize
  6. usnic_pvar_notify
  7. usnic_pvar_read
  8. register_pvar_highwater
  9. usnic_pvar_enum_read
  10. register_pvar_counter
  11. setup_mpit_pvar_type
  12. setup_mpit_pvars_enum
  13. setup_mpit_pvars_highwatermark
  14. setup_mpit_pvars_counters
  15. opal_btl_usnic_setup_mpit_pvars

   1 /*
   2  * Copyright (c) 2013-2017 Cisco Systems, Inc.  All rights reserved.
   3  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
   4  * $COPYRIGHT$
   5  *
   6  * Additional copyrights may follow
   7  *
   8  * $HEADER$
   9  */
  10 
  11 #include "opal_config.h"
  12 
  13 #include <unistd.h>
  14 #include <stdlib.h>
  15 
  16 #include "opal/util/output.h"
  17 #include "opal/util/printf.h"
  18 #include "opal/mca/base/mca_base_var.h"
  19 #include "opal/mca/base/mca_base_pvar.h"
  20 
  21 #include "btl_usnic_compat.h"
  22 #include "btl_usnic.h"
  23 #include "btl_usnic_module.h"
  24 #include "btl_usnic_stats.h"
  25 #include "btl_usnic_util.h"
  26 
/*
 * Local variables
 */
/* MCA variable type used when registering the stats pvars.  It keeps
   the MCA_BASE_VAR_TYPE_MAX sentinel until setup_mpit_pvar_type()
   finds an unsigned integer type the same size as uint64_t. */
static mca_base_var_type_t pvar_type = MCA_BASE_VAR_TYPE_MAX;
  31 
  32 static inline void usnic_stats_reset(opal_btl_usnic_module_t *module)
  33 {
  34     int i;
  35 
  36     module->stats.num_total_sends =
  37         module->stats.num_resends =
  38         module->stats.num_chunk_sends =
  39         module->stats.num_frag_sends =
  40         module->stats.num_ack_recvs =
  41 
  42         module->stats.num_total_recvs =
  43         module->stats.num_unk_recvs =
  44         module->stats.num_dup_recvs =
  45         module->stats.num_oow_low_recvs =
  46         module->stats.num_oow_high_recvs =
  47         module->stats.num_frag_recvs =
  48         module->stats.num_chunk_recvs =
  49         module->stats.num_badfrag_recvs =
  50         module->stats.num_ack_sends =
  51         module->stats.num_recv_reposts =
  52         module->stats.num_crc_errors =
  53 
  54         module->stats.num_old_dup_acks =
  55         module->stats.num_dup_acks =
  56         module->stats.num_fast_retrans =
  57         module->stats.num_timeout_retrans =
  58 
  59         module->stats.max_sent_window_size =
  60         module->stats.max_rcvd_window_size =
  61 
  62         module->stats.pml_module_sends =
  63         module->stats.pml_send_callbacks =
  64 
  65         module->stats.num_seg_total_completions =
  66         module->stats.num_seg_ack_completions =
  67         module->stats.num_seg_frag_completions =
  68         module->stats.num_seg_chunk_completions =
  69         module->stats.num_seg_recv_completions =
  70 
  71         0;
  72 
  73     for (i=0; i<USNIC_NUM_CHANNELS; ++i) {
  74         module->mod_channels[i].num_channel_sends = 0;
  75     }
  76 }
  77 
  78 /* Prints a few terse statistics lines via opal_output(0,...).  The first
  79  * line will be prefixed with the string "prefix".  If "reset_stats" is true
  80  * then the statistics will be reset after printing.
  81  *
  82  * NOTE: this routine ignores the setting of stats_enable, so it can be used
  83  * for debugging routines even when normal stats reporting is not enabled.
  84  */
  85 void opal_btl_usnic_print_stats(
  86     opal_btl_usnic_module_t *module,
  87     const char *prefix,
  88     bool reset_stats)
  89 {
  90     char tmp[128], str[2048];
  91 
  92     /* The usuals */
  93     snprintf(str, sizeof(str), "%s:MCW:%3u, %s, ST(P+D)/F/C/R(T+F)/A:%8lu(%8u+%8u)/%8lu/%8lu/%4lu(%4lu+%4lu)/%8lu, RcvTot/Chk/F/C/L/H/D/BF/A:%8lu/%c%c/%8lu/%8lu/%4lu+%2lu/%4lu/%4lu/%6lu Comp:T(A/F/C/R) %8lu(%8lu/%8lu/%8lu/%8lu), OA/DA %4lu/%4lu CRC:%4lu ",
  94              prefix,
  95              opal_proc_local_get()->proc_name.vpid,
  96 
  97              module->linux_device_name,
  98 
  99              module->stats.num_total_sends,
 100              module->mod_channels[USNIC_PRIORITY_CHANNEL].num_channel_sends,
 101              module->mod_channels[USNIC_DATA_CHANNEL].num_channel_sends,
 102              module->stats.num_frag_sends,
 103              module->stats.num_chunk_sends,
 104              module->stats.num_resends,
 105              module->stats.num_timeout_retrans,
 106              module->stats.num_fast_retrans,
 107              module->stats.num_ack_sends,
 108 
 109              module->stats.num_total_recvs,
 110              (module->stats.num_total_recvs -
 111               module->stats.num_recv_reposts) == 0 ? 'g' : 'B',
 112              (module->stats.num_total_recvs -
 113               module->stats.num_frag_recvs -
 114               module->stats.num_chunk_recvs -
 115               module->stats.num_badfrag_recvs -
 116               module->stats.num_oow_low_recvs -
 117               module->stats.num_oow_high_recvs -
 118               module->stats.num_dup_recvs -
 119               module->stats.num_ack_recvs -
 120               module->stats.num_unk_recvs) == 0 ? 'g' : 'B',
 121              module->stats.num_frag_recvs,
 122              module->stats.num_chunk_recvs,
 123              module->stats.num_oow_low_recvs,
 124              module->stats.num_oow_high_recvs,
 125              module->stats.num_dup_recvs,
 126              module->stats.num_badfrag_recvs,
 127              module->stats.num_ack_recvs,
 128 
 129              module->stats.num_seg_total_completions,
 130              module->stats.num_seg_ack_completions,
 131              module->stats.num_seg_frag_completions,
 132              module->stats.num_seg_chunk_completions,
 133              module->stats.num_seg_recv_completions,
 134 
 135              module->stats.num_old_dup_acks,
 136              module->stats.num_dup_acks,
 137 
 138              module->stats.num_crc_errors);
 139 
 140     // Shouldn't happen, but just in case the string ever grows long
 141     // enough to someday potentially get truncated by snprintf, ensure
 142     // that the string is terminated.
 143     str[sizeof(str) - 1] = '\0';
 144 
 145     /* If our PML calls were 0, then show send and receive window
 146        extents instead */
 147     if (module->stats.pml_module_sends +
 148         module->stats.pml_send_callbacks == 0) {
 149         int64_t send_unacked, su_min = WINDOW_SIZE * 2, su_max = 0;
 150         int64_t recv_depth, rd_min = WINDOW_SIZE * 2, rd_max = 0;
 151         opal_btl_usnic_endpoint_t *endpoint;
 152         opal_list_item_t *item;
 153 
 154         rd_min = su_min = WINDOW_SIZE * 2;
 155         rd_max = su_max = 0;
 156 
 157         opal_mutex_lock(&module->all_endpoints_lock);
 158         item = opal_list_get_first(&module->all_endpoints);
 159         while (item != opal_list_get_end(&(module->all_endpoints))) {
 160             endpoint = container_of(item, mca_btl_base_endpoint_t,
 161                     endpoint_endpoint_li);
 162             item = opal_list_get_next(item);
 163 
 164             /* Number of un-acked sends (i.e., sends for which we're
 165                still waiting for ACK) */
 166             send_unacked =
 167                 SEQ_DIFF(endpoint->endpoint_next_seq_to_send,
 168                          SEQ_DIFF(endpoint->endpoint_ack_seq_rcvd, 1));
 169 
 170             if (send_unacked > su_max) su_max = send_unacked;
 171             if (send_unacked < su_min) su_min = send_unacked;
 172 
 173             /* Receive window depth (i.e., difference between highest
 174                seq received and the next message we haven't ACKed
 175                yet) */
 176             recv_depth =
 177                 endpoint->endpoint_highest_seq_rcvd -
 178                 endpoint->endpoint_next_contig_seq_to_recv;
 179             if (recv_depth > rd_max) rd_max = recv_depth;
 180             if (recv_depth < rd_min) rd_min = recv_depth;
 181         }
 182         opal_mutex_unlock(&module->all_endpoints_lock);
 183         snprintf(tmp, sizeof(tmp), "PML S:%1ld, Win!A/R:%4ld/%4ld %4ld/%4ld",
 184                  module->stats.pml_module_sends,
 185                  su_min, su_max,
 186                  rd_min, rd_max);
 187     } else {
 188         snprintf(tmp, sizeof(tmp), "PML S/CB/Diff:%4lu/%4lu=%4ld",
 189                 module->stats.pml_module_sends,
 190                 module->stats.pml_send_callbacks,
 191                 module->stats.pml_module_sends -
 192                  module->stats.pml_send_callbacks);
 193     }
 194 
 195     strncat(str, tmp, sizeof(str) - strlen(str) - 1);
 196     opal_output(0, "%s", str);
 197 
 198     if (reset_stats) {
 199         usnic_stats_reset(module);
 200     }
 201 }
 202 
 203 /*
 204  * Callback routine for libevent
 205  */
 206 static void usnic_stats_callback(int fd, short flags, void *arg)
 207 {
 208     opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) arg;
 209     char tmp[128];
 210 
 211     if (!mca_btl_usnic_component.stats_enabled) {
 212         return;
 213     }
 214 
 215     snprintf(tmp, sizeof(tmp), "%4lu", ++module->stats.report_num);
 216 
 217     opal_btl_usnic_print_stats(module, tmp,
 218                                /*reset=*/mca_btl_usnic_component.stats_relative);
 219 }
 220 
 221 /*
 222  * Initialize usnic module statistics
 223  */
 224 int opal_btl_usnic_stats_init(opal_btl_usnic_module_t *module)
 225 {
 226     if (mca_btl_usnic_component.stats_enabled) {
 227         usnic_stats_reset(module);
 228 
 229         module->stats.timeout.tv_sec = mca_btl_usnic_component.stats_frequency;
 230         module->stats.timeout.tv_usec = 0;
 231 
 232         opal_event_set(mca_btl_usnic_component.opal_evbase,
 233                        &(module->stats.timer_event),
 234                        -1, EV_TIMEOUT | EV_PERSIST,
 235                        &usnic_stats_callback, module);
 236         opal_event_add(&(module->stats.timer_event),
 237                        &(module->stats.timeout));
 238     }
 239 
 240     return OPAL_SUCCESS;
 241 }
 242 
 243 /*
 244  * Finalize usnic module statistics
 245  */
 246 int opal_btl_usnic_stats_finalize(opal_btl_usnic_module_t *module)
 247 {
 248     /* Disable the stats callback event, and then call the stats
 249        callback manually to display the final stats */
 250     if (mca_btl_usnic_component.stats_enabled) {
 251         opal_event_del(&(module->stats.timer_event));
 252         opal_btl_usnic_print_stats(module, "final", /*reset_stats=*/false);
 253     }
 254 
 255     return OPAL_SUCCESS;
 256 }
 257 
 258 /************************************************************************/
 259 
 260 /*
 261  * Function called by the pvar base upon MPI_T_pvar_handle_alloc,
 262  * handle_start, and handle_stop.
 263  */
 264 static int usnic_pvar_notify(struct mca_base_pvar_t *pvar,
 265                              mca_base_pvar_event_t event,
 266                              void *obj, int *count)
 267 {
 268     if (MCA_BASE_PVAR_HANDLE_BIND == event) {
 269         *count = mca_btl_usnic_component.num_modules;
 270     }
 271 
 272     /* Don't care about the other events */
 273 
 274     return OPAL_SUCCESS;
 275 }
 276 
 277 
 278 /*
 279  * Function called by the pvar base when a user wants to read the
 280  * value of an MPI_T performance variable.
 281  */
 282 static int usnic_pvar_read(const struct mca_base_pvar_t *pvar,
 283                            void *value, void *bound_obj)
 284 {
 285     size_t offset = (size_t) pvar->ctx;
 286     uint64_t *array = (uint64_t*) value;
 287 
 288     for (int i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
 289         char *base = (char*) &(mca_btl_usnic_component.usnic_active_modules[i]->stats);
 290         array[i] = *((uint64_t*) (base + offset));
 291     }
 292 
 293     return OPAL_SUCCESS;
 294 }
 295 
 296 
 297 /*
 298  * Register an MPI_T performance variable of type CLASS_HIGHWATERMARK.
 299  */
 300 static void register_pvar_highwater(char *name, char *desc, size_t offset)
 301 {
 302     int rc __opal_attribute_unused__;
 303 
 304     rc = mca_base_component_pvar_register(&mca_btl_usnic_component.super.btl_version,
 305                                           name, desc,
 306                                           OPAL_INFO_LVL_5,
 307                                           MCA_BASE_PVAR_CLASS_HIGHWATERMARK,
 308                                           pvar_type,
 309                                           NULL, /* enumeration */
 310                                           MCA_BASE_VAR_BIND_NO_OBJECT,
 311                                           (MCA_BASE_PVAR_FLAG_READONLY |
 312                                            MCA_BASE_PVAR_FLAG_CONTINUOUS),
 313                                           usnic_pvar_read,
 314                                           NULL, /* write function */
 315                                           usnic_pvar_notify,
 316                                           (void *) offset);
 317     assert(rc >= 0);
 318 }
 319 
 320 
 321 /*
 322  * Function called by the pvar base when a user wants to read the
 323  * devices enum value.  The array is a simple list of 0..num_modules,
 324  * which will map to the strings in the devices_enum
 325  * setup_mpit_pvar_type().
 326  */
 327 static int usnic_pvar_enum_read(const struct mca_base_pvar_t *pvar,
 328                                 void *value, void *bound_obj)
 329 {
 330     int *array = (int *) value;
 331 
 332     for (int i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
 333         array[i] = i;
 334     }
 335 
 336     return OPAL_SUCCESS;
 337 }
 338 
 339 
 340 /*
 341  * Register an MPI_T performance variable of type CLASS_COUNTER.
 342  */
 343 static void register_pvar_counter(char *name, char *desc, size_t offset)
 344 {
 345     int rc __opal_attribute_unused__;
 346 
 347     rc = mca_base_component_pvar_register(&mca_btl_usnic_component.super.btl_version,
 348                                           name, desc,
 349                                           OPAL_INFO_LVL_5,
 350                                           MCA_BASE_PVAR_CLASS_COUNTER,
 351                                           pvar_type,
 352                                           NULL, /* enumeration */
 353                                           MCA_BASE_VAR_BIND_NO_OBJECT,
 354                                           (MCA_BASE_PVAR_FLAG_READONLY |
 355                                            MCA_BASE_PVAR_FLAG_CONTINUOUS),
 356                                           usnic_pvar_read,
 357                                           NULL, /* write function */
 358                                           usnic_pvar_notify,
 359                                           (void *) offset);
 360     assert(rc >= 0);
 361 }
 362 
 363 
 364 /*
 365  * Find the MPI_T type corresponding to our uint64_t counters and
 366  * highwatermarks.
 367  */
 368 static bool setup_mpit_pvar_type(void)
 369 {
 370     /* Our stats variables are uint64_t's, so find a pvar type that is
 371        compatible */
 372     if (sizeof(uint64_t) == sizeof(unsigned int)) {
 373         pvar_type = MCA_BASE_VAR_TYPE_UNSIGNED_INT;
 374     } else if (sizeof(uint64_t) == sizeof(unsigned long)) {
 375         pvar_type = MCA_BASE_VAR_TYPE_UNSIGNED_LONG;
 376     } else if (sizeof(uint64_t) == sizeof(unsigned long long)) {
 377         pvar_type = MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG;
 378     }
 379 
 380     /* Let the caller know if we found a compatible type or not */
 381     if (MCA_BASE_VAR_TYPE_MAX == pvar_type) {
 382         return false;
 383     }
 384     return true;
 385 }
 386 
 387 
 388 /*
 389  * Setup the usnic_X device enumeration pvar
 390  */
 391 static void setup_mpit_pvars_enum(void)
 392 {
 393     int i;
 394     int rc __opal_attribute_unused__;
 395     mca_base_var_enum_value_t *devices;
 396     static mca_base_var_enum_t *devices_enum;
 397     opal_btl_usnic_module_t *m;
 398     unsigned char *c;
 399     struct sockaddr_in *sin;
 400 
 401     devices = calloc(mca_btl_usnic_component.num_modules + 1,
 402                      sizeof(*devices));
 403     assert(devices != NULL);
 404 
 405     for (i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
 406         char *str;
 407 
 408         m = mca_btl_usnic_component.usnic_active_modules[i];
 409         sin = m->fabric_info->src_addr;
 410         c = (unsigned char*) &sin->sin_addr.s_addr;
 411 
 412         devices[i].value = i;
 413         rc = opal_asprintf(&str, "%s,%hhu.%hhu.%hhu.%hhu/%" PRIu32,
 414                       m->linux_device_name,
 415                       c[0], c[1], c[2], c[3],
 416                       usnic_netmask_to_cidrlen(sin->sin_addr.s_addr));
 417         assert(rc > 0);
 418         devices[i].string = str;
 419     }
 420     devices[i].string = NULL;
 421 
 422     rc = mca_base_var_enum_create("btl_usnic", devices, &devices_enum);
 423     assert(OPAL_SUCCESS == rc);
 424 
 425     rc = mca_base_component_pvar_register(&mca_btl_usnic_component.super.btl_version,
 426                                           "devices",
 427                                           "Enumeration representing which slot in btl_usnic_* MPI_T pvar value arrays correspond to which usnic_X Linux device",
 428                                           OPAL_INFO_LVL_5,
 429                                           MCA_BASE_PVAR_CLASS_STATE,
 430                                           MCA_BASE_VAR_TYPE_INT,
 431                                           devices_enum,
 432                                           MCA_BASE_VAR_BIND_NO_OBJECT,
 433                                           (MCA_BASE_PVAR_FLAG_READONLY |
 434                                            MCA_BASE_PVAR_FLAG_CONTINUOUS),
 435                                           usnic_pvar_enum_read,
 436                                           NULL, /* write function */
 437                                           usnic_pvar_notify,
 438                                           NULL /* context */);
 439     assert(rc >= 0);
 440 
 441     /* Free the strings (mca_base_var_enum_create() strdup()'ed them
 442        into private storage, so we don't need them any more) */
 443     for (int i = 0; i < mca_btl_usnic_component.num_modules; ++i) {
 444         free((char*) devices[i].string);
 445     }
 446     free(devices);
 447 
 448     /* The devices_enum has been RETAIN'ed by the pvar, so we can
 449        RELEASE it here, and the enum will be destroyed when the pvar
 450        is destroyed. */
 451     OBJ_RELEASE(devices_enum);
 452 }
 453 
 454 
 455 /*
 456  * Setup high watermark MPI_T performance variables
 457  */
 458 static void setup_mpit_pvars_highwatermark(void)
 459 {
 460 #define REGISTERHW(field, desc) \
 461     register_pvar_highwater(#field, (desc), offsetof(opal_btl_usnic_module_stats_t, field))
 462 
 463     REGISTERHW(max_sent_window_size,
 464                "Maximum number of entries in all send windows from this peer");
 465     REGISTERHW(max_rcvd_window_size,
 466                "Maximum number of entries in all receive windows to this peer");
 467 }
 468 
 469 
 470 /*
 471  * Setup counter MPI_T performance variables
 472  */
 473 static void setup_mpit_pvars_counters(void)
 474 {
 475 #define REGISTERC(field, desc) \
 476     register_pvar_counter(#field, (desc), offsetof(opal_btl_usnic_module_stats_t, field))
 477 
 478     REGISTERC(num_total_sends,
 479               "Total number of sends (MPI data, ACKs, retransmissions, etc.)");
 480     REGISTERC(num_resends,
 481               "Total number of all retransmissions");
 482     REGISTERC(num_timeout_retrans,
 483               "Number of times chunk retransmissions have occured because an ACK was not received within the timeout");
 484     REGISTERC(num_fast_retrans,
 485               "Number of times chunk retransmissions have occured because due to a repeated ACK");
 486     REGISTERC(num_chunk_sends,
 487               "Number of sends that were part of a larger MPI message fragment (i.e., the MPI message was so long that it had to be split into multiple MTU/network sends)");
 488     REGISTERC(num_frag_sends,
 489               "Number of sends where the entire MPI message fragment fit into a single MTU/network send");
 490     REGISTERC(num_ack_sends,
 491               "Number of ACKs sent (i.e., usNIC-BTL-to-usNIC-BTL control messages)");
 492 
 493     REGISTERC(num_total_recvs,
 494               "Total number of receives completed");
 495     REGISTERC(num_unk_recvs,
 496               "Number of receives with an unknown source or type, and therefore ignored by the usNIC BTL (this should never be >0)");
 497     REGISTERC(num_dup_recvs,
 498               "Number of duplicate receives");
 499     REGISTERC(num_oow_low_recvs,
 500               "Number of times a receive was out of the sliding window (on the low side)");
 501     REGISTERC(num_oow_high_recvs,
 502               "Number of times a receive was out of the sliding window (on the high side)");
 503     REGISTERC(num_frag_recvs,
 504               "Number of receives where the entire MPI message fragment fit into a single MTU/network send");
 505     REGISTERC(num_chunk_recvs,
 506               "Number of receives that were part of a larger MPI message fragment (i.e., this receive was reassembled into a larger MPI message fragment)");
 507     REGISTERC(num_badfrag_recvs,
 508               "Number of chunks received that had a bad fragment ID (this should never be >0)");
 509 
 510     REGISTERC(num_ack_recvs,
 511               "Total number of ACKs received");
 512     REGISTERC(num_old_dup_acks,
 513               "Number of old duplicate ACKs received (i.e., before the current expected ACK)");
 514     REGISTERC(num_dup_acks,
 515               "Number of duplicate ACKs received (i.e., the current expected ACK)");
 516 
 517     REGISTERC(num_recv_reposts,
 518               "Number of times buffers have been reposted for receives");
 519     REGISTERC(num_crc_errors,
 520               "Number of times receives were aborted because of a CRC error");
 521 
 522     REGISTERC(pml_module_sends,
 523               "Number of times the PML has called down to send a message");
 524     REGISTERC(pml_send_callbacks,
 525               "Number of times the usNIC BTL has called up to the PML to complete a send");
 526 }
 527 
 528 
 529 /*
 530  * Initialize MPI_T performance variables
 531  */
 532 int opal_btl_usnic_setup_mpit_pvars(void)
 533 {
 534     /* If we cannot find a compatible pvar type, we're done (i.e.,
 535        don't register any pvars) */
 536     if (!setup_mpit_pvar_type()) {
 537         return OPAL_SUCCESS;
 538     }
 539 
 540     /* Setup the usnic_X device enumeration pvar */
 541     setup_mpit_pvars_enum();
 542 
 543     /* Register watermark pvars */
 544     setup_mpit_pvars_highwatermark();
 545 
 546     /* If our counter stats are relative, don't report them through
 547        MPI_T, because MPI_T expects counters to be monotonically
 548        rising. */
 549     if (!mca_btl_usnic_component.stats_relative) {
 550         setup_mpit_pvars_counters();
 551     }
 552 
 553     /* All done */
 554     return OPAL_SUCCESS;
 555 }

/* [<][>][^][v][top][bottom][index][help] */