*/
#define KNET_LINK_PONG_TIMEOUT_LAT_MUL 2
+/*
+ * Under heavy load with crypto enabled, it takes much
+ * longer to receive a response from the other node.
+ *
+ * 128 is a somewhat arbitrary number, but we want to set a limit
+ * and report failures after that.
+ */
+#define KNET_LINK_PMTUD_CRYPTO_TIMEOUT_MULTIPLIER_MIN 2
+#define KNET_LINK_PMTUD_CRYPTO_TIMEOUT_MULTIPLIER_MAX 128
+
int _link_updown(knet_handle_t knet_h, knet_node_id_t node_id, uint8_t link_id,
unsigned int enabled, unsigned int connected);
size_t app_mtu_len; /* real data that we can send onwire */
ssize_t len; /* len of what we were able to sendto onwire */
- struct timespec ts;
- unsigned long long pong_timeout_adj_tmp;
+ struct timespec ts, pmtud_crypto_start_ts, pmtud_crypto_stop_ts;
+ unsigned long long pong_timeout_adj_tmp, timediff;
+ int pmtud_crypto_reduce = 1;
unsigned char *outbuf = (unsigned char *)knet_h->pmtudbuf;
warn_once = 0;
return -1;
}
+ /*
+ * non-fatal: we can wait for the next round to reduce the
+ * multiplier
+ */
+ if (clock_gettime(CLOCK_MONOTONIC, &pmtud_crypto_start_ts) < 0) {
+ log_debug(knet_h, KNET_SUB_PMTUD, "Unable to get current time: %s", strerror(errno));
+ pmtud_crypto_reduce = 0;
+ }
+
/*
* set PMTUd reply timeout to match pong_timeout on a given link
*
/*
* crypto, under pressure, is a royal PITA
*/
- pong_timeout_adj_tmp = dst_link->pong_timeout_adj * 2;
+ pong_timeout_adj_tmp = dst_link->pong_timeout_adj * dst_link->pmtud_crypto_timeout_multiplier;
} else {
pong_timeout_adj_tmp = dst_link->pong_timeout_adj;
}
if (ret) {
if (ret == ETIMEDOUT) {
+ if ((knet_h->crypto_instance) && (dst_link->pmtud_crypto_timeout_multiplier < KNET_LINK_PMTUD_CRYPTO_TIMEOUT_MULTIPLIER_MAX)) {
+ dst_link->pmtud_crypto_timeout_multiplier = dst_link->pmtud_crypto_timeout_multiplier * 2;
+ pmtud_crypto_reduce = 0;
+ log_debug(knet_h, KNET_SUB_PMTUD,
+ "Increasing PMTUd response timeout multiplier to (%u) for host %u link: %u",
+ dst_link->pmtud_crypto_timeout_multiplier,
+ dst_host->host_id,
+ dst_link->link_id);
+ pthread_mutex_unlock(&knet_h->pmtud_mutex);
+ goto restart;
+ }
if (!warn_once) {
log_warn(knet_h, KNET_SUB_PMTUD,
"possible MTU misconfiguration detected. "
}
}
+ if ((knet_h->crypto_instance) && (pmtud_crypto_reduce == 1) &&
+ (dst_link->pmtud_crypto_timeout_multiplier > KNET_LINK_PMTUD_CRYPTO_TIMEOUT_MULTIPLIER_MIN)) {
+ if (!clock_gettime(CLOCK_MONOTONIC, &pmtud_crypto_stop_ts)) {
+ timespec_diff(pmtud_crypto_start_ts, pmtud_crypto_stop_ts, &timediff);
+ if (((pong_timeout_adj_tmp * 1000) / 2) > timediff) {
+ dst_link->pmtud_crypto_timeout_multiplier = dst_link->pmtud_crypto_timeout_multiplier / 2;
+ log_debug(knet_h, KNET_SUB_PMTUD,
+ "Decreasing PMTUd response timeout multiplier to (%u) for host %u link: %u",
+ dst_link->pmtud_crypto_timeout_multiplier,
+ dst_host->host_id,
+ dst_link->link_id);
+ }
+ } else {
+ log_debug(knet_h, KNET_SUB_PMTUD, "Unable to get current time: %s", strerror(errno));
+ }
+ }
+
if ((dst_link->last_recv_mtu != onwire_len) || (ret)) {
dst_link->last_bad_mtu = onwire_len;
} else {