From 13d3b1ebe28762c79e981931a41914fae5d04386 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 30 Jun 2017 20:02:44 -0700 Subject: [PATCH] bpf: Support for setting initial receive window This patch adds suppport for setting the initial advertized window from within a BPF_SOCK_OPS program. This can be used to support larger initial cwnd values in environments where it is known to be safe. Signed-off-by: Lawrence Brakmo Signed-off-by: David S. Miller --- include/net/tcp.h | 10 ++++++++++ include/uapi/linux/bpf.h | 4 ++++ net/ipv4/tcp_minisocks.c | 9 ++++++++- net/ipv4/tcp_output.c | 7 ++++++- 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 564af2dee236..d6bb3948203d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2068,4 +2068,14 @@ static inline u32 tcp_timeout_init(struct sock *sk) return timeout; } +static inline u32 tcp_rwnd_init_bpf(struct sock *sk) +{ + int rwnd; + + rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT); + + if (rwnd < 0) + rwnd = 0; + return rwnd; +} #endif /* _TCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 00702b294447..94d7ded1a6cf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -751,6 +751,10 @@ enum { BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or * -1 if default value should be used */ + BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized + * window (in packets) or -1 if default + * value should be used + */ }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d30ee31e94eb..0ff83c1637d8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -351,6 +351,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; + u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); @@ -363,6 +364,12 @@ void tcp_openreq_init_rwin(struct request_sock *req, (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; + rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); + if (rcv_wnd == 0) + rcv_wnd = dst_metric(dst, RTAX_INITRWND); + else if (full_space < rcv_wnd * mss) + full_space = rcv_wnd * mss; + /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -370,7 +377,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); + rcv_wnd); ireq->rcv_wscale = rcv_wscale; } EXPORT_SYMBOL(tcp_openreq_init_rwin); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 47fe0759a877..ef809426b538 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3266,6 +3266,7 @@ static void tcp_connect_init(struct sock *sk) const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; + u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. @@ -3299,13 +3300,17 @@ static void tcp_connect_init(struct sock *sk) (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) tp->window_clamp = tcp_full_space(sk); + rcv_wnd = tcp_rwnd_init_bpf(sk); + if (rcv_wnd == 0) + rcv_wnd = dst_metric(dst, RTAX_INITRWND); + tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, sock_net(sk)->ipv4.sysctl_tcp_window_scaling, &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); + rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; -- 2.39.5