bpf: tcp: Add bpf_skops_hdr_opt_len() and bpf_skops_write_hdr_opt()
author     Martin KaFai Lau <kafai@fb.com>
           Thu, 20 Aug 2020 19:00:52 +0000 (12:00 -0700)
committer  Alexei Starovoitov <ast@kernel.org>
           Mon, 24 Aug 2020 21:35:00 +0000 (14:35 -0700)

The bpf prog needs to parse the SYN header to learn what options have
been sent by the peer's bpf prog before writing its own options into the
SYNACK.  This patch adds a "syn_skb" arg to tcp_make_synack() and
send_synack().  This syn_skb will eventually be made available (as
read-only) to the bpf prog.  It will be the only SYN packet available to
the bpf prog in the syncookie case.  For the other regular cases, the bpf
prog can also use the saved_syn.
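
For illustration, reading an option out of the peer's SYN from a sock_ops
prog could look like the minimal sketch below.  It assumes the
bpf_load_hdr_opt() helper, its BPF_LOAD_HDR_OPT_TCP_SYN flag and the
BPF_SOCK_OPS_HDR_OPT_LEN_CB callback that later patches in this series
are expected to add; the option kind 42 is made up for the example.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int parse_peer_syn(struct bpf_sock_ops *skops)
{
	/* buf[0] is the option kind to search for (42 is made up here);
	 * on success the helper copies the whole option into buf.
	 */
	unsigned char buf[8] = { 42, };
	int found;

	if (skops->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
		return 1;

	/* Search the peer's SYN instead of the skb being built */
	found = bpf_load_hdr_opt(skops, buf, sizeof(buf),
				 BPF_LOAD_HDR_OPT_TCP_SYN) > 0;
	bpf_printk("peer sent option 42 in its SYN: %d", found);

	return 1;
}

char _license[] SEC("license") = "GPL";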

When writing options, the bpf prog will first be called to tell the
kernel how many bytes it needs.  This is done by the new
bpf_skops_hdr_opt_len().  The bpf prog will only be called when the new
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG is set in tp->bpf_sock_ops_cb_flags.
When the bpf prog returns, the kernel will know how many bytes are needed
and will update the "*remaining" arg accordingly.  The 4-byte alignment is
already accounted for in "*remaining" before this function returns, and the
4-byte aligned number of bytes is also stored in opts->bpf_opt_len.
"bpf_opt_len" is a newly added member of struct tcp_out_options.

The new bpf_skops_write_hdr_opt() will then call the bpf prog to write the
header options.  The bpf prog is only called if it has reserved space
beforehand (opts->bpf_opt_len > 0).

The bpf prog is the last one to get a chance to reserve header space
and write header options.
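
From the bpf prog's point of view, the two-step flow could look like the
sketch below.  It assumes the BPF_SOCK_OPS_HDR_OPT_LEN_CB and
BPF_SOCK_OPS_WRITE_HDR_OPT_CB callbacks and the bpf_reserve_hdr_opt() /
bpf_store_hdr_opt() helpers that later patches in this series are expected
to add; the option itself (kind 42) is made up for the example.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int write_hdr_opt(struct bpf_sock_ops *skops)
{
	/* Made-up option: kind 42, total length 4, two payload bytes */
	unsigned char opt[4] = { 42, 4, 0xbe, 0xef };

	switch (skops->op) {
	case BPF_SOCK_OPS_TCP_CONNECT_CB:	/* active open */
	case BPF_SOCK_OPS_TCP_LISTEN_CB:	/* passive open (SYNACK) */
		/* Ask the kernel to call back for header-option writing */
		bpf_sock_ops_cb_flags_set(skops,
					  skops->bpf_sock_ops_cb_flags |
					  BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG);
		break;
	case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
		/* Step 1: reserve the bytes (kernel does the 4-byte align) */
		bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
		break;
	case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
		/* Step 2: space was reserved, write the complete option */
		bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);
		break;
	}

	return 1;
}

char _license[] SEC("license") = "GPL";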

These two functions are only half implemented here to highlight the changes
in the TCP stack.  The actual code preparing the bpf running context and
invoking the bpf prog will be added in a later patch together with the other
necessary bpf pieces.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/bpf/20200820190052.2885316-1-kafai@fb.com
include/net/tcp.h
include/uapi/linux/bpf.h
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_output.c
net/ipv6/tcp_ipv6.c
tools/include/uapi/linux/bpf.h

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c186dbf731e159c4a847d2924b691df92cc4db6e..3e768a6b82641c7897a9db401ed8d8a994dfe397 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -455,7 +455,8 @@ enum tcp_synack_type {
 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
                                struct request_sock *req,
                                struct tcp_fastopen_cookie *foc,
-                               enum tcp_synack_type synack_type);
+                               enum tcp_synack_type synack_type,
+                               struct sk_buff *syn_skb);
 int tcp_disconnect(struct sock *sk, int flags);
 
 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
@@ -2035,7 +2036,8 @@ struct tcp_request_sock_ops {
        int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
                           struct flowi *fl, struct request_sock *req,
                           struct tcp_fastopen_cookie *foc,
-                          enum tcp_synack_type synack_type);
+                          enum tcp_synack_type synack_type,
+                          struct sk_buff *syn_skb);
 };
 
 extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 010ed2abcb668eea5be7a3e25fe481f2d26c587d..18d0e128bc3c79671695f5bf4568e56ccaa55b1f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4175,8 +4175,9 @@ enum {
        BPF_SOCK_OPS_RTT_CB_FLAG        = (1<<3),
        BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG  = (1<<4),
        BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5),
+       BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6),
 /* Mask of all currently supported cb flags */
-       BPF_SOCK_OPS_ALL_CB_FLAGS       = 0x3F,
+       BPF_SOCK_OPS_ALL_CB_FLAGS       = 0x7F,
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b520450170d1275d6fcad8aabe0c966cdfb205a7..8c9da4b65dae07722c69f17f3f9a9892295d0074 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6824,7 +6824,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
        }
        if (fastopen_sk) {
                af_ops->send_synack(fastopen_sk, dst, &fl, req,
-                                   &foc, TCP_SYNACK_FASTOPEN);
+                                   &foc, TCP_SYNACK_FASTOPEN, skb);
                /* Add the child socket directly into the accept queue */
                if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
                        reqsk_fastopen_remove(fastopen_sk, req, false);
@@ -6842,7 +6842,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
                                tcp_timeout_init((struct sock *)req));
                af_ops->send_synack(sk, dst, &fl, req, &foc,
                                    !want_cookie ? TCP_SYNACK_NORMAL :
-                                                  TCP_SYNACK_COOKIE);
+                                                  TCP_SYNACK_COOKIE,
+                                   skb);
                if (want_cookie) {
                        reqsk_free(req);
                        return 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5084333b5ab647ca8ed296235a1ed6573693b250..631a5ee0dd4eb92b6801bc4351b3da2c3f49b124 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -965,7 +965,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
-                             enum tcp_synack_type synack_type)
+                             enum tcp_synack_type synack_type,
+                             struct sk_buff *syn_skb)
 {
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
@@ -976,7 +977,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;
 
-       skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+       skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
 
        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 44ffa4891beb9daca9ba9f580e762f0855638a7d..673db6879e46909192d23bc6167490d3a562b2ab 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -438,6 +438,7 @@ struct tcp_out_options {
        u8 ws;                  /* window scale, 0 to disable */
        u8 num_sack_blocks;     /* number of SACK blocks to include */
        u8 hash_size;           /* bytes in hash_location */
+       u8 bpf_opt_len;         /* length of BPF hdr option */
        __u8 *hash_location;    /* temporary pointer, overloaded */
        __u32 tsval, tsecr;     /* need to include OPTION_TS */
        struct tcp_fastopen_cookie *fastopen_cookie;    /* Fast open cookie */
@@ -452,6 +453,59 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
 #endif
 }
 
+#ifdef CONFIG_CGROUP_BPF
+/* req, syn_skb and synack_type are used when writing synack */
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+                                 struct request_sock *req,
+                                 struct sk_buff *syn_skb,
+                                 enum tcp_synack_type synack_type,
+                                 struct tcp_out_options *opts,
+                                 unsigned int *remaining)
+{
+       if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+                                          BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
+           !*remaining)
+               return;
+
+       /* The bpf running context preparation and the actual bpf prog
+        * calling will be implemented in a later PATCH together with
+        * other bpf pieces.
+        */
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+                                   struct request_sock *req,
+                                   struct sk_buff *syn_skb,
+                                   enum tcp_synack_type synack_type,
+                                   struct tcp_out_options *opts)
+{
+       if (likely(!opts->bpf_opt_len))
+               return;
+
+       /* The bpf running context preparation and the actual bpf prog
+        * calling will be implemented in a later PATCH together with
+        * other bpf pieces.
+        */
+}
+#else
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+                                 struct request_sock *req,
+                                 struct sk_buff *syn_skb,
+                                 enum tcp_synack_type synack_type,
+                                 struct tcp_out_options *opts,
+                                 unsigned int *remaining)
+{
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+                                   struct request_sock *req,
+                                   struct sk_buff *syn_skb,
+                                   enum tcp_synack_type synack_type,
+                                   struct tcp_out_options *opts)
+{
+}
+#endif
+
 /* Write previously computed TCP options to the packet.
  *
  * Beware: Something in the Internet is very sensitive to the ordering of
@@ -691,6 +745,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
                }
        }
 
+       bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
        return MAX_TCP_OPTION_SPACE - remaining;
 }
 
@@ -701,7 +757,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
                                       struct tcp_out_options *opts,
                                       const struct tcp_md5sig_key *md5,
                                       struct tcp_fastopen_cookie *foc,
-                                      enum tcp_synack_type synack_type)
+                                      enum tcp_synack_type synack_type,
+                                      struct sk_buff *syn_skb)
 {
        struct inet_request_sock *ireq = inet_rsk(req);
        unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -758,6 +815,9 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 
        smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
 
+       bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
+                             synack_type, opts, &remaining);
+
        return MAX_TCP_OPTION_SPACE - remaining;
 }
 
@@ -826,6 +886,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
                        opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
        }
 
+       if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
+                                           BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
+               unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+               bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+               size = MAX_TCP_OPTION_SPACE - remaining;
+       }
+
        return size;
 }
 
@@ -1213,6 +1282,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
        }
 #endif
 
+       /* BPF prog is the last one writing header option */
+       bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
+
        INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
                           tcp_v6_send_check, tcp_v4_send_check,
                           sk, skb);
@@ -3336,20 +3408,20 @@ int tcp_send_synack(struct sock *sk)
 }
 
 /**
- * tcp_make_synack - Prepare a SYN-ACK.
- * sk: listener socket
- * dst: dst entry attached to the SYNACK
- * req: request_sock pointer
- * foc: cookie for tcp fast open
- * synack_type: Type of synback to prepare
- *
- * Allocate one skb and build a SYNACK packet.
- * @dst is consumed : Caller should not use it again.
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
+ * @sk: listener socket
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
+ *       should not use it again.
+ * @req: request_sock pointer
+ * @foc: cookie for tcp fast open
+ * @synack_type: Type of synack to prepare
+ * @syn_skb: SYN packet just received.  It could be NULL for rtx case.
  */
 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
                                struct request_sock *req,
                                struct tcp_fastopen_cookie *foc,
-                               enum tcp_synack_type synack_type)
+                               enum tcp_synack_type synack_type,
+                               struct sk_buff *syn_skb)
 {
        struct inet_request_sock *ireq = inet_rsk(req);
        const struct tcp_sock *tp = tcp_sk(sk);
@@ -3408,8 +3480,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
        md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
 #endif
        skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
+       /* bpf program will be interested in the tcp_flags */
+       TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
        tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
-                                            foc, synack_type) + sizeof(*th);
+                                            foc, synack_type,
+                                            syn_skb) + sizeof(*th);
 
        skb_push(skb, tcp_header_size);
        skb_reset_transport_header(skb);
@@ -3441,6 +3516,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
        rcu_read_unlock();
 #endif
 
+       bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
+                               synack_type, &opts);
+
        skb->skb_mstamp_ns = now;
        tcp_add_tx_delay(skb, tp);
 
@@ -3936,7 +4014,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
        int res;
 
        tcp_rsk(req)->txhash = net_tx_rndhash();
-       res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
+       res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+                                 NULL);
        if (!res) {
                __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 305870a72352d6cb404675ad1049b7017c88117b..87a633e1fbefd6e73b02c964217245610f1f30b7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -501,7 +501,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
-                             enum tcp_synack_type synack_type)
+                             enum tcp_synack_type synack_type,
+                             struct sk_buff *syn_skb)
 {
        struct inet_request_sock *ireq = inet_rsk(req);
        struct ipv6_pinfo *np = tcp_inet6_sk(sk);
@@ -515,7 +516,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
                                               IPPROTO_TCP)) == NULL)
                goto done;
 
-       skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+       skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
 
        if (skb) {
                __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 010ed2abcb668eea5be7a3e25fe481f2d26c587d..18d0e128bc3c79671695f5bf4568e56ccaa55b1f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4175,8 +4175,9 @@ enum {
        BPF_SOCK_OPS_RTT_CB_FLAG        = (1<<3),
        BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG  = (1<<4),
        BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5),
+       BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6),
 /* Mask of all currently supported cb flags */
-       BPF_SOCK_OPS_ALL_CB_FLAGS       = 0x3F,
+       BPF_SOCK_OPS_ALL_CB_FLAGS       = 0x7F,
 };
 
 /* List of known BPF sock_ops operators.