tcp/dccp: fix lockdep issue when SYN is backlogged

[mirror_ubuntu-bionic-kernel.git] / net / ipv4 / tcp_input.c
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c

index 45f750e85714da11f569ae0c6522f1cc56c6d2a2..53106f3b3b2e130dbe68faac923688f2bc43488b 100644 (file)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -183,24 +183,27 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
         }
  }
  
-static void tcp_incr_quickack(struct sock *sk)
+static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
  {
         struct inet_connection_sock *icsk = inet_csk(sk);
         unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
  
         if (quickacks == 0)
                 quickacks = 2;
+       quickacks = min(quickacks, max_quickacks);
         if (quickacks > icsk->icsk_ack.quick)
-               icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+               icsk->icsk_ack.quick = quickacks;
  }
  
-static void tcp_enter_quickack_mode(struct sock *sk)
+void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
  {
         struct inet_connection_sock *icsk = inet_csk(sk);
-       tcp_incr_quickack(sk);
+
+       tcp_incr_quickack(sk, max_quickacks);
         icsk->icsk_ack.pingpong = 0;
         icsk->icsk_ack.ato = TCP_ATO_MIN;
  }
+EXPORT_SYMBOL(tcp_enter_quickack_mode);
  
  /* Send ACKs quickly, if "quick" count is not exhausted
   * and the session is not interactive.
@@ -223,8 +226,15 @@ static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
  
  static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
  {
-       if (tcp_hdr(skb)->cwr)
+       if (tcp_hdr(skb)->cwr) {
                 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+
+               /* If the sender is telling us it has entered CWR, then its
+                * cwnd may be very low (even just 1 packet), so we should ACK
+                * immediately.
+                */
+               tcp_enter_quickack_mode((struct sock *)tp, 2);
+       }
  }
  
  static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
@@ -232,8 +242,10 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
         tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
  }
  
-static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
  {
+       struct tcp_sock *tp = tcp_sk(sk);
+
         switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
         case INET_ECN_NOT_ECT:
                 /* Funny extension: if ECT is not set on a segment,
@@ -241,31 +253,31 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
                  * it is probably a retransmit.
                  */
                 if (tp->ecn_flags & TCP_ECN_SEEN)
-                       tcp_enter_quickack_mode((struct sock *)tp);
+                       tcp_enter_quickack_mode(sk, 2);
                 break;
         case INET_ECN_CE:
-               if (tcp_ca_needs_ecn((struct sock *)tp))
-                       tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+               if (tcp_ca_needs_ecn(sk))
+                       tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
  
                 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
                         /* Better not delay acks, sender can have a very low cwnd */
-                       tcp_enter_quickack_mode((struct sock *)tp);
+                       tcp_enter_quickack_mode(sk, 2);
                         tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
                 }
                 tp->ecn_flags |= TCP_ECN_SEEN;
                 break;
         default:
-               if (tcp_ca_needs_ecn((struct sock *)tp))
-                       tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
+               if (tcp_ca_needs_ecn(sk))
+                       tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
                 tp->ecn_flags |= TCP_ECN_SEEN;
                 break;
         }
  }
  
-static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
  {
-       if (tp->ecn_flags & TCP_ECN_OK)
-               __tcp_ecn_check_ce(tp, skb);
+       if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
+               __tcp_ecn_check_ce(sk, skb);
  }
  
  static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
@@ -578,8 +590,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
  void tcp_rcv_space_adjust(struct sock *sk)
  {
         struct tcp_sock *tp = tcp_sk(sk);
+       u32 copied;
         int time;
-       int copied;
  
         tcp_mstamp_refresh(tp);
         time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
@@ -602,12 +614,13 @@ void tcp_rcv_space_adjust(struct sock *sk)
  
         if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
             !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-               int rcvwin, rcvmem, rcvbuf;
+               int rcvmem, rcvbuf;
+               u64 rcvwin;
  
                 /* minimal window to cope with packet losses, assuming
                  * steady state. Add some cushion because of small variations.
                  */
-               rcvwin = (copied << 1) + 16 * tp->advmss;
+               rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
  
                 /* If rate increased by 25%,
                  *      assume slow start, rcvwin = 3 * copied
@@ -627,13 +640,14 @@ void tcp_rcv_space_adjust(struct sock *sk)
                 while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
                         rcvmem += 128;
  
-               rcvbuf = min(rcvwin / tp->advmss * rcvmem,
+               do_div(rcvwin, tp->advmss);
+               rcvbuf = min_t(u64, rcvwin * rcvmem,
                              sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
                 if (rcvbuf > sk->sk_rcvbuf) {
                         sk->sk_rcvbuf = rcvbuf;
  
                         /* Make the window clamp follow along.  */
-                       tp->window_clamp = rcvwin;
+                       tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
                 }
         }
         tp->rcvq_space.space = copied;
@@ -671,7 +685,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
                 /* The _first_ data packet received, initialize
                  * delayed ACK engine.
                  */
-               tcp_incr_quickack(sk);
+               tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
                 icsk->icsk_ack.ato = TCP_ATO_MIN;
         } else {
                 int m = now - icsk->icsk_ack.lrcvtime;
@@ -687,13 +701,13 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
                         /* Too long gap. Apparently sender failed to
                          * restart window, so that we send ACKs quickly.
                          */
-                       tcp_incr_quickack(sk);
+                       tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
                         sk_mem_reclaim(sk);
                 }
         }
         icsk->icsk_ack.lrcvtime = now;
  
-       tcp_ecn_check_ce(tp, skb);
+       tcp_ecn_check_ce(sk, skb);
  
         if (skb->len >= 128)
                 tcp_grow_window(sk, skb);
@@ -1283,7 +1297,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
         TCP_SKB_CB(skb)->seq += shifted;
  
         tcp_skb_pcount_add(prev, pcount);
-       BUG_ON(tcp_skb_pcount(skb) < pcount);
+       WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
         tcp_skb_pcount_add(skb, -pcount);
  
         /* When we're adding to gso_segs == 1, gso_size will be zero,
@@ -1349,6 +1363,21 @@ static int skb_can_shift(const struct sk_buff *skb)
         return !skb_headlen(skb) && skb_is_nonlinear(skb);
  }
  
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
+                 int pcount, int shiftlen)
+{
+       /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE).
+        * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
+        * to make sure we never store more than 65535 * 8 bytes per skb,
+        * even if current MSS is bigger.
+        */
+       if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
+               return 0;
+       if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
+               return 0;
+       return skb_shift(to, from, shiftlen);
+}
+
  /* Try collapsing SACK blocks spanning across multiple skbs to a single
   * skb.
   */
@@ -1457,7 +1486,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
         if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
                 goto fallback;
  
-       if (!skb_shift(prev, skb, len))
+       if (!tcp_skb_shift(prev, skb, pcount, len))
                 goto fallback;
         if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
                 goto out;
@@ -1475,11 +1504,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
                 goto out;
  
         len = skb->len;
-       if (skb_shift(prev, skb, len)) {
-               pcount += tcp_skb_pcount(skb);
-               tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
+       pcount = tcp_skb_pcount(skb);
+       if (tcp_skb_shift(prev, skb, pcount, len))
+               tcp_shifted_skb(sk, prev, skb, state, pcount,
                                 len, mss, 0);
-       }
  
  out:
         return prev;
@@ -1977,11 +2005,6 @@ void tcp_enter_loss(struct sock *sk)
         /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
          * loss recovery is underway except recurring timeout(s) on
          * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
-        *
-        * In theory F-RTO can be used repeatedly during loss recovery.
-        * In practice this interacts badly with broken middle-boxes that
-        * falsely raise the receive window, which results in repeated
-        * timeouts and stop-and-go behavior.
          */
         tp->frto = net->ipv4.sysctl_tcp_frto &&
                    (new_recovery || icsk->icsk_retransmits) &&
@@ -2637,18 +2660,14 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
             tcp_try_undo_loss(sk, false))
                 return;
  
-       /* The ACK (s)acks some never-retransmitted data meaning not all
-        * the data packets before the timeout were lost. Therefore we
-        * undo the congestion window and state. This is essentially
-        * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
-        * a retransmitted skb is permantly marked, we can apply such an
-        * operation even if F-RTO was not used.
-        */
-       if ((flag & FLAG_ORIG_SACK_ACKED) &&
-           tcp_try_undo_loss(sk, tp->undo_marker))
-               return;
-
         if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
+               /* Step 3.b. A timeout is spurious if not all data are
+                * lost, i.e., never-retransmitted data are (s)acked.
+                */
+               if ((flag & FLAG_ORIG_SACK_ACKED) &&
+                   tcp_try_undo_loss(sk, true))
+                       return;
+
                 if (after(tp->snd_nxt, tp->high_seq)) {
                         if (flag & FLAG_DATA_SACKED || is_dupack)
                                 tp->frto = 0; /* Step 3.a. loss was real */
@@ -3149,6 +3168,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
  
                 if (tcp_is_reno(tp)) {
                         tcp_remove_reno_sacks(sk, pkts_acked);
+
+                       /* If any of the cumulatively ACKed segments was
+                        * retransmitted, the non-SACK case cannot confirm that
+                        * progress came from an original transmission, because
+                        * it lacks TCPCB_SACKED_ACKED bits, even if some of
+                        * the packets may never have been retransmitted.
+                        */
+                       if (flag & FLAG_RETRANS_DATA_ACKED)
+                               flag &= ~FLAG_ORIG_SACK_ACKED;
                 } else {
                         int delta;
  
@@ -3867,11 +3895,8 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
         int length = (th->doff << 2) - sizeof(*th);
         const u8 *ptr = (const u8 *)(th + 1);
  
-       /* If the TCP option is too short, we can short cut */
-       if (length < TCPOLEN_MD5SIG)
-               return NULL;
-
-       while (length > 0) {
+       /* If not enough data remains for an MD5 option, we can short cut */
+       while (length >= TCPOLEN_MD5SIG) {
                 int opcode = *ptr++;
                 int opsize;
  
@@ -3988,6 +4013,7 @@ void tcp_reset(struct sock *sk)
         /* This barrier is coupled with smp_rmb() in tcp_poll() */
         smp_wmb();
  
+       tcp_write_queue_purge(sk);
         tcp_done(sk);
  
         if (!sock_flag(sk, SOCK_DEAD))
@@ -4127,7 +4153,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
         if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
             before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
                 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
-               tcp_enter_quickack_mode(sk);
+               tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
  
                 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
                         u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4290,6 +4316,23 @@ static bool tcp_try_coalesce(struct sock *sk,
         return true;
  }
  
+static bool tcp_ooo_try_coalesce(struct sock *sk,
+                            struct sk_buff *to,
+                            struct sk_buff *from,
+                            bool *fragstolen)
+{
+       bool res = tcp_try_coalesce(sk, to, from, fragstolen);
+
+       /* In case tcp_drop() is called later, update to->gso_segs */
+       if (res) {
+               u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
+                              max_t(u16, 1, skb_shinfo(from)->gso_segs);
+
+               skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+       }
+       return res;
+}
+
  static void tcp_drop(struct sock *sk, struct sk_buff *skb)
  {
         sk_drops_add(sk, skb);
@@ -4378,7 +4421,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
         u32 seq, end_seq;
         bool fragstolen;
  
-       tcp_ecn_check_ce(tp, skb);
+       tcp_ecn_check_ce(sk, skb);
  
         if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4413,8 +4456,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
         /* In the typical case, we are adding an skb to the end of the list.
          * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
          */
-       if (tcp_try_coalesce(sk, tp->ooo_last_skb,
-                            skb, &fragstolen)) {
+       if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
+                                skb, &fragstolen)) {
  coalesce_done:
                 tcp_grow_window(sk, skb);
                 kfree_skb_partial(skb, fragstolen);
@@ -4442,7 +4485,7 @@ coalesce_done:
                                 /* All the bits are present. Drop. */
                                 NET_INC_STATS(sock_net(sk),
                                               LINUX_MIB_TCPOFOMERGE);
-                               __kfree_skb(skb);
+                               tcp_drop(sk, skb);
                                 skb = NULL;
                                 tcp_dsack_set(sk, seq, end_seq);
                                 goto add_sack;
@@ -4461,11 +4504,11 @@ coalesce_done:
                                                  TCP_SKB_CB(skb1)->end_seq);
                                 NET_INC_STATS(sock_net(sk),
                                               LINUX_MIB_TCPOFOMERGE);
-                               __kfree_skb(skb1);
+                               tcp_drop(sk, skb1);
                                 goto merge_right;
                         }
-               } else if (tcp_try_coalesce(sk, skb1,
-                                           skb, &fragstolen)) {
+               } else if (tcp_ooo_try_coalesce(sk, skb1,
+                                               skb, &fragstolen)) {
                         goto coalesce_done;
                 }
                 p = &parent->rb_right;
@@ -4641,7 +4684,7 @@ queue_and_out:
                 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
  
  out_of_window:
-               tcp_enter_quickack_mode(sk);
+               tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
                 inet_csk_schedule_ack(sk);
  drop:
                 tcp_drop(sk, skb);
@@ -4652,8 +4695,6 @@ drop:
         if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
                 goto out_of_window;
  
-       tcp_enter_quickack_mode(sk);
-
         if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
                 /* Partial packet, seq < rcv_next < end_seq */
                 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
@@ -4825,6 +4866,7 @@ end:
  static void tcp_collapse_ofo_queue(struct sock *sk)
  {
         struct tcp_sock *tp = tcp_sk(sk);
+       u32 range_truesize, sum_tiny = 0;
         struct sk_buff *skb, *head;
         u32 start, end;
  
@@ -4836,6 +4878,7 @@ new_range:
         }
         start = TCP_SKB_CB(skb)->seq;
         end = TCP_SKB_CB(skb)->end_seq;
+       range_truesize = skb->truesize;
  
         for (head = skb;;) {
                 skb = skb_rb_next(skb);
@@ -4846,11 +4889,20 @@ new_range:
                 if (!skb ||
                     after(TCP_SKB_CB(skb)->seq, end) ||
                     before(TCP_SKB_CB(skb)->end_seq, start)) {
-                       tcp_collapse(sk, NULL, &tp->out_of_order_queue,
-                                    head, skb, start, end);
+                       /* Do not attempt collapsing tiny skbs */
+                       if (range_truesize != head->truesize ||
+                           end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+                               tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+                                            head, skb, start, end);
+                       } else {
+                               sum_tiny += range_truesize;
+                               if (sum_tiny > sk->sk_rcvbuf >> 3)
+                                       return;
+                       }
                         goto new_range;
                 }
  
+               range_truesize += skb->truesize;
                 if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
                         start = TCP_SKB_CB(skb)->seq;
                 if (after(TCP_SKB_CB(skb)->end_seq, end))
@@ -4865,6 +4917,7 @@ new_range:
   * 2) not add too big latencies if thousands of packets sit there.
   *    (But if application shrinks SO_RCVBUF, we could still end up
   *     freeing whole queue here)
+ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
   *
   * Return true if queue has shrunk.
   */
@@ -4872,20 +4925,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
  {
         struct tcp_sock *tp = tcp_sk(sk);
         struct rb_node *node, *prev;
+       int goal;
  
         if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                 return false;
  
         NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+       goal = sk->sk_rcvbuf >> 3;
         node = &tp->ooo_last_skb->rbnode;
         do {
                 prev = rb_prev(node);
                 rb_erase(node, &tp->out_of_order_queue);
+               goal -= rb_to_skb(node)->truesize;
                 tcp_drop(sk, rb_to_skb(node));
-               sk_mem_reclaim(sk);
-               if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
-                   !tcp_under_memory_pressure(sk))
-                       break;
+               if (!prev || goal <= 0) {
+                       sk_mem_reclaim(sk);
+                       if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+                           !tcp_under_memory_pressure(sk))
+                               break;
+                       goal = sk->sk_rcvbuf >> 3;
+               }
                 node = prev;
         } while (node);
         tp->ooo_last_skb = rb_to_skb(prev);
@@ -4920,6 +4979,9 @@ static int tcp_prune_queue(struct sock *sk)
         else if (tcp_under_memory_pressure(sk))
                 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
  
+       if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+               return 0;
+
         tcp_collapse_ofo_queue(sk);
         if (!skb_queue_empty(&sk->sk_receive_queue))
                 tcp_collapse(sk, &sk->sk_receive_queue, NULL,
@@ -5696,7 +5758,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                          * to stand against the temptation 8)     --ANK
                          */
                         inet_csk_schedule_ack(sk);
-                       tcp_enter_quickack_mode(sk);
+                       tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                                   TCP_DELACK_MAX, TCP_RTO_MAX);
  
@@ -5823,11 +5885,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                         if (th->fin)
                                 goto discard;
                         /* It is possible that we process SYN packets from backlog,
-                        * so we need to make sure to disable BH right there.
+                        * so we must disable BH and take rcu_read_lock() right there.
                          */
+                       rcu_read_lock();
                         local_bh_disable();
                         acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
                         local_bh_enable();
+                       rcu_read_unlock();
  
                         if (!acceptable)
                                 return 1;