git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blobdiff - net/ipv4/tcp_output.c
[TCP] Vegas: timestamp before clone
[mirror_ubuntu-artful-kernel.git] / net / ipv4 / tcp_output.c
index d6e3d269e90611250fabaea9005d0dc3abc3dea3..b7325e0b406a8486fad7bc6da6b24f5ade426415 100644 (file)
@@ -190,7 +190,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
        }
 
        /* Set initial window to value enough for senders,
-        * following RFC1414. Senders, not following this RFC,
+        * following RFC2414. Senders, not following this RFC,
         * will be satisfied with 2.
         */
        if (mss > (1<<*rcv_wscale)) {
@@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
  * We are working here with either a clone of the original
  * SKB, or a fresh unique copy made by the retransmit engine.
  */
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
 {
-       if (skb != NULL) {
-               const struct inet_connection_sock *icsk = inet_csk(sk);
-               struct inet_sock *inet = inet_sk(sk);
-               struct tcp_sock *tp = tcp_sk(sk);
-               struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-               int tcp_header_size = tp->tcp_header_len;
-               struct tcphdr *th;
-               int sysctl_flags;
-               int err;
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+       struct inet_sock *inet;
+       struct tcp_sock *tp;
+       struct tcp_skb_cb *tcb;
+       int tcp_header_size;
+       struct tcphdr *th;
+       int sysctl_flags;
+       int err;
 
-               BUG_ON(!tcp_skb_pcount(skb));
+       BUG_ON(!skb || !tcp_skb_pcount(skb));
+
+       /* If congestion control is doing timestamping, we must
+        * take such a timestamp before we potentially clone/copy.
+        */
+       if (icsk->icsk_ca_ops->rtt_sample)
+               __net_timestamp(skb);
+
+       if (likely(clone_it)) {
+               if (unlikely(skb_cloned(skb)))
+                       skb = pskb_copy(skb, gfp_mask);
+               else
+                       skb = skb_clone(skb, gfp_mask);
+               if (unlikely(!skb))
+                       return -ENOBUFS;
+       }
+
+       inet = inet_sk(sk);
+       tp = tcp_sk(sk);
+       tcb = TCP_SKB_CB(skb);
+       tcp_header_size = tp->tcp_header_len;
 
 #define SYSCTL_FLAG_TSTAMPS    0x1
 #define SYSCTL_FLAG_WSCALE     0x2
 #define SYSCTL_FLAG_SACK       0x4
 
-               /* If congestion control is doing timestamping */
-               if (icsk->icsk_ca_ops->rtt_sample)
-                       __net_timestamp(skb);
-
-               sysctl_flags = 0;
-               if (tcb->flags & TCPCB_FLAG_SYN) {
-                       tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
-                       if(sysctl_tcp_timestamps) {
-                               tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
-                               sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
-                       }
-                       if(sysctl_tcp_window_scaling) {
-                               tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
-                               sysctl_flags |= SYSCTL_FLAG_WSCALE;
-                       }
-                       if(sysctl_tcp_sack) {
-                               sysctl_flags |= SYSCTL_FLAG_SACK;
-                               if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
-                                       tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
-                       }
-               } else if (tp->rx_opt.eff_sacks) {
-                       /* A SACK is 2 pad bytes, a 2 byte header, plus
-                        * 2 32-bit sequence numbers for each SACK block.
-                        */
-                       tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
-                                           (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
+       sysctl_flags = 0;
+       if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+               tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
+               if(sysctl_tcp_timestamps) {
+                       tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
+                       sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
                }
-               
-               if (tcp_packets_in_flight(tp) == 0)
-                       tcp_ca_event(sk, CA_EVENT_TX_START);
-
-               th = (struct tcphdr *) skb_push(skb, tcp_header_size);
-               skb->h.th = th;
-               skb_set_owner_w(skb, sk);
-
-               /* Build TCP header and checksum it. */
-               th->source              = inet->sport;
-               th->dest                = inet->dport;
-               th->seq                 = htonl(tcb->seq);
-               th->ack_seq             = htonl(tp->rcv_nxt);
-               *(((__u16 *)th) + 6)    = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
-               if (tcb->flags & TCPCB_FLAG_SYN) {
-                       /* RFC1323: The window in SYN & SYN/ACK segments
-                        * is never scaled.
-                        */
-                       th->window      = htons(tp->rcv_wnd);
-               } else {
-                       th->window      = htons(tcp_select_window(sk));
+               if (sysctl_tcp_window_scaling) {
+                       tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
+                       sysctl_flags |= SYSCTL_FLAG_WSCALE;
                }
-               th->check               = 0;
-               th->urg_ptr             = 0;
-
-               if (tp->urg_mode &&
-                   between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
-                       th->urg_ptr             = htons(tp->snd_up-tcb->seq);
-                       th->urg                 = 1;
+               if (sysctl_tcp_sack) {
+                       sysctl_flags |= SYSCTL_FLAG_SACK;
+                       if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
+                               tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
                }
+       } else if (unlikely(tp->rx_opt.eff_sacks)) {
+               /* A SACK is 2 pad bytes, a 2 byte header, plus
+                * 2 32-bit sequence numbers for each SACK block.
+                */
+               tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
+                                   (tp->rx_opt.eff_sacks *
+                                    TCPOLEN_SACK_PERBLOCK));
+       }
+               
+       if (tcp_packets_in_flight(tp) == 0)
+               tcp_ca_event(sk, CA_EVENT_TX_START);
+
+       th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+       skb->h.th = th;
+       skb_set_owner_w(skb, sk);
+
+       /* Build TCP header and checksum it. */
+       th->source              = inet->sport;
+       th->dest                = inet->dport;
+       th->seq                 = htonl(tcb->seq);
+       th->ack_seq             = htonl(tp->rcv_nxt);
+       *(((__u16 *)th) + 6)    = htons(((tcp_header_size >> 2) << 12) |
+                                       tcb->flags);
+
+       if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+               /* RFC1323: The window in SYN & SYN/ACK segments
+                * is never scaled.
+                */
+               th->window      = htons(tp->rcv_wnd);
+       } else {
+               th->window      = htons(tcp_select_window(sk));
+       }
+       th->check               = 0;
+       th->urg_ptr             = 0;
 
-               if (tcb->flags & TCPCB_FLAG_SYN) {
-                       tcp_syn_build_options((__u32 *)(th + 1),
-                                             tcp_advertise_mss(sk),
-                                             (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
-                                             (sysctl_flags & SYSCTL_FLAG_SACK),
-                                             (sysctl_flags & SYSCTL_FLAG_WSCALE),
-                                             tp->rx_opt.rcv_wscale,
-                                             tcb->when,
-                                             tp->rx_opt.ts_recent);
-               } else {
-                       tcp_build_and_update_options((__u32 *)(th + 1),
-                                                    tp, tcb->when);
+       if (unlikely(tp->urg_mode &&
+                    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
+               th->urg_ptr             = htons(tp->snd_up-tcb->seq);
+               th->urg                 = 1;
+       }
 
-                       TCP_ECN_send(sk, tp, skb, tcp_header_size);
-               }
-               tp->af_specific->send_check(sk, th, skb->len, skb);
+       if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+               tcp_syn_build_options((__u32 *)(th + 1),
+                                     tcp_advertise_mss(sk),
+                                     (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
+                                     (sysctl_flags & SYSCTL_FLAG_SACK),
+                                     (sysctl_flags & SYSCTL_FLAG_WSCALE),
+                                     tp->rx_opt.rcv_wscale,
+                                     tcb->when,
+                                     tp->rx_opt.ts_recent);
+       } else {
+               tcp_build_and_update_options((__u32 *)(th + 1),
+                                            tp, tcb->when);
+               TCP_ECN_send(sk, tp, skb, tcp_header_size);
+       }
 
-               if (tcb->flags & TCPCB_FLAG_ACK)
-                       tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+       tp->af_specific->send_check(sk, th, skb->len, skb);
 
-               if (skb->len != tcp_header_size)
-                       tcp_event_data_sent(tp, skb, sk);
+       if (likely(tcb->flags & TCPCB_FLAG_ACK))
+               tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
-               TCP_INC_STATS(TCP_MIB_OUTSEGS);
+       if (skb->len != tcp_header_size)
+               tcp_event_data_sent(tp, skb, sk);
 
-               err = tp->af_specific->queue_xmit(skb, 0);
-               if (err <= 0)
-                       return err;
+       TCP_INC_STATS(TCP_MIB_OUTSEGS);
 
-               tcp_enter_cwr(sk);
+       err = tp->af_specific->queue_xmit(skb, 0);
+       if (unlikely(err <= 0))
+               return err;
+
+       tcp_enter_cwr(sk);
+
+       /* NET_XMIT_CN is special. It does not guarantee,
+        * that this packet is lost. It tells that device
+        * is about to start to drop packets or already
+        * drops some packets of the same priority and
+        * invokes us to send less aggressively.
+        */
+       return err == NET_XMIT_CN ? 0 : err;
 
-               /* NET_XMIT_CN is special. It does not guarantee,
-                * that this packet is lost. It tells that device
-                * is about to start to drop packets or already
-                * drops some packets of the same priority and
-                * invokes us to send less aggressively.
-                */
-               return err == NET_XMIT_CN ? 0 : err;
-       }
-       return -ENOBUFS;
 #undef SYSCTL_FLAG_TSTAMPS
 #undef SYSCTL_FLAG_WSCALE
 #undef SYSCTL_FLAG_SACK
@@ -435,8 +452,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
        int nsize, old_factor;
        u16 flags;
 
-       BUG_ON(len >= skb->len);
+       BUG_ON(len > skb->len);
 
+       clear_all_retrans_hints(tp);
        nsize = skb_headlen(skb) - len;
        if (nsize < 0)
                nsize = 0;
@@ -600,7 +618,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
    for TCP options, but includes only bare TCP header.
 
    tp->rx_opt.mss_clamp is mss negotiated at connection setup.
-   It is minumum of user_mss and mss received with SYN.
+   It is minimum of user_mss and mss received with SYN.
    It also does not include TCP options.
 
    tp->pmtu_cookie is last pmtu, seen by this function.
@@ -1035,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-               if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
+               if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
                        break;
 
                /* Advance the send_head.  This one is sent out.
@@ -1108,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
                /* Send it out now. */
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-               if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+               if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
                        update_send_head(sk, tp, skb);
                        tcp_cwnd_validate(sk, tp);
                        return;
@@ -1172,7 +1190,7 @@ u32 __tcp_select_window(struct sock *sk)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
-       /* MSS for the peer's data.  Previous verions used mss_clamp
+       /* MSS for the peer's data.  Previous versions used mss_clamp
         * here.  I don't know if the value based on our guesses
         * of peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
@@ -1261,7 +1279,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
                BUG_ON(tcp_skb_pcount(skb) != 1 ||
                       tcp_skb_pcount(next_skb) != 1);
 
-               /* Ok.  We will be able to collapse the packet. */
+               /* changing transmit queue under us so clear hints */
+               clear_all_retrans_hints(tp);
+
+               /* Ok.  We will be able to collapse the packet. */
                __skb_unlink(next_skb, &sk->sk_write_queue);
 
                memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
@@ -1331,6 +1352,8 @@ void tcp_simple_retransmit(struct sock *sk)
                }
        }
 
+       clear_all_retrans_hints(tp);
+
        if (!lost)
                return;
 
@@ -1362,7 +1385,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
        int err;
 
        /* Do not sent more than we queued. 1/4 is reserved for possible
-        * copying overhead: frgagmentation, tunneling, mangling etc.
+        * copying overhead: fragmentation, tunneling, mangling etc.
         */
        if (atomic_read(&sk->sk_wmem_alloc) >
            min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
@@ -1423,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
         */
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-       err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
-                                   pskb_copy(skb, GFP_ATOMIC):
-                                   skb_clone(skb, GFP_ATOMIC)));
+       err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 
        if (err == 0) {
                /* Update global TCP statistics. */
@@ -1469,13 +1490,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
-       int packet_cnt = tp->lost_out;
+       int packet_cnt;
+
+       if (tp->retransmit_skb_hint) {
+               skb = tp->retransmit_skb_hint;
+               packet_cnt = tp->retransmit_cnt_hint;
+       }else{
+               skb = sk->sk_write_queue.next;
+               packet_cnt = 0;
+       }
 
        /* First pass: retransmit lost packets. */
-       if (packet_cnt) {
-               sk_stream_for_retrans_queue(skb, sk) {
+       if (tp->lost_out) {
+               sk_stream_for_retrans_queue_from(skb, sk) {
                        __u8 sacked = TCP_SKB_CB(skb)->sacked;
 
+                       /* we could do better than to assign each time */
+                       tp->retransmit_skb_hint = skb;
+                       tp->retransmit_cnt_hint = packet_cnt;
+
                        /* Assume this retransmit will generate
                         * only one packet for congestion window
                         * calculation purposes.  This works because
@@ -1486,10 +1519,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                        if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                                return;
 
-                       if (sacked&TCPCB_LOST) {
+                       if (sacked & TCPCB_LOST) {
                                if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
-                                       if (tcp_retransmit_skb(sk, skb))
+                                       if (tcp_retransmit_skb(sk, skb)) {
+                                               tp->retransmit_skb_hint = NULL;
                                                return;
+                                       }
                                        if (icsk->icsk_ca_state != TCP_CA_Loss)
                                                NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
                                        else
@@ -1502,8 +1537,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                                                                          TCP_RTO_MAX);
                                }
 
-                               packet_cnt -= tcp_skb_pcount(skb);
-                               if (packet_cnt <= 0)
+                               packet_cnt += tcp_skb_pcount(skb);
+                               if (packet_cnt >= tp->lost_out)
                                        break;
                        }
                }
@@ -1529,9 +1564,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
        if (tcp_may_send_now(sk, tp))
                return;
 
-       packet_cnt = 0;
+       if (tp->forward_skb_hint) {
+               skb = tp->forward_skb_hint;
+               packet_cnt = tp->forward_cnt_hint;
+       } else{
+               skb = sk->sk_write_queue.next;
+               packet_cnt = 0;
+       }
+
+       sk_stream_for_retrans_queue_from(skb, sk) {
+               tp->forward_cnt_hint = packet_cnt;
+               tp->forward_skb_hint = skb;
 
-       sk_stream_for_retrans_queue(skb, sk) {
                /* Similar to the retransmit loop above we
                 * can pretend that the retransmitted SKB
                 * we send out here will be composed of one
@@ -1548,8 +1592,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                        continue;
 
                /* Ok, retransmit it. */
-               if (tcp_retransmit_skb(sk, skb))
+               if (tcp_retransmit_skb(sk, skb)) {
+                       tp->forward_skb_hint = NULL;
                        break;
+               }
 
                if (skb == skb_peek(&sk->sk_write_queue))
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -1610,7 +1656,7 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
@@ -1634,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
        TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-       if (tcp_transmit_skb(sk, skb))
+       if (tcp_transmit_skb(sk, skb, 0, priority))
                NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
 }
 
@@ -1669,7 +1715,7 @@ int tcp_send_synack(struct sock *sk)
                TCP_ECN_send_synack(tcp_sk(sk), skb);
        }
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-       return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+       return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 }
 
 /*
@@ -1830,7 +1876,7 @@ int tcp_connect(struct sock *sk)
        __skb_queue_tail(&sk->sk_write_queue, buff);
        sk_charge_skb(sk, buff);
        tp->packets_out += tcp_skb_pcount(buff);
-       tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
+       tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
        TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
 
        /* Timer for repeating the SYN until an answer. */
@@ -1926,7 +1972,7 @@ void tcp_send_ack(struct sock *sk)
                /* Send it off, this clears delayed acks for us. */
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
                TCP_SKB_CB(buff)->when = tcp_time_stamp;
-               tcp_transmit_skb(sk, buff);
+               tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
        }
 }
 
@@ -1966,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
        TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-       return tcp_transmit_skb(sk, skb);
+       return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
 }
 
 int tcp_write_wakeup(struct sock *sk)
@@ -1999,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk)
 
                        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-                       err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+                       err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
                        if (!err) {
                                update_send_head(sk, tp, skb);
                        }
@@ -2059,3 +2105,4 @@ EXPORT_SYMBOL(tcp_connect);
 EXPORT_SYMBOL(tcp_make_synack);
 EXPORT_SYMBOL(tcp_simple_retransmit);
 EXPORT_SYMBOL(tcp_sync_mss);
+EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);