mptcp: enforce HoL-blocking estimation
author     Paolo Abeni <pabeni@redhat.com>
           Fri, 17 Dec 2021 23:37:00 +0000 (15:37 -0800)
committer  Jakub Kicinski <kuba@kernel.org>
           Sat, 18 Dec 2021 03:27:04 +0000 (19:27 -0800)
The MPTCP packet scheduler has sub-optimal behavior with asymmetric
subflows: if the faster subflow-level cwin is closed, the packet
scheduler can enqueue "too much" data on a slower subflow.

When all the data on the faster subflow is acked, if the mptcp-level
cwin is closed, the faster subflow stays idle and link utilization
becomes suboptimal.

The solution is to implement a blest-like[1] HoL-blocking estimation,
transmitting only on the subflow with the shorter estimated time to
flush the queued memory. If such subflow's cwin is closed, we wait
even if other subflows are available.
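
For illustration only, here is a minimal user-space sketch of that
selection rule (the struct, field names and numbers below are made up
for the example and are not the kernel data structures): pick the
subflow whose queued data would take the least time to drain at its
pacing rate, and send nothing if that subflow's window is closed.

#include <stdint.h>
#include <stdio.h>

struct toy_subflow {
        uint64_t wmem_queued;  /* bytes already queued on the subflow */
        uint64_t pacing_rate;  /* bytes per second */
        uint32_t snd_wnd;      /* 0 means the send window is closed */
};

static int pick_subflow(const struct toy_subflow *sf, int n)
{
        uint64_t best_linger = UINT64_MAX;
        int best = -1;

        for (int i = 0; i < n; i++) {
                uint64_t linger;

                if (!sf[i].pacing_rate)
                        continue;
                /* estimated time to flush the queued memory, scaled
                 * to stay in integer arithmetic
                 */
                linger = (sf[i].wmem_queued << 16) / sf[i].pacing_rate;
                if (linger < best_linger) {
                        best_linger = linger;
                        best = i;
                }
        }
        /* wait even if other subflows are available */
        if (best >= 0 && !sf[best].snd_wnd)
                return -1;
        return best;
}

int main(void)
{
        struct toy_subflow sf[2] = {
                { .wmem_queued = 64 * 1024, .pacing_rate = 10000000, .snd_wnd = 0 },
                { .wmem_queued = 16 * 1024, .pacing_rate = 500000, .snd_wnd = 65535 },
        };

        printf("selected subflow: %d\n", pick_subflow(sf, 2));
        return 0;
}

With these made-up numbers the faster subflow (index 0) has the
shorter linger time, but its closed window makes the sketch return -1:
we wait instead of queuing more data on the slower subflow.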

This is considerably simpler than the original blest implementation, as
we leverage the pacing rate provided by the TCP socket. To get a more
accurate estimate of the subflow linger time, we maintain a
per-subflow weighted average of that value.
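
As a hedged, user-space sketch (helper and field names are invented
for the example; the real update lives in mptcp_subflow_get_send()
in the diff below), the weighted average blends the socket's current
pacing rate into the stored value, weighting the new sample by the
burst about to be sent and the old average by the data already queued:

#include <stdint.h>
#include <stdio.h>

struct toy_rate {
        uint64_t avg_pacing_rate;  /* running weighted average, bytes/sec */
};

/* avg = (avg * wmem + cur_rate * burst) / (wmem + burst) */
static void update_avg_pacing_rate(struct toy_rate *sf, uint64_t cur_rate,
                                   uint64_t wmem, uint64_t burst)
{
        if (!sf->avg_pacing_rate) {
                /* first sample: seed the average from the current rate */
                sf->avg_pacing_rate = cur_rate;
                return;
        }
        if (!(wmem + burst))
                return;
        sf->avg_pacing_rate = (sf->avg_pacing_rate * wmem +
                               cur_rate * burst) / (wmem + burst);
}

int main(void)
{
        struct toy_rate sf = { 0 };

        update_avg_pacing_rate(&sf, 1000000, 0, 65536);      /* seeds to 1 MB/s */
        update_avg_pacing_rate(&sf, 2000000, 131072, 65536); /* faster new sample */
        printf("avg pacing rate: %llu B/s\n",
               (unsigned long long)sf.avg_pacing_rate);
        return 0;
}

Because the queued data outweighs the burst two to one, the second,
faster sample only pulls the average up to ~1.33 MB/s rather than
jumping straight to 2 MB/s.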

Additionally, drop the use of magic numbers in favor of newly defined
macros, and use more meaningful names for the status variables.

[1] http://dl.ifip.org/db/conf/networking/networking2016/1570234725.pdf

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/137
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
net/mptcp/protocol.c
net/mptcp/protocol.h

index 3e549f6190c0654e0fb4f68de538c083cbe010c9..df5a0cf431c1c496c8fdc40fa60ce6336f192863 100644 (file)
@@ -1372,7 +1372,7 @@ out:
 
 struct subflow_send_info {
        struct sock *ssk;
-       u64 ratio;
+       u64 linger_time;
 };
 
 void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
@@ -1397,20 +1397,24 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
        return __mptcp_subflow_active(subflow);
 }
 
+#define SSK_MODE_ACTIVE        0
+#define SSK_MODE_BACKUP        1
+#define SSK_MODE_MAX   2
+
 /* implement the mptcp packet scheduler;
  * returns the subflow that will transmit the next DSS
  * additionally updates the rtx timeout
  */
 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 {
-       struct subflow_send_info send_info[2];
+       struct subflow_send_info send_info[SSK_MODE_MAX];
        struct mptcp_subflow_context *subflow;
        struct sock *sk = (struct sock *)msk;
+       u32 pace, burst, wmem;
        int i, nr_active = 0;
        struct sock *ssk;
+       u64 linger_time;
        long tout = 0;
-       u64 ratio;
-       u32 pace;
 
        sock_owned_by_me(sk);
 
@@ -1429,10 +1433,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
        }
 
        /* pick the subflow with the lower wmem/wspace ratio */
-       for (i = 0; i < 2; ++i) {
+       for (i = 0; i < SSK_MODE_MAX; ++i) {
                send_info[i].ssk = NULL;
-               send_info[i].ratio = -1;
+               send_info[i].linger_time = -1;
        }
+
        mptcp_for_each_subflow(msk, subflow) {
                trace_mptcp_subflow_get_send(subflow);
                ssk =  mptcp_subflow_tcp_sock(subflow);
@@ -1441,34 +1446,51 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 
                tout = max(tout, mptcp_timeout_from_subflow(subflow));
                nr_active += !subflow->backup;
-               if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
-                       continue;
-
-               pace = READ_ONCE(ssk->sk_pacing_rate);
-               if (!pace)
-                       continue;
+               pace = subflow->avg_pacing_rate;
+               if (unlikely(!pace)) {
+                       /* init pacing rate from socket */
+                       subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
+                       pace = subflow->avg_pacing_rate;
+                       if (!pace)
+                               continue;
+               }
 
-               ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
-                               pace);
-               if (ratio < send_info[subflow->backup].ratio) {
+               linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
+               if (linger_time < send_info[subflow->backup].linger_time) {
                        send_info[subflow->backup].ssk = ssk;
-                       send_info[subflow->backup].ratio = ratio;
+                       send_info[subflow->backup].linger_time = linger_time;
                }
        }
        __mptcp_set_timeout(sk, tout);
 
        /* pick the best backup if no other subflow is active */
        if (!nr_active)
-               send_info[0].ssk = send_info[1].ssk;
-
-       if (send_info[0].ssk) {
-               msk->last_snd = send_info[0].ssk;
-               msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
-                                      tcp_sk(msk->last_snd)->snd_wnd);
-               return msk->last_snd;
-       }
+               send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
+
+       /* According to the blest algorithm, to avoid HoL blocking for the
+        * faster flow, we need to:
+        * - estimate the faster flow linger time
+        * - use the above to estimate the amount of bytes transferred
+        *   by the faster flow
+        * - check that the amount of queued data is greater than the above,
+        *   otherwise do not use the picked, slower, subflow
+        * We select the subflow with the shorter estimated time to flush
+        * the queued mem, which basically ensures the above. We just need
+        * to check that the subflow has a non-empty cwin.
+        */
+       ssk = send_info[SSK_MODE_ACTIVE].ssk;
+       if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd)
+               return NULL;
 
-       return NULL;
+       burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd);
+       wmem = READ_ONCE(ssk->sk_wmem_queued);
+       subflow = mptcp_subflow_ctx(ssk);
+       subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
+                                          READ_ONCE(ssk->sk_pacing_rate) * burst,
+                                          burst + wmem);
+       msk->last_snd = ssk;
+       msk->snd_burst = burst;
+       return ssk;
 }
 
 static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
index e1469155fb159b0f001e22f0c9848ad3bf5c5b8c..0486c9f5b38b07223c4a84dbdab497500a5c305a 100644 (file)
@@ -395,6 +395,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
 /* MPTCP subflow context */
 struct mptcp_subflow_context {
        struct  list_head node;/* conn_list of subflows */
+       unsigned long avg_pacing_rate; /* protected by msk socket lock */
        u64     local_key;
        u64     remote_key;
        u64     idsn;