]>
git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blob - drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
40 ksocknal_lib_get_conn_addrs (ksock_conn_t
*conn
)
42 int rc
= libcfs_sock_getaddr(conn
->ksnc_sock
, 1,
46 /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
47 LASSERT (!conn
->ksnc_closing
);
50 CERROR ("Error %d getting sock peer IP\n", rc
);
54 rc
= libcfs_sock_getaddr(conn
->ksnc_sock
, 0,
55 &conn
->ksnc_myipaddr
, NULL
);
57 CERROR ("Error %d getting sock local IP\n", rc
);
65 ksocknal_lib_zc_capable(ksock_conn_t
*conn
)
67 int caps
= conn
->ksnc_sock
->sk
->sk_route_caps
;
69 if (conn
->ksnc_proto
== &ksocknal_protocol_v1x
)
72 /* ZC if the socket supports scatter/gather and doesn't need software
74 return ((caps
& NETIF_F_SG
) != 0 && (caps
& NETIF_F_ALL_CSUM
) != 0);
78 ksocknal_lib_send_iov (ksock_conn_t
*conn
, ksock_tx_t
*tx
)
80 struct socket
*sock
= conn
->ksnc_sock
;
84 if (*ksocknal_tunables
.ksnd_enable_csum
&& /* checksum enabled */
85 conn
->ksnc_proto
== &ksocknal_protocol_v2x
&& /* V2.x connection */
86 tx
->tx_nob
== tx
->tx_resid
&& /* frist sending */
87 tx
->tx_msg
.ksm_csum
== 0) /* not checksummed */
88 ksocknal_lib_csum_tx(tx
);
90 /* NB we can't trust socket ops to either consume our iovs
91 * or leave them alone. */
94 #if SOCKNAL_SINGLE_FRAG_TX
96 struct iovec
*scratchiov
= &scratch
;
97 unsigned int niov
= 1;
99 struct iovec
*scratchiov
= conn
->ksnc_scheduler
->kss_scratch_iov
;
100 unsigned int niov
= tx
->tx_niov
;
102 struct msghdr msg
= {
105 .msg_iov
= scratchiov
,
109 .msg_flags
= MSG_DONTWAIT
111 mm_segment_t oldmm
= get_fs();
114 for (nob
= i
= 0; i
< niov
; i
++) {
115 scratchiov
[i
] = tx
->tx_iov
[i
];
116 nob
+= scratchiov
[i
].iov_len
;
119 if (!list_empty(&conn
->ksnc_tx_queue
) ||
121 msg
.msg_flags
|= MSG_MORE
;
124 rc
= sock_sendmsg(sock
, &msg
, nob
);
131 ksocknal_lib_send_kiov (ksock_conn_t
*conn
, ksock_tx_t
*tx
)
133 struct socket
*sock
= conn
->ksnc_sock
;
134 lnet_kiov_t
*kiov
= tx
->tx_kiov
;
138 /* Not NOOP message */
139 LASSERT (tx
->tx_lnetmsg
!= NULL
);
141 /* NB we can't trust socket ops to either consume our iovs
142 * or leave them alone. */
143 if (tx
->tx_msg
.ksm_zc_cookies
[0] != 0) {
144 /* Zero copy is enabled */
145 struct sock
*sk
= sock
->sk
;
146 struct page
*page
= kiov
->kiov_page
;
147 int offset
= kiov
->kiov_offset
;
148 int fragsize
= kiov
->kiov_len
;
149 int msgflg
= MSG_DONTWAIT
;
151 CDEBUG(D_NET
, "page %p + offset %x for %d\n",
152 page
, offset
, kiov
->kiov_len
);
154 if (!list_empty(&conn
->ksnc_tx_queue
) ||
155 fragsize
< tx
->tx_resid
)
158 if (sk
->sk_prot
->sendpage
!= NULL
) {
159 rc
= sk
->sk_prot
->sendpage(sk
, page
,
160 offset
, fragsize
, msgflg
);
162 rc
= cfs_tcp_sendpage(sk
, page
, offset
, fragsize
,
166 #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
167 struct iovec scratch
;
168 struct iovec
*scratchiov
= &scratch
;
169 unsigned int niov
= 1;
171 #ifdef CONFIG_HIGHMEM
172 #warning "XXX risk of kmap deadlock on multiple frags..."
174 struct iovec
*scratchiov
= conn
->ksnc_scheduler
->kss_scratch_iov
;
175 unsigned int niov
= tx
->tx_nkiov
;
177 struct msghdr msg
= {
180 .msg_iov
= scratchiov
,
184 .msg_flags
= MSG_DONTWAIT
186 mm_segment_t oldmm
= get_fs();
189 for (nob
= i
= 0; i
< niov
; i
++) {
190 scratchiov
[i
].iov_base
= kmap(kiov
[i
].kiov_page
) +
192 nob
+= scratchiov
[i
].iov_len
= kiov
[i
].kiov_len
;
195 if (!list_empty(&conn
->ksnc_tx_queue
) ||
197 msg
.msg_flags
|= MSG_MORE
;
200 rc
= sock_sendmsg(sock
, &msg
, nob
);
203 for (i
= 0; i
< niov
; i
++)
204 kunmap(kiov
[i
].kiov_page
);
210 ksocknal_lib_eager_ack (ksock_conn_t
*conn
)
213 mm_segment_t oldmm
= get_fs();
214 struct socket
*sock
= conn
->ksnc_sock
;
216 /* Remind the socket to ACK eagerly. If I don't, the socket might
217 * think I'm about to send something it could piggy-back the ACK
218 * on, introducing delay in completing zero-copy sends in my
222 sock
->ops
->setsockopt (sock
, SOL_TCP
, TCP_QUICKACK
,
223 (char *)&opt
, sizeof (opt
));
228 ksocknal_lib_recv_iov (ksock_conn_t
*conn
)
230 #if SOCKNAL_SINGLE_FRAG_RX
231 struct iovec scratch
;
232 struct iovec
*scratchiov
= &scratch
;
233 unsigned int niov
= 1;
235 struct iovec
*scratchiov
= conn
->ksnc_scheduler
->kss_scratch_iov
;
236 unsigned int niov
= conn
->ksnc_rx_niov
;
238 struct iovec
*iov
= conn
->ksnc_rx_iov
;
239 struct msghdr msg
= {
242 .msg_iov
= scratchiov
,
248 mm_segment_t oldmm
= get_fs();
256 /* NB we can't trust socket ops to either consume our iovs
257 * or leave them alone. */
260 for (nob
= i
= 0; i
< niov
; i
++) {
261 scratchiov
[i
] = iov
[i
];
262 nob
+= scratchiov
[i
].iov_len
;
264 LASSERT (nob
<= conn
->ksnc_rx_nob_wanted
);
267 rc
= sock_recvmsg (conn
->ksnc_sock
, &msg
, nob
, MSG_DONTWAIT
);
268 /* NB this is just a boolean..........................^ */
272 if (conn
->ksnc_proto
== &ksocknal_protocol_v2x
) {
273 saved_csum
= conn
->ksnc_msg
.ksm_csum
;
274 conn
->ksnc_msg
.ksm_csum
= 0;
277 if (saved_csum
!= 0) {
278 /* accumulate checksum */
279 for (i
= 0, sum
= rc
; sum
> 0; i
++, sum
-= fragnob
) {
282 fragnob
= iov
[i
].iov_len
;
286 conn
->ksnc_rx_csum
= ksocknal_csum(conn
->ksnc_rx_csum
,
287 iov
[i
].iov_base
, fragnob
);
289 conn
->ksnc_msg
.ksm_csum
= saved_csum
;
296 ksocknal_lib_kiov_vunmap(void *addr
)
305 ksocknal_lib_kiov_vmap(lnet_kiov_t
*kiov
, int niov
,
306 struct iovec
*iov
, struct page
**pages
)
312 if (!*ksocknal_tunables
.ksnd_zc_recv
|| pages
== NULL
)
315 LASSERT (niov
<= LNET_MAX_IOV
);
318 niov
< *ksocknal_tunables
.ksnd_zc_recv_min_nfrags
)
321 for (nob
= i
= 0; i
< niov
; i
++) {
322 if ((kiov
[i
].kiov_offset
!= 0 && i
> 0) ||
323 (kiov
[i
].kiov_offset
+ kiov
[i
].kiov_len
!= PAGE_CACHE_SIZE
&& i
< niov
- 1))
326 pages
[i
] = kiov
[i
].kiov_page
;
327 nob
+= kiov
[i
].kiov_len
;
330 addr
= vmap(pages
, niov
, VM_MAP
, PAGE_KERNEL
);
334 iov
->iov_base
= addr
+ kiov
[0].kiov_offset
;
341 ksocknal_lib_recv_kiov (ksock_conn_t
*conn
)
343 #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
344 struct iovec scratch
;
345 struct iovec
*scratchiov
= &scratch
;
346 struct page
**pages
= NULL
;
347 unsigned int niov
= 1;
349 #ifdef CONFIG_HIGHMEM
350 #warning "XXX risk of kmap deadlock on multiple frags..."
352 struct iovec
*scratchiov
= conn
->ksnc_scheduler
->kss_scratch_iov
;
353 struct page
**pages
= conn
->ksnc_scheduler
->kss_rx_scratch_pgs
;
354 unsigned int niov
= conn
->ksnc_rx_nkiov
;
356 lnet_kiov_t
*kiov
= conn
->ksnc_rx_kiov
;
357 struct msghdr msg
= {
360 .msg_iov
= scratchiov
,
365 mm_segment_t oldmm
= get_fs();
374 /* NB we can't trust socket ops to either consume our iovs
375 * or leave them alone. */
376 addr
= ksocknal_lib_kiov_vmap(kiov
, niov
, scratchiov
, pages
);
378 nob
= scratchiov
[0].iov_len
;
382 for (nob
= i
= 0; i
< niov
; i
++) {
383 nob
+= scratchiov
[i
].iov_len
= kiov
[i
].kiov_len
;
384 scratchiov
[i
].iov_base
= kmap(kiov
[i
].kiov_page
) +
387 msg
.msg_iovlen
= niov
;
390 LASSERT (nob
<= conn
->ksnc_rx_nob_wanted
);
393 rc
= sock_recvmsg (conn
->ksnc_sock
, &msg
, nob
, MSG_DONTWAIT
);
394 /* NB this is just a boolean.......................^ */
397 if (conn
->ksnc_msg
.ksm_csum
!= 0) {
398 for (i
= 0, sum
= rc
; sum
> 0; i
++, sum
-= fragnob
) {
401 /* Dang! have to kmap again because I have nowhere to stash the
402 * mapped address. But by doing it while the page is still
403 * mapped, the kernel just bumps the map count and returns me
404 * the address it stashed. */
405 base
= kmap(kiov
[i
].kiov_page
) + kiov
[i
].kiov_offset
;
406 fragnob
= kiov
[i
].kiov_len
;
410 conn
->ksnc_rx_csum
= ksocknal_csum(conn
->ksnc_rx_csum
,
413 kunmap(kiov
[i
].kiov_page
);
418 ksocknal_lib_kiov_vunmap(addr
);
420 for (i
= 0; i
< niov
; i
++)
421 kunmap(kiov
[i
].kiov_page
);
428 ksocknal_lib_csum_tx(ksock_tx_t
*tx
)
434 LASSERT(tx
->tx_iov
[0].iov_base
== (void *)&tx
->tx_msg
);
435 LASSERT(tx
->tx_conn
!= NULL
);
436 LASSERT(tx
->tx_conn
->ksnc_proto
== &ksocknal_protocol_v2x
);
438 tx
->tx_msg
.ksm_csum
= 0;
440 csum
= ksocknal_csum(~0, (void *)tx
->tx_iov
[0].iov_base
,
441 tx
->tx_iov
[0].iov_len
);
443 if (tx
->tx_kiov
!= NULL
) {
444 for (i
= 0; i
< tx
->tx_nkiov
; i
++) {
445 base
= kmap(tx
->tx_kiov
[i
].kiov_page
) +
446 tx
->tx_kiov
[i
].kiov_offset
;
448 csum
= ksocknal_csum(csum
, base
, tx
->tx_kiov
[i
].kiov_len
);
450 kunmap(tx
->tx_kiov
[i
].kiov_page
);
453 for (i
= 1; i
< tx
->tx_niov
; i
++)
454 csum
= ksocknal_csum(csum
, tx
->tx_iov
[i
].iov_base
,
455 tx
->tx_iov
[i
].iov_len
);
458 if (*ksocknal_tunables
.ksnd_inject_csum_error
) {
460 *ksocknal_tunables
.ksnd_inject_csum_error
= 0;
463 tx
->tx_msg
.ksm_csum
= csum
;
467 ksocknal_lib_get_conn_tunables (ksock_conn_t
*conn
, int *txmem
, int *rxmem
, int *nagle
)
469 mm_segment_t oldmm
= get_fs ();
470 struct socket
*sock
= conn
->ksnc_sock
;
474 rc
= ksocknal_connsock_addref(conn
);
476 LASSERT (conn
->ksnc_closing
);
477 *txmem
= *rxmem
= *nagle
= 0;
481 rc
= libcfs_sock_getbuf(sock
, txmem
, rxmem
);
483 len
= sizeof(*nagle
);
485 rc
= sock
->ops
->getsockopt(sock
, SOL_TCP
, TCP_NODELAY
,
486 (char *)nagle
, &len
);
490 ksocknal_connsock_decref(conn
);
495 *txmem
= *rxmem
= *nagle
= 0;
501 ksocknal_lib_setup_sock (struct socket
*sock
)
503 mm_segment_t oldmm
= get_fs ();
510 struct linger linger
;
512 sock
->sk
->sk_allocation
= GFP_NOFS
;
514 /* Ensure this socket aborts active sends immediately when we close
521 rc
= sock_setsockopt (sock
, SOL_SOCKET
, SO_LINGER
,
522 (char *)&linger
, sizeof (linger
));
525 CERROR ("Can't set SO_LINGER: %d\n", rc
);
531 rc
= sock
->ops
->setsockopt (sock
, SOL_TCP
, TCP_LINGER2
,
532 (char *)&option
, sizeof (option
));
535 CERROR ("Can't set SO_LINGER2: %d\n", rc
);
539 if (!*ksocknal_tunables
.ksnd_nagle
) {
543 rc
= sock
->ops
->setsockopt (sock
, SOL_TCP
, TCP_NODELAY
,
544 (char *)&option
, sizeof (option
));
547 CERROR ("Can't disable nagle: %d\n", rc
);
552 rc
= libcfs_sock_setbuf(sock
,
553 *ksocknal_tunables
.ksnd_tx_buffer_size
,
554 *ksocknal_tunables
.ksnd_rx_buffer_size
);
556 CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
557 *ksocknal_tunables
.ksnd_tx_buffer_size
,
558 *ksocknal_tunables
.ksnd_rx_buffer_size
, rc
);
562 /* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
564 /* snapshot tunables */
565 keep_idle
= *ksocknal_tunables
.ksnd_keepalive_idle
;
566 keep_count
= *ksocknal_tunables
.ksnd_keepalive_count
;
567 keep_intvl
= *ksocknal_tunables
.ksnd_keepalive_intvl
;
569 do_keepalive
= (keep_idle
> 0 && keep_count
> 0 && keep_intvl
> 0);
571 option
= (do_keepalive
? 1 : 0);
573 rc
= sock_setsockopt (sock
, SOL_SOCKET
, SO_KEEPALIVE
,
574 (char *)&option
, sizeof (option
));
577 CERROR ("Can't set SO_KEEPALIVE: %d\n", rc
);
585 rc
= sock
->ops
->setsockopt (sock
, SOL_TCP
, TCP_KEEPIDLE
,
586 (char *)&keep_idle
, sizeof (keep_idle
));
589 CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc
);
594 rc
= sock
->ops
->setsockopt (sock
, SOL_TCP
, TCP_KEEPINTVL
,
595 (char *)&keep_intvl
, sizeof (keep_intvl
));
598 CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc
);
603 rc
= sock
->ops
->setsockopt (sock
, SOL_TCP
, TCP_KEEPCNT
,
604 (char *)&keep_count
, sizeof (keep_count
));
607 CERROR ("Can't set TCP_KEEPCNT: %d\n", rc
);
615 ksocknal_lib_push_conn (ksock_conn_t
*conn
)
624 rc
= ksocknal_connsock_addref(conn
);
625 if (rc
!= 0) /* being shut down */
628 sk
= conn
->ksnc_sock
->sk
;
632 nonagle
= tp
->nonagle
;
639 rc
= sk
->sk_prot
->setsockopt (sk
, SOL_TCP
, TCP_NODELAY
,
640 (char *)&val
, sizeof (val
));
646 tp
->nonagle
= nonagle
;
649 ksocknal_connsock_decref(conn
);
652 extern void ksocknal_read_callback (ksock_conn_t
*conn
);
653 extern void ksocknal_write_callback (ksock_conn_t
*conn
);
655 * socket call back in Linux
658 ksocknal_data_ready (struct sock
*sk
, int n
)
662 /* interleave correctly with closing sockets... */
664 read_lock(&ksocknal_data
.ksnd_global_lock
);
666 conn
= sk
->sk_user_data
;
667 if (conn
== NULL
) { /* raced with ksocknal_terminate_conn */
668 LASSERT (sk
->sk_data_ready
!= &ksocknal_data_ready
);
669 sk
->sk_data_ready (sk
, n
);
671 ksocknal_read_callback(conn
);
673 read_unlock(&ksocknal_data
.ksnd_global_lock
);
677 ksocknal_write_space (struct sock
*sk
)
683 /* interleave correctly with closing sockets... */
685 read_lock(&ksocknal_data
.ksnd_global_lock
);
687 conn
= sk
->sk_user_data
;
688 wspace
= SOCKNAL_WSPACE(sk
);
689 min_wpace
= SOCKNAL_MIN_WSPACE(sk
);
691 CDEBUG(D_NET
, "sk %p wspace %d low water %d conn %p%s%s%s\n",
692 sk
, wspace
, min_wpace
, conn
,
693 (conn
== NULL
) ? "" : (conn
->ksnc_tx_ready
?
694 " ready" : " blocked"),
695 (conn
== NULL
) ? "" : (conn
->ksnc_tx_scheduled
?
696 " scheduled" : " idle"),
697 (conn
== NULL
) ? "" : (list_empty (&conn
->ksnc_tx_queue
) ?
698 " empty" : " queued"));
700 if (conn
== NULL
) { /* raced with ksocknal_terminate_conn */
701 LASSERT (sk
->sk_write_space
!= &ksocknal_write_space
);
702 sk
->sk_write_space (sk
);
704 read_unlock(&ksocknal_data
.ksnd_global_lock
);
708 if (wspace
>= min_wpace
) { /* got enough space */
709 ksocknal_write_callback(conn
);
711 /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
712 * ENOMEM check in ksocknal_transmit is race-free (think about
715 clear_bit (SOCK_NOSPACE
, &sk
->sk_socket
->flags
);
718 read_unlock(&ksocknal_data
.ksnd_global_lock
);
722 ksocknal_lib_save_callback(struct socket
*sock
, ksock_conn_t
*conn
)
724 conn
->ksnc_saved_data_ready
= sock
->sk
->sk_data_ready
;
725 conn
->ksnc_saved_write_space
= sock
->sk
->sk_write_space
;
729 ksocknal_lib_set_callback(struct socket
*sock
, ksock_conn_t
*conn
)
731 sock
->sk
->sk_user_data
= conn
;
732 sock
->sk
->sk_data_ready
= ksocknal_data_ready
;
733 sock
->sk
->sk_write_space
= ksocknal_write_space
;
738 ksocknal_lib_reset_callback(struct socket
*sock
, ksock_conn_t
*conn
)
740 /* Remove conn's network callbacks.
741 * NB I _have_ to restore the callback, rather than storing a noop,
742 * since the socket could survive past this module being unloaded!! */
743 sock
->sk
->sk_data_ready
= conn
->ksnc_saved_data_ready
;
744 sock
->sk
->sk_write_space
= conn
->ksnc_saved_write_space
;
746 /* A callback could be in progress already; they hold a read lock
747 * on ksnd_global_lock (to serialise with me) and NOOP if
748 * sk_user_data is NULL. */
749 sock
->sk
->sk_user_data
= NULL
;
755 ksocknal_lib_memory_pressure(ksock_conn_t
*conn
)
758 ksock_sched_t
*sched
;
760 sched
= conn
->ksnc_scheduler
;
761 spin_lock_bh(&sched
->kss_lock
);
763 if (!SOCK_TEST_NOSPACE(conn
->ksnc_sock
) &&
764 !conn
->ksnc_tx_ready
) {
765 /* SOCK_NOSPACE is set when the socket fills
766 * and cleared in the write_space callback
767 * (which also sets ksnc_tx_ready). If
768 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
769 * zero, I didn't fill the socket and
770 * write_space won't reschedule me, so I
771 * return -ENOMEM to get my caller to retry
776 spin_unlock_bh(&sched
->kss_lock
);