/*
 * drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */

#include "socklnd.h"
int
ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
{
        int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
                                     &conn->ksnc_ipaddr, &conn->ksnc_port);

        /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
        LASSERT (!conn->ksnc_closing);

        if (rc != 0) {
                CERROR ("Error %d getting sock peer IP\n", rc);
                return rc;
        }

        rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
                                 &conn->ksnc_myipaddr, NULL);
        if (rc != 0) {
                CERROR ("Error %d getting sock local IP\n", rc);
                return rc;
        }

        return 0;
}
int
ksocknal_lib_zc_capable(ksock_conn_t *conn)
{
        int caps = conn->ksnc_sock->sk->sk_route_caps;

        if (conn->ksnc_proto == &ksocknal_protocol_v1x)
                return 0;

        /* ZC if the socket supports scatter/gather and doesn't need software
         * checksums */
        return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0);
}
int
ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
{
        struct socket *sock = conn->ksnc_sock;
        int nob;
        int rc;

        if (*ksocknal_tunables.ksnd_enable_csum &&        /* checksum enabled */
            conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection  */
            tx->tx_nob == tx->tx_resid &&                 /* first sending    */
            tx->tx_msg.ksm_csum == 0)                     /* not checksummed  */
                ksocknal_lib_csum_tx(tx);

        /* NB we can't trust socket ops to either consume our iovs
         * or leave them alone. */
        {
#if SOCKNAL_SINGLE_FRAG_TX
                struct iovec scratch;
                struct iovec *scratchiov = &scratch;
                unsigned int niov = 1;
#else
                struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
                unsigned int niov = tx->tx_niov;
#endif
                struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
                int i;

                for (nob = i = 0; i < niov; i++) {
                        scratchiov[i] = tx->tx_iov[i];
                        nob += scratchiov[i].iov_len;
                }

                if (!list_empty(&conn->ksnc_tx_queue) ||
                    nob < tx->tx_resid)
                        msg.msg_flags |= MSG_MORE;

                rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob);
        }
        return rc;
}
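
/*
 * Illustrative sketch (not part of the original file): the send path above
 * copies the tx iovs into a scratch array because the socket ops may modify
 * the vector they are given.  A minimal, hedged example of the same
 * kvec/kernel_sendmsg() pattern follows; the function name and parameters
 * are hypothetical.
 */
static int
example_send_two_buffers(struct socket *sock, void *hdr, size_t hdr_len,
                         void *payload, size_t payload_len)
{
        struct kvec iov[2] = {
                { .iov_base = hdr,     .iov_len = hdr_len },
                { .iov_base = payload, .iov_len = payload_len },
        };
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

        /* May return a short count; callers (like ksocknal_lib_send_iov()'s
         * caller) must be prepared to retry with the remainder. */
        return kernel_sendmsg(sock, &msg, iov, 2, hdr_len + payload_len);
}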
int
ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
{
        struct socket *sock = conn->ksnc_sock;
        lnet_kiov_t *kiov = tx->tx_kiov;
        int rc;
        int nob;

        /* Not NOOP message */
        LASSERT (tx->tx_lnetmsg != NULL);

        /* NB we can't trust socket ops to either consume our iovs
         * or leave them alone. */
        if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
                /* Zero copy is enabled */
                struct sock *sk = sock->sk;
                struct page *page = kiov->kiov_page;
                int offset = kiov->kiov_offset;
                int fragsize = kiov->kiov_len;
                int msgflg = MSG_DONTWAIT;

                CDEBUG(D_NET, "page %p + offset %x for %d\n",
                       page, offset, kiov->kiov_len);

                if (!list_empty(&conn->ksnc_tx_queue) ||
                    fragsize < tx->tx_resid)
                        msgflg |= MSG_MORE;

                if (sk->sk_prot->sendpage != NULL) {
                        rc = sk->sk_prot->sendpage(sk, page,
                                                   offset, fragsize, msgflg);
                } else {
                        rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
                                              msgflg);
                }
        } else {
#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
                struct iovec scratch;
                struct iovec *scratchiov = &scratch;
                unsigned int niov = 1;
#else
#ifdef CONFIG_HIGHMEM
#warning "XXX risk of kmap deadlock on multiple frags..."
#endif
                struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
                unsigned int niov = tx->tx_nkiov;
#endif
                struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
                int i;

                for (nob = i = 0; i < niov; i++) {
                        scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
                                                 kiov[i].kiov_offset;
                        nob += scratchiov[i].iov_len = kiov[i].kiov_len;
                }

                if (!list_empty(&conn->ksnc_tx_queue) ||
                    nob < tx->tx_resid)
                        msg.msg_flags |= MSG_MORE;

                rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob);

                for (i = 0; i < niov; i++)
                        kunmap(kiov[i].kiov_page);
        }
        return rc;
}
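
/*
 * Illustrative sketch (not part of the original file): the zero-copy branch
 * above calls sk->sk_prot->sendpage() directly.  On kernels of this vintage
 * the exported kernel_sendpage() wrapper expresses the same idea; the helper
 * name and parameters below are hypothetical.
 */
static int
example_zc_send_frag(struct socket *sock, struct page *page, int offset,
                     size_t len, bool more_to_come)
{
        int flags = MSG_DONTWAIT;

        /* MSG_MORE keeps TCP from pushing this fragment on its own when more
         * of the same message will follow (cf. the tx_resid check above). */
        if (more_to_come)
                flags |= MSG_MORE;

        return kernel_sendpage(sock, page, offset, len, flags);
}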
void
ksocknal_lib_eager_ack (ksock_conn_t *conn)
{
        int opt = 1;
        mm_segment_t oldmm = get_fs();
        struct socket *sock = conn->ksnc_sock;

        /* Remind the socket to ACK eagerly.  If I don't, the socket might
         * think I'm about to send something it could piggy-back the ACK
         * on, introducing delay in completing zero-copy sends in my
         * peer. */

        set_fs(KERNEL_DS);
        sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
                               (char *)&opt, sizeof (opt));
        set_fs(oldmm);
}
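
/*
 * Illustrative sketch (not part of the original file): the get_fs()/set_fs()
 * dance above is needed on this kernel to pass a kernel pointer through
 * ->setsockopt().  Where kernel_setsockopt() is available, the same
 * TCP_QUICKACK hint can be issued more directly; the helper name is
 * hypothetical.
 */
static void
example_request_quick_ack(struct socket *sock)
{
        int one = 1;

        /* Best-effort hint; the return value is ignored here just as it is
         * in ksocknal_lib_eager_ack(). */
        kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
                          (char *)&one, sizeof(one));
}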
int
ksocknal_lib_recv_iov (ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX
        struct iovec scratch;
        struct iovec *scratchiov = &scratch;
        unsigned int niov = 1;
#else
        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        unsigned int niov = conn->ksnc_rx_niov;
#endif
        struct iovec *iov = conn->ksnc_rx_iov;
        struct msghdr msg = {
                .msg_flags = 0
        };
        int nob;
        int i;
        int rc;
        int fragnob;
        int sum;
        __u32 saved_csum;

        /* NB we can't trust socket ops to either consume our iovs
         * or leave them alone. */
        for (nob = i = 0; i < niov; i++) {
                scratchiov[i] = iov[i];
                nob += scratchiov[i].iov_len;
        }
        LASSERT (nob <= conn->ksnc_rx_nob_wanted);

        rc = kernel_recvmsg(conn->ksnc_sock, &msg,
                            (struct kvec *)scratchiov, niov, nob, MSG_DONTWAIT);

        saved_csum = 0;
        if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
                saved_csum = conn->ksnc_msg.ksm_csum;
                conn->ksnc_msg.ksm_csum = 0;
        }

        if (saved_csum != 0) {
                /* accumulate checksum */
                for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
                        LASSERT (i < niov);

                        fragnob = iov[i].iov_len;
                        if (fragnob > sum)
                                fragnob = sum;

                        conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
                                                           iov[i].iov_base, fragnob);
                }
                conn->ksnc_msg.ksm_csum = saved_csum;
        }

        return rc;
}
static void
ksocknal_lib_kiov_vunmap(void *addr)
{
        if (addr == NULL)
                return;

        vunmap(addr);
}

static void *
ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
                       struct iovec *iov, struct page **pages)
{
        void *addr;
        int nob;
        int i;

        if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
                return NULL;

        LASSERT (niov <= LNET_MAX_IOV);

        if (niov < 2 ||
            niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
                return NULL;

        for (nob = i = 0; i < niov; i++) {
                if ((kiov[i].kiov_offset != 0 && i > 0) ||
                    (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE &&
                     i < niov - 1))
                        return NULL;

                pages[i] = kiov[i].kiov_page;
                nob += kiov[i].kiov_len;
        }

        addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
        if (addr == NULL)
                return NULL;

        iov->iov_base = addr + kiov[0].kiov_offset;
        iov->iov_len = nob;

        return addr;
}
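
/*
 * Illustrative sketch (not part of the original file): ksocknal_lib_kiov_vmap()
 * above stitches a run of page fragments into one contiguous kernel virtual
 * range so the receive path can use a single iovec.  A minimal, hedged
 * example of that vmap()/kernel_recvmsg()/vunmap() sequence follows; the
 * function name and parameters are hypothetical.
 */
static int
example_recv_into_mapped_pages(struct socket *sock, struct page **pages,
                               unsigned int npages, size_t nob)
{
        struct msghdr msg = { .msg_flags = 0 };
        struct kvec iov;
        void *addr;
        int rc;

        /* VM_MAP + PAGE_KERNEL gives a normal, cacheable kernel mapping of
         * the whole page array. */
        addr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
        if (addr == NULL)
                return -ENOMEM;

        iov.iov_base = addr;
        iov.iov_len = nob;

        rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, MSG_DONTWAIT);

        vunmap(addr);
        return rc;
}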
int
ksocknal_lib_recv_kiov (ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
        struct iovec scratch;
        struct iovec *scratchiov = &scratch;
        struct page **pages = NULL;
        unsigned int niov = 1;
#else
#ifdef CONFIG_HIGHMEM
#warning "XXX risk of kmap deadlock on multiple frags..."
#endif
        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs;
        unsigned int niov = conn->ksnc_rx_nkiov;
#endif
        lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
        struct msghdr msg = {
                .msg_flags = 0
        };
        int nob;
        int i;
        int rc;
        void *base;
        void *addr;
        int sum;
        int fragnob;
        int n;

        /* NB we can't trust socket ops to either consume our iovs
         * or leave them alone. */
        addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages);
        if (addr != NULL) {
                nob = scratchiov[0].iov_len;
                n = 1;
        } else {
                for (nob = i = 0; i < niov; i++) {
                        nob += scratchiov[i].iov_len = kiov[i].kiov_len;
                        scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
                                                 kiov[i].kiov_offset;
                }
                n = niov;
        }

        LASSERT (nob <= conn->ksnc_rx_nob_wanted);

        rc = kernel_recvmsg(conn->ksnc_sock, &msg,
                            (struct kvec *)scratchiov, n, nob, MSG_DONTWAIT);

        if (conn->ksnc_msg.ksm_csum != 0) {
                for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
                        LASSERT (i < niov);

                        /* Dang! have to kmap again because I have nowhere to stash the
                         * mapped address.  But by doing it while the page is still
                         * mapped, the kernel just bumps the map count and returns me
                         * the address it stashed. */
                        base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
                        fragnob = kiov[i].kiov_len;
                        if (fragnob > sum)
                                fragnob = sum;

                        conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
                                                           base, fragnob);

                        kunmap(kiov[i].kiov_page);
                }
        }

        if (addr != NULL) {
                ksocknal_lib_kiov_vunmap(addr);
        } else {
                for (i = 0; i < niov; i++)
                        kunmap(kiov[i].kiov_page);
        }

        return rc;
}
void
ksocknal_lib_csum_tx(ksock_tx_t *tx)
{
        int i;
        __u32 csum;
        void *base;

        LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
        LASSERT(tx->tx_conn != NULL);
        LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);

        tx->tx_msg.ksm_csum = 0;

        csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
                             tx->tx_iov[0].iov_len);

        if (tx->tx_kiov != NULL) {
                for (i = 0; i < tx->tx_nkiov; i++) {
                        base = kmap(tx->tx_kiov[i].kiov_page) +
                               tx->tx_kiov[i].kiov_offset;

                        csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);

                        kunmap(tx->tx_kiov[i].kiov_page);
                }
        } else {
                for (i = 1; i < tx->tx_niov; i++)
                        csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
                                             tx->tx_iov[i].iov_len);
        }

        if (*ksocknal_tunables.ksnd_inject_csum_error) {
                csum++;
                *ksocknal_tunables.ksnd_inject_csum_error = 0;
        }

        tx->tx_msg.ksm_csum = csum;
}
int
ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
{
        mm_segment_t oldmm = get_fs ();
        struct socket *sock = conn->ksnc_sock;
        int len;
        int rc;

        rc = ksocknal_connsock_addref(conn);
        if (rc != 0) {
                LASSERT (conn->ksnc_closing);
                *txmem = *rxmem = *nagle = 0;
                return -ESHUTDOWN;
        }

        rc = libcfs_sock_getbuf(sock, txmem, rxmem);
        if (rc == 0) {
                len = sizeof(*nagle);
                set_fs(KERNEL_DS);
                rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
                                           (char *)nagle, &len);
                set_fs(oldmm);
        }

        ksocknal_connsock_decref(conn);

        if (rc == 0)
                *nagle = !*nagle;
        else
                *txmem = *rxmem = *nagle = 0;

        return rc;
}
int
ksocknal_lib_setup_sock (struct socket *sock)
{
        mm_segment_t oldmm = get_fs ();
        int rc;
        int option;
        int keep_idle;
        int keep_intvl;
        int keep_count;
        int do_keepalive;
        struct linger linger;

        sock->sk->sk_allocation = GFP_NOFS;

        /* Ensure this socket aborts active sends immediately when we close
         * it. */
        linger.l_onoff = 0;
        linger.l_linger = 0;

        set_fs (KERNEL_DS);
        rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER,
                              (char *)&linger, sizeof (linger));
        set_fs (oldmm);
        if (rc != 0) {
                CERROR ("Can't set SO_LINGER: %d\n", rc);
                return rc;
        }

        option = -1;
        set_fs (KERNEL_DS);
        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2,
                                    (char *)&option, sizeof (option));
        set_fs (oldmm);
        if (rc != 0) {
                CERROR ("Can't set TCP_LINGER2: %d\n", rc);
                return rc;
        }

        if (!*ksocknal_tunables.ksnd_nagle) {
                option = 1;

                set_fs (KERNEL_DS);
                rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY,
                                            (char *)&option, sizeof (option));
                set_fs (oldmm);
                if (rc != 0) {
                        CERROR ("Can't disable nagle: %d\n", rc);
                        return rc;
                }
        }

        rc = libcfs_sock_setbuf(sock,
                                *ksocknal_tunables.ksnd_tx_buffer_size,
                                *ksocknal_tunables.ksnd_rx_buffer_size);
        if (rc != 0) {
                CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
                        *ksocknal_tunables.ksnd_tx_buffer_size,
                        *ksocknal_tunables.ksnd_rx_buffer_size, rc);
                return rc;
        }

        /* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */

        /* snapshot tunables */
        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
        keep_count = *ksocknal_tunables.ksnd_keepalive_count;
        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;

        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);

        option = (do_keepalive ? 1 : 0);

        set_fs (KERNEL_DS);
        rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE,
                              (char *)&option, sizeof (option));
        set_fs (oldmm);
        if (rc != 0) {
                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
                return rc;
        }

        if (!do_keepalive)
                return 0;

        set_fs (KERNEL_DS);
        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
                                    (char *)&keep_idle, sizeof (keep_idle));
        set_fs (oldmm);
        if (rc != 0) {
                CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
                return rc;
        }

        set_fs (KERNEL_DS);
        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
                                    (char *)&keep_intvl, sizeof (keep_intvl));
        set_fs (oldmm);
        if (rc != 0) {
                CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
                return rc;
        }

        set_fs (KERNEL_DS);
        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
                                    (char *)&keep_count, sizeof (keep_count));
        set_fs (oldmm);
        if (rc != 0) {
                CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
                return rc;
        }

        return 0;
}
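
/*
 * Illustrative sketch (not part of the original file): the keepalive block in
 * ksocknal_lib_setup_sock() above snapshots three tunables and pushes them to
 * the socket one option at a time.  Expressed through kernel_setsockopt()
 * (where it is available) the same sequence looks roughly like this; the
 * helper name and parameters are hypothetical.
 */
static int
example_enable_keepalive(struct socket *sock, int idle, int intvl, int count)
{
        int on = 1;
        int rc;

        rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
                               (char *)&on, sizeof(on));
        if (rc != 0)
                return rc;

        rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
                               (char *)&idle, sizeof(idle));
        if (rc != 0)
                return rc;

        rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
                               (char *)&intvl, sizeof(intvl));
        if (rc != 0)
                return rc;

        return kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
                                 (char *)&count, sizeof(count));
}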
void
ksocknal_lib_push_conn (ksock_conn_t *conn)
{
        struct sock *sk;
        struct tcp_sock *tp;
        int nonagle;
        int val = 1;
        int rc;
        mm_segment_t oldmm;

        rc = ksocknal_connsock_addref(conn);
        if (rc != 0)                    /* being shut down */
                return;

        sk = conn->ksnc_sock->sk;
        tp = tcp_sk(sk);

        lock_sock (sk);
        nonagle = tp->nonagle;
        tp->nonagle = 1;
        release_sock (sk);

        oldmm = get_fs ();
        set_fs (KERNEL_DS);

        rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
                                      (char *)&val, sizeof (val));
        LASSERT (rc == 0);

        set_fs (oldmm);

        lock_sock (sk);
        tp->nonagle = nonagle;
        release_sock (sk);

        ksocknal_connsock_decref(conn);
}
extern void ksocknal_read_callback (ksock_conn_t *conn);
extern void ksocknal_write_callback (ksock_conn_t *conn);
/*
 * socket callbacks in Linux
 */
static void
ksocknal_data_ready (struct sock *sk, int n)
{
        ksock_conn_t *conn;

        /* interleave correctly with closing sockets... */
        read_lock(&ksocknal_data.ksnd_global_lock);

        conn = sk->sk_user_data;
        if (conn == NULL) {             /* raced with ksocknal_terminate_conn */
                LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
                sk->sk_data_ready (sk, n);
        } else {
                ksocknal_read_callback(conn);
        }

        read_unlock(&ksocknal_data.ksnd_global_lock);
}
static void
ksocknal_write_space (struct sock *sk)
{
        ksock_conn_t *conn;
        int wspace;
        int min_wpace;

        /* interleave correctly with closing sockets... */
        read_lock(&ksocknal_data.ksnd_global_lock);

        conn = sk->sk_user_data;
        wspace = SOCKNAL_WSPACE(sk);
        min_wpace = SOCKNAL_MIN_WSPACE(sk);

        CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
               sk, wspace, min_wpace, conn,
               (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
                                      " ready" : " blocked"),
               (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
                                      " scheduled" : " idle"),
               (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
                                      " empty" : " queued"));

        if (conn == NULL) {             /* raced with ksocknal_terminate_conn */
                LASSERT (sk->sk_write_space != &ksocknal_write_space);
                sk->sk_write_space (sk);

                read_unlock(&ksocknal_data.ksnd_global_lock);
                return;
        }

        if (wspace >= min_wpace) {      /* got enough space */
                ksocknal_write_callback(conn);

                /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
                 * ENOMEM check in ksocknal_transmit is race-free (think about
                 * it). */
                clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
        }

        read_unlock(&ksocknal_data.ksnd_global_lock);
}
void
ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
{
        conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
        conn->ksnc_saved_write_space = sock->sk->sk_write_space;
}
void
ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
{
        sock->sk->sk_user_data = conn;
        sock->sk->sk_data_ready = ksocknal_data_ready;
        sock->sk->sk_write_space = ksocknal_write_space;
}
void
ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
{
        /* Remove conn's network callbacks.
         * NB I _have_ to restore the callback, rather than storing a noop,
         * since the socket could survive past this module being unloaded!! */
        sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
        sock->sk->sk_write_space = conn->ksnc_saved_write_space;

        /* A callback could be in progress already; they hold a read lock
         * on ksnd_global_lock (to serialise with me) and NOOP if
         * sk_user_data is NULL. */
        sock->sk->sk_user_data = NULL;
}
int
ksocknal_lib_memory_pressure(ksock_conn_t *conn)
{
        int rc = 0;
        ksock_sched_t *sched;

        sched = conn->ksnc_scheduler;
        spin_lock_bh(&sched->kss_lock);

        if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) &&
            !conn->ksnc_tx_ready) {
                /* SOCK_NOSPACE is set when the socket fills
                 * and cleared in the write_space callback
                 * (which also sets ksnc_tx_ready).  If
                 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
                 * zero, I didn't fill the socket and
                 * write_space won't reschedule me, so I
                 * return -ENOMEM to get my caller to retry
                 * my send */
                rc = -ENOMEM;
        }

        spin_unlock_bh(&sched->kss_lock);

        return rc;
}