]>
Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2011, 2012, Intel Corporation. | |
31 | */ | |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | */ | |
36 | ||
37 | #include "socklnd.h" | |
38 | ||
d7e09d03 PT |
39 | int |
40 | ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) | |
41 | { | |
42 | int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1, | |
43 | &conn->ksnc_ipaddr, | |
44 | &conn->ksnc_port); | |
45 | ||
46 | /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ | |
47 | LASSERT (!conn->ksnc_closing); | |
48 | ||
49 | if (rc != 0) { | |
50 | CERROR ("Error %d getting sock peer IP\n", rc); | |
51 | return rc; | |
52 | } | |
53 | ||
54 | rc = libcfs_sock_getaddr(conn->ksnc_sock, 0, | |
55 | &conn->ksnc_myipaddr, NULL); | |
56 | if (rc != 0) { | |
57 | CERROR ("Error %d getting sock local IP\n", rc); | |
58 | return rc; | |
59 | } | |
60 | ||
61 | return 0; | |
62 | } | |
63 | ||
64 | int | |
65 | ksocknal_lib_zc_capable(ksock_conn_t *conn) | |
66 | { | |
67 | int caps = conn->ksnc_sock->sk->sk_route_caps; | |
68 | ||
69 | if (conn->ksnc_proto == &ksocknal_protocol_v1x) | |
70 | return 0; | |
71 | ||
72 | /* ZC if the socket supports scatter/gather and doesn't need software | |
73 | * checksums */ | |
74 | return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0); | |
75 | } | |
76 | ||
/*
 * Send tx's iov fragments down conn's socket with sock_sendmsg().
 * Returns the number of bytes the socket accepted or a -ve errno.
 */
int
ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
{
	struct socket *sock = conn->ksnc_sock;
	int nob;
	int rc;

	/* Checksum the whole message once, before anything has been sent
	 * (tx_nob == tx_resid only on the first attempt). */
	if (*ksocknal_tunables.ksnd_enable_csum &&	/* checksum enabled */
	    conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
	    tx->tx_nob == tx->tx_resid &&		/* first sending */
	    tx->tx_msg.ksm_csum == 0)			/* not checksummed */
		ksocknal_lib_csum_tx(tx);

	/* NB we can't trust socket ops to either consume our iovs
	 * or leave them alone. */

	{
#if SOCKNAL_SINGLE_FRAG_TX
		struct iovec scratch;
		struct iovec *scratchiov = &scratch;
		unsigned int niov = 1;
#else
		struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
		unsigned int niov = tx->tx_niov;
#endif
		struct msghdr msg = {
			.msg_name = NULL,
			.msg_namelen = 0,
			.msg_iov = scratchiov,
			.msg_iovlen = niov,
			.msg_control = NULL,
			.msg_controllen = 0,
			.msg_flags = MSG_DONTWAIT
		};
		mm_segment_t oldmm = get_fs();
		int i;

		/* copy iovs into scratch space the socket is free to modify */
		for (nob = i = 0; i < niov; i++) {
			scratchiov[i] = tx->tx_iov[i];
			nob += scratchiov[i].iov_len;
		}

		/* more data follows: other txs queued, or this tx not
		 * completed by this send */
		if (!list_empty(&conn->ksnc_tx_queue) ||
		    nob < tx->tx_resid)
			msg.msg_flags |= MSG_MORE;

		/* buffers are kernel-space; lift the user addr-limit check */
		set_fs (KERNEL_DS);
		rc = sock_sendmsg(sock, &msg, nob);
		set_fs (oldmm);
	}
	return rc;
}
129 | ||
/*
 * Send tx's kiov (page) fragments down conn's socket.  When a
 * zero-copy cookie is set, one fragment is sent per call via the TCP
 * sendpage path; otherwise the pages are kmapped and sent through
 * sock_sendmsg().  Returns bytes accepted by the socket or -ve errno.
 */
int
ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
{
	struct socket *sock = conn->ksnc_sock;
	lnet_kiov_t *kiov = tx->tx_kiov;
	int rc;
	int nob;

	/* Not NOOP message */
	LASSERT (tx->tx_lnetmsg != NULL);

	/* NB we can't trust socket ops to either consume our iovs
	 * or leave them alone. */
	if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
		/* Zero copy is enabled */
		struct sock *sk = sock->sk;
		struct page *page = kiov->kiov_page;	/* first fragment only */
		int offset = kiov->kiov_offset;
		int fragsize = kiov->kiov_len;
		int msgflg = MSG_DONTWAIT;

		CDEBUG(D_NET, "page %p + offset %x for %d\n",
		       page, offset, kiov->kiov_len);

		if (!list_empty(&conn->ksnc_tx_queue) ||
		    fragsize < tx->tx_resid)
			msgflg |= MSG_MORE;

		/* prefer the protocol's own sendpage; fall back otherwise */
		if (sk->sk_prot->sendpage != NULL) {
			rc = sk->sk_prot->sendpage(sk, page,
						   offset, fragsize, msgflg);
		} else {
			rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
					      msgflg);
		}
	} else {
#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
		struct iovec scratch;
		struct iovec *scratchiov = &scratch;
		unsigned int niov = 1;
#else
#ifdef CONFIG_HIGHMEM
#warning "XXX risk of kmap deadlock on multiple frags..."
#endif
		struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
		unsigned int niov = tx->tx_nkiov;
#endif
		struct msghdr msg = {
			.msg_name = NULL,
			.msg_namelen = 0,
			.msg_iov = scratchiov,
			.msg_iovlen = niov,
			.msg_control = NULL,
			.msg_controllen = 0,
			.msg_flags = MSG_DONTWAIT
		};
		mm_segment_t oldmm = get_fs();
		int i;

		/* kmap every page; unmapped again below after the send */
		for (nob = i = 0; i < niov; i++) {
			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
						 kiov[i].kiov_offset;
			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
		}

		if (!list_empty(&conn->ksnc_tx_queue) ||
		    nob < tx->tx_resid)
			msg.msg_flags |= MSG_MORE;

		set_fs (KERNEL_DS);
		rc = sock_sendmsg(sock, &msg, nob);
		set_fs (oldmm);

		for (i = 0; i < niov; i++)
			kunmap(kiov[i].kiov_page);
	}
	return rc;
}
208 | ||
209 | void | |
210 | ksocknal_lib_eager_ack (ksock_conn_t *conn) | |
211 | { | |
212 | int opt = 1; | |
213 | mm_segment_t oldmm = get_fs(); | |
214 | struct socket *sock = conn->ksnc_sock; | |
215 | ||
216 | /* Remind the socket to ACK eagerly. If I don't, the socket might | |
217 | * think I'm about to send something it could piggy-back the ACK | |
218 | * on, introducing delay in completing zero-copy sends in my | |
219 | * peer. */ | |
220 | ||
221 | set_fs(KERNEL_DS); | |
222 | sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK, | |
223 | (char *)&opt, sizeof (opt)); | |
224 | set_fs(oldmm); | |
225 | } | |
226 | ||
/*
 * Receive into conn's rx iov fragments.  Returns bytes received (or a
 * -ve errno from sock_recvmsg()).  On v2.x connections the received
 * bytes are folded into conn->ksnc_rx_csum.
 */
int
ksocknal_lib_recv_iov (ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX
	struct iovec scratch;
	struct iovec *scratchiov = &scratch;
	unsigned int niov = 1;
#else
	struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
	unsigned int niov = conn->ksnc_rx_niov;
#endif
	struct iovec *iov = conn->ksnc_rx_iov;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_iov = scratchiov,
		.msg_iovlen = niov,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = 0
	};
	mm_segment_t oldmm = get_fs();
	int nob;
	int i;
	int rc;
	int fragnob;
	int sum;
	__u32 saved_csum;

	/* NB we can't trust socket ops to either consume our iovs
	 * or leave them alone. */
	LASSERT (niov > 0);

	/* copy iovs into scratch space the socket is free to modify */
	for (nob = i = 0; i < niov; i++) {
		scratchiov[i] = iov[i];
		nob += scratchiov[i].iov_len;
	}
	LASSERT (nob <= conn->ksnc_rx_nob_wanted);

	set_fs (KERNEL_DS);
	rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
	/* NB this is just a boolean..........................^ */
	set_fs (oldmm);

	/* save and zero ksm_csum while accumulating; restored below */
	saved_csum = 0;
	if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
		saved_csum = conn->ksnc_msg.ksm_csum;
		conn->ksnc_msg.ksm_csum = 0;
	}

	if (saved_csum != 0) {
		/* accumulate checksum over the rc bytes just received,
		 * walking the original (unmodified) iovs */
		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
			LASSERT (i < niov);

			fragnob = iov[i].iov_len;
			if (fragnob > sum)
				fragnob = sum;

			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
							   iov[i].iov_base, fragnob);
		}
		conn->ksnc_msg.ksm_csum = saved_csum;
	}

	return rc;
}
294 | ||
295 | static void | |
296 | ksocknal_lib_kiov_vunmap(void *addr) | |
297 | { | |
298 | if (addr == NULL) | |
299 | return; | |
300 | ||
301 | vunmap(addr); | |
302 | } | |
303 | ||
/*
 * Try to vmap() the kiov pages into one contiguous virtual range so a
 * multi-fragment receive needs only a single iovec.  On success fills
 * *iov and returns the mapped address, which the caller must release
 * with ksocknal_lib_kiov_vunmap(); returns NULL when mapping is
 * disabled, not worthwhile, or impossible.
 */
static void *
ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
		       struct iovec *iov, struct page **pages)
{
	void *addr;
	int nob;
	int i;

	/* ZC receive disabled, or caller has no scratch page array */
	if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
		return NULL;

	LASSERT (niov <= LNET_MAX_IOV);

	/* too few fragments to be worth the vmap overhead */
	if (niov < 2 ||
	    niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
		return NULL;

	/* Fragments must tile contiguously: every frag but the first
	 * starts at offset 0, every frag but the last ends on a page
	 * boundary.  Otherwise give up. */
	for (nob = i = 0; i < niov; i++) {
		if ((kiov[i].kiov_offset != 0 && i > 0) ||
		    (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1))
			return NULL;

		pages[i] = kiov[i].kiov_page;
		nob += kiov[i].kiov_len;
	}

	addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
	if (addr == NULL)
		return NULL;

	iov->iov_base = addr + kiov[0].kiov_offset;
	iov->iov_len = nob;

	return addr;
}
339 | ||
/*
 * Receive into conn's rx kiov (page) fragments.  Prefers a single
 * vmapped region; falls back to kmapping each page.  Returns bytes
 * received (or -ve errno), folding them into ksnc_rx_csum when the
 * peer sent a checksum.
 */
int
ksocknal_lib_recv_kiov (ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
	struct iovec scratch;
	struct iovec *scratchiov = &scratch;
	struct page **pages = NULL;	/* forces the kmap fallback below */
	unsigned int niov = 1;
#else
#ifdef CONFIG_HIGHMEM
#warning "XXX risk of kmap deadlock on multiple frags..."
#endif
	struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
	struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs;
	unsigned int niov = conn->ksnc_rx_nkiov;
#endif
	lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_iov = scratchiov,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = 0
	};
	mm_segment_t oldmm = get_fs();
	int nob;
	int i;
	int rc;
	void *base;
	void *addr;
	int sum;
	int fragnob;

	/* NB we can't trust socket ops to either consume our iovs
	 * or leave them alone. */
	addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages);
	if (addr != NULL) {
		/* all fragments mapped into one contiguous iovec */
		nob = scratchiov[0].iov_len;
		msg.msg_iovlen = 1;

	} else {
		/* kmap each page individually; unmapped again below */
		for (nob = i = 0; i < niov; i++) {
			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
						 kiov[i].kiov_offset;
		}
		msg.msg_iovlen = niov;
	}

	LASSERT (nob <= conn->ksnc_rx_nob_wanted);

	set_fs (KERNEL_DS);
	rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
	/* NB this is just a boolean.......................^ */
	set_fs (oldmm);

	if (conn->ksnc_msg.ksm_csum != 0) {
		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
			LASSERT (i < niov);

			/* Dang! have to kmap again because I have nowhere to stash the
			 * mapped address.  But by doing it while the page is still
			 * mapped, the kernel just bumps the map count and returns me
			 * the address it stashed. */
			base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
			fragnob = kiov[i].kiov_len;
			if (fragnob > sum)
				fragnob = sum;

			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
							   base, fragnob);

			kunmap(kiov[i].kiov_page);
		}
	}

	/* release whichever mapping was set up above */
	if (addr != NULL) {
		ksocknal_lib_kiov_vunmap(addr);
	} else {
		for (i = 0; i < niov; i++)
			kunmap(kiov[i].kiov_page);
	}

	return (rc);
}
426 | ||
/*
 * Compute the checksum over tx's whole message (header in iov[0] plus
 * either the kiov pages or the remaining iovs) and store it in
 * ksm_csum.  Only valid on v2.x connections.
 */
void
ksocknal_lib_csum_tx(ksock_tx_t *tx)
{
	int i;
	__u32 csum;
	void *base;

	LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
	LASSERT(tx->tx_conn != NULL);
	LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);

	/* zero the field so it doesn't contribute to its own checksum */
	tx->tx_msg.ksm_csum = 0;

	csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
			     tx->tx_iov[0].iov_len);

	if (tx->tx_kiov != NULL) {
		/* payload is in page fragments; map each briefly */
		for (i = 0; i < tx->tx_nkiov; i++) {
			base = kmap(tx->tx_kiov[i].kiov_page) +
			       tx->tx_kiov[i].kiov_offset;

			csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);

			kunmap(tx->tx_kiov[i].kiov_page);
		}
	} else {
		/* iov[0] (the header) was checksummed above; do the rest */
		for (i = 1; i < tx->tx_niov; i++)
			csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
					     tx->tx_iov[i].iov_len);
	}

	/* fault injection: corrupt exactly one checksum, then disarm */
	if (*ksocknal_tunables.ksnd_inject_csum_error) {
		csum++;
		*ksocknal_tunables.ksnd_inject_csum_error = 0;
	}

	tx->tx_msg.ksm_csum = csum;
}
465 | ||
/*
 * Report the socket's tx/rx buffer sizes and whether Nagle is enabled
 * (*nagle is the inverse of TCP_NODELAY).  Returns 0, -ESHUTDOWN if
 * the conn is closing, or the error from the sockopt calls; all three
 * outputs are zeroed on any failure.
 */
int
ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
{
	mm_segment_t oldmm = get_fs ();
	struct socket *sock = conn->ksnc_sock;
	int len;
	int rc;

	/* pin the socket so it can't be closed under us */
	rc = ksocknal_connsock_addref(conn);
	if (rc != 0) {
		LASSERT (conn->ksnc_closing);
		*txmem = *rxmem = *nagle = 0;
		return (-ESHUTDOWN);
	}

	rc = libcfs_sock_getbuf(sock, txmem, rxmem);
	if (rc == 0) {
		len = sizeof(*nagle);
		set_fs(KERNEL_DS);
		rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
					   (char *)nagle, &len);
		set_fs(oldmm);
	}

	ksocknal_connsock_decref(conn);

	if (rc == 0)
		*nagle = !*nagle;	/* TCP_NODELAY on means Nagle off */
	else
		*txmem = *rxmem = *nagle = 0;

	return (rc);
}
499 | ||
500 | int | |
501 | ksocknal_lib_setup_sock (struct socket *sock) | |
502 | { | |
503 | mm_segment_t oldmm = get_fs (); | |
504 | int rc; | |
505 | int option; | |
506 | int keep_idle; | |
507 | int keep_intvl; | |
508 | int keep_count; | |
509 | int do_keepalive; | |
510 | struct linger linger; | |
511 | ||
512 | sock->sk->sk_allocation = GFP_NOFS; | |
513 | ||
514 | /* Ensure this socket aborts active sends immediately when we close | |
515 | * it. */ | |
516 | ||
517 | linger.l_onoff = 0; | |
518 | linger.l_linger = 0; | |
519 | ||
520 | set_fs (KERNEL_DS); | |
521 | rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER, | |
522 | (char *)&linger, sizeof (linger)); | |
523 | set_fs (oldmm); | |
524 | if (rc != 0) { | |
525 | CERROR ("Can't set SO_LINGER: %d\n", rc); | |
526 | return (rc); | |
527 | } | |
528 | ||
529 | option = -1; | |
530 | set_fs (KERNEL_DS); | |
531 | rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2, | |
532 | (char *)&option, sizeof (option)); | |
533 | set_fs (oldmm); | |
534 | if (rc != 0) { | |
535 | CERROR ("Can't set SO_LINGER2: %d\n", rc); | |
536 | return (rc); | |
537 | } | |
538 | ||
539 | if (!*ksocknal_tunables.ksnd_nagle) { | |
540 | option = 1; | |
541 | ||
542 | set_fs (KERNEL_DS); | |
543 | rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY, | |
544 | (char *)&option, sizeof (option)); | |
545 | set_fs (oldmm); | |
546 | if (rc != 0) { | |
547 | CERROR ("Can't disable nagle: %d\n", rc); | |
548 | return (rc); | |
549 | } | |
550 | } | |
551 | ||
552 | rc = libcfs_sock_setbuf(sock, | |
553 | *ksocknal_tunables.ksnd_tx_buffer_size, | |
554 | *ksocknal_tunables.ksnd_rx_buffer_size); | |
555 | if (rc != 0) { | |
556 | CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", | |
557 | *ksocknal_tunables.ksnd_tx_buffer_size, | |
558 | *ksocknal_tunables.ksnd_rx_buffer_size, rc); | |
559 | return (rc); | |
560 | } | |
561 | ||
562 | /* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ | |
563 | ||
564 | /* snapshot tunables */ | |
565 | keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; | |
566 | keep_count = *ksocknal_tunables.ksnd_keepalive_count; | |
567 | keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; | |
568 | ||
569 | do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); | |
570 | ||
571 | option = (do_keepalive ? 1 : 0); | |
572 | set_fs (KERNEL_DS); | |
573 | rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, | |
574 | (char *)&option, sizeof (option)); | |
575 | set_fs (oldmm); | |
576 | if (rc != 0) { | |
577 | CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); | |
578 | return (rc); | |
579 | } | |
580 | ||
581 | if (!do_keepalive) | |
582 | return (0); | |
583 | ||
584 | set_fs (KERNEL_DS); | |
585 | rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE, | |
586 | (char *)&keep_idle, sizeof (keep_idle)); | |
587 | set_fs (oldmm); | |
588 | if (rc != 0) { | |
589 | CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc); | |
590 | return (rc); | |
591 | } | |
592 | ||
593 | set_fs (KERNEL_DS); | |
594 | rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL, | |
595 | (char *)&keep_intvl, sizeof (keep_intvl)); | |
596 | set_fs (oldmm); | |
597 | if (rc != 0) { | |
598 | CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); | |
599 | return (rc); | |
600 | } | |
601 | ||
602 | set_fs (KERNEL_DS); | |
603 | rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT, | |
604 | (char *)&keep_count, sizeof (keep_count)); | |
605 | set_fs (oldmm); | |
606 | if (rc != 0) { | |
607 | CERROR ("Can't set TCP_KEEPCNT: %d\n", rc); | |
608 | return (rc); | |
609 | } | |
610 | ||
611 | return (0); | |
612 | } | |
613 | ||
/*
 * Push queued data out on conn's socket immediately by briefly
 * forcing Nagle off (TCP_NODELAY), then restoring the previous
 * nonagle setting.  NOOPs if the conn is being shut down.
 */
void
ksocknal_lib_push_conn (ksock_conn_t *conn)
{
	struct sock *sk;
	struct tcp_sock *tp;
	int nonagle;
	int val = 1;
	int rc;
	mm_segment_t oldmm;

	rc = ksocknal_connsock_addref(conn);
	if (rc != 0)			/* being shut down */
		return;

	sk = conn->ksnc_sock->sk;
	tp = tcp_sk(sk);

	/* remember the current nonagle setting and force it on */
	lock_sock (sk);
	nonagle = tp->nonagle;
	tp->nonagle = 1;
	release_sock (sk);

	oldmm = get_fs ();
	set_fs (KERNEL_DS);

	/* setting TCP_NODELAY flushes the send queue */
	rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
				      (char *)&val, sizeof (val));
	LASSERT (rc == 0);

	set_fs (oldmm);

	/* restore the setting the connection had before */
	lock_sock (sk);
	tp->nonagle = nonagle;
	release_sock (sk);

	ksocknal_connsock_decref(conn);
}
651 | ||
652 | extern void ksocknal_read_callback (ksock_conn_t *conn); | |
653 | extern void ksocknal_write_callback (ksock_conn_t *conn); | |
654 | /* | |
655 | * socket call back in Linux | |
656 | */ | |
/*
 * Socket data-ready callback: hand the event to the socklnd scheduler,
 * or chain to the socket's original callback if the conn has already
 * been terminated (sk_user_data cleared).
 */
static void
ksocknal_data_ready (struct sock *sk, int n)
{
	ksock_conn_t *conn;

	/* interleave correctly with closing sockets... */
	LASSERT(!in_irq());
	read_lock(&ksocknal_data.ksnd_global_lock);

	conn = sk->sk_user_data;
	if (conn == NULL) {	/* raced with ksocknal_terminate_conn */
		LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
		sk->sk_data_ready (sk, n);
	} else
		ksocknal_read_callback(conn);

	read_unlock(&ksocknal_data.ksnd_global_lock);
}
675 | ||
676 | static void | |
677 | ksocknal_write_space (struct sock *sk) | |
678 | { | |
679 | ksock_conn_t *conn; | |
680 | int wspace; | |
681 | int min_wpace; | |
682 | ||
683 | /* interleave correctly with closing sockets... */ | |
684 | LASSERT(!in_irq()); | |
685 | read_lock(&ksocknal_data.ksnd_global_lock); | |
686 | ||
687 | conn = sk->sk_user_data; | |
688 | wspace = SOCKNAL_WSPACE(sk); | |
689 | min_wpace = SOCKNAL_MIN_WSPACE(sk); | |
690 | ||
691 | CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", | |
692 | sk, wspace, min_wpace, conn, | |
693 | (conn == NULL) ? "" : (conn->ksnc_tx_ready ? | |
694 | " ready" : " blocked"), | |
695 | (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? | |
696 | " scheduled" : " idle"), | |
697 | (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? | |
698 | " empty" : " queued")); | |
699 | ||
700 | if (conn == NULL) { /* raced with ksocknal_terminate_conn */ | |
701 | LASSERT (sk->sk_write_space != &ksocknal_write_space); | |
702 | sk->sk_write_space (sk); | |
703 | ||
704 | read_unlock(&ksocknal_data.ksnd_global_lock); | |
705 | return; | |
706 | } | |
707 | ||
708 | if (wspace >= min_wpace) { /* got enough space */ | |
709 | ksocknal_write_callback(conn); | |
710 | ||
711 | /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the | |
712 | * ENOMEM check in ksocknal_transmit is race-free (think about | |
713 | * it). */ | |
714 | ||
715 | clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); | |
716 | } | |
717 | ||
718 | read_unlock(&ksocknal_data.ksnd_global_lock); | |
719 | } | |
720 | ||
721 | void | |
722 | ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) | |
723 | { | |
724 | conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; | |
725 | conn->ksnc_saved_write_space = sock->sk->sk_write_space; | |
726 | } | |
727 | ||
728 | void | |
729 | ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) | |
730 | { | |
731 | sock->sk->sk_user_data = conn; | |
732 | sock->sk->sk_data_ready = ksocknal_data_ready; | |
733 | sock->sk->sk_write_space = ksocknal_write_space; | |
734 | return; | |
735 | } | |
736 | ||
737 | void | |
738 | ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) | |
739 | { | |
740 | /* Remove conn's network callbacks. | |
741 | * NB I _have_ to restore the callback, rather than storing a noop, | |
742 | * since the socket could survive past this module being unloaded!! */ | |
743 | sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; | |
744 | sock->sk->sk_write_space = conn->ksnc_saved_write_space; | |
745 | ||
746 | /* A callback could be in progress already; they hold a read lock | |
747 | * on ksnd_global_lock (to serialise with me) and NOOP if | |
748 | * sk_user_data is NULL. */ | |
749 | sock->sk->sk_user_data = NULL; | |
750 | ||
751 | return ; | |
752 | } | |
753 | ||
754 | int | |
755 | ksocknal_lib_memory_pressure(ksock_conn_t *conn) | |
756 | { | |
757 | int rc = 0; | |
758 | ksock_sched_t *sched; | |
759 | ||
760 | sched = conn->ksnc_scheduler; | |
761 | spin_lock_bh(&sched->kss_lock); | |
762 | ||
763 | if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) && | |
764 | !conn->ksnc_tx_ready) { | |
765 | /* SOCK_NOSPACE is set when the socket fills | |
766 | * and cleared in the write_space callback | |
767 | * (which also sets ksnc_tx_ready). If | |
768 | * SOCK_NOSPACE and ksnc_tx_ready are BOTH | |
769 | * zero, I didn't fill the socket and | |
770 | * write_space won't reschedule me, so I | |
771 | * return -ENOMEM to get my caller to retry | |
772 | * after a timeout */ | |
773 | rc = -ENOMEM; | |
774 | } | |
775 | ||
776 | spin_unlock_bh(&sched->kss_lock); | |
777 | ||
778 | return rc; | |
779 | } |