drbd: Remove redundant and wrong test for NULL simplification in conn_connect()
b411b363
PR
1/*
2 drbd_receiver.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25
b411b363
PR
26#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <net/sock.h>
30
b411b363
PR
31#include <linux/drbd.h>
32#include <linux/fs.h>
33#include <linux/file.h>
34#include <linux/in.h>
35#include <linux/mm.h>
36#include <linux/memcontrol.h>
37#include <linux/mm_inline.h>
38#include <linux/slab.h>
b411b363
PR
39#include <linux/pkt_sched.h>
40#define __KERNEL_SYSCALLS__
41#include <linux/unistd.h>
42#include <linux/vmalloc.h>
43#include <linux/random.h>
b411b363
PR
44#include <linux/string.h>
45#include <linux/scatterlist.h>
46#include "drbd_int.h"
b411b363
PR
47#include "drbd_req.h"
48
49#include "drbd_vli.h"
50
77351055
PR
51struct packet_info {
52 enum drbd_packet cmd;
e2857216
AG
53 unsigned int size;
54 unsigned int vnr;
e658983a 55 void *data;
77351055
PR
56};
57
b411b363
PR
58enum finish_epoch {
59 FE_STILL_LIVE,
60 FE_DESTROYED,
61 FE_RECYCLED,
62};
63
6038178e 64static int drbd_do_features(struct drbd_tconn *tconn);
13e6037d 65static int drbd_do_auth(struct drbd_tconn *tconn);
c141ebda 66static int drbd_disconnected(struct drbd_conf *mdev);
b411b363 67
1e9dd291 68static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event);
99920dc5 69static int e_end_block(struct drbd_work *, int);
b411b363 70
b411b363
PR
71
72#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
73
45bb912b
LE
74/*
 75 * some helper functions to deal with singly-linked page lists,
76 * page->private being our "next" pointer.
77 */
78
79/* If at least n pages are linked at head, get n pages off.
80 * Otherwise, don't modify head, and return NULL.
81 * Locking is the responsibility of the caller.
82 */
83static struct page *page_chain_del(struct page **head, int n)
84{
85 struct page *page;
86 struct page *tmp;
87
88 BUG_ON(!n);
89 BUG_ON(!head);
90
91 page = *head;
23ce4227
PR
92
93 if (!page)
94 return NULL;
95
45bb912b
LE
96 while (page) {
97 tmp = page_chain_next(page);
98 if (--n == 0)
99 break; /* found sufficient pages */
100 if (tmp == NULL)
101 /* insufficient pages, don't use any of them. */
102 return NULL;
103 page = tmp;
104 }
105
106 /* add end of list marker for the returned list */
107 set_page_private(page, 0);
108 /* actual return value, and adjustment of head */
109 page = *head;
110 *head = tmp;
111 return page;
112}
113
114/* may be used outside of locks to find the tail of a (usually short)
115 * "private" page chain, before adding it back to a global chain head
116 * with page_chain_add() under a spinlock. */
117static struct page *page_chain_tail(struct page *page, int *len)
118{
119 struct page *tmp;
120 int i = 1;
121 while ((tmp = page_chain_next(page)))
122 ++i, page = tmp;
123 if (len)
124 *len = i;
125 return page;
126}
127
128static int page_chain_free(struct page *page)
129{
130 struct page *tmp;
131 int i = 0;
132 page_chain_for_each_safe(page, tmp) {
133 put_page(page);
134 ++i;
135 }
136 return i;
137}
138
139static void page_chain_add(struct page **head,
140 struct page *chain_first, struct page *chain_last)
141{
142#if 1
143 struct page *tmp;
144 tmp = page_chain_tail(chain_first, NULL);
145 BUG_ON(tmp != chain_last);
146#endif
147
148 /* add chain to head */
149 set_page_private(chain_last, (unsigned long)*head);
150 *head = chain_first;
151}
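/*
 * A minimal usage sketch (illustrative only, not part of the original file):
 * walking such a chain.  The chain is linked through page->private, so the
 * page_chain_next()/page_chain_for_each() helpers (declared in drbd_int.h)
 * are all that is needed to traverse it, exactly as the functions above do.
 */
static unsigned int page_chain_count(struct page *head)
{
	struct page *page = head;
	unsigned int n = 0;

	page_chain_for_each(page)
		n++;
	return n;
}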
152
18c2d522
AG
153static struct page *__drbd_alloc_pages(struct drbd_conf *mdev,
154 unsigned int number)
b411b363
PR
155{
156 struct page *page = NULL;
45bb912b 157 struct page *tmp = NULL;
18c2d522 158 unsigned int i = 0;
b411b363
PR
159
160 /* Yes, testing drbd_pp_vacant outside the lock is racy.
161 * So what. It saves a spin_lock. */
45bb912b 162 if (drbd_pp_vacant >= number) {
b411b363 163 spin_lock(&drbd_pp_lock);
45bb912b
LE
164 page = page_chain_del(&drbd_pp_pool, number);
165 if (page)
166 drbd_pp_vacant -= number;
b411b363 167 spin_unlock(&drbd_pp_lock);
45bb912b
LE
168 if (page)
169 return page;
b411b363 170 }
45bb912b 171
b411b363
PR
172 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
173 * "criss-cross" setup, that might cause write-out on some other DRBD,
174 * which in turn might block on the other node at this very place. */
45bb912b
LE
175 for (i = 0; i < number; i++) {
176 tmp = alloc_page(GFP_TRY);
177 if (!tmp)
178 break;
179 set_page_private(tmp, (unsigned long)page);
180 page = tmp;
181 }
182
183 if (i == number)
184 return page;
185
186 /* Not enough pages immediately available this time.
c37c8ecf 187 * No need to jump around here, drbd_alloc_pages will retry this
45bb912b
LE
188 * function "soon". */
189 if (page) {
190 tmp = page_chain_tail(page, NULL);
191 spin_lock(&drbd_pp_lock);
192 page_chain_add(&drbd_pp_pool, page, tmp);
193 drbd_pp_vacant += i;
194 spin_unlock(&drbd_pp_lock);
195 }
196 return NULL;
b411b363
PR
197}
198
a990be46
AG
199static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev,
200 struct list_head *to_be_freed)
b411b363 201{
db830c46 202 struct drbd_peer_request *peer_req;
b411b363
PR
203 struct list_head *le, *tle;
204
205 /* The EEs are always appended to the end of the list. Since
206 they are sent in order over the wire, they have to finish
 207 in order. As soon as we see the first one that has not finished, we can
 208 stop examining the list...
209
210 list_for_each_safe(le, tle, &mdev->net_ee) {
db830c46 211 peer_req = list_entry(le, struct drbd_peer_request, w.list);
045417f7 212 if (drbd_peer_req_has_active_page(peer_req))
b411b363
PR
213 break;
214 list_move(le, to_be_freed);
215 }
216}
217
218static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
219{
220 LIST_HEAD(reclaimed);
db830c46 221 struct drbd_peer_request *peer_req, *t;
b411b363 222
87eeee41 223 spin_lock_irq(&mdev->tconn->req_lock);
a990be46 224 reclaim_finished_net_peer_reqs(mdev, &reclaimed);
87eeee41 225 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 226
db830c46 227 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
3967deb1 228 drbd_free_net_peer_req(mdev, peer_req);
b411b363
PR
229}
230
231/**
c37c8ecf 232 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
b411b363 233 * @mdev: DRBD device.
45bb912b
LE
234 * @number: number of pages requested
235 * @retry: whether to retry, if not enough pages are available right now
236 *
 237 * Tries to allocate @number pages, first from our own page pool, then from
238 * the kernel, unless this allocation would exceed the max_buffers setting.
239 * Possibly retry until DRBD frees sufficient pages somewhere else.
b411b363 240 *
45bb912b 241 * Returns a page chain linked via page->private.
b411b363 242 */
c37c8ecf
AG
243struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number,
244 bool retry)
b411b363
PR
245{
246 struct page *page = NULL;
44ed167d 247 struct net_conf *nc;
b411b363 248 DEFINE_WAIT(wait);
44ed167d 249 int mxb;
b411b363 250
45bb912b
LE
251 /* Yes, we may run up to @number over max_buffers. If we
252 * follow it strictly, the admin will get it wrong anyways. */
44ed167d
PR
253 rcu_read_lock();
254 nc = rcu_dereference(mdev->tconn->net_conf);
255 mxb = nc ? nc->max_buffers : 1000000;
256 rcu_read_unlock();
257
258 if (atomic_read(&mdev->pp_in_use) < mxb)
18c2d522 259 page = __drbd_alloc_pages(mdev, number);
b411b363 260
45bb912b 261 while (page == NULL) {
b411b363
PR
262 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
263
264 drbd_kick_lo_and_reclaim_net(mdev);
265
44ed167d 266 if (atomic_read(&mdev->pp_in_use) < mxb) {
18c2d522 267 page = __drbd_alloc_pages(mdev, number);
b411b363
PR
268 if (page)
269 break;
270 }
271
272 if (!retry)
273 break;
274
275 if (signal_pending(current)) {
c37c8ecf 276 dev_warn(DEV, "drbd_alloc_pages interrupted!\n");
b411b363
PR
277 break;
278 }
279
280 schedule();
281 }
282 finish_wait(&drbd_pp_wait, &wait);
283
45bb912b
LE
284 if (page)
285 atomic_add(number, &mdev->pp_in_use);
b411b363
PR
286 return page;
287}
288
c37c8ecf 289/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
87eeee41 290 * Is also used from inside another spin_lock_irq(&mdev->tconn->req_lock);
45bb912b
LE
291 * Either links the page chain back to the global pool,
292 * or returns all pages to the system. */
5cc287e0 293static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net)
b411b363 294{
435f0740 295 atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
b411b363 296 int i;
435f0740 297
81a3537a
LE
298 if (page == NULL)
299 return;
300
81a5d60e 301 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
45bb912b
LE
302 i = page_chain_free(page);
303 else {
304 struct page *tmp;
305 tmp = page_chain_tail(page, &i);
306 spin_lock(&drbd_pp_lock);
307 page_chain_add(&drbd_pp_pool, page, tmp);
308 drbd_pp_vacant += i;
309 spin_unlock(&drbd_pp_lock);
b411b363 310 }
435f0740 311 i = atomic_sub_return(i, a);
45bb912b 312 if (i < 0)
435f0740
LE
313 dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
314 is_net ? "pp_in_use_by_net" : "pp_in_use", i);
b411b363
PR
315 wake_up(&drbd_pp_wait);
316}
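/*
 * A minimal pairing sketch (illustrative only): drbd_alloc_pages() charges
 * the chain to mdev->pp_in_use, and drbd_free_pages(..., 0) releases that
 * accounting again, returning the pages to the pool or to the system.
 */
static int example_use_pp_pages(struct drbd_conf *mdev)
{
	struct page *chain;

	/* retry == true: sleep interruptibly until max_buffers allows it */
	chain = drbd_alloc_pages(mdev, 4, true);
	if (!chain)
		return -EINTR;	/* NULL only if a signal interrupted the wait */

	/* ... copy received data into the chain, attach it to a bio, ... */

	drbd_free_pages(mdev, chain, 0);
	return 0;
}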
317
318/*
319You need to hold the req_lock:
320 _drbd_wait_ee_list_empty()
321
322You must not have the req_lock:
3967deb1 323 drbd_free_peer_req()
0db55363 324 drbd_alloc_peer_req()
7721f567 325 drbd_free_peer_reqs()
b411b363 326 drbd_ee_fix_bhs()
a990be46 327 drbd_finish_peer_reqs()
b411b363
PR
328 drbd_clear_done_ee()
329 drbd_wait_ee_list_empty()
330*/
331
f6ffca9f 332struct drbd_peer_request *
0db55363
AG
333drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector,
334 unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
b411b363 335{
db830c46 336 struct drbd_peer_request *peer_req;
81a3537a 337 struct page *page = NULL;
45bb912b 338 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
b411b363 339
0cf9d27e 340 if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
b411b363
PR
341 return NULL;
342
db830c46
AG
343 peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
344 if (!peer_req) {
b411b363 345 if (!(gfp_mask & __GFP_NOWARN))
0db55363 346 dev_err(DEV, "%s: allocation failed\n", __func__);
b411b363
PR
347 return NULL;
348 }
349
81a3537a
LE
350 if (data_size) {
351 page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
352 if (!page)
353 goto fail;
354 }
b411b363 355
db830c46
AG
356 drbd_clear_interval(&peer_req->i);
357 peer_req->i.size = data_size;
358 peer_req->i.sector = sector;
359 peer_req->i.local = false;
360 peer_req->i.waiting = false;
361
362 peer_req->epoch = NULL;
a21e9298 363 peer_req->w.mdev = mdev;
db830c46
AG
364 peer_req->pages = page;
365 atomic_set(&peer_req->pending_bios, 0);
366 peer_req->flags = 0;
9a8e7753
AG
367 /*
368 * The block_id is opaque to the receiver. It is not endianness
369 * converted, and sent back to the sender unchanged.
370 */
db830c46 371 peer_req->block_id = id;
b411b363 372
db830c46 373 return peer_req;
b411b363 374
45bb912b 375 fail:
db830c46 376 mempool_free(peer_req, drbd_ee_mempool);
b411b363
PR
377 return NULL;
378}
379
3967deb1 380void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req,
f6ffca9f 381 int is_net)
b411b363 382{
db830c46
AG
383 if (peer_req->flags & EE_HAS_DIGEST)
384 kfree(peer_req->digest);
5cc287e0 385 drbd_free_pages(mdev, peer_req->pages, is_net);
db830c46
AG
386 D_ASSERT(atomic_read(&peer_req->pending_bios) == 0);
387 D_ASSERT(drbd_interval_empty(&peer_req->i));
388 mempool_free(peer_req, drbd_ee_mempool);
b411b363
PR
389}
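/*
 * Sketch of the usual peer-request life cycle in this file (illustrative
 * only; real callers also set peer_req->w.cb and put the request on one of
 * the ee lists under req_lock before submitting, see e.g. recv_resync_read()
 * further down).  drbd_free_peer_req() and drbd_free_net_peer_req() are
 * assumed to be thin wrappers around __drbd_free_peer_req() with is_net 0
 * and 1 respectively.
 */
static int example_peer_request(struct drbd_conf *mdev, sector_t sector, int size)
{
	struct drbd_peer_request *peer_req;

	peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER, sector, size, GFP_NOIO);
	if (!peer_req)
		return -ENOMEM;

	if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) != 0) {
		drbd_free_peer_req(mdev, peer_req);
		return -EIO;
	}
	return 0;
}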
390
7721f567 391int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list)
b411b363
PR
392{
393 LIST_HEAD(work_list);
db830c46 394 struct drbd_peer_request *peer_req, *t;
b411b363 395 int count = 0;
435f0740 396 int is_net = list == &mdev->net_ee;
b411b363 397
87eeee41 398 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 399 list_splice_init(list, &work_list);
87eeee41 400 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 401
db830c46 402 list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
3967deb1 403 __drbd_free_peer_req(mdev, peer_req, is_net);
b411b363
PR
404 count++;
405 }
406 return count;
407}
408
a990be46
AG
409/*
410 * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
b411b363 411 */
a990be46 412static int drbd_finish_peer_reqs(struct drbd_conf *mdev)
b411b363
PR
413{
414 LIST_HEAD(work_list);
415 LIST_HEAD(reclaimed);
db830c46 416 struct drbd_peer_request *peer_req, *t;
e2b3032b 417 int err = 0;
b411b363 418
87eeee41 419 spin_lock_irq(&mdev->tconn->req_lock);
a990be46 420 reclaim_finished_net_peer_reqs(mdev, &reclaimed);
b411b363 421 list_splice_init(&mdev->done_ee, &work_list);
87eeee41 422 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 423
db830c46 424 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
3967deb1 425 drbd_free_net_peer_req(mdev, peer_req);
b411b363
PR
426
427 /* possible callbacks here:
7be8da07 428 * e_end_block, and e_end_resync_block, e_send_discard_write.
b411b363
PR
429 * all ignore the last argument.
430 */
db830c46 431 list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
e2b3032b
AG
432 int err2;
433
b411b363 434 /* list_del not necessary, next/prev members not touched */
e2b3032b
AG
435 err2 = peer_req->w.cb(&peer_req->w, !!err);
436 if (!err)
437 err = err2;
3967deb1 438 drbd_free_peer_req(mdev, peer_req);
b411b363
PR
439 }
440 wake_up(&mdev->ee_wait);
441
e2b3032b 442 return err;
b411b363
PR
443}
444
d4da1537
AG
445static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
446 struct list_head *head)
b411b363
PR
447{
448 DEFINE_WAIT(wait);
449
450 /* avoids spin_lock/unlock
451 * and calling prepare_to_wait in the fast path */
452 while (!list_empty(head)) {
453 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
87eeee41 454 spin_unlock_irq(&mdev->tconn->req_lock);
7eaceacc 455 io_schedule();
b411b363 456 finish_wait(&mdev->ee_wait, &wait);
87eeee41 457 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
458 }
459}
460
d4da1537
AG
461static void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
462 struct list_head *head)
b411b363 463{
87eeee41 464 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 465 _drbd_wait_ee_list_empty(mdev, head);
87eeee41 466 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
467}
468
dbd9eea0 469static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
b411b363
PR
470{
471 mm_segment_t oldfs;
472 struct kvec iov = {
473 .iov_base = buf,
474 .iov_len = size,
475 };
476 struct msghdr msg = {
477 .msg_iovlen = 1,
478 .msg_iov = (struct iovec *)&iov,
479 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
480 };
481 int rv;
482
483 oldfs = get_fs();
484 set_fs(KERNEL_DS);
485 rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
486 set_fs(oldfs);
487
488 return rv;
489}
490
de0ff338 491static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size)
b411b363
PR
492{
493 mm_segment_t oldfs;
494 struct kvec iov = {
495 .iov_base = buf,
496 .iov_len = size,
497 };
498 struct msghdr msg = {
499 .msg_iovlen = 1,
500 .msg_iov = (struct iovec *)&iov,
501 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
502 };
503 int rv;
504
505 oldfs = get_fs();
506 set_fs(KERNEL_DS);
507
508 for (;;) {
de0ff338 509 rv = sock_recvmsg(tconn->data.socket, &msg, size, msg.msg_flags);
b411b363
PR
510 if (rv == size)
511 break;
512
513 /* Note:
514 * ECONNRESET other side closed the connection
515 * ERESTARTSYS (on sock) we got a signal
516 */
517
518 if (rv < 0) {
519 if (rv == -ECONNRESET)
de0ff338 520 conn_info(tconn, "sock was reset by peer\n");
b411b363 521 else if (rv != -ERESTARTSYS)
de0ff338 522 conn_err(tconn, "sock_recvmsg returned %d\n", rv);
b411b363
PR
523 break;
524 } else if (rv == 0) {
de0ff338 525 conn_info(tconn, "sock was shut down by peer\n");
b411b363
PR
526 break;
527 } else {
528 /* signal came in, or peer/link went down,
529 * after we read a partial message
530 */
531 /* D_ASSERT(signal_pending(current)); */
532 break;
533 }
534 };
535
536 set_fs(oldfs);
537
538 if (rv != size)
bbeb641c 539 conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
b411b363
PR
540
541 return rv;
542}
543
c6967746
AG
544static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size)
545{
546 int err;
547
548 err = drbd_recv(tconn, buf, size);
549 if (err != size) {
550 if (err >= 0)
551 err = -EIO;
552 } else
553 err = 0;
554 return err;
555}
556
a5c31904
AG
557static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size)
558{
559 int err;
560
561 err = drbd_recv_all(tconn, buf, size);
562 if (err && !signal_pending(current))
563 conn_warn(tconn, "short read (expected size %d)\n", (int)size);
564 return err;
565}
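/*
 * Typical caller pattern for the helpers above (sketch only): request an
 * exact number of bytes and treat anything else as a broken connection.
 * drbd_recv_header() below is the canonical user.
 */
static int example_recv_exact(struct drbd_tconn *tconn, void *buf, size_t size)
{
	int err = drbd_recv_all_warn(tconn, buf, size);

	/* err == 0: buffer completely filled; err < 0: socket error or short
	 * read, in which case drbd_recv() has already moved the connection
	 * to C_BROKEN_PIPE. */
	return err;
}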
566
5dbf1673
LE
567/* quoting tcp(7):
568 * On individual connections, the socket buffer size must be set prior to the
569 * listen(2) or connect(2) calls in order to have it take effect.
570 * This is our wrapper to do so.
571 */
572static void drbd_setbufsize(struct socket *sock, unsigned int snd,
573 unsigned int rcv)
574{
575 /* open coded SO_SNDBUF, SO_RCVBUF */
576 if (snd) {
577 sock->sk->sk_sndbuf = snd;
578 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
579 }
580 if (rcv) {
581 sock->sk->sk_rcvbuf = rcv;
582 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
583 }
584}
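#if 0	/* not kernel code -- userspace illustration only */
/*
 * Userspace analogue of drbd_setbufsize() (illustrative only), showing the
 * ordering the tcp(7) quote above demands: SO_SNDBUF/SO_RCVBUF must be set
 * before connect(2) (or listen(2)) to take effect.
 */
#include <sys/socket.h>

static int connect_with_bufsize(int fd, const struct sockaddr *addr,
				socklen_t len, int snd, int rcv)
{
	if (snd && setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)) < 0)
		return -1;
	if (rcv && setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)) < 0)
		return -1;
	return connect(fd, addr, len);
}
#endif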
585
eac3e990 586static struct socket *drbd_try_connect(struct drbd_tconn *tconn)
b411b363
PR
587{
588 const char *what;
589 struct socket *sock;
590 struct sockaddr_in6 src_in6;
44ed167d
PR
591 struct sockaddr_in6 peer_in6;
592 struct net_conf *nc;
593 int err, peer_addr_len, my_addr_len;
69ef82de 594 int sndbuf_size, rcvbuf_size, connect_int;
b411b363
PR
595 int disconnect_on_error = 1;
596
44ed167d
PR
597 rcu_read_lock();
598 nc = rcu_dereference(tconn->net_conf);
599 if (!nc) {
600 rcu_read_unlock();
b411b363 601 return NULL;
44ed167d 602 }
44ed167d
PR
603 sndbuf_size = nc->sndbuf_size;
604 rcvbuf_size = nc->rcvbuf_size;
69ef82de 605 connect_int = nc->connect_int;
089c075d 606 rcu_read_unlock();
44ed167d 607
089c075d
AG
608 my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6));
609 memcpy(&src_in6, &tconn->my_addr, my_addr_len);
44ed167d 610
089c075d 611 if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6)
44ed167d
PR
612 src_in6.sin6_port = 0;
613 else
614 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
615
089c075d
AG
616 peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6));
617 memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len);
b411b363
PR
618
619 what = "sock_create_kern";
44ed167d
PR
620 err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
621 SOCK_STREAM, IPPROTO_TCP, &sock);
b411b363
PR
622 if (err < 0) {
623 sock = NULL;
624 goto out;
625 }
626
627 sock->sk->sk_rcvtimeo =
69ef82de 628 sock->sk->sk_sndtimeo = connect_int * HZ;
44ed167d 629 drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
b411b363
PR
630
631 /* explicitly bind to the configured IP as source IP
632 * for the outgoing connections.
633 * This is needed for multihomed hosts and to be
634 * able to use lo: interfaces for drbd.
635 * Make sure to use 0 as port number, so linux selects
636 * a free one dynamically.
637 */
b411b363 638 what = "bind before connect";
44ed167d 639 err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
b411b363
PR
640 if (err < 0)
641 goto out;
642
643 /* connect may fail, peer not yet available.
644 * stay C_WF_CONNECTION, don't go Disconnecting! */
645 disconnect_on_error = 0;
646 what = "connect";
44ed167d 647 err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
b411b363
PR
648
649out:
650 if (err < 0) {
651 if (sock) {
652 sock_release(sock);
653 sock = NULL;
654 }
655 switch (-err) {
656 /* timeout, busy, signal pending */
657 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
658 case EINTR: case ERESTARTSYS:
659 /* peer not (yet) available, network problem */
660 case ECONNREFUSED: case ENETUNREACH:
661 case EHOSTDOWN: case EHOSTUNREACH:
662 disconnect_on_error = 0;
663 break;
664 default:
eac3e990 665 conn_err(tconn, "%s failed, err = %d\n", what, err);
b411b363
PR
666 }
667 if (disconnect_on_error)
bbeb641c 668 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 669 }
44ed167d 670
b411b363
PR
671 return sock;
672}
673
7a426fd8
PR
674struct accept_wait_data {
675 struct drbd_tconn *tconn;
676 struct socket *s_listen;
677 struct completion door_bell;
678 void (*original_sk_state_change)(struct sock *sk);
679
680};
681
 682static void incoming_connection(struct sock *sk)
683{
684 struct accept_wait_data *ad = sk->sk_user_data;
685 struct drbd_tconn *tconn = ad->tconn;
686
687 if (sk->sk_state != TCP_ESTABLISHED)
688 conn_warn(tconn, "unexpected tcp state change. sk_state = %d\n", sk->sk_state);
689
690 write_lock_bh(&sk->sk_callback_lock);
691 sk->sk_state_change = ad->original_sk_state_change;
692 sk->sk_user_data = NULL;
693 write_unlock_bh(&sk->sk_callback_lock);
694
695 sk->sk_state_change(sk);
696 complete(&ad->door_bell);
697}
698
699static int prepare_listen_socket(struct drbd_tconn *tconn, struct accept_wait_data *ad)
b411b363 700{
1f3e509b 701 int err, sndbuf_size, rcvbuf_size, my_addr_len;
44ed167d 702 struct sockaddr_in6 my_addr;
1f3e509b 703 struct socket *s_listen;
44ed167d 704 struct net_conf *nc;
b411b363
PR
705 const char *what;
706
44ed167d
PR
707 rcu_read_lock();
708 nc = rcu_dereference(tconn->net_conf);
709 if (!nc) {
710 rcu_read_unlock();
7a426fd8 711 return -EIO;
44ed167d 712 }
44ed167d
PR
713 sndbuf_size = nc->sndbuf_size;
714 rcvbuf_size = nc->rcvbuf_size;
44ed167d 715 rcu_read_unlock();
b411b363 716
089c075d
AG
717 my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6));
718 memcpy(&my_addr, &tconn->my_addr, my_addr_len);
719
b411b363 720 what = "sock_create_kern";
44ed167d 721 err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
1f3e509b 722 SOCK_STREAM, IPPROTO_TCP, &s_listen);
b411b363
PR
723 if (err) {
724 s_listen = NULL;
725 goto out;
726 }
727
1f3e509b 728 s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
44ed167d 729 drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
b411b363
PR
730
731 what = "bind before listen";
44ed167d 732 err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
b411b363
PR
733 if (err < 0)
734 goto out;
735
7a426fd8
PR
736 ad->s_listen = s_listen;
737 write_lock_bh(&s_listen->sk->sk_callback_lock);
738 ad->original_sk_state_change = s_listen->sk->sk_state_change;
 739 s_listen->sk->sk_state_change = incoming_connection;
740 s_listen->sk->sk_user_data = ad;
741 write_unlock_bh(&s_listen->sk->sk_callback_lock);
742
2820fd39
PR
743 what = "listen";
744 err = s_listen->ops->listen(s_listen, 5);
745 if (err < 0)
746 goto out;
747
7a426fd8 748 return 0;
1f3e509b
PR
749out:
750 if (s_listen)
751 sock_release(s_listen);
752 if (err < 0) {
753 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
754 conn_err(tconn, "%s failed, err = %d\n", what, err);
755 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
756 }
757 }
758
7a426fd8 759 return -EIO;
1f3e509b
PR
760}
761
7a426fd8 762static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct accept_wait_data *ad)
1f3e509b
PR
763{
764 int timeo, connect_int, err = 0;
765 struct socket *s_estab = NULL;
1f3e509b
PR
766 struct net_conf *nc;
767
768 rcu_read_lock();
769 nc = rcu_dereference(tconn->net_conf);
770 if (!nc) {
771 rcu_read_unlock();
772 return NULL;
773 }
774 connect_int = nc->connect_int;
775 rcu_read_unlock();
776
777 timeo = connect_int * HZ;
778 timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
779
7a426fd8
PR
780 err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
781 if (err <= 0)
782 return NULL;
b411b363 783
7a426fd8 784 err = kernel_accept(ad->s_listen, &s_estab, 0);
b411b363
PR
785 if (err < 0) {
786 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
1f3e509b 787 conn_err(tconn, "accept failed, err = %d\n", err);
bbeb641c 788 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
789 }
790 }
b411b363
PR
791
792 return s_estab;
793}
794
e658983a 795static int decode_header(struct drbd_tconn *, void *, struct packet_info *);
b411b363 796
9f5bdc33
AG
797static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock,
798 enum drbd_packet cmd)
799{
800 if (!conn_prepare_command(tconn, sock))
801 return -EIO;
e658983a 802 return conn_send_command(tconn, sock, cmd, 0, NULL, 0);
b411b363
PR
803}
804
9f5bdc33 805static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock)
b411b363 806{
9f5bdc33
AG
807 unsigned int header_size = drbd_header_size(tconn);
808 struct packet_info pi;
809 int err;
b411b363 810
9f5bdc33
AG
811 err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0);
812 if (err != header_size) {
813 if (err >= 0)
814 err = -EIO;
815 return err;
816 }
817 err = decode_header(tconn, tconn->data.rbuf, &pi);
818 if (err)
819 return err;
820 return pi.cmd;
b411b363
PR
821}
822
823/**
824 * drbd_socket_okay() - Free the socket if its connection is not okay
b411b363
PR
825 * @sock: pointer to the pointer to the socket.
826 */
dbd9eea0 827static int drbd_socket_okay(struct socket **sock)
b411b363
PR
828{
829 int rr;
830 char tb[4];
831
832 if (!*sock)
81e84650 833 return false;
b411b363 834
dbd9eea0 835 rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
b411b363
PR
836
837 if (rr > 0 || rr == -EAGAIN) {
81e84650 838 return true;
b411b363
PR
839 } else {
840 sock_release(*sock);
841 *sock = NULL;
81e84650 842 return false;
b411b363
PR
843 }
844}
2325eb66
PR
845/* Gets called if a connection is established, or if a new minor gets created
846 in a connection */
c141ebda 847int drbd_connected(struct drbd_conf *mdev)
907599e0 848{
0829f5ed 849 int err;
907599e0
PR
850
851 atomic_set(&mdev->packet_seq, 0);
852 mdev->peer_seq = 0;
853
8410da8f
PR
854 mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ?
855 &mdev->tconn->cstate_mutex :
856 &mdev->own_state_mutex;
857
0829f5ed
AG
858 err = drbd_send_sync_param(mdev);
859 if (!err)
860 err = drbd_send_sizes(mdev, 0, 0);
861 if (!err)
862 err = drbd_send_uuids(mdev);
863 if (!err)
43de7c85 864 err = drbd_send_current_state(mdev);
907599e0
PR
865 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
866 clear_bit(RESIZE_PENDING, &mdev->flags);
8b924f1d 867 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
0829f5ed 868 return err;
907599e0
PR
869}
870
b411b363
PR
871/*
872 * return values:
873 * 1 yes, we have a valid connection
874 * 0 oops, did not work out, please try again
875 * -1 peer talks different language,
876 * no point in trying again, please go standalone.
877 * -2 We do not have a network config...
878 */
81fa2e67 879static int conn_connect(struct drbd_tconn *tconn)
b411b363 880{
7da35862 881 struct drbd_socket sock, msock;
c141ebda 882 struct drbd_conf *mdev;
44ed167d 883 struct net_conf *nc;
c141ebda 884 int vnr, timeout, try, h, ok;
08b165ba 885 bool discard_my_data;
a1096a6e 886 enum drbd_state_rv rv;
7a426fd8
PR
887 struct accept_wait_data ad = {
888 .tconn = tconn,
889 .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
890 };
b411b363 891
bbeb641c 892 if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
b411b363
PR
893 return -2;
894
7da35862
PR
895 mutex_init(&sock.mutex);
896 sock.sbuf = tconn->data.sbuf;
897 sock.rbuf = tconn->data.rbuf;
898 sock.socket = NULL;
899 mutex_init(&msock.mutex);
900 msock.sbuf = tconn->meta.sbuf;
901 msock.rbuf = tconn->meta.rbuf;
902 msock.socket = NULL;
903
907599e0 904 clear_bit(DISCARD_CONCURRENT, &tconn->flags);
0916e0e3
AG
905
906 /* Assume that the peer only understands protocol 80 until we know better. */
907 tconn->agreed_pro_version = 80;
b411b363 908
7a426fd8
PR
909 if (prepare_listen_socket(tconn, &ad))
910 return 0;
911
b411b363 912 do {
2bf89621
AG
913 struct socket *s;
914
b411b363
PR
915 for (try = 0;;) {
916 /* 3 tries, this should take less than a second! */
907599e0 917 s = drbd_try_connect(tconn);
b411b363
PR
918 if (s || ++try >= 3)
919 break;
920 /* give the other side time to call bind() & listen() */
20ee6390 921 schedule_timeout_interruptible(HZ / 10);
b411b363
PR
922 }
923
924 if (s) {
7da35862
PR
925 if (!sock.socket) {
926 sock.socket = s;
927 send_first_packet(tconn, &sock, P_INITIAL_DATA);
928 } else if (!msock.socket) {
929 msock.socket = s;
930 send_first_packet(tconn, &msock, P_INITIAL_META);
b411b363 931 } else {
81fa2e67 932 conn_err(tconn, "Logic error in conn_connect()\n");
b411b363
PR
933 goto out_release_sockets;
934 }
935 }
936
7da35862
PR
937 if (sock.socket && msock.socket) {
938 rcu_read_lock();
939 nc = rcu_dereference(tconn->net_conf);
940 timeout = nc->ping_timeo * HZ / 10;
941 rcu_read_unlock();
942 schedule_timeout_interruptible(timeout);
943 ok = drbd_socket_okay(&sock.socket);
944 ok = drbd_socket_okay(&msock.socket) && ok;
b411b363
PR
945 if (ok)
946 break;
947 }
948
949retry:
7a426fd8 950 s = drbd_wait_for_connect(tconn, &ad);
b411b363 951 if (s) {
9f5bdc33 952 try = receive_first_packet(tconn, s);
7da35862
PR
953 drbd_socket_okay(&sock.socket);
954 drbd_socket_okay(&msock.socket);
b411b363 955 switch (try) {
e5d6f33a 956 case P_INITIAL_DATA:
7da35862 957 if (sock.socket) {
907599e0 958 conn_warn(tconn, "initial packet S crossed\n");
7da35862 959 sock_release(sock.socket);
b411b363 960 }
7da35862 961 sock.socket = s;
b411b363 962 break;
e5d6f33a 963 case P_INITIAL_META:
7da35862 964 if (msock.socket) {
907599e0 965 conn_warn(tconn, "initial packet M crossed\n");
7da35862 966 sock_release(msock.socket);
b411b363 967 }
7da35862 968 msock.socket = s;
907599e0 969 set_bit(DISCARD_CONCURRENT, &tconn->flags);
b411b363
PR
970 break;
971 default:
907599e0 972 conn_warn(tconn, "Error receiving initial packet\n");
b411b363
PR
973 sock_release(s);
974 if (random32() & 1)
975 goto retry;
976 }
977 }
978
bbeb641c 979 if (tconn->cstate <= C_DISCONNECTING)
b411b363
PR
980 goto out_release_sockets;
981 if (signal_pending(current)) {
982 flush_signals(current);
983 smp_rmb();
907599e0 984 if (get_t_state(&tconn->receiver) == EXITING)
b411b363
PR
985 goto out_release_sockets;
986 }
987
b666dbf8
PR
988 ok = drbd_socket_okay(&sock.socket);
989 ok = drbd_socket_okay(&msock.socket) && ok;
990 } while (!ok);
b411b363 991
7a426fd8
PR
992 if (ad.s_listen)
993 sock_release(ad.s_listen);
994
7da35862
PR
995 sock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
996 msock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
b411b363 997
7da35862
PR
998 sock.socket->sk->sk_allocation = GFP_NOIO;
999 msock.socket->sk->sk_allocation = GFP_NOIO;
b411b363 1000
7da35862
PR
1001 sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
1002 msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
b411b363 1003
b411b363 1004 /* NOT YET ...
7da35862
PR
1005 * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
1006 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
6038178e 1007 * first set it to the P_CONNECTION_FEATURES timeout,
b411b363 1008 * which we set to 4x the configured ping_timeout. */
44ed167d
PR
1009 rcu_read_lock();
1010 nc = rcu_dereference(tconn->net_conf);
1011
7da35862
PR
1012 sock.socket->sk->sk_sndtimeo =
1013 sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
44ed167d 1014
7da35862 1015 msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
44ed167d 1016 timeout = nc->timeout * HZ / 10;
08b165ba 1017 discard_my_data = nc->discard_my_data;
44ed167d 1018 rcu_read_unlock();
b411b363 1019
7da35862 1020 msock.socket->sk->sk_sndtimeo = timeout;
b411b363
PR
1021
1022 /* we don't want delays.
25985edc 1023 * we use TCP_CORK where appropriate, though */
7da35862
PR
1024 drbd_tcp_nodelay(sock.socket);
1025 drbd_tcp_nodelay(msock.socket);
b411b363 1026
7da35862
PR
1027 tconn->data.socket = sock.socket;
1028 tconn->meta.socket = msock.socket;
907599e0 1029 tconn->last_received = jiffies;
b411b363 1030
6038178e 1031 h = drbd_do_features(tconn);
b411b363
PR
1032 if (h <= 0)
1033 return h;
1034
907599e0 1035 if (tconn->cram_hmac_tfm) {
b411b363 1036 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
907599e0 1037 switch (drbd_do_auth(tconn)) {
b10d96cb 1038 case -1:
907599e0 1039 conn_err(tconn, "Authentication of peer failed\n");
b411b363 1040 return -1;
b10d96cb 1041 case 0:
907599e0 1042 conn_err(tconn, "Authentication of peer failed, trying again.\n");
b10d96cb 1043 return 0;
b411b363
PR
1044 }
1045 }
1046
7da35862
PR
1047 tconn->data.socket->sk->sk_sndtimeo = timeout;
1048 tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
b411b363 1049
387eb308 1050 if (drbd_send_protocol(tconn) == -EOPNOTSUPP)
7e2455c1 1051 return -1;
b411b363 1052
a1096a6e
PR
1053 set_bit(STATE_SENT, &tconn->flags);
1054
c141ebda
PR
1055 rcu_read_lock();
1056 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1057 kref_get(&mdev->kref);
1058 rcu_read_unlock();
08b165ba
PR
1059
1060 if (discard_my_data)
1061 set_bit(DISCARD_MY_DATA, &mdev->flags);
1062 else
1063 clear_bit(DISCARD_MY_DATA, &mdev->flags);
1064
c141ebda
PR
1065 drbd_connected(mdev);
1066 kref_put(&mdev->kref, &drbd_minor_destroy);
1067 rcu_read_lock();
1068 }
1069 rcu_read_unlock();
1070
a1096a6e
PR
1071 rv = conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
1072 if (rv < SS_SUCCESS) {
1073 clear_bit(STATE_SENT, &tconn->flags);
823bd832 1074 return 0;
a1096a6e 1075 }
823bd832
PR
1076
1077 drbd_thread_start(&tconn->asender);
1078
08b165ba
PR
1079 mutex_lock(&tconn->conf_update);
1080 /* The discard_my_data flag is a single-shot modifier to the next
1081 * connection attempt, the handshake of which is now well underway.
1082 * No need for rcu style copying of the whole struct
1083 * just to clear a single value. */
1084 tconn->net_conf->discard_my_data = 0;
1085 mutex_unlock(&tconn->conf_update);
1086
d3fcb490 1087 return h;
b411b363
PR
1088
1089out_release_sockets:
7a426fd8
PR
1090 if (ad.s_listen)
1091 sock_release(ad.s_listen);
7da35862
PR
1092 if (sock.socket)
1093 sock_release(sock.socket);
1094 if (msock.socket)
1095 sock_release(msock.socket);
b411b363
PR
1096 return -1;
1097}
1098
e658983a 1099static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi)
b411b363 1100{
e658983a
AG
1101 unsigned int header_size = drbd_header_size(tconn);
1102
0c8e36d9
AG
1103 if (header_size == sizeof(struct p_header100) &&
1104 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
1105 struct p_header100 *h = header;
1106 if (h->pad != 0) {
1107 conn_err(tconn, "Header padding is not zero\n");
1108 return -EINVAL;
1109 }
1110 pi->vnr = be16_to_cpu(h->volume);
1111 pi->cmd = be16_to_cpu(h->command);
1112 pi->size = be32_to_cpu(h->length);
1113 } else if (header_size == sizeof(struct p_header95) &&
1114 *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
e658983a 1115 struct p_header95 *h = header;
e658983a 1116 pi->cmd = be16_to_cpu(h->command);
b55d84ba
AG
1117 pi->size = be32_to_cpu(h->length);
1118 pi->vnr = 0;
e658983a
AG
1119 } else if (header_size == sizeof(struct p_header80) &&
1120 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
1121 struct p_header80 *h = header;
1122 pi->cmd = be16_to_cpu(h->command);
1123 pi->size = be16_to_cpu(h->length);
77351055 1124 pi->vnr = 0;
02918be2 1125 } else {
e658983a
AG
1126 conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n",
1127 be32_to_cpu(*(__be32 *)header),
1128 tconn->agreed_pro_version);
8172f3e9 1129 return -EINVAL;
b411b363 1130 }
e658983a 1131 pi->data = header + header_size;
8172f3e9 1132 return 0;
257d0af6
PR
1133}
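/*
 * Field layout of the three wire headers, as deduced from decode_header()
 * above (sketch only -- the authoritative struct definitions are selected
 * by drbd_header_size() and live in the drbd headers):
 *
 *   p_header80:  be32 magic (DRBD_MAGIC),     be16 command, be16 length
 *   p_header95:  be16 magic (DRBD_MAGIC_BIG), be16 command, be32 length
 *   p_header100: be32 magic (DRBD_MAGIC_100), be16 volume, be16 command,
 *                be32 length, be32 pad (must be zero)
 *
 * In every case pi->data ends up pointing just past the header, i.e. at the
 * first payload byte.
 */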
1134
9ba7aa00 1135static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi)
257d0af6 1136{
e658983a 1137 void *buffer = tconn->data.rbuf;
69bc7bc3 1138 int err;
257d0af6 1139
e658983a 1140 err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn));
a5c31904 1141 if (err)
69bc7bc3 1142 return err;
257d0af6 1143
e658983a 1144 err = decode_header(tconn, buffer, pi);
9ba7aa00 1145 tconn->last_received = jiffies;
b411b363 1146
69bc7bc3 1147 return err;
b411b363
PR
1148}
1149
4b0007c0 1150static void drbd_flush(struct drbd_tconn *tconn)
b411b363
PR
1151{
1152 int rv;
4b0007c0
PR
1153 struct drbd_conf *mdev;
1154 int vnr;
1155
1156 if (tconn->write_ordering >= WO_bdev_flush) {
615e087f 1157 rcu_read_lock();
4b0007c0 1158 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
615e087f
LE
1159 if (!get_ldev(mdev))
1160 continue;
1161 kref_get(&mdev->kref);
1162 rcu_read_unlock();
1163
1164 rv = blkdev_issue_flush(mdev->ldev->backing_bdev,
1165 GFP_NOIO, NULL);
1166 if (rv) {
1167 dev_info(DEV, "local disk flush failed with status %d\n", rv);
1168 /* would rather check on EOPNOTSUPP, but that is not reliable.
1169 * don't try again for ANY return value != 0
1170 * if (rv == -EOPNOTSUPP) */
1171 drbd_bump_write_ordering(tconn, WO_drain_io);
4b0007c0 1172 }
615e087f
LE
1173 put_ldev(mdev);
1174 kref_put(&mdev->kref, &drbd_minor_destroy);
1175
1176 rcu_read_lock();
1177 if (rv)
1178 break;
b411b363 1179 }
615e087f 1180 rcu_read_unlock();
b411b363 1181 }
b411b363
PR
1182}
1183
1184/**
1185 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1186 * @mdev: DRBD device.
1187 * @epoch: Epoch object.
1188 * @ev: Epoch event.
1189 */
1e9dd291 1190static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn,
b411b363
PR
1191 struct drbd_epoch *epoch,
1192 enum epoch_event ev)
1193{
2451fc3b 1194 int epoch_size;
b411b363 1195 struct drbd_epoch *next_epoch;
b411b363
PR
1196 enum finish_epoch rv = FE_STILL_LIVE;
1197
12038a3a 1198 spin_lock(&tconn->epoch_lock);
b411b363
PR
1199 do {
1200 next_epoch = NULL;
b411b363
PR
1201
1202 epoch_size = atomic_read(&epoch->epoch_size);
1203
1204 switch (ev & ~EV_CLEANUP) {
1205 case EV_PUT:
1206 atomic_dec(&epoch->active);
1207 break;
1208 case EV_GOT_BARRIER_NR:
1209 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
b411b363
PR
1210 break;
1211 case EV_BECAME_LAST:
1212 /* nothing to do*/
1213 break;
1214 }
1215
b411b363
PR
1216 if (epoch_size != 0 &&
1217 atomic_read(&epoch->active) == 0 &&
85d73513 1218 (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
b411b363 1219 if (!(ev & EV_CLEANUP)) {
12038a3a 1220 spin_unlock(&tconn->epoch_lock);
9ed57dcb 1221 drbd_send_b_ack(epoch->tconn, epoch->barrier_nr, epoch_size);
12038a3a 1222 spin_lock(&tconn->epoch_lock);
b411b363 1223 }
9ed57dcb
LE
1224#if 0
1225 /* FIXME: dec unacked on connection, once we have
1226 * something to count pending connection packets in. */
85d73513 1227 if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
9ed57dcb
LE
1228 dec_unacked(epoch->tconn);
1229#endif
b411b363 1230
12038a3a 1231 if (tconn->current_epoch != epoch) {
b411b363
PR
1232 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1233 list_del(&epoch->list);
1234 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
12038a3a 1235 tconn->epochs--;
b411b363
PR
1236 kfree(epoch);
1237
1238 if (rv == FE_STILL_LIVE)
1239 rv = FE_DESTROYED;
1240 } else {
1241 epoch->flags = 0;
1242 atomic_set(&epoch->epoch_size, 0);
698f9315 1243 /* atomic_set(&epoch->active, 0); is already zero */
b411b363
PR
1244 if (rv == FE_STILL_LIVE)
1245 rv = FE_RECYCLED;
1246 }
1247 }
1248
1249 if (!next_epoch)
1250 break;
1251
1252 epoch = next_epoch;
1253 } while (1);
1254
12038a3a 1255 spin_unlock(&tconn->epoch_lock);
b411b363 1256
b411b363
PR
1257 return rv;
1258}
1259
1260/**
 1261 * drbd_bump_write_ordering() - Fall back to another write ordering method
4b0007c0 1262 * @tconn: DRBD connection.
b411b363
PR
1263 * @wo: Write ordering method to try.
1264 */
4b0007c0 1265void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo)
b411b363 1266{
daeda1cc 1267 struct disk_conf *dc;
4b0007c0 1268 struct drbd_conf *mdev;
b411b363 1269 enum write_ordering_e pwo;
4b0007c0 1270 int vnr;
b411b363
PR
1271 static char *write_ordering_str[] = {
1272 [WO_none] = "none",
1273 [WO_drain_io] = "drain",
1274 [WO_bdev_flush] = "flush",
b411b363
PR
1275 };
1276
4b0007c0 1277 pwo = tconn->write_ordering;
b411b363 1278 wo = min(pwo, wo);
daeda1cc 1279 rcu_read_lock();
4b0007c0 1280 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
27eb13e9 1281 if (!get_ldev_if_state(mdev, D_ATTACHING))
4b0007c0
PR
1282 continue;
1283 dc = rcu_dereference(mdev->ldev->disk_conf);
1284
1285 if (wo == WO_bdev_flush && !dc->disk_flushes)
1286 wo = WO_drain_io;
1287 if (wo == WO_drain_io && !dc->disk_drain)
1288 wo = WO_none;
1289 put_ldev(mdev);
1290 }
daeda1cc 1291 rcu_read_unlock();
4b0007c0
PR
1292 tconn->write_ordering = wo;
1293 if (pwo != tconn->write_ordering || wo == WO_bdev_flush)
1294 conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]);
b411b363
PR
1295}
1296
45bb912b 1297/**
fbe29dec 1298 * drbd_submit_peer_request()
45bb912b 1299 * @mdev: DRBD device.
db830c46 1300 * @peer_req: peer request
45bb912b 1301 * @rw: flag field, see bio->bi_rw
10f6d992
LE
1302 *
1303 * May spread the pages to multiple bios,
1304 * depending on bio_add_page restrictions.
1305 *
1306 * Returns 0 if all bios have been submitted,
1307 * -ENOMEM if we could not allocate enough bios,
1308 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1309 * single page to an empty bio (which should never happen and likely indicates
1310 * that the lower level IO stack is in some way broken). This has been observed
1311 * on certain Xen deployments.
45bb912b
LE
1312 */
1313/* TODO allocate from our own bio_set. */
fbe29dec
AG
1314int drbd_submit_peer_request(struct drbd_conf *mdev,
1315 struct drbd_peer_request *peer_req,
1316 const unsigned rw, const int fault_type)
45bb912b
LE
1317{
1318 struct bio *bios = NULL;
1319 struct bio *bio;
db830c46
AG
1320 struct page *page = peer_req->pages;
1321 sector_t sector = peer_req->i.sector;
1322 unsigned ds = peer_req->i.size;
45bb912b
LE
1323 unsigned n_bios = 0;
1324 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
10f6d992 1325 int err = -ENOMEM;
45bb912b
LE
1326
1327 /* In most cases, we will only need one bio. But in case the lower
1328 * level restrictions happen to be different at this offset on this
1329 * side than those of the sending peer, we may need to submit the
da4a75d2
LE
1330 * request in more than one bio.
1331 *
1332 * Plain bio_alloc is good enough here, this is no DRBD internally
1333 * generated bio, but a bio allocated on behalf of the peer.
1334 */
45bb912b
LE
1335next_bio:
1336 bio = bio_alloc(GFP_NOIO, nr_pages);
1337 if (!bio) {
1338 dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
1339 goto fail;
1340 }
db830c46 1341 /* > peer_req->i.sector, unless this is the first bio */
45bb912b
LE
1342 bio->bi_sector = sector;
1343 bio->bi_bdev = mdev->ldev->backing_bdev;
45bb912b 1344 bio->bi_rw = rw;
db830c46 1345 bio->bi_private = peer_req;
fcefa62e 1346 bio->bi_end_io = drbd_peer_request_endio;
45bb912b
LE
1347
1348 bio->bi_next = bios;
1349 bios = bio;
1350 ++n_bios;
1351
1352 page_chain_for_each(page) {
1353 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1354 if (!bio_add_page(bio, page, len, 0)) {
10f6d992
LE
1355 /* A single page must always be possible!
1356 * But in case it fails anyways,
1357 * we deal with it, and complain (below). */
1358 if (bio->bi_vcnt == 0) {
1359 dev_err(DEV,
1360 "bio_add_page failed for len=%u, "
1361 "bi_vcnt=0 (bi_sector=%llu)\n",
1362 len, (unsigned long long)bio->bi_sector);
1363 err = -ENOSPC;
1364 goto fail;
1365 }
45bb912b
LE
1366 goto next_bio;
1367 }
1368 ds -= len;
1369 sector += len >> 9;
1370 --nr_pages;
1371 }
1372 D_ASSERT(page == NULL);
1373 D_ASSERT(ds == 0);
1374
db830c46 1375 atomic_set(&peer_req->pending_bios, n_bios);
45bb912b
LE
1376 do {
1377 bio = bios;
1378 bios = bios->bi_next;
1379 bio->bi_next = NULL;
1380
45bb912b 1381 drbd_generic_make_request(mdev, fault_type, bio);
45bb912b 1382 } while (bios);
45bb912b
LE
1383 return 0;
1384
1385fail:
1386 while (bios) {
1387 bio = bios;
1388 bios = bios->bi_next;
1389 bio_put(bio);
1390 }
10f6d992 1391 return err;
45bb912b
LE
1392}
1393
53840641 1394static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev,
db830c46 1395 struct drbd_peer_request *peer_req)
53840641 1396{
db830c46 1397 struct drbd_interval *i = &peer_req->i;
53840641
AG
1398
1399 drbd_remove_interval(&mdev->write_requests, i);
1400 drbd_clear_interval(i);
1401
6c852bec 1402 /* Wake up any processes waiting for this peer request to complete. */
53840641
AG
1403 if (i->waiting)
1404 wake_up(&mdev->misc_wait);
1405}
1406
77fede51
PR
1407void conn_wait_active_ee_empty(struct drbd_tconn *tconn)
1408{
1409 struct drbd_conf *mdev;
1410 int vnr;
1411
1412 rcu_read_lock();
1413 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1414 kref_get(&mdev->kref);
1415 rcu_read_unlock();
1416 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1417 kref_put(&mdev->kref, &drbd_minor_destroy);
1418 rcu_read_lock();
1419 }
1420 rcu_read_unlock();
1421}
1422
4a76b161 1423static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 1424{
2451fc3b 1425 int rv;
e658983a 1426 struct p_barrier *p = pi->data;
b411b363
PR
1427 struct drbd_epoch *epoch;
1428
9ed57dcb
LE
1429 /* FIXME these are unacked on connection,
1430 * not a specific (peer)device.
1431 */
12038a3a 1432 tconn->current_epoch->barrier_nr = p->barrier;
9ed57dcb 1433 tconn->current_epoch->tconn = tconn;
1e9dd291 1434 rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR);
b411b363
PR
1435
1436 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1437 * the activity log, which means it would not be resynced in case the
1438 * R_PRIMARY crashes now.
1439 * Therefore we must send the barrier_ack after the barrier request was
1440 * completed. */
4b0007c0 1441 switch (tconn->write_ordering) {
b411b363
PR
1442 case WO_none:
1443 if (rv == FE_RECYCLED)
82bc0194 1444 return 0;
2451fc3b
PR
1445
1446 /* receiver context, in the writeout path of the other node.
1447 * avoid potential distributed deadlock */
1448 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1449 if (epoch)
1450 break;
1451 else
9ed57dcb 1452 conn_warn(tconn, "Allocation of an epoch failed, slowing down\n");
2451fc3b 1453 /* Fall through */
b411b363
PR
1454
1455 case WO_bdev_flush:
1456 case WO_drain_io:
77fede51 1457 conn_wait_active_ee_empty(tconn);
4b0007c0 1458 drbd_flush(tconn);
2451fc3b 1459
12038a3a 1460 if (atomic_read(&tconn->current_epoch->epoch_size)) {
2451fc3b
PR
1461 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1462 if (epoch)
1463 break;
b411b363
PR
1464 }
1465
82bc0194 1466 return 0;
2451fc3b 1467 default:
9ed57dcb 1468 conn_err(tconn, "Strangeness in tconn->write_ordering %d\n", tconn->write_ordering);
82bc0194 1469 return -EIO;
b411b363
PR
1470 }
1471
1472 epoch->flags = 0;
1473 atomic_set(&epoch->epoch_size, 0);
1474 atomic_set(&epoch->active, 0);
1475
12038a3a
PR
1476 spin_lock(&tconn->epoch_lock);
1477 if (atomic_read(&tconn->current_epoch->epoch_size)) {
1478 list_add(&epoch->list, &tconn->current_epoch->list);
1479 tconn->current_epoch = epoch;
1480 tconn->epochs++;
b411b363
PR
1481 } else {
1482 /* The current_epoch got recycled while we allocated this one... */
1483 kfree(epoch);
1484 }
12038a3a 1485 spin_unlock(&tconn->epoch_lock);
b411b363 1486
82bc0194 1487 return 0;
b411b363
PR
1488}
1489
1490/* used from receive_RSDataReply (recv_resync_read)
1491 * and from receive_Data */
f6ffca9f
AG
1492static struct drbd_peer_request *
1493read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector,
1494 int data_size) __must_hold(local)
b411b363 1495{
6666032a 1496 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
db830c46 1497 struct drbd_peer_request *peer_req;
b411b363 1498 struct page *page;
a5c31904 1499 int dgs, ds, err;
a0638456
PR
1500 void *dig_in = mdev->tconn->int_dig_in;
1501 void *dig_vv = mdev->tconn->int_dig_vv;
6b4388ac 1502 unsigned long *data;
b411b363 1503
88104ca4
AG
1504 dgs = 0;
1505 if (mdev->tconn->peer_integrity_tfm) {
1506 dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
9f5bdc33
AG
1507 /*
1508 * FIXME: Receive the incoming digest into the receive buffer
1509 * here, together with its struct p_data?
1510 */
a5c31904
AG
1511 err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
1512 if (err)
b411b363 1513 return NULL;
88104ca4 1514 data_size -= dgs;
b411b363
PR
1515 }
1516
841ce241
AG
1517 if (!expect(IS_ALIGNED(data_size, 512)))
1518 return NULL;
1519 if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
1520 return NULL;
b411b363 1521
6666032a
LE
 1522 /* even though we trust our peer,
1523 * we sometimes have to double check. */
1524 if (sector + (data_size>>9) > capacity) {
fdda6544
LE
1525 dev_err(DEV, "request from peer beyond end of local disk: "
1526 "capacity: %llus < sector: %llus + size: %u\n",
6666032a
LE
1527 (unsigned long long)capacity,
1528 (unsigned long long)sector, data_size);
1529 return NULL;
1530 }
1531
b411b363
PR
1532 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1533 * "criss-cross" setup, that might cause write-out on some other DRBD,
1534 * which in turn might block on the other node at this very place. */
0db55363 1535 peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO);
db830c46 1536 if (!peer_req)
b411b363 1537 return NULL;
45bb912b 1538
81a3537a
LE
1539 if (!data_size)
1540 return peer_req;
1541
b411b363 1542 ds = data_size;
db830c46 1543 page = peer_req->pages;
45bb912b
LE
1544 page_chain_for_each(page) {
1545 unsigned len = min_t(int, ds, PAGE_SIZE);
6b4388ac 1546 data = kmap(page);
a5c31904 1547 err = drbd_recv_all_warn(mdev->tconn, data, len);
0cf9d27e 1548 if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
6b4388ac
PR
1549 dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1550 data[0] = data[0] ^ (unsigned long)-1;
1551 }
b411b363 1552 kunmap(page);
a5c31904 1553 if (err) {
3967deb1 1554 drbd_free_peer_req(mdev, peer_req);
b411b363
PR
1555 return NULL;
1556 }
a5c31904 1557 ds -= len;
b411b363
PR
1558 }
1559
1560 if (dgs) {
5b614abe 1561 drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv);
b411b363 1562 if (memcmp(dig_in, dig_vv, dgs)) {
470be44a
LE
1563 dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
1564 (unsigned long long)sector, data_size);
3967deb1 1565 drbd_free_peer_req(mdev, peer_req);
b411b363
PR
1566 return NULL;
1567 }
1568 }
1569 mdev->recv_cnt += data_size>>9;
db830c46 1570 return peer_req;
b411b363
PR
1571}
1572
1573/* drbd_drain_block() just takes a data block
1574 * out of the socket input buffer, and discards it.
1575 */
1576static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1577{
1578 struct page *page;
a5c31904 1579 int err = 0;
b411b363
PR
1580 void *data;
1581
c3470cde 1582 if (!data_size)
fc5be839 1583 return 0;
c3470cde 1584
c37c8ecf 1585 page = drbd_alloc_pages(mdev, 1, 1);
b411b363
PR
1586
1587 data = kmap(page);
1588 while (data_size) {
fc5be839
AG
1589 unsigned int len = min_t(int, data_size, PAGE_SIZE);
1590
a5c31904
AG
1591 err = drbd_recv_all_warn(mdev->tconn, data, len);
1592 if (err)
b411b363 1593 break;
a5c31904 1594 data_size -= len;
b411b363
PR
1595 }
1596 kunmap(page);
5cc287e0 1597 drbd_free_pages(mdev, page, 0);
fc5be839 1598 return err;
b411b363
PR
1599}
1600
1601static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1602 sector_t sector, int data_size)
1603{
1604 struct bio_vec *bvec;
1605 struct bio *bio;
a5c31904 1606 int dgs, err, i, expect;
a0638456
PR
1607 void *dig_in = mdev->tconn->int_dig_in;
1608 void *dig_vv = mdev->tconn->int_dig_vv;
b411b363 1609
88104ca4
AG
1610 dgs = 0;
1611 if (mdev->tconn->peer_integrity_tfm) {
1612 dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
a5c31904
AG
1613 err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
1614 if (err)
1615 return err;
88104ca4 1616 data_size -= dgs;
b411b363
PR
1617 }
1618
b411b363
PR
1619 /* optimistically update recv_cnt. if receiving fails below,
1620 * we disconnect anyways, and counters will be reset. */
1621 mdev->recv_cnt += data_size>>9;
1622
1623 bio = req->master_bio;
1624 D_ASSERT(sector == bio->bi_sector);
1625
1626 bio_for_each_segment(bvec, bio, i) {
a5c31904 1627 void *mapped = kmap(bvec->bv_page) + bvec->bv_offset;
b411b363 1628 expect = min_t(int, data_size, bvec->bv_len);
a5c31904 1629 err = drbd_recv_all_warn(mdev->tconn, mapped, expect);
b411b363 1630 kunmap(bvec->bv_page);
a5c31904
AG
1631 if (err)
1632 return err;
1633 data_size -= expect;
b411b363
PR
1634 }
1635
1636 if (dgs) {
5b614abe 1637 drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv);
b411b363
PR
1638 if (memcmp(dig_in, dig_vv, dgs)) {
1639 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
28284cef 1640 return -EINVAL;
b411b363
PR
1641 }
1642 }
1643
1644 D_ASSERT(data_size == 0);
28284cef 1645 return 0;
b411b363
PR
1646}
1647
a990be46
AG
1648/*
1649 * e_end_resync_block() is called in asender context via
1650 * drbd_finish_peer_reqs().
1651 */
99920dc5 1652static int e_end_resync_block(struct drbd_work *w, int unused)
b411b363 1653{
8050e6d0
AG
1654 struct drbd_peer_request *peer_req =
1655 container_of(w, struct drbd_peer_request, w);
00d56944 1656 struct drbd_conf *mdev = w->mdev;
db830c46 1657 sector_t sector = peer_req->i.sector;
99920dc5 1658 int err;
b411b363 1659
db830c46 1660 D_ASSERT(drbd_interval_empty(&peer_req->i));
b411b363 1661
db830c46
AG
1662 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1663 drbd_set_in_sync(mdev, sector, peer_req->i.size);
99920dc5 1664 err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req);
b411b363
PR
1665 } else {
1666 /* Record failure to sync */
db830c46 1667 drbd_rs_failed_io(mdev, sector, peer_req->i.size);
b411b363 1668
99920dc5 1669 err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
b411b363
PR
1670 }
1671 dec_unacked(mdev);
1672
99920dc5 1673 return err;
b411b363
PR
1674}
1675
1676static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1677{
db830c46 1678 struct drbd_peer_request *peer_req;
b411b363 1679
db830c46
AG
1680 peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size);
1681 if (!peer_req)
45bb912b 1682 goto fail;
b411b363
PR
1683
1684 dec_rs_pending(mdev);
1685
b411b363
PR
1686 inc_unacked(mdev);
1687 /* corresponding dec_unacked() in e_end_resync_block()
1688 * respective _drbd_clear_done_ee */
1689
db830c46 1690 peer_req->w.cb = e_end_resync_block;
45bb912b 1691
87eeee41 1692 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 1693 list_add(&peer_req->w.list, &mdev->sync_ee);
87eeee41 1694 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 1695
0f0601f4 1696 atomic_add(data_size >> 9, &mdev->rs_sect_ev);
fbe29dec 1697 if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
e1c1b0fc 1698 return 0;
b411b363 1699
10f6d992
LE
1700 /* don't care for the reason here */
1701 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 1702 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 1703 list_del(&peer_req->w.list);
87eeee41 1704 spin_unlock_irq(&mdev->tconn->req_lock);
22cc37a9 1705
3967deb1 1706 drbd_free_peer_req(mdev, peer_req);
45bb912b
LE
1707fail:
1708 put_ldev(mdev);
e1c1b0fc 1709 return -EIO;
b411b363
PR
1710}
1711
668eebc6 1712static struct drbd_request *
bc9c5c41
AG
1713find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id,
1714 sector_t sector, bool missing_ok, const char *func)
51624585 1715{
51624585
AG
1716 struct drbd_request *req;
1717
bc9c5c41
AG
1718 /* Request object according to our peer */
1719 req = (struct drbd_request *)(unsigned long)id;
5e472264 1720 if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
668eebc6 1721 return req;
c3afd8f5 1722 if (!missing_ok) {
5af172ed 1723 dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func,
c3afd8f5
AG
1724 (unsigned long)id, (unsigned long long)sector);
1725 }
51624585
AG
1726 return NULL;
1727}
1728
4a76b161 1729static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 1730{
4a76b161 1731 struct drbd_conf *mdev;
b411b363
PR
1732 struct drbd_request *req;
1733 sector_t sector;
82bc0194 1734 int err;
e658983a 1735 struct p_data *p = pi->data;
4a76b161
AG
1736
1737 mdev = vnr_to_mdev(tconn, pi->vnr);
1738 if (!mdev)
1739 return -EIO;
b411b363
PR
1740
1741 sector = be64_to_cpu(p->sector);
1742
87eeee41 1743 spin_lock_irq(&mdev->tconn->req_lock);
bc9c5c41 1744 req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__);
87eeee41 1745 spin_unlock_irq(&mdev->tconn->req_lock);
c3afd8f5 1746 if (unlikely(!req))
82bc0194 1747 return -EIO;
b411b363 1748
24c4830c 1749 /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
b411b363
PR
1750 * special casing it there for the various failure cases.
1751 * still no race with drbd_fail_pending_reads */
e2857216 1752 err = recv_dless_read(mdev, req, sector, pi->size);
82bc0194 1753 if (!err)
8554df1c 1754 req_mod(req, DATA_RECEIVED);
b411b363
PR
1755 /* else: nothing. handled from drbd_disconnect...
1756 * I don't think we may complete this just yet
1757 * in case we are "on-disconnect: freeze" */
1758
82bc0194 1759 return err;
b411b363
PR
1760}
1761
4a76b161 1762static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 1763{
4a76b161 1764 struct drbd_conf *mdev;
b411b363 1765 sector_t sector;
82bc0194 1766 int err;
e658983a 1767 struct p_data *p = pi->data;
4a76b161
AG
1768
1769 mdev = vnr_to_mdev(tconn, pi->vnr);
1770 if (!mdev)
1771 return -EIO;
b411b363
PR
1772
1773 sector = be64_to_cpu(p->sector);
1774 D_ASSERT(p->block_id == ID_SYNCER);
1775
1776 if (get_ldev(mdev)) {
1777 /* data is submitted to disk within recv_resync_read.
1778 * corresponding put_ldev done below on error,
fcefa62e 1779 * or in drbd_peer_request_endio. */
e2857216 1780 err = recv_resync_read(mdev, sector, pi->size);
b411b363
PR
1781 } else {
1782 if (__ratelimit(&drbd_ratelimit_state))
1783 dev_err(DEV, "Can not write resync data to local disk.\n");
1784
e2857216 1785 err = drbd_drain_block(mdev, pi->size);
b411b363 1786
e2857216 1787 drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
b411b363
PR
1788 }
1789
e2857216 1790 atomic_add(pi->size >> 9, &mdev->rs_sect_in);
778f271d 1791
82bc0194 1792 return err;
b411b363
PR
1793}
1794
7be8da07
AG
1795static void restart_conflicting_writes(struct drbd_conf *mdev,
1796 sector_t sector, int size)
1797{
1798 struct drbd_interval *i;
1799 struct drbd_request *req;
1800
1801 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
1802 if (!i->local)
1803 continue;
1804 req = container_of(i, struct drbd_request, i);
1805 if (req->rq_state & RQ_LOCAL_PENDING ||
1806 !(req->rq_state & RQ_POSTPONED))
1807 continue;
2312f0b3
LE
1808 /* as it is RQ_POSTPONED, this will cause it to
1809 * be queued on the retry workqueue. */
1810 __req_mod(req, DISCARD_WRITE, NULL);
7be8da07
AG
1811 }
1812}
1813
a990be46
AG
1814/*
1815 * e_end_block() is called in asender context via drbd_finish_peer_reqs().
b411b363 1816 */
99920dc5 1817static int e_end_block(struct drbd_work *w, int cancel)
b411b363 1818{
8050e6d0
AG
1819 struct drbd_peer_request *peer_req =
1820 container_of(w, struct drbd_peer_request, w);
00d56944 1821 struct drbd_conf *mdev = w->mdev;
db830c46 1822 sector_t sector = peer_req->i.sector;
99920dc5 1823 int err = 0, pcmd;
b411b363 1824
303d1448 1825 if (peer_req->flags & EE_SEND_WRITE_ACK) {
db830c46 1826 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
b411b363
PR
1827 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1828 mdev->state.conn <= C_PAUSED_SYNC_T &&
db830c46 1829 peer_req->flags & EE_MAY_SET_IN_SYNC) ?
b411b363 1830 P_RS_WRITE_ACK : P_WRITE_ACK;
99920dc5 1831 err = drbd_send_ack(mdev, pcmd, peer_req);
b411b363 1832 if (pcmd == P_RS_WRITE_ACK)
db830c46 1833 drbd_set_in_sync(mdev, sector, peer_req->i.size);
b411b363 1834 } else {
99920dc5 1835 err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
b411b363
PR
1836 /* we expect it to be marked out of sync anyways...
1837 * maybe assert this? */
1838 }
1839 dec_unacked(mdev);
1840 }
1841 /* we delete from the conflict detection hash _after_ we sent out the
1842 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
302bdeae 1843 if (peer_req->flags & EE_IN_INTERVAL_TREE) {
87eeee41 1844 spin_lock_irq(&mdev->tconn->req_lock);
db830c46
AG
1845 D_ASSERT(!drbd_interval_empty(&peer_req->i));
1846 drbd_remove_epoch_entry_interval(mdev, peer_req);
7be8da07
AG
1847 if (peer_req->flags & EE_RESTART_REQUESTS)
1848 restart_conflicting_writes(mdev, sector, peer_req->i.size);
87eeee41 1849 spin_unlock_irq(&mdev->tconn->req_lock);
bb3bfe96 1850 } else
db830c46 1851 D_ASSERT(drbd_interval_empty(&peer_req->i));
b411b363 1852
1e9dd291 1853 drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
b411b363 1854
99920dc5 1855 return err;
b411b363
PR
1856}
1857
7be8da07 1858static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
b411b363 1859{
7be8da07 1860 struct drbd_conf *mdev = w->mdev;
8050e6d0
AG
1861 struct drbd_peer_request *peer_req =
1862 container_of(w, struct drbd_peer_request, w);
99920dc5 1863 int err;
b411b363 1864
99920dc5 1865 err = drbd_send_ack(mdev, ack, peer_req);
b411b363
PR
1866 dec_unacked(mdev);
1867
99920dc5 1868 return err;
b411b363
PR
1869}
1870
99920dc5 1871static int e_send_discard_write(struct drbd_work *w, int unused)
7be8da07
AG
1872{
1873 return e_send_ack(w, P_DISCARD_WRITE);
1874}
1875
99920dc5 1876static int e_send_retry_write(struct drbd_work *w, int unused)
7be8da07
AG
1877{
1878 struct drbd_tconn *tconn = w->mdev->tconn;
1879
1880 return e_send_ack(w, tconn->agreed_pro_version >= 100 ?
1881 P_RETRY_WRITE : P_DISCARD_WRITE);
1882}
1883
3e394da1
AG
1884static bool seq_greater(u32 a, u32 b)
1885{
1886 /*
1887 * We assume 32-bit wrap-around here.
1888 * For 24-bit wrap-around, we would have to shift:
1889 * a <<= 8; b <<= 8;
1890 */
1891 return (s32)a - (s32)b > 0;
1892}
1893
1894static u32 seq_max(u32 a, u32 b)
1895{
1896 return seq_greater(a, b) ? a : b;
1897}
1898
7be8da07
AG
1899static bool need_peer_seq(struct drbd_conf *mdev)
1900{
1901 struct drbd_tconn *tconn = mdev->tconn;
302bdeae 1902 int tp;
7be8da07
AG
1903
1904 /*
1905 * We only need to keep track of the last packet_seq number of our peer
1906 * if we are in dual-primary mode and we have the discard flag set; see
1907 * handle_write_conflicts().
1908 */
302bdeae
PR
1909
1910 rcu_read_lock();
1911 tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
1912 rcu_read_unlock();
1913
1914 return tp && test_bit(DISCARD_CONCURRENT, &tconn->flags);
7be8da07
AG
1915}
1916
43ae077d 1917static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq)
3e394da1 1918{
3c13b680 1919 unsigned int newest_peer_seq;
3e394da1 1920
7be8da07
AG
1921 if (need_peer_seq(mdev)) {
1922 spin_lock(&mdev->peer_seq_lock);
3c13b680
LE
1923 newest_peer_seq = seq_max(mdev->peer_seq, peer_seq);
1924 mdev->peer_seq = newest_peer_seq;
7be8da07 1925 spin_unlock(&mdev->peer_seq_lock);
3c13b680
LE
1926 /* wake up only if we actually changed mdev->peer_seq */
1927 if (peer_seq == newest_peer_seq)
7be8da07
AG
1928 wake_up(&mdev->seq_wait);
1929 }
3e394da1
AG
1930}
1931
d93f6302
LE
1932static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
1933{
1934 return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
1935}
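/* Illustrative example: s1/s2 are 512-byte sector numbers while l1/l2 are
 * byte lengths, so a 4096-byte request at sector 8 covers sectors [8, 16).
 *   overlaps(8, 4096, 10, 4096) -> !((8 + 8 <= 10) || (8 >= 10 + 8)) -> true
 *   overlaps(8, 4096, 16, 4096) -> !((8 + 8 <= 16) || ...)           -> false
 * i.e. [8,16) overlaps [10,18) but only touches [16,24) at its boundary. */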
1936
1937/* maybe change sync_ee into interval trees as well? */
3ea35df8 1938static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
d93f6302
LE
1939{
1940 struct drbd_peer_request *rs_req;
1941 bool rv = 0;
1942
1943 spin_lock_irq(&mdev->tconn->req_lock);
1944 list_for_each_entry(rs_req, &mdev->sync_ee, w.list) {
1945 if (overlaps(peer_req->i.sector, peer_req->i.size,
1946 rs_req->i.sector, rs_req->i.size)) {
1947 rv = 1;
1948 break;
1949 }
1950 }
1951 spin_unlock_irq(&mdev->tconn->req_lock);
1952
d93f6302
LE
1953 return rv;
1954}
1955
b411b363
PR
1956/* Called from receive_Data.
1957 * Synchronize packets on sock with packets on msock.
1958 *
 1959 * This is here so that even when a P_DATA packet traveling via sock overtakes an Ack
1960 * packet traveling on msock, they are still processed in the order they have
1961 * been sent.
1962 *
1963 * Note: we don't care for Ack packets overtaking P_DATA packets.
1964 *
 1965 * In case packet_seq is larger than mdev->peer_seq, there are
1966 * outstanding packets on the msock. We wait for them to arrive.
1967 * In case we are the logically next packet, we update mdev->peer_seq
1968 * ourselves. Correctly handles 32bit wrap around.
1969 *
1970 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1971 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1972 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
 1973 * 1<<11 == 2048 seconds aka ages for the 32bit wrap around...
1974 *
1975 * returns 0 if we may process the packet,
1976 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
7be8da07 1977static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq)
b411b363
PR
1978{
1979 DEFINE_WAIT(wait);
b411b363 1980 long timeout;
7be8da07
AG
1981 int ret;
1982
1983 if (!need_peer_seq(mdev))
1984 return 0;
1985
b411b363
PR
1986 spin_lock(&mdev->peer_seq_lock);
1987 for (;;) {
7be8da07
AG
1988 if (!seq_greater(peer_seq - 1, mdev->peer_seq)) {
1989 mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq);
1990 ret = 0;
b411b363 1991 break;
7be8da07 1992 }
b411b363
PR
1993 if (signal_pending(current)) {
1994 ret = -ERESTARTSYS;
1995 break;
1996 }
7be8da07 1997 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
b411b363 1998 spin_unlock(&mdev->peer_seq_lock);
44ed167d
PR
1999 rcu_read_lock();
2000 timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10;
2001 rcu_read_unlock();
71b1c1eb 2002 timeout = schedule_timeout(timeout);
b411b363 2003 spin_lock(&mdev->peer_seq_lock);
7be8da07 2004 if (!timeout) {
b411b363 2005 ret = -ETIMEDOUT;
71b1c1eb 2006 dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n");
b411b363
PR
2007 break;
2008 }
2009 }
b411b363 2010 spin_unlock(&mdev->peer_seq_lock);
7be8da07 2011 finish_wait(&mdev->seq_wait, &wait);
b411b363
PR
2012 return ret;
2013}
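/* Illustrative example of the wait condition above: the loop exits as soon
 * as peer_seq <= mdev->peer_seq + 1. With mdev->peer_seq == 10, a P_DATA
 * packet carrying seq_num 11 is processed immediately, while one carrying
 * seq_num 13 waits until the packets with sequence numbers 11 and 12 have
 * arrived on the msock and advanced mdev->peer_seq far enough. */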
2014
688593c5
LE
 2015/* see also bio_flags_to_wire():
 2016 * we need to map DRBD_REQ_* semantically to data packet flags and back,
 2017 * because we may replicate to peers running other kernel versions. */
2018static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
76d2e7ec 2019{
688593c5
LE
2020 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
2021 (dpf & DP_FUA ? REQ_FUA : 0) |
2022 (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
2023 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
76d2e7ec
PR
2024}
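/* Illustrative example: a peer write sent with DP_RW_SYNC | DP_FUA maps to
 * REQ_SYNC | REQ_FUA on the local bio, so the ordering/durability intent of
 * the original submitter survives the trip over the wire regardless of the
 * REQ_* bit values on the peer's kernel. */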
2025
7be8da07
AG
2026static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector,
2027 unsigned int size)
2028{
2029 struct drbd_interval *i;
2030
2031 repeat:
2032 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
2033 struct drbd_request *req;
2034 struct bio_and_error m;
2035
2036 if (!i->local)
2037 continue;
2038 req = container_of(i, struct drbd_request, i);
2039 if (!(req->rq_state & RQ_POSTPONED))
2040 continue;
2041 req->rq_state &= ~RQ_POSTPONED;
2042 __req_mod(req, NEG_ACKED, &m);
2043 spin_unlock_irq(&mdev->tconn->req_lock);
2044 if (m.bio)
2045 complete_master_bio(mdev, &m);
2046 spin_lock_irq(&mdev->tconn->req_lock);
2047 goto repeat;
2048 }
2049}
2050
2051static int handle_write_conflicts(struct drbd_conf *mdev,
2052 struct drbd_peer_request *peer_req)
2053{
2054 struct drbd_tconn *tconn = mdev->tconn;
2055 bool resolve_conflicts = test_bit(DISCARD_CONCURRENT, &tconn->flags);
2056 sector_t sector = peer_req->i.sector;
2057 const unsigned int size = peer_req->i.size;
2058 struct drbd_interval *i;
2059 bool equal;
2060 int err;
2061
2062 /*
2063 * Inserting the peer request into the write_requests tree will prevent
2064 * new conflicting local requests from being added.
2065 */
2066 drbd_insert_interval(&mdev->write_requests, &peer_req->i);
2067
2068 repeat:
2069 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
2070 if (i == &peer_req->i)
2071 continue;
2072
2073 if (!i->local) {
2074 /*
2075 * Our peer has sent a conflicting remote request; this
2076 * should not happen in a two-node setup. Wait for the
2077 * earlier peer request to complete.
2078 */
2079 err = drbd_wait_misc(mdev, i);
2080 if (err)
2081 goto out;
2082 goto repeat;
2083 }
2084
2085 equal = i->sector == sector && i->size == size;
2086 if (resolve_conflicts) {
2087 /*
2088 * If the peer request is fully contained within the
2089 * overlapping request, it can be discarded; otherwise,
2090 * it will be retried once all overlapping requests
2091 * have completed.
2092 */
2093 bool discard = i->sector <= sector && i->sector +
2094 (i->size >> 9) >= sector + (size >> 9);
2095
2096 if (!equal)
2097 dev_alert(DEV, "Concurrent writes detected: "
2098 "local=%llus +%u, remote=%llus +%u, "
2099 "assuming %s came first\n",
2100 (unsigned long long)i->sector, i->size,
2101 (unsigned long long)sector, size,
2102 discard ? "local" : "remote");
2103
2104 inc_unacked(mdev);
2105 peer_req->w.cb = discard ? e_send_discard_write :
2106 e_send_retry_write;
2107 list_add_tail(&peer_req->w.list, &mdev->done_ee);
2108 wake_asender(mdev->tconn);
2109
2110 err = -ENOENT;
2111 goto out;
2112 } else {
2113 struct drbd_request *req =
2114 container_of(i, struct drbd_request, i);
2115
2116 if (!equal)
2117 dev_alert(DEV, "Concurrent writes detected: "
2118 "local=%llus +%u, remote=%llus +%u\n",
2119 (unsigned long long)i->sector, i->size,
2120 (unsigned long long)sector, size);
2121
2122 if (req->rq_state & RQ_LOCAL_PENDING ||
2123 !(req->rq_state & RQ_POSTPONED)) {
2124 /*
2125 * Wait for the node with the discard flag to
2126 * decide if this request will be discarded or
2127 * retried. Requests that are discarded will
2128 * disappear from the write_requests tree.
2129 *
2130 * In addition, wait for the conflicting
2131 * request to finish locally before submitting
2132 * the conflicting peer request.
2133 */
2134 err = drbd_wait_misc(mdev, &req->i);
2135 if (err) {
2136 _conn_request_state(mdev->tconn,
2137 NS(conn, C_TIMEOUT),
2138 CS_HARD);
2139 fail_postponed_requests(mdev, sector, size);
2140 goto out;
2141 }
2142 goto repeat;
2143 }
2144 /*
2145 * Remember to restart the conflicting requests after
2146 * the new peer request has completed.
2147 */
2148 peer_req->flags |= EE_RESTART_REQUESTS;
2149 }
2150 }
2151 err = 0;
2152
2153 out:
2154 if (err)
2155 drbd_remove_epoch_entry_interval(mdev, peer_req);
2156 return err;
2157}
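/* Illustrative example of the discard test above (dual-primary, with the
 * DISCARD_CONCURRENT flag set on this node): a peer write at sector 100,
 * size 8192 (sectors [100, 116)) that conflicts with a local request at
 * sector 96, size 16384 (sectors [96, 128)) is fully contained in the local
 * interval, so it is answered via e_send_discard_write; if it only partially
 * overlapped, it would be answered via e_send_retry_write instead, once all
 * overlapping requests have completed. */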
2158
b411b363 2159/* mirrored write */
4a76b161 2160static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 2161{
4a76b161 2162 struct drbd_conf *mdev;
b411b363 2163 sector_t sector;
db830c46 2164 struct drbd_peer_request *peer_req;
e658983a 2165 struct p_data *p = pi->data;
7be8da07 2166 u32 peer_seq = be32_to_cpu(p->seq_num);
b411b363
PR
2167 int rw = WRITE;
2168 u32 dp_flags;
302bdeae 2169 int err, tp;
b411b363 2170
4a76b161
AG
2171 mdev = vnr_to_mdev(tconn, pi->vnr);
2172 if (!mdev)
2173 return -EIO;
2174
7be8da07 2175 if (!get_ldev(mdev)) {
82bc0194
AG
2176 int err2;
2177
7be8da07 2178 err = wait_for_and_update_peer_seq(mdev, peer_seq);
e2857216 2179 drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
12038a3a 2180 atomic_inc(&tconn->current_epoch->epoch_size);
e2857216 2181 err2 = drbd_drain_block(mdev, pi->size);
82bc0194
AG
2182 if (!err)
2183 err = err2;
2184 return err;
b411b363
PR
2185 }
2186
fcefa62e
AG
2187 /*
2188 * Corresponding put_ldev done either below (on various errors), or in
2189 * drbd_peer_request_endio, if we successfully submit the data at the
2190 * end of this function.
2191 */
b411b363
PR
2192
2193 sector = be64_to_cpu(p->sector);
e2857216 2194 peer_req = read_in_block(mdev, p->block_id, sector, pi->size);
db830c46 2195 if (!peer_req) {
b411b363 2196 put_ldev(mdev);
82bc0194 2197 return -EIO;
b411b363
PR
2198 }
2199
db830c46 2200 peer_req->w.cb = e_end_block;
b411b363 2201
688593c5
LE
2202 dp_flags = be32_to_cpu(p->dp_flags);
2203 rw |= wire_flags_to_bio(mdev, dp_flags);
81a3537a
LE
2204 if (peer_req->pages == NULL) {
2205 D_ASSERT(peer_req->i.size == 0);
2206 D_ASSERT(dp_flags & DP_FLUSH);
2207 }
688593c5
LE
2208
2209 if (dp_flags & DP_MAY_SET_IN_SYNC)
db830c46 2210 peer_req->flags |= EE_MAY_SET_IN_SYNC;
688593c5 2211
12038a3a
PR
2212 spin_lock(&tconn->epoch_lock);
2213 peer_req->epoch = tconn->current_epoch;
db830c46
AG
2214 atomic_inc(&peer_req->epoch->epoch_size);
2215 atomic_inc(&peer_req->epoch->active);
12038a3a 2216 spin_unlock(&tconn->epoch_lock);
b411b363 2217
302bdeae
PR
2218 rcu_read_lock();
2219 tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
2220 rcu_read_unlock();
2221 if (tp) {
2222 peer_req->flags |= EE_IN_INTERVAL_TREE;
7be8da07
AG
2223 err = wait_for_and_update_peer_seq(mdev, peer_seq);
2224 if (err)
b411b363 2225 goto out_interrupted;
87eeee41 2226 spin_lock_irq(&mdev->tconn->req_lock);
7be8da07
AG
2227 err = handle_write_conflicts(mdev, peer_req);
2228 if (err) {
2229 spin_unlock_irq(&mdev->tconn->req_lock);
2230 if (err == -ENOENT) {
b411b363 2231 put_ldev(mdev);
82bc0194 2232 return 0;
b411b363 2233 }
7be8da07 2234 goto out_interrupted;
b411b363 2235 }
7be8da07
AG
2236 } else
2237 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2238 list_add(&peer_req->w.list, &mdev->active_ee);
87eeee41 2239 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 2240
d93f6302 2241 if (mdev->state.conn == C_SYNC_TARGET)
3ea35df8 2242 wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, peer_req));
d93f6302 2243
303d1448 2244 if (mdev->tconn->agreed_pro_version < 100) {
44ed167d
PR
2245 rcu_read_lock();
2246 switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) {
303d1448
PR
2247 case DRBD_PROT_C:
2248 dp_flags |= DP_SEND_WRITE_ACK;
2249 break;
2250 case DRBD_PROT_B:
2251 dp_flags |= DP_SEND_RECEIVE_ACK;
2252 break;
2253 }
44ed167d 2254 rcu_read_unlock();
303d1448
PR
2255 }
2256
2257 if (dp_flags & DP_SEND_WRITE_ACK) {
2258 peer_req->flags |= EE_SEND_WRITE_ACK;
b411b363
PR
2259 inc_unacked(mdev);
 2260 /* corresponding dec_unacked() in e_end_block()
 2261 * or in _drbd_clear_done_ee, respectively */
303d1448
PR
2262 }
2263
2264 if (dp_flags & DP_SEND_RECEIVE_ACK) {
b411b363
PR
2265 /* I really don't like it that the receiver thread
2266 * sends on the msock, but anyways */
db830c46 2267 drbd_send_ack(mdev, P_RECV_ACK, peer_req);
b411b363
PR
2268 }
2269
6719fb03 2270 if (mdev->state.pdsk < D_INCONSISTENT) {
b411b363 2271 /* In case we have the only disk of the cluster, */
db830c46
AG
2272 drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
2273 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2274 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
181286ad 2275 drbd_al_begin_io(mdev, &peer_req->i);
b411b363
PR
2276 }
2277
82bc0194
AG
2278 err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
2279 if (!err)
2280 return 0;
b411b363 2281
10f6d992
LE
2282 /* don't care for the reason here */
2283 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 2284 spin_lock_irq(&mdev->tconn->req_lock);
db830c46
AG
2285 list_del(&peer_req->w.list);
2286 drbd_remove_epoch_entry_interval(mdev, peer_req);
87eeee41 2287 spin_unlock_irq(&mdev->tconn->req_lock);
db830c46 2288 if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
181286ad 2289 drbd_al_complete_io(mdev, &peer_req->i);
22cc37a9 2290
b411b363 2291out_interrupted:
1e9dd291 2292 drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP);
b411b363 2293 put_ldev(mdev);
3967deb1 2294 drbd_free_peer_req(mdev, peer_req);
82bc0194 2295 return err;
b411b363
PR
2296}
2297
0f0601f4
LE
2298/* We may throttle resync, if the lower device seems to be busy,
2299 * and current sync rate is above c_min_rate.
2300 *
2301 * To decide whether or not the lower device is busy, we use a scheme similar
 2302 * to MD RAID's is_mddev_idle(): if the partition stats reveal a "significant"
 2303 * amount (more than 64 sectors) of activity that we cannot account for with our
 2304 * own resync activity, the device obviously is "busy".
2305 *
2306 * The current sync rate used here uses only the most recent two step marks,
2307 * to have a short time average so we can react faster.
2308 */
e3555d85 2309int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
0f0601f4
LE
2310{
2311 struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
2312 unsigned long db, dt, dbdt;
e3555d85 2313 struct lc_element *tmp;
0f0601f4
LE
2314 int curr_events;
2315 int throttle = 0;
daeda1cc
PR
2316 unsigned int c_min_rate;
2317
2318 rcu_read_lock();
2319 c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate;
2320 rcu_read_unlock();
0f0601f4
LE
2321
2322 /* feature disabled? */
daeda1cc 2323 if (c_min_rate == 0)
0f0601f4
LE
2324 return 0;
2325
e3555d85
PR
2326 spin_lock_irq(&mdev->al_lock);
2327 tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
2328 if (tmp) {
2329 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2330 if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
2331 spin_unlock_irq(&mdev->al_lock);
2332 return 0;
2333 }
2334 /* Do not slow down if app IO is already waiting for this extent */
2335 }
2336 spin_unlock_irq(&mdev->al_lock);
2337
0f0601f4
LE
2338 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2339 (int)part_stat_read(&disk->part0, sectors[1]) -
2340 atomic_read(&mdev->rs_sect_ev);
e3555d85 2341
0f0601f4
LE
2342 if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
2343 unsigned long rs_left;
2344 int i;
2345
2346 mdev->rs_last_events = curr_events;
2347
2348 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2349 * approx. */
2649f080
LE
2350 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2351
2352 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
2353 rs_left = mdev->ov_left;
2354 else
2355 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
0f0601f4
LE
2356
2357 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
2358 if (!dt)
2359 dt++;
2360 db = mdev->rs_mark_left[i] - rs_left;
2361 dbdt = Bit2KB(db/dt);
2362
daeda1cc 2363 if (dbdt > c_min_rate)
0f0601f4
LE
2364 throttle = 1;
2365 }
2366 return throttle;
2367}
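/* Worked example (illustrative, assuming the usual 4 KiB bitmap granularity,
 * i.e. Bit2KB(x) == x * 4): if the most recent marks show db == 1536 bits
 * cleared over dt == 3 seconds, the short-term resync rate is
 * dbdt == Bit2KB(1536 / 3) == 2048 KB/s. If c_min_rate is configured below
 * that (say 250 KB/s) and the backing device shows more than 64 sectors of
 * activity we cannot attribute to resync, the function returns 1 and the
 * caller briefly throttles the resync request. */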
2368
2369
4a76b161 2370static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 2371{
4a76b161 2372 struct drbd_conf *mdev;
b411b363 2373 sector_t sector;
4a76b161 2374 sector_t capacity;
db830c46 2375 struct drbd_peer_request *peer_req;
b411b363 2376 struct digest_info *di = NULL;
b18b37be 2377 int size, verb;
b411b363 2378 unsigned int fault_type;
e658983a 2379 struct p_block_req *p = pi->data;
4a76b161
AG
2380
2381 mdev = vnr_to_mdev(tconn, pi->vnr);
2382 if (!mdev)
2383 return -EIO;
2384 capacity = drbd_get_capacity(mdev->this_bdev);
b411b363
PR
2385
2386 sector = be64_to_cpu(p->sector);
2387 size = be32_to_cpu(p->blksize);
2388
c670a398 2389 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
b411b363
PR
2390 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2391 (unsigned long long)sector, size);
82bc0194 2392 return -EINVAL;
b411b363
PR
2393 }
2394 if (sector + (size>>9) > capacity) {
2395 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2396 (unsigned long long)sector, size);
82bc0194 2397 return -EINVAL;
b411b363
PR
2398 }
2399
2400 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
b18b37be 2401 verb = 1;
e2857216 2402 switch (pi->cmd) {
b18b37be
PR
2403 case P_DATA_REQUEST:
2404 drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
2405 break;
2406 case P_RS_DATA_REQUEST:
2407 case P_CSUM_RS_REQUEST:
2408 case P_OV_REQUEST:
2409 drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
2410 break;
2411 case P_OV_REPLY:
2412 verb = 0;
2413 dec_rs_pending(mdev);
2414 drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
2415 break;
2416 default:
49ba9b1b 2417 BUG();
b18b37be
PR
2418 }
2419 if (verb && __ratelimit(&drbd_ratelimit_state))
b411b363
PR
2420 dev_err(DEV, "Can not satisfy peer's read request, "
2421 "no local data.\n");
b18b37be 2422
a821cc4a 2423 /* drain the payload, if any */
e2857216 2424 return drbd_drain_block(mdev, pi->size);
b411b363
PR
2425 }
2426
2427 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2428 * "criss-cross" setup, that might cause write-out on some other DRBD,
2429 * which in turn might block on the other node at this very place. */
0db55363 2430 peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO);
db830c46 2431 if (!peer_req) {
b411b363 2432 put_ldev(mdev);
82bc0194 2433 return -ENOMEM;
b411b363
PR
2434 }
2435
e2857216 2436 switch (pi->cmd) {
b411b363 2437 case P_DATA_REQUEST:
db830c46 2438 peer_req->w.cb = w_e_end_data_req;
b411b363 2439 fault_type = DRBD_FAULT_DT_RD;
80a40e43
LE
2440 /* application IO, don't drbd_rs_begin_io */
2441 goto submit;
2442
b411b363 2443 case P_RS_DATA_REQUEST:
db830c46 2444 peer_req->w.cb = w_e_end_rsdata_req;
b411b363 2445 fault_type = DRBD_FAULT_RS_RD;
5f9915bb
LE
2446 /* used in the sector offset progress display */
2447 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
b411b363
PR
2448 break;
2449
2450 case P_OV_REPLY:
2451 case P_CSUM_RS_REQUEST:
2452 fault_type = DRBD_FAULT_RS_RD;
e2857216 2453 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
b411b363
PR
2454 if (!di)
2455 goto out_free_e;
2456
e2857216 2457 di->digest_size = pi->size;
b411b363
PR
2458 di->digest = (((char *)di)+sizeof(struct digest_info));
2459
db830c46
AG
2460 peer_req->digest = di;
2461 peer_req->flags |= EE_HAS_DIGEST;
c36c3ced 2462
e2857216 2463 if (drbd_recv_all(mdev->tconn, di->digest, pi->size))
b411b363
PR
2464 goto out_free_e;
2465
e2857216 2466 if (pi->cmd == P_CSUM_RS_REQUEST) {
31890f4a 2467 D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
db830c46 2468 peer_req->w.cb = w_e_end_csum_rs_req;
5f9915bb
LE
2469 /* used in the sector offset progress display */
2470 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
e2857216 2471 } else if (pi->cmd == P_OV_REPLY) {
2649f080
LE
2472 /* track progress, we may need to throttle */
2473 atomic_add(size >> 9, &mdev->rs_sect_in);
db830c46 2474 peer_req->w.cb = w_e_end_ov_reply;
b411b363 2475 dec_rs_pending(mdev);
0f0601f4
LE
2476 /* drbd_rs_begin_io done when we sent this request,
2477 * but accounting still needs to be done. */
2478 goto submit_for_resync;
b411b363
PR
2479 }
2480 break;
2481
2482 case P_OV_REQUEST:
b411b363 2483 if (mdev->ov_start_sector == ~(sector_t)0 &&
31890f4a 2484 mdev->tconn->agreed_pro_version >= 90) {
de228bba
LE
2485 unsigned long now = jiffies;
2486 int i;
b411b363
PR
2487 mdev->ov_start_sector = sector;
2488 mdev->ov_position = sector;
30b743a2
LE
2489 mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
2490 mdev->rs_total = mdev->ov_left;
de228bba
LE
2491 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2492 mdev->rs_mark_left[i] = mdev->ov_left;
2493 mdev->rs_mark_time[i] = now;
2494 }
b411b363
PR
2495 dev_info(DEV, "Online Verify start sector: %llu\n",
2496 (unsigned long long)sector);
2497 }
db830c46 2498 peer_req->w.cb = w_e_end_ov_req;
b411b363 2499 fault_type = DRBD_FAULT_RS_RD;
b411b363
PR
2500 break;
2501
b411b363 2502 default:
49ba9b1b 2503 BUG();
b411b363
PR
2504 }
2505
0f0601f4
LE
2506 /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2507 * wrt the receiver, but it is not as straightforward as it may seem.
2508 * Various places in the resync start and stop logic assume resync
2509 * requests are processed in order, requeuing this on the worker thread
2510 * introduces a bunch of new code for synchronization between threads.
2511 *
2512 * Unlimited throttling before drbd_rs_begin_io may stall the resync
2513 * "forever", throttling after drbd_rs_begin_io will lock that extent
2514 * for application writes for the same time. For now, just throttle
2515 * here, where the rest of the code expects the receiver to sleep for
2516 * a while, anyways.
2517 */
2518
2519 /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2520 * this defers syncer requests for some time, before letting at least
 2521 * one request through. The resync controller on the receiving side
2522 * will adapt to the incoming rate accordingly.
2523 *
2524 * We cannot throttle here if remote is Primary/SyncTarget:
2525 * we would also throttle its application reads.
2526 * In that case, throttling is done on the SyncTarget only.
2527 */
e3555d85
PR
2528 if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
2529 schedule_timeout_uninterruptible(HZ/10);
2530 if (drbd_rs_begin_io(mdev, sector))
80a40e43 2531 goto out_free_e;
b411b363 2532
0f0601f4
LE
2533submit_for_resync:
2534 atomic_add(size >> 9, &mdev->rs_sect_ev);
2535
80a40e43 2536submit:
b411b363 2537 inc_unacked(mdev);
87eeee41 2538 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2539 list_add_tail(&peer_req->w.list, &mdev->read_ee);
87eeee41 2540 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 2541
fbe29dec 2542 if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0)
82bc0194 2543 return 0;
b411b363 2544
10f6d992
LE
2545 /* don't care for the reason here */
2546 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 2547 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2548 list_del(&peer_req->w.list);
87eeee41 2549 spin_unlock_irq(&mdev->tconn->req_lock);
22cc37a9
LE
2550 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2551
b411b363 2552out_free_e:
b411b363 2553 put_ldev(mdev);
3967deb1 2554 drbd_free_peer_req(mdev, peer_req);
82bc0194 2555 return -EIO;
b411b363
PR
2556}
2557
2558static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2559{
2560 int self, peer, rv = -100;
2561 unsigned long ch_self, ch_peer;
44ed167d 2562 enum drbd_after_sb_p after_sb_0p;
b411b363
PR
2563
2564 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2565 peer = mdev->p_uuid[UI_BITMAP] & 1;
2566
2567 ch_peer = mdev->p_uuid[UI_SIZE];
2568 ch_self = mdev->comm_bm_set;
2569
44ed167d
PR
2570 rcu_read_lock();
2571 after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p;
2572 rcu_read_unlock();
2573 switch (after_sb_0p) {
b411b363
PR
2574 case ASB_CONSENSUS:
2575 case ASB_DISCARD_SECONDARY:
2576 case ASB_CALL_HELPER:
44ed167d 2577 case ASB_VIOLENTLY:
b411b363
PR
2578 dev_err(DEV, "Configuration error.\n");
2579 break;
2580 case ASB_DISCONNECT:
2581 break;
2582 case ASB_DISCARD_YOUNGER_PRI:
2583 if (self == 0 && peer == 1) {
2584 rv = -1;
2585 break;
2586 }
2587 if (self == 1 && peer == 0) {
2588 rv = 1;
2589 break;
2590 }
2591 /* Else fall through to one of the other strategies... */
2592 case ASB_DISCARD_OLDER_PRI:
2593 if (self == 0 && peer == 1) {
2594 rv = 1;
2595 break;
2596 }
2597 if (self == 1 && peer == 0) {
2598 rv = -1;
2599 break;
2600 }
2601 /* Else fall through to one of the other strategies... */
ad19bf6e 2602 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
b411b363
PR
2603 "Using discard-least-changes instead\n");
2604 case ASB_DISCARD_ZERO_CHG:
2605 if (ch_peer == 0 && ch_self == 0) {
25703f83 2606 rv = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags)
b411b363
PR
2607 ? -1 : 1;
2608 break;
2609 } else {
2610 if (ch_peer == 0) { rv = 1; break; }
2611 if (ch_self == 0) { rv = -1; break; }
2612 }
44ed167d 2613 if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
b411b363
PR
2614 break;
2615 case ASB_DISCARD_LEAST_CHG:
2616 if (ch_self < ch_peer)
2617 rv = -1;
2618 else if (ch_self > ch_peer)
2619 rv = 1;
2620 else /* ( ch_self == ch_peer ) */
2621 /* Well, then use something else. */
25703f83 2622 rv = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags)
b411b363
PR
2623 ? -1 : 1;
2624 break;
2625 case ASB_DISCARD_LOCAL:
2626 rv = -1;
2627 break;
2628 case ASB_DISCARD_REMOTE:
2629 rv = 1;
2630 }
2631
2632 return rv;
2633}
2634
2635static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2636{
6184ea21 2637 int hg, rv = -100;
44ed167d 2638 enum drbd_after_sb_p after_sb_1p;
b411b363 2639
44ed167d
PR
2640 rcu_read_lock();
2641 after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p;
2642 rcu_read_unlock();
2643 switch (after_sb_1p) {
b411b363
PR
2644 case ASB_DISCARD_YOUNGER_PRI:
2645 case ASB_DISCARD_OLDER_PRI:
2646 case ASB_DISCARD_LEAST_CHG:
2647 case ASB_DISCARD_LOCAL:
2648 case ASB_DISCARD_REMOTE:
44ed167d 2649 case ASB_DISCARD_ZERO_CHG:
b411b363
PR
2650 dev_err(DEV, "Configuration error.\n");
2651 break;
2652 case ASB_DISCONNECT:
2653 break;
2654 case ASB_CONSENSUS:
2655 hg = drbd_asb_recover_0p(mdev);
2656 if (hg == -1 && mdev->state.role == R_SECONDARY)
2657 rv = hg;
2658 if (hg == 1 && mdev->state.role == R_PRIMARY)
2659 rv = hg;
2660 break;
2661 case ASB_VIOLENTLY:
2662 rv = drbd_asb_recover_0p(mdev);
2663 break;
2664 case ASB_DISCARD_SECONDARY:
2665 return mdev->state.role == R_PRIMARY ? 1 : -1;
2666 case ASB_CALL_HELPER:
2667 hg = drbd_asb_recover_0p(mdev);
2668 if (hg == -1 && mdev->state.role == R_PRIMARY) {
bb437946
AG
2669 enum drbd_state_rv rv2;
2670
2671 drbd_set_role(mdev, R_SECONDARY, 0);
b411b363
PR
2672 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2673 * we might be here in C_WF_REPORT_PARAMS which is transient.
2674 * we do not need to wait for the after state change work either. */
bb437946
AG
2675 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2676 if (rv2 != SS_SUCCESS) {
b411b363
PR
2677 drbd_khelper(mdev, "pri-lost-after-sb");
2678 } else {
2679 dev_warn(DEV, "Successfully gave up primary role.\n");
2680 rv = hg;
2681 }
2682 } else
2683 rv = hg;
2684 }
2685
2686 return rv;
2687}
2688
2689static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2690{
6184ea21 2691 int hg, rv = -100;
44ed167d 2692 enum drbd_after_sb_p after_sb_2p;
b411b363 2693
44ed167d
PR
2694 rcu_read_lock();
2695 after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p;
2696 rcu_read_unlock();
2697 switch (after_sb_2p) {
b411b363
PR
2698 case ASB_DISCARD_YOUNGER_PRI:
2699 case ASB_DISCARD_OLDER_PRI:
2700 case ASB_DISCARD_LEAST_CHG:
2701 case ASB_DISCARD_LOCAL:
2702 case ASB_DISCARD_REMOTE:
2703 case ASB_CONSENSUS:
2704 case ASB_DISCARD_SECONDARY:
44ed167d 2705 case ASB_DISCARD_ZERO_CHG:
b411b363
PR
2706 dev_err(DEV, "Configuration error.\n");
2707 break;
2708 case ASB_VIOLENTLY:
2709 rv = drbd_asb_recover_0p(mdev);
2710 break;
2711 case ASB_DISCONNECT:
2712 break;
2713 case ASB_CALL_HELPER:
2714 hg = drbd_asb_recover_0p(mdev);
2715 if (hg == -1) {
bb437946
AG
2716 enum drbd_state_rv rv2;
2717
b411b363
PR
2718 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2719 * we might be here in C_WF_REPORT_PARAMS which is transient.
2720 * we do not need to wait for the after state change work either. */
bb437946
AG
2721 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2722 if (rv2 != SS_SUCCESS) {
b411b363
PR
2723 drbd_khelper(mdev, "pri-lost-after-sb");
2724 } else {
2725 dev_warn(DEV, "Successfully gave up primary role.\n");
2726 rv = hg;
2727 }
2728 } else
2729 rv = hg;
2730 }
2731
2732 return rv;
2733}
2734
2735static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2736 u64 bits, u64 flags)
2737{
2738 if (!uuid) {
2739 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2740 return;
2741 }
2742 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2743 text,
2744 (unsigned long long)uuid[UI_CURRENT],
2745 (unsigned long long)uuid[UI_BITMAP],
2746 (unsigned long long)uuid[UI_HISTORY_START],
2747 (unsigned long long)uuid[UI_HISTORY_END],
2748 (unsigned long long)bits,
2749 (unsigned long long)flags);
2750}
2751
2752/*
2753 100 after split brain try auto recover
2754 2 C_SYNC_SOURCE set BitMap
2755 1 C_SYNC_SOURCE use BitMap
2756 0 no Sync
2757 -1 C_SYNC_TARGET use BitMap
2758 -2 C_SYNC_TARGET set BitMap
2759 -100 after split brain, disconnect
2760-1000 unrelated data
4a23f264
PR
2761-1091 requires proto 91
2762-1096 requires proto 96
b411b363
PR
2763 */
2764static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2765{
2766 u64 self, peer;
2767 int i, j;
2768
2769 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2770 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2771
2772 *rule_nr = 10;
2773 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2774 return 0;
2775
2776 *rule_nr = 20;
2777 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2778 peer != UUID_JUST_CREATED)
2779 return -2;
2780
2781 *rule_nr = 30;
2782 if (self != UUID_JUST_CREATED &&
2783 (peer == UUID_JUST_CREATED || peer == (u64)0))
2784 return 2;
2785
2786 if (self == peer) {
2787 int rct, dc; /* roles at crash time */
2788
2789 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2790
31890f4a 2791 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2792 return -1091;
b411b363
PR
2793
2794 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2795 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2796 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2797 drbd_uuid_set_bm(mdev, 0UL);
2798
2799 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2800 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2801 *rule_nr = 34;
2802 } else {
2803 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2804 *rule_nr = 36;
2805 }
2806
2807 return 1;
2808 }
2809
2810 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2811
31890f4a 2812 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2813 return -1091;
b411b363
PR
2814
2815 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2816 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2817 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2818
2819 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2820 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2821 mdev->p_uuid[UI_BITMAP] = 0UL;
2822
2823 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2824 *rule_nr = 35;
2825 } else {
2826 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2827 *rule_nr = 37;
2828 }
2829
2830 return -1;
2831 }
2832
2833 /* Common power [off|failure] */
2834 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2835 (mdev->p_uuid[UI_FLAGS] & 2);
2836 /* lowest bit is set when we were primary,
2837 * next bit (weight 2) is set when peer was primary */
2838 *rule_nr = 40;
2839
2840 switch (rct) {
2841 case 0: /* !self_pri && !peer_pri */ return 0;
2842 case 1: /* self_pri && !peer_pri */ return 1;
2843 case 2: /* !self_pri && peer_pri */ return -1;
2844 case 3: /* self_pri && peer_pri */
25703f83 2845 dc = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags);
b411b363
PR
2846 return dc ? -1 : 1;
2847 }
2848 }
2849
2850 *rule_nr = 50;
2851 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2852 if (self == peer)
2853 return -1;
2854
2855 *rule_nr = 51;
2856 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2857 if (self == peer) {
31890f4a 2858 if (mdev->tconn->agreed_pro_version < 96 ?
4a23f264
PR
2859 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
2860 (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
2861 peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
 2862 /* The last P_SYNC_UUID did not get through. Undo the sync-source
 2863 modifications of the peer's UUIDs made at the last start of resync. */
2864
31890f4a 2865 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2866 return -1091;
b411b363
PR
2867
2868 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2869 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
4a23f264 2870
1882e22d 2871 dev_info(DEV, "Lost last syncUUID packet, corrected:\n");
4a23f264
PR
2872 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2873
b411b363
PR
2874 return -1;
2875 }
2876 }
2877
2878 *rule_nr = 60;
2879 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2880 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2881 peer = mdev->p_uuid[i] & ~((u64)1);
2882 if (self == peer)
2883 return -2;
2884 }
2885
2886 *rule_nr = 70;
2887 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2888 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2889 if (self == peer)
2890 return 1;
2891
2892 *rule_nr = 71;
2893 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2894 if (self == peer) {
31890f4a 2895 if (mdev->tconn->agreed_pro_version < 96 ?
4a23f264
PR
2896 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
2897 (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
2898 self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
 2899 /* The last P_SYNC_UUID did not get through. Undo the sync-source
 2900 modifications of our UUIDs made at the last start of resync. */
2901
31890f4a 2902 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2903 return -1091;
b411b363
PR
2904
2905 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2906 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2907
4a23f264 2908 dev_info(DEV, "Last syncUUID did not get through, corrected:\n");
b411b363
PR
2909 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2910 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2911
2912 return 1;
2913 }
2914 }
2915
2916
2917 *rule_nr = 80;
d8c2a36b 2918 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363
PR
2919 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2920 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2921 if (self == peer)
2922 return 2;
2923 }
2924
2925 *rule_nr = 90;
2926 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2927 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2928 if (self == peer && self != ((u64)0))
2929 return 100;
2930
2931 *rule_nr = 100;
2932 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2933 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2934 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2935 peer = mdev->p_uuid[j] & ~((u64)1);
2936 if (self == peer)
2937 return -100;
2938 }
2939 }
2940
2941 return -1000;
2942}
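/* Illustrative reading of one of the rules above: a return value of 1 by
 * rule 70 means our bitmap UUID equals the peer's current UUID, i.e. the
 * peer has not written since we started tracking changes against that data
 * generation, so this node becomes C_SYNC_SOURCE and resyncs using the
 * existing bitmap ("1 C_SYNC_SOURCE use BitMap" in the table above). */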
2943
2944/* drbd_sync_handshake() returns the new conn state on success, or
2945 CONN_MASK (-1) on failure.
2946 */
2947static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2948 enum drbd_disk_state peer_disk) __must_hold(local)
2949{
b411b363
PR
2950 enum drbd_conns rv = C_MASK;
2951 enum drbd_disk_state mydisk;
44ed167d 2952 struct net_conf *nc;
6dff2902 2953 int hg, rule_nr, rr_conflict, tentative;
b411b363
PR
2954
2955 mydisk = mdev->state.disk;
2956 if (mydisk == D_NEGOTIATING)
2957 mydisk = mdev->new_state_tmp.disk;
2958
2959 dev_info(DEV, "drbd_sync_handshake:\n");
2960 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2961 drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2962 mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2963
2964 hg = drbd_uuid_compare(mdev, &rule_nr);
2965
2966 dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2967
2968 if (hg == -1000) {
2969 dev_alert(DEV, "Unrelated data, aborting!\n");
2970 return C_MASK;
2971 }
4a23f264
PR
2972 if (hg < -1000) {
2973 dev_alert(DEV, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
b411b363
PR
2974 return C_MASK;
2975 }
2976
2977 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2978 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
2979 int f = (hg == -100) || abs(hg) == 2;
2980 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2981 if (f)
2982 hg = hg*2;
2983 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2984 hg > 0 ? "source" : "target");
2985 }
2986
3a11a487
AG
2987 if (abs(hg) == 100)
2988 drbd_khelper(mdev, "initial-split-brain");
2989
44ed167d
PR
2990 rcu_read_lock();
2991 nc = rcu_dereference(mdev->tconn->net_conf);
2992
2993 if (hg == 100 || (hg == -100 && nc->always_asbp)) {
b411b363
PR
2994 int pcount = (mdev->state.role == R_PRIMARY)
2995 + (peer_role == R_PRIMARY);
2996 int forced = (hg == -100);
2997
2998 switch (pcount) {
2999 case 0:
3000 hg = drbd_asb_recover_0p(mdev);
3001 break;
3002 case 1:
3003 hg = drbd_asb_recover_1p(mdev);
3004 break;
3005 case 2:
3006 hg = drbd_asb_recover_2p(mdev);
3007 break;
3008 }
3009 if (abs(hg) < 100) {
3010 dev_warn(DEV, "Split-Brain detected, %d primaries, "
3011 "automatically solved. Sync from %s node\n",
3012 pcount, (hg < 0) ? "peer" : "this");
3013 if (forced) {
3014 dev_warn(DEV, "Doing a full sync, since"
3015 " UUIDs where ambiguous.\n");
3016 hg = hg*2;
3017 }
3018 }
3019 }
3020
3021 if (hg == -100) {
08b165ba 3022 if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1))
b411b363 3023 hg = -1;
08b165ba 3024 if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1))
b411b363
PR
3025 hg = 1;
3026
3027 if (abs(hg) < 100)
3028 dev_warn(DEV, "Split-Brain detected, manually solved. "
3029 "Sync from %s node\n",
3030 (hg < 0) ? "peer" : "this");
3031 }
44ed167d 3032 rr_conflict = nc->rr_conflict;
6dff2902 3033 tentative = nc->tentative;
44ed167d 3034 rcu_read_unlock();
b411b363
PR
3035
3036 if (hg == -100) {
580b9767
LE
3037 /* FIXME this log message is not correct if we end up here
3038 * after an attempted attach on a diskless node.
3039 * We just refuse to attach -- well, we drop the "connection"
3040 * to that disk, in a way... */
3a11a487 3041 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
b411b363
PR
3042 drbd_khelper(mdev, "split-brain");
3043 return C_MASK;
3044 }
3045
3046 if (hg > 0 && mydisk <= D_INCONSISTENT) {
3047 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
3048 return C_MASK;
3049 }
3050
3051 if (hg < 0 && /* by intention we do not use mydisk here. */
3052 mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
44ed167d 3053 switch (rr_conflict) {
b411b363
PR
3054 case ASB_CALL_HELPER:
3055 drbd_khelper(mdev, "pri-lost");
3056 /* fall through */
3057 case ASB_DISCONNECT:
3058 dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
3059 return C_MASK;
3060 case ASB_VIOLENTLY:
3061 dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
3062 "assumption\n");
3063 }
3064 }
3065
6dff2902 3066 if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) {
cf14c2e9
PR
3067 if (hg == 0)
3068 dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
3069 else
3070 dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.",
3071 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3072 abs(hg) >= 2 ? "full" : "bit-map based");
3073 return C_MASK;
3074 }
3075
b411b363
PR
3076 if (abs(hg) >= 2) {
3077 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
20ceb2b2
LE
3078 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
3079 BM_LOCKED_SET_ALLOWED))
b411b363
PR
3080 return C_MASK;
3081 }
3082
3083 if (hg > 0) { /* become sync source. */
3084 rv = C_WF_BITMAP_S;
3085 } else if (hg < 0) { /* become sync target */
3086 rv = C_WF_BITMAP_T;
3087 } else {
3088 rv = C_CONNECTED;
3089 if (drbd_bm_total_weight(mdev)) {
3090 dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
3091 drbd_bm_total_weight(mdev));
3092 }
3093 }
3094
3095 return rv;
3096}
3097
f179d76d 3098static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
b411b363
PR
3099{
3100 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
f179d76d
PR
3101 if (peer == ASB_DISCARD_REMOTE)
3102 return ASB_DISCARD_LOCAL;
b411b363
PR
3103
3104 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
f179d76d
PR
3105 if (peer == ASB_DISCARD_LOCAL)
3106 return ASB_DISCARD_REMOTE;
b411b363
PR
3107
3108 /* everything else is valid if they are equal on both sides. */
f179d76d 3109 return peer;
b411b363
PR
3110}
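/* Illustrative example: after-split-brain policies are expressed from each
 * node's own point of view, so a peer configured with ASB_DISCARD_REMOTE
 * corresponds to ASB_DISCARD_LOCAL on this side. receive_protocol() below
 * therefore compares convert_after_sb(p_after_sb_0p) against our own
 * after_sb_0p; the settings are compatible only when the converted values
 * match. */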
3111
e2857216 3112static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3113{
e658983a 3114 struct p_protocol *p = pi->data;
036b17ea
PR
3115 enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
3116 int p_proto, p_discard_my_data, p_two_primaries, cf;
3117 struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
3118 char integrity_alg[SHARED_SECRET_MAX] = "";
accdbcc5 3119 struct crypto_hash *peer_integrity_tfm = NULL;
7aca6c75 3120 void *int_dig_in = NULL, *int_dig_vv = NULL;
b411b363 3121
b411b363
PR
3122 p_proto = be32_to_cpu(p->protocol);
3123 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
3124 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
3125 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
b411b363 3126 p_two_primaries = be32_to_cpu(p->two_primaries);
cf14c2e9 3127 cf = be32_to_cpu(p->conn_flags);
6139f60d 3128 p_discard_my_data = cf & CF_DISCARD_MY_DATA;
cf14c2e9 3129
86db0618
AG
3130 if (tconn->agreed_pro_version >= 87) {
3131 int err;
3132
88104ca4 3133 if (pi->size > sizeof(integrity_alg))
86db0618 3134 return -EIO;
88104ca4 3135 err = drbd_recv_all(tconn, integrity_alg, pi->size);
86db0618
AG
3136 if (err)
3137 return err;
036b17ea
PR
3138 integrity_alg[SHARED_SECRET_MAX - 1] = 0;
3139 }
88104ca4 3140
7d4c782c 3141 if (pi->cmd != P_PROTOCOL_UPDATE) {
fbc12f45 3142 clear_bit(CONN_DRY_RUN, &tconn->flags);
036b17ea 3143
fbc12f45
AG
3144 if (cf & CF_DRY_RUN)
3145 set_bit(CONN_DRY_RUN, &tconn->flags);
cf14c2e9 3146
fbc12f45
AG
3147 rcu_read_lock();
3148 nc = rcu_dereference(tconn->net_conf);
b411b363 3149
fbc12f45 3150 if (p_proto != nc->wire_protocol) {
d505d9be 3151 conn_err(tconn, "incompatible %s settings\n", "protocol");
fbc12f45
AG
3152 goto disconnect_rcu_unlock;
3153 }
44ed167d 3154
fbc12f45 3155 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
d505d9be 3156 conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri");
fbc12f45
AG
3157 goto disconnect_rcu_unlock;
3158 }
b411b363 3159
fbc12f45 3160 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
d505d9be 3161 conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri");
fbc12f45
AG
3162 goto disconnect_rcu_unlock;
3163 }
b411b363 3164
fbc12f45 3165 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
d505d9be 3166 conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri");
fbc12f45
AG
3167 goto disconnect_rcu_unlock;
3168 }
b411b363 3169
fbc12f45 3170 if (p_discard_my_data && nc->discard_my_data) {
d505d9be 3171 conn_err(tconn, "incompatible %s settings\n", "discard-my-data");
fbc12f45
AG
3172 goto disconnect_rcu_unlock;
3173 }
b411b363 3174
fbc12f45 3175 if (p_two_primaries != nc->two_primaries) {
d505d9be 3176 conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries");
fbc12f45
AG
3177 goto disconnect_rcu_unlock;
3178 }
b411b363 3179
fbc12f45 3180 if (strcmp(integrity_alg, nc->integrity_alg)) {
d505d9be 3181 conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg");
fbc12f45
AG
3182 goto disconnect_rcu_unlock;
3183 }
b411b363 3184
fbc12f45 3185 rcu_read_unlock();
036b17ea 3186 }
7d4c782c
AG
3187
3188 if (integrity_alg[0]) {
3189 int hash_size;
3190
3191 /*
3192 * We can only change the peer data integrity algorithm
3193 * here. Changing our own data integrity algorithm
3194 * requires that we send a P_PROTOCOL_UPDATE packet at
3195 * the same time; otherwise, the peer has no way to
3196 * tell between which packets the algorithm should
3197 * change.
3198 */
3199
3200 peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
3201 if (!peer_integrity_tfm) {
3202 conn_err(tconn, "peer data-integrity-alg %s not supported\n",
3203 integrity_alg);
3204 goto disconnect;
3205 }
3206
3207 hash_size = crypto_hash_digestsize(peer_integrity_tfm);
3208 int_dig_in = kmalloc(hash_size, GFP_KERNEL);
3209 int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
3210 if (!(int_dig_in && int_dig_vv)) {
3211 conn_err(tconn, "Allocation of buffers for data integrity checking failed\n");
3212 goto disconnect;
3213 }
3214 }
3215
3216 new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
3217 if (!new_net_conf) {
3218 conn_err(tconn, "Allocation of new net_conf failed\n");
3219 goto disconnect;
3220 }
3221
3222 mutex_lock(&tconn->data.mutex);
3223 mutex_lock(&tconn->conf_update);
3224 old_net_conf = tconn->net_conf;
3225 *new_net_conf = *old_net_conf;
3226
3227 new_net_conf->wire_protocol = p_proto;
3228 new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
3229 new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
3230 new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
3231 new_net_conf->two_primaries = p_two_primaries;
3232
3233 rcu_assign_pointer(tconn->net_conf, new_net_conf);
3234 mutex_unlock(&tconn->conf_update);
3235 mutex_unlock(&tconn->data.mutex);
3236
3237 crypto_free_hash(tconn->peer_integrity_tfm);
3238 kfree(tconn->int_dig_in);
3239 kfree(tconn->int_dig_vv);
3240 tconn->peer_integrity_tfm = peer_integrity_tfm;
3241 tconn->int_dig_in = int_dig_in;
3242 tconn->int_dig_vv = int_dig_vv;
3243
3244 if (strcmp(old_net_conf->integrity_alg, integrity_alg))
3245 conn_info(tconn, "peer data-integrity-alg: %s\n",
3246 integrity_alg[0] ? integrity_alg : "(none)");
3247
3248 synchronize_rcu();
3249 kfree(old_net_conf);
82bc0194 3250 return 0;
b411b363 3251
44ed167d
PR
3252disconnect_rcu_unlock:
3253 rcu_read_unlock();
b411b363 3254disconnect:
b792c35c 3255 crypto_free_hash(peer_integrity_tfm);
036b17ea
PR
3256 kfree(int_dig_in);
3257 kfree(int_dig_vv);
7204624c 3258 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3259 return -EIO;
b411b363
PR
3260}
3261
3262/* helper function
3263 * input: alg name, feature name
3264 * return: NULL (alg name was "")
3265 * ERR_PTR(error) if something goes wrong
3266 * or the crypto hash ptr, if it worked out ok. */
3267struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
3268 const char *alg, const char *name)
3269{
3270 struct crypto_hash *tfm;
3271
3272 if (!alg[0])
3273 return NULL;
3274
3275 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
3276 if (IS_ERR(tfm)) {
3277 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
3278 alg, name, PTR_ERR(tfm));
3279 return tfm;
3280 }
b411b363
PR
3281 return tfm;
3282}
3283
4a76b161
AG
3284static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi)
3285{
3286 void *buffer = tconn->data.rbuf;
3287 int size = pi->size;
3288
3289 while (size) {
3290 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
3291 s = drbd_recv(tconn, buffer, s);
3292 if (s <= 0) {
3293 if (s < 0)
3294 return s;
3295 break;
3296 }
3297 size -= s;
3298 }
3299 if (size)
3300 return -EIO;
3301 return 0;
3302}
3303
3304/*
3305 * config_unknown_volume - device configuration command for unknown volume
3306 *
3307 * When a device is added to an existing connection, the node on which the
3308 * device is added first will send configuration commands to its peer but the
3309 * peer will not know about the device yet. It will warn and ignore these
3310 * commands. Once the device is added on the second node, the second node will
3311 * send the same device configuration commands, but in the other direction.
3312 *
3313 * (We can also end up here if drbd is misconfigured.)
3314 */
3315static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi)
3316{
2fcb8f30
AG
3317 conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n",
3318 cmdname(pi->cmd), pi->vnr);
4a76b161
AG
3319 return ignore_remaining_packet(tconn, pi);
3320}
3321
3322static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3323{
4a76b161 3324 struct drbd_conf *mdev;
e658983a 3325 struct p_rs_param_95 *p;
b411b363
PR
3326 unsigned int header_size, data_size, exp_max_sz;
3327 struct crypto_hash *verify_tfm = NULL;
3328 struct crypto_hash *csums_tfm = NULL;
2ec91e0e 3329 struct net_conf *old_net_conf, *new_net_conf = NULL;
813472ce 3330 struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
4a76b161 3331 const int apv = tconn->agreed_pro_version;
813472ce 3332 struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
778f271d 3333 int fifo_size = 0;
82bc0194 3334 int err;
b411b363 3335
4a76b161
AG
3336 mdev = vnr_to_mdev(tconn, pi->vnr);
3337 if (!mdev)
3338 return config_unknown_volume(tconn, pi);
3339
b411b363
PR
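	/* The SyncParam packet grew with the protocol versions; accept at most
	 * the size that matches the agreed version. */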
3340 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
3341 : apv == 88 ? sizeof(struct p_rs_param)
3342 + SHARED_SECRET_MAX
8e26f9cc
PR
3343 : apv <= 94 ? sizeof(struct p_rs_param_89)
3344 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363 3345
e2857216 3346 if (pi->size > exp_max_sz) {
b411b363 3347 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
e2857216 3348 pi->size, exp_max_sz);
82bc0194 3349 return -EIO;
b411b363
PR
3350 }
3351
3352 if (apv <= 88) {
e658983a 3353 header_size = sizeof(struct p_rs_param);
e2857216 3354 data_size = pi->size - header_size;
8e26f9cc 3355 } else if (apv <= 94) {
e658983a 3356 header_size = sizeof(struct p_rs_param_89);
e2857216 3357 data_size = pi->size - header_size;
b411b363 3358 D_ASSERT(data_size == 0);
8e26f9cc 3359 } else {
e658983a 3360 header_size = sizeof(struct p_rs_param_95);
e2857216 3361 data_size = pi->size - header_size;
b411b363
PR
3362 D_ASSERT(data_size == 0);
3363 }
3364
3365 /* initialize verify_alg and csums_alg */
e658983a 3366 p = pi->data;
b411b363
PR
3367 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
3368
e658983a 3369 err = drbd_recv_all(mdev->tconn, p, header_size);
82bc0194
AG
3370 if (err)
3371 return err;
b411b363 3372
daeda1cc
PR
3373 mutex_lock(&mdev->tconn->conf_update);
3374 old_net_conf = mdev->tconn->net_conf;
813472ce
PR
3375 if (get_ldev(mdev)) {
3376 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3377 if (!new_disk_conf) {
3378 put_ldev(mdev);
3379 mutex_unlock(&mdev->tconn->conf_update);
3380 dev_err(DEV, "Allocation of new disk_conf failed\n");
3381 return -ENOMEM;
3382 }
daeda1cc 3383
813472ce
PR
3384 old_disk_conf = mdev->ldev->disk_conf;
3385 *new_disk_conf = *old_disk_conf;
3386
6394b935 3387 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
813472ce 3388 }
daeda1cc 3389
b411b363
PR
3390 if (apv >= 88) {
3391 if (apv == 88) {
e4bad1bc
PR
3392 if (data_size > SHARED_SECRET_MAX || data_size == 0) {
3393 dev_err(DEV, "verify-alg of wrong size, "
3394 "peer wants %u, accepting only up to %u byte\n",
3395 data_size, SHARED_SECRET_MAX);
813472ce
PR
3396 err = -EIO;
3397 goto reconnect;
b411b363
PR
3398 }
3399
82bc0194 3400 err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size);
813472ce
PR
3401 if (err)
3402 goto reconnect;
b411b363
PR
3403 /* we expect NUL terminated string */
3404 /* but just in case someone tries to be evil */
3405 D_ASSERT(p->verify_alg[data_size-1] == 0);
3406 p->verify_alg[data_size-1] = 0;
3407
3408 } else /* apv >= 89 */ {
3409 /* we still expect NUL terminated strings */
3410 /* but just in case someone tries to be evil */
3411 D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3412 D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
3413 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3414 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3415 }
3416
2ec91e0e 3417 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
b411b363
PR
3418 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
3419 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3420 old_net_conf->verify_alg, p->verify_alg);
b411b363
PR
3421 goto disconnect;
3422 }
3423 verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
3424 p->verify_alg, "verify-alg");
3425 if (IS_ERR(verify_tfm)) {
3426 verify_tfm = NULL;
3427 goto disconnect;
3428 }
3429 }
3430
2ec91e0e 3431 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
b411b363
PR
3432 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
3433 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3434 old_net_conf->csums_alg, p->csums_alg);
b411b363
PR
3435 goto disconnect;
3436 }
3437 csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
3438 p->csums_alg, "csums-alg");
3439 if (IS_ERR(csums_tfm)) {
3440 csums_tfm = NULL;
3441 goto disconnect;
3442 }
3443 }
3444
813472ce 3445 if (apv > 94 && new_disk_conf) {
daeda1cc
PR
3446 new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3447 new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3448 new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3449 new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
778f271d 3450
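	/* Resize the resync controller's sample fifo if the peer changed the
	 * planning horizon (c_plan_ahead). */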
daeda1cc 3451 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
9958c857 3452 if (fifo_size != mdev->rs_plan_s->size) {
813472ce
PR
3453 new_plan = fifo_alloc(fifo_size);
3454 if (!new_plan) {
778f271d 3455 dev_err(DEV, "kmalloc of fifo_buffer failed");
f399002e 3456 put_ldev(mdev);
778f271d
PR
3457 goto disconnect;
3458 }
3459 }
8e26f9cc 3460 }
b411b363 3461
91fd4dad 3462 if (verify_tfm || csums_tfm) {
2ec91e0e
PR
3463 new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
3464 if (!new_net_conf) {
91fd4dad
PR
3465 dev_err(DEV, "Allocation of new net_conf failed\n");
3466 goto disconnect;
3467 }
3468
2ec91e0e 3469 *new_net_conf = *old_net_conf;
91fd4dad
PR
3470
3471 if (verify_tfm) {
2ec91e0e
PR
3472 strcpy(new_net_conf->verify_alg, p->verify_alg);
3473 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
91fd4dad
PR
3474 crypto_free_hash(mdev->tconn->verify_tfm);
3475 mdev->tconn->verify_tfm = verify_tfm;
3476 dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
3477 }
3478 if (csums_tfm) {
2ec91e0e
PR
3479 strcpy(new_net_conf->csums_alg, p->csums_alg);
3480 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
91fd4dad
PR
3481 crypto_free_hash(mdev->tconn->csums_tfm);
3482 mdev->tconn->csums_tfm = csums_tfm;
3483 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
3484 }
2ec91e0e 3485 rcu_assign_pointer(tconn->net_conf, new_net_conf);
b411b363 3486 }
daeda1cc 3487 }
91fd4dad 3488
813472ce
PR
3489 if (new_disk_conf) {
3490 rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
3491 put_ldev(mdev);
3492 }
3493
3494 if (new_plan) {
3495 old_plan = mdev->rs_plan_s;
3496 rcu_assign_pointer(mdev->rs_plan_s, new_plan);
b411b363 3497 }
daeda1cc
PR
3498
3499 mutex_unlock(&mdev->tconn->conf_update);
3500 synchronize_rcu();
3501 if (new_net_conf)
3502 kfree(old_net_conf);
3503 kfree(old_disk_conf);
813472ce 3504 kfree(old_plan);
daeda1cc 3505
82bc0194 3506 return 0;
b411b363 3507
813472ce
PR
3508reconnect:
3509 if (new_disk_conf) {
3510 put_ldev(mdev);
3511 kfree(new_disk_conf);
3512 }
3513 mutex_unlock(&mdev->tconn->conf_update);
3514 return -EIO;
3515
b411b363 3516disconnect:
813472ce
PR
3517 kfree(new_plan);
3518 if (new_disk_conf) {
3519 put_ldev(mdev);
3520 kfree(new_disk_conf);
3521 }
a0095508 3522 mutex_unlock(&mdev->tconn->conf_update);
b411b363
PR
3523 /* just for completeness: actually not needed,
3524 * as this is not reached if csums_tfm was ok. */
3525 crypto_free_hash(csums_tfm);
3526 /* but free the verify_tfm again, if csums_tfm did not work out */
3527 crypto_free_hash(verify_tfm);
38fa9988 3528 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3529 return -EIO;
b411b363
PR
3530}
3531
b411b363
PR
3532/* warn if the arguments differ by more than 12.5% */
3533static void warn_if_differ_considerably(struct drbd_conf *mdev,
3534 const char *s, sector_t a, sector_t b)
3535{
3536 sector_t d;
3537 if (a == 0 || b == 0)
3538 return;
3539 d = (a > b) ? (a - b) : (b - a);
3540 if (d > (a>>3) || d > (b>>3))
3541 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
3542 (unsigned long long)a, (unsigned long long)b);
3543}
3544
4a76b161 3545static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3546{
4a76b161 3547 struct drbd_conf *mdev;
e658983a 3548 struct p_sizes *p = pi->data;
b411b363 3549 enum determine_dev_size dd = unchanged;
b411b363
PR
3550 sector_t p_size, p_usize, my_usize;
3551 int ldsc = 0; /* local disk size changed */
e89b591c 3552 enum dds_flags ddsf;
b411b363 3553
4a76b161
AG
3554 mdev = vnr_to_mdev(tconn, pi->vnr);
3555 if (!mdev)
3556 return config_unknown_volume(tconn, pi);
3557
b411b363
PR
3558 p_size = be64_to_cpu(p->d_size);
3559 p_usize = be64_to_cpu(p->u_size);
3560
b411b363
PR
3561 /* just store the peer's disk size for now.
3562 * we still need to figure out whether we accept that. */
3563 mdev->p_size = p_size;
3564
b411b363 3565 if (get_ldev(mdev)) {
daeda1cc
PR
3566 rcu_read_lock();
3567 my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
3568 rcu_read_unlock();
3569
b411b363
PR
3570 warn_if_differ_considerably(mdev, "lower level device sizes",
3571 p_size, drbd_get_max_capacity(mdev->ldev));
3572 warn_if_differ_considerably(mdev, "user requested size",
daeda1cc 3573 p_usize, my_usize);
b411b363
PR
3574
3575 /* if this is the first connect, or an otherwise expected
3576 * param exchange, choose the minimum */
3577 if (mdev->state.conn == C_WF_REPORT_PARAMS)
daeda1cc 3578 p_usize = min_not_zero(my_usize, p_usize);
b411b363
PR
3579
3580 /* Never shrink a device with usable data during connect.
3581 But allow online shrinking if we are connected. */
ef5e44a6 3582 if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) <
daeda1cc
PR
3583 drbd_get_capacity(mdev->this_bdev) &&
3584 mdev->state.disk >= D_OUTDATED &&
3585 mdev->state.conn < C_CONNECTED) {
b411b363 3586 dev_err(DEV, "The peer's disk size is too small!\n");
38fa9988 3587 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 3588 put_ldev(mdev);
82bc0194 3589 return -EIO;
b411b363 3590 }
daeda1cc
PR
3591
3592 if (my_usize != p_usize) {
3593 struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
3594
3595 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3596 if (!new_disk_conf) {
3597 dev_err(DEV, "Allocation of new disk_conf failed\n");
3598 put_ldev(mdev);
3599 return -ENOMEM;
3600 }
3601
3602 mutex_lock(&mdev->tconn->conf_update);
3603 old_disk_conf = mdev->ldev->disk_conf;
3604 *new_disk_conf = *old_disk_conf;
3605 new_disk_conf->disk_size = p_usize;
3606
3607 rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
3608 mutex_unlock(&mdev->tconn->conf_update);
3609 synchronize_rcu();
3610 kfree(old_disk_conf);
3611
3612 dev_info(DEV, "Peer sets u_size to %lu sectors\n",
 3613 (unsigned long)p_usize);
3614 }
3615
b411b363
PR
3616 put_ldev(mdev);
3617 }
b411b363 3618
e89b591c 3619 ddsf = be16_to_cpu(p->dds_flags);
b411b363 3620 if (get_ldev(mdev)) {
24c4830c 3621 dd = drbd_determine_dev_size(mdev, ddsf);
b411b363
PR
3622 put_ldev(mdev);
3623 if (dd == dev_size_error)
82bc0194 3624 return -EIO;
b411b363
PR
3625 drbd_md_sync(mdev);
3626 } else {
3627 /* I am diskless, need to accept the peer's size. */
3628 drbd_set_my_capacity(mdev, p_size);
3629 }
3630
99432fcc
PR
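	/* Take note of the peer's maximal request size and re-evaluate our own limit. */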
3631 mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3632 drbd_reconsider_max_bio_size(mdev);
3633
b411b363
PR
3634 if (get_ldev(mdev)) {
3635 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3636 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3637 ldsc = 1;
3638 }
3639
b411b363
PR
3640 put_ldev(mdev);
3641 }
3642
3643 if (mdev->state.conn > C_WF_REPORT_PARAMS) {
3644 if (be64_to_cpu(p->c_size) !=
3645 drbd_get_capacity(mdev->this_bdev) || ldsc) {
3646 /* we have different sizes, probably peer
3647 * needs to know my new size... */
e89b591c 3648 drbd_send_sizes(mdev, 0, ddsf);
b411b363
PR
3649 }
3650 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
3651 (dd == grew && mdev->state.conn == C_CONNECTED)) {
3652 if (mdev->state.pdsk >= D_INCONSISTENT &&
e89b591c
PR
3653 mdev->state.disk >= D_INCONSISTENT) {
3654 if (ddsf & DDSF_NO_RESYNC)
3655 dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3656 else
3657 resync_after_online_grow(mdev);
3658 } else
b411b363
PR
3659 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
3660 }
3661 }
3662
82bc0194 3663 return 0;
b411b363
PR
3664}
3665
4a76b161 3666static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3667{
4a76b161 3668 struct drbd_conf *mdev;
e658983a 3669 struct p_uuids *p = pi->data;
b411b363 3670 u64 *p_uuid;
62b0da3a 3671 int i, updated_uuids = 0;
b411b363 3672
4a76b161
AG
3673 mdev = vnr_to_mdev(tconn, pi->vnr);
3674 if (!mdev)
3675 return config_unknown_volume(tconn, pi);
3676
b411b363
PR
3677 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3678
3679 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3680 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3681
3682 kfree(mdev->p_uuid);
3683 mdev->p_uuid = p_uuid;
3684
3685 if (mdev->state.conn < C_CONNECTED &&
3686 mdev->state.disk < D_INCONSISTENT &&
3687 mdev->state.role == R_PRIMARY &&
3688 (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3689 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3690 (unsigned long long)mdev->ed_uuid);
38fa9988 3691 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3692 return -EIO;
b411b363
PR
3693 }
3694
3695 if (get_ldev(mdev)) {
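		/* Our device was just created (current UUID == UUID_JUST_CREATED) and
		 * the peer set the corresponding flag in UI_FLAGS: adopt its current
		 * UUID and skip the initial full sync. */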
3696 int skip_initial_sync =
3697 mdev->state.conn == C_CONNECTED &&
31890f4a 3698 mdev->tconn->agreed_pro_version >= 90 &&
b411b363
PR
3699 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3700 (p_uuid[UI_FLAGS] & 8);
3701 if (skip_initial_sync) {
3702 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3703 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
20ceb2b2
LE
3704 "clear_n_write from receive_uuids",
3705 BM_LOCKED_TEST_ALLOWED);
b411b363
PR
3706 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3707 _drbd_uuid_set(mdev, UI_BITMAP, 0);
3708 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3709 CS_VERBOSE, NULL);
3710 drbd_md_sync(mdev);
62b0da3a 3711 updated_uuids = 1;
b411b363
PR
3712 }
3713 put_ldev(mdev);
18a50fa2
PR
3714 } else if (mdev->state.disk < D_INCONSISTENT &&
3715 mdev->state.role == R_PRIMARY) {
3716 /* I am a diskless primary, the peer just created a new current UUID
3717 for me. */
62b0da3a 3718 updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
b411b363
PR
3719 }
3720
3721 /* Before we test for the disk state, we should wait until an eventually
3722 ongoing cluster wide state change is finished. That is important if
3723 we are primary and are detaching from our disk. We need to see the
3724 new disk state... */
8410da8f
PR
3725 mutex_lock(mdev->state_mutex);
3726 mutex_unlock(mdev->state_mutex);
b411b363 3727 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
62b0da3a
LE
3728 updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3729
3730 if (updated_uuids)
3731 drbd_print_uuids(mdev, "receiver updated UUIDs to");
b411b363 3732
82bc0194 3733 return 0;
b411b363
PR
3734}
3735
3736/**
3737 * convert_state() - Converts the peer's view of the cluster state to our point of view
3738 * @ps: The state as seen by the peer.
3739 */
3740static union drbd_state convert_state(union drbd_state ps)
3741{
3742 union drbd_state ms;
3743
3744 static enum drbd_conns c_tab[] = {
369bea63 3745 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
b411b363
PR
3746 [C_CONNECTED] = C_CONNECTED,
3747
3748 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3749 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3750 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3751 [C_VERIFY_S] = C_VERIFY_T,
3752 [C_MASK] = C_MASK,
3753 };
3754
3755 ms.i = ps.i;
3756
3757 ms.conn = c_tab[ps.conn];
3758 ms.peer = ps.role;
3759 ms.role = ps.peer;
3760 ms.pdsk = ps.disk;
3761 ms.disk = ps.pdsk;
3762 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3763
3764 return ms;
3765}
3766
4a76b161 3767static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3768{
4a76b161 3769 struct drbd_conf *mdev;
e658983a 3770 struct p_req_state *p = pi->data;
b411b363 3771 union drbd_state mask, val;
bf885f8a 3772 enum drbd_state_rv rv;
b411b363 3773
4a76b161
AG
3774 mdev = vnr_to_mdev(tconn, pi->vnr);
3775 if (!mdev)
3776 return -EIO;
3777
b411b363
PR
3778 mask.i = be32_to_cpu(p->mask);
3779 val.i = be32_to_cpu(p->val);
3780
25703f83 3781 if (test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags) &&
8410da8f 3782 mutex_is_locked(mdev->state_mutex)) {
b411b363 3783 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
82bc0194 3784 return 0;
b411b363
PR
3785 }
3786
3787 mask = convert_state(mask);
3788 val = convert_state(val);
3789
dfafcc8a
PR
3790 rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3791 drbd_send_sr_reply(mdev, rv);
b411b363 3792
b411b363
PR
3793 drbd_md_sync(mdev);
3794
82bc0194 3795 return 0;
b411b363
PR
3796}
3797
e2857216 3798static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi)
dfafcc8a 3799{
e658983a 3800 struct p_req_state *p = pi->data;
dfafcc8a
PR
3801 union drbd_state mask, val;
3802 enum drbd_state_rv rv;
3803
3804 mask.i = be32_to_cpu(p->mask);
3805 val.i = be32_to_cpu(p->val);
3806
3807 if (test_bit(DISCARD_CONCURRENT, &tconn->flags) &&
3808 mutex_is_locked(&tconn->cstate_mutex)) {
3809 conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG);
82bc0194 3810 return 0;
dfafcc8a
PR
3811 }
3812
3813 mask = convert_state(mask);
3814 val = convert_state(val);
3815
778bcf2e 3816 rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
dfafcc8a
PR
3817 conn_send_sr_reply(tconn, rv);
3818
82bc0194 3819 return 0;
dfafcc8a
PR
3820}
3821
4a76b161 3822static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3823{
4a76b161 3824 struct drbd_conf *mdev;
e658983a 3825 struct p_state *p = pi->data;
4ac4aada 3826 union drbd_state os, ns, peer_state;
b411b363 3827 enum drbd_disk_state real_peer_disk;
65d922c3 3828 enum chg_state_flags cs_flags;
b411b363
PR
3829 int rv;
3830
4a76b161
AG
3831 mdev = vnr_to_mdev(tconn, pi->vnr);
3832 if (!mdev)
3833 return config_unknown_volume(tconn, pi);
3834
b411b363
PR
3835 peer_state.i = be32_to_cpu(p->state);
3836
3837 real_peer_disk = peer_state.disk;
3838 if (peer_state.disk == D_NEGOTIATING) {
3839 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3840 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3841 }
3842
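	/* Sample our current state; if it changes underneath us while we
	 * evaluate the peer's state below, we jump back to "retry". */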
87eeee41 3843 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 3844 retry:
78bae59b 3845 os = ns = drbd_read_state(mdev);
87eeee41 3846 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 3847
b8853dbd
PR
3848 /* If some other part of the code (asender thread, timeout)
3849 * already decided to close the connection again,
3850 * we must not "re-establish" it here. */
3851 if (os.conn <= C_TEAR_DOWN)
58ffa580 3852 return -ECONNRESET;
b8853dbd 3853
9bcd2521
PR
3854 /* If this is the "end of sync" confirmation, usually the peer disk
 3855 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For an empty (0 bits
 3856 * set) resync that started in PausedSyncT, or if the timing of pause-/
3857 * unpause-sync events has been "just right", the peer disk may
3858 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
3859 */
3860 if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
3861 real_peer_disk == D_UP_TO_DATE &&
e9ef7bb6
LE
3862 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
3863 /* If we are (becoming) SyncSource, but peer is still in sync
3864 * preparation, ignore its uptodate-ness to avoid flapping, it
3865 * will change to inconsistent once the peer reaches active
3866 * syncing states.
3867 * It may have changed syncer-paused flags, however, so we
3868 * cannot ignore this completely. */
3869 if (peer_state.conn > C_CONNECTED &&
3870 peer_state.conn < C_SYNC_SOURCE)
3871 real_peer_disk = D_INCONSISTENT;
3872
3873 /* if peer_state changes to connected at the same time,
3874 * it explicitly notifies us that it finished resync.
3875 * Maybe we should finish it up, too? */
3876 else if (os.conn >= C_SYNC_SOURCE &&
3877 peer_state.conn == C_CONNECTED) {
3878 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
3879 drbd_resync_finished(mdev);
82bc0194 3880 return 0;
e9ef7bb6
LE
3881 }
3882 }
3883
58ffa580
LE
3884 /* explicit verify finished notification, stop sector reached. */
3885 if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
3886 peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
3887 ov_out_of_sync_print(mdev);
3888 drbd_resync_finished(mdev);
3889 return 0;
3890 }
3891
e9ef7bb6
LE
3892 /* peer says his disk is inconsistent, while we think it is uptodate,
3893 * and this happens while the peer still thinks we have a sync going on,
3894 * but we think we are already done with the sync.
3895 * We ignore this to avoid flapping pdsk.
3896 * This should not happen, if the peer is a recent version of drbd. */
3897 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
3898 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
3899 real_peer_disk = D_UP_TO_DATE;
3900
4ac4aada
LE
3901 if (ns.conn == C_WF_REPORT_PARAMS)
3902 ns.conn = C_CONNECTED;
b411b363 3903
67531718
PR
3904 if (peer_state.conn == C_AHEAD)
3905 ns.conn = C_BEHIND;
3906
b411b363
PR
3907 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3908 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3909 int cr; /* consider resync */
3910
3911 /* if we established a new connection */
4ac4aada 3912 cr = (os.conn < C_CONNECTED);
b411b363
PR
3913 /* if we had an established connection
3914 * and one of the nodes newly attaches a disk */
4ac4aada 3915 cr |= (os.conn == C_CONNECTED &&
b411b363 3916 (peer_state.disk == D_NEGOTIATING ||
4ac4aada 3917 os.disk == D_NEGOTIATING));
b411b363
PR
3918 /* if we have both been inconsistent, and the peer has been
3919 * forced to be UpToDate with --overwrite-data */
3920 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3921 /* if we had been plain connected, and the admin requested to
3922 * start a sync by "invalidate" or "invalidate-remote" */
4ac4aada 3923 cr |= (os.conn == C_CONNECTED &&
b411b363
PR
3924 (peer_state.conn >= C_STARTING_SYNC_S &&
3925 peer_state.conn <= C_WF_BITMAP_T));
3926
3927 if (cr)
4ac4aada 3928 ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
b411b363
PR
3929
3930 put_ldev(mdev);
4ac4aada
LE
3931 if (ns.conn == C_MASK) {
3932 ns.conn = C_CONNECTED;
b411b363 3933 if (mdev->state.disk == D_NEGOTIATING) {
82f59cc6 3934 drbd_force_state(mdev, NS(disk, D_FAILED));
b411b363
PR
3935 } else if (peer_state.disk == D_NEGOTIATING) {
3936 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3937 peer_state.disk = D_DISKLESS;
580b9767 3938 real_peer_disk = D_DISKLESS;
b411b363 3939 } else {
8169e41b 3940 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags))
82bc0194 3941 return -EIO;
4ac4aada 3942 D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
38fa9988 3943 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3944 return -EIO;
b411b363
PR
3945 }
3946 }
3947 }
3948
87eeee41 3949 spin_lock_irq(&mdev->tconn->req_lock);
78bae59b 3950 if (os.i != drbd_read_state(mdev).i)
b411b363
PR
3951 goto retry;
3952 clear_bit(CONSIDER_RESYNC, &mdev->flags);
b411b363
PR
3953 ns.peer = peer_state.role;
3954 ns.pdsk = real_peer_disk;
3955 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4ac4aada 3956 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
b411b363 3957 ns.disk = mdev->new_state_tmp.disk;
4ac4aada 3958 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
2aebfabb 3959 if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
481c6f50 3960 test_bit(NEW_CUR_UUID, &mdev->flags)) {
8554df1c 3961 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
481c6f50 3962 for temporary network outages! */
87eeee41 3963 spin_unlock_irq(&mdev->tconn->req_lock);
481c6f50 3964 dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
2f5cdd0b 3965 tl_clear(mdev->tconn);
481c6f50
PR
3966 drbd_uuid_new_current(mdev);
3967 clear_bit(NEW_CUR_UUID, &mdev->flags);
38fa9988 3968 conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
82bc0194 3969 return -EIO;
481c6f50 3970 }
65d922c3 3971 rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
78bae59b 3972 ns = drbd_read_state(mdev);
87eeee41 3973 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
3974
3975 if (rv < SS_SUCCESS) {
38fa9988 3976 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3977 return -EIO;
b411b363
PR
3978 }
3979
4ac4aada
LE
3980 if (os.conn > C_WF_REPORT_PARAMS) {
3981 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
b411b363
PR
3982 peer_state.disk != D_NEGOTIATING ) {
3983 /* we want resync, peer has not yet decided to sync... */
3984 /* Nowadays only used when forcing a node into primary role and
3985 setting its disk to UpToDate with that */
3986 drbd_send_uuids(mdev);
43de7c85 3987 drbd_send_current_state(mdev);
b411b363
PR
3988 }
3989 }
3990
08b165ba 3991 clear_bit(DISCARD_MY_DATA, &mdev->flags);
b411b363
PR
3992
3993 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3994
82bc0194 3995 return 0;
b411b363
PR
3996}
3997
4a76b161 3998static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3999{
4a76b161 4000 struct drbd_conf *mdev;
e658983a 4001 struct p_rs_uuid *p = pi->data;
4a76b161
AG
4002
4003 mdev = vnr_to_mdev(tconn, pi->vnr);
4004 if (!mdev)
4005 return -EIO;
b411b363
PR
4006
4007 wait_event(mdev->misc_wait,
4008 mdev->state.conn == C_WF_SYNC_UUID ||
c4752ef1 4009 mdev->state.conn == C_BEHIND ||
b411b363
PR
4010 mdev->state.conn < C_CONNECTED ||
4011 mdev->state.disk < D_NEGOTIATING);
4012
4013 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
4014
b411b363
PR
4015 /* Here the _drbd_uuid_ functions are right, current should
4016 _not_ be rotated into the history */
4017 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
4018 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
4019 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
4020
62b0da3a 4021 drbd_print_uuids(mdev, "updated sync uuid");
b411b363
PR
4022 drbd_start_resync(mdev, C_SYNC_TARGET);
4023
4024 put_ldev(mdev);
4025 } else
4026 dev_err(DEV, "Ignoring SyncUUID packet!\n");
4027
82bc0194 4028 return 0;
b411b363
PR
4029}
4030
2c46407d
AG
4031/**
4032 * receive_bitmap_plain
4033 *
4034 * Return 0 when done, 1 when another iteration is needed, and a negative error
4035 * code upon failure.
4036 */
4037static int
50d0b1ad 4038receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size,
e658983a 4039 unsigned long *p, struct bm_xfer_ctx *c)
b411b363 4040{
50d0b1ad
AG
4041 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
4042 drbd_header_size(mdev->tconn);
e658983a 4043 unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
50d0b1ad 4044 c->bm_words - c->word_offset);
e658983a 4045 unsigned int want = num_words * sizeof(*p);
2c46407d 4046 int err;
b411b363 4047
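	/* A plain bitmap packet must carry exactly the number of words we still
	 * expect for this part of the transfer. */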
50d0b1ad
AG
4048 if (want != size) {
4049 dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size);
2c46407d 4050 return -EIO;
b411b363
PR
4051 }
4052 if (want == 0)
2c46407d 4053 return 0;
e658983a 4054 err = drbd_recv_all(mdev->tconn, p, want);
82bc0194 4055 if (err)
2c46407d 4056 return err;
b411b363 4057
e658983a 4058 drbd_bm_merge_lel(mdev, c->word_offset, num_words, p);
b411b363
PR
4059
4060 c->word_offset += num_words;
4061 c->bit_offset = c->word_offset * BITS_PER_LONG;
4062 if (c->bit_offset > c->bm_bits)
4063 c->bit_offset = c->bm_bits;
4064
2c46407d 4065 return 1;
b411b363
PR
4066}
4067
a02d1240
AG
4068static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4069{
4070 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4071}
4072
4073static int dcbp_get_start(struct p_compressed_bm *p)
4074{
4075 return (p->encoding & 0x80) != 0;
4076}
4077
4078static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4079{
4080 return (p->encoding >> 4) & 0x7;
4081}
4082
2c46407d
AG
4083/**
4084 * recv_bm_rle_bits
4085 *
4086 * Return 0 when done, 1 when another iteration is needed, and a negative error
4087 * code upon failure.
4088 */
4089static int
b411b363
PR
4090recv_bm_rle_bits(struct drbd_conf *mdev,
4091 struct p_compressed_bm *p,
c6d25cfe
PR
4092 struct bm_xfer_ctx *c,
4093 unsigned int len)
b411b363
PR
4094{
4095 struct bitstream bs;
4096 u64 look_ahead;
4097 u64 rl;
4098 u64 tmp;
4099 unsigned long s = c->bit_offset;
4100 unsigned long e;
a02d1240 4101 int toggle = dcbp_get_start(p);
b411b363
PR
4102 int have;
4103 int bits;
4104
a02d1240 4105 bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
b411b363
PR
4106
4107 bits = bitstream_get_bits(&bs, &look_ahead, 64);
4108 if (bits < 0)
2c46407d 4109 return -EIO;
b411b363
PR
4110
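	/* The payload is a sequence of VLI-encoded run lengths, alternating
	 * between runs of clear and set bits; "toggle" says which kind the
	 * first run is, and only set-runs are applied to the bitmap. */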
4111 for (have = bits; have > 0; s += rl, toggle = !toggle) {
4112 bits = vli_decode_bits(&rl, look_ahead);
4113 if (bits <= 0)
2c46407d 4114 return -EIO;
b411b363
PR
4115
4116 if (toggle) {
4117 e = s + rl -1;
4118 if (e >= c->bm_bits) {
4119 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
2c46407d 4120 return -EIO;
b411b363
PR
4121 }
4122 _drbd_bm_set_bits(mdev, s, e);
4123 }
4124
4125 if (have < bits) {
4126 dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
4127 have, bits, look_ahead,
4128 (unsigned int)(bs.cur.b - p->code),
4129 (unsigned int)bs.buf_len);
2c46407d 4130 return -EIO;
b411b363
PR
4131 }
4132 look_ahead >>= bits;
4133 have -= bits;
4134
4135 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
4136 if (bits < 0)
2c46407d 4137 return -EIO;
b411b363
PR
4138 look_ahead |= tmp << have;
4139 have += bits;
4140 }
4141
4142 c->bit_offset = s;
4143 bm_xfer_ctx_bit_to_word_offset(c);
4144
2c46407d 4145 return (s != c->bm_bits);
b411b363
PR
4146}
4147
2c46407d
AG
4148/**
4149 * decode_bitmap_c
4150 *
4151 * Return 0 when done, 1 when another iteration is needed, and a negative error
4152 * code upon failure.
4153 */
4154static int
b411b363
PR
4155decode_bitmap_c(struct drbd_conf *mdev,
4156 struct p_compressed_bm *p,
c6d25cfe
PR
4157 struct bm_xfer_ctx *c,
4158 unsigned int len)
b411b363 4159{
a02d1240 4160 if (dcbp_get_code(p) == RLE_VLI_Bits)
e658983a 4161 return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p));
b411b363
PR
4162
4163 /* other variants had been implemented for evaluation,
4164 * but have been dropped as this one turned out to be "best"
4165 * during all our tests. */
4166
4167 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
38fa9988 4168 conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
2c46407d 4169 return -EIO;
b411b363
PR
4170}
4171
4172void INFO_bm_xfer_stats(struct drbd_conf *mdev,
4173 const char *direction, struct bm_xfer_ctx *c)
4174{
4175 /* what would it take to transfer it "plaintext" */
50d0b1ad
AG
4176 unsigned int header_size = drbd_header_size(mdev->tconn);
4177 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4178 unsigned int plain =
4179 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4180 c->bm_words * sizeof(unsigned long);
4181 unsigned int total = c->bytes[0] + c->bytes[1];
4182 unsigned int r;
b411b363
PR
4183
4184 /* total can not be zero. but just in case: */
4185 if (total == 0)
4186 return;
4187
4188 /* don't report if not compressed */
4189 if (total >= plain)
4190 return;
4191
4192 /* total < plain. check for overflow, still */
4193 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4194 : (1000 * total / plain);
4195
4196 if (r > 1000)
4197 r = 1000;
4198
4199 r = 1000 - r;
4200 dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
4201 "total %u; compression: %u.%u%%\n",
4202 direction,
4203 c->bytes[1], c->packets[1],
4204 c->bytes[0], c->packets[0],
4205 total, r/10, r % 10);
4206}
4207
4208/* Since we are processing the bitfield from lower addresses to higher,
 4209 it does not matter whether we process it in 32 bit chunks or 64 bit
 4210 chunks as long as it is little endian. (Understand it as a byte stream,
 4211 beginning with the lowest byte...) If we used big endian
 4212 we would need to process it from the highest address to the lowest,
4213 in order to be agnostic to the 32 vs 64 bits issue.
4214
 4215 returns 0 on success, a negative error code otherwise. */
4a76b161 4216static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4217{
4a76b161 4218 struct drbd_conf *mdev;
b411b363 4219 struct bm_xfer_ctx c;
2c46407d 4220 int err;
4a76b161
AG
4221
4222 mdev = vnr_to_mdev(tconn, pi->vnr);
4223 if (!mdev)
4224 return -EIO;
b411b363 4225
20ceb2b2
LE
4226 drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
4227 /* you are supposed to send additional out-of-sync information
4228 * if you actually set bits during this phase */
b411b363 4229
b411b363
PR
4230 c = (struct bm_xfer_ctx) {
4231 .bm_bits = drbd_bm_bits(mdev),
4232 .bm_words = drbd_bm_words(mdev),
4233 };
4234
2c46407d 4235 for(;;) {
e658983a
AG
4236 if (pi->cmd == P_BITMAP)
4237 err = receive_bitmap_plain(mdev, pi->size, pi->data, &c);
4238 else if (pi->cmd == P_COMPRESSED_BITMAP) {
b411b363
PR
4239 /* MAYBE: sanity check that we speak proto >= 90,
4240 * and the feature is enabled! */
e658983a 4241 struct p_compressed_bm *p = pi->data;
b411b363 4242
50d0b1ad 4243 if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) {
b411b363 4244 dev_err(DEV, "ReportCBitmap packet too large\n");
82bc0194 4245 err = -EIO;
b411b363
PR
4246 goto out;
4247 }
e658983a 4248 if (pi->size <= sizeof(*p)) {
e2857216 4249 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size);
82bc0194 4250 err = -EIO;
78fcbdae 4251 goto out;
b411b363 4252 }
e658983a
AG
4253 err = drbd_recv_all(mdev->tconn, p, pi->size);
4254 if (err)
4255 goto out;
e2857216 4256 err = decode_bitmap_c(mdev, p, &c, pi->size);
b411b363 4257 } else {
e2857216 4258 dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
82bc0194 4259 err = -EIO;
b411b363
PR
4260 goto out;
4261 }
4262
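		/* Index 1 counts plain P_BITMAP packets, index 0 the compressed ones. */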
e2857216 4263 c.packets[pi->cmd == P_BITMAP]++;
50d0b1ad 4264 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size;
b411b363 4265
2c46407d
AG
4266 if (err <= 0) {
4267 if (err < 0)
4268 goto out;
b411b363 4269 break;
2c46407d 4270 }
e2857216 4271 err = drbd_recv_header(mdev->tconn, pi);
82bc0194 4272 if (err)
b411b363 4273 goto out;
2c46407d 4274 }
b411b363
PR
4275
4276 INFO_bm_xfer_stats(mdev, "receive", &c);
4277
4278 if (mdev->state.conn == C_WF_BITMAP_T) {
de1f8e4a
AG
4279 enum drbd_state_rv rv;
4280
82bc0194
AG
4281 err = drbd_send_bitmap(mdev);
4282 if (err)
b411b363
PR
4283 goto out;
4284 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
de1f8e4a
AG
4285 rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
4286 D_ASSERT(rv == SS_SUCCESS);
b411b363
PR
4287 } else if (mdev->state.conn != C_WF_BITMAP_S) {
4288 /* admin may have requested C_DISCONNECTING,
4289 * other threads may have noticed network errors */
4290 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
4291 drbd_conn_str(mdev->state.conn));
4292 }
82bc0194 4293 err = 0;
b411b363 4294
b411b363 4295 out:
20ceb2b2 4296 drbd_bm_unlock(mdev);
82bc0194 4297 if (!err && mdev->state.conn == C_WF_BITMAP_S)
b411b363 4298 drbd_start_resync(mdev, C_SYNC_SOURCE);
82bc0194 4299 return err;
b411b363
PR
4300}
4301
4a76b161 4302static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4303{
4a76b161 4304 conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n",
e2857216 4305 pi->cmd, pi->size);
2de876ef 4306
4a76b161 4307 return ignore_remaining_packet(tconn, pi);
2de876ef
PR
4308}
4309
4a76b161 4310static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi)
0ced55a3 4311{
e7f52dfb
LE
4312 /* Make sure we've acked all the TCP data associated
4313 * with the data requests being unplugged */
4a76b161 4314 drbd_tcp_quickack(tconn->data.socket);
0ced55a3 4315
82bc0194 4316 return 0;
0ced55a3
PR
4317}
4318
4a76b161 4319static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi)
73a01a18 4320{
4a76b161 4321 struct drbd_conf *mdev;
e658983a 4322 struct p_block_desc *p = pi->data;
4a76b161
AG
4323
4324 mdev = vnr_to_mdev(tconn, pi->vnr);
4325 if (!mdev)
4326 return -EIO;
73a01a18 4327
f735e363
LE
4328 switch (mdev->state.conn) {
4329 case C_WF_SYNC_UUID:
4330 case C_WF_BITMAP_T:
4331 case C_BEHIND:
4332 break;
4333 default:
4334 dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
4335 drbd_conn_str(mdev->state.conn));
4336 }
4337
73a01a18
PR
4338 drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
4339
82bc0194 4340 return 0;
73a01a18
PR
4341}
4342
02918be2
PR
4343struct data_cmd {
4344 int expect_payload;
4345 size_t pkt_size;
4a76b161 4346 int (*fn)(struct drbd_tconn *, struct packet_info *);
02918be2
PR
4347};
4348
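/* Receiver dispatch table: whether a payload may follow, the size of the
 * fixed sub-header, and the handler to call. */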
4349static struct data_cmd drbd_cmd_handler[] = {
4a76b161
AG
4350 [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
4351 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
4352 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
4353 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
e658983a
AG
4354 [P_BITMAP] = { 1, 0, receive_bitmap } ,
4355 [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
4356 [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote },
4a76b161
AG
4357 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4358 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
e658983a
AG
4359 [P_SYNC_PARAM] = { 1, 0, receive_SyncParam },
4360 [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam },
4a76b161
AG
4361 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
4362 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
4363 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
4364 [P_STATE] = { 0, sizeof(struct p_state), receive_state },
4365 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
4366 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
4367 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4368 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4369 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4370 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
4371 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4372 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
036b17ea 4373 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
b411b363
PR
4374};
4375
eefc2f7d 4376static void drbdd(struct drbd_tconn *tconn)
b411b363 4377{
77351055 4378 struct packet_info pi;
02918be2 4379 size_t shs; /* sub header size */
82bc0194 4380 int err;
b411b363 4381
eefc2f7d 4382 while (get_t_state(&tconn->receiver) == RUNNING) {
deebe195
AG
4383 struct data_cmd *cmd;
4384
eefc2f7d 4385 drbd_thread_current_set_cpu(&tconn->receiver);
69bc7bc3 4386 if (drbd_recv_header(tconn, &pi))
02918be2 4387 goto err_out;
b411b363 4388
deebe195 4389 cmd = &drbd_cmd_handler[pi.cmd];
4a76b161 4390 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
2fcb8f30
AG
4391 conn_err(tconn, "Unexpected data packet %s (0x%04x)",
4392 cmdname(pi.cmd), pi.cmd);
02918be2 4393 goto err_out;
0b33a916 4394 }
b411b363 4395
e658983a
AG
4396 shs = cmd->pkt_size;
4397 if (pi.size > shs && !cmd->expect_payload) {
2fcb8f30
AG
4398 conn_err(tconn, "No payload expected %s l:%d\n",
4399 cmdname(pi.cmd), pi.size);
02918be2 4400 goto err_out;
b411b363 4401 }
b411b363 4402
c13f7e1a 4403 if (shs) {
e658983a 4404 err = drbd_recv_all_warn(tconn, pi.data, shs);
a5c31904 4405 if (err)
c13f7e1a 4406 goto err_out;
e2857216 4407 pi.size -= shs;
c13f7e1a
LE
4408 }
4409
4a76b161
AG
4410 err = cmd->fn(tconn, &pi);
4411 if (err) {
9f5bdc33
AG
4412 conn_err(tconn, "error receiving %s, e: %d l: %d!\n",
4413 cmdname(pi.cmd), err, pi.size);
02918be2 4414 goto err_out;
b411b363
PR
4415 }
4416 }
82bc0194 4417 return;
b411b363 4418
82bc0194
AG
4419 err_out:
4420 conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
b411b363
PR
4421}
4422
0e29d163 4423void conn_flush_workqueue(struct drbd_tconn *tconn)
b411b363
PR
4424{
4425 struct drbd_wq_barrier barr;
4426
4427 barr.w.cb = w_prev_work_done;
0e29d163 4428 barr.w.tconn = tconn;
b411b363 4429 init_completion(&barr.done);
d5b27b01 4430 drbd_queue_work(&tconn->sender_work, &barr.w);
b411b363
PR
4431 wait_for_completion(&barr.done);
4432}
4433
81fa2e67 4434static void conn_disconnect(struct drbd_tconn *tconn)
b411b363 4435{
c141ebda 4436 struct drbd_conf *mdev;
bbeb641c 4437 enum drbd_conns oc;
376694a0 4438 int vnr;
b411b363 4439
bbeb641c 4440 if (tconn->cstate == C_STANDALONE)
b411b363 4441 return;
b411b363 4442
b8853dbd
PR
4443 /* We are about to start the cleanup after connection loss.
4444 * Make sure drbd_make_request knows about that.
4445 * Usually we should be in some network failure state already,
4446 * but just in case we are not, we fix it up here.
4447 */
4448 conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
4449
b411b363 4450 /* asender does not clean up anything. it must not interfere, either */
360cc740
PR
4451 drbd_thread_stop(&tconn->asender);
4452 drbd_free_sock(tconn);
4453
c141ebda
PR
4454 rcu_read_lock();
4455 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
4456 kref_get(&mdev->kref);
4457 rcu_read_unlock();
4458 drbd_disconnected(mdev);
4459 kref_put(&mdev->kref, &drbd_minor_destroy);
4460 rcu_read_lock();
4461 }
4462 rcu_read_unlock();
4463
12038a3a
PR
4464 if (!list_empty(&tconn->current_epoch->list))
4465 conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n");
4466 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
4467 atomic_set(&tconn->current_epoch->epoch_size, 0);
b6dd1a89 4468 tconn->send.seen_any_write_yet = false;
12038a3a 4469
360cc740
PR
4470 conn_info(tconn, "Connection closed\n");
4471
cb703454
PR
4472 if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN)
4473 conn_try_outdate_peer_async(tconn);
4474
360cc740 4475 spin_lock_irq(&tconn->req_lock);
bbeb641c
PR
4476 oc = tconn->cstate;
4477 if (oc >= C_UNCONNECTED)
376694a0 4478 _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
bbeb641c 4479
360cc740
PR
4480 spin_unlock_irq(&tconn->req_lock);
4481
f3dfa40a 4482 if (oc == C_DISCONNECTING)
d9cc6e23 4483 conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
360cc740
PR
4484}
4485
c141ebda 4486static int drbd_disconnected(struct drbd_conf *mdev)
360cc740 4487{
360cc740 4488 unsigned int i;
b411b363 4489
85719573 4490 /* wait for current activity to cease. */
87eeee41 4491 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
4492 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
4493 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
4494 _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
87eeee41 4495 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
4496
4497 /* We do not have data structures that would allow us to
4498 * get the rs_pending_cnt down to 0 again.
4499 * * On C_SYNC_TARGET we do not have any data structures describing
4500 * the pending RSDataRequest's we have sent.
4501 * * On C_SYNC_SOURCE there is no data structure that tracks
4502 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
4503 * And no, it is not the sum of the reference counts in the
4504 * resync_LRU. The resync_LRU tracks the whole operation including
4505 * the disk-IO, while the rs_pending_cnt only tracks the blocks
4506 * on the fly. */
4507 drbd_rs_cancel_all(mdev);
4508 mdev->rs_total = 0;
4509 mdev->rs_failed = 0;
4510 atomic_set(&mdev->rs_pending_cnt, 0);
4511 wake_up(&mdev->misc_wait);
4512
b411b363 4513 del_timer_sync(&mdev->resync_timer);
b411b363
PR
4514 resync_timer_fn((unsigned long)mdev);
4515
b411b363
PR
4516 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
4517 * w_make_resync_request etc. which may still be on the worker queue
4518 * to be "canceled" */
a21e9298 4519 drbd_flush_workqueue(mdev);
b411b363 4520
a990be46 4521 drbd_finish_peer_reqs(mdev);
b411b363 4522
d10b4ea3
PR
4523 /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
 4524 might have queued work again. The one before drbd_finish_peer_reqs() is
 4525 necessary to reclaim net_ee in drbd_finish_peer_reqs(). */
4526 drbd_flush_workqueue(mdev);
4527
b411b363
PR
4528 kfree(mdev->p_uuid);
4529 mdev->p_uuid = NULL;
4530
2aebfabb 4531 if (!drbd_suspended(mdev))
2f5cdd0b 4532 tl_clear(mdev->tconn);
b411b363 4533
b411b363
PR
4534 drbd_md_sync(mdev);
4535
20ceb2b2
LE
4536 /* serialize with bitmap writeout triggered by the state change,
4537 * if any. */
4538 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
4539
b411b363
PR
4540 /* tcp_close and release of sendpage pages can be deferred. I don't
4541 * want to use SO_LINGER, because apparently it can be deferred for
4542 * more than 20 seconds (longest time I checked).
4543 *
4544 * Actually we don't care for exactly when the network stack does its
4545 * put_page(), but release our reference on these pages right here.
4546 */
7721f567 4547 i = drbd_free_peer_reqs(mdev, &mdev->net_ee);
b411b363
PR
4548 if (i)
4549 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
435f0740
LE
4550 i = atomic_read(&mdev->pp_in_use_by_net);
4551 if (i)
4552 dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
b411b363
PR
4553 i = atomic_read(&mdev->pp_in_use);
4554 if (i)
45bb912b 4555 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
b411b363
PR
4556
4557 D_ASSERT(list_empty(&mdev->read_ee));
4558 D_ASSERT(list_empty(&mdev->active_ee));
4559 D_ASSERT(list_empty(&mdev->sync_ee));
4560 D_ASSERT(list_empty(&mdev->done_ee));
4561
360cc740 4562 return 0;
b411b363
PR
4563}
4564
4565/*
4566 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
4567 * we can agree on is stored in agreed_pro_version.
4568 *
4569 * feature flags and the reserved array should be enough room for future
4570 * enhancements of the handshake protocol, and possible plugins...
4571 *
4572 * for now, they are expected to be zero, but ignored.
4573 */
6038178e 4574static int drbd_send_features(struct drbd_tconn *tconn)
b411b363 4575{
9f5bdc33
AG
4576 struct drbd_socket *sock;
4577 struct p_connection_features *p;
b411b363 4578
9f5bdc33
AG
4579 sock = &tconn->data;
4580 p = conn_prepare_command(tconn, sock);
4581 if (!p)
e8d17b01 4582 return -EIO;
b411b363
PR
4583 memset(p, 0, sizeof(*p));
4584 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
4585 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
9f5bdc33 4586 return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
b411b363
PR
4587}
4588
4589/*
4590 * return values:
4591 * 1 yes, we have a valid connection
4592 * 0 oops, did not work out, please try again
4593 * -1 peer talks different language,
4594 * no point in trying again, please go standalone.
4595 */
6038178e 4596static int drbd_do_features(struct drbd_tconn *tconn)
b411b363 4597{
65d11ed6 4598 /* ASSERT current == tconn->receiver ... */
e658983a
AG
4599 struct p_connection_features *p;
4600 const int expect = sizeof(struct p_connection_features);
77351055 4601 struct packet_info pi;
a5c31904 4602 int err;
b411b363 4603
6038178e 4604 err = drbd_send_features(tconn);
e8d17b01 4605 if (err)
b411b363
PR
4606 return 0;
4607
69bc7bc3
AG
4608 err = drbd_recv_header(tconn, &pi);
4609 if (err)
b411b363
PR
4610 return 0;
4611
6038178e
AG
4612 if (pi.cmd != P_CONNECTION_FEATURES) {
4613 conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
2fcb8f30 4614 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4615 return -1;
4616 }
4617
77351055 4618 if (pi.size != expect) {
6038178e 4619 conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n",
77351055 4620 expect, pi.size);
b411b363
PR
4621 return -1;
4622 }
4623
e658983a
AG
4624 p = pi.data;
4625 err = drbd_recv_all_warn(tconn, p, expect);
a5c31904 4626 if (err)
b411b363 4627 return 0;
b411b363 4628
b411b363
PR
4629 p->protocol_min = be32_to_cpu(p->protocol_min);
4630 p->protocol_max = be32_to_cpu(p->protocol_max);
4631 if (p->protocol_max == 0)
4632 p->protocol_max = p->protocol_min;
4633
4634 if (PRO_VERSION_MAX < p->protocol_min ||
4635 PRO_VERSION_MIN > p->protocol_max)
4636 goto incompat;
4637
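	/* Use the highest protocol version that both sides support. */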
65d11ed6 4638 tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
b411b363 4639
65d11ed6
PR
4640 conn_info(tconn, "Handshake successful: "
4641 "Agreed network protocol version %d\n", tconn->agreed_pro_version);
b411b363
PR
4642
4643 return 1;
4644
4645 incompat:
65d11ed6 4646 conn_err(tconn, "incompatible DRBD dialects: "
b411b363
PR
4647 "I support %d-%d, peer supports %d-%d\n",
4648 PRO_VERSION_MIN, PRO_VERSION_MAX,
4649 p->protocol_min, p->protocol_max);
4650 return -1;
4651}
4652
4653#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
13e6037d 4654static int drbd_do_auth(struct drbd_tconn *tconn)
b411b363
PR
4655{
4656 dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
4657 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
b10d96cb 4658 return -1;
b411b363
PR
4659}
4660#else
4661#define CHALLENGE_LEN 64
b10d96cb
JT
4662
4663/* Return value:
4664 1 - auth succeeded,
4665 0 - failed, try again (network error),
4666 -1 - auth failed, don't try again.
4667*/
4668
13e6037d 4669static int drbd_do_auth(struct drbd_tconn *tconn)
b411b363 4670{
9f5bdc33 4671 struct drbd_socket *sock;
b411b363
PR
4672 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
4673 struct scatterlist sg;
4674 char *response = NULL;
4675 char *right_response = NULL;
4676 char *peers_ch = NULL;
44ed167d
PR
4677 unsigned int key_len;
4678 char secret[SHARED_SECRET_MAX]; /* 64 byte */
b411b363
PR
4679 unsigned int resp_size;
4680 struct hash_desc desc;
77351055 4681 struct packet_info pi;
44ed167d 4682 struct net_conf *nc;
69bc7bc3 4683 int err, rv;
b411b363 4684
9f5bdc33
AG
4685 /* FIXME: Put the challenge/response into the preallocated socket buffer. */
4686
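	/* Mutual challenge/response: we send a random challenge, answer the
	 * peer's challenge with HMAC(shared secret, peers_ch), and verify the
	 * peer's answer against HMAC(shared secret, my_challenge). */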
44ed167d
PR
4687 rcu_read_lock();
4688 nc = rcu_dereference(tconn->net_conf);
4689 key_len = strlen(nc->shared_secret);
4690 memcpy(secret, nc->shared_secret, key_len);
4691 rcu_read_unlock();
4692
13e6037d 4693 desc.tfm = tconn->cram_hmac_tfm;
b411b363
PR
4694 desc.flags = 0;
4695
44ed167d 4696 rv = crypto_hash_setkey(tconn->cram_hmac_tfm, (u8 *)secret, key_len);
b411b363 4697 if (rv) {
13e6037d 4698 conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv);
b10d96cb 4699 rv = -1;
b411b363
PR
4700 goto fail;
4701 }
4702
4703 get_random_bytes(my_challenge, CHALLENGE_LEN);
4704
9f5bdc33
AG
4705 sock = &tconn->data;
4706 if (!conn_prepare_command(tconn, sock)) {
4707 rv = 0;
4708 goto fail;
4709 }
e658983a 4710 rv = !conn_send_command(tconn, sock, P_AUTH_CHALLENGE, 0,
9f5bdc33 4711 my_challenge, CHALLENGE_LEN);
b411b363
PR
4712 if (!rv)
4713 goto fail;
4714
69bc7bc3
AG
4715 err = drbd_recv_header(tconn, &pi);
4716 if (err) {
4717 rv = 0;
b411b363 4718 goto fail;
69bc7bc3 4719 }
b411b363 4720
77351055 4721 if (pi.cmd != P_AUTH_CHALLENGE) {
13e6037d 4722 conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n",
2fcb8f30 4723 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4724 rv = 0;
4725 goto fail;
4726 }
4727
77351055 4728 if (pi.size > CHALLENGE_LEN * 2) {
13e6037d 4729 conn_err(tconn, "expected AuthChallenge payload too big.\n");
b10d96cb 4730 rv = -1;
b411b363
PR
4731 goto fail;
4732 }
4733
77351055 4734 peers_ch = kmalloc(pi.size, GFP_NOIO);
b411b363 4735 if (peers_ch == NULL) {
13e6037d 4736 conn_err(tconn, "kmalloc of peers_ch failed\n");
b10d96cb 4737 rv = -1;
b411b363
PR
4738 goto fail;
4739 }
4740
a5c31904
AG
4741 err = drbd_recv_all_warn(tconn, peers_ch, pi.size);
4742 if (err) {
b411b363
PR
4743 rv = 0;
4744 goto fail;
4745 }
4746
13e6037d 4747 resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm);
b411b363
PR
4748 response = kmalloc(resp_size, GFP_NOIO);
4749 if (response == NULL) {
13e6037d 4750 conn_err(tconn, "kmalloc of response failed\n");
b10d96cb 4751 rv = -1;
b411b363
PR
4752 goto fail;
4753 }
4754
4755 sg_init_table(&sg, 1);
77351055 4756 sg_set_buf(&sg, peers_ch, pi.size);
b411b363
PR
4757
4758 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4759 if (rv) {
13e6037d 4760 conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 4761 rv = -1;
b411b363
PR
4762 goto fail;
4763 }
4764
9f5bdc33
AG
4765 if (!conn_prepare_command(tconn, sock)) {
4766 rv = 0;
4767 goto fail;
4768 }
e658983a 4769 rv = !conn_send_command(tconn, sock, P_AUTH_RESPONSE, 0,
9f5bdc33 4770 response, resp_size);
b411b363
PR
4771 if (!rv)
4772 goto fail;
4773
69bc7bc3
AG
4774 err = drbd_recv_header(tconn, &pi);
4775 if (err) {
4776 rv = 0;
b411b363 4777 goto fail;
69bc7bc3 4778 }
b411b363 4779
77351055 4780 if (pi.cmd != P_AUTH_RESPONSE) {
13e6037d 4781 conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n",
2fcb8f30 4782 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4783 rv = 0;
4784 goto fail;
4785 }
4786
77351055 4787 if (pi.size != resp_size) {
13e6037d 4788 conn_err(tconn, "expected AuthResponse payload of wrong size\n");
b411b363
PR
4789 rv = 0;
4790 goto fail;
4791 }
4792
a5c31904
AG
4793 err = drbd_recv_all_warn(tconn, response , resp_size);
4794 if (err) {
b411b363
PR
4795 rv = 0;
4796 goto fail;
4797 }
4798
4799 right_response = kmalloc(resp_size, GFP_NOIO);
2d1ee87d 4800 if (right_response == NULL) {
13e6037d 4801 conn_err(tconn, "kmalloc of right_response failed\n");
b10d96cb 4802 rv = -1;
b411b363
PR
4803 goto fail;
4804 }
4805
4806 sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
4807
4808 rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
4809 if (rv) {
13e6037d 4810 conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 4811 rv = -1;
b411b363
PR
4812 goto fail;
4813 }
4814
4815 rv = !memcmp(response, right_response, resp_size);
4816
4817 if (rv)
44ed167d
PR
4818 conn_info(tconn, "Peer authenticated using %d bytes HMAC\n",
4819 resp_size);
b10d96cb
JT
4820 else
4821 rv = -1;
b411b363
PR
4822
4823 fail:
4824 kfree(peers_ch);
4825 kfree(response);
4826 kfree(right_response);
4827
4828 return rv;
4829}
4830#endif
4831
4832int drbdd_init(struct drbd_thread *thi)
4833{
392c8801 4834 struct drbd_tconn *tconn = thi->tconn;
b411b363
PR
4835 int h;
4836
4d641dd7 4837 conn_info(tconn, "receiver (re)started\n");
b411b363
PR
4838
4839 do {
81fa2e67 4840 h = conn_connect(tconn);
b411b363 4841 if (h == 0) {
81fa2e67 4842 conn_disconnect(tconn);
20ee6390 4843 schedule_timeout_interruptible(HZ);
b411b363
PR
4844 }
4845 if (h == -1) {
4d641dd7 4846 conn_warn(tconn, "Discarding network configuration.\n");
bbeb641c 4847 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
4848 }
4849 } while (h == 0);
4850
91fd4dad
PR
4851 if (h > 0)
4852 drbdd(tconn);
b411b363 4853
81fa2e67 4854 conn_disconnect(tconn);
b411b363 4855
4d641dd7 4856 conn_info(tconn, "receiver terminated\n");
b411b363
PR
4857 return 0;
4858}
4859
4860/* ********* acknowledge sender ******** */
4861
static int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
{
	struct p_req_state_reply *p = pi->data;
	int retcode = be32_to_cpu(p->retcode);

	if (retcode >= SS_SUCCESS) {
		set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags);
	} else {
		set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags);
		conn_err(tconn, "Requested state change failed by peer: %s (%d)\n",
			 drbd_set_st_err_str(retcode), retcode);
	}
	wake_up(&tconn->ping_wait);

	return 0;
}

static int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
{
	struct drbd_conf *mdev;
	struct p_req_state_reply *p = pi->data;
	int retcode = be32_to_cpu(p->retcode);

	mdev = vnr_to_mdev(tconn, pi->vnr);
	if (!mdev)
		return -EIO;

	if (test_bit(CONN_WD_ST_CHG_REQ, &tconn->flags)) {
		D_ASSERT(tconn->agreed_pro_version < 100);
		return got_conn_RqSReply(tconn, pi);
	}

	if (retcode >= SS_SUCCESS) {
		set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
	} else {
		set_bit(CL_ST_CHG_FAIL, &mdev->flags);
		dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
			drbd_set_st_err_str(retcode), retcode);
	}
	wake_up(&mdev->state_wait);

	return 0;
}

static int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi)
{
	return drbd_send_ping_ack(tconn);
}

static int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi)
{
	/* restore idle timeout */
	tconn->meta.socket->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ;
	if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags))
		wake_up(&tconn->ping_wait);

	return 0;
}

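/*
 * P_RS_IS_IN_SYNC: the peer found a block requested during checksum-based
 * resync to be identical, so mark it in sync locally instead of transferring
 * it, and account it toward the resync progress counters.
 */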
static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi)
{
	struct drbd_conf *mdev;
	struct p_block_ack *p = pi->data;
	sector_t sector = be64_to_cpu(p->sector);
	int blksize = be32_to_cpu(p->blksize);

	mdev = vnr_to_mdev(tconn, pi->vnr);
	if (!mdev)
		return -EIO;

	D_ASSERT(mdev->tconn->agreed_pro_version >= 89);

	update_peer_seq(mdev, be32_to_cpu(p->seq_num));

	if (get_ldev(mdev)) {
		drbd_rs_complete_io(mdev, sector);
		drbd_set_in_sync(mdev, sector, blksize);
		/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
		mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
		put_ldev(mdev);
	}
	dec_rs_pending(mdev);
	atomic_add(blksize >> 9, &mdev->rs_sect_in);

	return 0;
}

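/*
 * Look up the request identified by block_id/sector in the given request
 * tree and feed the event 'what' into its state machine.  Returns -EIO if
 * the request cannot be found; missing_ok tells find_request() whether that
 * is an expected situation.  If the state change completes the master bio,
 * complete it here.
 */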
static int
validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector,
			      struct rb_root *root, const char *func,
			      enum drbd_req_event what, bool missing_ok)
{
	struct drbd_request *req;
	struct bio_and_error m;

	spin_lock_irq(&mdev->tconn->req_lock);
	req = find_request(mdev, root, id, sector, missing_ok, func);
	if (unlikely(!req)) {
		spin_unlock_irq(&mdev->tconn->req_lock);
		return -EIO;
	}
	__req_mod(req, what, &m);
	spin_unlock_irq(&mdev->tconn->req_lock);

	if (m.bio)
		complete_master_bio(mdev, &m);
	return 0;
}

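/*
 * Positive acknowledgements for data and resync writes.  Acks for resync
 * requests carry ID_SYNCER as block_id and just mark the area in sync;
 * everything else is mapped onto the matching request state machine event
 * and applied to the request found in the write_requests tree.
 */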
static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi)
{
	struct drbd_conf *mdev;
	struct p_block_ack *p = pi->data;
	sector_t sector = be64_to_cpu(p->sector);
	int blksize = be32_to_cpu(p->blksize);
	enum drbd_req_event what;

	mdev = vnr_to_mdev(tconn, pi->vnr);
	if (!mdev)
		return -EIO;

	update_peer_seq(mdev, be32_to_cpu(p->seq_num));

	if (p->block_id == ID_SYNCER) {
		drbd_set_in_sync(mdev, sector, blksize);
		dec_rs_pending(mdev);
		return 0;
	}
	switch (pi->cmd) {
	case P_RS_WRITE_ACK:
		what = WRITE_ACKED_BY_PEER_AND_SIS;
		break;
	case P_WRITE_ACK:
		what = WRITE_ACKED_BY_PEER;
		break;
	case P_RECV_ACK:
		what = RECV_ACKED_BY_PEER;
		break;
	case P_DISCARD_WRITE:
		what = DISCARD_WRITE;
		break;
	case P_RETRY_WRITE:
		what = POSTPONE_WRITE;
		break;
	default:
		BUG();
	}

	return validate_req_change_req_state(mdev, p->block_id, sector,
					     &mdev->write_requests, __func__,
					     what, false);
}

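/*
 * Negative acknowledgement for a write: the peer could not satisfy the
 * request.  For resync writes (ID_SYNCER) the region is recorded as failed
 * resync I/O; for application writes the request gets a NEG_ACKED event,
 * and if the request is already gone we at least mark the region out of
 * sync.
 */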
static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi)
{
	struct drbd_conf *mdev;
	struct p_block_ack *p = pi->data;
	sector_t sector = be64_to_cpu(p->sector);
	int size = be32_to_cpu(p->blksize);
	int err;

	mdev = vnr_to_mdev(tconn, pi->vnr);
	if (!mdev)
		return -EIO;

	update_peer_seq(mdev, be32_to_cpu(p->seq_num));

	if (p->block_id == ID_SYNCER) {
		dec_rs_pending(mdev);
		drbd_rs_failed_io(mdev, sector, size);
		return 0;
	}

	err = validate_req_change_req_state(mdev, p->block_id, sector,
					    &mdev->write_requests, __func__,
					    NEG_ACKED, true);
	if (err) {
		/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
		   The master bio might already be completed, therefore the
		   request is no longer in the collision hash. */
		/* In Protocol B we might already have got a P_RECV_ACK
		   but then get a P_NEG_ACK afterwards. */
		drbd_set_out_of_sync(mdev, sector, size);
	}
	return 0;
}

static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi)
{
	struct drbd_conf *mdev;
	struct p_block_ack *p = pi->data;
	sector_t sector = be64_to_cpu(p->sector);

	mdev = vnr_to_mdev(tconn, pi->vnr);
	if (!mdev)
		return -EIO;

	update_peer_seq(mdev, be32_to_cpu(p->seq_num));

	dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n",
		(unsigned long long)sector, be32_to_cpu(p->blksize));

	return validate_req_change_req_state(mdev, p->block_id, sector,
					     &mdev->read_requests, __func__,
					     NEG_ACKED, false);
}

static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi)
{
	struct drbd_conf *mdev;
	sector_t sector;
	int size;
	struct p_block_ack *p = pi->data;

	mdev = vnr_to_mdev(tconn, pi->vnr);
	if (!mdev)
		return -EIO;

	sector = be64_to_cpu(p->sector);
	size = be32_to_cpu(p->blksize);

	update_peer_seq(mdev, be32_to_cpu(p->seq_num));

	dec_rs_pending(mdev);

	if (get_ldev_if_state(mdev, D_FAILED)) {
		drbd_rs_complete_io(mdev, sector);
		switch (pi->cmd) {
		case P_NEG_RS_DREPLY:
			drbd_rs_failed_io(mdev, sector, size);
			/* fall through */
		case P_RS_CANCEL:
			break;
		default:
			BUG();
		}
		put_ldev(mdev);
	}

	return 0;
}

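/*
 * P_BARRIER_ACK: the peer has written everything up to this barrier, so the
 * corresponding epoch can be released from the transfer log.  Devices that
 * are in Ahead mode and have no application I/O in flight schedule the
 * transition back to SyncSource from here.
 */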
static int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi)
{
	struct p_barrier_ack *p = pi->data;
	struct drbd_conf *mdev;
	int vnr;

	tl_release(tconn, p->barrier, be32_to_cpu(p->set_size));

	rcu_read_lock();
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
		if (mdev->state.conn == C_AHEAD &&
		    atomic_read(&mdev->ap_in_flight) == 0 &&
		    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
			mdev->start_resync_timer.expires = jiffies + HZ;
			add_timer(&mdev->start_resync_timer);
		}
	}
	rcu_read_unlock();

	return 0;
}

static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi)
{
	struct drbd_conf *mdev;
	struct p_block_ack *p = pi->data;
	struct drbd_work *w;
	sector_t sector;
	int size;

	mdev = vnr_to_mdev(tconn, pi->vnr);
	if (!mdev)
		return -EIO;

	sector = be64_to_cpu(p->sector);
	size = be32_to_cpu(p->blksize);

	update_peer_seq(mdev, be32_to_cpu(p->seq_num));

	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
		drbd_ov_out_of_sync_found(mdev, sector, size);
	else
		ov_out_of_sync_print(mdev);

	if (!get_ldev(mdev))
		return 0;

	drbd_rs_complete_io(mdev, sector);
	dec_rs_pending(mdev);

	--mdev->ov_left;

	/* let's advance progress step marks only for every other megabyte */
	if ((mdev->ov_left & 0x200) == 0x200)
		drbd_advance_rs_marks(mdev, mdev->ov_left);

	if (mdev->ov_left == 0) {
		w = kmalloc(sizeof(*w), GFP_NOIO);
		if (w) {
			w->cb = w_ov_finished;
			w->mdev = mdev;
			drbd_queue_work(&mdev->tconn->sender_work, w);
		} else {
			dev_err(DEV, "kmalloc(w) failed.\n");
			ov_out_of_sync_print(mdev);
			drbd_resync_finished(mdev);
		}
	}
	put_ldev(mdev);
	return 0;
}

static int got_skip(struct drbd_tconn *tconn, struct packet_info *pi)
{
	return 0;
}

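/*
 * Drain the done_ee lists of all volumes on this connection: keep calling
 * drbd_finish_peer_reqs() for each device until none of them has completed
 * peer requests left to acknowledge.  Returns non-zero if finishing the
 * peer requests of any device failed.
 */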
static int tconn_finish_peer_reqs(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	int vnr, not_empty = 0;

	do {
		clear_bit(SIGNAL_ASENDER, &tconn->flags);
		flush_signals(current);

		rcu_read_lock();
		idr_for_each_entry(&tconn->volumes, mdev, vnr) {
			kref_get(&mdev->kref);
			rcu_read_unlock();
			if (drbd_finish_peer_reqs(mdev)) {
				kref_put(&mdev->kref, &drbd_minor_destroy);
				return 1;
			}
			kref_put(&mdev->kref, &drbd_minor_destroy);
			rcu_read_lock();
		}
		set_bit(SIGNAL_ASENDER, &tconn->flags);

		spin_lock_irq(&tconn->req_lock);
		idr_for_each_entry(&tconn->volumes, mdev, vnr) {
			not_empty = !list_empty(&mdev->done_ee);
			if (not_empty)
				break;
		}
		spin_unlock_irq(&tconn->req_lock);
		rcu_read_unlock();
	} while (not_empty);

	return 0;
}

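/*
 * Dispatch table for the asender: indexed by packet type, it gives the
 * expected payload size (checked against the received header) and the
 * handler to call once the complete packet has arrived.
 */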
struct asender_cmd {
	size_t pkt_size;
	int (*fn)(struct drbd_tconn *tconn, struct packet_info *);
};

static struct asender_cmd asender_tbl[] = {
	[P_PING]	      = { 0, got_Ping },
	[P_PING_ACK]	      = { 0, got_PingAck },
	[P_RECV_ACK]	      = { sizeof(struct p_block_ack), got_BlockAck },
	[P_WRITE_ACK]	      = { sizeof(struct p_block_ack), got_BlockAck },
	[P_RS_WRITE_ACK]      = { sizeof(struct p_block_ack), got_BlockAck },
	[P_DISCARD_WRITE]     = { sizeof(struct p_block_ack), got_BlockAck },
	[P_NEG_ACK]	      = { sizeof(struct p_block_ack), got_NegAck },
	[P_NEG_DREPLY]	      = { sizeof(struct p_block_ack), got_NegDReply },
	[P_NEG_RS_DREPLY]     = { sizeof(struct p_block_ack), got_NegRSDReply },
	[P_OV_RESULT]	      = { sizeof(struct p_block_ack), got_OVResult },
	[P_BARRIER_ACK]	      = { sizeof(struct p_barrier_ack), got_BarrierAck },
	[P_STATE_CHG_REPLY]   = { sizeof(struct p_req_state_reply), got_RqSReply },
	[P_RS_IS_IN_SYNC]     = { sizeof(struct p_block_ack), got_IsInSync },
	[P_DELAY_PROBE]	      = { sizeof(struct p_delay_probe93), got_skip },
	[P_RS_CANCEL]	      = { sizeof(struct p_block_ack), got_NegRSDReply },
	[P_CONN_ST_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_conn_RqSReply },
	[P_RETRY_WRITE]	      = { sizeof(struct p_block_ack), got_BlockAck },
};

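/*
 * Asender ("acknowledge sender") thread: sends pings and acknowledgements
 * on the meta socket and receives the peer's ack packets.  Each iteration
 * flushes outstanding peer request completions, optionally corking the
 * socket around that burst, then reads a header plus payload and dispatches
 * it through asender_tbl[].  A missing ping ack or a receive error leads to
 * a reconnect.
 */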
int drbd_asender(struct drbd_thread *thi)
{
	struct drbd_tconn *tconn = thi->tconn;
	struct asender_cmd *cmd = NULL;
	struct packet_info pi;
	int rv;
	void *buf = tconn->meta.rbuf;
	int received = 0;
	unsigned int header_size = drbd_header_size(tconn);
	int expect = header_size;
	bool ping_timeout_active = false;
	struct net_conf *nc;
	int ping_timeo, tcp_cork, ping_int;

	current->policy = SCHED_RR;  /* Make this a realtime task! */
	current->rt_priority = 2;    /* more important than all other tasks */

	while (get_t_state(thi) == RUNNING) {
		drbd_thread_current_set_cpu(thi);

		rcu_read_lock();
		nc = rcu_dereference(tconn->net_conf);
		ping_timeo = nc->ping_timeo;
		tcp_cork = nc->tcp_cork;
		ping_int = nc->ping_int;
		rcu_read_unlock();

		if (test_and_clear_bit(SEND_PING, &tconn->flags)) {
			if (drbd_send_ping(tconn)) {
				conn_err(tconn, "drbd_send_ping has failed\n");
				goto reconnect;
			}
			tconn->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
			ping_timeout_active = true;
		}

		/* TODO: conditionally cork; it may hurt latency if we cork without
		   much to send */
		if (tcp_cork)
			drbd_tcp_cork(tconn->meta.socket);
		if (tconn_finish_peer_reqs(tconn)) {
			conn_err(tconn, "tconn_finish_peer_reqs() failed\n");
			goto reconnect;
		}
		/* but unconditionally uncork unless disabled */
		if (tcp_cork)
			drbd_tcp_uncork(tconn->meta.socket);

		/* short circuit, recv_msg would return EINTR anyways. */
		if (signal_pending(current))
			continue;

		rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0);
		clear_bit(SIGNAL_ASENDER, &tconn->flags);

		flush_signals(current);

		/* Note:
		 * -EINTR	(on meta) we got a signal
		 * -EAGAIN	(on meta) rcvtimeo expired
		 * -ECONNRESET	other side closed the connection
		 * -ERESTARTSYS	(on data) we got a signal
		 * rv <  0	other than above: unexpected error!
		 * rv == expected: full header or command
		 * rv <  expected: "woken" by signal during receive
		 * rv == 0	: "connection shut down by peer"
		 */
		if (likely(rv > 0)) {
			received += rv;
			buf	 += rv;
		} else if (rv == 0) {
			conn_err(tconn, "meta connection shut down by peer.\n");
			goto reconnect;
		} else if (rv == -EAGAIN) {
			/* If the data socket received something meanwhile,
			 * that is good enough: peer is still alive. */
			if (time_after(tconn->last_received,
				jiffies - tconn->meta.socket->sk->sk_rcvtimeo))
				continue;
			if (ping_timeout_active) {
				conn_err(tconn, "PingAck did not arrive in time.\n");
				goto reconnect;
			}
			set_bit(SEND_PING, &tconn->flags);
			continue;
		} else if (rv == -EINTR) {
			continue;
		} else {
			conn_err(tconn, "sock_recvmsg returned %d\n", rv);
			goto reconnect;
		}

		if (received == expect && cmd == NULL) {
			if (decode_header(tconn, tconn->meta.rbuf, &pi))
				goto reconnect;
			cmd = &asender_tbl[pi.cmd];
			if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
				conn_err(tconn, "Unexpected meta packet %s (0x%04x)\n",
					 cmdname(pi.cmd), pi.cmd);
				goto disconnect;
			}
			expect = header_size + cmd->pkt_size;
			if (pi.size != expect - header_size) {
				conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n",
					 pi.cmd, pi.size);
				goto reconnect;
			}
		}
		if (received == expect) {
			bool err;

			err = cmd->fn(tconn, &pi);
			if (err) {
				conn_err(tconn, "%pf failed\n", cmd->fn);
				goto reconnect;
			}

			tconn->last_received = jiffies;

			if (cmd == &asender_tbl[P_PING_ACK]) {
				/* restore idle timeout */
				tconn->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
				ping_timeout_active = false;
			}

			buf	 = tconn->meta.rbuf;
			received = 0;
			expect	 = header_size;
			cmd	 = NULL;
		}
	}

	if (0) {
reconnect:
		conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
	}
	if (0) {
disconnect:
		conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
	}
	clear_bit(SIGNAL_ASENDER, &tconn->flags);

	conn_info(tconn, "asender terminated\n");

	return 0;
}