drivers/block/drbd/drbd_receiver.c

   1 /*
   2    drbd_receiver.c
   3
   4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10    drbd is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2, or (at your option)
  13    any later version.
  14
  15    drbd is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with drbd; see the file COPYING.  If not, write to
  22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25
  26 #include <linux/module.h>
  27
  28 #include <asm/uaccess.h>
  29 #include <net/sock.h>
  30
  31 #include <linux/drbd.h>
  32 #include <linux/fs.h>
  33 #include <linux/file.h>
  34 #include <linux/in.h>
  35 #include <linux/mm.h>
  36 #include <linux/memcontrol.h>
  37 #include <linux/mm_inline.h>
  38 #include <linux/slab.h>
  39 #include <linux/smp_lock.h>
  40 #include <linux/pkt_sched.h>
  41 #define __KERNEL_SYSCALLS__
  42 #include <linux/unistd.h>
  43 #include <linux/vmalloc.h>
  44 #include <linux/random.h>
  45 #include <linux/string.h>
  46 #include <linux/scatterlist.h>
  47 #include "drbd_int.h"
  48 #include "drbd_req.h"
  49
  50 #include "drbd_vli.h"
  51
  52 struct flush_work {
  53         struct drbd_work w;
  54         struct drbd_epoch *epoch;
  55 };
  56
  57 enum finish_epoch {
  58         FE_STILL_LIVE,
  59         FE_DESTROYED,
  60         FE_RECYCLED,
  61 };
  62
  63 static int drbd_do_handshake(struct drbd_conf *mdev);
  64 static int drbd_do_auth(struct drbd_conf *mdev);
  65
  66 static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
  67 static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
  68
  69 static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
  70 {
  71         struct drbd_epoch *prev;
  72         spin_lock(&mdev->epoch_lock);
  73         prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
  74         if (prev == epoch || prev == mdev->current_epoch)
  75                 prev = NULL;
  76         spin_unlock(&mdev->epoch_lock);
  77         return prev;
  78 }
  79
  80 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
  81
  82 /*
  83  * some helper functions to deal with single linked page lists,
  84  * page->private being our "next" pointer.
  85  */
  86
  87 /* If at least n pages are linked at head, get n pages off.
  88  * Otherwise, don't modify head, and return NULL.
  89  * Locking is the responsibility of the caller.
  90  */
  91 static struct page *page_chain_del(struct page **head, int n)
  92 {
  93         struct page *page;
  94         struct page *tmp;
  95
  96         BUG_ON(!n);
  97         BUG_ON(!head);
  98
  99         page = *head;
 100
 101         if (!page)
 102                 return NULL;
 103
 104         while (page) {
 105                 tmp = page_chain_next(page);
 106                 if (--n == 0)
 107                         break; /* found sufficient pages */
 108                 if (tmp == NULL)
 109                         /* insufficient pages, don't use any of them. */
 110                         return NULL;
 111                 page = tmp;
 112         }
 113
 114         /* add end of list marker for the returned list */
 115         set_page_private(page, 0);
 116         /* actual return value, and adjustment of head */
 117         page = *head;
 118         *head = tmp;
 119         return page;
 120 }
 121
 122 /* may be used outside of locks to find the tail of a (usually short)
 123  * "private" page chain, before adding it back to a global chain head
 124  * with page_chain_add() under a spinlock. */
 125 static struct page *page_chain_tail(struct page *page, int *len)
 126 {
 127         struct page *tmp;
 128         int i = 1;
 129         while ((tmp = page_chain_next(page)))
 130                 ++i, page = tmp;
 131         if (len)
 132                 *len = i;
 133         return page;
 134 }
 135
 136 static int page_chain_free(struct page *page)
 137 {
 138         struct page *tmp;
 139         int i = 0;
 140         page_chain_for_each_safe(page, tmp) {
 141                 put_page(page);
 142                 ++i;
 143         }
 144         return i;
 145 }
 146
 147 static void page_chain_add(struct page **head,
 148                 struct page *chain_first, struct page *chain_last)
 149 {
 150 #if 1
 151         struct page *tmp;
 152         tmp = page_chain_tail(chain_first, NULL);
 153         BUG_ON(tmp != chain_last);
 154 #endif
 155
 156         /* add chain to head */
 157         set_page_private(chain_last, (unsigned long)*head);
 158         *head = chain_first;
 159 }
 160
 161 static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
 162 {
 163         struct page *page = NULL;
 164         struct page *tmp = NULL;
 165         int i = 0;
 166
 167         /* Yes, testing drbd_pp_vacant outside the lock is racy.
 168          * So what. It saves a spin_lock. */
 169         if (drbd_pp_vacant >= number) {
 170                 spin_lock(&drbd_pp_lock);
 171                 page = page_chain_del(&drbd_pp_pool, number);
 172                 if (page)
 173                         drbd_pp_vacant -= number;
 174                 spin_unlock(&drbd_pp_lock);
 175                 if (page)
 176                         return page;
 177         }
 178
 179         /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
 180          * "criss-cross" setup, that might cause write-out on some other DRBD,
 181          * which in turn might block on the other node at this very place.  */
 182         for (i = 0; i < number; i++) {
 183                 tmp = alloc_page(GFP_TRY);
 184                 if (!tmp)
 185                         break;
 186                 set_page_private(tmp, (unsigned long)page);
 187                 page = tmp;
 188         }
 189
 190         if (i == number)
 191                 return page;
 192
 193         /* Not enough pages immediately available this time.
 194          * No need to jump around here, drbd_pp_alloc will retry this
 195          * function "soon". */
 196         if (page) {
 197                 tmp = page_chain_tail(page, NULL);
 198                 spin_lock(&drbd_pp_lock);
 199                 page_chain_add(&drbd_pp_pool, page, tmp);
 200                 drbd_pp_vacant += i;
 201                 spin_unlock(&drbd_pp_lock);
 202         }
 203         return NULL;
 204 }
 205
 206 /* kick lower level device, if we have more than (arbitrary number)
 207  * reference counts on it, which typically are locally submitted io
 208  * requests.  don't use unacked_cnt, so we speed up proto A and B, too. */
 209 static void maybe_kick_lo(struct drbd_conf *mdev)
 210 {
 211         if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
 212                 drbd_kick_lo(mdev);
 213 }
 214
 215 static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
 216 {
 217         struct drbd_epoch_entry *e;
 218         struct list_head *le, *tle;
 219
 220         /* The EEs are always appended to the end of the list. Since
 221            they are sent in order over the wire, they have to finish
 222            in order. As soon as we see the first not finished we can
 223            stop to examine the list... */
 224
 225         list_for_each_safe(le, tle, &mdev->net_ee) {
 226                 e = list_entry(le, struct drbd_epoch_entry, w.list);
 227                 if (drbd_ee_has_active_page(e))
 228                         break;
 229                 list_move(le, to_be_freed);
 230         }
 231 }
 232
 233 static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
 234 {
 235         LIST_HEAD(reclaimed);
 236         struct drbd_epoch_entry *e, *t;
 237
 238         maybe_kick_lo(mdev);
 239         spin_lock_irq(&mdev->req_lock);
 240         reclaim_net_ee(mdev, &reclaimed);
 241         spin_unlock_irq(&mdev->req_lock);
 242
 243         list_for_each_entry_safe(e, t, &reclaimed, w.list)
 244                 drbd_free_ee(mdev, e);
 245 }
 246
 247 /**
 248  * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
 249  * @mdev:       DRBD device.
 250  * @number:     number of pages requested
 251  * @retry:      whether to retry, if not enough pages are available right now
 252  *
 253  * Tries to allocate number pages, first from our own page pool, then from
 254  * the kernel, unless this allocation would exceed the max_buffers setting.
 255  * Possibly retry until DRBD frees sufficient pages somewhere else.
 256  *
 257  * Returns a page chain linked via page->private.
 258  */
 259 static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
 260 {
 261         struct page *page = NULL;
 262         DEFINE_WAIT(wait);
 263
 264         /* Yes, we may run up to @number over max_buffers. If we
 265          * follow it strictly, the admin will get it wrong anyways. */
 266         if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
 267                 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
 268
 269         while (page == NULL) {
 270                 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 271
 272                 drbd_kick_lo_and_reclaim_net(mdev);
 273
 274                 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
 275                         page = drbd_pp_first_pages_or_try_alloc(mdev, number);
 276                         if (page)
 277                                 break;
 278                 }
 279
 280                 if (!retry)
 281                         break;
 282
 283                 if (signal_pending(current)) {
 284                         dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
 285                         break;
 286                 }
 287
 288                 schedule();
 289         }
 290         finish_wait(&drbd_pp_wait, &wait);
 291
 292         if (page)
 293                 atomic_add(number, &mdev->pp_in_use);
 294         return page;
 295 }
 296
 297 /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
 298  * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
 299  * Either links the page chain back to the global pool,
 300  * or returns all pages to the system. */
 301 static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
 302 {
 303         int i;
 304         if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
 305                 i = page_chain_free(page);
 306         else {
 307                 struct page *tmp;
 308                 tmp = page_chain_tail(page, &i);
 309                 spin_lock(&drbd_pp_lock);
 310                 page_chain_add(&drbd_pp_pool, page, tmp);
 311                 drbd_pp_vacant += i;
 312                 spin_unlock(&drbd_pp_lock);
 313         }
 314         atomic_sub(i, &mdev->pp_in_use);
 315         i = atomic_read(&mdev->pp_in_use);
 316         if (i < 0)
 317                 dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
 318         wake_up(&drbd_pp_wait);
 319 }
 320
 321 /*
 322 You need to hold the req_lock:
 323  _drbd_wait_ee_list_empty()
 324
 325 You must not have the req_lock:
 326  drbd_free_ee()
 327  drbd_alloc_ee()
 328  drbd_init_ee()
 329  drbd_release_ee()
 330  drbd_ee_fix_bhs()
 331  drbd_process_done_ee()
 332  drbd_clear_done_ee()
 333  drbd_wait_ee_list_empty()
 334 */
 335
 336 struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 337                                      u64 id,
 338                                      sector_t sector,
 339                                      unsigned int data_size,
 340                                      gfp_t gfp_mask) __must_hold(local)
 341 {
 342         struct drbd_epoch_entry *e;
 343         struct page *page;
 344         unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
 345
 346         if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
 347                 return NULL;
 348
 349         e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
 350         if (!e) {
 351                 if (!(gfp_mask & __GFP_NOWARN))
 352                         dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
 353                 return NULL;
 354         }
 355
 356         page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
 357         if (!page)
 358                 goto fail;
 359
 360         INIT_HLIST_NODE(&e->colision);
 361         e->epoch = NULL;
 362         e->mdev = mdev;
 363         e->pages = page;
 364         atomic_set(&e->pending_bios, 0);
 365         e->size = data_size;
 366         e->flags = 0;
 367         e->sector = sector;
 368         e->block_id = id;
 369
 370         return e;
 371
 372  fail:
 373         mempool_free(e, drbd_ee_mempool);
 374         return NULL;
 375 }
 376
 377 void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 378 {
 379         if (e->flags & EE_HAS_DIGEST)
 380                 kfree(e->digest);
 381         drbd_pp_free(mdev, e->pages);
 382         D_ASSERT(atomic_read(&e->pending_bios) == 0);
 383         D_ASSERT(hlist_unhashed(&e->colision));
 384         mempool_free(e, drbd_ee_mempool);
 385 }
 386
 387 int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
 388 {
 389         LIST_HEAD(work_list);
 390         struct drbd_epoch_entry *e, *t;
 391         int count = 0;
 392
 393         spin_lock_irq(&mdev->req_lock);
 394         list_splice_init(list, &work_list);
 395         spin_unlock_irq(&mdev->req_lock);
 396
 397         list_for_each_entry_safe(e, t, &work_list, w.list) {
 398                 drbd_free_ee(mdev, e);
 399                 count++;
 400         }
 401         return count;
 402 }
 403
 404
 405 /*
 406  * This function is called from _asender only_
 407  * but see also comments in _req_mod(,barrier_acked)
 408  * and receive_Barrier.
 409  *
 410  * Move entries from net_ee to done_ee, if ready.
 411  * Grab done_ee, call all callbacks, free the entries.
 412  * The callbacks typically send out ACKs.
 413  */
 414 static int drbd_process_done_ee(struct drbd_conf *mdev)
 415 {
 416         LIST_HEAD(work_list);
 417         LIST_HEAD(reclaimed);
 418         struct drbd_epoch_entry *e, *t;
 419         int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
 420
 421         spin_lock_irq(&mdev->req_lock);
 422         reclaim_net_ee(mdev, &reclaimed);
 423         list_splice_init(&mdev->done_ee, &work_list);
 424         spin_unlock_irq(&mdev->req_lock);
 425
 426         list_for_each_entry_safe(e, t, &reclaimed, w.list)
 427                 drbd_free_ee(mdev, e);
 428
 429         /* possible callbacks here:
 430          * e_end_block, and e_end_resync_block, e_send_discard_ack.
 431          * all ignore the last argument.
 432          */
 433         list_for_each_entry_safe(e, t, &work_list, w.list) {
 434                 /* list_del not necessary, next/prev members not touched */
 435                 ok = e->w.cb(mdev, &e->w, !ok) && ok;
 436                 drbd_free_ee(mdev, e);
 437         }
 438         wake_up(&mdev->ee_wait);
 439
 440         return ok;
 441 }
 442
 443 void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
 444 {
 445         DEFINE_WAIT(wait);
 446
 447         /* avoids spin_lock/unlock
 448          * and calling prepare_to_wait in the fast path */
 449         while (!list_empty(head)) {
 450                 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
 451                 spin_unlock_irq(&mdev->req_lock);
 452                 drbd_kick_lo(mdev);
 453                 schedule();
 454                 finish_wait(&mdev->ee_wait, &wait);
 455                 spin_lock_irq(&mdev->req_lock);
 456         }
 457 }
 458
 459 void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
 460 {
 461         spin_lock_irq(&mdev->req_lock);
 462         _drbd_wait_ee_list_empty(mdev, head);
 463         spin_unlock_irq(&mdev->req_lock);
 464 }
 465
 466 /* see also kernel_accept; which is only present since 2.6.18.
 467  * also we want to log which part of it failed, exactly */
 468 static int drbd_accept(struct drbd_conf *mdev, const char **what,
 469                 struct socket *sock, struct socket **newsock)
 470 {
 471         struct sock *sk = sock->sk;
 472         int err = 0;
 473
 474         *what = "listen";
 475         err = sock->ops->listen(sock, 5);
 476         if (err < 0)
 477                 goto out;
 478
 479         *what = "sock_create_lite";
 480         err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
 481                                newsock);
 482         if (err < 0)
 483                 goto out;
 484
 485         *what = "accept";
 486         err = sock->ops->accept(sock, *newsock, 0);
 487         if (err < 0) {
 488                 sock_release(*newsock);
 489                 *newsock = NULL;
 490                 goto out;
 491         }
 492         (*newsock)->ops  = sock->ops;
 493
 494 out:
 495         return err;
 496 }
 497
 498 static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
 499                     void *buf, size_t size, int flags)
 500 {
 501         mm_segment_t oldfs;
 502         struct kvec iov = {
 503                 .iov_base = buf,
 504                 .iov_len = size,
 505         };
 506         struct msghdr msg = {
 507                 .msg_iovlen = 1,
 508                 .msg_iov = (struct iovec *)&iov,
 509                 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
 510         };
 511         int rv;
 512
 513         oldfs = get_fs();
 514         set_fs(KERNEL_DS);
 515         rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
 516         set_fs(oldfs);
 517
 518         return rv;
 519 }
 520
 521 static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
 522 {
 523         mm_segment_t oldfs;
 524         struct kvec iov = {
 525                 .iov_base = buf,
 526                 .iov_len = size,
 527         };
 528         struct msghdr msg = {
 529                 .msg_iovlen = 1,
 530                 .msg_iov = (struct iovec *)&iov,
 531                 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
 532         };
 533         int rv;
 534
 535         oldfs = get_fs();
 536         set_fs(KERNEL_DS);
 537
 538         for (;;) {
 539                 rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
 540                 if (rv == size)
 541                         break;
 542
 543                 /* Note:
 544                  * ECONNRESET   other side closed the connection
 545                  * ERESTARTSYS  (on  sock) we got a signal
 546                  */
 547
 548                 if (rv < 0) {
 549                         if (rv == -ECONNRESET)
 550                                 dev_info(DEV, "sock was reset by peer\n");
 551                         else if (rv != -ERESTARTSYS)
 552                                 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
 553                         break;
 554                 } else if (rv == 0) {
 555                         dev_info(DEV, "sock was shut down by peer\n");
 556                         break;
 557                 } else  {
 558                         /* signal came in, or peer/link went down,
 559                          * after we read a partial message
 560                          */
 561                         /* D_ASSERT(signal_pending(current)); */
 562                         break;
 563                 }
 564         };
 565
 566         set_fs(oldfs);
 567
 568         if (rv != size)
 569                 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
 570
 571         return rv;
 572 }
 573
 574 /* quoting tcp(7):
 575  *   On individual connections, the socket buffer size must be set prior to the
 576  *   listen(2) or connect(2) calls in order to have it take effect.
 577  * This is our wrapper to do so.
 578  */
 579 static void drbd_setbufsize(struct socket *sock, unsigned int snd,
 580                 unsigned int rcv)
 581 {
 582         /* open coded SO_SNDBUF, SO_RCVBUF */
 583         if (snd) {
 584                 sock->sk->sk_sndbuf = snd;
 585                 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 586         }
 587         if (rcv) {
 588                 sock->sk->sk_rcvbuf = rcv;
 589                 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 590         }
 591 }
 592
 593 static struct socket *drbd_try_connect(struct drbd_conf *mdev)
 594 {
 595         const char *what;
 596         struct socket *sock;
 597         struct sockaddr_in6 src_in6;
 598         int err;
 599         int disconnect_on_error = 1;
 600
 601         if (!get_net_conf(mdev))
 602                 return NULL;
 603
 604         what = "sock_create_kern";
 605         err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
 606                 SOCK_STREAM, IPPROTO_TCP, &sock);
 607         if (err < 0) {
 608                 sock = NULL;
 609                 goto out;
 610         }
 611
 612         sock->sk->sk_rcvtimeo =
 613         sock->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;
 614         drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
 615                         mdev->net_conf->rcvbuf_size);
 616
 617        /* explicitly bind to the configured IP as source IP
 618         *  for the outgoing connections.
 619         *  This is needed for multihomed hosts and to be
 620         *  able to use lo: interfaces for drbd.
 621         * Make sure to use 0 as port number, so linux selects
 622         *  a free one dynamically.
 623         */
 624         memcpy(&src_in6, mdev->net_conf->my_addr,
 625                min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
 626         if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
 627                 src_in6.sin6_port = 0;
 628         else
 629                 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
 630
 631         what = "bind before connect";
 632         err = sock->ops->bind(sock,
 633                               (struct sockaddr *) &src_in6,
 634                               mdev->net_conf->my_addr_len);
 635         if (err < 0)
 636                 goto out;
 637
 638         /* connect may fail, peer not yet available.
 639          * stay C_WF_CONNECTION, don't go Disconnecting! */
 640         disconnect_on_error = 0;
 641         what = "connect";
 642         err = sock->ops->connect(sock,
 643                                  (struct sockaddr *)mdev->net_conf->peer_addr,
 644                                  mdev->net_conf->peer_addr_len, 0);
 645
 646 out:
 647         if (err < 0) {
 648                 if (sock) {
 649                         sock_release(sock);
 650                         sock = NULL;
 651                 }
 652                 switch (-err) {
 653                         /* timeout, busy, signal pending */
 654                 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
 655                 case EINTR: case ERESTARTSYS:
 656                         /* peer not (yet) available, network problem */
 657                 case ECONNREFUSED: case ENETUNREACH:
 658                 case EHOSTDOWN:    case EHOSTUNREACH:
 659                         disconnect_on_error = 0;
 660                         break;
 661                 default:
 662                         dev_err(DEV, "%s failed, err = %d\n", what, err);
 663                 }
 664                 if (disconnect_on_error)
 665                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
 666         }
 667         put_net_conf(mdev);
 668         return sock;
 669 }
 670
 671 static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
 672 {
 673         int timeo, err;
 674         struct socket *s_estab = NULL, *s_listen;
 675         const char *what;
 676
 677         if (!get_net_conf(mdev))
 678                 return NULL;
 679
 680         what = "sock_create_kern";
 681         err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
 682                 SOCK_STREAM, IPPROTO_TCP, &s_listen);
 683         if (err) {
 684                 s_listen = NULL;
 685                 goto out;
 686         }
 687
 688         timeo = mdev->net_conf->try_connect_int * HZ;
 689         timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
 690
 691         s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
 692         s_listen->sk->sk_rcvtimeo = timeo;
 693         s_listen->sk->sk_sndtimeo = timeo;
 694         drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
 695                         mdev->net_conf->rcvbuf_size);
 696
 697         what = "bind before listen";
 698         err = s_listen->ops->bind(s_listen,
 699                               (struct sockaddr *) mdev->net_conf->my_addr,
 700                               mdev->net_conf->my_addr_len);
 701         if (err < 0)
 702                 goto out;
 703
 704         err = drbd_accept(mdev, &what, s_listen, &s_estab);
 705
 706 out:
 707         if (s_listen)
 708                 sock_release(s_listen);
 709         if (err < 0) {
 710                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
 711                         dev_err(DEV, "%s failed, err = %d\n", what, err);
 712                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
 713                 }
 714         }
 715         put_net_conf(mdev);
 716
 717         return s_estab;
 718 }
 719
 720 static int drbd_send_fp(struct drbd_conf *mdev,
 721         struct socket *sock, enum drbd_packets cmd)
 722 {
 723         struct p_header80 *h = &mdev->data.sbuf.header.h80;
 724
 725         return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
 726 }
 727
 728 static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
 729 {
 730         struct p_header80 *h = &mdev->data.rbuf.header.h80;
 731         int rr;
 732
 733         rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
 734
 735         if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
 736                 return be16_to_cpu(h->command);
 737
 738         return 0xffff;
 739 }
 740
 741 /**
 742  * drbd_socket_okay() - Free the socket if its connection is not okay
 743  * @mdev:       DRBD device.
 744  * @sock:       pointer to the pointer to the socket.
 745  */
 746 static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
 747 {
 748         int rr;
 749         char tb[4];
 750
 751         if (!*sock)
 752                 return FALSE;
 753
 754         rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
 755
 756         if (rr > 0 || rr == -EAGAIN) {
 757                 return TRUE;
 758         } else {
 759                 sock_release(*sock);
 760                 *sock = NULL;
 761                 return FALSE;
 762         }
 763 }
 764
 765 /*
 766  * return values:
 767  *   1 yes, we have a valid connection
 768  *   0 oops, did not work out, please try again
 769  *  -1 peer talks different language,
 770  *     no point in trying again, please go standalone.
 771  *  -2 We do not have a network config...
 772  */
 773 static int drbd_connect(struct drbd_conf *mdev)
 774 {
 775         struct socket *s, *sock, *msock;
 776         int try, h, ok;
 777
 778         D_ASSERT(!mdev->data.socket);
 779
 780         if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
 781                 return -2;
 782
 783         clear_bit(DISCARD_CONCURRENT, &mdev->flags);
 784
 785         sock  = NULL;
 786         msock = NULL;
 787
 788         do {
 789                 for (try = 0;;) {
 790                         /* 3 tries, this should take less than a second! */
 791                         s = drbd_try_connect(mdev);
 792                         if (s || ++try >= 3)
 793                                 break;
 794                         /* give the other side time to call bind() & listen() */
 795                         __set_current_state(TASK_INTERRUPTIBLE);
 796                         schedule_timeout(HZ / 10);
 797                 }
 798
 799                 if (s) {
 800                         if (!sock) {
 801                                 drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
 802                                 sock = s;
 803                                 s = NULL;
 804                         } else if (!msock) {
 805                                 drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
 806                                 msock = s;
 807                                 s = NULL;
 808                         } else {
 809                                 dev_err(DEV, "Logic error in drbd_connect()\n");
 810                                 goto out_release_sockets;
 811                         }
 812                 }
 813
 814                 if (sock && msock) {
 815                         __set_current_state(TASK_INTERRUPTIBLE);
 816                         schedule_timeout(HZ / 10);
 817                         ok = drbd_socket_okay(mdev, &sock);
 818                         ok = drbd_socket_okay(mdev, &msock) && ok;
 819                         if (ok)
 820                                 break;
 821                 }
 822
 823 retry:
 824                 s = drbd_wait_for_connect(mdev);
 825                 if (s) {
 826                         try = drbd_recv_fp(mdev, s);
 827                         drbd_socket_okay(mdev, &sock);
 828                         drbd_socket_okay(mdev, &msock);
 829                         switch (try) {
 830                         case P_HAND_SHAKE_S:
 831                                 if (sock) {
 832                                         dev_warn(DEV, "initial packet S crossed\n");
 833                                         sock_release(sock);
 834                                 }
 835                                 sock = s;
 836                                 break;
 837                         case P_HAND_SHAKE_M:
 838                                 if (msock) {
 839                                         dev_warn(DEV, "initial packet M crossed\n");
 840                                         sock_release(msock);
 841                                 }
 842                                 msock = s;
 843                                 set_bit(DISCARD_CONCURRENT, &mdev->flags);
 844                                 break;
 845                         default:
 846                                 dev_warn(DEV, "Error receiving initial packet\n");
 847                                 sock_release(s);
 848                                 if (random32() & 1)
 849                                         goto retry;
 850                         }
 851                 }
 852
 853                 if (mdev->state.conn <= C_DISCONNECTING)
 854                         goto out_release_sockets;
 855                 if (signal_pending(current)) {
 856                         flush_signals(current);
 857                         smp_rmb();
 858                         if (get_t_state(&mdev->receiver) == Exiting)
 859                                 goto out_release_sockets;
 860                 }
 861
 862                 if (sock && msock) {
 863                         ok = drbd_socket_okay(mdev, &sock);
 864                         ok = drbd_socket_okay(mdev, &msock) && ok;
 865                         if (ok)
 866                                 break;
 867                 }
 868         } while (1);
 869
 870         msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
 871         sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
 872
 873         sock->sk->sk_allocation = GFP_NOIO;
 874         msock->sk->sk_allocation = GFP_NOIO;
 875
 876         sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
 877         msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
 878
 879         /* NOT YET ...
 880          * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 881          * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
 882          * first set it to the P_HAND_SHAKE timeout,
 883          * which we set to 4x the configured ping_timeout. */
 884         sock->sk->sk_sndtimeo =
 885         sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
 886
 887         msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 888         msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
 889
 890         /* we don't want delays.
 891          * we use TCP_CORK where apropriate, though */
 892         drbd_tcp_nodelay(sock);
 893         drbd_tcp_nodelay(msock);
 894
 895         mdev->data.socket = sock;
 896         mdev->meta.socket = msock;
 897         mdev->last_received = jiffies;
 898
 899         D_ASSERT(mdev->asender.task == NULL);
 900
 901         h = drbd_do_handshake(mdev);
 902         if (h <= 0)
 903                 return h;
 904
 905         if (mdev->cram_hmac_tfm) {
 906                 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
 907                 switch (drbd_do_auth(mdev)) {
 908                 case -1:
 909                         dev_err(DEV, "Authentication of peer failed\n");
 910                         return -1;
 911                 case 0:
 912                         dev_err(DEV, "Authentication of peer failed, trying again.\n");
 913                         return 0;
 914                 }
 915         }
 916
 917         if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
 918                 return 0;
 919
 920         sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 921         sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
 922
 923         atomic_set(&mdev->packet_seq, 0);
 924         mdev->peer_seq = 0;
 925
 926         drbd_thread_start(&mdev->asender);
 927
 928         if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
 929                 drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
 930                 put_ldev(mdev);
 931         }
 932
 933         if (!drbd_send_protocol(mdev))
 934                 return -1;
 935         drbd_send_sync_param(mdev, &mdev->sync_conf);
 936         drbd_send_sizes(mdev, 0, 0);
 937         drbd_send_uuids(mdev);
 938         drbd_send_state(mdev);
 939         clear_bit(USE_DEGR_WFC_T, &mdev->flags);
 940         clear_bit(RESIZE_PENDING, &mdev->flags);
 941
 942         return 1;
 943
 944 out_release_sockets:
 945         if (sock)
 946                 sock_release(sock);
 947         if (msock)
 948                 sock_release(msock);
 949         return -1;
 950 }
 951
 952 static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
 953 {
 954         union p_header *h = &mdev->data.rbuf.header;
 955         int r;
 956
 957         r = drbd_recv(mdev, h, sizeof(*h));
 958         if (unlikely(r != sizeof(*h))) {
 959                 dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
 960                 return FALSE;
 961         }
 962
 963         if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
 964                 *cmd = be16_to_cpu(h->h80.command);
 965                 *packet_size = be16_to_cpu(h->h80.length);
 966         } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
 967                 *cmd = be16_to_cpu(h->h95.command);
 968                 *packet_size = be32_to_cpu(h->h95.length);
 969         } else {
 970                 dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
 971                     (long)be32_to_cpu(h->h80.magic),
 972                     h->h80.command, h->h80.length);
 973                 return FALSE;
 974         }
 975         mdev->last_received = jiffies;
 976
 977         return TRUE;
 978 }
 979
 980 static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
 981 {
 982         int rv;
 983
 984         if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
 985                 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
 986                                         NULL, BLKDEV_IFL_WAIT);
 987                 if (rv) {
 988                         dev_err(DEV, "local disk flush failed with status %d\n", rv);
 989                         /* would rather check on EOPNOTSUPP, but that is not reliable.
 990                          * don't try again for ANY return value != 0
 991                          * if (rv == -EOPNOTSUPP) */
 992                         drbd_bump_write_ordering(mdev, WO_drain_io);
 993                 }
 994                 put_ldev(mdev);
 995         }
 996
 997         return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
 998 }
 999
1000 static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1001 {
1002         struct flush_work *fw = (struct flush_work *)w;
1003         struct drbd_epoch *epoch = fw->epoch;
1004
1005         kfree(w);
1006
1007         if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
1008                 drbd_flush_after_epoch(mdev, epoch);
1009
1010         drbd_may_finish_epoch(mdev, epoch, EV_PUT |
1011                               (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
1012
1013         return 1;
1014 }
1015
1016 /**
1017  * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1018  * @mdev:       DRBD device.
1019  * @epoch:      Epoch object.
1020  * @ev:         Epoch event.
1021  */
1022 static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1023                                                struct drbd_epoch *epoch,
1024                                                enum epoch_event ev)
1025 {
1026         int finish, epoch_size;
1027         struct drbd_epoch *next_epoch;
1028         int schedule_flush = 0;
1029         enum finish_epoch rv = FE_STILL_LIVE;
1030
1031         spin_lock(&mdev->epoch_lock);
1032         do {
1033                 next_epoch = NULL;
1034                 finish = 0;
1035
1036                 epoch_size = atomic_read(&epoch->epoch_size);
1037
1038                 switch (ev & ~EV_CLEANUP) {
1039                 case EV_PUT:
1040                         atomic_dec(&epoch->active);
1041                         break;
1042                 case EV_GOT_BARRIER_NR:
1043                         set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1044
1045                         /* Special case: If we just switched from WO_bio_barrier to
1046                            WO_bdev_flush we should not finish the current epoch */
1047                         if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1048                             mdev->write_ordering != WO_bio_barrier &&
1049                             epoch == mdev->current_epoch)
1050                                 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1051                         break;
1052                 case EV_BARRIER_DONE:
1053                         set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1054                         break;
1055                 case EV_BECAME_LAST:
1056                         /* nothing to do*/
1057                         break;
1058                 }
1059
1060                 if (epoch_size != 0 &&
1061                     atomic_read(&epoch->active) == 0 &&
1062                     test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
1063                     epoch->list.prev == &mdev->current_epoch->list &&
1064                     !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1065                         /* Nearly all conditions are met to finish that epoch... */
1066                         if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1067                             mdev->write_ordering == WO_none ||
1068                             (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1069                             ev & EV_CLEANUP) {
1070                                 finish = 1;
1071                                 set_bit(DE_IS_FINISHING, &epoch->flags);
1072                         } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1073                                  mdev->write_ordering == WO_bio_barrier) {
1074                                 atomic_inc(&epoch->active);
1075                                 schedule_flush = 1;
1076                         }
1077                 }
1078                 if (finish) {
1079                         if (!(ev & EV_CLEANUP)) {
1080                                 spin_unlock(&mdev->epoch_lock);
1081                                 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1082                                 spin_lock(&mdev->epoch_lock);
1083                         }
1084                         dec_unacked(mdev);
1085
1086                         if (mdev->current_epoch != epoch) {
1087                                 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1088                                 list_del(&epoch->list);
1089                                 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1090                                 mdev->epochs--;
1091                                 kfree(epoch);
1092
1093                                 if (rv == FE_STILL_LIVE)
1094                                         rv = FE_DESTROYED;
1095                         } else {
1096                                 epoch->flags = 0;
1097                                 atomic_set(&epoch->epoch_size, 0);
1098                                 /* atomic_set(&epoch->active, 0); is already zero */
1099                                 if (rv == FE_STILL_LIVE)
1100                                         rv = FE_RECYCLED;
1101                         }
1102                 }
1103
1104                 if (!next_epoch)
1105                         break;
1106
1107                 epoch = next_epoch;
1108         } while (1);
1109
1110         spin_unlock(&mdev->epoch_lock);
1111
1112         if (schedule_flush) {
1113                 struct flush_work *fw;
1114                 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1115                 if (fw) {
1116                         fw->w.cb = w_flush;
1117                         fw->epoch = epoch;
1118                         drbd_queue_work(&mdev->data.work, &fw->w);
1119                 } else {
1120                         dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1121                         set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1122                         /* That is not a recursion, only one level */
1123                         drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1124                         drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1125                 }
1126         }
1127
1128         return rv;
1129 }
1130
1131 /**
1132  * drbd_bump_write_ordering() - Fall back to an other write ordering method
1133  * @mdev:       DRBD device.
1134  * @wo:         Write ordering method to try.
1135  */
1136 void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
1137 {
1138         enum write_ordering_e pwo;
1139         static char *write_ordering_str[] = {
1140                 [WO_none] = "none",
1141                 [WO_drain_io] = "drain",
1142                 [WO_bdev_flush] = "flush",
1143                 [WO_bio_barrier] = "barrier",
1144         };
1145
1146         pwo = mdev->write_ordering;
1147         wo = min(pwo, wo);
1148         if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1149                 wo = WO_bdev_flush;
1150         if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1151                 wo = WO_drain_io;
1152         if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1153                 wo = WO_none;
1154         mdev->write_ordering = wo;
1155         if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
1156                 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1157 }
1158
1159 /**
1160  * drbd_submit_ee()
1161  * @mdev:       DRBD device.
1162  * @e:          epoch entry
1163  * @rw:         flag field, see bio->bi_rw
1164  */
1165 /* TODO allocate from our own bio_set. */
1166 int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1167                 const unsigned rw, const int fault_type)
1168 {
1169         struct bio *bios = NULL;
1170         struct bio *bio;
1171         struct page *page = e->pages;
1172         sector_t sector = e->sector;
1173         unsigned ds = e->size;
1174         unsigned n_bios = 0;
1175         unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
1176
1177         /* In most cases, we will only need one bio.  But in case the lower
1178          * level restrictions happen to be different at this offset on this
1179          * side than those of the sending peer, we may need to submit the
1180          * request in more than one bio. */
1181 next_bio:
1182         bio = bio_alloc(GFP_NOIO, nr_pages);
1183         if (!bio) {
1184                 dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
1185                 goto fail;
1186         }
1187         /* > e->sector, unless this is the first bio */
1188         bio->bi_sector = sector;
1189         bio->bi_bdev = mdev->ldev->backing_bdev;
1190         /* we special case some flags in the multi-bio case, see below
1191          * (REQ_UNPLUG, REQ_HARDBARRIER) */
1192         bio->bi_rw = rw;
1193         bio->bi_private = e;
1194         bio->bi_end_io = drbd_endio_sec;
1195
1196         bio->bi_next = bios;
1197         bios = bio;
1198         ++n_bios;
1199
1200         page_chain_for_each(page) {
1201                 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1202                 if (!bio_add_page(bio, page, len, 0)) {
1203                         /* a single page must always be possible! */
1204                         BUG_ON(bio->bi_vcnt == 0);
1205                         goto next_bio;
1206                 }
1207                 ds -= len;
1208                 sector += len >> 9;
1209                 --nr_pages;
1210         }
1211         D_ASSERT(page == NULL);
1212         D_ASSERT(ds == 0);
1213
1214         atomic_set(&e->pending_bios, n_bios);
1215         do {
1216                 bio = bios;
1217                 bios = bios->bi_next;
1218                 bio->bi_next = NULL;
1219
1220                 /* strip off REQ_UNPLUG unless it is the last bio */
1221                 if (bios)
1222                         bio->bi_rw &= ~REQ_UNPLUG;
1223
1224                 drbd_generic_make_request(mdev, fault_type, bio);
1225
1226                 /* strip off REQ_HARDBARRIER,
1227                  * unless it is the first or last bio */
1228                 if (bios && bios->bi_next)
1229                         bios->bi_rw &= ~REQ_HARDBARRIER;
1230         } while (bios);
1231         maybe_kick_lo(mdev);
1232         return 0;
1233
1234 fail:
1235         while (bios) {
1236                 bio = bios;
1237                 bios = bios->bi_next;
1238                 bio_put(bio);
1239         }
1240         return -ENOMEM;
1241 }
1242
1243 /**
1244  * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
1245  * @mdev:       DRBD device.
1246  * @w:          work object.
1247  * @cancel:     The connection will be closed anyways (unused in this callback)
1248  */
1249 int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1250 {
1251         struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1252         /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1253            (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1254            so that we can finish that epoch in drbd_may_finish_epoch().
1255            That is necessary if we already have a long chain of Epochs, before
1256            we realize that REQ_HARDBARRIER is actually not supported */
1257
1258         /* As long as the -ENOTSUPP on the barrier is reported immediately
1259            that will never trigger. If it is reported late, we will just
1260            print that warning and continue correctly for all future requests
1261            with WO_bdev_flush */
1262         if (previous_epoch(mdev, e->epoch))
1263                 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1264
1265         /* we still have a local reference,
1266          * get_ldev was done in receive_Data. */
1267
1268         e->w.cb = e_end_block;
1269         if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
1270                 /* drbd_submit_ee fails for one reason only:
1271                  * if was not able to allocate sufficient bios.
1272                  * requeue, try again later. */
1273                 e->w.cb = w_e_reissue;
1274                 drbd_queue_work(&mdev->data.work, &e->w);
1275         }
1276         return 1;
1277 }
1278
1279 static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1280 {
1281         int rv, issue_flush;
1282         struct p_barrier *p = &mdev->data.rbuf.barrier;
1283         struct drbd_epoch *epoch;
1284
1285         inc_unacked(mdev);
1286
1287         if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
1288                 drbd_kick_lo(mdev);
1289
1290         mdev->current_epoch->barrier_nr = p->barrier;
1291         rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
1292
1293         /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1294          * the activity log, which means it would not be resynced in case the
1295          * R_PRIMARY crashes now.
1296          * Therefore we must send the barrier_ack after the barrier request was
1297          * completed. */
1298         switch (mdev->write_ordering) {
1299         case WO_bio_barrier:
1300         case WO_none:
1301                 if (rv == FE_RECYCLED)
1302                         return TRUE;
1303                 break;
1304
1305         case WO_bdev_flush:
1306         case WO_drain_io:
1307                 if (rv == FE_STILL_LIVE) {
1308                         set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1309                         drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1310                         rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1311                 }
1312                 if (rv == FE_RECYCLED)
1313                         return TRUE;
1314
1315                 /* The asender will send all the ACKs and barrier ACKs out, since
1316                    all EEs moved from the active_ee to the done_ee. We need to
1317                    provide a new epoch object for the EEs that come in soon */
1318                 break;
1319         }
1320
1321         /* receiver context, in the writeout path of the other node.
1322          * avoid potential distributed deadlock */
1323         epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1324         if (!epoch) {
1325                 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1326                 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1327                 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1328                 if (issue_flush) {
1329                         rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1330                         if (rv == FE_RECYCLED)
1331                                 return TRUE;
1332                 }
1333
1334                 drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
1335
1336                 return TRUE;
1337         }
1338
1339         epoch->flags = 0;
1340         atomic_set(&epoch->epoch_size, 0);
1341         atomic_set(&epoch->active, 0);
1342
1343         spin_lock(&mdev->epoch_lock);
1344         if (atomic_read(&mdev->current_epoch->epoch_size)) {
1345                 list_add(&epoch->list, &mdev->current_epoch->list);
1346                 mdev->current_epoch = epoch;
1347                 mdev->epochs++;
1348         } else {
1349                 /* The current_epoch got recycled while we allocated this one... */
1350                 kfree(epoch);
1351         }
1352         spin_unlock(&mdev->epoch_lock);
1353
1354         return TRUE;
1355 }
1356
1357 /* used from receive_RSDataReply (recv_resync_read)
1358  * and from receive_Data */
1359 static struct drbd_epoch_entry *
1360 read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1361 {
1362         const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1363         struct drbd_epoch_entry *e;
1364         struct page *page;
1365         int dgs, ds, rr;
1366         void *dig_in = mdev->int_dig_in;
1367         void *dig_vv = mdev->int_dig_vv;
1368         unsigned long *data;
1369
1370         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1371                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1372
1373         if (dgs) {
1374                 rr = drbd_recv(mdev, dig_in, dgs);
1375                 if (rr != dgs) {
1376                         dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
1377                              rr, dgs);
1378                         return NULL;
1379                 }
1380         }
1381
1382         data_size -= dgs;
1383
1384         ERR_IF(data_size &  0x1ff) return NULL;
1385         ERR_IF(data_size >  DRBD_MAX_SEGMENT_SIZE) return NULL;
1386
1387         /* even though we trust out peer,
1388          * we sometimes have to double check. */
1389         if (sector + (data_size>>9) > capacity) {
1390                 dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
1391                         (unsigned long long)capacity,
1392                         (unsigned long long)sector, data_size);
1393                 return NULL;
1394         }
1395
1396         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1397          * "criss-cross" setup, that might cause write-out on some other DRBD,
1398          * which in turn might block on the other node at this very place.  */
1399         e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1400         if (!e)
1401                 return NULL;
1402
1403         ds = data_size;
1404         page = e->pages;
1405         page_chain_for_each(page) {
1406                 unsigned len = min_t(int, ds, PAGE_SIZE);
1407                 data = kmap(page);
1408                 rr = drbd_recv(mdev, data, len);
1409                 if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
1410                         dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1411                         data[0] = data[0] ^ (unsigned long)-1;
1412                 }
1413                 kunmap(page);
1414                 if (rr != len) {
1415                         drbd_free_ee(mdev, e);
1416                         dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1417                              rr, len);
1418                         return NULL;
1419                 }
1420                 ds -= rr;
1421         }
1422
1423         if (dgs) {
1424                 drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
1425                 if (memcmp(dig_in, dig_vv, dgs)) {
1426                         dev_err(DEV, "Digest integrity check FAILED.\n");
1427                         drbd_bcast_ee(mdev, "digest failed",
1428                                         dgs, dig_in, dig_vv, e);
1429                         drbd_free_ee(mdev, e);
1430                         return NULL;
1431                 }
1432         }
1433         mdev->recv_cnt += data_size>>9;
1434         return e;
1435 }
1436
1437 /* drbd_drain_block() just takes a data block
1438  * out of the socket input buffer, and discards it.
1439  */
1440 static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1441 {
1442         struct page *page;
1443         int rr, rv = 1;
1444         void *data;
1445
1446         if (!data_size)
1447                 return TRUE;
1448
1449         page = drbd_pp_alloc(mdev, 1, 1);
1450
1451         data = kmap(page);
1452         while (data_size) {
1453                 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
1454                 if (rr != min_t(int, data_size, PAGE_SIZE)) {
1455                         rv = 0;
1456                         dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1457                              rr, min_t(int, data_size, PAGE_SIZE));
1458                         break;
1459                 }
1460                 data_size -= rr;
1461         }
1462         kunmap(page);
1463         drbd_pp_free(mdev, page);
1464         return rv;
1465 }
1466
1467 static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1468                            sector_t sector, int data_size)
1469 {
1470         struct bio_vec *bvec;
1471         struct bio *bio;
1472         int dgs, rr, i, expect;
1473         void *dig_in = mdev->int_dig_in;
1474         void *dig_vv = mdev->int_dig_vv;
1475
1476         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1477                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1478
1479         if (dgs) {
1480                 rr = drbd_recv(mdev, dig_in, dgs);
1481                 if (rr != dgs) {
1482                         dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
1483                              rr, dgs);
1484                         return 0;
1485                 }
1486         }
1487
1488         data_size -= dgs;
1489
1490         /* optimistically update recv_cnt.  if receiving fails below,
1491          * we disconnect anyways, and counters will be reset. */
1492         mdev->recv_cnt += data_size>>9;
1493
1494         bio = req->master_bio;
1495         D_ASSERT(sector == bio->bi_sector);
1496
1497         bio_for_each_segment(bvec, bio, i) {
1498                 expect = min_t(int, data_size, bvec->bv_len);
1499                 rr = drbd_recv(mdev,
1500                              kmap(bvec->bv_page)+bvec->bv_offset,
1501                              expect);
1502                 kunmap(bvec->bv_page);
1503                 if (rr != expect) {
1504                         dev_warn(DEV, "short read receiving data reply: "
1505                              "read %d expected %d\n",
1506                              rr, expect);
1507                         return 0;
1508                 }
1509                 data_size -= rr;
1510         }
1511
1512         if (dgs) {
1513                 drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1514                 if (memcmp(dig_in, dig_vv, dgs)) {
1515                         dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1516                         return 0;
1517                 }
1518         }
1519
1520         D_ASSERT(data_size == 0);
1521         return 1;
1522 }
1523
1524 /* e_end_resync_block() is called via
1525  * drbd_process_done_ee() by asender only */
1526 static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1527 {
1528         struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1529         sector_t sector = e->sector;
1530         int ok;
1531
1532         D_ASSERT(hlist_unhashed(&e->colision));
1533
1534         if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1535                 drbd_set_in_sync(mdev, sector, e->size);
1536                 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
1537         } else {
1538                 /* Record failure to sync */
1539                 drbd_rs_failed_io(mdev, sector, e->size);
1540
1541                 ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
1542         }
1543         dec_unacked(mdev);
1544
1545         return ok;
1546 }
1547
1548 static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1549 {
1550         struct drbd_epoch_entry *e;
1551
1552         e = read_in_block(mdev, ID_SYNCER, sector, data_size);
1553         if (!e)
1554                 goto fail;
1555
1556         dec_rs_pending(mdev);
1557
1558         inc_unacked(mdev);
1559         /* corresponding dec_unacked() in e_end_resync_block()
1560          * respective _drbd_clear_done_ee */
1561
1562         e->w.cb = e_end_resync_block;
1563
1564         spin_lock_irq(&mdev->req_lock);
1565         list_add(&e->w.list, &mdev->sync_ee);
1566         spin_unlock_irq(&mdev->req_lock);
1567
1568         atomic_add(data_size >> 9, &mdev->rs_sect_ev);
1569         if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
1570                 return TRUE;
1571
1572         drbd_free_ee(mdev, e);
1573 fail:
1574         put_ldev(mdev);
1575         return FALSE;
1576 }
1577
1578 static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1579 {
1580         struct drbd_request *req;
1581         sector_t sector;
1582         int ok;
1583         struct p_data *p = &mdev->data.rbuf.data;
1584
1585         sector = be64_to_cpu(p->sector);
1586
1587         spin_lock_irq(&mdev->req_lock);
1588         req = _ar_id_to_req(mdev, p->block_id, sector);
1589         spin_unlock_irq(&mdev->req_lock);
1590         if (unlikely(!req)) {
1591                 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
1592                 return FALSE;
1593         }
1594
1595         /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
1596          * special casing it there for the various failure cases.
1597          * still no race with drbd_fail_pending_reads */
1598         ok = recv_dless_read(mdev, req, sector, data_size);
1599
1600         if (ok)
1601                 req_mod(req, data_received);
1602         /* else: nothing. handled from drbd_disconnect...
1603          * I don't think we may complete this just yet
1604          * in case we are "on-disconnect: freeze" */
1605
1606         return ok;
1607 }
1608
1609 static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1610 {
1611         sector_t sector;
1612         int ok;
1613         struct p_data *p = &mdev->data.rbuf.data;
1614
1615         sector = be64_to_cpu(p->sector);
1616         D_ASSERT(p->block_id == ID_SYNCER);
1617
1618         if (get_ldev(mdev)) {
1619                 /* data is submitted to disk within recv_resync_read.
1620                  * corresponding put_ldev done below on error,
1621                  * or in drbd_endio_write_sec. */
1622                 ok = recv_resync_read(mdev, sector, data_size);
1623         } else {
1624                 if (__ratelimit(&drbd_ratelimit_state))
1625                         dev_err(DEV, "Can not write resync data to local disk.\n");
1626
1627                 ok = drbd_drain_block(mdev, data_size);
1628
1629                 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1630         }
1631
1632         atomic_add(data_size >> 9, &mdev->rs_sect_in);
1633
1634         return ok;
1635 }
1636
1637 /* e_end_block() is called via drbd_process_done_ee().
1638  * this means this function only runs in the asender thread
1639  */
1640 static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1641 {
1642         struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1643         sector_t sector = e->sector;
1644         struct drbd_epoch *epoch;
1645         int ok = 1, pcmd;
1646
1647         if (e->flags & EE_IS_BARRIER) {
1648                 epoch = previous_epoch(mdev, e->epoch);
1649                 if (epoch)
1650                         drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1651         }
1652
1653         if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1654                 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1655                         pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1656                                 mdev->state.conn <= C_PAUSED_SYNC_T &&
1657                                 e->flags & EE_MAY_SET_IN_SYNC) ?
1658                                 P_RS_WRITE_ACK : P_WRITE_ACK;
1659                         ok &= drbd_send_ack(mdev, pcmd, e);
1660                         if (pcmd == P_RS_WRITE_ACK)
1661                                 drbd_set_in_sync(mdev, sector, e->size);
1662                 } else {
1663                         ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
1664                         /* we expect it to be marked out of sync anyways...
1665                          * maybe assert this?  */
1666                 }
1667                 dec_unacked(mdev);
1668         }
1669         /* we delete from the conflict detection hash _after_ we sent out the
1670          * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
1671         if (mdev->net_conf->two_primaries) {
1672                 spin_lock_irq(&mdev->req_lock);
1673                 D_ASSERT(!hlist_unhashed(&e->colision));
1674                 hlist_del_init(&e->colision);
1675                 spin_unlock_irq(&mdev->req_lock);
1676         } else {
1677                 D_ASSERT(hlist_unhashed(&e->colision));
1678         }
1679
1680         drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1681
1682         return ok;
1683 }
1684
1685 static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1686 {
1687         struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1688         int ok = 1;
1689
1690         D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1691         ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1692
1693         spin_lock_irq(&mdev->req_lock);
1694         D_ASSERT(!hlist_unhashed(&e->colision));
1695         hlist_del_init(&e->colision);
1696         spin_unlock_irq(&mdev->req_lock);
1697
1698         dec_unacked(mdev);
1699
1700         return ok;
1701 }
1702
1703 /* Called from receive_Data.
1704  * Synchronize packets on sock with packets on msock.
1705  *
1706  * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1707  * packet traveling on msock, they are still processed in the order they have
1708  * been sent.
1709  *
1710  * Note: we don't care for Ack packets overtaking P_DATA packets.
1711  *
1712  * In case packet_seq is larger than mdev->peer_seq number, there are
1713  * outstanding packets on the msock. We wait for them to arrive.
1714  * In case we are the logically next packet, we update mdev->peer_seq
1715  * ourselves. Correctly handles 32bit wrap around.
1716  *
1717  * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1718  * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1719  * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
1720  * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
1721  *
1722  * returns 0 if we may process the packet,
1723  * -ERESTARTSYS if we were interrupted (by disconnect signal). */
1724 static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1725 {
1726         DEFINE_WAIT(wait);
1727         unsigned int p_seq;
1728         long timeout;
1729         int ret = 0;
1730         spin_lock(&mdev->peer_seq_lock);
1731         for (;;) {
1732                 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
1733                 if (seq_le(packet_seq, mdev->peer_seq+1))
1734                         break;
1735                 if (signal_pending(current)) {
1736                         ret = -ERESTARTSYS;
1737                         break;
1738                 }
1739                 p_seq = mdev->peer_seq;
1740                 spin_unlock(&mdev->peer_seq_lock);
1741                 timeout = schedule_timeout(30*HZ);
1742                 spin_lock(&mdev->peer_seq_lock);
1743                 if (timeout == 0 && p_seq == mdev->peer_seq) {
1744                         ret = -ETIMEDOUT;
1745                         dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
1746                         break;
1747                 }
1748         }
1749         finish_wait(&mdev->seq_wait, &wait);
1750         if (mdev->peer_seq+1 == packet_seq)
1751                 mdev->peer_seq++;
1752         spin_unlock(&mdev->peer_seq_lock);
1753         return ret;
1754 }
1755
1756 static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
1757 {
1758         if (mdev->agreed_pro_version >= 95)
1759                 return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
1760                         (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
1761                         (dpf & DP_FUA ? REQ_FUA : 0) |
1762                         (dpf & DP_FLUSH ? REQ_FUA : 0) |
1763                         (dpf & DP_DISCARD ? REQ_DISCARD : 0);
1764         else
1765                 return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
1766 }
1767
1768 /* mirrored write */
1769 static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1770 {
1771         sector_t sector;
1772         struct drbd_epoch_entry *e;
1773         struct p_data *p = &mdev->data.rbuf.data;
1774         int rw = WRITE;
1775         u32 dp_flags;
1776
1777         if (!get_ldev(mdev)) {
1778                 if (__ratelimit(&drbd_ratelimit_state))
1779                         dev_err(DEV, "Can not write mirrored data block "
1780                             "to local disk.\n");
1781                 spin_lock(&mdev->peer_seq_lock);
1782                 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1783                         mdev->peer_seq++;
1784                 spin_unlock(&mdev->peer_seq_lock);
1785
1786                 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1787                 atomic_inc(&mdev->current_epoch->epoch_size);
1788                 return drbd_drain_block(mdev, data_size);
1789         }
1790
1791         /* get_ldev(mdev) successful.
1792          * Corresponding put_ldev done either below (on various errors),
1793          * or in drbd_endio_write_sec, if we successfully submit the data at
1794          * the end of this function. */
1795
1796         sector = be64_to_cpu(p->sector);
1797         e = read_in_block(mdev, p->block_id, sector, data_size);
1798         if (!e) {
1799                 put_ldev(mdev);
1800                 return FALSE;
1801         }
1802
1803         e->w.cb = e_end_block;
1804
1805         spin_lock(&mdev->epoch_lock);
1806         e->epoch = mdev->current_epoch;
1807         atomic_inc(&e->epoch->epoch_size);
1808         atomic_inc(&e->epoch->active);
1809
1810         if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1811                 struct drbd_epoch *epoch;
1812                 /* Issue a barrier if we start a new epoch, and the previous epoch
1813                    was not a epoch containing a single request which already was
1814                    a Barrier. */
1815                 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1816                 if (epoch == e->epoch) {
1817                         set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1818                         rw |= REQ_HARDBARRIER;
1819                         e->flags |= EE_IS_BARRIER;
1820                 } else {
1821                         if (atomic_read(&epoch->epoch_size) > 1 ||
1822                             !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1823                                 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1824                                 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1825                                 rw |= REQ_HARDBARRIER;
1826                                 e->flags |= EE_IS_BARRIER;
1827                         }
1828                 }
1829         }
1830         spin_unlock(&mdev->epoch_lock);
1831
1832         dp_flags = be32_to_cpu(p->dp_flags);
1833         rw |= write_flags_to_bio(mdev, dp_flags);
1834
1835         if (dp_flags & DP_MAY_SET_IN_SYNC)
1836                 e->flags |= EE_MAY_SET_IN_SYNC;
1837
1838         /* I'm the receiver, I do hold a net_cnt reference. */
1839         if (!mdev->net_conf->two_primaries) {
1840                 spin_lock_irq(&mdev->req_lock);
1841         } else {
1842                 /* don't get the req_lock yet,
1843                  * we may sleep in drbd_wait_peer_seq */
1844                 const int size = e->size;
1845                 const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1846                 DEFINE_WAIT(wait);
1847                 struct drbd_request *i;
1848                 struct hlist_node *n;
1849                 struct hlist_head *slot;
1850                 int first;
1851
1852                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1853                 BUG_ON(mdev->ee_hash == NULL);
1854                 BUG_ON(mdev->tl_hash == NULL);
1855
1856                 /* conflict detection and handling:
1857                  * 1. wait on the sequence number,
1858                  *    in case this data packet overtook ACK packets.
1859                  * 2. check our hash tables for conflicting requests.
1860                  *    we only need to walk the tl_hash, since an ee can not
1861                  *    have a conflict with an other ee: on the submitting
1862                  *    node, the corresponding req had already been conflicting,
1863                  *    and a conflicting req is never sent.
1864                  *
1865                  * Note: for two_primaries, we are protocol C,
1866                  * so there cannot be any request that is DONE
1867                  * but still on the transfer log.
1868                  *
1869                  * unconditionally add to the ee_hash.
1870                  *
1871                  * if no conflicting request is found:
1872                  *    submit.
1873                  *
1874                  * if any conflicting request is found
1875                  * that has not yet been acked,
1876                  * AND I have the "discard concurrent writes" flag:
1877                  *       queue (via done_ee) the P_DISCARD_ACK; OUT.
1878                  *
1879                  * if any conflicting request is found:
1880                  *       block the receiver, waiting on misc_wait
1881                  *       until no more conflicting requests are there,
1882                  *       or we get interrupted (disconnect).
1883                  *
1884                  *       we do not just write after local io completion of those
1885                  *       requests, but only after req is done completely, i.e.
1886                  *       we wait for the P_DISCARD_ACK to arrive!
1887                  *
1888                  *       then proceed normally, i.e. submit.
1889                  */
1890                 if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
1891                         goto out_interrupted;
1892
1893                 spin_lock_irq(&mdev->req_lock);
1894
1895                 hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
1896
1897 #define OVERLAPS overlaps(i->sector, i->size, sector, size)
1898                 slot = tl_hash_slot(mdev, sector);
1899                 first = 1;
1900                 for (;;) {
1901                         int have_unacked = 0;
1902                         int have_conflict = 0;
1903                         prepare_to_wait(&mdev->misc_wait, &wait,
1904                                 TASK_INTERRUPTIBLE);
1905                         hlist_for_each_entry(i, n, slot, colision) {
1906                                 if (OVERLAPS) {
1907                                         /* only ALERT on first iteration,
1908                                          * we may be woken up early... */
1909                                         if (first)
1910                                                 dev_alert(DEV, "%s[%u] Concurrent local write detected!"
1911                                                       " new: %llus +%u; pending: %llus +%u\n",
1912                                                       current->comm, current->pid,
1913                                                       (unsigned long long)sector, size,
1914                                                       (unsigned long long)i->sector, i->size);
1915                                         if (i->rq_state & RQ_NET_PENDING)
1916                                                 ++have_unacked;
1917                                         ++have_conflict;
1918                                 }
1919                         }
1920 #undef OVERLAPS
1921                         if (!have_conflict)
1922                                 break;
1923
1924                         /* Discard Ack only for the _first_ iteration */
1925                         if (first && discard && have_unacked) {
1926                                 dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
1927                                      (unsigned long long)sector);
1928                                 inc_unacked(mdev);
1929                                 e->w.cb = e_send_discard_ack;
1930                                 list_add_tail(&e->w.list, &mdev->done_ee);
1931
1932                                 spin_unlock_irq(&mdev->req_lock);
1933
1934                                 /* we could probably send that P_DISCARD_ACK ourselves,
1935                                  * but I don't like the receiver using the msock */
1936
1937                                 put_ldev(mdev);
1938                                 wake_asender(mdev);
1939                                 finish_wait(&mdev->misc_wait, &wait);
1940                                 return TRUE;
1941                         }
1942
1943                         if (signal_pending(current)) {
1944                                 hlist_del_init(&e->colision);
1945
1946                                 spin_unlock_irq(&mdev->req_lock);
1947
1948                                 finish_wait(&mdev->misc_wait, &wait);
1949                                 goto out_interrupted;
1950                         }
1951
1952                         spin_unlock_irq(&mdev->req_lock);
1953                         if (first) {
1954                                 first = 0;
1955                                 dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
1956                                      "sec=%llus\n", (unsigned long long)sector);
1957                         } else if (discard) {
1958                                 /* we had none on the first iteration.
1959                                  * there must be none now. */
1960                                 D_ASSERT(have_unacked == 0);
1961                         }
1962                         schedule();
1963                         spin_lock_irq(&mdev->req_lock);
1964                 }
1965                 finish_wait(&mdev->misc_wait, &wait);
1966         }
1967
1968         list_add(&e->w.list, &mdev->active_ee);
1969         spin_unlock_irq(&mdev->req_lock);
1970
1971         switch (mdev->net_conf->wire_protocol) {
1972         case DRBD_PROT_C:
1973                 inc_unacked(mdev);
1974                 /* corresponding dec_unacked() in e_end_block()
1975                  * respective _drbd_clear_done_ee */
1976                 break;
1977         case DRBD_PROT_B:
1978                 /* I really don't like it that the receiver thread
1979                  * sends on the msock, but anyways */
1980                 drbd_send_ack(mdev, P_RECV_ACK, e);
1981                 break;
1982         case DRBD_PROT_A:
1983                 /* nothing to do */
1984                 break;
1985         }
1986
1987         if (mdev->state.pdsk == D_DISKLESS) {
1988                 /* In case we have the only disk of the cluster, */
1989                 drbd_set_out_of_sync(mdev, e->sector, e->size);
1990                 e->flags |= EE_CALL_AL_COMPLETE_IO;
1991                 drbd_al_begin_io(mdev, e->sector);
1992         }
1993
1994         if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
1995                 return TRUE;
1996
1997 out_interrupted:
1998         /* yes, the epoch_size now is imbalanced.
1999          * but we drop the connection anyways, so we don't have a chance to
2000          * receive a barrier... atomic_inc(&mdev->epoch_size); */
2001         put_ldev(mdev);
2002         drbd_free_ee(mdev, e);
2003         return FALSE;
2004 }
2005
2006 /* We may throttle resync, if the lower device seems to be busy,
2007  * and current sync rate is above c_min_rate.
2008  *
2009  * To decide whether or not the lower device is busy, we use a scheme similar
2010  * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
2011  * (more than 64 sectors) of activity we cannot account for with our own resync
2012  * activity, it obviously is "busy".
2013  *
2014  * The current sync rate used here uses only the most recent two step marks,
2015  * to have a short time average so we can react faster.
2016  */
2017 int drbd_rs_should_slow_down(struct drbd_conf *mdev)
2018 {
2019         struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
2020         unsigned long db, dt, dbdt;
2021         int curr_events;
2022         int throttle = 0;
2023
2024         /* feature disabled? */
2025         if (mdev->sync_conf.c_min_rate == 0)
2026                 return 0;
2027
2028         curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2029                       (int)part_stat_read(&disk->part0, sectors[1]) -
2030                         atomic_read(&mdev->rs_sect_ev);
2031         if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
2032                 unsigned long rs_left;
2033                 int i;
2034
2035                 mdev->rs_last_events = curr_events;
2036
2037                 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2038                  * approx. */
2039                 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
2040                 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
2041
2042                 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
2043                 if (!dt)
2044                         dt++;
2045                 db = mdev->rs_mark_left[i] - rs_left;
2046                 dbdt = Bit2KB(db/dt);
2047
2048                 if (dbdt > mdev->sync_conf.c_min_rate)
2049                         throttle = 1;
2050         }
2051         return throttle;
2052 }
2053
2054
2055 static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
2056 {
2057         sector_t sector;
2058         const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
2059         struct drbd_epoch_entry *e;
2060         struct digest_info *di = NULL;
2061         int size;
2062         unsigned int fault_type;
2063         struct p_block_req *p = &mdev->data.rbuf.block_req;
2064
2065         sector = be64_to_cpu(p->sector);
2066         size   = be32_to_cpu(p->blksize);
2067
2068         if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
2069                 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2070                                 (unsigned long long)sector, size);
2071                 return FALSE;
2072         }
2073         if (sector + (size>>9) > capacity) {
2074                 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2075                                 (unsigned long long)sector, size);
2076                 return FALSE;
2077         }
2078
2079         if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
2080                 if (__ratelimit(&drbd_ratelimit_state))
2081                         dev_err(DEV, "Can not satisfy peer's read request, "
2082                             "no local data.\n");
2083                 drbd_send_ack_rp(mdev, cmd == P_DATA_REQUEST ? P_NEG_DREPLY :
2084                                  P_NEG_RS_DREPLY , p);
2085                 return TRUE;
2086         }
2087
2088         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2089          * "criss-cross" setup, that might cause write-out on some other DRBD,
2090          * which in turn might block on the other node at this very place.  */
2091         e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
2092         if (!e) {
2093                 put_ldev(mdev);
2094                 return FALSE;
2095         }
2096
2097         switch (cmd) {
2098         case P_DATA_REQUEST:
2099                 e->w.cb = w_e_end_data_req;
2100                 fault_type = DRBD_FAULT_DT_RD;
2101                 /* application IO, don't drbd_rs_begin_io */
2102                 goto submit;
2103
2104         case P_RS_DATA_REQUEST:
2105                 e->w.cb = w_e_end_rsdata_req;
2106                 fault_type = DRBD_FAULT_RS_RD;
2107                 break;
2108
2109         case P_OV_REPLY:
2110         case P_CSUM_RS_REQUEST:
2111                 fault_type = DRBD_FAULT_RS_RD;
2112                 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
2113                 if (!di)
2114                         goto out_free_e;
2115
2116                 di->digest_size = digest_size;
2117                 di->digest = (((char *)di)+sizeof(struct digest_info));
2118
2119                 e->digest = di;
2120                 e->flags |= EE_HAS_DIGEST;
2121
2122                 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
2123                         goto out_free_e;
2124
2125                 if (cmd == P_CSUM_RS_REQUEST) {
2126                         D_ASSERT(mdev->agreed_pro_version >= 89);
2127                         e->w.cb = w_e_end_csum_rs_req;
2128                 } else if (cmd == P_OV_REPLY) {
2129                         e->w.cb = w_e_end_ov_reply;
2130                         dec_rs_pending(mdev);
2131                         /* drbd_rs_begin_io done when we sent this request,
2132                          * but accounting still needs to be done. */
2133                         goto submit_for_resync;
2134                 }
2135                 break;
2136
2137         case P_OV_REQUEST:
2138                 if (mdev->state.conn >= C_CONNECTED &&
2139                     mdev->state.conn != C_VERIFY_T)
2140                         dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
2141                                 drbd_conn_str(mdev->state.conn));
2142                 if (mdev->ov_start_sector == ~(sector_t)0 &&
2143                     mdev->agreed_pro_version >= 90) {
2144                         mdev->ov_start_sector = sector;
2145                         mdev->ov_position = sector;
2146                         mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
2147                         dev_info(DEV, "Online Verify start sector: %llu\n",
2148                                         (unsigned long long)sector);
2149                 }
2150                 e->w.cb = w_e_end_ov_req;
2151                 fault_type = DRBD_FAULT_RS_RD;
2152                 break;
2153
2154         default:
2155                 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2156                     cmdname(cmd));
2157                 fault_type = DRBD_FAULT_MAX;
2158                 goto out_free_e;
2159         }
2160
2161         /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2162          * wrt the receiver, but it is not as straightforward as it may seem.
2163          * Various places in the resync start and stop logic assume resync
2164          * requests are processed in order, requeuing this on the worker thread
2165          * introduces a bunch of new code for synchronization between threads.
2166          *
2167          * Unlimited throttling before drbd_rs_begin_io may stall the resync
2168          * "forever", throttling after drbd_rs_begin_io will lock that extent
2169          * for application writes for the same time.  For now, just throttle
2170          * here, where the rest of the code expects the receiver to sleep for
2171          * a while, anyways.
2172          */
2173
2174         /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2175          * this defers syncer requests for some time, before letting at least
2176          * on request through.  The resync controller on the receiving side
2177          * will adapt to the incoming rate accordingly.
2178          *
2179          * We cannot throttle here if remote is Primary/SyncTarget:
2180          * we would also throttle its application reads.
2181          * In that case, throttling is done on the SyncTarget only.
2182          */
2183         if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
2184                 msleep(100);
2185         if (drbd_rs_begin_io(mdev, e->sector))
2186                 goto out_free_e;
2187
2188 submit_for_resync:
2189         atomic_add(size >> 9, &mdev->rs_sect_ev);
2190
2191 submit:
2192         inc_unacked(mdev);
2193         spin_lock_irq(&mdev->req_lock);
2194         list_add_tail(&e->w.list, &mdev->read_ee);
2195         spin_unlock_irq(&mdev->req_lock);
2196
2197         if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
2198                 return TRUE;
2199
2200 out_free_e:
2201         put_ldev(mdev);
2202         drbd_free_ee(mdev, e);
2203         return FALSE;
2204 }
2205
2206 static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2207 {
2208         int self, peer, rv = -100;
2209         unsigned long ch_self, ch_peer;
2210
2211         self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2212         peer = mdev->p_uuid[UI_BITMAP] & 1;
2213
2214         ch_peer = mdev->p_uuid[UI_SIZE];
2215         ch_self = mdev->comm_bm_set;
2216
2217         switch (mdev->net_conf->after_sb_0p) {
2218         case ASB_CONSENSUS:
2219         case ASB_DISCARD_SECONDARY:
2220         case ASB_CALL_HELPER:
2221                 dev_err(DEV, "Configuration error.\n");
2222                 break;
2223         case ASB_DISCONNECT:
2224                 break;
2225         case ASB_DISCARD_YOUNGER_PRI:
2226                 if (self == 0 && peer == 1) {
2227                         rv = -1;
2228                         break;
2229                 }
2230                 if (self == 1 && peer == 0) {
2231                         rv =  1;
2232                         break;
2233                 }
2234                 /* Else fall through to one of the other strategies... */
2235         case ASB_DISCARD_OLDER_PRI:
2236                 if (self == 0 && peer == 1) {
2237                         rv = 1;
2238                         break;
2239                 }
2240                 if (self == 1 && peer == 0) {
2241                         rv = -1;
2242                         break;
2243                 }
2244                 /* Else fall through to one of the other strategies... */
2245                 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
2246                      "Using discard-least-changes instead\n");
2247         case ASB_DISCARD_ZERO_CHG:
2248                 if (ch_peer == 0 && ch_self == 0) {
2249                         rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2250                                 ? -1 : 1;
2251                         break;
2252                 } else {
2253                         if (ch_peer == 0) { rv =  1; break; }
2254                         if (ch_self == 0) { rv = -1; break; }
2255                 }
2256                 if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
2257                         break;
2258         case ASB_DISCARD_LEAST_CHG:
2259                 if      (ch_self < ch_peer)
2260                         rv = -1;
2261                 else if (ch_self > ch_peer)
2262                         rv =  1;
2263                 else /* ( ch_self == ch_peer ) */
2264                      /* Well, then use something else. */
2265                         rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2266                                 ? -1 : 1;
2267                 break;
2268         case ASB_DISCARD_LOCAL:
2269                 rv = -1;
2270                 break;
2271         case ASB_DISCARD_REMOTE:
2272                 rv =  1;
2273         }
2274
2275         return rv;
2276 }
2277
2278 static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2279 {
2280         int self, peer, hg, rv = -100;
2281
2282         self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2283         peer = mdev->p_uuid[UI_BITMAP] & 1;
2284
2285         switch (mdev->net_conf->after_sb_1p) {
2286         case ASB_DISCARD_YOUNGER_PRI:
2287         case ASB_DISCARD_OLDER_PRI:
2288         case ASB_DISCARD_LEAST_CHG:
2289         case ASB_DISCARD_LOCAL:
2290         case ASB_DISCARD_REMOTE:
2291                 dev_err(DEV, "Configuration error.\n");
2292                 break;
2293         case ASB_DISCONNECT:
2294                 break;
2295         case ASB_CONSENSUS:
2296                 hg = drbd_asb_recover_0p(mdev);
2297                 if (hg == -1 && mdev->state.role == R_SECONDARY)
2298                         rv = hg;
2299                 if (hg == 1  && mdev->state.role == R_PRIMARY)
2300                         rv = hg;
2301                 break;
2302         case ASB_VIOLENTLY:
2303                 rv = drbd_asb_recover_0p(mdev);
2304                 break;
2305         case ASB_DISCARD_SECONDARY:
2306                 return mdev->state.role == R_PRIMARY ? 1 : -1;
2307         case ASB_CALL_HELPER:
2308                 hg = drbd_asb_recover_0p(mdev);
2309                 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2310                         self = drbd_set_role(mdev, R_SECONDARY, 0);
2311                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2312                           * we might be here in C_WF_REPORT_PARAMS which is transient.
2313                           * we do not need to wait for the after state change work either. */
2314                         self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2315                         if (self != SS_SUCCESS) {
2316                                 drbd_khelper(mdev, "pri-lost-after-sb");
2317                         } else {
2318                                 dev_warn(DEV, "Successfully gave up primary role.\n");
2319                                 rv = hg;
2320                         }
2321                 } else
2322                         rv = hg;
2323         }
2324
2325         return rv;
2326 }
2327
2328 static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2329 {
2330         int self, peer, hg, rv = -100;
2331
2332         self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2333         peer = mdev->p_uuid[UI_BITMAP] & 1;
2334
2335         switch (mdev->net_conf->after_sb_2p) {
2336         case ASB_DISCARD_YOUNGER_PRI:
2337         case ASB_DISCARD_OLDER_PRI:
2338         case ASB_DISCARD_LEAST_CHG:
2339         case ASB_DISCARD_LOCAL:
2340         case ASB_DISCARD_REMOTE:
2341         case ASB_CONSENSUS:
2342         case ASB_DISCARD_SECONDARY:
2343                 dev_err(DEV, "Configuration error.\n");
2344                 break;
2345         case ASB_VIOLENTLY:
2346                 rv = drbd_asb_recover_0p(mdev);
2347                 break;
2348         case ASB_DISCONNECT:
2349                 break;
2350         case ASB_CALL_HELPER:
2351                 hg = drbd_asb_recover_0p(mdev);
2352                 if (hg == -1) {
2353                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2354                           * we might be here in C_WF_REPORT_PARAMS which is transient.
2355                           * we do not need to wait for the after state change work either. */
2356                         self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2357                         if (self != SS_SUCCESS) {
2358                                 drbd_khelper(mdev, "pri-lost-after-sb");
2359                         } else {
2360                                 dev_warn(DEV, "Successfully gave up primary role.\n");
2361                                 rv = hg;
2362                         }
2363                 } else
2364                         rv = hg;
2365         }
2366
2367         return rv;
2368 }
2369
2370 static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2371                            u64 bits, u64 flags)
2372 {
2373         if (!uuid) {
2374                 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2375                 return;
2376         }
2377         dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2378              text,
2379              (unsigned long long)uuid[UI_CURRENT],
2380              (unsigned long long)uuid[UI_BITMAP],
2381              (unsigned long long)uuid[UI_HISTORY_START],
2382              (unsigned long long)uuid[UI_HISTORY_END],
2383              (unsigned long long)bits,
2384              (unsigned long long)flags);
2385 }
2386
2387 /*
2388   100   after split brain try auto recover
2389     2   C_SYNC_SOURCE set BitMap
2390     1   C_SYNC_SOURCE use BitMap
2391     0   no Sync
2392    -1   C_SYNC_TARGET use BitMap
2393    -2   C_SYNC_TARGET set BitMap
2394  -100   after split brain, disconnect
2395 -1000   unrelated data
2396  */
2397 static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2398 {
2399         u64 self, peer;
2400         int i, j;
2401
2402         self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2403         peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2404
2405         *rule_nr = 10;
2406         if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2407                 return 0;
2408
2409         *rule_nr = 20;
2410         if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2411              peer != UUID_JUST_CREATED)
2412                 return -2;
2413
2414         *rule_nr = 30;
2415         if (self != UUID_JUST_CREATED &&
2416             (peer == UUID_JUST_CREATED || peer == (u64)0))
2417                 return 2;
2418
2419         if (self == peer) {
2420                 int rct, dc; /* roles at crash time */
2421
2422                 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2423
2424                         if (mdev->agreed_pro_version < 91)
2425                                 return -1001;
2426
2427                         if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2428                             (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2429                                 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2430                                 drbd_uuid_set_bm(mdev, 0UL);
2431
2432                                 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2433                                                mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2434                                 *rule_nr = 34;
2435                         } else {
2436                                 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2437                                 *rule_nr = 36;
2438                         }
2439
2440                         return 1;
2441                 }
2442
2443                 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2444
2445                         if (mdev->agreed_pro_version < 91)
2446                                 return -1001;
2447
2448                         if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2449                             (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2450                                 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2451
2452                                 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2453                                 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2454                                 mdev->p_uuid[UI_BITMAP] = 0UL;
2455
2456                                 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2457                                 *rule_nr = 35;
2458                         } else {
2459                                 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2460                                 *rule_nr = 37;
2461                         }
2462
2463                         return -1;
2464                 }
2465
2466                 /* Common power [off|failure] */
2467                 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2468                         (mdev->p_uuid[UI_FLAGS] & 2);
2469                 /* lowest bit is set when we were primary,
2470                  * next bit (weight 2) is set when peer was primary */
2471                 *rule_nr = 40;
2472
2473                 switch (rct) {
2474                 case 0: /* !self_pri && !peer_pri */ return 0;
2475                 case 1: /*  self_pri && !peer_pri */ return 1;
2476                 case 2: /* !self_pri &&  peer_pri */ return -1;
2477                 case 3: /*  self_pri &&  peer_pri */
2478                         dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
2479                         return dc ? -1 : 1;
2480                 }
2481         }
2482
2483         *rule_nr = 50;
2484         peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2485         if (self == peer)
2486                 return -1;
2487
2488         *rule_nr = 51;
2489         peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2490         if (self == peer) {
2491                 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2492                 peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
2493                 if (self == peer) {
2494                         /* The last P_SYNC_UUID did not get though. Undo the last start of
2495                            resync as sync source modifications of the peer's UUIDs. */
2496
2497                         if (mdev->agreed_pro_version < 91)
2498                                 return -1001;
2499
2500                         mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2501                         mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
2502                         return -1;
2503                 }
2504         }
2505
2506         *rule_nr = 60;
2507         self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2508         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2509                 peer = mdev->p_uuid[i] & ~((u64)1);
2510                 if (self == peer)
2511                         return -2;
2512         }
2513
2514         *rule_nr = 70;
2515         self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2516         peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2517         if (self == peer)
2518                 return 1;
2519
2520         *rule_nr = 71;
2521         self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2522         if (self == peer) {
2523                 self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
2524                 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2525                 if (self == peer) {
2526                         /* The last P_SYNC_UUID did not get though. Undo the last start of
2527                            resync as sync source modifications of our UUIDs. */
2528
2529                         if (mdev->agreed_pro_version < 91)
2530                                 return -1001;
2531
2532                         _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2533                         _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2534
2535                         dev_info(DEV, "Undid last start of resync:\n");
2536
2537                         drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2538                                        mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2539
2540                         return 1;
2541                 }
2542         }
2543
2544
2545         *rule_nr = 80;
2546         peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2547         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2548                 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2549                 if (self == peer)
2550                         return 2;
2551         }
2552
2553         *rule_nr = 90;
2554         self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2555         peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2556         if (self == peer && self != ((u64)0))
2557                 return 100;
2558
2559         *rule_nr = 100;
2560         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2561                 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2562                 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2563                         peer = mdev->p_uuid[j] & ~((u64)1);
2564                         if (self == peer)
2565                                 return -100;
2566                 }
2567         }
2568
2569         return -1000;
2570 }
2571
2572 /* drbd_sync_handshake() returns the new conn state on success, or
2573    CONN_MASK (-1) on failure.
2574  */
2575 static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2576                                            enum drbd_disk_state peer_disk) __must_hold(local)
2577 {
2578         int hg, rule_nr;
2579         enum drbd_conns rv = C_MASK;
2580         enum drbd_disk_state mydisk;
2581
2582         mydisk = mdev->state.disk;
2583         if (mydisk == D_NEGOTIATING)
2584                 mydisk = mdev->new_state_tmp.disk;
2585
2586         dev_info(DEV, "drbd_sync_handshake:\n");
2587         drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2588         drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2589                        mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2590
2591         hg = drbd_uuid_compare(mdev, &rule_nr);
2592
2593         dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2594
2595         if (hg == -1000) {
2596                 dev_alert(DEV, "Unrelated data, aborting!\n");
2597                 return C_MASK;
2598         }
2599         if (hg == -1001) {
2600                 dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
2601                 return C_MASK;
2602         }
2603
2604         if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2605             (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
2606                 int f = (hg == -100) || abs(hg) == 2;
2607                 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2608                 if (f)
2609                         hg = hg*2;
2610                 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2611                      hg > 0 ? "source" : "target");
2612         }
2613
2614         if (abs(hg) == 100)
2615                 drbd_khelper(mdev, "initial-split-brain");
2616
2617         if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2618                 int pcount = (mdev->state.role == R_PRIMARY)
2619                            + (peer_role == R_PRIMARY);
2620                 int forced = (hg == -100);
2621
2622                 switch (pcount) {
2623                 case 0:
2624                         hg = drbd_asb_recover_0p(mdev);
2625                         break;
2626                 case 1:
2627                         hg = drbd_asb_recover_1p(mdev);
2628                         break;
2629                 case 2:
2630                         hg = drbd_asb_recover_2p(mdev);
2631                         break;
2632                 }
2633                 if (abs(hg) < 100) {
2634                         dev_warn(DEV, "Split-Brain detected, %d primaries, "
2635                              "automatically solved. Sync from %s node\n",
2636                              pcount, (hg < 0) ? "peer" : "this");
2637                         if (forced) {
2638                                 dev_warn(DEV, "Doing a full sync, since"
2639                                      " UUIDs where ambiguous.\n");
2640                                 hg = hg*2;
2641                         }
2642                 }
2643         }
2644
2645         if (hg == -100) {
2646                 if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
2647                         hg = -1;
2648                 if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
2649                         hg = 1;
2650
2651                 if (abs(hg) < 100)
2652                         dev_warn(DEV, "Split-Brain detected, manually solved. "
2653                              "Sync from %s node\n",
2654                              (hg < 0) ? "peer" : "this");
2655         }
2656
2657         if (hg == -100) {
2658                 /* FIXME this log message is not correct if we end up here
2659                  * after an attempted attach on a diskless node.
2660                  * We just refuse to attach -- well, we drop the "connection"
2661                  * to that disk, in a way... */
2662                 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
2663                 drbd_khelper(mdev, "split-brain");
2664                 return C_MASK;
2665         }
2666
2667         if (hg > 0 && mydisk <= D_INCONSISTENT) {
2668                 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
2669                 return C_MASK;
2670         }
2671
2672         if (hg < 0 && /* by intention we do not use mydisk here. */
2673             mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
2674                 switch (mdev->net_conf->rr_conflict) {
2675                 case ASB_CALL_HELPER:
2676                         drbd_khelper(mdev, "pri-lost");
2677                         /* fall through */
2678                 case ASB_DISCONNECT:
2679                         dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
2680                         return C_MASK;
2681                 case ASB_VIOLENTLY:
2682                         dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
2683                              "assumption\n");
2684                 }
2685         }
2686
2687         if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) {
2688                 if (hg == 0)
2689                         dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
2690                 else
2691                         dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.",
2692                                  drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
2693                                  abs(hg) >= 2 ? "full" : "bit-map based");
2694                 return C_MASK;
2695         }
2696
2697         if (abs(hg) >= 2) {
2698                 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
2699                 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
2700                         return C_MASK;
2701         }
2702
2703         if (hg > 0) { /* become sync source. */
2704                 rv = C_WF_BITMAP_S;
2705         } else if (hg < 0) { /* become sync target */
2706                 rv = C_WF_BITMAP_T;
2707         } else {
2708                 rv = C_CONNECTED;
2709                 if (drbd_bm_total_weight(mdev)) {
2710                         dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
2711                              drbd_bm_total_weight(mdev));
2712                 }
2713         }
2714
2715         return rv;
2716 }
2717
2718 /* returns 1 if invalid */
2719 static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2720 {
2721         /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
2722         if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
2723             (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
2724                 return 0;
2725
2726         /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
2727         if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
2728             self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
2729                 return 1;
2730
2731         /* everything else is valid if they are equal on both sides. */
2732         if (peer == self)
2733                 return 0;
2734
2735         /* everything es is invalid. */
2736         return 1;
2737 }
2738
2739 static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
2740 {
2741         struct p_protocol *p = &mdev->data.rbuf.protocol;
2742         int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
2743         int p_want_lose, p_two_primaries, cf;
2744         char p_integrity_alg[SHARED_SECRET_MAX] = "";
2745
2746         p_proto         = be32_to_cpu(p->protocol);
2747         p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
2748         p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
2749         p_after_sb_2p   = be32_to_cpu(p->after_sb_2p);
2750         p_two_primaries = be32_to_cpu(p->two_primaries);
2751         cf              = be32_to_cpu(p->conn_flags);
2752         p_want_lose = cf & CF_WANT_LOSE;
2753
2754         clear_bit(CONN_DRY_RUN, &mdev->flags);
2755
2756         if (cf & CF_DRY_RUN)
2757                 set_bit(CONN_DRY_RUN, &mdev->flags);
2758
2759         if (p_proto != mdev->net_conf->wire_protocol) {
2760                 dev_err(DEV, "incompatible communication protocols\n");
2761                 goto disconnect;
2762         }
2763
2764         if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
2765                 dev_err(DEV, "incompatible after-sb-0pri settings\n");
2766                 goto disconnect;
2767         }
2768
2769         if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
2770                 dev_err(DEV, "incompatible after-sb-1pri settings\n");
2771                 goto disconnect;
2772         }
2773
2774         if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
2775                 dev_err(DEV, "incompatible after-sb-2pri settings\n");
2776                 goto disconnect;
2777         }
2778
2779         if (p_want_lose && mdev->net_conf->want_lose) {
2780                 dev_err(DEV, "both sides have the 'want_lose' flag set\n");
2781                 goto disconnect;
2782         }
2783
2784         if (p_two_primaries != mdev->net_conf->two_primaries) {
2785                 dev_err(DEV, "incompatible setting of the two-primaries options\n");
2786                 goto disconnect;
2787         }
2788
2789         if (mdev->agreed_pro_version >= 87) {
2790                 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2791
2792                 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
2793                         return FALSE;
2794
2795                 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2796                 if (strcmp(p_integrity_alg, my_alg)) {
2797                         dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
2798                         goto disconnect;
2799                 }
2800                 dev_info(DEV, "data-integrity-alg: %s\n",
2801                      my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2802         }
2803
2804         return TRUE;
2805
2806 disconnect:
2807         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2808         return FALSE;
2809 }
2810
2811 /* helper function
2812  * input: alg name, feature name
2813  * return: NULL (alg name was "")
2814  *         ERR_PTR(error) if something goes wrong
2815  *         or the crypto hash ptr, if it worked out ok. */
2816 struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2817                 const char *alg, const char *name)
2818 {
2819         struct crypto_hash *tfm;
2820
2821         if (!alg[0])
2822                 return NULL;
2823
2824         tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
2825         if (IS_ERR(tfm)) {
2826                 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
2827                         alg, name, PTR_ERR(tfm));
2828                 return tfm;
2829         }
2830         if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
2831                 crypto_free_hash(tfm);
2832                 dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
2833                 return ERR_PTR(-EINVAL);
2834         }
2835         return tfm;
2836 }
2837
2838 static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
2839 {
2840         int ok = TRUE;
2841         struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
2842         unsigned int header_size, data_size, exp_max_sz;
2843         struct crypto_hash *verify_tfm = NULL;
2844         struct crypto_hash *csums_tfm = NULL;
2845         const int apv = mdev->agreed_pro_version;
2846         int *rs_plan_s = NULL;
2847         int fifo_size = 0;
2848
2849         exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
2850                     : apv == 88 ? sizeof(struct p_rs_param)
2851                                         + SHARED_SECRET_MAX
2852                     : apv <= 94 ? sizeof(struct p_rs_param_89)
2853                     : /* apv >= 95 */ sizeof(struct p_rs_param_95);
2854
2855         if (packet_size > exp_max_sz) {
2856                 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
2857                     packet_size, exp_max_sz);
2858                 return FALSE;
2859         }
2860
2861         if (apv <= 88) {
2862                 header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
2863                 data_size   = packet_size  - header_size;
2864         } else if (apv <= 94) {
2865                 header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
2866                 data_size   = packet_size  - header_size;
2867                 D_ASSERT(data_size == 0);
2868         } else {
2869                 header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
2870                 data_size   = packet_size  - header_size;
2871                 D_ASSERT(data_size == 0);
2872         }
2873
2874         /* initialize verify_alg and csums_alg */
2875         memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2876
2877         if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
2878                 return FALSE;
2879
2880         mdev->sync_conf.rate      = be32_to_cpu(p->rate);
2881
2882         if (apv >= 88) {
2883                 if (apv == 88) {
2884                         if (data_size > SHARED_SECRET_MAX) {
2885                                 dev_err(DEV, "verify-alg too long, "
2886                                     "peer wants %u, accepting only %u byte\n",
2887                                                 data_size, SHARED_SECRET_MAX);
2888                                 return FALSE;
2889                         }
2890
2891                         if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
2892                                 return FALSE;
2893
2894                         /* we expect NUL terminated string */
2895                         /* but just in case someone tries to be evil */
2896                         D_ASSERT(p->verify_alg[data_size-1] == 0);
2897                         p->verify_alg[data_size-1] = 0;
2898
2899                 } else /* apv >= 89 */ {
2900                         /* we still expect NUL terminated strings */
2901                         /* but just in case someone tries to be evil */
2902                         D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
2903                         D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
2904                         p->verify_alg[SHARED_SECRET_MAX-1] = 0;
2905                         p->csums_alg[SHARED_SECRET_MAX-1] = 0;
2906                 }
2907
2908                 if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
2909                         if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2910                                 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2911                                     mdev->sync_conf.verify_alg, p->verify_alg);
2912                                 goto disconnect;
2913                         }
2914                         verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
2915                                         p->verify_alg, "verify-alg");
2916                         if (IS_ERR(verify_tfm)) {
2917                                 verify_tfm = NULL;
2918                                 goto disconnect;
2919                         }
2920                 }
2921
2922                 if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
2923                         if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2924                                 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2925                                     mdev->sync_conf.csums_alg, p->csums_alg);
2926                                 goto disconnect;
2927                         }
2928                         csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
2929                                         p->csums_alg, "csums-alg");
2930                         if (IS_ERR(csums_tfm)) {
2931                                 csums_tfm = NULL;
2932                                 goto disconnect;
2933                         }
2934                 }
2935
2936                 if (apv > 94) {
2937                         mdev->sync_conf.rate      = be32_to_cpu(p->rate);
2938                         mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
2939                         mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
2940                         mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
2941                         mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
2942
2943                         fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
2944                         if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
2945                                 rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
2946                                 if (!rs_plan_s) {
2947                                         dev_err(DEV, "kmalloc of fifo_buffer failed");
2948                                         goto disconnect;
2949                                 }
2950                         }
2951                 }
2952
2953                 spin_lock(&mdev->peer_seq_lock);
2954                 /* lock against drbd_nl_syncer_conf() */
2955                 if (verify_tfm) {
2956                         strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
2957                         mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
2958                         crypto_free_hash(mdev->verify_tfm);
2959                         mdev->verify_tfm = verify_tfm;
2960                         dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
2961                 }
2962                 if (csums_tfm) {
2963                         strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
2964                         mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
2965                         crypto_free_hash(mdev->csums_tfm);
2966                         mdev->csums_tfm = csums_tfm;
2967                         dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2968                 }
2969                 if (fifo_size != mdev->rs_plan_s.size) {
2970                         kfree(mdev->rs_plan_s.values);
2971                         mdev->rs_plan_s.values = rs_plan_s;
2972                         mdev->rs_plan_s.size   = fifo_size;
2973                         mdev->rs_planed = 0;
2974                 }
2975                 spin_unlock(&mdev->peer_seq_lock);
2976         }
2977
2978         return ok;
2979 disconnect:
2980         /* just for completeness: actually not needed,
2981          * as this is not reached if csums_tfm was ok. */
2982         crypto_free_hash(csums_tfm);
2983         /* but free the verify_tfm again, if csums_tfm did not work out */
2984         crypto_free_hash(verify_tfm);
2985         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2986         return FALSE;
2987 }
2988
2989 static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
2990 {
2991         /* sorry, we currently have no working implementation
2992          * of distributed TCQ */
2993 }
2994
2995 /* warn if the arguments differ by more than 12.5% */
2996 static void warn_if_differ_considerably(struct drbd_conf *mdev,
2997         const char *s, sector_t a, sector_t b)
2998 {
2999         sector_t d;
3000         if (a == 0 || b == 0)
3001                 return;
3002         d = (a > b) ? (a - b) : (b - a);
3003         if (d > (a>>3) || d > (b>>3))
3004                 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
3005                      (unsigned long long)a, (unsigned long long)b);
3006 }
3007
3008 static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3009 {
3010         struct p_sizes *p = &mdev->data.rbuf.sizes;
3011         enum determine_dev_size dd = unchanged;
3012         unsigned int max_seg_s;
3013         sector_t p_size, p_usize, my_usize;
3014         int ldsc = 0; /* local disk size changed */
3015         enum dds_flags ddsf;
3016
3017         p_size = be64_to_cpu(p->d_size);
3018         p_usize = be64_to_cpu(p->u_size);
3019
3020         if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
3021                 dev_err(DEV, "some backing storage is needed\n");
3022                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3023                 return FALSE;
3024         }
3025
3026         /* just store the peer's disk size for now.
3027          * we still need to figure out whether we accept that. */
3028         mdev->p_size = p_size;
3029
3030 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
3031         if (get_ldev(mdev)) {
3032                 warn_if_differ_considerably(mdev, "lower level device sizes",
3033                            p_size, drbd_get_max_capacity(mdev->ldev));
3034                 warn_if_differ_considerably(mdev, "user requested size",
3035                                             p_usize, mdev->ldev->dc.disk_size);
3036
3037                 /* if this is the first connect, or an otherwise expected
3038                  * param exchange, choose the minimum */
3039                 if (mdev->state.conn == C_WF_REPORT_PARAMS)
3040                         p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
3041                                              p_usize);
3042
3043                 my_usize = mdev->ldev->dc.disk_size;
3044
3045                 if (mdev->ldev->dc.disk_size != p_usize) {
3046                         mdev->ldev->dc.disk_size = p_usize;
3047                         dev_info(DEV, "Peer sets u_size to %lu sectors\n",
3048                              (unsigned long)mdev->ldev->dc.disk_size);
3049                 }
3050
3051                 /* Never shrink a device with usable data during connect.
3052                    But allow online shrinking if we are connected. */
3053                 if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
3054                    drbd_get_capacity(mdev->this_bdev) &&
3055                    mdev->state.disk >= D_OUTDATED &&
3056                    mdev->state.conn < C_CONNECTED) {
3057                         dev_err(DEV, "The peer's disk size is too small!\n");
3058                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3059                         mdev->ldev->dc.disk_size = my_usize;
3060                         put_ldev(mdev);
3061                         return FALSE;
3062                 }
3063                 put_ldev(mdev);
3064         }
3065 #undef min_not_zero
3066
3067         ddsf = be16_to_cpu(p->dds_flags);
3068         if (get_ldev(mdev)) {
3069                 dd = drbd_determin_dev_size(mdev, ddsf);
3070                 put_ldev(mdev);
3071                 if (dd == dev_size_error)
3072                         return FALSE;
3073                 drbd_md_sync(mdev);
3074         } else {
3075                 /* I am diskless, need to accept the peer's size. */
3076                 drbd_set_my_capacity(mdev, p_size);
3077         }
3078
3079         if (get_ldev(mdev)) {
3080                 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3081                         mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3082                         ldsc = 1;
3083                 }
3084
3085                 if (mdev->agreed_pro_version < 94)
3086                         max_seg_s = be32_to_cpu(p->max_segment_size);
3087                 else /* drbd 8.3.8 onwards */
3088                         max_seg_s = DRBD_MAX_SEGMENT_SIZE;
3089
3090                 if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
3091                         drbd_setup_queue_param(mdev, max_seg_s);
3092
3093                 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
3094                 put_ldev(mdev);
3095         }
3096
3097         if (mdev->state.conn > C_WF_REPORT_PARAMS) {
3098                 if (be64_to_cpu(p->c_size) !=
3099                     drbd_get_capacity(mdev->this_bdev) || ldsc) {
3100                         /* we have different sizes, probably peer
3101                          * needs to know my new size... */
3102                         drbd_send_sizes(mdev, 0, ddsf);
3103                 }
3104                 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
3105                     (dd == grew && mdev->state.conn == C_CONNECTED)) {
3106                         if (mdev->state.pdsk >= D_INCONSISTENT &&
3107                             mdev->state.disk >= D_INCONSISTENT) {
3108                                 if (ddsf & DDSF_NO_RESYNC)
3109                                         dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3110                                 else
3111                                         resync_after_online_grow(mdev);
3112                         } else
3113                                 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
3114                 }
3115         }
3116
3117         return TRUE;
3118 }
3119
3120 static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3121 {
3122         struct p_uuids *p = &mdev->data.rbuf.uuids;
3123         u64 *p_uuid;
3124         int i;
3125
3126         p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3127
3128         for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3129                 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3130
3131         kfree(mdev->p_uuid);
3132         mdev->p_uuid = p_uuid;
3133
3134         if (mdev->state.conn < C_CONNECTED &&
3135             mdev->state.disk < D_INCONSISTENT &&
3136             mdev->state.role == R_PRIMARY &&
3137             (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3138                 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3139                     (unsigned long long)mdev->ed_uuid);
3140                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3141                 return FALSE;
3142         }
3143
3144         if (get_ldev(mdev)) {
3145                 int skip_initial_sync =
3146                         mdev->state.conn == C_CONNECTED &&
3147                         mdev->agreed_pro_version >= 90 &&
3148                         mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3149                         (p_uuid[UI_FLAGS] & 8);
3150                 if (skip_initial_sync) {
3151                         dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3152                         drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
3153                                         "clear_n_write from receive_uuids");
3154                         _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3155                         _drbd_uuid_set(mdev, UI_BITMAP, 0);
3156                         _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3157                                         CS_VERBOSE, NULL);
3158                         drbd_md_sync(mdev);
3159                 }
3160                 put_ldev(mdev);
3161         } else if (mdev->state.disk < D_INCONSISTENT &&
3162                    mdev->state.role == R_PRIMARY) {
3163                 /* I am a diskless primary, the peer just created a new current UUID
3164                    for me. */
3165                 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3166         }
3167
3168         /* Before we test for the disk state, we should wait until an eventually
3169            ongoing cluster wide state change is finished. That is important if
3170            we are primary and are detaching from our disk. We need to see the
3171            new disk state... */
3172         wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
3173         if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
3174                 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3175
3176         return TRUE;
3177 }
3178
3179 /**
3180  * convert_state() - Converts the peer's view of the cluster state to our point of view
3181  * @ps:         The state as seen by the peer.
3182  */
3183 static union drbd_state convert_state(union drbd_state ps)
3184 {
3185         union drbd_state ms;
3186
3187         static enum drbd_conns c_tab[] = {
3188                 [C_CONNECTED] = C_CONNECTED,
3189
3190                 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3191                 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3192                 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3193                 [C_VERIFY_S]       = C_VERIFY_T,
3194                 [C_MASK]   = C_MASK,
3195         };
3196
3197         ms.i = ps.i;
3198
3199         ms.conn = c_tab[ps.conn];
3200         ms.peer = ps.role;
3201         ms.role = ps.peer;
3202         ms.pdsk = ps.disk;
3203         ms.disk = ps.pdsk;
3204         ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3205
3206         return ms;
3207 }
3208
3209 static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3210 {
3211         struct p_req_state *p = &mdev->data.rbuf.req_state;
3212         union drbd_state mask, val;
3213         int rv;
3214
3215         mask.i = be32_to_cpu(p->mask);
3216         val.i = be32_to_cpu(p->val);
3217
3218         if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3219             test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3220                 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
3221                 return TRUE;
3222         }
3223
3224         mask = convert_state(mask);
3225         val = convert_state(val);
3226
3227         rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3228
3229         drbd_send_sr_reply(mdev, rv);
3230         drbd_md_sync(mdev);
3231
3232         return TRUE;
3233 }
3234
3235 static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3236 {
3237         struct p_state *p = &mdev->data.rbuf.state;
3238         enum drbd_conns nconn, oconn;
3239         union drbd_state ns, peer_state;
3240         enum drbd_disk_state real_peer_disk;
3241         enum chg_state_flags cs_flags;
3242         int rv;
3243
3244         peer_state.i = be32_to_cpu(p->state);
3245
3246         real_peer_disk = peer_state.disk;
3247         if (peer_state.disk == D_NEGOTIATING) {
3248                 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3249                 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3250         }
3251
3252         spin_lock_irq(&mdev->req_lock);
3253  retry:
3254         oconn = nconn = mdev->state.conn;
3255         spin_unlock_irq(&mdev->req_lock);
3256
3257         if (nconn == C_WF_REPORT_PARAMS)
3258                 nconn = C_CONNECTED;
3259
3260         if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3261             get_ldev_if_state(mdev, D_NEGOTIATING)) {
3262                 int cr; /* consider resync */
3263
3264                 /* if we established a new connection */
3265                 cr  = (oconn < C_CONNECTED);
3266                 /* if we had an established connection
3267                  * and one of the nodes newly attaches a disk */
3268                 cr |= (oconn == C_CONNECTED &&
3269                        (peer_state.disk == D_NEGOTIATING ||
3270                         mdev->state.disk == D_NEGOTIATING));
3271                 /* if we have both been inconsistent, and the peer has been
3272                  * forced to be UpToDate with --overwrite-data */
3273                 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3274                 /* if we had been plain connected, and the admin requested to
3275                  * start a sync by "invalidate" or "invalidate-remote" */
3276                 cr |= (oconn == C_CONNECTED &&
3277                                 (peer_state.conn >= C_STARTING_SYNC_S &&
3278                                  peer_state.conn <= C_WF_BITMAP_T));
3279
3280                 if (cr)
3281                         nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
3282
3283                 put_ldev(mdev);
3284                 if (nconn == C_MASK) {
3285                         nconn = C_CONNECTED;
3286                         if (mdev->state.disk == D_NEGOTIATING) {
3287                                 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3288                         } else if (peer_state.disk == D_NEGOTIATING) {
3289                                 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3290                                 peer_state.disk = D_DISKLESS;
3291                                 real_peer_disk = D_DISKLESS;
3292                         } else {
3293                                 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
3294                                         return FALSE;
3295                                 D_ASSERT(oconn == C_WF_REPORT_PARAMS);
3296                                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3297                                 return FALSE;
3298                         }
3299                 }
3300         }
3301
3302         spin_lock_irq(&mdev->req_lock);
3303         if (mdev->state.conn != oconn)
3304                 goto retry;
3305         clear_bit(CONSIDER_RESYNC, &mdev->flags);
3306         ns.i = mdev->state.i;
3307         ns.conn = nconn;
3308         ns.peer = peer_state.role;
3309         ns.pdsk = real_peer_disk;
3310         ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
3311         if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
3312                 ns.disk = mdev->new_state_tmp.disk;
3313         cs_flags = CS_VERBOSE + (oconn < C_CONNECTED && nconn >= C_CONNECTED ? 0 : CS_HARD);
3314         if (ns.pdsk == D_CONSISTENT && ns.susp && nconn == C_CONNECTED && oconn < C_CONNECTED &&
3315             test_bit(NEW_CUR_UUID, &mdev->flags)) {
3316                 /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
3317                    for temporal network outages! */
3318                 spin_unlock_irq(&mdev->req_lock);
3319                 dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
3320                 tl_clear(mdev);
3321                 drbd_uuid_new_current(mdev);
3322                 clear_bit(NEW_CUR_UUID, &mdev->flags);
3323                 drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
3324                 return FALSE;
3325         }
3326         rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
3327         ns = mdev->state;
3328         spin_unlock_irq(&mdev->req_lock);
3329
3330         if (rv < SS_SUCCESS) {
3331                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3332                 return FALSE;
3333         }
3334
3335         if (oconn > C_WF_REPORT_PARAMS) {
3336                 if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
3337                     peer_state.disk != D_NEGOTIATING ) {
3338                         /* we want resync, peer has not yet decided to sync... */
3339                         /* Nowadays only used when forcing a node into primary role and
3340                            setting its disk to UpToDate with that */
3341                         drbd_send_uuids(mdev);
3342                         drbd_send_state(mdev);
3343                 }
3344         }
3345
3346         mdev->net_conf->want_lose = 0;
3347
3348         drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3349
3350         return TRUE;
3351 }
3352
3353 static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3354 {
3355         struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
3356
3357         wait_event(mdev->misc_wait,
3358                    mdev->state.conn == C_WF_SYNC_UUID ||
3359                    mdev->state.conn < C_CONNECTED ||
3360                    mdev->state.disk < D_NEGOTIATING);
3361
3362         /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3363
3364         /* Here the _drbd_uuid_ functions are right, current should
3365            _not_ be rotated into the history */
3366         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3367                 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3368                 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3369
3370                 drbd_start_resync(mdev, C_SYNC_TARGET);
3371
3372                 put_ldev(mdev);
3373         } else
3374                 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3375
3376         return TRUE;
3377 }
3378
3379 enum receive_bitmap_ret { OK, DONE, FAILED };
3380
3381 static enum receive_bitmap_ret
3382 receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
3383                      unsigned long *buffer, struct bm_xfer_ctx *c)
3384 {
3385         unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3386         unsigned want = num_words * sizeof(long);
3387
3388         if (want != data_size) {
3389                 dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
3390                 return FAILED;
3391         }
3392         if (want == 0)
3393                 return DONE;
3394         if (drbd_recv(mdev, buffer, want) != want)
3395                 return FAILED;
3396
3397         drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3398
3399         c->word_offset += num_words;
3400         c->bit_offset = c->word_offset * BITS_PER_LONG;
3401         if (c->bit_offset > c->bm_bits)
3402                 c->bit_offset = c->bm_bits;
3403
3404         return OK;
3405 }
3406
3407 static enum receive_bitmap_ret
3408 recv_bm_rle_bits(struct drbd_conf *mdev,
3409                 struct p_compressed_bm *p,
3410                 struct bm_xfer_ctx *c)
3411 {
3412         struct bitstream bs;
3413         u64 look_ahead;
3414         u64 rl;
3415         u64 tmp;
3416         unsigned long s = c->bit_offset;
3417         unsigned long e;
3418         int len = p->head.length - (sizeof(*p) - sizeof(p->head));
3419         int toggle = DCBP_get_start(p);
3420         int have;
3421         int bits;
3422
3423         bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
3424
3425         bits = bitstream_get_bits(&bs, &look_ahead, 64);
3426         if (bits < 0)
3427                 return FAILED;
3428
3429         for (have = bits; have > 0; s += rl, toggle = !toggle) {
3430                 bits = vli_decode_bits(&rl, look_ahead);
3431                 if (bits <= 0)
3432                         return FAILED;
3433
3434                 if (toggle) {
3435                         e = s + rl -1;
3436                         if (e >= c->bm_bits) {
3437                                 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
3438                                 return FAILED;
3439                         }
3440                         _drbd_bm_set_bits(mdev, s, e);
3441                 }
3442
3443                 if (have < bits) {
3444                         dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
3445                                 have, bits, look_ahead,
3446                                 (unsigned int)(bs.cur.b - p->code),
3447                                 (unsigned int)bs.buf_len);
3448                         return FAILED;
3449                 }
3450                 look_ahead >>= bits;
3451                 have -= bits;
3452
3453                 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3454                 if (bits < 0)
3455                         return FAILED;
3456                 look_ahead |= tmp << have;
3457                 have += bits;
3458         }
3459
3460         c->bit_offset = s;
3461         bm_xfer_ctx_bit_to_word_offset(c);
3462
3463         return (s == c->bm_bits) ? DONE : OK;
3464 }
3465
3466 static enum receive_bitmap_ret
3467 decode_bitmap_c(struct drbd_conf *mdev,
3468                 struct p_compressed_bm *p,
3469                 struct bm_xfer_ctx *c)
3470 {
3471         if (DCBP_get_code(p) == RLE_VLI_Bits)
3472                 return recv_bm_rle_bits(mdev, p, c);
3473
3474         /* other variants had been implemented for evaluation,
3475          * but have been dropped as this one turned out to be "best"
3476          * during all our tests. */
3477
3478         dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
3479         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3480         return FAILED;
3481 }
3482
3483 void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3484                 const char *direction, struct bm_xfer_ctx *c)
3485 {
3486         /* what would it take to transfer it "plaintext" */
3487         unsigned plain = sizeof(struct p_header80) *
3488                 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3489                 + c->bm_words * sizeof(long);
3490         unsigned total = c->bytes[0] + c->bytes[1];
3491         unsigned r;
3492
3493         /* total can not be zero. but just in case: */
3494         if (total == 0)
3495                 return;
3496
3497         /* don't report if not compressed */
3498         if (total >= plain)
3499                 return;
3500
3501         /* total < plain. check for overflow, still */
3502         r = (total > UINT_MAX/1000) ? (total / (plain/1000))
3503                                     : (1000 * total / plain);
3504
3505         if (r > 1000)
3506                 r = 1000;
3507
3508         r = 1000 - r;
3509         dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
3510              "total %u; compression: %u.%u%%\n",
3511                         direction,
3512                         c->bytes[1], c->packets[1],
3513                         c->bytes[0], c->packets[0],
3514                         total, r/10, r % 10);
3515 }
3516
3517 /* Since we are processing the bitfield from lower addresses to higher,
3518    it does not matter if the process it in 32 bit chunks or 64 bit
3519    chunks as long as it is little endian. (Understand it as byte stream,
3520    beginning with the lowest byte...) If we would use big endian
3521    we would need to process it from the highest address to the lowest,
3522    in order to be agnostic to the 32 vs 64 bits issue.
3523
3524    returns 0 on failure, 1 if we successfully received it. */
3525 static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3526 {
3527         struct bm_xfer_ctx c;
3528         void *buffer;
3529         enum receive_bitmap_ret ret;
3530         int ok = FALSE;
3531         struct p_header80 *h = &mdev->data.rbuf.header.h80;
3532
3533         wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3534
3535         drbd_bm_lock(mdev, "receive bitmap");
3536
3537         /* maybe we should use some per thread scratch page,
3538          * and allocate that during initial device creation? */
3539         buffer   = (unsigned long *) __get_free_page(GFP_NOIO);
3540         if (!buffer) {
3541                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
3542                 goto out;
3543         }
3544
3545         c = (struct bm_xfer_ctx) {
3546                 .bm_bits = drbd_bm_bits(mdev),
3547                 .bm_words = drbd_bm_words(mdev),
3548         };
3549
3550         do {
3551                 if (cmd == P_BITMAP) {
3552                         ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
3553                 } else if (cmd == P_COMPRESSED_BITMAP) {
3554                         /* MAYBE: sanity check that we speak proto >= 90,
3555                          * and the feature is enabled! */
3556                         struct p_compressed_bm *p;
3557
3558                         if (data_size > BM_PACKET_PAYLOAD_BYTES) {
3559                                 dev_err(DEV, "ReportCBitmap packet too large\n");
3560                                 goto out;
3561                         }
3562                         /* use the page buff */
3563                         p = buffer;
3564                         memcpy(p, h, sizeof(*h));
3565                         if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
3566                                 goto out;
3567                         if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
3568                                 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
3569                                 return FAILED;
3570                         }
3571                         ret = decode_bitmap_c(mdev, p, &c);
3572                 } else {
3573                         dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
3574                         goto out;
3575                 }
3576
3577                 c.packets[cmd == P_BITMAP]++;
3578                 c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
3579
3580                 if (ret != OK)
3581                         break;
3582
3583                 if (!drbd_recv_header(mdev, &cmd, &data_size))
3584                         goto out;
3585         } while (ret == OK);
3586         if (ret == FAILED)
3587                 goto out;
3588
3589         INFO_bm_xfer_stats(mdev, "receive", &c);
3590
3591         if (mdev->state.conn == C_WF_BITMAP_T) {
3592                 ok = !drbd_send_bitmap(mdev);
3593                 if (!ok)
3594                         goto out;
3595                 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
3596                 ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3597                 D_ASSERT(ok == SS_SUCCESS);
3598         } else if (mdev->state.conn != C_WF_BITMAP_S) {
3599                 /* admin may have requested C_DISCONNECTING,
3600                  * other threads may have noticed network errors */
3601                 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
3602                     drbd_conn_str(mdev->state.conn));
3603         }
3604
3605         ok = TRUE;
3606  out:
3607         drbd_bm_unlock(mdev);
3608         if (ok && mdev->state.conn == C_WF_BITMAP_S)
3609                 drbd_start_resync(mdev, C_SYNC_SOURCE);
3610         free_page((unsigned long) buffer);
3611         return ok;
3612 }
3613
3614 static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3615 {
3616         /* TODO zero copy sink :) */
3617         static char sink[128];
3618         int size, want, r;
3619
3620         dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3621                  cmd, data_size);
3622
3623         size = data_size;
3624         while (size > 0) {
3625                 want = min_t(int, size, sizeof(sink));
3626                 r = drbd_recv(mdev, sink, want);
3627                 ERR_IF(r <= 0) break;
3628                 size -= r;
3629         }
3630         return size == 0;
3631 }
3632
3633 static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3634 {
3635         if (mdev->state.disk >= D_INCONSISTENT)
3636                 drbd_kick_lo(mdev);
3637
3638         /* Make sure we've acked all the TCP data associated
3639          * with the data requests being unplugged */
3640         drbd_tcp_quickack(mdev->data.socket);
3641
3642         return TRUE;
3643 }
3644
3645 typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
3646
3647 struct data_cmd {
3648         int expect_payload;
3649         size_t pkt_size;
3650         drbd_cmd_handler_f function;
3651 };
3652
3653 static struct data_cmd drbd_cmd_handler[] = {
3654         [P_DATA]            = { 1, sizeof(struct p_data), receive_Data },
3655         [P_DATA_REPLY]      = { 1, sizeof(struct p_data), receive_DataReply },
3656         [P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
3657         [P_BARRIER]         = { 0, sizeof(struct p_barrier), receive_Barrier } ,
3658         [P_BITMAP]          = { 1, sizeof(struct p_header80), receive_bitmap } ,
3659         [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
3660         [P_UNPLUG_REMOTE]   = { 0, sizeof(struct p_header80), receive_UnplugRemote },
3661         [P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
3662         [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3663         [P_SYNC_PARAM]      = { 1, sizeof(struct p_header80), receive_SyncParam },
3664         [P_SYNC_PARAM89]    = { 1, sizeof(struct p_header80), receive_SyncParam },
3665         [P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
3666         [P_UUIDS]           = { 0, sizeof(struct p_uuids), receive_uuids },
3667         [P_SIZES]           = { 0, sizeof(struct p_sizes), receive_sizes },
3668         [P_STATE]           = { 0, sizeof(struct p_state), receive_state },
3669         [P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
3670         [P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
3671         [P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
3672         [P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
3673         [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3674         [P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
3675         /* anything missing from this table is in
3676          * the asender_tbl, see get_asender_cmd */
3677         [P_MAX_CMD]         = { 0, 0, NULL },
3678 };
3679
3680 /* All handler functions that expect a sub-header get that sub-heder in
3681    mdev->data.rbuf.header.head.payload.
3682
3683    Usually in mdev->data.rbuf.header.head the callback can find the usual
3684    p_header, but they may not rely on that. Since there is also p_header95 !
3685  */
3686
3687 static void drbdd(struct drbd_conf *mdev)
3688 {
3689         union p_header *header = &mdev->data.rbuf.header;
3690         unsigned int packet_size;
3691         enum drbd_packets cmd;
3692         size_t shs; /* sub header size */
3693         int rv;
3694
3695         while (get_t_state(&mdev->receiver) == Running) {
3696                 drbd_thread_current_set_cpu(mdev);
3697                 if (!drbd_recv_header(mdev, &cmd, &packet_size))
3698                         goto err_out;
3699
3700                 if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
3701                         dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
3702                         goto err_out;
3703                 }
3704
3705                 shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
3706                 rv = drbd_recv(mdev, &header->h80.payload, shs);
3707                 if (unlikely(rv != shs)) {
3708                         dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
3709                         goto err_out;
3710                 }
3711
3712                 if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
3713                         dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
3714                         goto err_out;
3715                 }
3716
3717                 rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
3718
3719                 if (unlikely(!rv)) {
3720                         dev_err(DEV, "error receiving %s, l: %d!\n",
3721                             cmdname(cmd), packet_size);
3722                         goto err_out;
3723                 }
3724         }
3725
3726         if (0) {
3727         err_out:
3728                 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3729         }
3730 }
3731
3732 void drbd_flush_workqueue(struct drbd_conf *mdev)
3733 {
3734         struct drbd_wq_barrier barr;
3735
3736         barr.w.cb = w_prev_work_done;
3737         init_completion(&barr.done);
3738         drbd_queue_work(&mdev->data.work, &barr.w);
3739         wait_for_completion(&barr.done);
3740 }
3741
3742 void drbd_free_tl_hash(struct drbd_conf *mdev)
3743 {
3744         struct hlist_head *h;
3745
3746         spin_lock_irq(&mdev->req_lock);
3747
3748         if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
3749                 spin_unlock_irq(&mdev->req_lock);
3750                 return;
3751         }
3752         /* paranoia code */
3753         for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3754                 if (h->first)
3755                         dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3756                                 (int)(h - mdev->ee_hash), h->first);
3757         kfree(mdev->ee_hash);
3758         mdev->ee_hash = NULL;
3759         mdev->ee_hash_s = 0;
3760
3761         /* paranoia code */
3762         for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
3763                 if (h->first)
3764                         dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
3765                                 (int)(h - mdev->tl_hash), h->first);
3766         kfree(mdev->tl_hash);
3767         mdev->tl_hash = NULL;
3768         mdev->tl_hash_s = 0;
3769         spin_unlock_irq(&mdev->req_lock);
3770 }
3771
3772 static void drbd_disconnect(struct drbd_conf *mdev)
3773 {
3774         enum drbd_fencing_p fp;
3775         union drbd_state os, ns;
3776         int rv = SS_UNKNOWN_ERROR;
3777         unsigned int i;
3778
3779         if (mdev->state.conn == C_STANDALONE)
3780                 return;
3781         if (mdev->state.conn >= C_WF_CONNECTION)
3782                 dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
3783                                 drbd_conn_str(mdev->state.conn));
3784
3785         /* asender does not clean up anything. it must not interfere, either */
3786         drbd_thread_stop(&mdev->asender);
3787         drbd_free_sock(mdev);
3788
3789         /* wait for current activity to cease. */
3790         spin_lock_irq(&mdev->req_lock);
3791         _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
3792         _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
3793         _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
3794         spin_unlock_irq(&mdev->req_lock);
3795
3796         /* We do not have data structures that would allow us to
3797          * get the rs_pending_cnt down to 0 again.
3798          *  * On C_SYNC_TARGET we do not have any data structures describing
3799          *    the pending RSDataRequest's we have sent.
3800          *  * On C_SYNC_SOURCE there is no data structure that tracks
3801          *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
3802          *  And no, it is not the sum of the reference counts in the
3803          *  resync_LRU. The resync_LRU tracks the whole operation including
3804          *  the disk-IO, while the rs_pending_cnt only tracks the blocks
3805          *  on the fly. */
3806         drbd_rs_cancel_all(mdev);
3807         mdev->rs_total = 0;
3808         mdev->rs_failed = 0;
3809         atomic_set(&mdev->rs_pending_cnt, 0);
3810         wake_up(&mdev->misc_wait);
3811
3812         /* make sure syncer is stopped and w_resume_next_sg queued */
3813         del_timer_sync(&mdev->resync_timer);
3814         resync_timer_fn((unsigned long)mdev);
3815
3816         /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
3817          * w_make_resync_request etc. which may still be on the worker queue
3818          * to be "canceled" */
3819         drbd_flush_workqueue(mdev);
3820
3821         /* This also does reclaim_net_ee().  If we do this too early, we might
3822          * miss some resync ee and pages.*/
3823         drbd_process_done_ee(mdev);
3824
3825         kfree(mdev->p_uuid);
3826         mdev->p_uuid = NULL;
3827
3828         if (!mdev->state.susp)
3829                 tl_clear(mdev);
3830
3831         dev_info(DEV, "Connection closed\n");
3832
3833         drbd_md_sync(mdev);
3834
3835         fp = FP_DONT_CARE;
3836         if (get_ldev(mdev)) {
3837                 fp = mdev->ldev->dc.fencing;
3838                 put_ldev(mdev);
3839         }
3840
3841         if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
3842                 drbd_try_outdate_peer_async(mdev);
3843
3844         spin_lock_irq(&mdev->req_lock);
3845         os = mdev->state;
3846         if (os.conn >= C_UNCONNECTED) {
3847                 /* Do not restart in case we are C_DISCONNECTING */
3848                 ns = os;
3849                 ns.conn = C_UNCONNECTED;
3850                 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
3851         }
3852         spin_unlock_irq(&mdev->req_lock);
3853
3854         if (os.conn == C_DISCONNECTING) {
3855                 wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
3856
3857                 if (!mdev->state.susp) {
3858                         /* we must not free the tl_hash
3859                          * while application io is still on the fly */
3860                         wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3861                         drbd_free_tl_hash(mdev);
3862                 }
3863
3864                 crypto_free_hash(mdev->cram_hmac_tfm);
3865                 mdev->cram_hmac_tfm = NULL;
3866
3867                 kfree(mdev->net_conf);
3868                 mdev->net_conf = NULL;
3869                 drbd_request_state(mdev, NS(conn, C_STANDALONE));
3870         }
3871
3872         /* tcp_close and release of sendpage pages can be deferred.  I don't
3873          * want to use SO_LINGER, because apparently it can be deferred for
3874          * more than 20 seconds (longest time I checked).
3875          *
3876          * Actually we don't care for exactly when the network stack does its
3877          * put_page(), but release our reference on these pages right here.
3878          */
3879         i = drbd_release_ee(mdev, &mdev->net_ee);
3880         if (i)
3881                 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
3882         i = atomic_read(&mdev->pp_in_use);
3883         if (i)
3884                 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
3885
3886         D_ASSERT(list_empty(&mdev->read_ee));
3887         D_ASSERT(list_empty(&mdev->active_ee));
3888         D_ASSERT(list_empty(&mdev->sync_ee));
3889         D_ASSERT(list_empty(&mdev->done_ee));
3890
3891         /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
3892         atomic_set(&mdev->current_epoch->epoch_size, 0);
3893         D_ASSERT(list_empty(&mdev->current_epoch->list));
3894 }
3895
3896 /*
3897  * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
3898  * we can agree on is stored in agreed_pro_version.
3899  *
3900  * feature flags and the reserved array should be enough room for future
3901  * enhancements of the handshake protocol, and possible plugins...
3902  *
3903  * for now, they are expected to be zero, but ignored.
3904  */
3905 static int drbd_send_handshake(struct drbd_conf *mdev)
3906 {
3907         /* ASSERT current == mdev->receiver ... */
3908         struct p_handshake *p = &mdev->data.sbuf.handshake;
3909         int ok;
3910
3911         if (mutex_lock_interruptible(&mdev->data.mutex)) {
3912                 dev_err(DEV, "interrupted during initial handshake\n");
3913                 return 0; /* interrupted. not ok. */
3914         }
3915
3916         if (mdev->data.socket == NULL) {
3917                 mutex_unlock(&mdev->data.mutex);
3918                 return 0;
3919         }
3920
3921         memset(p, 0, sizeof(*p));
3922         p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3923         p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3924         ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
3925                              (struct p_header80 *)p, sizeof(*p), 0 );
3926         mutex_unlock(&mdev->data.mutex);
3927         return ok;
3928 }
3929
3930 /*
3931  * return values:
3932  *   1 yes, we have a valid connection
3933  *   0 oops, did not work out, please try again
3934  *  -1 peer talks different language,
3935  *     no point in trying again, please go standalone.
3936  */
3937 static int drbd_do_handshake(struct drbd_conf *mdev)
3938 {
3939         /* ASSERT current == mdev->receiver ... */
3940         struct p_handshake *p = &mdev->data.rbuf.handshake;
3941         const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
3942         unsigned int length;
3943         enum drbd_packets cmd;
3944         int rv;
3945
3946         rv = drbd_send_handshake(mdev);
3947         if (!rv)
3948                 return 0;
3949
3950         rv = drbd_recv_header(mdev, &cmd, &length);
3951         if (!rv)
3952                 return 0;
3953
3954         if (cmd != P_HAND_SHAKE) {
3955                 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
3956                      cmdname(cmd), cmd);
3957                 return -1;
3958         }
3959
3960         if (length != expect) {
3961                 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
3962                      expect, length);
3963                 return -1;
3964         }
3965
3966         rv = drbd_recv(mdev, &p->head.payload, expect);
3967
3968         if (rv != expect) {
3969                 dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
3970                 return 0;
3971         }
3972
3973         p->protocol_min = be32_to_cpu(p->protocol_min);
3974         p->protocol_max = be32_to_cpu(p->protocol_max);
3975         if (p->protocol_max == 0)
3976                 p->protocol_max = p->protocol_min;
3977
3978         if (PRO_VERSION_MAX < p->protocol_min ||
3979             PRO_VERSION_MIN > p->protocol_max)
3980                 goto incompat;
3981
3982         mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
3983
3984         dev_info(DEV, "Handshake successful: "
3985              "Agreed network protocol version %d\n", mdev->agreed_pro_version);
3986
3987         return 1;
3988
3989  incompat:
3990         dev_err(DEV, "incompatible DRBD dialects: "
3991             "I support %d-%d, peer supports %d-%d\n",
3992             PRO_VERSION_MIN, PRO_VERSION_MAX,
3993             p->protocol_min, p->protocol_max);
3994         return -1;
3995 }
3996
3997 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
3998 static int drbd_do_auth(struct drbd_conf *mdev)
3999 {
4000         dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
4001         dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
4002         return -1;
4003 }
4004 #else
4005 #define CHALLENGE_LEN 64
4006
4007 /* Return value:
4008         1 - auth succeeded,
4009         0 - failed, try again (network error),
4010         -1 - auth failed, don't try again.
4011 */
4012
4013 static int drbd_do_auth(struct drbd_conf *mdev)
4014 {
4015         char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
4016         struct scatterlist sg;
4017         char *response = NULL;
4018         char *right_response = NULL;
4019         char *peers_ch = NULL;
4020         unsigned int key_len = strlen(mdev->net_conf->shared_secret);
4021         unsigned int resp_size;
4022         struct hash_desc desc;
4023         enum drbd_packets cmd;
4024         unsigned int length;
4025         int rv;
4026
4027         desc.tfm = mdev->cram_hmac_tfm;
4028         desc.flags = 0;
4029
4030         rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
4031                                 (u8 *)mdev->net_conf->shared_secret, key_len);
4032         if (rv) {
4033                 dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
4034                 rv = -1;
4035                 goto fail;
4036         }
4037
4038         get_random_bytes(my_challenge, CHALLENGE_LEN);
4039
4040         rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
4041         if (!rv)
4042                 goto fail;
4043
4044         rv = drbd_recv_header(mdev, &cmd, &length);
4045         if (!rv)
4046                 goto fail;
4047
4048         if (cmd != P_AUTH_CHALLENGE) {
4049                 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
4050                     cmdname(cmd), cmd);
4051                 rv = 0;
4052                 goto fail;
4053         }
4054
4055         if (length > CHALLENGE_LEN * 2) {
4056                 dev_err(DEV, "expected AuthChallenge payload too big.\n");
4057                 rv = -1;
4058                 goto fail;
4059         }
4060
4061         peers_ch = kmalloc(length, GFP_NOIO);
4062         if (peers_ch == NULL) {
4063                 dev_err(DEV, "kmalloc of peers_ch failed\n");
4064                 rv = -1;
4065                 goto fail;
4066         }
4067
4068         rv = drbd_recv(mdev, peers_ch, length);
4069
4070         if (rv != length) {
4071                 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
4072                 rv = 0;
4073                 goto fail;
4074         }
4075
4076         resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
4077         response = kmalloc(resp_size, GFP_NOIO);
4078         if (response == NULL) {
4079                 dev_err(DEV, "kmalloc of response failed\n");
4080                 rv = -1;
4081                 goto fail;
4082         }
4083
4084         sg_init_table(&sg, 1);
4085         sg_set_buf(&sg, peers_ch, length);
4086
4087         rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4088         if (rv) {
4089                 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
4090                 rv = -1;
4091                 goto fail;
4092         }
4093
4094         rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
4095         if (!rv)
4096                 goto fail;
4097
4098         rv = drbd_recv_header(mdev, &cmd, &length);
4099         if (!rv)
4100                 goto fail;
4101
4102         if (cmd != P_AUTH_RESPONSE) {
4103                 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
4104                         cmdname(cmd), cmd);
4105                 rv = 0;
4106                 goto fail;
4107         }
4108
4109         if (length != resp_size) {
4110                 dev_err(DEV, "expected AuthResponse payload of wrong size\n");
4111                 rv = 0;
4112                 goto fail;
4113         }
4114
4115         rv = drbd_recv(mdev, response , resp_size);
4116
4117         if (rv != resp_size) {
4118                 dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
4119                 rv = 0;
4120                 goto fail;
4121         }
4122
4123         right_response = kmalloc(resp_size, GFP_NOIO);
4124         if (right_response == NULL) {
4125                 dev_err(DEV, "kmalloc of right_response failed\n");
4126                 rv = -1;
4127                 goto fail;
4128         }
4129
4130         sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
4131
4132         rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
4133         if (rv) {
4134                 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
4135                 rv = -1;
4136                 goto fail;
4137         }
4138
4139         rv = !memcmp(response, right_response, resp_size);
4140
4141         if (rv)
4142                 dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
4143                      resp_size, mdev->net_conf->cram_hmac_alg);
4144         else
4145                 rv = -1;
4146
4147  fail:
4148         kfree(peers_ch);
4149         kfree(response);
4150         kfree(right_response);
4151
4152         return rv;
4153 }
4154 #endif
4155
4156 int drbdd_init(struct drbd_thread *thi)
4157 {
4158         struct drbd_conf *mdev = thi->mdev;
4159         unsigned int minor = mdev_to_minor(mdev);
4160         int h;
4161
4162         sprintf(current->comm, "drbd%d_receiver", minor);
4163
4164         dev_info(DEV, "receiver (re)started\n");
4165
4166         do {
4167                 h = drbd_connect(mdev);
4168                 if (h == 0) {
4169                         drbd_disconnect(mdev);
4170                         __set_current_state(TASK_INTERRUPTIBLE);
4171                         schedule_timeout(HZ);
4172                 }
4173                 if (h == -1) {
4174                         dev_warn(DEV, "Discarding network configuration.\n");
4175                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4176                 }
4177         } while (h == 0);
4178
4179         if (h > 0) {
4180                 if (get_net_conf(mdev)) {
4181                         drbdd(mdev);
4182                         put_net_conf(mdev);
4183                 }
4184         }
4185
4186         drbd_disconnect(mdev);
4187
4188         dev_info(DEV, "receiver terminated\n");
4189         return 0;
4190 }
4191
4192 /* ********* acknowledge sender ******** */
4193
4194 static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
4195 {
4196         struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4197
4198         int retcode = be32_to_cpu(p->retcode);
4199
4200         if (retcode >= SS_SUCCESS) {
4201                 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4202         } else {
4203                 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4204                 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4205                     drbd_set_st_err_str(retcode), retcode);
4206         }
4207         wake_up(&mdev->state_wait);
4208
4209         return TRUE;
4210 }
4211
4212 static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
4213 {
4214         return drbd_send_ping_ack(mdev);
4215
4216 }
4217
4218 static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
4219 {
4220         /* restore idle timeout */
4221         mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
4222         if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
4223                 wake_up(&mdev->misc_wait);
4224
4225         return TRUE;
4226 }
4227
4228 static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
4229 {
4230         struct p_block_ack *p = (struct p_block_ack *)h;
4231         sector_t sector = be64_to_cpu(p->sector);
4232         int blksize = be32_to_cpu(p->blksize);
4233
4234         D_ASSERT(mdev->agreed_pro_version >= 89);
4235
4236         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4237
4238         drbd_rs_complete_io(mdev, sector);
4239         drbd_set_in_sync(mdev, sector, blksize);
4240         /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4241         mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4242         dec_rs_pending(mdev);
4243         atomic_add(blksize >> 9, &mdev->rs_sect_in);
4244
4245         return TRUE;
4246 }
4247
4248 /* when we receive the ACK for a write request,
4249  * verify that we actually know about it */
4250 static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4251         u64 id, sector_t sector)
4252 {
4253         struct hlist_head *slot = tl_hash_slot(mdev, sector);
4254         struct hlist_node *n;
4255         struct drbd_request *req;
4256
4257         hlist_for_each_entry(req, n, slot, colision) {
4258                 if ((unsigned long)req == (unsigned long)id) {
4259                         if (req->sector != sector) {
4260                                 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
4261                                     "wrong sector (%llus versus %llus)\n", req,
4262                                     (unsigned long long)req->sector,
4263                                     (unsigned long long)sector);
4264                                 break;
4265                         }
4266                         return req;
4267                 }
4268         }
4269         dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
4270                 (void *)(unsigned long)id, (unsigned long long)sector);
4271         return NULL;
4272 }
4273
4274 typedef struct drbd_request *(req_validator_fn)
4275         (struct drbd_conf *mdev, u64 id, sector_t sector);
4276
4277 static int validate_req_change_req_state(struct drbd_conf *mdev,
4278         u64 id, sector_t sector, req_validator_fn validator,
4279         const char *func, enum drbd_req_event what)
4280 {
4281         struct drbd_request *req;
4282         struct bio_and_error m;
4283
4284         spin_lock_irq(&mdev->req_lock);
4285         req = validator(mdev, id, sector);
4286         if (unlikely(!req)) {
4287                 spin_unlock_irq(&mdev->req_lock);
4288                 dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
4289                 return FALSE;
4290         }
4291         __req_mod(req, what, &m);
4292         spin_unlock_irq(&mdev->req_lock);
4293
4294         if (m.bio)
4295                 complete_master_bio(mdev, &m);
4296         return TRUE;
4297 }
4298
4299 static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
4300 {
4301         struct p_block_ack *p = (struct p_block_ack *)h;
4302         sector_t sector = be64_to_cpu(p->sector);
4303         int blksize = be32_to_cpu(p->blksize);
4304         enum drbd_req_event what;
4305
4306         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4307
4308         if (is_syncer_block_id(p->block_id)) {
4309                 drbd_set_in_sync(mdev, sector, blksize);
4310                 dec_rs_pending(mdev);
4311                 return TRUE;
4312         }
4313         switch (be16_to_cpu(h->command)) {
4314         case P_RS_WRITE_ACK:
4315                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4316                 what = write_acked_by_peer_and_sis;
4317                 break;
4318         case P_WRITE_ACK:
4319                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4320                 what = write_acked_by_peer;
4321                 break;
4322         case P_RECV_ACK:
4323                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
4324                 what = recv_acked_by_peer;
4325                 break;
4326         case P_DISCARD_ACK:
4327                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4328                 what = conflict_discarded_by_peer;
4329                 break;
4330         default:
4331                 D_ASSERT(0);
4332                 return FALSE;
4333         }
4334
4335         return validate_req_change_req_state(mdev, p->block_id, sector,
4336                 _ack_id_to_req, __func__ , what);
4337 }
4338
4339 static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
4340 {
4341         struct p_block_ack *p = (struct p_block_ack *)h;
4342         sector_t sector = be64_to_cpu(p->sector);
4343
4344         if (__ratelimit(&drbd_ratelimit_state))
4345                 dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
4346
4347         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4348
4349         if (is_syncer_block_id(p->block_id)) {
4350                 int size = be32_to_cpu(p->blksize);
4351                 dec_rs_pending(mdev);
4352                 drbd_rs_failed_io(mdev, sector, size);
4353                 return TRUE;
4354         }
4355         return validate_req_change_req_state(mdev, p->block_id, sector,
4356                 _ack_id_to_req, __func__ , neg_acked);
4357 }
4358
4359 static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
4360 {
4361         struct p_block_ack *p = (struct p_block_ack *)h;
4362         sector_t sector = be64_to_cpu(p->sector);
4363
4364         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4365         dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
4366             (unsigned long long)sector, be32_to_cpu(p->blksize));
4367
4368         return validate_req_change_req_state(mdev, p->block_id, sector,
4369                 _ar_id_to_req, __func__ , neg_acked);
4370 }
4371
4372 static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
4373 {
4374         sector_t sector;
4375         int size;
4376         struct p_block_ack *p = (struct p_block_ack *)h;
4377
4378         sector = be64_to_cpu(p->sector);
4379         size = be32_to_cpu(p->blksize);
4380
4381         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4382
4383         dec_rs_pending(mdev);
4384
4385         if (get_ldev_if_state(mdev, D_FAILED)) {
4386                 drbd_rs_complete_io(mdev, sector);
4387                 drbd_rs_failed_io(mdev, sector, size);
4388                 put_ldev(mdev);
4389         }
4390
4391         return TRUE;
4392 }
4393
4394 static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
4395 {
4396         struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4397
4398         tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4399
4400         return TRUE;
4401 }
4402
4403 static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
4404 {
4405         struct p_block_ack *p = (struct p_block_ack *)h;
4406         struct drbd_work *w;
4407         sector_t sector;
4408         int size;
4409
4410         sector = be64_to_cpu(p->sector);
4411         size = be32_to_cpu(p->blksize);
4412
4413         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4414
4415         if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
4416                 drbd_ov_oos_found(mdev, sector, size);
4417         else
4418                 ov_oos_print(mdev);
4419
4420         drbd_rs_complete_io(mdev, sector);
4421         dec_rs_pending(mdev);
4422
4423         if (--mdev->ov_left == 0) {
4424                 w = kmalloc(sizeof(*w), GFP_NOIO);
4425                 if (w) {
4426                         w->cb = w_ov_finished;
4427                         drbd_queue_work_front(&mdev->data.work, w);
4428                 } else {
4429                         dev_err(DEV, "kmalloc(w) failed.");
4430                         ov_oos_print(mdev);
4431                         drbd_resync_finished(mdev);
4432                 }
4433         }
4434         return TRUE;
4435 }
4436
4437 static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
4438 {
4439         return TRUE;
4440 }
4441
4442 struct asender_cmd {
4443         size_t pkt_size;
4444         int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
4445 };
4446
4447 static struct asender_cmd *get_asender_cmd(int cmd)
4448 {
4449         static struct asender_cmd asender_tbl[] = {
4450                 /* anything missing from this table is in
4451                  * the drbd_cmd_handler (drbd_default_handler) table,
4452                  * see the beginning of drbdd() */
4453         [P_PING]            = { sizeof(struct p_header80), got_Ping },
4454         [P_PING_ACK]        = { sizeof(struct p_header80), got_PingAck },
4455         [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
4456         [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
4457         [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
4458         [P_DISCARD_ACK]     = { sizeof(struct p_block_ack), got_BlockAck },
4459         [P_NEG_ACK]         = { sizeof(struct p_block_ack), got_NegAck },
4460         [P_NEG_DREPLY]      = { sizeof(struct p_block_ack), got_NegDReply },
4461         [P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply},
4462         [P_OV_RESULT]       = { sizeof(struct p_block_ack), got_OVResult },
4463         [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
4464         [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4465         [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
4466         [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
4467         [P_MAX_CMD]         = { 0, NULL },
4468         };
4469         if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
4470                 return NULL;
4471         return &asender_tbl[cmd];
4472 }
4473
4474 int drbd_asender(struct drbd_thread *thi)
4475 {
4476         struct drbd_conf *mdev = thi->mdev;
4477         struct p_header80 *h = &mdev->meta.rbuf.header.h80;
4478         struct asender_cmd *cmd = NULL;
4479
4480         int rv, len;
4481         void *buf    = h;
4482         int received = 0;
4483         int expect   = sizeof(struct p_header80);
4484         int empty;
4485
4486         sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
4487
4488         current->policy = SCHED_RR;  /* Make this a realtime task! */
4489         current->rt_priority = 2;    /* more important than all other tasks */
4490
4491         while (get_t_state(thi) == Running) {
4492                 drbd_thread_current_set_cpu(mdev);
4493                 if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
4494                         ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
4495                         mdev->meta.socket->sk->sk_rcvtimeo =
4496                                 mdev->net_conf->ping_timeo*HZ/10;
4497                 }
4498
4499                 /* conditionally cork;
4500                  * it may hurt latency if we cork without much to send */
4501                 if (!mdev->net_conf->no_cork &&
4502                         3 < atomic_read(&mdev->unacked_cnt))
4503                         drbd_tcp_cork(mdev->meta.socket);
4504                 while (1) {
4505                         clear_bit(SIGNAL_ASENDER, &mdev->flags);
4506                         flush_signals(current);
4507                         if (!drbd_process_done_ee(mdev)) {
4508                                 dev_err(DEV, "process_done_ee() = NOT_OK\n");
4509                                 goto reconnect;
4510                         }
4511                         /* to avoid race with newly queued ACKs */
4512                         set_bit(SIGNAL_ASENDER, &mdev->flags);
4513                         spin_lock_irq(&mdev->req_lock);
4514                         empty = list_empty(&mdev->done_ee);
4515                         spin_unlock_irq(&mdev->req_lock);
4516                         /* new ack may have been queued right here,
4517                          * but then there is also a signal pending,
4518                          * and we start over... */
4519                         if (empty)
4520                                 break;
4521                 }
4522                 /* but unconditionally uncork unless disabled */
4523                 if (!mdev->net_conf->no_cork)
4524                         drbd_tcp_uncork(mdev->meta.socket);
4525
4526                 /* short circuit, recv_msg would return EINTR anyways. */
4527                 if (signal_pending(current))
4528                         continue;
4529
4530                 rv = drbd_recv_short(mdev, mdev->meta.socket,
4531                                      buf, expect-received, 0);
4532                 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4533
4534                 flush_signals(current);
4535
4536                 /* Note:
4537                  * -EINTR        (on meta) we got a signal
4538                  * -EAGAIN       (on meta) rcvtimeo expired
4539                  * -ECONNRESET   other side closed the connection
4540                  * -ERESTARTSYS  (on data) we got a signal
4541                  * rv <  0       other than above: unexpected error!
4542                  * rv == expected: full header or command
4543                  * rv <  expected: "woken" by signal during receive
4544                  * rv == 0       : "connection shut down by peer"
4545                  */
4546                 if (likely(rv > 0)) {
4547                         received += rv;
4548                         buf      += rv;
4549                 } else if (rv == 0) {
4550                         dev_err(DEV, "meta connection shut down by peer.\n");
4551                         goto reconnect;
4552                 } else if (rv == -EAGAIN) {
4553                         if (mdev->meta.socket->sk->sk_rcvtimeo ==
4554                             mdev->net_conf->ping_timeo*HZ/10) {
4555                                 dev_err(DEV, "PingAck did not arrive in time.\n");
4556                                 goto reconnect;
4557                         }
4558                         set_bit(SEND_PING, &mdev->flags);
4559                         continue;
4560                 } else if (rv == -EINTR) {
4561                         continue;
4562                 } else {
4563                         dev_err(DEV, "sock_recvmsg returned %d\n", rv);
4564                         goto reconnect;
4565                 }
4566
4567                 if (received == expect && cmd == NULL) {
4568                         if (unlikely(h->magic != BE_DRBD_MAGIC)) {
4569                                 dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
4570                                     (long)be32_to_cpu(h->magic),
4571                                     h->command, h->length);
4572                                 goto reconnect;
4573                         }
4574                         cmd = get_asender_cmd(be16_to_cpu(h->command));
4575                         len = be16_to_cpu(h->length);
4576                         if (unlikely(cmd == NULL)) {
4577                                 dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
4578                                     (long)be32_to_cpu(h->magic),
4579                                     h->command, h->length);
4580                                 goto disconnect;
4581                         }
4582                         expect = cmd->pkt_size;
4583                         ERR_IF(len != expect-sizeof(struct p_header80))
4584                                 goto reconnect;
4585                 }
4586                 if (received == expect) {
4587                         D_ASSERT(cmd != NULL);
4588                         if (!cmd->process(mdev, h))
4589                                 goto reconnect;
4590
4591                         buf      = h;
4592                         received = 0;
4593                         expect   = sizeof(struct p_header80);
4594                         cmd      = NULL;
4595                 }
4596         }
4597
4598         if (0) {
4599 reconnect:
4600                 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
4601         }
4602         if (0) {
4603 disconnect:
4604                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4605         }
4606         clear_bit(SIGNAL_ASENDER, &mdev->flags);
4607
4608         D_ASSERT(mdev->state.conn < C_CONNECTED);
4609         dev_info(DEV, "asender terminated\n");
4610
4611         return 0;
4612 }