/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_md_sync(struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
int disable_sendpage;
int allow_oos;
int proc_details;       /* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct idr minors;
struct list_head drbd_tconns;  /* list of struct drbd_tconn */
DEFINE_MUTEX(drbd_cfg_mutex);

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* peer requests */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;
mempool_t *drbd_md_io_page_pool;
struct bio_set *drbd_md_io_bio_set;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t drbd_pp_lock;
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

static void bio_destructor_drbd(struct bio *bio)
{
	bio_free(bio, drbd_md_io_bio_set);
}

struct bio *bio_alloc_drbd(gfp_t gfp_mask)
{
	struct bio *bio;

	if (!drbd_md_io_bio_set)
		return bio_alloc(gfp_mask, 1);

	bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
	if (!bio)
		return NULL;
	bio->bi_destructor = bio_destructor_drbd;
	return bio;
}
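
/*
 * Note on the fallback above: if the dedicated md_io bio_set has not been
 * created (yet), bio_alloc_drbd() degrades to a plain bio_alloc() without
 * the private destructor.
 */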

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
 * mdev->tconn->newest_tle points to the head, mdev->tconn->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular double linked list of requests
 * attached.
 */
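
/*
 * For illustration, a transfer log with three epochs looks like this
 * (each box is a struct drbd_tl_epoch carrying its own request list):
 *
 *   tconn->oldest_tle -> [epoch n] -> [epoch n+1] -> [epoch n+2] <- tconn->newest_tle
 *
 * New requests are attached to the epoch at newest_tle; tl_release() retires
 * the epoch at oldest_tle once the peer's barrier ack for it arrives.
 */
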
static int tl_init(struct drbd_tconn *tconn)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	tconn->oldest_tle = b;
	tconn->newest_tle = b;
	INIT_LIST_HEAD(&tconn->out_of_sequence_requests);

	return 1;
}

static void tl_cleanup(struct drbd_tconn *tconn)
{
	if (tconn->oldest_tle != tconn->newest_tle)
		conn_err(tconn, "ASSERT FAILED: oldest_tle == newest_tle\n");
	if (!list_empty(&tconn->out_of_sequence_requests))
		conn_err(tconn, "ASSERT FAILED: list_empty(out_of_sequence_requests)\n");
	kfree(tconn->oldest_tle);
	tconn->oldest_tle = NULL;
	kfree(tconn->unused_spare_tle);
	tconn->unused_spare_tle = NULL;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @tconn:	DRBD connection.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_tconn *tconn, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = tconn->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (tconn->newest_tle != new) {
		tconn->newest_tle->next = new;
		tconn->newest_tle = new;
	}
}
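
/*
 * Worked example of the "?: 1" above: barrier numbers are plain 32 bit
 * counters, so once newest_before->br_number is 0xffffffff, adding one
 * wraps to 0 and the ?: turns that into 1 -- a barrier number of 0 is
 * never put on the wire.
 */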

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @tconn:	DRBD connection.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_conf *mdev;
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&tconn->req_lock);

	b = tconn->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			 barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
			 barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			 barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, BARRIER_ACKED);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the lists head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(CONNECTION_LOST_WHILE_PENDING).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, BARRIER_ACKED) above.
	 */
	list_del_init(&b->requests);
	mdev = b->w.mdev;

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(tconn, b);
		if (nob)
			tconn->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore tconn->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		tconn->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&tconn->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&tconn->req_lock);
	conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
}


/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @tconn:	DRBD connection.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
 * RESTART_FROZEN_DISK_IO.
 */
void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = tconn->oldest_tle;
	pn = &tconn->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == RESEND) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(b->w.mdev);
					set_bit(CREATE_BARRIER, &b->w.mdev->flags);
				}

				drbd_queue_work(&tconn->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(b->w.mdev);

			if (b == tconn->newest_tle) {
				/* recycle, but reinit! */
				if (tmp != NULL)
					conn_err(tconn, "ASSERT FAILED tmp == NULL");
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @tconn:	DRBD connection.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	struct list_head *le, *tle;
	struct drbd_request *r;
	int vnr;

	spin_lock_irq(&tconn->req_lock);

	_tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);

	/* we expect this list to be empty. */
	if (!list_empty(&tconn->out_of_sequence_requests))
		conn_err(tconn, "ASSERT FAILED list_empty(&out_of_sequence_requests)\n");

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &tconn->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, CONNECTION_LOST_WHILE_PENDING);
	}

	/* ensure bit indicating barrier is required is clear */
	idr_for_each_entry(&tconn->volumes, mdev, vnr)
		clear_bit(CREATE_BARRIER, &mdev->flags);

	spin_unlock_irq(&tconn->req_lock);
}

void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
{
	spin_lock_irq(&tconn->req_lock);
	_tl_restart(tconn, what);
	spin_unlock_irq(&tconn->req_lock);
}

static int drbd_thread_setup(void *arg)
{
	struct drbd_thread *thi = (struct drbd_thread *) arg;
	struct drbd_tconn *tconn = thi->tconn;
	unsigned long flags;
	int retval;

	snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
		 thi->name[0], thi->tconn->name);

restart:
	retval = thi->function(thi);

	spin_lock_irqsave(&thi->t_lock, flags);

	/* if the receiver has been "EXITING", the last thing it did
	 * was set the conn state to "StandAlone",
	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
	 * and receiver thread will be "started".
	 * drbd_thread_start needs to set "RESTARTING" in that case.
	 * t_state check and assignment needs to be within the same spinlock,
	 * so either thread_start sees EXITING, and can remap to RESTARTING,
	 * or thread_start sees NONE, and can proceed as normal.
	 */

	if (thi->t_state == RESTARTING) {
		conn_info(tconn, "Restarting %s thread\n", thi->name);
		thi->t_state = RUNNING;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		goto restart;
	}

	thi->task = NULL;
	thi->t_state = NONE;
	smp_mb();
	complete(&thi->stop);
	spin_unlock_irqrestore(&thi->t_lock, flags);

	conn_info(tconn, "Terminating %s\n", current->comm);

	/* Release mod reference taken when thread was started */
	module_put(THIS_MODULE);
	return retval;
}

static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi,
			     int (*func) (struct drbd_thread *), char *name)
{
	spin_lock_init(&thi->t_lock);
	thi->task    = NULL;
	thi->t_state = NONE;
	thi->function = func;
	thi->tconn = tconn;
	strncpy(thi->name, name, ARRAY_SIZE(thi->name));
}

int drbd_thread_start(struct drbd_thread *thi)
{
	struct drbd_tconn *tconn = thi->tconn;
	struct task_struct *nt;
	unsigned long flags;

	/* is used from state engine doing drbd_thread_stop_nowait,
	 * while holding the req lock irqsave */
	spin_lock_irqsave(&thi->t_lock, flags);

	switch (thi->t_state) {
	case NONE:
		conn_info(tconn, "Starting %s thread (from %s [%d])\n",
			  thi->name, current->comm, current->pid);

		/* Get ref on module for thread - this is released when thread exits */
		if (!try_module_get(THIS_MODULE)) {
			conn_err(tconn, "Failed to get module reference in drbd_thread_start\n");
			spin_unlock_irqrestore(&thi->t_lock, flags);
			return false;
		}

		init_completion(&thi->stop);
		thi->reset_cpu_mask = 1;
		thi->t_state = RUNNING;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */

		nt = kthread_create(drbd_thread_setup, (void *) thi,
				    "drbd_%c_%s", thi->name[0], thi->tconn->name);

		if (IS_ERR(nt)) {
			conn_err(tconn, "Couldn't start thread\n");

			module_put(THIS_MODULE);
			return false;
		}
		spin_lock_irqsave(&thi->t_lock, flags);
		thi->task = nt;
		thi->t_state = RUNNING;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		wake_up_process(nt);
		break;
	case EXITING:
		thi->t_state = RESTARTING;
		conn_info(tconn, "Restarting %s thread (from %s [%d])\n",
			  thi->name, current->comm, current->pid);
		/* fall through */
	case RUNNING:
	case RESTARTING:
	default:
		spin_unlock_irqrestore(&thi->t_lock, flags);
		break;
	}

	return true;
}


void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
{
	unsigned long flags;

	enum drbd_thread_state ns = restart ? RESTARTING : EXITING;

	/* may be called from state engine, holding the req lock irqsave */
	spin_lock_irqsave(&thi->t_lock, flags);

	if (thi->t_state == NONE) {
		spin_unlock_irqrestore(&thi->t_lock, flags);
		if (restart)
			drbd_thread_start(thi);
		return;
	}

	if (thi->t_state != ns) {
		if (thi->task == NULL) {
			spin_unlock_irqrestore(&thi->t_lock, flags);
			return;
		}

		thi->t_state = ns;
		smp_mb();
		init_completion(&thi->stop);
		if (thi->task != current)
			force_sig(DRBD_SIGKILL, thi->task);
	}

	spin_unlock_irqrestore(&thi->t_lock, flags);

	if (wait)
		wait_for_completion(&thi->stop);
}

static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task)
{
	struct drbd_thread *thi =
		task == tconn->receiver.task ? &tconn->receiver :
		task == tconn->asender.task  ? &tconn->asender :
		task == tconn->worker.task   ? &tconn->worker : NULL;

	return thi;
}

char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task)
{
	struct drbd_thread *thi = drbd_task_to_thread(tconn, task);
	return thi ? thi->name : task->comm;
}

int conn_lowest_minor(struct drbd_tconn *tconn)
{
	int vnr = 0;
	struct drbd_conf *mdev;

	mdev = idr_get_next(&tconn->volumes, &vnr);
	if (!mdev)
		return -1;
	return mdev_to_minor(mdev);
}

#ifdef CONFIG_SMP
/**
 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
 * @tconn:	DRBD connection.
 *
 * Forces all threads of a device onto the same CPU. This is beneficial for
 * DRBD's performance. May be overwritten by user's configuration.
 */
void drbd_calc_cpu_mask(struct drbd_tconn *tconn)
{
	int ord, cpu;

	/* user override. */
	if (cpumask_weight(tconn->cpu_mask))
		return;

	ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask);
	for_each_online_cpu(cpu) {
		if (ord-- == 0) {
			cpumask_set_cpu(cpu, tconn->cpu_mask);
			return;
		}
	}
	/* should not be reached */
	cpumask_setall(tconn->cpu_mask);
}
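
/*
 * Example of the spread above: on a box with 4 CPUs online, a connection
 * whose lowest minor is 5 gets its threads pinned to CPU 5 % 4 == 1,
 * unless the user already configured an explicit cpu-mask.
 */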

/**
 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
 * @thi:	drbd_thread object
 *
 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
 * prematurely.
 */
void drbd_thread_current_set_cpu(struct drbd_thread *thi)
{
	struct task_struct *p = current;

	if (!thi->reset_cpu_mask)
		return;
	thi->reset_cpu_mask = 0;
	set_cpus_allowed_ptr(p, thi->tconn->cpu_mask);
}
#endif

/**
 * drbd_header_size  -  size of a packet header
 *
 * The header size is a multiple of 8, so any payload following the header is
 * word aligned on 64-bit architectures.  (The bitmap send and receive code
 * relies on this.)
 */
unsigned int drbd_header_size(struct drbd_tconn *tconn)
{
	BUILD_BUG_ON(sizeof(struct p_header80) != sizeof(struct p_header95));
	BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
	return sizeof(struct p_header80);
}
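
/*
 * For illustration (as set up by prepare_header80()/prepare_header95()
 * below): the old-style header carries a be32 magic, be16 command and be16
 * length, the new-style header a be16 magic, be16 command and be32 length.
 * Both come out at 8 bytes, which is what the BUILD_BUG_ON()s above assert.
 */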

static void prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
{
	h->magic   = cpu_to_be32(DRBD_MAGIC);
	h->command = cpu_to_be16(cmd);
	h->length  = cpu_to_be16(size);
}

static void prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
{
	h->magic   = cpu_to_be16(DRBD_MAGIC_BIG);
	h->command = cpu_to_be16(cmd);
	h->length  = cpu_to_be32(size);
}

static void _prepare_header(struct drbd_tconn *tconn, int vnr, struct p_header *h,
			    enum drbd_packet cmd, int size)
{
	if (tconn->agreed_pro_version >= 95)
		prepare_header95(&h->h95, cmd, size);
	else
		prepare_header80(&h->h80, cmd, size);
}

static void prepare_header(struct drbd_conf *mdev, struct p_header *h,
			   enum drbd_packet cmd, int size)
{
	_prepare_header(mdev->tconn, mdev->vnr, h, cmd, size);
}

/* the appropriate socket mutex must be held already */
int _conn_send_cmd(struct drbd_tconn *tconn, int vnr, struct drbd_socket *sock,
		   enum drbd_packet cmd, struct p_header *h, size_t size,
		   unsigned msg_flags)
{
	int err;

	_prepare_header(tconn, vnr, h, cmd, size - sizeof(struct p_header));
	err = drbd_send_all(tconn, sock->socket, h, size, msg_flags);
	if (err && !signal_pending(current))
		conn_warn(tconn, "short send %s size=%d\n",
			  cmdname(cmd), (int)size);
	return err;
}

/* don't pass the socket. we may only look at it
 * when we hold the appropriate socket mutex.
 */
int conn_send_cmd(struct drbd_tconn *tconn, int vnr, struct drbd_socket *sock,
		  enum drbd_packet cmd, struct p_header *h, size_t size)
{
	int err = -EIO;

	mutex_lock(&sock->mutex);
	if (sock->socket)
		err = _conn_send_cmd(tconn, vnr, sock, cmd, h, size, 0);
	mutex_unlock(&sock->mutex);
	return err;
}

int conn_send_cmd2(struct drbd_tconn *tconn, enum drbd_packet cmd, char *data,
		   size_t size)
{
	struct p_header80 h;
	int err;

	prepare_header80(&h, cmd, size);
	err = drbd_get_data_sock(tconn);
	if (!err) {
		err = drbd_send_all(tconn, tconn->data.socket, &h, sizeof(h), 0);
		if (!err)
			err = drbd_send_all(tconn, tconn->data.socket, data, size, 0);
		drbd_put_data_sock(tconn);
	}
	return err;
}

void *conn_prepare_command(struct drbd_tconn *tconn, struct drbd_socket *sock)
{
	mutex_lock(&sock->mutex);
	if (!sock->socket) {
		mutex_unlock(&sock->mutex);
		return NULL;
	}
	return sock->sbuf;
}

void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock)
{
	return conn_prepare_command(mdev->tconn, sock);
}

static int __send_command(struct drbd_tconn *tconn, int vnr,
			  struct drbd_socket *sock, enum drbd_packet cmd,
			  unsigned int header_size, void *data,
			  unsigned int size)
{
	int msg_flags;
	int err;

	/*
	 * Called with @data == NULL and the size of the data blocks in @size
	 * for commands that send data blocks.  For those commands, omit the
	 * MSG_MORE flag: this will increase the likelihood that data blocks
	 * which are page aligned on the sender will end up page aligned on the
	 * receiver.
	 */
	msg_flags = data ? MSG_MORE : 0;

	_prepare_header(tconn, vnr, sock->sbuf, cmd,
			header_size - sizeof(struct p_header) + size);
	err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size,
			    msg_flags);
	if (data && !err)
		err = drbd_send_all(tconn, sock->socket, data, size, 0);
	return err;
}

int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock,
		      enum drbd_packet cmd, unsigned int header_size,
		      void *data, unsigned int size)
{
	int err;

	err = __send_command(tconn, 0, sock, cmd, header_size, data, size);
	mutex_unlock(&sock->mutex);
	return err;
}

int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock,
		      enum drbd_packet cmd, unsigned int header_size,
		      void *data, unsigned int size)
{
	int err;

	err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size,
			     data, size);
	mutex_unlock(&sock->mutex);
	return err;
}
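
/*
 * Rough usage sketch for the prepare_command()/send_command() helpers above
 * (illustrative only): conn_prepare_command() takes sock->mutex and hands
 * out the send buffer, the caller fills in the payload behind the header,
 * and conn_send_command() writes the header, transmits the packet and drops
 * the mutex again:
 *
 *	void *buf = conn_prepare_command(tconn, sock);
 *	if (!buf)
 *		return -EIO;
 *	... fill payload in buf ...
 *	err = conn_send_command(tconn, sock, cmd, header_size, NULL, 0);
 *
 * Every successful prepare must therefore be paired with exactly one send,
 * since that is what releases the socket mutex.
 */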

int drbd_send_ping(struct drbd_tconn *tconn)
{
	struct p_header h;
	return conn_send_cmd(tconn, 0, &tconn->meta, P_PING, &h, sizeof(h));
}

int drbd_send_ping_ack(struct drbd_tconn *tconn)
{
	struct p_header h;
	return conn_send_cmd(tconn, 0, &tconn->meta, P_PING_ACK, &h, sizeof(h));
}

int drbd_send_sync_param(struct drbd_conf *mdev)
{
	struct p_rs_param_95 *p;
	struct drbd_socket *sock;
	int size, err;
	const int apv = mdev->tconn->agreed_pro_version;

	size = apv <= 87 ? sizeof(struct p_rs_param)
		: apv == 88 ? sizeof(struct p_rs_param)
			+ strlen(mdev->tconn->net_conf->verify_alg) + 1
		: apv <= 94 ? sizeof(struct p_rs_param_89)
		: /* apv >= 95 */ sizeof(struct p_rs_param_95);

	mutex_lock(&mdev->tconn->data.mutex);
	sock = &mdev->tconn->data;

	if (likely(sock->socket != NULL)) {
		enum drbd_packet cmd =
			apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;

		p = mdev->tconn->data.sbuf;

		/* initialize verify_alg and csums_alg */
		memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);

		if (get_ldev(mdev)) {
			p->rate = cpu_to_be32(mdev->ldev->dc.resync_rate);
			p->c_plan_ahead = cpu_to_be32(mdev->ldev->dc.c_plan_ahead);
			p->c_delay_target = cpu_to_be32(mdev->ldev->dc.c_delay_target);
			p->c_fill_target = cpu_to_be32(mdev->ldev->dc.c_fill_target);
			p->c_max_rate = cpu_to_be32(mdev->ldev->dc.c_max_rate);
			put_ldev(mdev);
		} else {
			p->rate = cpu_to_be32(DRBD_RATE_DEF);
			p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
			p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
			p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
			p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
		}

		if (apv >= 88)
			strcpy(p->verify_alg, mdev->tconn->net_conf->verify_alg);
		if (apv >= 89)
			strcpy(p->csums_alg, mdev->tconn->net_conf->csums_alg);

		err = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
	} else
		err = -EIO;

	mutex_unlock(&mdev->tconn->data.mutex);

	return err;
}

int drbd_send_protocol(struct drbd_tconn *tconn)
{
	struct p_protocol *p;
	int size, cf, err;

	size = sizeof(struct p_protocol);

	if (tconn->agreed_pro_version >= 87)
		size += strlen(tconn->net_conf->integrity_alg) + 1;

	/* we must not recurse into our own queue,
	 * as that is blocked during handshake */
	p = kmalloc(size, GFP_NOIO);
	if (p == NULL)
		return -ENOMEM;

	p->protocol      = cpu_to_be32(tconn->net_conf->wire_protocol);
	p->after_sb_0p   = cpu_to_be32(tconn->net_conf->after_sb_0p);
	p->after_sb_1p   = cpu_to_be32(tconn->net_conf->after_sb_1p);
	p->after_sb_2p   = cpu_to_be32(tconn->net_conf->after_sb_2p);
	p->two_primaries = cpu_to_be32(tconn->net_conf->two_primaries);

	cf = 0;
	if (tconn->net_conf->want_lose)
		cf |= CF_WANT_LOSE;
	if (tconn->net_conf->dry_run) {
		if (tconn->agreed_pro_version >= 92)
			cf |= CF_DRY_RUN;
		else {
			conn_err(tconn, "--dry-run is not supported by peer");
			kfree(p);
			return -EOPNOTSUPP;
		}
	}
	p->conn_flags    = cpu_to_be32(cf);

	if (tconn->agreed_pro_version >= 87)
		strcpy(p->integrity_alg, tconn->net_conf->integrity_alg);

	err = conn_send_cmd2(tconn, P_PROTOCOL, p->head.payload, size - sizeof(struct p_header));
	kfree(p);
	return err;
}

int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
{
	struct p_uuids p;
	int i;

	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
		return 0;

	for (i = UI_CURRENT; i < UI_SIZE; i++)
		p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;

	mdev->comm_bm_set = drbd_bm_total_weight(mdev);
	p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
	uuid_flags |= mdev->tconn->net_conf->want_lose ? 1 : 0;
	uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
	uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
	p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);

	put_ldev(mdev);

	return drbd_send_cmd(mdev, &mdev->tconn->data, P_UUIDS, &p.head, sizeof(p));
}

int drbd_send_uuids(struct drbd_conf *mdev)
{
	return _drbd_send_uuids(mdev, 0);
}

int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
{
	return _drbd_send_uuids(mdev, 8);
}

void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
{
	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
		u64 *uuid = mdev->ldev->md.uuid;
		dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
			 text,
			 (unsigned long long)uuid[UI_CURRENT],
			 (unsigned long long)uuid[UI_BITMAP],
			 (unsigned long long)uuid[UI_HISTORY_START],
			 (unsigned long long)uuid[UI_HISTORY_END]);
		put_ldev(mdev);
	} else {
		dev_info(DEV, "%s effective data uuid: %016llX\n",
			 text,
			 (unsigned long long)mdev->ed_uuid);
	}
}

void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
{
	struct p_rs_uuid p;
	u64 uuid;

	D_ASSERT(mdev->state.disk == D_UP_TO_DATE);

	uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
	drbd_uuid_set(mdev, UI_BITMAP, uuid);
	drbd_print_uuids(mdev, "updated sync UUID");
	drbd_md_sync(mdev);
	p.uuid = cpu_to_be64(uuid);

	drbd_send_cmd(mdev, &mdev->tconn->data, P_SYNC_UUID, &p.head, sizeof(p));
}

int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
{
	struct p_sizes p;
	sector_t d_size, u_size;
	int q_order_type, max_bio_size;

	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
		D_ASSERT(mdev->ldev->backing_bdev);
		d_size = drbd_get_max_capacity(mdev->ldev);
		u_size = mdev->ldev->dc.disk_size;
		q_order_type = drbd_queue_order_type(mdev);
		max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
		max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
		put_ldev(mdev);
	} else {
		d_size = 0;
		u_size = 0;
		q_order_type = QUEUE_ORDERED_NONE;
		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
	}

	p.d_size = cpu_to_be64(d_size);
	p.u_size = cpu_to_be64(u_size);
	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
	p.max_bio_size = cpu_to_be32(max_bio_size);
	p.queue_order_type = cpu_to_be16(q_order_type);
	p.dds_flags = cpu_to_be16(flags);

	return drbd_send_cmd(mdev, &mdev->tconn->data, P_SIZES, &p.head, sizeof(p));
}

/**
 * drbd_send_state() - Sends the drbd state to the peer
 * @mdev:	DRBD device.
 */
int drbd_send_state(struct drbd_conf *mdev)
{
	struct drbd_socket *sock;
	struct p_state p;
	int err = -EIO;

	mutex_lock(&mdev->tconn->data.mutex);

	p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
	sock = &mdev->tconn->data;

	if (likely(sock->socket != NULL))
		err = _drbd_send_cmd(mdev, sock, P_STATE, &p.head, sizeof(p), 0);

	mutex_unlock(&mdev->tconn->data.mutex);

	return err;
}

int _conn_send_state_req(struct drbd_tconn *tconn, int vnr, enum drbd_packet cmd,
			 union drbd_state mask, union drbd_state val)
{
	struct p_req_state p;

	p.mask = cpu_to_be32(mask.i);
	p.val  = cpu_to_be32(val.i);

	return conn_send_cmd(tconn, vnr, &tconn->data, cmd, &p.head, sizeof(p));
}

void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
{
	struct p_req_state_reply p;

	p.retcode = cpu_to_be32(retcode);

	drbd_send_cmd(mdev, &mdev->tconn->meta, P_STATE_CHG_REPLY, &p.head, sizeof(p));
}

int conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode)
{
	struct p_req_state_reply p;
	enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;

	p.retcode = cpu_to_be32(retcode);

	return !conn_send_cmd(tconn, 0, &tconn->meta, cmd, &p.head, sizeof(p));
}

static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
{
	BUG_ON(code & ~0xf);
	p->encoding = (p->encoding & ~0xf) | code;
}

static void dcbp_set_start(struct p_compressed_bm *p, int set)
{
	p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
}

static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
{
	BUG_ON(n & ~0x7);
	p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
}
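
/*
 * Resulting layout of p_compressed_bm->encoding, as built by the three
 * helpers above:
 *
 *   bit  7   : value of the first run (1 = bits set, 0 = bits clear)
 *   bits 6..4: number of pad bits at the end of the bit stream
 *   bits 3..0: enum drbd_bitmap_code (the encoding used for the payload)
 */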

int fill_bitmap_rle_bits(struct drbd_conf *mdev,
			 struct p_compressed_bm *p,
			 struct bm_xfer_ctx *c)
{
	struct bitstream bs;
	unsigned long plain_bits;
	unsigned long tmp;
	unsigned long rl;
	unsigned len;
	unsigned toggle;
	int bits;

	/* may we use this feature? */
	if ((mdev->tconn->net_conf->use_rle == 0) ||
	    (mdev->tconn->agreed_pro_version < 90))
		return 0;

	if (c->bit_offset >= c->bm_bits)
		return 0; /* nothing to do. */

	/* use at most this many bytes */
	bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
	memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
	/* plain bits covered in this code string */
	plain_bits = 0;

	/* p->encoding & 0x80 stores whether the first run length is set.
	 * bit offset is implicit.
	 * start with toggle == 2 to be able to tell the first iteration */
	toggle = 2;

	/* see how many plain bits we can stuff into one packet
	 * using RLE and VLI. */
	do {
		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
				    : _drbd_bm_find_next(mdev, c->bit_offset);
		if (tmp == -1UL)
			tmp = c->bm_bits;
		rl = tmp - c->bit_offset;

		if (toggle == 2) { /* first iteration */
			if (rl == 0) {
				/* the first checked bit was set,
				 * store start value, */
				dcbp_set_start(p, 1);
				/* but skip encoding of zero run length */
				toggle = !toggle;
				continue;
			}
			dcbp_set_start(p, 0);
		}

		/* paranoia: catch zero runlength.
		 * can only happen if bitmap is modified while we scan it. */
		if (rl == 0) {
			dev_err(DEV, "unexpected zero runlength while encoding bitmap "
			    "t:%u bo:%lu\n", toggle, c->bit_offset);
			return -1;
		}

		bits = vli_encode_bits(&bs, rl);
		if (bits == -ENOBUFS) /* buffer full */
			break;
		if (bits <= 0) {
			dev_err(DEV, "error while encoding bitmap: %d\n", bits);
			return 0;
		}

		toggle = !toggle;
		plain_bits += rl;
		c->bit_offset = tmp;
	} while (c->bit_offset < c->bm_bits);

	len = bs.cur.b - p->code + !!bs.cur.bit;

	if (plain_bits < (len << 3)) {
		/* incompressible with this method.
		 * we need to rewind both word and bit position. */
		c->bit_offset -= plain_bits;
		bm_xfer_ctx_bit_to_word_offset(c);
		c->bit_offset = c->word_offset * BITS_PER_LONG;
		return 0;
	}

	/* RLE + VLI was able to compress it just fine.
	 * update c->word_offset. */
	bm_xfer_ctx_bit_to_word_offset(c);

	/* store pad_bits */
	dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);

	return len;
}
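
/*
 * Worked example of the pay-off check above: if the RLE/VLI code string
 * ended up len == 100 bytes long, it is only used when it covers more than
 * len << 3 == 800 plain bitmap bits; otherwise the bit/word offsets are
 * rewound and the caller falls back to sending the plain bitmap words.
 */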

/**
 * send_bitmap_rle_or_plain
 *
 * Return 0 when done, 1 when another iteration is needed, and a negative error
 * code upon failure.
 */
static int
send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c)
{
	struct p_compressed_bm *p = mdev->tconn->data.sbuf;
	unsigned long num_words;
	int len, err;

	len = fill_bitmap_rle_bits(mdev, p, c);

	if (len < 0)
		return -EIO;

	if (len) {
		dcbp_set_code(p, RLE_VLI_Bits);
		err = _drbd_send_cmd(mdev, &mdev->tconn->data,
				     P_COMPRESSED_BITMAP, &p->head,
				     sizeof(*p) + len, 0);

		c->packets[0]++;
		c->bytes[0] += sizeof(*p) + len;

		if (c->bit_offset >= c->bm_bits)
			len = 0; /* DONE */
	} else {
		/* was not compressible.
		 * send a buffer full of plain text bits instead. */
		struct p_header *h = mdev->tconn->data.sbuf;
		num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
		len = num_words * sizeof(long);
		if (len)
			drbd_bm_get_lel(mdev, c->word_offset, num_words,
					(unsigned long *)h->payload);
		err = _drbd_send_cmd(mdev, &mdev->tconn->data, P_BITMAP,
				     h, sizeof(struct p_header80) + len, 0);
		c->word_offset += num_words;
		c->bit_offset = c->word_offset * BITS_PER_LONG;

		c->packets[1]++;
		c->bytes[1] += sizeof(struct p_header80) + len;

		if (c->bit_offset > c->bm_bits)
			c->bit_offset = c->bm_bits;
	}
	if (!err) {
		if (len == 0) {
			INFO_bm_xfer_stats(mdev, "send", c);
			return 0;
		} else
			return 1;
	}
	return -EIO;
}

/* See the comment at receive_bitmap() */
static int _drbd_send_bitmap(struct drbd_conf *mdev)
{
	struct bm_xfer_ctx c;
	int err;

	if (!expect(mdev->bitmap))
		return false;

	if (get_ldev(mdev)) {
		if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
			dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
			drbd_bm_set_all(mdev);
			if (drbd_bm_write(mdev)) {
				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
				 * but otherwise process as per normal - need to tell other
				 * side that a full resync is required! */
				dev_err(DEV, "Failed to write bitmap to disk!\n");
			} else {
				drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
				drbd_md_sync(mdev);
			}
		}
		put_ldev(mdev);
	}

	c = (struct bm_xfer_ctx) {
		.bm_bits = drbd_bm_bits(mdev),
		.bm_words = drbd_bm_words(mdev),
	};

	do {
		err = send_bitmap_rle_or_plain(mdev, &c);
	} while (err > 0);

	return err == 0;
}

int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int err;

	if (drbd_get_data_sock(mdev->tconn))
		return -1;
	err = !_drbd_send_bitmap(mdev);
	drbd_put_data_sock(mdev->tconn);
	return err;
}

void drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
{
	struct p_barrier_ack p;

	p.barrier  = barrier_nr;
	p.set_size = cpu_to_be32(set_size);

	if (mdev->state.conn >= C_CONNECTED)
		drbd_send_cmd(mdev, &mdev->tconn->meta, P_BARRIER_ACK, &p.head, sizeof(p));
}

/**
 * _drbd_send_ack() - Sends an ack packet
 * @mdev:	DRBD device.
 * @cmd:	Packet command code.
 * @sector:	sector, needs to be in big endian byte order
 * @blksize:	size in byte, needs to be in big endian byte order
 * @block_id:	Id, big endian byte order
 */
static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
			  u64 sector, u32 blksize, u64 block_id)
{
	struct p_block_ack p;

	p.sector   = sector;
	p.block_id = block_id;
	p.blksize  = blksize;
	p.seq_num  = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));

	if (!mdev->tconn->meta.socket || mdev->state.conn < C_CONNECTED)
		return -EIO;
	return drbd_send_cmd(mdev, &mdev->tconn->meta, cmd, &p.head, sizeof(p));
}

/* dp->sector and dp->block_id already/still in network byte order,
 * data_size is payload size according to dp->head,
 * and may need to be corrected for digest size. */
void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
		      struct p_data *dp, int data_size)
{
	data_size -= (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_r_tfm) ?
		crypto_hash_digestsize(mdev->tconn->integrity_r_tfm) : 0;
	_drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
		       dp->block_id);
}

void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
		      struct p_block_req *rp)
{
	_drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
}

/**
 * drbd_send_ack() - Sends an ack packet
 * @mdev:	DRBD device
 * @cmd:	packet command code
 * @peer_req:	peer request
 */
int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
		  struct drbd_peer_request *peer_req)
{
	return _drbd_send_ack(mdev, cmd,
			      cpu_to_be64(peer_req->i.sector),
			      cpu_to_be32(peer_req->i.size),
			      peer_req->block_id);
}

/* This function misuses the block_id field to signal if the blocks
 * are in sync or not. */
int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
		     sector_t sector, int blksize, u64 block_id)
{
	return _drbd_send_ack(mdev, cmd,
			      cpu_to_be64(sector),
			      cpu_to_be32(blksize),
			      cpu_to_be64(block_id));
}

int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
		       sector_t sector, int size, u64 block_id)
{
	struct p_block_req p;

	p.sector   = cpu_to_be64(sector);
	p.block_id = block_id;
	p.blksize  = cpu_to_be32(size);

	return drbd_send_cmd(mdev, &mdev->tconn->data, cmd, &p.head, sizeof(p));
}

int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
			    void *digest, int digest_size, enum drbd_packet cmd)
{
	int err;
	struct p_block_req p;

	prepare_header(mdev, &p.head, cmd, sizeof(p) - sizeof(struct p_header) + digest_size);
	p.sector   = cpu_to_be64(sector);
	p.block_id = ID_SYNCER /* unused */;
	p.blksize  = cpu_to_be32(size);

	mutex_lock(&mdev->tconn->data.mutex);
	err = drbd_send_all(mdev->tconn, mdev->tconn->data.socket, &p, sizeof(p), 0);
	if (!err)
		err = drbd_send_all(mdev->tconn, mdev->tconn->data.socket, digest, digest_size, 0);
	mutex_unlock(&mdev->tconn->data.mutex);
	return err;
}

int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
{
	struct p_block_req p;

	p.sector   = cpu_to_be64(sector);
	p.block_id = ID_SYNCER /* unused */;
	p.blksize  = cpu_to_be32(size);

	return drbd_send_cmd(mdev, &mdev->tconn->data, P_OV_REQUEST, &p.head, sizeof(p));
}

/* called on sndtimeo
 * returns false if we should retry,
 * true if we think connection is dead
 */
static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock)
{
	int drop_it;
	/* long elapsed = (long)(jiffies - mdev->last_received); */

	drop_it = tconn->meta.socket == sock
		|| !tconn->asender.task
		|| get_t_state(&tconn->asender) != RUNNING
		|| tconn->cstate < C_WF_REPORT_PARAMS;

	if (drop_it)
		return true;

	drop_it = !--tconn->ko_count;
	if (!drop_it) {
		conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
			 current->comm, current->pid, tconn->ko_count);
		request_ping(tconn);
	}

	return drop_it; /* && (mdev->state == R_PRIMARY) */;
}

static void drbd_update_congested(struct drbd_tconn *tconn)
{
	struct sock *sk = tconn->data.socket->sk;
	if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
		set_bit(NET_CONGESTED, &tconn->flags);
}
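
/*
 * Example of the 4/5 threshold above: if sk_sndbuf happens to be 100000
 * bytes, NET_CONGESTED gets set as soon as more than 80000 bytes sit in
 * the socket's send queue; the flag is cleared again at the end of
 * _drbd_send_page() below.
 */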

/* The idea of sendpage seems to be to put some kind of reference
 * to the page into the skb, and to hand it over to the NIC. In
 * this process get_page() gets called.
 *
 * As soon as the page was really sent over the network put_page()
 * gets called by some part of the network layer. [ NIC driver? ]
 *
 * [ get_page() / put_page() increment/decrement the count. If count
 *   reaches 0 the page will be freed. ]
 *
 * This works nicely with pages from FSs.
 * But this means that in protocol A we might signal IO completion too early!
 *
 * In order not to corrupt data during a resync we must make sure
 * that we do not reuse our own buffer pages (EEs) too early, therefore
 * we have the net_ee list.
 *
 * XFS seems to have problems, still, it submits pages with page_count == 0!
 * As a workaround, we disable sendpage on pages
 * with page_count == 0 or PageSlab.
 */
static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
			      int offset, size_t size, unsigned msg_flags)
{
	struct socket *socket;
	void *addr;
	int err;

	socket = mdev->tconn->data.socket;
	addr = kmap(page) + offset;
	err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags);
	kunmap(page);
	if (!err)
		mdev->send_cnt += size >> 9;
	return err;
}

static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
			   int offset, size_t size, unsigned msg_flags)
{
	struct socket *socket = mdev->tconn->data.socket;
	mm_segment_t oldfs = get_fs();
	int len = size;
	int err = -EIO;

	/* e.g. XFS meta- & log-data is in slab pages, which have a
	 * page_count of 0 and/or have PageSlab() set.
	 * we cannot use send_page for those, as that does get_page();
	 * put_page(); and would cause either a VM_BUG directly, or
	 * __page_cache_release a page that would actually still be referenced
	 * by someone, leading to some obscure delayed Oops somewhere else. */
	if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
		return _drbd_no_send_page(mdev, page, offset, size, msg_flags);

	msg_flags |= MSG_NOSIGNAL;
	drbd_update_congested(mdev->tconn);
	set_fs(KERNEL_DS);
	do {
		int sent;

		sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
		if (sent <= 0) {
			if (sent == -EAGAIN) {
				if (we_should_drop_the_connection(mdev->tconn, socket))
					break;
				continue;
			}
			dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
				 __func__, (int)size, len, sent);
			if (sent < 0)
				err = sent;
			break;
		}
		len    -= sent;
		offset += sent;
	} while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
	set_fs(oldfs);
	clear_bit(NET_CONGESTED, &mdev->tconn->flags);

	if (len == 0) {
		err = 0;
		mdev->send_cnt += size >> 9;
	}
	return err;
}

static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
{
	struct bio_vec *bvec;
	int i;
	/* hint all but last page with MSG_MORE */
	__bio_for_each_segment(bvec, bio, i, 0) {
		int err;

		err = _drbd_no_send_page(mdev, bvec->bv_page,
					 bvec->bv_offset, bvec->bv_len,
					 i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
		if (err)
			return err;
	}
	return 0;
}

static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
{
	struct bio_vec *bvec;
	int i;
	/* hint all but last page with MSG_MORE */
	__bio_for_each_segment(bvec, bio, i, 0) {
		int err;

		err = _drbd_send_page(mdev, bvec->bv_page,
				      bvec->bv_offset, bvec->bv_len,
				      i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
		if (err)
			return err;
	}
	return 0;
}

static int _drbd_send_zc_ee(struct drbd_conf *mdev,
			    struct drbd_peer_request *peer_req)
{
	struct page *page = peer_req->pages;
	unsigned len = peer_req->i.size;
	int err;

	/* hint all but last page with MSG_MORE */
	page_chain_for_each(page) {
		unsigned l = min_t(unsigned, len, PAGE_SIZE);

		err = _drbd_send_page(mdev, page, 0, l,
				      page_chain_next(page) ? MSG_MORE : 0);
		if (err)
			return err;
		len -= l;
	}
	return 0;
}

static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
{
	if (mdev->tconn->agreed_pro_version >= 95)
		return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
			(bi_rw & REQ_FUA ? DP_FUA : 0) |
			(bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
			(bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
	else
		return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
}
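
/*
 * Illustration of the mapping above: a bio carrying REQ_FLUSH | REQ_FUA is
 * sent as DP_FLUSH | DP_FUA to peers that agreed on protocol 95 or newer;
 * for older peers only the REQ_SYNC bit is translated (to DP_RW_SYNC) and
 * the other flags are dropped.
 */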
1629
b411b363
PR
1630/* Used to send write requests
1631 * R_PRIMARY -> Peer (P_DATA)
1632 */
1633int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
1634{
6bdb9b0e 1635 int err;
b411b363
PR
1636 struct p_data p;
1637 unsigned int dp_flags = 0;
1638 void *dgb;
1639 int dgs;
1640
6bdb9b0e
AG
1641 err = drbd_get_data_sock(mdev->tconn);
1642 if (err)
1643 return err;
b411b363 1644
a0638456
PR
1645 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
1646 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
b411b363 1647
fd340c12 1648 prepare_header(mdev, &p.head, P_DATA, sizeof(p) - sizeof(struct p_header) + dgs + req->i.size);
ace652ac 1649 p.sector = cpu_to_be64(req->i.sector);
b411b363 1650 p.block_id = (unsigned long)req;
8ccf218e 1651 p.seq_num = cpu_to_be32(req->seq_num = atomic_inc_return(&mdev->packet_seq));
b411b363 1652
76d2e7ec
PR
1653 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
1654
b411b363
PR
1655 if (mdev->state.conn >= C_SYNC_SOURCE &&
1656 mdev->state.conn <= C_PAUSED_SYNC_T)
1657 dp_flags |= DP_MAY_SET_IN_SYNC;
1658
1659 p.dp_flags = cpu_to_be32(dp_flags);
b411b363 1660 set_bit(UNPLUG_REMOTE, &mdev->flags);
6bdb9b0e
AG
1661 err = drbd_send_all(mdev->tconn, mdev->tconn->data.socket, &p,
1662 sizeof(p), dgs ? MSG_MORE : 0);
1663 if (!err && dgs) {
a0638456
PR
1664 dgb = mdev->tconn->int_dig_out;
1665 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, dgb);
6bdb9b0e 1666 err = drbd_send_all(mdev->tconn, mdev->tconn->data.socket, dgb, dgs, 0);
b411b363 1667 }
6bdb9b0e 1668 if (!err) {
470be44a
LE
1669 /* For protocol A, we have to memcpy the payload into
1670 * socket buffers, as we may complete the request right away,
1671 * as soon as we have handed it over to tcp, at which point the data
1672 * pages may become invalid.
1673 *
1674 * With data integrity enabled, we copy it as well, so we can be
1675 * sure that even if the bio pages are still being modified, this
1676 * won't change the data on the wire; thus if the digest checks
1677 * out ok after sending on this side but does not match on the
1678 * receiving side, the corruption must have happened elsewhere.
1679 */
89e58e75 1680 if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A || dgs)
6bdb9b0e 1681 err = _drbd_send_bio(mdev, req->master_bio);
b411b363 1682 else
6bdb9b0e 1683 err = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
1684
1685 /* double check digest, sometimes buffers have been modified in flight. */
1686 if (dgs > 0 && dgs <= 64) {
24c4830c 1687 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
1688 * currently supported in kernel crypto. */
1689 unsigned char digest[64];
a0638456
PR
1690 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, digest);
1691 if (memcmp(mdev->tconn->int_dig_out, digest, dgs)) {
470be44a
LE
1692 dev_warn(DEV,
1693 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
ace652ac 1694 (unsigned long long)req->i.sector, req->i.size);
470be44a
LE
1695 }
1696 } /* else if (dgs > 64) {
1697 ... Be noisy about digest too large ...
1698 } */
b411b363
PR
1699 }
1700
61120870 1701 drbd_put_data_sock(mdev->tconn);
bd26bfc5 1702
6bdb9b0e 1703 return err;
b411b363
PR
1704}
1705
1706/* answer packet, used to send data back for read requests:
1707 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
1708 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
1709 */
d8763023 1710int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
db830c46 1711 struct drbd_peer_request *peer_req)
b411b363 1712{
7b57b89d 1713 int err;
b411b363
PR
1714 struct p_data p;
1715 void *dgb;
1716 int dgs;
1717
a0638456
PR
1718 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
1719 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
b411b363 1720
db830c46
AG
1721 prepare_header(mdev, &p.head, cmd, sizeof(p) -
1722 sizeof(struct p_header80) +
1723 dgs + peer_req->i.size);
1724 p.sector = cpu_to_be64(peer_req->i.sector);
1725 p.block_id = peer_req->block_id;
cc378270 1726 p.seq_num = 0; /* unused */
b411b363
PR
1727
1728 /* Only called by our kernel thread.
1729 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
1730 * in response to an admin command or module unload.
1731 */
7b57b89d
AG
1732 err = drbd_get_data_sock(mdev->tconn);
1733 if (err)
1734 return err;
1735 err = drbd_send_all(mdev->tconn, mdev->tconn->data.socket, &p,
1736 sizeof(p), dgs ? MSG_MORE : 0);
1737 if (!err && dgs) {
a0638456 1738 dgb = mdev->tconn->int_dig_out;
db830c46 1739 drbd_csum_ee(mdev, mdev->tconn->integrity_w_tfm, peer_req, dgb);
7b57b89d
AG
1740 err = drbd_send_all(mdev->tconn, mdev->tconn->data.socket, dgb,
1741 dgs, 0);
b411b363 1742 }
7b57b89d
AG
1743 if (!err)
1744 err = _drbd_send_zc_ee(mdev, peer_req);
61120870 1745 drbd_put_data_sock(mdev->tconn);
bd26bfc5 1746
7b57b89d 1747 return err;
b411b363
PR
1748}
1749
8f7bed77 1750int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req)
73a01a18
PR
1751{
1752 struct p_block_desc p;
1753
ace652ac
AG
1754 p.sector = cpu_to_be64(req->i.sector);
1755 p.blksize = cpu_to_be32(req->i.size);
73a01a18 1756
73218a3c 1757 return drbd_send_cmd(mdev, &mdev->tconn->data, P_OUT_OF_SYNC, &p.head, sizeof(p));
73a01a18
PR
1758}
1759
b411b363
PR
1760/*
1761 drbd_send distinguishes two cases:
1762
1763 Packets sent via the data socket "sock"
1764 and packets sent via the meta data socket "msock"
1765
1766                  |  sock                   |  msock
1767 -----------------+-------------------------+------------------------------
1768 timeout          |  conf.timeout / 2       |  conf.timeout / 2
1769 timeout action   |  send a ping via msock  |  Abort communication
1770                  |                         |  and close all sockets
1771*/
1772
1773/*
1774 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
1775 */
bedbd2a5 1776int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
b411b363
PR
1777 void *buf, size_t size, unsigned msg_flags)
1778{
1779 struct kvec iov;
1780 struct msghdr msg;
1781 int rv, sent = 0;
1782
1783 if (!sock)
c0d42c8e 1784 return -EBADR;
b411b363
PR
1785
1786 /* THINK if (signal_pending) return ... ? */
1787
1788 iov.iov_base = buf;
1789 iov.iov_len = size;
1790
1791 msg.msg_name = NULL;
1792 msg.msg_namelen = 0;
1793 msg.msg_control = NULL;
1794 msg.msg_controllen = 0;
1795 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
1796
bedbd2a5
PR
1797 if (sock == tconn->data.socket) {
1798 tconn->ko_count = tconn->net_conf->ko_count;
1799 drbd_update_congested(tconn);
b411b363
PR
1800 }
1801 do {
1802 /* STRANGE
1803 * tcp_sendmsg does _not_ use its size parameter at all ?
1804 *
1805 * -EAGAIN on timeout, -EINTR on signal.
1806 */
1807/* THINK
1808 * do we need to block DRBD_SIG if sock == &meta.socket ??
1809 * otherwise wake_asender() might interrupt some send_*Ack !
1810 */
1811 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
1812 if (rv == -EAGAIN) {
bedbd2a5 1813 if (we_should_drop_the_connection(tconn, sock))
b411b363
PR
1814 break;
1815 else
1816 continue;
1817 }
b411b363
PR
1818 if (rv == -EINTR) {
1819 flush_signals(current);
1820 rv = 0;
1821 }
1822 if (rv < 0)
1823 break;
1824 sent += rv;
1825 iov.iov_base += rv;
1826 iov.iov_len -= rv;
1827 } while (sent < size);
1828
bedbd2a5
PR
1829 if (sock == tconn->data.socket)
1830 clear_bit(NET_CONGESTED, &tconn->flags);
b411b363
PR
1831
1832 if (rv <= 0) {
1833 if (rv != -EAGAIN) {
bedbd2a5
PR
1834 conn_err(tconn, "%s_sendmsg returned %d\n",
1835 sock == tconn->meta.socket ? "msock" : "sock",
1836 rv);
bbeb641c 1837 conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
b411b363 1838 } else
bbeb641c 1839 conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD);
b411b363
PR
1840 }
1841
1842 return sent;
1843}
1844
fb708e40
AG
1845/**
1846 * drbd_send_all - Send an entire buffer
1847 *
1848 * Returns 0 upon success and a negative error value otherwise.
1849 */
1850int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer,
1851 size_t size, unsigned msg_flags)
1852{
1853 int err;
1854
1855 err = drbd_send(tconn, sock, buffer, size, msg_flags);
1856 if (err < 0)
1857 return err;
1858 if (err != size)
1859 return -EIO;
1860 return 0;
1861}
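/* Editor's illustrative sketch, not part of drbd: the calling convention the
 * senders above follow -- push a fixed-size header with MSG_MORE, then the
 * payload, treating any short send as an error.  Real callers also hold the
 * data socket via drbd_get_data_sock() first.  The helper name and the
 * 'hdr'/'payload' parameters are hypothetical; drbd_send_all() is the real
 * function defined above. */
static int example_send_header_and_payload(struct drbd_tconn *tconn,
					   void *hdr, size_t hdr_size,
					   void *payload, size_t payload_size)
{
	int err;

	err = drbd_send_all(tconn, tconn->data.socket, hdr, hdr_size,
			    payload_size ? MSG_MORE : 0);
	if (err)
		return err;
	if (payload_size)
		err = drbd_send_all(tconn, tconn->data.socket,
				    payload, payload_size, 0);
	return err;
}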
1862
b411b363
PR
1863static int drbd_open(struct block_device *bdev, fmode_t mode)
1864{
1865 struct drbd_conf *mdev = bdev->bd_disk->private_data;
1866 unsigned long flags;
1867 int rv = 0;
1868
2a48fc0a 1869 mutex_lock(&drbd_main_mutex);
87eeee41 1870 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
b411b363
PR
1871 /* to have a stable mdev->state.role
1872 * and no race with updating open_cnt */
1873
1874 if (mdev->state.role != R_PRIMARY) {
1875 if (mode & FMODE_WRITE)
1876 rv = -EROFS;
1877 else if (!allow_oos)
1878 rv = -EMEDIUMTYPE;
1879 }
1880
1881 if (!rv)
1882 mdev->open_cnt++;
87eeee41 1883 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
2a48fc0a 1884 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1885
1886 return rv;
1887}
1888
1889static int drbd_release(struct gendisk *gd, fmode_t mode)
1890{
1891 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 1892 mutex_lock(&drbd_main_mutex);
b411b363 1893 mdev->open_cnt--;
2a48fc0a 1894 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1895 return 0;
1896}
1897
b411b363
PR
1898static void drbd_set_defaults(struct drbd_conf *mdev)
1899{
f399002e
LE
1900 /* Beware! The actual layout differs
1901 * between big endian and little endian */
da9fbc27 1902 mdev->state = (union drbd_dev_state) {
b411b363
PR
1903 { .role = R_SECONDARY,
1904 .peer = R_UNKNOWN,
1905 .conn = C_STANDALONE,
1906 .disk = D_DISKLESS,
1907 .pdsk = D_UNKNOWN,
b411b363
PR
1908 } };
1909}
1910
1911void drbd_init_set_defaults(struct drbd_conf *mdev)
1912{
1913 /* the memset(,0,) did most of this.
1914 * note: only assignments, no allocation in here */
1915
1916 drbd_set_defaults(mdev);
1917
b411b363
PR
1918 atomic_set(&mdev->ap_bio_cnt, 0);
1919 atomic_set(&mdev->ap_pending_cnt, 0);
1920 atomic_set(&mdev->rs_pending_cnt, 0);
1921 atomic_set(&mdev->unacked_cnt, 0);
1922 atomic_set(&mdev->local_cnt, 0);
435f0740 1923 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 1924 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 1925 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 1926 atomic_set(&mdev->ap_in_flight, 0);
b411b363
PR
1927
1928 mutex_init(&mdev->md_io_mutex);
8410da8f
PR
1929 mutex_init(&mdev->own_state_mutex);
1930 mdev->state_mutex = &mdev->own_state_mutex;
b411b363 1931
b411b363 1932 spin_lock_init(&mdev->al_lock);
b411b363
PR
1933 spin_lock_init(&mdev->peer_seq_lock);
1934 spin_lock_init(&mdev->epoch_lock);
1935
1936 INIT_LIST_HEAD(&mdev->active_ee);
1937 INIT_LIST_HEAD(&mdev->sync_ee);
1938 INIT_LIST_HEAD(&mdev->done_ee);
1939 INIT_LIST_HEAD(&mdev->read_ee);
1940 INIT_LIST_HEAD(&mdev->net_ee);
1941 INIT_LIST_HEAD(&mdev->resync_reads);
b411b363
PR
1942 INIT_LIST_HEAD(&mdev->resync_work.list);
1943 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 1944 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 1945 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 1946 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 1947 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 1948
794abb75 1949 mdev->resync_work.cb = w_resync_timer;
b411b363 1950 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 1951 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
1952 mdev->md_sync_work.cb = w_md_sync;
1953 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 1954 mdev->start_resync_work.cb = w_start_resync;
a21e9298
PR
1955
1956 mdev->resync_work.mdev = mdev;
1957 mdev->unplug_work.mdev = mdev;
1958 mdev->go_diskless.mdev = mdev;
1959 mdev->md_sync_work.mdev = mdev;
1960 mdev->bm_io_work.w.mdev = mdev;
1961 mdev->start_resync_work.mdev = mdev;
1962
b411b363
PR
1963 init_timer(&mdev->resync_timer);
1964 init_timer(&mdev->md_sync_timer);
370a43e7 1965 init_timer(&mdev->start_resync_timer);
7fde2be9 1966 init_timer(&mdev->request_timer);
b411b363
PR
1967 mdev->resync_timer.function = resync_timer_fn;
1968 mdev->resync_timer.data = (unsigned long) mdev;
1969 mdev->md_sync_timer.function = md_sync_timer_fn;
1970 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
1971 mdev->start_resync_timer.function = start_resync_timer_fn;
1972 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
1973 mdev->request_timer.function = request_timer_fn;
1974 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
1975
1976 init_waitqueue_head(&mdev->misc_wait);
1977 init_waitqueue_head(&mdev->state_wait);
1978 init_waitqueue_head(&mdev->ee_wait);
1979 init_waitqueue_head(&mdev->al_wait);
1980 init_waitqueue_head(&mdev->seq_wait);
1981
fd340c12 1982 /* mdev->tconn->agreed_pro_version gets initialized in drbd_connect() */
2451fc3b 1983 mdev->write_ordering = WO_bdev_flush;
b411b363 1984 mdev->resync_wenr = LC_FREE;
99432fcc
PR
1985 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
1986 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
1987}
1988
1989void drbd_mdev_cleanup(struct drbd_conf *mdev)
1990{
1d7734a0 1991 int i;
e6b3ea83 1992 if (mdev->tconn->receiver.t_state != NONE)
b411b363 1993 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
e6b3ea83 1994 mdev->tconn->receiver.t_state);
b411b363
PR
1995
1996 /* no need to lock it, I'm the only thread alive */
1997 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
1998 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
1999 mdev->al_writ_cnt =
2000 mdev->bm_writ_cnt =
2001 mdev->read_cnt =
2002 mdev->recv_cnt =
2003 mdev->send_cnt =
2004 mdev->writ_cnt =
2005 mdev->p_size =
2006 mdev->rs_start =
2007 mdev->rs_total =
1d7734a0
LE
2008 mdev->rs_failed = 0;
2009 mdev->rs_last_events = 0;
0f0601f4 2010 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
2011 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2012 mdev->rs_mark_left[i] = 0;
2013 mdev->rs_mark_time[i] = 0;
2014 }
89e58e75 2015 D_ASSERT(mdev->tconn->net_conf == NULL);
b411b363
PR
2016
2017 drbd_set_my_capacity(mdev, 0);
2018 if (mdev->bitmap) {
2019 /* maybe never allocated. */
02d9a94b 2020 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
2021 drbd_bm_cleanup(mdev);
2022 }
2023
2024 drbd_free_resources(mdev);
0778286a 2025 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363
PR
2026
2027 /*
2028 * currently we call drbd_init_ee only on module load, so
2029 * we may call drbd_release_ee only on module unload!
2030 */
2031 D_ASSERT(list_empty(&mdev->active_ee));
2032 D_ASSERT(list_empty(&mdev->sync_ee));
2033 D_ASSERT(list_empty(&mdev->done_ee));
2034 D_ASSERT(list_empty(&mdev->read_ee));
2035 D_ASSERT(list_empty(&mdev->net_ee));
2036 D_ASSERT(list_empty(&mdev->resync_reads));
e42325a5
PR
2037 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
2038 D_ASSERT(list_empty(&mdev->tconn->meta.work.q));
b411b363
PR
2039 D_ASSERT(list_empty(&mdev->resync_work.list));
2040 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 2041 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
2042
2043 drbd_set_defaults(mdev);
b411b363
PR
2044}
2045
2046
2047static void drbd_destroy_mempools(void)
2048{
2049 struct page *page;
2050
2051 while (drbd_pp_pool) {
2052 page = drbd_pp_pool;
2053 drbd_pp_pool = (struct page *)page_private(page);
2054 __free_page(page);
2055 drbd_pp_vacant--;
2056 }
2057
2058 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2059
da4a75d2
LE
2060 if (drbd_md_io_bio_set)
2061 bioset_free(drbd_md_io_bio_set);
35abf594
LE
2062 if (drbd_md_io_page_pool)
2063 mempool_destroy(drbd_md_io_page_pool);
b411b363
PR
2064 if (drbd_ee_mempool)
2065 mempool_destroy(drbd_ee_mempool);
2066 if (drbd_request_mempool)
2067 mempool_destroy(drbd_request_mempool);
2068 if (drbd_ee_cache)
2069 kmem_cache_destroy(drbd_ee_cache);
2070 if (drbd_request_cache)
2071 kmem_cache_destroy(drbd_request_cache);
2072 if (drbd_bm_ext_cache)
2073 kmem_cache_destroy(drbd_bm_ext_cache);
2074 if (drbd_al_ext_cache)
2075 kmem_cache_destroy(drbd_al_ext_cache);
2076
da4a75d2 2077 drbd_md_io_bio_set = NULL;
35abf594 2078 drbd_md_io_page_pool = NULL;
b411b363
PR
2079 drbd_ee_mempool = NULL;
2080 drbd_request_mempool = NULL;
2081 drbd_ee_cache = NULL;
2082 drbd_request_cache = NULL;
2083 drbd_bm_ext_cache = NULL;
2084 drbd_al_ext_cache = NULL;
2085
2086 return;
2087}
2088
2089static int drbd_create_mempools(void)
2090{
2091 struct page *page;
1816a2b4 2092 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
2093 int i;
2094
2095 /* prepare our caches and mempools */
2096 drbd_request_mempool = NULL;
2097 drbd_ee_cache = NULL;
2098 drbd_request_cache = NULL;
2099 drbd_bm_ext_cache = NULL;
2100 drbd_al_ext_cache = NULL;
2101 drbd_pp_pool = NULL;
35abf594 2102 drbd_md_io_page_pool = NULL;
da4a75d2 2103 drbd_md_io_bio_set = NULL;
b411b363
PR
2104
2105 /* caches */
2106 drbd_request_cache = kmem_cache_create(
2107 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2108 if (drbd_request_cache == NULL)
2109 goto Enomem;
2110
2111 drbd_ee_cache = kmem_cache_create(
f6ffca9f 2112 "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
b411b363
PR
2113 if (drbd_ee_cache == NULL)
2114 goto Enomem;
2115
2116 drbd_bm_ext_cache = kmem_cache_create(
2117 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2118 if (drbd_bm_ext_cache == NULL)
2119 goto Enomem;
2120
2121 drbd_al_ext_cache = kmem_cache_create(
2122 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2123 if (drbd_al_ext_cache == NULL)
2124 goto Enomem;
2125
2126 /* mempools */
da4a75d2
LE
2127 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
2128 if (drbd_md_io_bio_set == NULL)
2129 goto Enomem;
2130
35abf594
LE
2131 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
2132 if (drbd_md_io_page_pool == NULL)
2133 goto Enomem;
2134
b411b363
PR
2135 drbd_request_mempool = mempool_create(number,
2136 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2137 if (drbd_request_mempool == NULL)
2138 goto Enomem;
2139
2140 drbd_ee_mempool = mempool_create(number,
2141 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 2142 if (drbd_ee_mempool == NULL)
b411b363
PR
2143 goto Enomem;
2144
2145 /* drbd's page pool */
2146 spin_lock_init(&drbd_pp_lock);
2147
2148 for (i = 0; i < number; i++) {
2149 page = alloc_page(GFP_HIGHUSER);
2150 if (!page)
2151 goto Enomem;
2152 set_page_private(page, (unsigned long)drbd_pp_pool);
2153 drbd_pp_pool = page;
2154 }
2155 drbd_pp_vacant = number;
2156
2157 return 0;
2158
2159Enomem:
2160 drbd_destroy_mempools(); /* in case we allocated some */
2161 return -ENOMEM;
2162}
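/* Editor's illustrative sketch, not part of drbd: how a cache-backed mempool
 * such as drbd_request_mempool above is typically consumed.  GFP_NOIO is the
 * usual choice in an IO path so the allocation cannot recurse into writeback;
 * the helper names are hypothetical. */
static struct drbd_request *example_req_alloc(void)
{
	struct drbd_request *req;

	req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
	if (req)
		memset(req, 0, sizeof(*req));
	return req;
}

static void example_req_free(struct drbd_request *req)
{
	mempool_free(req, drbd_request_mempool);
}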
2163
2164static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2165 void *unused)
2166{
2167 /* just so we have it. you never know what interesting things we
2168 * might want to do here some day...
2169 */
2170
2171 return NOTIFY_DONE;
2172}
2173
2174static struct notifier_block drbd_notifier = {
2175 .notifier_call = drbd_notify_sys,
2176};
2177
2178static void drbd_release_ee_lists(struct drbd_conf *mdev)
2179{
2180 int rr;
2181
2182 rr = drbd_release_ee(mdev, &mdev->active_ee);
2183 if (rr)
2184 dev_err(DEV, "%d EEs in active list found!\n", rr);
2185
2186 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2187 if (rr)
2188 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2189
2190 rr = drbd_release_ee(mdev, &mdev->read_ee);
2191 if (rr)
2192 dev_err(DEV, "%d EEs in read list found!\n", rr);
2193
2194 rr = drbd_release_ee(mdev, &mdev->done_ee);
2195 if (rr)
2196 dev_err(DEV, "%d EEs in done list found!\n", rr);
2197
2198 rr = drbd_release_ee(mdev, &mdev->net_ee);
2199 if (rr)
2200 dev_err(DEV, "%d EEs in net list found!\n", rr);
2201}
2202
774b3055
PR
2203/* caution. no locking. */
2204void drbd_delete_device(unsigned int minor)
b411b363
PR
2205{
2206 struct drbd_conf *mdev = minor_to_mdev(minor);
2207
2208 if (!mdev)
2209 return;
2210
569083c0
LE
2211 idr_remove(&mdev->tconn->volumes, mdev->vnr);
2212 idr_remove(&minors, minor);
2213 synchronize_rcu();
774b3055 2214
b411b363 2215 /* paranoia asserts */
70dc65e1 2216 D_ASSERT(mdev->open_cnt == 0);
e42325a5 2217 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
b411b363
PR
2218 /* end paranoia asserts */
2219
2220 del_gendisk(mdev->vdisk);
2221
2222 /* cleanup stuff that may have been allocated during
2223 * device (re-)configuration or state changes */
2224
2225 if (mdev->this_bdev)
2226 bdput(mdev->this_bdev);
2227
2228 drbd_free_resources(mdev);
2229
2230 drbd_release_ee_lists(mdev);
2231
b411b363
PR
2232 lc_destroy(mdev->act_log);
2233 lc_destroy(mdev->resync);
2234
2235 kfree(mdev->p_uuid);
2236 /* mdev->p_uuid = NULL; */
2237
b411b363
PR
2238 /* cleanup the rest that has been
2239 * allocated from drbd_new_device
2240 * and actually free the mdev itself */
2241 drbd_free_mdev(mdev);
2242}
2243
2244static void drbd_cleanup(void)
2245{
2246 unsigned int i;
81a5d60e 2247 struct drbd_conf *mdev;
b411b363
PR
2248
2249 unregister_reboot_notifier(&drbd_notifier);
2250
17a93f30
LE
2251 /* first remove proc,
2252 * drbdsetup uses its presence to detect
2253 * whether DRBD is loaded.
2254 * If we got stuck in proc removal,
2255 * but have netlink already deregistered,
2256 * some drbdsetup commands may wait forever
2257 * for an answer.
2258 */
2259 if (drbd_proc)
2260 remove_proc_entry("drbd", NULL);
2261
3b98c0c2 2262 drbd_genl_unregister();
b411b363 2263
81a5d60e
PR
2264 idr_for_each_entry(&minors, mdev, i)
2265 drbd_delete_device(i);
2266 drbd_destroy_mempools();
b411b363
PR
2267 unregister_blkdev(DRBD_MAJOR, "drbd");
2268
81a5d60e
PR
2269 idr_destroy(&minors);
2270
b411b363
PR
2271 printk(KERN_INFO "drbd: module cleanup done.\n");
2272}
2273
2274/**
2275 * drbd_congested() - Callback for pdflush
2276 * @congested_data: User data
2277 * @bdi_bits: Bits pdflush is currently interested in
2278 *
2279 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2280 */
2281static int drbd_congested(void *congested_data, int bdi_bits)
2282{
2283 struct drbd_conf *mdev = congested_data;
2284 struct request_queue *q;
2285 char reason = '-';
2286 int r = 0;
2287
1b881ef7 2288 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
2289 /* DRBD has frozen IO */
2290 r = bdi_bits;
2291 reason = 'd';
2292 goto out;
2293 }
2294
2295 if (get_ldev(mdev)) {
2296 q = bdev_get_queue(mdev->ldev->backing_bdev);
2297 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2298 put_ldev(mdev);
2299 if (r)
2300 reason = 'b';
2301 }
2302
01a311a5 2303 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) {
b411b363
PR
2304 r |= (1 << BDI_async_congested);
2305 reason = reason == 'b' ? 'a' : 'n';
2306 }
2307
2308out:
2309 mdev->congestion_reason = reason;
2310 return r;
2311}
2312
6699b655
PR
2313static void drbd_init_workqueue(struct drbd_work_queue* wq)
2314{
2315 sema_init(&wq->s, 0);
2316 spin_lock_init(&wq->q_lock);
2317 INIT_LIST_HEAD(&wq->q);
2318}
2319
1aba4d7f
PR
2320struct drbd_tconn *conn_by_name(const char *name)
2321{
2322 struct drbd_tconn *tconn;
2323
3b98c0c2
LE
2324 if (!name || !name[0])
2325 return NULL;
2326
543cc10b 2327 mutex_lock(&drbd_cfg_mutex);
1aba4d7f
PR
2328 list_for_each_entry(tconn, &drbd_tconns, all_tconn) {
2329 if (!strcmp(tconn->name, name))
2330 goto found;
2331 }
2332 tconn = NULL;
2333found:
543cc10b 2334 mutex_unlock(&drbd_cfg_mutex);
1aba4d7f
PR
2335 return tconn;
2336}
2337
e6ef8a5c
AG
2338static int drbd_alloc_socket(struct drbd_socket *socket)
2339{
2340 socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
2341 if (!socket->rbuf)
2342 return -ENOMEM;
5a87d920
AG
2343 socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
2344 if (!socket->sbuf)
2345 return -ENOMEM;
e6ef8a5c
AG
2346 return 0;
2347}
2348
2349static void drbd_free_socket(struct drbd_socket *socket)
2350{
5a87d920 2351 free_page((unsigned long) socket->sbuf);
e6ef8a5c
AG
2352 free_page((unsigned long) socket->rbuf);
2353}
2354
3b98c0c2 2355struct drbd_tconn *drbd_new_tconn(const char *name)
2111438b
PR
2356{
2357 struct drbd_tconn *tconn;
2358
2359 tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
2360 if (!tconn)
2361 return NULL;
2362
2363 tconn->name = kstrdup(name, GFP_KERNEL);
2364 if (!tconn->name)
2365 goto fail;
2366
e6ef8a5c
AG
2367 if (drbd_alloc_socket(&tconn->data))
2368 goto fail;
2369 if (drbd_alloc_socket(&tconn->meta))
2370 goto fail;
2371
774b3055
PR
2372 if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL))
2373 goto fail;
2374
2f5cdd0b
PR
2375 if (!tl_init(tconn))
2376 goto fail;
2377
bbeb641c 2378 tconn->cstate = C_STANDALONE;
8410da8f 2379 mutex_init(&tconn->cstate_mutex);
6699b655 2380 spin_lock_init(&tconn->req_lock);
b2fb6dbe
PR
2381 atomic_set(&tconn->net_cnt, 0);
2382 init_waitqueue_head(&tconn->net_cnt_wait);
2a67d8b9 2383 init_waitqueue_head(&tconn->ping_wait);
062e879c 2384 idr_init(&tconn->volumes);
b2fb6dbe 2385
6699b655
PR
2386 drbd_init_workqueue(&tconn->data.work);
2387 mutex_init(&tconn->data.mutex);
2388
2389 drbd_init_workqueue(&tconn->meta.work);
2390 mutex_init(&tconn->meta.mutex);
2391
392c8801
PR
2392 drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver");
2393 drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker");
2394 drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender");
2395
f399002e
LE
2396 tconn->res_opts = (struct res_opts) {
2397 {}, 0, /* cpu_mask */
2398 DRBD_ON_NO_DATA_DEF, /* on_no_data */
2399 };
2400
543cc10b
LE
2401 mutex_lock(&drbd_cfg_mutex);
2402 list_add_tail(&tconn->all_tconn, &drbd_tconns);
2403 mutex_unlock(&drbd_cfg_mutex);
2111438b
PR
2404
2405 return tconn;
2406
2407fail:
2f5cdd0b 2408 tl_cleanup(tconn);
774b3055 2409 free_cpumask_var(tconn->cpu_mask);
e6ef8a5c
AG
2410 drbd_free_socket(&tconn->meta);
2411 drbd_free_socket(&tconn->data);
2111438b
PR
2412 kfree(tconn->name);
2413 kfree(tconn);
2414
2415 return NULL;
2416}
2417
2418void drbd_free_tconn(struct drbd_tconn *tconn)
2419{
2111438b 2420 list_del(&tconn->all_tconn);
062e879c 2421 idr_destroy(&tconn->volumes);
2111438b 2422
774b3055 2423 free_cpumask_var(tconn->cpu_mask);
e6ef8a5c
AG
2424 drbd_free_socket(&tconn->meta);
2425 drbd_free_socket(&tconn->data);
2111438b 2426 kfree(tconn->name);
b42a70ad
PR
2427 kfree(tconn->int_dig_out);
2428 kfree(tconn->int_dig_in);
2429 kfree(tconn->int_dig_vv);
2111438b
PR
2430 kfree(tconn);
2431}
2432
774b3055 2433enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
b411b363
PR
2434{
2435 struct drbd_conf *mdev;
2436 struct gendisk *disk;
2437 struct request_queue *q;
774b3055 2438 int vnr_got = vnr;
81a5d60e 2439 int minor_got = minor;
8432b314 2440 enum drbd_ret_code err = ERR_NOMEM;
774b3055
PR
2441
2442 mdev = minor_to_mdev(minor);
2443 if (mdev)
2444 return ERR_MINOR_EXISTS;
b411b363
PR
2445
2446 /* GFP_KERNEL, we are outside of all write-out paths */
2447 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2448 if (!mdev)
774b3055
PR
2449 return ERR_NOMEM;
2450
2451 mdev->tconn = tconn;
b411b363 2452 mdev->minor = minor;
3b98c0c2 2453 mdev->vnr = vnr;
b411b363
PR
2454
2455 drbd_init_set_defaults(mdev);
2456
2457 q = blk_alloc_queue(GFP_KERNEL);
2458 if (!q)
2459 goto out_no_q;
2460 mdev->rq_queue = q;
2461 q->queuedata = mdev;
b411b363
PR
2462
2463 disk = alloc_disk(1);
2464 if (!disk)
2465 goto out_no_disk;
2466 mdev->vdisk = disk;
2467
81e84650 2468 set_disk_ro(disk, true);
b411b363
PR
2469
2470 disk->queue = q;
2471 disk->major = DRBD_MAJOR;
2472 disk->first_minor = minor;
2473 disk->fops = &drbd_ops;
2474 sprintf(disk->disk_name, "drbd%d", minor);
2475 disk->private_data = mdev;
2476
2477 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
2478 /* we have no partitions. we contain only ourselves. */
2479 mdev->this_bdev->bd_contains = mdev->this_bdev;
2480
2481 q->backing_dev_info.congested_fn = drbd_congested;
2482 q->backing_dev_info.congested_data = mdev;
2483
2f58dcfc 2484 blk_queue_make_request(q, drbd_make_request);
99432fcc
PR
2485 /* Setting the max_hw_sectors to an odd value of 8 KiB here
2486 * triggers a max_bio_size message upon first attach or connect. */
2487 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
b411b363
PR
2488 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
2489 blk_queue_merge_bvec(q, drbd_merge_bvec);
87eeee41 2490 q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
b411b363
PR
2491
2492 mdev->md_io_page = alloc_page(GFP_KERNEL);
2493 if (!mdev->md_io_page)
2494 goto out_no_io_page;
2495
2496 if (drbd_bm_init(mdev))
2497 goto out_no_bitmap;
dac1389c 2498 mdev->read_requests = RB_ROOT;
de696716 2499 mdev->write_requests = RB_ROOT;
b411b363 2500
b411b363
PR
2501 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
2502 if (!mdev->current_epoch)
2503 goto out_no_epoch;
2504
2505 INIT_LIST_HEAD(&mdev->current_epoch->list);
2506 mdev->epochs = 1;
2507
81a5d60e 2508 if (!idr_pre_get(&minors, GFP_KERNEL))
8432b314
LE
2509 goto out_no_minor_idr;
2510 if (idr_get_new_above(&minors, mdev, minor, &minor_got))
2511 goto out_no_minor_idr;
81a5d60e 2512 if (minor_got != minor) {
8432b314
LE
2513 err = ERR_MINOR_EXISTS;
2514 drbd_msg_put_info("requested minor exists already");
569083c0 2515 goto out_idr_remove_minor;
81a5d60e 2516 }
8432b314
LE
2517
2518 if (!idr_pre_get(&tconn->volumes, GFP_KERNEL))
2519 goto out_idr_remove_minor;
2520 if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got))
2521 goto out_idr_remove_minor;
2522 if (vnr_got != vnr) {
2523 err = ERR_INVALID_REQUEST;
2524 drbd_msg_put_info("requested volume exists already");
2525 goto out_idr_remove_vol;
2526 }
774b3055
PR
2527 add_disk(disk);
2528
2325eb66
PR
2529 /* inherit the connection state */
2530 mdev->state.conn = tconn->cstate;
2531 if (mdev->state.conn == C_WF_REPORT_PARAMS)
2532 drbd_connected(vnr, mdev, tconn);
2533
774b3055 2534 return NO_ERROR;
b411b363 2535
569083c0
LE
2536out_idr_remove_vol:
2537 idr_remove(&tconn->volumes, vnr_got);
8432b314
LE
2538out_idr_remove_minor:
2539 idr_remove(&minors, minor_got);
569083c0 2540 synchronize_rcu();
8432b314 2541out_no_minor_idr:
81a5d60e 2542 kfree(mdev->current_epoch);
b411b363 2543out_no_epoch:
b411b363
PR
2544 drbd_bm_cleanup(mdev);
2545out_no_bitmap:
2546 __free_page(mdev->md_io_page);
2547out_no_io_page:
2548 put_disk(disk);
2549out_no_disk:
2550 blk_cleanup_queue(q);
2551out_no_q:
b411b363 2552 kfree(mdev);
8432b314 2553 return err;
b411b363
PR
2554}
2555
2556/* counterpart of drbd_new_device.
2557 * last part of drbd_delete_device. */
2558void drbd_free_mdev(struct drbd_conf *mdev)
2559{
2560 kfree(mdev->current_epoch);
b411b363
PR
2561 if (mdev->bitmap) /* should no longer be there. */
2562 drbd_bm_cleanup(mdev);
2563 __free_page(mdev->md_io_page);
2564 put_disk(mdev->vdisk);
2565 blk_cleanup_queue(mdev->rq_queue);
b411b363
PR
2566 kfree(mdev);
2567}
2568
2569
2570int __init drbd_init(void)
2571{
2572 int err;
2573
fd340c12 2574 BUILD_BUG_ON(sizeof(struct p_header80) != sizeof(struct p_header95));
6038178e 2575 BUILD_BUG_ON(sizeof(struct p_connection_features) != 80);
b411b363 2576
2b8a90b5 2577 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
b411b363 2578 printk(KERN_ERR
81a5d60e 2579 "drbd: invalid minor_count (%d)\n", minor_count);
b411b363
PR
2580#ifdef MODULE
2581 return -EINVAL;
2582#else
2583 minor_count = 8;
2584#endif
2585 }
2586
b411b363
PR
2587 err = register_blkdev(DRBD_MAJOR, "drbd");
2588 if (err) {
2589 printk(KERN_ERR
2590 "drbd: unable to register block device major %d\n",
2591 DRBD_MAJOR);
2592 return err;
2593 }
2594
3b98c0c2
LE
2595 err = drbd_genl_register();
2596 if (err) {
2597 printk(KERN_ERR "drbd: unable to register generic netlink family\n");
2598 goto fail;
2599 }
2600
2601
b411b363
PR
2602 register_reboot_notifier(&drbd_notifier);
2603
2604 /*
2605 * allocate all necessary structs
2606 */
2607 err = -ENOMEM;
2608
2609 init_waitqueue_head(&drbd_pp_wait);
2610
2611 drbd_proc = NULL; /* play safe for drbd_cleanup */
81a5d60e 2612 idr_init(&minors);
b411b363
PR
2613
2614 err = drbd_create_mempools();
2615 if (err)
3b98c0c2 2616 goto fail;
b411b363 2617
8c484ee4 2618 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
2619 if (!drbd_proc) {
2620 printk(KERN_ERR "drbd: unable to register proc file\n");
3b98c0c2 2621 goto fail;
b411b363
PR
2622 }
2623
2624 rwlock_init(&global_state_lock);
2111438b 2625 INIT_LIST_HEAD(&drbd_tconns);
b411b363
PR
2626
2627 printk(KERN_INFO "drbd: initialized. "
2628 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
2629 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
2630 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
2631 printk(KERN_INFO "drbd: registered as block device major %d\n",
2632 DRBD_MAJOR);
b411b363
PR
2633
2634 return 0; /* Success! */
2635
3b98c0c2 2636fail:
b411b363
PR
2637 drbd_cleanup();
2638 if (err == -ENOMEM)
2639 /* currently always the case */
2640 printk(KERN_ERR "drbd: ran out of memory\n");
2641 else
2642 printk(KERN_ERR "drbd: initialization failure\n");
2643 return err;
2644}
2645
2646void drbd_free_bc(struct drbd_backing_dev *ldev)
2647{
2648 if (ldev == NULL)
2649 return;
2650
e525fd89
TH
2651 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2652 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
b411b363
PR
2653
2654 kfree(ldev);
2655}
2656
360cc740
PR
2657void drbd_free_sock(struct drbd_tconn *tconn)
2658{
2659 if (tconn->data.socket) {
2660 mutex_lock(&tconn->data.mutex);
2661 kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR);
2662 sock_release(tconn->data.socket);
2663 tconn->data.socket = NULL;
2664 mutex_unlock(&tconn->data.mutex);
b411b363 2665 }
360cc740
PR
2666 if (tconn->meta.socket) {
2667 mutex_lock(&tconn->meta.mutex);
2668 kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR);
2669 sock_release(tconn->meta.socket);
2670 tconn->meta.socket = NULL;
2671 mutex_unlock(&tconn->meta.mutex);
b411b363
PR
2672 }
2673}
2674
2675
2676void drbd_free_resources(struct drbd_conf *mdev)
2677{
f399002e
LE
2678 crypto_free_hash(mdev->tconn->csums_tfm);
2679 mdev->tconn->csums_tfm = NULL;
2680 crypto_free_hash(mdev->tconn->verify_tfm);
2681 mdev->tconn->verify_tfm = NULL;
a0638456
PR
2682 crypto_free_hash(mdev->tconn->cram_hmac_tfm);
2683 mdev->tconn->cram_hmac_tfm = NULL;
2684 crypto_free_hash(mdev->tconn->integrity_w_tfm);
2685 mdev->tconn->integrity_w_tfm = NULL;
2686 crypto_free_hash(mdev->tconn->integrity_r_tfm);
2687 mdev->tconn->integrity_r_tfm = NULL;
b411b363 2688
360cc740 2689 drbd_free_sock(mdev->tconn);
b411b363
PR
2690
2691 __no_warn(local,
2692 drbd_free_bc(mdev->ldev);
2693 mdev->ldev = NULL;);
2694}
2695
2696/* meta data management */
2697
2698struct meta_data_on_disk {
2699 u64 la_size; /* last agreed size. */
2700 u64 uuid[UI_SIZE]; /* UUIDs. */
2701 u64 device_uuid;
2702 u64 reserved_u64_1;
2703 u32 flags; /* MDF */
2704 u32 magic;
2705 u32 md_size_sect;
2706 u32 al_offset; /* offset to this block */
2707 u32 al_nr_extents; /* important for restoring the AL */
f399002e 2708 /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
b411b363
PR
2709 u32 bm_offset; /* offset to the bitmap, from here */
2710 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
99432fcc
PR
2711 u32 la_peer_max_bio_size; /* last peer max_bio_size */
2712 u32 reserved_u32[3];
b411b363
PR
2713
2714} __packed;
2715
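/* Editor's illustrative sketch, not part of this file: drbd_md_sync() below
 * zeroes a full 512-byte buffer and writes the structure above into a single
 * metadata sector, so a compile-time check like this would catch the struct
 * accidentally growing past that sector size.  Purely hypothetical. */
static inline void example_md_layout_check(void)
{
	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) > 512);
}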
2716/**
2717 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
2718 * @mdev: DRBD device.
2719 */
2720void drbd_md_sync(struct drbd_conf *mdev)
2721{
2722 struct meta_data_on_disk *buffer;
2723 sector_t sector;
2724 int i;
2725
ee15b038
LE
2726 del_timer(&mdev->md_sync_timer);
2727 /* timer may be rearmed by drbd_md_mark_dirty() now. */
b411b363
PR
2728 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
2729 return;
b411b363
PR
2730
2731 /* We use D_FAILED here, and not D_ATTACHING, because we try to write
2732 * metadata even if we detach due to a disk failure! */
2733 if (!get_ldev_if_state(mdev, D_FAILED))
2734 return;
2735
b411b363
PR
2736 mutex_lock(&mdev->md_io_mutex);
2737 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2738 memset(buffer, 0, 512);
2739
2740 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
2741 for (i = UI_CURRENT; i < UI_SIZE; i++)
2742 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
2743 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
2744 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
2745
2746 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
2747 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
2748 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
2749 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
2750 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
2751
2752 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 2753 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
b411b363
PR
2754
2755 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
2756 sector = mdev->ldev->md.md_offset;
2757
3fbf4d21 2758 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
b411b363
PR
2759 /* this was just a try anyway ... */
2760 dev_err(DEV, "meta data update failed!\n");
81e84650 2761 drbd_chk_io_error(mdev, 1, true);
b411b363
PR
2762 }
2763
2764 /* Update mdev->ldev->md.la_size_sect,
2765 * since we updated it on metadata. */
2766 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
2767
2768 mutex_unlock(&mdev->md_io_mutex);
2769 put_ldev(mdev);
2770}
2771
2772/**
2773 * drbd_md_read() - Reads in the meta data super block
2774 * @mdev: DRBD device.
2775 * @bdev: Device from which the meta data should be read in.
2776 *
116676ca 2777 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
b411b363
PR
2778 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
2779 */
2780int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2781{
2782 struct meta_data_on_disk *buffer;
2783 int i, rv = NO_ERROR;
2784
2785 if (!get_ldev_if_state(mdev, D_ATTACHING))
2786 return ERR_IO_MD_DISK;
2787
b411b363
PR
2788 mutex_lock(&mdev->md_io_mutex);
2789 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2790
3fbf4d21 2791 if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 2792 /* NOTE: can't do normal error processing here as this is
b411b363
PR
2793 called BEFORE disk is attached */
2794 dev_err(DEV, "Error while reading metadata.\n");
2795 rv = ERR_IO_MD_DISK;
2796 goto err;
2797 }
2798
e7fad8af 2799 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
b411b363
PR
2800 dev_err(DEV, "Error while reading metadata, magic not found.\n");
2801 rv = ERR_MD_INVALID;
2802 goto err;
2803 }
2804 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
2805 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
2806 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
2807 rv = ERR_MD_INVALID;
2808 goto err;
2809 }
2810 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
2811 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
2812 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
2813 rv = ERR_MD_INVALID;
2814 goto err;
2815 }
2816 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
2817 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
2818 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
2819 rv = ERR_MD_INVALID;
2820 goto err;
2821 }
2822
2823 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
2824 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
2825 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
2826 rv = ERR_MD_INVALID;
2827 goto err;
2828 }
2829
2830 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
2831 for (i = UI_CURRENT; i < UI_SIZE; i++)
2832 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
2833 bdev->md.flags = be32_to_cpu(buffer->flags);
f399002e 2834 bdev->dc.al_extents = be32_to_cpu(buffer->al_nr_extents);
b411b363
PR
2835 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
2836
87eeee41 2837 spin_lock_irq(&mdev->tconn->req_lock);
99432fcc
PR
2838 if (mdev->state.conn < C_CONNECTED) {
2839 int peer;
2840 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
2841 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
2842 mdev->peer_max_bio_size = peer;
2843 }
87eeee41 2844 spin_unlock_irq(&mdev->tconn->req_lock);
99432fcc 2845
f399002e
LE
2846 if (bdev->dc.al_extents < 7)
2847 bdev->dc.al_extents = 127;
b411b363
PR
2848
2849 err:
2850 mutex_unlock(&mdev->md_io_mutex);
2851 put_ldev(mdev);
2852
2853 return rv;
2854}
2855
2856/**
2857 * drbd_md_mark_dirty() - Mark meta data super block as dirty
2858 * @mdev: DRBD device.
2859 *
2860 * Call this function if you change anything that should be written to
2861 * the meta-data super block. This function sets MD_DIRTY, and starts a
2862 * timer that ensures drbd_md_sync() gets called within five seconds.
2863 */
ca0e6098 2864#ifdef DEBUG
ee15b038
LE
2865void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
2866{
2867 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
2868 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
2869 mdev->last_md_mark_dirty.line = line;
2870 mdev->last_md_mark_dirty.func = func;
2871 }
2872}
2873#else
b411b363
PR
2874void drbd_md_mark_dirty(struct drbd_conf *mdev)
2875{
ee15b038 2876 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 2877 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 2878}
ee15b038 2879#endif
b411b363
PR
2880
2881static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
2882{
2883 int i;
2884
62b0da3a 2885 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 2886 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
2887}
2888
2889void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
2890{
2891 if (idx == UI_CURRENT) {
2892 if (mdev->state.role == R_PRIMARY)
2893 val |= 1;
2894 else
2895 val &= ~((u64)1);
2896
2897 drbd_set_ed_uuid(mdev, val);
2898 }
2899
2900 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
2901 drbd_md_mark_dirty(mdev);
2902}
2903
2904
2905void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
2906{
2907 if (mdev->ldev->md.uuid[idx]) {
2908 drbd_uuid_move_history(mdev);
2909 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363
PR
2910 }
2911 _drbd_uuid_set(mdev, idx, val);
2912}
2913
2914/**
2915 * drbd_uuid_new_current() - Creates a new current UUID
2916 * @mdev: DRBD device.
2917 *
2918 * Creates a new current UUID, and rotates the old current UUID into
2919 * the bitmap slot. Causes an incremental resync upon next connect.
2920 */
2921void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
2922{
2923 u64 val;
62b0da3a
LE
2924 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
2925
2926 if (bm_uuid)
2927 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 2928
b411b363 2929 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
b411b363
PR
2930
2931 get_random_bytes(&val, sizeof(u64));
2932 _drbd_uuid_set(mdev, UI_CURRENT, val);
62b0da3a 2933 drbd_print_uuids(mdev, "new current UUID");
aaa8e2b3
LE
2934 /* get it to stable storage _now_ */
2935 drbd_md_sync(mdev);
b411b363
PR
2936}
2937
2938void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
2939{
2940 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
2941 return;
2942
2943 if (val == 0) {
2944 drbd_uuid_move_history(mdev);
2945 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
2946 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 2947 } else {
62b0da3a
LE
2948 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
2949 if (bm_uuid)
2950 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 2951
62b0da3a 2952 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
b411b363
PR
2953 }
2954 drbd_md_mark_dirty(mdev);
2955}
2956
2957/**
2958 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
2959 * @mdev: DRBD device.
2960 *
2961 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
2962 */
2963int drbd_bmio_set_n_write(struct drbd_conf *mdev)
2964{
2965 int rv = -EIO;
2966
2967 if (get_ldev_if_state(mdev, D_ATTACHING)) {
2968 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
2969 drbd_md_sync(mdev);
2970 drbd_bm_set_all(mdev);
2971
2972 rv = drbd_bm_write(mdev);
2973
2974 if (!rv) {
2975 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2976 drbd_md_sync(mdev);
2977 }
2978
2979 put_ldev(mdev);
2980 }
2981
2982 return rv;
2983}
2984
2985/**
2986 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
2987 * @mdev: DRBD device.
2988 *
2989 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
2990 */
2991int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
2992{
2993 int rv = -EIO;
2994
0778286a 2995 drbd_resume_al(mdev);
b411b363
PR
2996 if (get_ldev_if_state(mdev, D_ATTACHING)) {
2997 drbd_bm_clear_all(mdev);
2998 rv = drbd_bm_write(mdev);
2999 put_ldev(mdev);
3000 }
3001
3002 return rv;
3003}
3004
99920dc5 3005static int w_bitmap_io(struct drbd_work *w, int unused)
b411b363
PR
3006{
3007 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
00d56944 3008 struct drbd_conf *mdev = w->mdev;
02851e9f 3009 int rv = -EIO;
b411b363
PR
3010
3011 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3012
02851e9f 3013 if (get_ldev(mdev)) {
20ceb2b2 3014 drbd_bm_lock(mdev, work->why, work->flags);
02851e9f
LE
3015 rv = work->io_fn(mdev);
3016 drbd_bm_unlock(mdev);
3017 put_ldev(mdev);
3018 }
b411b363 3019
4738fa16 3020 clear_bit_unlock(BITMAP_IO, &mdev->flags);
b411b363
PR
3021 wake_up(&mdev->misc_wait);
3022
3023 if (work->done)
3024 work->done(mdev, rv);
3025
3026 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3027 work->why = NULL;
20ceb2b2 3028 work->flags = 0;
b411b363 3029
99920dc5 3030 return 0;
b411b363
PR
3031}
3032
82f59cc6
LE
3033void drbd_ldev_destroy(struct drbd_conf *mdev)
3034{
3035 lc_destroy(mdev->resync);
3036 mdev->resync = NULL;
3037 lc_destroy(mdev->act_log);
3038 mdev->act_log = NULL;
3039 __no_warn(local,
3040 drbd_free_bc(mdev->ldev);
3041 mdev->ldev = NULL;);
3042
82f59cc6
LE
3043 clear_bit(GO_DISKLESS, &mdev->flags);
3044}
3045
99920dc5 3046static int w_go_diskless(struct drbd_work *w, int unused)
e9e6f3ec 3047{
00d56944
PR
3048 struct drbd_conf *mdev = w->mdev;
3049
e9e6f3ec 3050 D_ASSERT(mdev->state.disk == D_FAILED);
9d282875
LE
3051 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3052 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
82f59cc6
LE
3053 * the protected members anymore, though, so once put_ldev reaches zero
3054 * again, it will be safe to free them. */
e9e6f3ec 3055 drbd_force_state(mdev, NS(disk, D_DISKLESS));
99920dc5 3056 return 0;
e9e6f3ec
LE
3057}
3058
3059void drbd_go_diskless(struct drbd_conf *mdev)
3060{
3061 D_ASSERT(mdev->state.disk == D_FAILED);
3062 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
e42325a5 3063 drbd_queue_work(&mdev->tconn->data.work, &mdev->go_diskless);
e9e6f3ec
LE
3064}
3065
b411b363
PR
3066/**
3067 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3068 * @mdev: DRBD device.
3069 * @io_fn: IO callback to be called when bitmap IO is possible
3070 * @done: callback to be called after the bitmap IO was performed
3071 * @why: Descriptive text of the reason for doing the IO
3072 *
3073 * While IO on the bitmap happens we freeze application IO, thus ensuring
3074 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3075 * called from worker context. It MUST NOT be used while a previous such
3076 * work is still pending!
3077 */
3078void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3079 int (*io_fn)(struct drbd_conf *),
3080 void (*done)(struct drbd_conf *, int),
20ceb2b2 3081 char *why, enum bm_flag flags)
b411b363 3082{
e6b3ea83 3083 D_ASSERT(current == mdev->tconn->worker.task);
b411b363
PR
3084
3085 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3086 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3087 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3088 if (mdev->bm_io_work.why)
3089 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3090 why, mdev->bm_io_work.why);
3091
3092 mdev->bm_io_work.io_fn = io_fn;
3093 mdev->bm_io_work.done = done;
3094 mdev->bm_io_work.why = why;
20ceb2b2 3095 mdev->bm_io_work.flags = flags;
b411b363 3096
87eeee41 3097 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
3098 set_bit(BITMAP_IO, &mdev->flags);
3099 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 3100 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
e42325a5 3101 drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w);
b411b363 3102 }
87eeee41 3103 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
3104}
3105
3106/**
3107 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3108 * @mdev: DRBD device.
3109 * @io_fn: IO callback to be called when bitmap IO is possible
3110 * @why: Descriptive text of the reason for doing the IO
3111 *
3112 * Freezes application IO while the actual IO operation runs. This
3113 * function MAY NOT be called from worker context.
3114 */
20ceb2b2
LE
3115int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
3116 char *why, enum bm_flag flags)
b411b363
PR
3117{
3118 int rv;
3119
e6b3ea83 3120 D_ASSERT(current != mdev->tconn->worker.task);
b411b363 3121
20ceb2b2
LE
3122 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
3123 drbd_suspend_io(mdev);
b411b363 3124
20ceb2b2 3125 drbd_bm_lock(mdev, why, flags);
b411b363
PR
3126 rv = io_fn(mdev);
3127 drbd_bm_unlock(mdev);
3128
20ceb2b2
LE
3129 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
3130 drbd_resume_io(mdev);
b411b363
PR
3131
3132 return rv;
3133}
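/* Editor's illustrative sketch, not part of this file: how the two bitmap-IO
 * entry points above are typically used -- queued when already in worker
 * context, synchronous otherwise.  drbd_bmio_set_n_write() is the real io_fn
 * defined earlier in this file; the helper names, the "why" strings and the
 * BM_LOCKED_SET_ALLOWED flag choice are illustrative only. */
static void example_full_sync_from_worker(struct drbd_conf *mdev)
{
	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
			     "example: set all bits", BM_LOCKED_SET_ALLOWED);
}

static int example_full_sync_blocking(struct drbd_conf *mdev)
{
	return drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
			      "example: set all bits", BM_LOCKED_SET_ALLOWED);
}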
3134
3135void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3136{
3137 if ((mdev->ldev->md.flags & flag) != flag) {
3138 drbd_md_mark_dirty(mdev);
3139 mdev->ldev->md.flags |= flag;
3140 }
3141}
3142
3143void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3144{
3145 if ((mdev->ldev->md.flags & flag) != 0) {
3146 drbd_md_mark_dirty(mdev);
3147 mdev->ldev->md.flags &= ~flag;
3148 }
3149}
3150int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3151{
3152 return (bdev->md.flags & flag) != 0;
3153}
3154
3155static void md_sync_timer_fn(unsigned long data)
3156{
3157 struct drbd_conf *mdev = (struct drbd_conf *) data;
3158
e42325a5 3159 drbd_queue_work_front(&mdev->tconn->data.work, &mdev->md_sync_work);
b411b363
PR
3160}
3161
99920dc5 3162static int w_md_sync(struct drbd_work *w, int unused)
b411b363 3163{
00d56944
PR
3164 struct drbd_conf *mdev = w->mdev;
3165
b411b363 3166 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
ee15b038
LE
3167#ifdef DEBUG
3168 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3169 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3170#endif
b411b363 3171 drbd_md_sync(mdev);
99920dc5 3172 return 0;
b411b363
PR
3173}
3174
d8763023 3175const char *cmdname(enum drbd_packet cmd)
f2ad9063
AG
3176{
3177 /* THINK may need to become several global tables
3178 * when we want to support more than
3179 * one PRO_VERSION */
3180 static const char *cmdnames[] = {
3181 [P_DATA] = "Data",
3182 [P_DATA_REPLY] = "DataReply",
3183 [P_RS_DATA_REPLY] = "RSDataReply",
3184 [P_BARRIER] = "Barrier",
3185 [P_BITMAP] = "ReportBitMap",
3186 [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
3187 [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
3188 [P_UNPLUG_REMOTE] = "UnplugRemote",
3189 [P_DATA_REQUEST] = "DataRequest",
3190 [P_RS_DATA_REQUEST] = "RSDataRequest",
3191 [P_SYNC_PARAM] = "SyncParam",
3192 [P_SYNC_PARAM89] = "SyncParam89",
3193 [P_PROTOCOL] = "ReportProtocol",
3194 [P_UUIDS] = "ReportUUIDs",
3195 [P_SIZES] = "ReportSizes",
3196 [P_STATE] = "ReportState",
3197 [P_SYNC_UUID] = "ReportSyncUUID",
3198 [P_AUTH_CHALLENGE] = "AuthChallenge",
3199 [P_AUTH_RESPONSE] = "AuthResponse",
3200 [P_PING] = "Ping",
3201 [P_PING_ACK] = "PingAck",
3202 [P_RECV_ACK] = "RecvAck",
3203 [P_WRITE_ACK] = "WriteAck",
3204 [P_RS_WRITE_ACK] = "RSWriteAck",
7be8da07 3205 [P_DISCARD_WRITE] = "DiscardWrite",
f2ad9063
AG
3206 [P_NEG_ACK] = "NegAck",
3207 [P_NEG_DREPLY] = "NegDReply",
3208 [P_NEG_RS_DREPLY] = "NegRSDReply",
3209 [P_BARRIER_ACK] = "BarrierAck",
3210 [P_STATE_CHG_REQ] = "StateChgRequest",
3211 [P_STATE_CHG_REPLY] = "StateChgReply",
3212 [P_OV_REQUEST] = "OVRequest",
3213 [P_OV_REPLY] = "OVReply",
3214 [P_OV_RESULT] = "OVResult",
3215 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
3216 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
3217 [P_COMPRESSED_BITMAP] = "CBitmap",
3218 [P_DELAY_PROBE] = "DelayProbe",
3219 [P_OUT_OF_SYNC] = "OutOfSync",
7be8da07 3220 [P_RETRY_WRITE] = "RetryWrite",
f2ad9063
AG
3221 };
3222
e5d6f33a
AG
3223 if (cmd == P_INITIAL_META)
3224 return "InitialMeta";
3225 if (cmd == P_INITIAL_DATA)
3226 return "InitialData";
6038178e
AG
3227 if (cmd == P_CONNECTION_FEATURES)
3228 return "ConnectionFeatures";
6e849ce8 3229 if (cmd >= ARRAY_SIZE(cmdnames))
f2ad9063
AG
3230 return "Unknown";
3231 return cmdnames[cmd];
3232}
3233
7be8da07
AG
3234/**
3235 * drbd_wait_misc - wait for a request to make progress
3236 * @mdev: device associated with the request
3237 * @i: the struct drbd_interval embedded in struct drbd_request or
3238 * struct drbd_peer_request
3239 */
3240int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i)
3241{
3242 struct net_conf *net_conf = mdev->tconn->net_conf;
3243 DEFINE_WAIT(wait);
3244 long timeout;
3245
3246 if (!net_conf)
3247 return -ETIMEDOUT;
3248 timeout = MAX_SCHEDULE_TIMEOUT;
3249 if (net_conf->ko_count)
3250 timeout = net_conf->timeout * HZ / 10 * net_conf->ko_count;
3251
3252 /* Indicate to wake up mdev->misc_wait on progress. */
3253 i->waiting = true;
3254 prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
3255 spin_unlock_irq(&mdev->tconn->req_lock);
3256 timeout = schedule_timeout(timeout);
3257 finish_wait(&mdev->misc_wait, &wait);
3258 spin_lock_irq(&mdev->tconn->req_lock);
3259 if (!timeout || mdev->state.conn < C_CONNECTED)
3260 return -ETIMEDOUT;
3261 if (signal_pending(current))
3262 return -ERESTARTSYS;
3263 return 0;
3264}
3265
b411b363
PR
3266#ifdef CONFIG_DRBD_FAULT_INJECTION
3267/* Fault insertion support including random number generator shamelessly
3268 * stolen from kernel/rcutorture.c */
3269struct fault_random_state {
3270 unsigned long state;
3271 unsigned long count;
3272};
3273
3274#define FAULT_RANDOM_MULT 39916801 /* prime */
3275#define FAULT_RANDOM_ADD 479001701 /* prime */
3276#define FAULT_RANDOM_REFRESH 10000
3277
3278/*
3279 * Crude but fast random-number generator. Uses a linear congruential
3280 * generator, with occasional help from get_random_bytes().
3281 */
3282static unsigned long
3283_drbd_fault_random(struct fault_random_state *rsp)
3284{
3285 long refresh;
3286
49829ea7 3287 if (!rsp->count--) {
b411b363
PR
3288 get_random_bytes(&refresh, sizeof(refresh));
3289 rsp->state += refresh;
3290 rsp->count = FAULT_RANDOM_REFRESH;
3291 }
3292 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3293 return swahw32(rsp->state);
3294}
3295
3296static char *
3297_drbd_fault_str(unsigned int type) {
3298 static char *_faults[] = {
3299 [DRBD_FAULT_MD_WR] = "Meta-data write",
3300 [DRBD_FAULT_MD_RD] = "Meta-data read",
3301 [DRBD_FAULT_RS_WR] = "Resync write",
3302 [DRBD_FAULT_RS_RD] = "Resync read",
3303 [DRBD_FAULT_DT_WR] = "Data write",
3304 [DRBD_FAULT_DT_RD] = "Data read",
3305 [DRBD_FAULT_DT_RA] = "Data read ahead",
3306 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
3307 [DRBD_FAULT_AL_EE] = "EE allocation",
3308 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
3309 };
3310
3311 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3312}
3313
3314unsigned int
3315_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3316{
3317 static struct fault_random_state rrs = {0, 0};
3318
3319 unsigned int ret = (
3320 (fault_devs == 0 ||
3321 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3322 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3323
3324 if (ret) {
3325 fault_count++;
3326
7383506c 3327 if (__ratelimit(&drbd_ratelimit_state))
b411b363
PR
3328 dev_warn(DEV, "***Simulating %s failure\n",
3329 _drbd_fault_str(type));
3330 }
3331
3332 return ret;
3333}
3334#endif
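/* Editor's illustrative sketch, not part of this file: callers in the IO
 * paths typically gate a real submission on the injector above through the
 * drbd_insert_fault() wrapper from drbd_int.h, roughly like this (the
 * surrounding helper and its arguments are hypothetical): */
static void example_submit_or_fail(struct drbd_conf *mdev, int rw,
				   struct bio *bio, unsigned int fault_type)
{
	if (drbd_insert_fault(mdev, fault_type))
		bio_endio(bio, -EIO);
	else
		submit_bio(rw, bio);
}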
3335
3336const char *drbd_buildtag(void)
3337{
3338 /* When DRBD is built from external sources, this holds a reference to the
3339 git hash of the source code. */
3340
3341 static char buildtag[38] = "\0uilt-in";
3342
3343 if (buildtag[0] == 0) {
3344#ifdef CONFIG_MODULES
3345 if (THIS_MODULE != NULL)
3346 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3347 else
3348#endif
3349 buildtag[0] = 'b';
3350 }
3351
3352 return buildtag;
3353}
3354
3355module_init(drbd_init)
3356module_exit(drbd_cleanup)
3357
b411b363
PR
3358EXPORT_SYMBOL(drbd_conn_str);
3359EXPORT_SYMBOL(drbd_role_str);
3360EXPORT_SYMBOL(drbd_disk_str);
3361EXPORT_SYMBOL(drbd_set_st_err_str);