/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING. If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_md_sync(struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
int disable_sendpage;
int allow_oos;
int proc_details;	/* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct idr minors;
struct list_head drbd_tconns;	/* list of struct drbd_tconn */
DECLARE_RWSEM(drbd_cfg_rwsem);

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* peer requests */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;
mempool_t *drbd_md_io_page_pool;
struct bio_set *drbd_md_io_bio_set;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t drbd_pp_lock;
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner   = THIS_MODULE,
	.open    = drbd_open,
	.release = drbd_release,
};

static void bio_destructor_drbd(struct bio *bio)
{
	bio_free(bio, drbd_md_io_bio_set);
}

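/*
 * Allocate a single-vec bio for DRBD meta data IO, using the module's
 * dedicated bio_set when it is available and falling back to the generic
 * bio_alloc() when it is not (yet) set up.
 */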
struct bio *bio_alloc_drbd(gfp_t gfp_mask)
{
	struct bio *bio;

	if (!drbd_md_io_bio_set)
		return bio_alloc(gfp_mask, 1);

	bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
	if (!bio)
		return NULL;
	bio->bi_destructor = bio_destructor_drbd;
	return bio;
}

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->tconn->newest_tle points to the head, mdev->tconn->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular double linked list of requests
 * attached.
 */
static int tl_init(struct drbd_tconn *tconn)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	tconn->oldest_tle = b;
	tconn->newest_tle = b;
	INIT_LIST_HEAD(&tconn->out_of_sequence_requests);

	return 1;
}

static void tl_cleanup(struct drbd_tconn *tconn)
{
	if (tconn->oldest_tle != tconn->newest_tle)
		conn_err(tconn, "ASSERT FAILED: oldest_tle == newest_tle\n");
	if (!list_empty(&tconn->out_of_sequence_requests))
		conn_err(tconn, "ASSERT FAILED: list_empty(out_of_sequence_requests)\n");
	kfree(tconn->oldest_tle);
	tconn->oldest_tle = NULL;
	kfree(tconn->unused_spare_tle);
	tconn->unused_spare_tle = NULL;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_tconn *tconn, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = tconn->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (tconn->newest_tle != new) {
		tconn->newest_tle->next = new;
		tconn->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_conf *mdev;
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&tconn->req_lock);

	b = tconn->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			 barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
			 barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			 barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, BARRIER_ACKED);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the lists head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(CONNECTION_LOST_WHILE_PENDING).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, BARRIER_ACKED) above.
	   */
	list_del_init(&b->requests);
	mdev = b->w.mdev;

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(tconn, b);
		if (nob)
			tconn->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore tconn->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		tconn->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&tconn->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&tconn->req_lock);
	conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
}


/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
 * RESTART_FROZEN_DISK_IO.
 */
void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = tconn->oldest_tle;
	pn = &tconn->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == RESEND) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(b->w.mdev);
					set_bit(CREATE_BARRIER, &b->w.mdev->flags);
				}

				drbd_queue_work(&tconn->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(b->w.mdev);

			if (b == tconn->newest_tle) {
				/* recycle, but reinit! */
				if (tmp != NULL)
					conn_err(tconn, "ASSERT FAILED tmp == NULL");
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	struct list_head *le, *tle;
	struct drbd_request *r;
	int vnr;

	spin_lock_irq(&tconn->req_lock);

	_tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);

	/* we expect this list to be empty. */
	if (!list_empty(&tconn->out_of_sequence_requests))
		conn_err(tconn, "ASSERT FAILED list_empty(&out_of_sequence_requests)\n");

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &tconn->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, CONNECTION_LOST_WHILE_PENDING);
	}

	/* ensure bit indicating barrier is required is clear */
	rcu_read_lock();
	idr_for_each_entry(&tconn->volumes, mdev, vnr)
		clear_bit(CREATE_BARRIER, &mdev->flags);
	rcu_read_unlock();

	spin_unlock_irq(&tconn->req_lock);
}

void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
{
	spin_lock_irq(&tconn->req_lock);
	_tl_restart(tconn, what);
	spin_unlock_irq(&tconn->req_lock);
}

static int drbd_thread_setup(void *arg)
{
	struct drbd_thread *thi = (struct drbd_thread *) arg;
	struct drbd_tconn *tconn = thi->tconn;
	unsigned long flags;
	int retval;

	snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
		 thi->name[0], thi->tconn->name);

restart:
	retval = thi->function(thi);

	spin_lock_irqsave(&thi->t_lock, flags);

	/* if the receiver has been "EXITING", the last thing it did
	 * was set the conn state to "StandAlone",
	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
	 * and receiver thread will be "started".
	 * drbd_thread_start needs to set "RESTARTING" in that case.
	 * t_state check and assignment needs to be within the same spinlock,
	 * so either thread_start sees EXITING, and can remap to RESTARTING,
	 * or thread_start sees NONE, and can proceed as normal.
	 */

	if (thi->t_state == RESTARTING) {
		conn_info(tconn, "Restarting %s thread\n", thi->name);
		thi->t_state = RUNNING;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		goto restart;
	}

	thi->task = NULL;
	thi->t_state = NONE;
	smp_mb();
	complete(&thi->stop);
	spin_unlock_irqrestore(&thi->t_lock, flags);

	conn_info(tconn, "Terminating %s\n", current->comm);

	/* Release mod reference taken when thread was started */

	kref_put(&tconn->kref, &conn_destroy);
	module_put(THIS_MODULE);
	return retval;
}

static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi,
			     int (*func) (struct drbd_thread *), char *name)
{
	spin_lock_init(&thi->t_lock);
	thi->task = NULL;
	thi->t_state = NONE;
	thi->function = func;
	thi->tconn = tconn;
	strncpy(thi->name, name, ARRAY_SIZE(thi->name));
}

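/*
 * Start (or restart) a DRBD thread. A thread in state NONE gets a fresh
 * kthread; a thread that is currently EXITING is flagged RESTARTING so that
 * drbd_thread_setup() loops instead of terminating. Returns true on success
 * (or if the thread is already running), false if the kthread could not be
 * created.
 */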
int drbd_thread_start(struct drbd_thread *thi)
{
	struct drbd_tconn *tconn = thi->tconn;
	struct task_struct *nt;
	unsigned long flags;

	/* is used from state engine doing drbd_thread_stop_nowait,
	 * while holding the req lock irqsave */
	spin_lock_irqsave(&thi->t_lock, flags);

	switch (thi->t_state) {
	case NONE:
		conn_info(tconn, "Starting %s thread (from %s [%d])\n",
			 thi->name, current->comm, current->pid);

		/* Get ref on module for thread - this is released when thread exits */
		if (!try_module_get(THIS_MODULE)) {
			conn_err(tconn, "Failed to get module reference in drbd_thread_start\n");
			spin_unlock_irqrestore(&thi->t_lock, flags);
			return false;
		}

		kref_get(&thi->tconn->kref);

		init_completion(&thi->stop);
		thi->reset_cpu_mask = 1;
		thi->t_state = RUNNING;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */

		nt = kthread_create(drbd_thread_setup, (void *) thi,
				    "drbd_%c_%s", thi->name[0], thi->tconn->name);

		if (IS_ERR(nt)) {
			conn_err(tconn, "Couldn't start thread\n");

			kref_put(&tconn->kref, &conn_destroy);
			module_put(THIS_MODULE);
			return false;
		}
		spin_lock_irqsave(&thi->t_lock, flags);
		thi->task = nt;
		thi->t_state = RUNNING;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		wake_up_process(nt);
		break;
	case EXITING:
		thi->t_state = RESTARTING;
		conn_info(tconn, "Restarting %s thread (from %s [%d])\n",
			 thi->name, current->comm, current->pid);
		/* fall through */
	case RUNNING:
	case RESTARTING:
	default:
		spin_unlock_irqrestore(&thi->t_lock, flags);
		break;
	}

	return true;
}


void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
{
	unsigned long flags;

	enum drbd_thread_state ns = restart ? RESTARTING : EXITING;

	/* may be called from state engine, holding the req lock irqsave */
	spin_lock_irqsave(&thi->t_lock, flags);

	if (thi->t_state == NONE) {
		spin_unlock_irqrestore(&thi->t_lock, flags);
		if (restart)
			drbd_thread_start(thi);
		return;
	}

	if (thi->t_state != ns) {
		if (thi->task == NULL) {
			spin_unlock_irqrestore(&thi->t_lock, flags);
			return;
		}

		thi->t_state = ns;
		smp_mb();
		init_completion(&thi->stop);
		if (thi->task != current)
			force_sig(DRBD_SIGKILL, thi->task);
	}

	spin_unlock_irqrestore(&thi->t_lock, flags);

	if (wait)
		wait_for_completion(&thi->stop);
}

static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task)
{
	struct drbd_thread *thi =
		task == tconn->receiver.task ? &tconn->receiver :
		task == tconn->asender.task  ? &tconn->asender :
		task == tconn->worker.task   ? &tconn->worker : NULL;

	return thi;
}

char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task)
{
	struct drbd_thread *thi = drbd_task_to_thread(tconn, task);
	return thi ? thi->name : task->comm;
}

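/*
 * Return the device minor number of the first (lowest vnr) volume of this
 * connection, or -1 if the connection currently has no volumes.
 */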
int conn_lowest_minor(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	int vnr = 0, m;

	rcu_read_lock();
	mdev = idr_get_next(&tconn->volumes, &vnr);
	m = mdev ? mdev_to_minor(mdev) : -1;
	rcu_read_unlock();

	return m;
}

#ifdef CONFIG_SMP
/**
 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
 * @mdev:	DRBD device.
 *
 * Forces all threads of a device onto the same CPU. This is beneficial for
 * DRBD's performance. May be overwritten by user's configuration.
 */
void drbd_calc_cpu_mask(struct drbd_tconn *tconn)
{
	int ord, cpu;

	/* user override. */
	if (cpumask_weight(tconn->cpu_mask))
		return;

	ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask);
	for_each_online_cpu(cpu) {
		if (ord-- == 0) {
			cpumask_set_cpu(cpu, tconn->cpu_mask);
			return;
		}
	}
	/* should not be reached */
	cpumask_setall(tconn->cpu_mask);
}

/**
 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
 * @mdev:	DRBD device.
 * @thi:	drbd_thread object
 *
 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
 * prematurely.
 */
void drbd_thread_current_set_cpu(struct drbd_thread *thi)
{
	struct task_struct *p = current;

	if (!thi->reset_cpu_mask)
		return;
	thi->reset_cpu_mask = 0;
	set_cpus_allowed_ptr(p, thi->tconn->cpu_mask);
}
#endif

/**
 * drbd_header_size - size of a packet header
 *
 * The header size is a multiple of 8, so any payload following the header is
 * word aligned on 64-bit architectures. (The bitmap send and receive code
 * relies on this.)
 */
unsigned int drbd_header_size(struct drbd_tconn *tconn)
{
	if (tconn->agreed_pro_version >= 100) {
		BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
		return sizeof(struct p_header100);
	} else {
		BUILD_BUG_ON(sizeof(struct p_header80) !=
			     sizeof(struct p_header95));
		BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
		return sizeof(struct p_header80);
	}
}

static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
{
	h->magic   = cpu_to_be32(DRBD_MAGIC);
	h->command = cpu_to_be16(cmd);
	h->length  = cpu_to_be16(size);
	return sizeof(struct p_header80);
}

static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
{
	h->magic   = cpu_to_be16(DRBD_MAGIC_BIG);
	h->command = cpu_to_be16(cmd);
	h->length  = cpu_to_be32(size);
	return sizeof(struct p_header95);
}

static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
				      int size, int vnr)
{
	h->magic = cpu_to_be32(DRBD_MAGIC_100);
	h->volume = cpu_to_be16(vnr);
	h->command = cpu_to_be16(cmd);
	h->length = cpu_to_be32(size);
	h->pad = 0;
	return sizeof(struct p_header100);
}

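/*
 * Pick the on-wire header format for the agreed protocol version: 100 and
 * later always use p_header100 (which carries the volume number), versions
 * 95-99 use the big p_header95 only when the payload exceeds
 * DRBD_MAX_SIZE_H80_PACKET, and older peers always get p_header80.
 */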
static unsigned int prepare_header(struct drbd_tconn *tconn, int vnr,
				   void *buffer, enum drbd_packet cmd, int size)
{
	if (tconn->agreed_pro_version >= 100)
		return prepare_header100(buffer, cmd, size, vnr);
	else if (tconn->agreed_pro_version >= 95 &&
		 size > DRBD_MAX_SIZE_H80_PACKET)
		return prepare_header95(buffer, cmd, size);
	else
		return prepare_header80(buffer, cmd, size);
}

void *conn_prepare_command(struct drbd_tconn *tconn, struct drbd_socket *sock)
{
	mutex_lock(&sock->mutex);
	if (!sock->socket) {
		mutex_unlock(&sock->mutex);
		return NULL;
	}
	return sock->sbuf + drbd_header_size(tconn);
}

void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock)
{
	return conn_prepare_command(mdev->tconn, sock);
}

static int __send_command(struct drbd_tconn *tconn, int vnr,
			  struct drbd_socket *sock, enum drbd_packet cmd,
			  unsigned int header_size, void *data,
			  unsigned int size)
{
	int msg_flags;
	int err;

	/*
	 * Called with @data == NULL and the size of the data blocks in @size
	 * for commands that send data blocks. For those commands, omit the
	 * MSG_MORE flag: this will increase the likelihood that data blocks
	 * which are page aligned on the sender will end up page aligned on the
	 * receiver.
	 */
	msg_flags = data ? MSG_MORE : 0;

	header_size += prepare_header(tconn, vnr, sock->sbuf, cmd,
				      header_size + size);
	err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size,
			    msg_flags);
	if (data && !err)
		err = drbd_send_all(tconn, sock->socket, data, size, 0);
	return err;
}

int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock,
		      enum drbd_packet cmd, unsigned int header_size,
		      void *data, unsigned int size)
{
	int err;

	err = __send_command(tconn, 0, sock, cmd, header_size, data, size);
	mutex_unlock(&sock->mutex);
	return err;
}

int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock,
		      enum drbd_packet cmd, unsigned int header_size,
		      void *data, unsigned int size)
{
	int err;

	err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size,
			     data, size);
	mutex_unlock(&sock->mutex);
	return err;
}

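/*
 * Usage pattern of the prepare/send pair above (illustrative sketch only,
 * not a call site in this file): *_prepare_command() takes sock->mutex and
 * returns a pointer just behind the packet header inside the pre-allocated
 * send buffer; the matching *_send_command() fills in the header, transmits
 * header and payload, and releases the mutex again.
 *
 *	struct p_barrier *p = conn_prepare_command(tconn, &tconn->data);
 *	if (!p)
 *		return -EIO;
 *	p->barrier = barrier_nr;
 *	return conn_send_command(tconn, &tconn->data, P_BARRIER, sizeof(*p), NULL, 0);
 */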
int drbd_send_ping(struct drbd_tconn *tconn)
{
	struct drbd_socket *sock;

	sock = &tconn->meta;
	if (!conn_prepare_command(tconn, sock))
		return -EIO;
	return conn_send_command(tconn, sock, P_PING, 0, NULL, 0);
}

int drbd_send_ping_ack(struct drbd_tconn *tconn)
{
	struct drbd_socket *sock;

	sock = &tconn->meta;
	if (!conn_prepare_command(tconn, sock))
		return -EIO;
	return conn_send_command(tconn, sock, P_PING_ACK, 0, NULL, 0);
}

int drbd_send_sync_param(struct drbd_conf *mdev)
{
	struct drbd_socket *sock;
	struct p_rs_param_95 *p;
	int size;
	const int apv = mdev->tconn->agreed_pro_version;
	enum drbd_packet cmd;
	struct net_conf *nc;

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return -EIO;

	rcu_read_lock();
	nc = rcu_dereference(mdev->tconn->net_conf);

	size = apv <= 87 ? sizeof(struct p_rs_param)
		: apv == 88 ? sizeof(struct p_rs_param)
			+ strlen(nc->verify_alg) + 1
		: apv <= 94 ? sizeof(struct p_rs_param_89)
		: /* apv >= 95 */ sizeof(struct p_rs_param_95);

	cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;

	/* initialize verify_alg and csums_alg */
	memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);

	if (get_ldev(mdev)) {
		p->rate = cpu_to_be32(mdev->ldev->dc.resync_rate);
		p->c_plan_ahead = cpu_to_be32(mdev->ldev->dc.c_plan_ahead);
		p->c_delay_target = cpu_to_be32(mdev->ldev->dc.c_delay_target);
		p->c_fill_target = cpu_to_be32(mdev->ldev->dc.c_fill_target);
		p->c_max_rate = cpu_to_be32(mdev->ldev->dc.c_max_rate);
		put_ldev(mdev);
	} else {
		p->rate = cpu_to_be32(DRBD_RATE_DEF);
		p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
		p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
		p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
		p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
	}

	if (apv >= 88)
		strcpy(p->verify_alg, nc->verify_alg);
	if (apv >= 89)
		strcpy(p->csums_alg, nc->csums_alg);
	rcu_read_unlock();

	return drbd_send_command(mdev, sock, cmd, size, NULL, 0);
}

int drbd_send_protocol(struct drbd_tconn *tconn)
{
	struct drbd_socket *sock;
	struct p_protocol *p;
	struct net_conf *nc;
	int size, cf;

	sock = &tconn->data;
	p = conn_prepare_command(tconn, sock);
	if (!p)
		return -EIO;

	rcu_read_lock();
	nc = rcu_dereference(tconn->net_conf);

	if (nc->dry_run && tconn->agreed_pro_version < 92) {
		rcu_read_unlock();
		mutex_unlock(&sock->mutex);
		conn_err(tconn, "--dry-run is not supported by peer");
		return -EOPNOTSUPP;
	}

	size = sizeof(*p);
	if (tconn->agreed_pro_version >= 87)
		size += strlen(nc->integrity_alg) + 1;

	p->protocol      = cpu_to_be32(nc->wire_protocol);
	p->after_sb_0p   = cpu_to_be32(nc->after_sb_0p);
	p->after_sb_1p   = cpu_to_be32(nc->after_sb_1p);
	p->after_sb_2p   = cpu_to_be32(nc->after_sb_2p);
	p->two_primaries = cpu_to_be32(nc->two_primaries);
	cf = 0;
	if (nc->want_lose)
		cf |= CF_WANT_LOSE;
	if (nc->dry_run)
		cf |= CF_DRY_RUN;
	p->conn_flags    = cpu_to_be32(cf);

	if (tconn->agreed_pro_version >= 87)
		strcpy(p->integrity_alg, nc->integrity_alg);
	rcu_read_unlock();

	return conn_send_command(tconn, sock, P_PROTOCOL, size, NULL, 0);
}

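/*
 * Send our UUID set to the peer. The extra UI_FLAGS word encodes, besides
 * the bits passed in by the caller: bit 0 - net_conf->want_lose is set,
 * bit 1 - we have been Primary and crashed, bit 2 - the pending disk state
 * is D_INCONSISTENT.
 */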
int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
{
	struct drbd_socket *sock;
	struct p_uuids *p;
	int i;

	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
		return 0;

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (!p) {
		put_ldev(mdev);
		return -EIO;
	}
	for (i = UI_CURRENT; i < UI_SIZE; i++)
		p->uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;

	mdev->comm_bm_set = drbd_bm_total_weight(mdev);
	p->uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
	rcu_read_lock();
	uuid_flags |= rcu_dereference(mdev->tconn->net_conf)->want_lose ? 1 : 0;
	rcu_read_unlock();
	uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
	uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
	p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);

	put_ldev(mdev);
	return drbd_send_command(mdev, sock, P_UUIDS, sizeof(*p), NULL, 0);
}

int drbd_send_uuids(struct drbd_conf *mdev)
{
	return _drbd_send_uuids(mdev, 0);
}

int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
{
	return _drbd_send_uuids(mdev, 8);
}

void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
{
	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
		u64 *uuid = mdev->ldev->md.uuid;
		dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
			 text,
			 (unsigned long long)uuid[UI_CURRENT],
			 (unsigned long long)uuid[UI_BITMAP],
			 (unsigned long long)uuid[UI_HISTORY_START],
			 (unsigned long long)uuid[UI_HISTORY_END]);
		put_ldev(mdev);
	} else {
		dev_info(DEV, "%s effective data uuid: %016llX\n",
			 text,
			 (unsigned long long)mdev->ed_uuid);
	}
}

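/*
 * Generate a fresh bitmap UUID (current bitmap UUID plus UUID_NEW_BM_OFFSET),
 * write it to the meta data and announce it to the peer as the new sync UUID.
 */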
void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
{
	struct drbd_socket *sock;
	struct p_rs_uuid *p;
	u64 uuid;

	D_ASSERT(mdev->state.disk == D_UP_TO_DATE);

	uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
	drbd_uuid_set(mdev, UI_BITMAP, uuid);
	drbd_print_uuids(mdev, "updated sync UUID");
	drbd_md_sync(mdev);

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (p) {
		p->uuid = cpu_to_be64(uuid);
		drbd_send_command(mdev, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
	}
}

int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
{
	struct drbd_socket *sock;
	struct p_sizes *p;
	sector_t d_size, u_size;
	int q_order_type, max_bio_size;

	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
		D_ASSERT(mdev->ldev->backing_bdev);
		d_size = drbd_get_max_capacity(mdev->ldev);
		u_size = mdev->ldev->dc.disk_size;
		q_order_type = drbd_queue_order_type(mdev);
		max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
		max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
		put_ldev(mdev);
	} else {
		d_size = 0;
		u_size = 0;
		q_order_type = QUEUE_ORDERED_NONE;
		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
	}

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return -EIO;
	p->d_size = cpu_to_be64(d_size);
	p->u_size = cpu_to_be64(u_size);
	p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
	p->max_bio_size = cpu_to_be32(max_bio_size);
	p->queue_order_type = cpu_to_be16(q_order_type);
	p->dds_flags = cpu_to_be16(flags);
	return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0);
}

/**
 * drbd_send_state() - Sends the drbd state to the peer
 * @mdev:	DRBD device.
 */
int drbd_send_state(struct drbd_conf *mdev)
{
	struct drbd_socket *sock;
	struct p_state *p;

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return -EIO;
	p->state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
	return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0);
}

int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val)
{
	struct drbd_socket *sock;
	struct p_req_state *p;

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return -EIO;
	p->mask = cpu_to_be32(mask.i);
	p->val = cpu_to_be32(val.i);
	return drbd_send_command(mdev, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);

}

int conn_send_state_req(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val)
{
	enum drbd_packet cmd;
	struct drbd_socket *sock;
	struct p_req_state *p;

	cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
	sock = &tconn->data;
	p = conn_prepare_command(tconn, sock);
	if (!p)
		return -EIO;
	p->mask = cpu_to_be32(mask.i);
	p->val = cpu_to_be32(val.i);
	return conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0);
}

void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
{
	struct drbd_socket *sock;
	struct p_req_state_reply *p;

	sock = &mdev->tconn->meta;
	p = drbd_prepare_command(mdev, sock);
	if (p) {
		p->retcode = cpu_to_be32(retcode);
		drbd_send_command(mdev, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
	}
}

void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode)
{
	struct drbd_socket *sock;
	struct p_req_state_reply *p;
	enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;

	sock = &tconn->meta;
	p = conn_prepare_command(tconn, sock);
	if (p) {
		p->retcode = cpu_to_be32(retcode);
		conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0);
	}
}

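/* The p_compressed_bm payload starts with one "encoding" byte: the low
 * nibble holds the bitmap encoding (enum drbd_bitmap_code), bit 7 says
 * whether the first run of the RLE stream describes set bits, and bits
 * 4-6 hold the number of unused pad bits in the last code byte. */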
static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
{
	BUG_ON(code & ~0xf);
	p->encoding = (p->encoding & ~0xf) | code;
}

static void dcbp_set_start(struct p_compressed_bm *p, int set)
{
	p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
}

static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
{
	BUG_ON(n & ~0x7);
	p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
}

int fill_bitmap_rle_bits(struct drbd_conf *mdev,
			 struct p_compressed_bm *p,
			 unsigned int size,
			 struct bm_xfer_ctx *c)
{
	struct bitstream bs;
	unsigned long plain_bits;
	unsigned long tmp;
	unsigned long rl;
	unsigned len;
	unsigned toggle;
	int bits, use_rle;

	/* may we use this feature? */
	rcu_read_lock();
	use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle;
	rcu_read_unlock();
	if (!use_rle || mdev->tconn->agreed_pro_version < 90)
		return 0;

	if (c->bit_offset >= c->bm_bits)
		return 0; /* nothing to do. */

	/* use at most thus many bytes */
	bitstream_init(&bs, p->code, size, 0);
	memset(p->code, 0, size);
	/* plain bits covered in this code string */
	plain_bits = 0;

	/* p->encoding & 0x80 stores whether the first run length is set.
	 * bit offset is implicit.
	 * start with toggle == 2 to be able to tell the first iteration */
	toggle = 2;

	/* see how much plain bits we can stuff into one packet
	 * using RLE and VLI. */
	do {
		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
				    : _drbd_bm_find_next(mdev, c->bit_offset);
		if (tmp == -1UL)
			tmp = c->bm_bits;
		rl = tmp - c->bit_offset;

		if (toggle == 2) { /* first iteration */
			if (rl == 0) {
				/* the first checked bit was set,
				 * store start value, */
				dcbp_set_start(p, 1);
				/* but skip encoding of zero run length */
				toggle = !toggle;
				continue;
			}
			dcbp_set_start(p, 0);
		}

		/* paranoia: catch zero runlength.
		 * can only happen if bitmap is modified while we scan it. */
		if (rl == 0) {
			dev_err(DEV, "unexpected zero runlength while encoding bitmap "
			    "t:%u bo:%lu\n", toggle, c->bit_offset);
			return -1;
		}

		bits = vli_encode_bits(&bs, rl);
		if (bits == -ENOBUFS) /* buffer full */
			break;
		if (bits <= 0) {
			dev_err(DEV, "error while encoding bitmap: %d\n", bits);
			return 0;
		}

		toggle = !toggle;
		plain_bits += rl;
		c->bit_offset = tmp;
	} while (c->bit_offset < c->bm_bits);

	len = bs.cur.b - p->code + !!bs.cur.bit;

	if (plain_bits < (len << 3)) {
		/* incompressible with this method.
		 * we need to rewind both word and bit position. */
		c->bit_offset -= plain_bits;
		bm_xfer_ctx_bit_to_word_offset(c);
		c->bit_offset = c->word_offset * BITS_PER_LONG;
		return 0;
	}

	/* RLE + VLI was able to compress it just fine.
	 * update c->word_offset. */
	bm_xfer_ctx_bit_to_word_offset(c);

	/* store pad_bits */
	dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);

	return len;
}

/**
 * send_bitmap_rle_or_plain
 *
 * Return 0 when done, 1 when another iteration is needed, and a negative error
 * code upon failure.
 */
static int
send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c)
{
	struct drbd_socket *sock = &mdev->tconn->data;
	unsigned int header_size = drbd_header_size(mdev->tconn);
	struct p_compressed_bm *p = sock->sbuf + header_size;
	int len, err;

	len = fill_bitmap_rle_bits(mdev, p,
			DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
	if (len < 0)
		return -EIO;

	if (len) {
		dcbp_set_code(p, RLE_VLI_Bits);
		err = __send_command(mdev->tconn, mdev->vnr, sock,
				     P_COMPRESSED_BITMAP, sizeof(*p) + len,
				     NULL, 0);
		c->packets[0]++;
		c->bytes[0] += header_size + sizeof(*p) + len;

		if (c->bit_offset >= c->bm_bits)
			len = 0; /* DONE */
	} else {
		/* was not compressible.
		 * send a buffer full of plain text bits instead. */
		unsigned int data_size;
		unsigned long num_words;
		unsigned long *p = sock->sbuf + header_size;

		data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
		num_words = min_t(size_t, data_size / sizeof(*p),
				  c->bm_words - c->word_offset);
		len = num_words * sizeof(*p);
		if (len)
			drbd_bm_get_lel(mdev, c->word_offset, num_words, p);
		err = __send_command(mdev->tconn, mdev->vnr, sock, P_BITMAP, len, NULL, 0);
		c->word_offset += num_words;
		c->bit_offset = c->word_offset * BITS_PER_LONG;

		c->packets[1]++;
		c->bytes[1] += header_size + len;

		if (c->bit_offset > c->bm_bits)
			c->bit_offset = c->bm_bits;
	}
	if (!err) {
		if (len == 0) {
			INFO_bm_xfer_stats(mdev, "send", c);
			return 0;
		} else
			return 1;
	}
	return -EIO;
}

/* See the comment at receive_bitmap() */
static int _drbd_send_bitmap(struct drbd_conf *mdev)
{
	struct bm_xfer_ctx c;
	int err;

	if (!expect(mdev->bitmap))
		return false;

	if (get_ldev(mdev)) {
		if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
			dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
			drbd_bm_set_all(mdev);
			if (drbd_bm_write(mdev)) {
				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
				 * but otherwise process as per normal - need to tell other
				 * side that a full resync is required! */
				dev_err(DEV, "Failed to write bitmap to disk!\n");
			} else {
				drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
				drbd_md_sync(mdev);
			}
		}
		put_ldev(mdev);
	}

	c = (struct bm_xfer_ctx) {
		.bm_bits = drbd_bm_bits(mdev),
		.bm_words = drbd_bm_words(mdev),
	};

	do {
		err = send_bitmap_rle_or_plain(mdev, &c);
	} while (err > 0);

	return err == 0;
}

int drbd_send_bitmap(struct drbd_conf *mdev)
{
	struct drbd_socket *sock = &mdev->tconn->data;
	int err = -1;

	mutex_lock(&sock->mutex);
	if (sock->socket)
		err = !_drbd_send_bitmap(mdev);
	mutex_unlock(&sock->mutex);
	return err;
}

void drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
{
	struct drbd_socket *sock;
	struct p_barrier_ack *p;

	if (mdev->state.conn < C_CONNECTED)
		return;

	sock = &mdev->tconn->meta;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return;
	p->barrier = barrier_nr;
	p->set_size = cpu_to_be32(set_size);
	drbd_send_command(mdev, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
}

/**
 * _drbd_send_ack() - Sends an ack packet
 * @mdev:	DRBD device.
 * @cmd:	Packet command code.
 * @sector:	sector, needs to be in big endian byte order
 * @blksize:	size in byte, needs to be in big endian byte order
 * @block_id:	Id, big endian byte order
 */
static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
			  u64 sector, u32 blksize, u64 block_id)
{
	struct drbd_socket *sock;
	struct p_block_ack *p;

	if (mdev->state.conn < C_CONNECTED)
		return -EIO;

	sock = &mdev->tconn->meta;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return -EIO;
	p->sector = sector;
	p->block_id = block_id;
	p->blksize = blksize;
	p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));
	return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0);
}

/* dp->sector and dp->block_id already/still in network byte order,
 * data_size is payload size according to dp->head,
 * and may need to be corrected for digest size. */
void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
		      struct p_data *dp, int data_size)
{
	data_size -= (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->peer_integrity_tfm) ?
		     crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm) : 0;
	_drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
		       dp->block_id);
}

void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
		      struct p_block_req *rp)
{
	_drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
}

/**
 * drbd_send_ack() - Sends an ack packet
 * @mdev:	DRBD device
 * @cmd:	packet command code
 * @peer_req:	peer request
 */
int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
		  struct drbd_peer_request *peer_req)
{
	return _drbd_send_ack(mdev, cmd,
			      cpu_to_be64(peer_req->i.sector),
			      cpu_to_be32(peer_req->i.size),
			      peer_req->block_id);
}

/* This function misuses the block_id field to signal if the blocks
 * are in sync or not. */
int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
		     sector_t sector, int blksize, u64 block_id)
{
	return _drbd_send_ack(mdev, cmd,
			      cpu_to_be64(sector),
			      cpu_to_be32(blksize),
			      cpu_to_be64(block_id));
}

int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
		       sector_t sector, int size, u64 block_id)
{
	struct drbd_socket *sock;
	struct p_block_req *p;

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return -EIO;
	p->sector = cpu_to_be64(sector);
	p->block_id = block_id;
	p->blksize = cpu_to_be32(size);
	return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0);
}

int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
			    void *digest, int digest_size, enum drbd_packet cmd)
{
	struct drbd_socket *sock;
	struct p_block_req *p;

	/* FIXME: Put the digest into the preallocated socket buffer. */

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return -EIO;
	p->sector = cpu_to_be64(sector);
	p->block_id = ID_SYNCER /* unused */;
	p->blksize = cpu_to_be32(size);
	return drbd_send_command(mdev, sock, cmd, sizeof(*p),
				 digest, digest_size);
}

int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
{
	struct drbd_socket *sock;
	struct p_block_req *p;

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return -EIO;
	p->sector = cpu_to_be64(sector);
	p->block_id = ID_SYNCER /* unused */;
	p->blksize = cpu_to_be32(size);
	return drbd_send_command(mdev, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
}

/* called on sndtimeo
 * returns false if we should retry,
 * true if we think connection is dead
 */
static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock)
{
	int drop_it;
	/* long elapsed = (long)(jiffies - mdev->last_received); */

	drop_it =   tconn->meta.socket == sock
		|| !tconn->asender.task
		|| get_t_state(&tconn->asender) != RUNNING
		|| tconn->cstate < C_WF_REPORT_PARAMS;

	if (drop_it)
		return true;

	drop_it = !--tconn->ko_count;
	if (!drop_it) {
		conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
			 current->comm, current->pid, tconn->ko_count);
		request_ping(tconn);
	}

	return drop_it; /* && (mdev->state == R_PRIMARY) */;
}

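/* Flag the data socket as congested once the TCP send queue has grown
 * beyond roughly 80% of the configured send buffer size. */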
static void drbd_update_congested(struct drbd_tconn *tconn)
{
	struct sock *sk = tconn->data.socket->sk;
	if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
		set_bit(NET_CONGESTED, &tconn->flags);
}

/* The idea of sendpage seems to be to put some kind of reference
 * to the page into the skb, and to hand it over to the NIC. In
 * this process get_page() gets called.
 *
 * As soon as the page was really sent over the network put_page()
 * gets called by some part of the network layer. [ NIC driver? ]
 *
 * [ get_page() / put_page() increment/decrement the count. If count
 *   reaches 0 the page will be freed. ]
 *
 * This works nicely with pages from FSs.
 * But this means that in protocol A we might signal IO completion too early!
 *
 * In order not to corrupt data during a resync we must make sure
 * that we do not reuse our own buffer pages (EEs) too early, therefore
1545 * we have the net_ee list.
1546 *
1547 * XFS seems to have problems, still, it submits pages with page_count == 0!
1548 * As a workaround, we disable sendpage on pages
1549 * with page_count == 0 or PageSlab.
1550 */
1551static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
b987427b 1552 int offset, size_t size, unsigned msg_flags)
b411b363 1553{
b987427b
AG
1554 struct socket *socket;
1555 void *addr;
1556 int err;
1557
1558 socket = mdev->tconn->data.socket;
1559 addr = kmap(page) + offset;
1560 err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags);
b411b363 1561 kunmap(page);
b987427b
AG
1562 if (!err)
1563 mdev->send_cnt += size >> 9;
1564 return err;
b411b363
PR
1565}
1566
1567static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 1568 int offset, size_t size, unsigned msg_flags)
b411b363 1569{
88b390ff 1570 struct socket *socket = mdev->tconn->data.socket;
b411b363 1571 mm_segment_t oldfs = get_fs();
b411b363 1572 int len = size;
88b390ff 1573 int err = -EIO;
b411b363
PR
1574
1575 /* e.g. XFS meta- & log-data is in slab pages, which have a
1576 * page_count of 0 and/or have PageSlab() set.
1577 * we cannot use send_page for those, as that does get_page();
1578 * put_page(); and would cause either a VM_BUG directly, or
1579 * __page_cache_release a page that would actually still be referenced
1580 * by someone, leading to some obscure delayed Oops somewhere else. */
1581 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
88b390ff 1582 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
b411b363 1583
ba11ad9a 1584 msg_flags |= MSG_NOSIGNAL;
1a7ba646 1585 drbd_update_congested(mdev->tconn);
b411b363
PR
1586 set_fs(KERNEL_DS);
1587 do {
88b390ff
AG
1588 int sent;
1589
1590 sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
b411b363 1591 if (sent <= 0) {
88b390ff
AG
1592 if (sent == -EAGAIN) {
1593 if (we_should_drop_the_connection(mdev->tconn, socket))
1594 break;
1595 continue;
1596 }
b411b363
PR
1597 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
1598 __func__, (int)size, len, sent);
88b390ff
AG
1599 if (sent < 0)
1600 err = sent;
b411b363
PR
1601 break;
1602 }
1603 len -= sent;
1604 offset += sent;
1605 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
1606 set_fs(oldfs);
01a311a5 1607 clear_bit(NET_CONGESTED, &mdev->tconn->flags);
b411b363 1608
88b390ff
AG
1609 if (len == 0) {
1610 err = 0;
1611 mdev->send_cnt += size >> 9;
1612 }
1613 return err;
b411b363
PR
1614}
1615
1616static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
1617{
1618 struct bio_vec *bvec;
1619 int i;
ba11ad9a 1620 /* hint all but last page with MSG_MORE */
b411b363 1621 __bio_for_each_segment(bvec, bio, i, 0) {
7fae55da
AG
1622 int err;
1623
1624 err = _drbd_no_send_page(mdev, bvec->bv_page,
1625 bvec->bv_offset, bvec->bv_len,
1626 i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
1627 if (err)
1628 return err;
b411b363 1629 }
7fae55da 1630 return 0;
b411b363
PR
1631}
1632
1633static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
1634{
1635 struct bio_vec *bvec;
1636 int i;
ba11ad9a 1637 /* hint all but last page with MSG_MORE */
b411b363 1638 __bio_for_each_segment(bvec, bio, i, 0) {
7fae55da
AG
1639 int err;
1640
1641 err = _drbd_send_page(mdev, bvec->bv_page,
1642 bvec->bv_offset, bvec->bv_len,
1643 i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
1644 if (err)
1645 return err;
b411b363 1646 }
7fae55da 1647 return 0;
b411b363
PR
1648}
1649
db830c46
AG
1650static int _drbd_send_zc_ee(struct drbd_conf *mdev,
1651 struct drbd_peer_request *peer_req)
45bb912b 1652{
db830c46
AG
1653 struct page *page = peer_req->pages;
1654 unsigned len = peer_req->i.size;
9f69230c 1655 int err;
db830c46 1656
ba11ad9a 1657 /* hint all but last page with MSG_MORE */
45bb912b
LE
1658 page_chain_for_each(page) {
1659 unsigned l = min_t(unsigned, len, PAGE_SIZE);
9f69230c
AG
1660
1661 err = _drbd_send_page(mdev, page, 0, l,
1662 page_chain_next(page) ? MSG_MORE : 0);
1663 if (err)
1664 return err;
45bb912b
LE
1665 len -= l;
1666 }
9f69230c 1667 return 0;
45bb912b
LE
1668}
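/*
 * The three helpers above split the send path: _drbd_send_bio() pushes every
 * bio segment through the copying path (_drbd_no_send_page), while
 * _drbd_send_zc_bio() and _drbd_send_zc_ee() use the zero-copy sendpage path
 * (_drbd_send_page) for bio segments and peer-request page chains
 * respectively. drbd_send_dblock() below picks the copying variant whenever
 * the caller's pages must not be referenced after submission, i.e. when no
 * receive or write ack is expected (protocol A) or a data-integrity digest
 * is in use.
 */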
1669
76d2e7ec
PR
1670static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
1671{
31890f4a 1672 if (mdev->tconn->agreed_pro_version >= 95)
76d2e7ec 1673 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
76d2e7ec
PR
1674 (bi_rw & REQ_FUA ? DP_FUA : 0) |
1675 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
1676 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
1677 else
721a9602 1678 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
76d2e7ec
PR
1679}
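/*
 * bio_flags_to_wire() above translates the local bio's request flags into
 * the DP_* flags carried in the data packet. For example, a bio submitted
 * with REQ_FLUSH | REQ_FUA is sent as DP_FLUSH | DP_FUA to a peer that
 * agreed on protocol version 95 or newer; towards older peers only the
 * DP_RW_SYNC hint survives the translation.
 */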
1680
b411b363
PR
1681/* Used to send write requests
1682 * R_PRIMARY -> Peer (P_DATA)
1683 */
1684int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
1685{
9f5bdc33
AG
1686 struct drbd_socket *sock;
1687 struct p_data *p;
b411b363 1688 unsigned int dp_flags = 0;
b411b363 1689 int dgs;
9f5bdc33 1690 int err;
b411b363 1691
8d412fc6
AG
1692 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_tfm) ?
1693 crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0;
b411b363 1694
9f5bdc33
AG
1695 sock = &mdev->tconn->data;
1696 p = drbd_prepare_command(mdev, sock);
1697 if (!p)
1698 return -EIO;
1699 p->sector = cpu_to_be64(req->i.sector);
1700 p->block_id = (unsigned long)req;
1701 p->seq_num = cpu_to_be32(req->seq_num = atomic_inc_return(&mdev->packet_seq));
76d2e7ec 1702 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
b411b363
PR
1703 if (mdev->state.conn >= C_SYNC_SOURCE &&
1704 mdev->state.conn <= C_PAUSED_SYNC_T)
1705 dp_flags |= DP_MAY_SET_IN_SYNC;
303d1448
PR
1706 if (mdev->tconn->agreed_pro_version >= 100) {
1707 if (req->rq_state & RQ_EXP_RECEIVE_ACK)
1708 dp_flags |= DP_SEND_RECEIVE_ACK;
1709 if (req->rq_state & RQ_EXP_WRITE_ACK)
1710 dp_flags |= DP_SEND_WRITE_ACK;
1711 }
9f5bdc33
AG
1712 p->dp_flags = cpu_to_be32(dp_flags);
1713 if (dgs)
8d412fc6 1714 drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, p + 1);
9f5bdc33 1715 err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size);
6bdb9b0e 1716 if (!err) {
470be44a
LE
1717 /* For protocol A, we have to memcpy the payload into
1718 * socket buffers, as the request may complete as soon as
1719 * we have handed it over to tcp, at which point the data
1720 * pages may become invalid.
1721 *
1722 * With data integrity enabled, we copy it as well, so we can be
1723 * sure that even if the bio pages are still being modified, this
1724 * won't change the data on the wire; thus if the digest checks
1725 * out ok after sending on this side, but does not match on the
1726 * receiving side, we have certainly detected corruption elsewhere.
1727 */
303d1448 1728 if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs)
6bdb9b0e 1729 err = _drbd_send_bio(mdev, req->master_bio);
b411b363 1730 else
6bdb9b0e 1731 err = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
1732
1733 /* double check digest, sometimes buffers have been modified in flight. */
1734 if (dgs > 0 && dgs <= 64) {
24c4830c 1735 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
1736 * currently supported in kernel crypto. */
1737 unsigned char digest[64];
8d412fc6 1738 drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, digest);
9f5bdc33 1739 if (memcmp(p + 1, digest, dgs)) {
470be44a
LE
1740 dev_warn(DEV,
1741 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
ace652ac 1742 (unsigned long long)req->i.sector, req->i.size);
470be44a
LE
1743 }
1744 } /* else if (dgs > 64) {
1745 ... Be noisy about digest too large ...
1746 } */
b411b363 1747 }
9f5bdc33 1748 mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
bd26bfc5 1749
6bdb9b0e 1750 return err;
b411b363
PR
1751}
1752
1753/* answer packet, used to send data back for read requests:
1754 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
1755 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
1756 */
d8763023 1757int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
db830c46 1758 struct drbd_peer_request *peer_req)
b411b363 1759{
9f5bdc33
AG
1760 struct drbd_socket *sock;
1761 struct p_data *p;
7b57b89d 1762 int err;
b411b363
PR
1763 int dgs;
1764
8d412fc6
AG
1765 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_tfm) ?
1766 crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0;
b411b363 1767
9f5bdc33
AG
1768 sock = &mdev->tconn->data;
1769 p = drbd_prepare_command(mdev, sock);
1770 if (!p)
1771 return -EIO;
1772 p->sector = cpu_to_be64(peer_req->i.sector);
1773 p->block_id = peer_req->block_id;
1774 p->seq_num = 0; /* unused */
1775 if (dgs)
8d412fc6 1776 drbd_csum_ee(mdev, mdev->tconn->integrity_tfm, peer_req, p + 1);
9f5bdc33 1777 err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size);
7b57b89d
AG
1778 if (!err)
1779 err = _drbd_send_zc_ee(mdev, peer_req);
9f5bdc33 1780 mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
bd26bfc5 1781
7b57b89d 1782 return err;
b411b363
PR
1783}
1784
8f7bed77 1785int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req)
73a01a18 1786{
9f5bdc33
AG
1787 struct drbd_socket *sock;
1788 struct p_block_desc *p;
73a01a18 1789
9f5bdc33
AG
1790 sock = &mdev->tconn->data;
1791 p = drbd_prepare_command(mdev, sock);
1792 if (!p)
1793 return -EIO;
1794 p->sector = cpu_to_be64(req->i.sector);
1795 p->blksize = cpu_to_be32(req->i.size);
1796 return drbd_send_command(mdev, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
73a01a18
PR
1797}
1798
b411b363
PR
1799/*
1800 drbd_send distinguishes two cases:
1801
1802 Packets sent via the data socket "sock"
1803 and packets sent via the meta data socket "msock"
1804
1805                    sock                      msock
1806 -----------------+-------------------------+------------------------------
1807 timeout           conf.timeout / 2          conf.timeout / 2
1808 timeout action    send a ping via msock     Abort communication
1809                                             and close all sockets
1810*/
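/*
 * In other words: bulk data packets travel over "sock" while acks and pings
 * travel over "msock". When a send on the data socket times out, a ping is
 * first sent over the meta socket, presumably to tell a merely congested
 * link apart from a dead peer; only a timeout on the meta socket tears down
 * the connection immediately.
 */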
1811
1812/*
1813 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
1814 */
bedbd2a5 1815int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
b411b363
PR
1816 void *buf, size_t size, unsigned msg_flags)
1817{
1818 struct kvec iov;
1819 struct msghdr msg;
1820 int rv, sent = 0;
1821
1822 if (!sock)
c0d42c8e 1823 return -EBADR;
b411b363
PR
1824
1825 /* THINK if (signal_pending) return ... ? */
1826
1827 iov.iov_base = buf;
1828 iov.iov_len = size;
1829
1830 msg.msg_name = NULL;
1831 msg.msg_namelen = 0;
1832 msg.msg_control = NULL;
1833 msg.msg_controllen = 0;
1834 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
1835
bedbd2a5 1836 if (sock == tconn->data.socket) {
44ed167d
PR
1837 rcu_read_lock();
1838 tconn->ko_count = rcu_dereference(tconn->net_conf)->ko_count;
1839 rcu_read_unlock();
bedbd2a5 1840 drbd_update_congested(tconn);
b411b363
PR
1841 }
1842 do {
1843 /* STRANGE
1844 * tcp_sendmsg does _not_ use its size parameter at all ?
1845 *
1846 * -EAGAIN on timeout, -EINTR on signal.
1847 */
1848/* THINK
1849 * do we need to block DRBD_SIG if sock == &meta.socket ??
1850 * otherwise wake_asender() might interrupt some send_*Ack !
1851 */
1852 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
1853 if (rv == -EAGAIN) {
bedbd2a5 1854 if (we_should_drop_the_connection(tconn, sock))
b411b363
PR
1855 break;
1856 else
1857 continue;
1858 }
b411b363
PR
1859 if (rv == -EINTR) {
1860 flush_signals(current);
1861 rv = 0;
1862 }
1863 if (rv < 0)
1864 break;
1865 sent += rv;
1866 iov.iov_base += rv;
1867 iov.iov_len -= rv;
1868 } while (sent < size);
1869
bedbd2a5
PR
1870 if (sock == tconn->data.socket)
1871 clear_bit(NET_CONGESTED, &tconn->flags);
b411b363
PR
1872
1873 if (rv <= 0) {
1874 if (rv != -EAGAIN) {
bedbd2a5
PR
1875 conn_err(tconn, "%s_sendmsg returned %d\n",
1876 sock == tconn->meta.socket ? "msock" : "sock",
1877 rv);
bbeb641c 1878 conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
b411b363 1879 } else
bbeb641c 1880 conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD);
b411b363
PR
1881 }
1882
1883 return sent;
1884}
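/*
 * drbd_send() also does the congestion bookkeeping for the data socket:
 * drbd_update_congested() is called before the send loop so that
 * NET_CONGESTED can be raised while the socket's send queue fills up, and
 * the bit is cleared again once the loop is done. The same flag is what
 * drbd_congested() further down reports back to the writeback code.
 */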
1885
fb708e40
AG
1886/**
1887 * drbd_send_all - Send an entire buffer
1888 *
1889 * Returns 0 upon success and a negative error value otherwise.
1890 */
1891int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer,
1892 size_t size, unsigned msg_flags)
1893{
1894 int err;
1895
1896 err = drbd_send(tconn, sock, buffer, size, msg_flags);
1897 if (err < 0)
1898 return err;
1899 if (err != size)
1900 return -EIO;
1901 return 0;
1902}
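/*
 * drbd_send_all() turns a short send into a hard error: callers such as
 * _drbd_no_send_page() above only need to check for a nonzero return value
 * instead of comparing byte counts.
 */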
1903
b411b363
PR
1904static int drbd_open(struct block_device *bdev, fmode_t mode)
1905{
1906 struct drbd_conf *mdev = bdev->bd_disk->private_data;
1907 unsigned long flags;
1908 int rv = 0;
1909
2a48fc0a 1910 mutex_lock(&drbd_main_mutex);
87eeee41 1911 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
b411b363
PR
1912 /* to have a stable mdev->state.role
1913 * and no race with updating open_cnt */
1914
1915 if (mdev->state.role != R_PRIMARY) {
1916 if (mode & FMODE_WRITE)
1917 rv = -EROFS;
1918 else if (!allow_oos)
1919 rv = -EMEDIUMTYPE;
1920 }
1921
1922 if (!rv)
1923 mdev->open_cnt++;
87eeee41 1924 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
2a48fc0a 1925 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1926
1927 return rv;
1928}
1929
1930static int drbd_release(struct gendisk *gd, fmode_t mode)
1931{
1932 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 1933 mutex_lock(&drbd_main_mutex);
b411b363 1934 mdev->open_cnt--;
2a48fc0a 1935 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1936 return 0;
1937}
1938
b411b363
PR
1939static void drbd_set_defaults(struct drbd_conf *mdev)
1940{
f399002e
LE
1941 /* Beware! The actual layout differs
1942 * between big endian and little endian */
da9fbc27 1943 mdev->state = (union drbd_dev_state) {
b411b363
PR
1944 { .role = R_SECONDARY,
1945 .peer = R_UNKNOWN,
1946 .conn = C_STANDALONE,
1947 .disk = D_DISKLESS,
1948 .pdsk = D_UNKNOWN,
b411b363
PR
1949 } };
1950}
1951
1952void drbd_init_set_defaults(struct drbd_conf *mdev)
1953{
1954 /* the memset(,0,) did most of this.
1955 * note: only assignments, no allocation in here */
1956
1957 drbd_set_defaults(mdev);
1958
b411b363
PR
1959 atomic_set(&mdev->ap_bio_cnt, 0);
1960 atomic_set(&mdev->ap_pending_cnt, 0);
1961 atomic_set(&mdev->rs_pending_cnt, 0);
1962 atomic_set(&mdev->unacked_cnt, 0);
1963 atomic_set(&mdev->local_cnt, 0);
435f0740 1964 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 1965 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 1966 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 1967 atomic_set(&mdev->ap_in_flight, 0);
b411b363
PR
1968
1969 mutex_init(&mdev->md_io_mutex);
8410da8f
PR
1970 mutex_init(&mdev->own_state_mutex);
1971 mdev->state_mutex = &mdev->own_state_mutex;
b411b363 1972
b411b363 1973 spin_lock_init(&mdev->al_lock);
b411b363
PR
1974 spin_lock_init(&mdev->peer_seq_lock);
1975 spin_lock_init(&mdev->epoch_lock);
1976
1977 INIT_LIST_HEAD(&mdev->active_ee);
1978 INIT_LIST_HEAD(&mdev->sync_ee);
1979 INIT_LIST_HEAD(&mdev->done_ee);
1980 INIT_LIST_HEAD(&mdev->read_ee);
1981 INIT_LIST_HEAD(&mdev->net_ee);
1982 INIT_LIST_HEAD(&mdev->resync_reads);
b411b363
PR
1983 INIT_LIST_HEAD(&mdev->resync_work.list);
1984 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 1985 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 1986 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 1987 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 1988 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 1989
794abb75 1990 mdev->resync_work.cb = w_resync_timer;
b411b363 1991 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 1992 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
1993 mdev->md_sync_work.cb = w_md_sync;
1994 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 1995 mdev->start_resync_work.cb = w_start_resync;
a21e9298
PR
1996
1997 mdev->resync_work.mdev = mdev;
1998 mdev->unplug_work.mdev = mdev;
1999 mdev->go_diskless.mdev = mdev;
2000 mdev->md_sync_work.mdev = mdev;
2001 mdev->bm_io_work.w.mdev = mdev;
2002 mdev->start_resync_work.mdev = mdev;
2003
b411b363
PR
2004 init_timer(&mdev->resync_timer);
2005 init_timer(&mdev->md_sync_timer);
370a43e7 2006 init_timer(&mdev->start_resync_timer);
7fde2be9 2007 init_timer(&mdev->request_timer);
b411b363
PR
2008 mdev->resync_timer.function = resync_timer_fn;
2009 mdev->resync_timer.data = (unsigned long) mdev;
2010 mdev->md_sync_timer.function = md_sync_timer_fn;
2011 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
2012 mdev->start_resync_timer.function = start_resync_timer_fn;
2013 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
2014 mdev->request_timer.function = request_timer_fn;
2015 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
2016
2017 init_waitqueue_head(&mdev->misc_wait);
2018 init_waitqueue_head(&mdev->state_wait);
2019 init_waitqueue_head(&mdev->ee_wait);
2020 init_waitqueue_head(&mdev->al_wait);
2021 init_waitqueue_head(&mdev->seq_wait);
2022
fd340c12 2023 /* mdev->tconn->agreed_pro_version gets initialized in drbd_connect() */
2451fc3b 2024 mdev->write_ordering = WO_bdev_flush;
b411b363 2025 mdev->resync_wenr = LC_FREE;
99432fcc
PR
2026 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
2027 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
2028}
2029
2030void drbd_mdev_cleanup(struct drbd_conf *mdev)
2031{
1d7734a0 2032 int i;
e6b3ea83 2033 if (mdev->tconn->receiver.t_state != NONE)
b411b363 2034 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
e6b3ea83 2035 mdev->tconn->receiver.t_state);
b411b363
PR
2036
2037 /* no need to lock it, I'm the only thread alive */
2038 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2039 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2040 mdev->al_writ_cnt =
2041 mdev->bm_writ_cnt =
2042 mdev->read_cnt =
2043 mdev->recv_cnt =
2044 mdev->send_cnt =
2045 mdev->writ_cnt =
2046 mdev->p_size =
2047 mdev->rs_start =
2048 mdev->rs_total =
1d7734a0
LE
2049 mdev->rs_failed = 0;
2050 mdev->rs_last_events = 0;
0f0601f4 2051 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
2052 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2053 mdev->rs_mark_left[i] = 0;
2054 mdev->rs_mark_time[i] = 0;
2055 }
89e58e75 2056 D_ASSERT(mdev->tconn->net_conf == NULL);
b411b363
PR
2057
2058 drbd_set_my_capacity(mdev, 0);
2059 if (mdev->bitmap) {
2060 /* maybe never allocated. */
02d9a94b 2061 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
2062 drbd_bm_cleanup(mdev);
2063 }
2064
1d041225
PR
2065 drbd_free_bc(mdev->ldev);
2066 mdev->ldev = NULL;
2067
0778286a 2068 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363 2069
b411b363
PR
2070 D_ASSERT(list_empty(&mdev->active_ee));
2071 D_ASSERT(list_empty(&mdev->sync_ee));
2072 D_ASSERT(list_empty(&mdev->done_ee));
2073 D_ASSERT(list_empty(&mdev->read_ee));
2074 D_ASSERT(list_empty(&mdev->net_ee));
2075 D_ASSERT(list_empty(&mdev->resync_reads));
e42325a5
PR
2076 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
2077 D_ASSERT(list_empty(&mdev->tconn->meta.work.q));
b411b363
PR
2078 D_ASSERT(list_empty(&mdev->resync_work.list));
2079 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 2080 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
2081
2082 drbd_set_defaults(mdev);
b411b363
PR
2083}
2084
2085
2086static void drbd_destroy_mempools(void)
2087{
2088 struct page *page;
2089
2090 while (drbd_pp_pool) {
2091 page = drbd_pp_pool;
2092 drbd_pp_pool = (struct page *)page_private(page);
2093 __free_page(page);
2094 drbd_pp_vacant--;
2095 }
2096
2097 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2098
da4a75d2
LE
2099 if (drbd_md_io_bio_set)
2100 bioset_free(drbd_md_io_bio_set);
35abf594
LE
2101 if (drbd_md_io_page_pool)
2102 mempool_destroy(drbd_md_io_page_pool);
b411b363
PR
2103 if (drbd_ee_mempool)
2104 mempool_destroy(drbd_ee_mempool);
2105 if (drbd_request_mempool)
2106 mempool_destroy(drbd_request_mempool);
2107 if (drbd_ee_cache)
2108 kmem_cache_destroy(drbd_ee_cache);
2109 if (drbd_request_cache)
2110 kmem_cache_destroy(drbd_request_cache);
2111 if (drbd_bm_ext_cache)
2112 kmem_cache_destroy(drbd_bm_ext_cache);
2113 if (drbd_al_ext_cache)
2114 kmem_cache_destroy(drbd_al_ext_cache);
2115
da4a75d2 2116 drbd_md_io_bio_set = NULL;
35abf594 2117 drbd_md_io_page_pool = NULL;
b411b363
PR
2118 drbd_ee_mempool = NULL;
2119 drbd_request_mempool = NULL;
2120 drbd_ee_cache = NULL;
2121 drbd_request_cache = NULL;
2122 drbd_bm_ext_cache = NULL;
2123 drbd_al_ext_cache = NULL;
2124
2125 return;
2126}
2127
2128static int drbd_create_mempools(void)
2129{
2130 struct page *page;
1816a2b4 2131 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
2132 int i;
2133
2134 /* prepare our caches and mempools */
2135 drbd_request_mempool = NULL;
2136 drbd_ee_cache = NULL;
2137 drbd_request_cache = NULL;
2138 drbd_bm_ext_cache = NULL;
2139 drbd_al_ext_cache = NULL;
2140 drbd_pp_pool = NULL;
35abf594 2141 drbd_md_io_page_pool = NULL;
da4a75d2 2142 drbd_md_io_bio_set = NULL;
b411b363
PR
2143
2144 /* caches */
2145 drbd_request_cache = kmem_cache_create(
2146 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2147 if (drbd_request_cache == NULL)
2148 goto Enomem;
2149
2150 drbd_ee_cache = kmem_cache_create(
f6ffca9f 2151 "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
b411b363
PR
2152 if (drbd_ee_cache == NULL)
2153 goto Enomem;
2154
2155 drbd_bm_ext_cache = kmem_cache_create(
2156 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2157 if (drbd_bm_ext_cache == NULL)
2158 goto Enomem;
2159
2160 drbd_al_ext_cache = kmem_cache_create(
2161 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2162 if (drbd_al_ext_cache == NULL)
2163 goto Enomem;
2164
2165 /* mempools */
da4a75d2
LE
2166 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
2167 if (drbd_md_io_bio_set == NULL)
2168 goto Enomem;
2169
35abf594
LE
2170 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
2171 if (drbd_md_io_page_pool == NULL)
2172 goto Enomem;
2173
b411b363
PR
2174 drbd_request_mempool = mempool_create(number,
2175 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2176 if (drbd_request_mempool == NULL)
2177 goto Enomem;
2178
2179 drbd_ee_mempool = mempool_create(number,
2180 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 2181 if (drbd_ee_mempool == NULL)
b411b363
PR
2182 goto Enomem;
2183
2184 /* drbd's page pool */
2185 spin_lock_init(&drbd_pp_lock);
2186
2187 for (i = 0; i < number; i++) {
2188 page = alloc_page(GFP_HIGHUSER);
2189 if (!page)
2190 goto Enomem;
2191 set_page_private(page, (unsigned long)drbd_pp_pool);
2192 drbd_pp_pool = page;
2193 }
2194 drbd_pp_vacant = number;
2195
2196 return 0;
2197
2198Enomem:
2199 drbd_destroy_mempools(); /* in case we allocated some */
2200 return -ENOMEM;
2201}
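/*
 * Sizing note: "number" above is (DRBD_MAX_BIO_SIZE / PAGE_SIZE) * minor_count,
 * so the private page pool is pre-filled with enough pages for one
 * maximum-sized bio per configured minor; drbd_pp_vacant tracks how many of
 * those pages are currently unused.
 */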
2202
2203static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2204 void *unused)
2205{
2206 /* just so we have it. you never know what interesting things we
2207 * might want to do here some day...
2208 */
2209
2210 return NOTIFY_DONE;
2211}
2212
2213static struct notifier_block drbd_notifier = {
2214 .notifier_call = drbd_notify_sys,
2215};
2216
7721f567 2217static void drbd_release_all_peer_reqs(struct drbd_conf *mdev)
b411b363
PR
2218{
2219 int rr;
2220
7721f567 2221 rr = drbd_free_peer_reqs(mdev, &mdev->active_ee);
b411b363
PR
2222 if (rr)
2223 dev_err(DEV, "%d EEs in active list found!\n", rr);
2224
7721f567 2225 rr = drbd_free_peer_reqs(mdev, &mdev->sync_ee);
b411b363
PR
2226 if (rr)
2227 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2228
7721f567 2229 rr = drbd_free_peer_reqs(mdev, &mdev->read_ee);
b411b363
PR
2230 if (rr)
2231 dev_err(DEV, "%d EEs in read list found!\n", rr);
2232
7721f567 2233 rr = drbd_free_peer_reqs(mdev, &mdev->done_ee);
b411b363
PR
2234 if (rr)
2235 dev_err(DEV, "%d EEs in done list found!\n", rr);
2236
7721f567 2237 rr = drbd_free_peer_reqs(mdev, &mdev->net_ee);
b411b363
PR
2238 if (rr)
2239 dev_err(DEV, "%d EEs in net list found!\n", rr);
2240}
2241
774b3055 2242/* caution. no locking. */
ff370e5a 2243void drbd_delete_device(struct drbd_conf *mdev)
b411b363 2244{
9dc9fbb3
PR
2245 struct drbd_tconn *tconn = mdev->tconn;
2246
569083c0 2247 idr_remove(&mdev->tconn->volumes, mdev->vnr);
ff370e5a 2248 idr_remove(&minors, mdev_to_minor(mdev));
569083c0 2249 synchronize_rcu();
774b3055 2250
b411b363 2251 /* paranoia asserts */
70dc65e1 2252 D_ASSERT(mdev->open_cnt == 0);
e42325a5 2253 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
b411b363
PR
2254 /* end paranoia asserts */
2255
2256 del_gendisk(mdev->vdisk);
2257
2258 /* cleanup stuff that may have been allocated during
2259 * device (re-)configuration or state changes */
2260
2261 if (mdev->this_bdev)
2262 bdput(mdev->this_bdev);
2263
1d041225
PR
2264 drbd_free_bc(mdev->ldev);
2265 mdev->ldev = NULL;
b411b363 2266
7721f567 2267 drbd_release_all_peer_reqs(mdev);
b411b363 2268
b411b363
PR
2269 lc_destroy(mdev->act_log);
2270 lc_destroy(mdev->resync);
2271
2272 kfree(mdev->p_uuid);
2273 /* mdev->p_uuid = NULL; */
2274
cd1d9950
PR
2275 kfree(mdev->current_epoch);
2276 if (mdev->bitmap) /* should no longer be there. */
2277 drbd_bm_cleanup(mdev);
2278 __free_page(mdev->md_io_page);
2279 put_disk(mdev->vdisk);
2280 blk_cleanup_queue(mdev->rq_queue);
2281 kfree(mdev);
9dc9fbb3
PR
2282
2283 kref_put(&tconn->kref, &conn_destroy);
b411b363
PR
2284}
2285
2286static void drbd_cleanup(void)
2287{
2288 unsigned int i;
81a5d60e 2289 struct drbd_conf *mdev;
b411b363
PR
2290
2291 unregister_reboot_notifier(&drbd_notifier);
2292
17a93f30
LE
2293 /* first remove proc,
2294 * drbdsetup uses its presence to detect
2295 * whether DRBD is loaded.
2296 * If we got stuck in proc removal,
2297 * but had netlink already deregistered,
2298 * some drbdsetup commands could wait forever
2299 * for an answer.
2300 */
2301 if (drbd_proc)
2302 remove_proc_entry("drbd", NULL);
2303
3b98c0c2 2304 drbd_genl_unregister();
b411b363 2305
d3fcb490 2306 down_write(&drbd_cfg_rwsem);
81a5d60e 2307 idr_for_each_entry(&minors, mdev, i)
ff370e5a 2308 drbd_delete_device(mdev);
d3fcb490 2309 up_write(&drbd_cfg_rwsem);
ff370e5a 2310
81a5d60e 2311 drbd_destroy_mempools();
b411b363
PR
2312 unregister_blkdev(DRBD_MAJOR, "drbd");
2313
81a5d60e
PR
2314 idr_destroy(&minors);
2315
b411b363
PR
2316 printk(KERN_INFO "drbd: module cleanup done.\n");
2317}
2318
2319/**
2320 * drbd_congested() - Callback for pdflush
2321 * @congested_data: User data
2322 * @bdi_bits: Bits pdflush is currently interested in
2323 *
2324 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2325 */
2326static int drbd_congested(void *congested_data, int bdi_bits)
2327{
2328 struct drbd_conf *mdev = congested_data;
2329 struct request_queue *q;
2330 char reason = '-';
2331 int r = 0;
2332
1b881ef7 2333 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
2334 /* DRBD has frozen IO */
2335 r = bdi_bits;
2336 reason = 'd';
2337 goto out;
2338 }
2339
2340 if (get_ldev(mdev)) {
2341 q = bdev_get_queue(mdev->ldev->backing_bdev);
2342 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2343 put_ldev(mdev);
2344 if (r)
2345 reason = 'b';
2346 }
2347
01a311a5 2348 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) {
b411b363
PR
2349 r |= (1 << BDI_async_congested);
2350 reason = reason == 'b' ? 'a' : 'n';
2351 }
2352
2353out:
2354 mdev->congestion_reason = reason;
2355 return r;
2356}
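/*
 * mdev->congestion_reason records why congestion was reported: 'd' means
 * DRBD itself has frozen IO, 'b' means the local backing device is
 * congested, 'a' and 'n' indicate network congestion (with and without a
 * congested backing device, respectively), and '-' means not congested.
 */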
2357
6699b655
PR
2358static void drbd_init_workqueue(struct drbd_work_queue* wq)
2359{
2360 sema_init(&wq->s, 0);
2361 spin_lock_init(&wq->q_lock);
2362 INIT_LIST_HEAD(&wq->q);
2363}
2364
0ace9dfa 2365struct drbd_tconn *conn_get_by_name(const char *name)
1aba4d7f
PR
2366{
2367 struct drbd_tconn *tconn;
2368
3b98c0c2
LE
2369 if (!name || !name[0])
2370 return NULL;
2371
ef356262 2372 down_read(&drbd_cfg_rwsem);
1aba4d7f 2373 list_for_each_entry(tconn, &drbd_tconns, all_tconn) {
0ace9dfa
PR
2374 if (!strcmp(tconn->name, name)) {
2375 kref_get(&tconn->kref);
1aba4d7f 2376 goto found;
0ace9dfa 2377 }
1aba4d7f
PR
2378 }
2379 tconn = NULL;
2380found:
ef356262 2381 up_read(&drbd_cfg_rwsem);
1aba4d7f
PR
2382 return tconn;
2383}
2384
e6ef8a5c
AG
2385static int drbd_alloc_socket(struct drbd_socket *socket)
2386{
2387 socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
2388 if (!socket->rbuf)
2389 return -ENOMEM;
5a87d920
AG
2390 socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
2391 if (!socket->sbuf)
2392 return -ENOMEM;
e6ef8a5c
AG
2393 return 0;
2394}
2395
2396static void drbd_free_socket(struct drbd_socket *socket)
2397{
5a87d920 2398 free_page((unsigned long) socket->sbuf);
e6ef8a5c
AG
2399 free_page((unsigned long) socket->rbuf);
2400}
2401
91fd4dad
PR
2402void conn_free_crypto(struct drbd_tconn *tconn)
2403{
1d041225
PR
2404 drbd_free_sock(tconn);
2405
2406 crypto_free_hash(tconn->csums_tfm);
2407 crypto_free_hash(tconn->verify_tfm);
91fd4dad 2408 crypto_free_hash(tconn->cram_hmac_tfm);
8d412fc6 2409 crypto_free_hash(tconn->integrity_tfm);
5b614abe 2410 crypto_free_hash(tconn->peer_integrity_tfm);
91fd4dad
PR
2411 kfree(tconn->int_dig_in);
2412 kfree(tconn->int_dig_vv);
1d041225
PR
2413
2414 tconn->csums_tfm = NULL;
2415 tconn->verify_tfm = NULL;
91fd4dad 2416 tconn->cram_hmac_tfm = NULL;
8d412fc6 2417 tconn->integrity_tfm = NULL;
5b614abe 2418 tconn->peer_integrity_tfm = NULL;
91fd4dad
PR
2419 tconn->int_dig_in = NULL;
2420 tconn->int_dig_vv = NULL;
2421}
2422
9dc9fbb3 2423struct drbd_tconn *conn_create(const char *name)
2111438b
PR
2424{
2425 struct drbd_tconn *tconn;
2426
2427 tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
2428 if (!tconn)
2429 return NULL;
2430
2431 tconn->name = kstrdup(name, GFP_KERNEL);
2432 if (!tconn->name)
2433 goto fail;
2434
e6ef8a5c
AG
2435 if (drbd_alloc_socket(&tconn->data))
2436 goto fail;
2437 if (drbd_alloc_socket(&tconn->meta))
2438 goto fail;
2439
774b3055
PR
2440 if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL))
2441 goto fail;
2442
2f5cdd0b
PR
2443 if (!tl_init(tconn))
2444 goto fail;
2445
bbeb641c 2446 tconn->cstate = C_STANDALONE;
8410da8f 2447 mutex_init(&tconn->cstate_mutex);
6699b655 2448 spin_lock_init(&tconn->req_lock);
91fd4dad 2449 mutex_init(&tconn->net_conf_update);
2a67d8b9 2450 init_waitqueue_head(&tconn->ping_wait);
062e879c 2451 idr_init(&tconn->volumes);
b2fb6dbe 2452
6699b655
PR
2453 drbd_init_workqueue(&tconn->data.work);
2454 mutex_init(&tconn->data.mutex);
2455
2456 drbd_init_workqueue(&tconn->meta.work);
2457 mutex_init(&tconn->meta.mutex);
2458
392c8801
PR
2459 drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver");
2460 drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker");
2461 drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender");
2462
5979e361 2463 drbd_set_res_opts_default(&tconn->res_opts);
f399002e 2464
ef356262 2465 down_write(&drbd_cfg_rwsem);
9dc9fbb3 2466 kref_init(&tconn->kref);
543cc10b 2467 list_add_tail(&tconn->all_tconn, &drbd_tconns);
ef356262 2468 up_write(&drbd_cfg_rwsem);
2111438b
PR
2469
2470 return tconn;
2471
2472fail:
2f5cdd0b 2473 tl_cleanup(tconn);
774b3055 2474 free_cpumask_var(tconn->cpu_mask);
e6ef8a5c
AG
2475 drbd_free_socket(&tconn->meta);
2476 drbd_free_socket(&tconn->data);
2111438b
PR
2477 kfree(tconn->name);
2478 kfree(tconn);
2479
2480 return NULL;
2481}
2482
9dc9fbb3 2483void conn_destroy(struct kref *kref)
2111438b 2484{
9dc9fbb3
PR
2485 struct drbd_tconn *tconn = container_of(kref, struct drbd_tconn, kref);
2486
062e879c 2487 idr_destroy(&tconn->volumes);
2111438b 2488
774b3055 2489 free_cpumask_var(tconn->cpu_mask);
e6ef8a5c
AG
2490 drbd_free_socket(&tconn->meta);
2491 drbd_free_socket(&tconn->data);
2111438b 2492 kfree(tconn->name);
b42a70ad
PR
2493 kfree(tconn->int_dig_in);
2494 kfree(tconn->int_dig_vv);
2111438b
PR
2495 kfree(tconn);
2496}
2497
774b3055 2498enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
b411b363
PR
2499{
2500 struct drbd_conf *mdev;
2501 struct gendisk *disk;
2502 struct request_queue *q;
774b3055 2503 int vnr_got = vnr;
81a5d60e 2504 int minor_got = minor;
8432b314 2505 enum drbd_ret_code err = ERR_NOMEM;
774b3055
PR
2506
2507 mdev = minor_to_mdev(minor);
2508 if (mdev)
2509 return ERR_MINOR_EXISTS;
b411b363
PR
2510
2511 /* GFP_KERNEL, we are outside of all write-out paths */
2512 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2513 if (!mdev)
774b3055
PR
2514 return ERR_NOMEM;
2515
9dc9fbb3 2516 kref_get(&tconn->kref);
774b3055 2517 mdev->tconn = tconn;
9dc9fbb3 2518
b411b363 2519 mdev->minor = minor;
3b98c0c2 2520 mdev->vnr = vnr;
b411b363
PR
2521
2522 drbd_init_set_defaults(mdev);
2523
2524 q = blk_alloc_queue(GFP_KERNEL);
2525 if (!q)
2526 goto out_no_q;
2527 mdev->rq_queue = q;
2528 q->queuedata = mdev;
b411b363
PR
2529
2530 disk = alloc_disk(1);
2531 if (!disk)
2532 goto out_no_disk;
2533 mdev->vdisk = disk;
2534
81e84650 2535 set_disk_ro(disk, true);
b411b363
PR
2536
2537 disk->queue = q;
2538 disk->major = DRBD_MAJOR;
2539 disk->first_minor = minor;
2540 disk->fops = &drbd_ops;
2541 sprintf(disk->disk_name, "drbd%d", minor);
2542 disk->private_data = mdev;
2543
2544 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
2545 /* we have no partitions. we contain only ourselves. */
2546 mdev->this_bdev->bd_contains = mdev->this_bdev;
2547
2548 q->backing_dev_info.congested_fn = drbd_congested;
2549 q->backing_dev_info.congested_data = mdev;
2550
2f58dcfc 2551 blk_queue_make_request(q, drbd_make_request);
99432fcc
PR
2552 /* Setting max_hw_sectors to an odd value of 8 KiB here;
2553 this triggers a max_bio_size message upon first attach or connect */
2554 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
b411b363
PR
2555 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
2556 blk_queue_merge_bvec(q, drbd_merge_bvec);
87eeee41 2557 q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
b411b363
PR
2558
2559 mdev->md_io_page = alloc_page(GFP_KERNEL);
2560 if (!mdev->md_io_page)
2561 goto out_no_io_page;
2562
2563 if (drbd_bm_init(mdev))
2564 goto out_no_bitmap;
dac1389c 2565 mdev->read_requests = RB_ROOT;
de696716 2566 mdev->write_requests = RB_ROOT;
b411b363 2567
b411b363
PR
2568 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
2569 if (!mdev->current_epoch)
2570 goto out_no_epoch;
2571
2572 INIT_LIST_HEAD(&mdev->current_epoch->list);
2573 mdev->epochs = 1;
2574
81a5d60e 2575 if (!idr_pre_get(&minors, GFP_KERNEL))
8432b314
LE
2576 goto out_no_minor_idr;
2577 if (idr_get_new_above(&minors, mdev, minor, &minor_got))
2578 goto out_no_minor_idr;
81a5d60e 2579 if (minor_got != minor) {
8432b314
LE
2580 err = ERR_MINOR_EXISTS;
2581 drbd_msg_put_info("requested minor exists already");
569083c0 2582 goto out_idr_remove_minor;
81a5d60e 2583 }
8432b314
LE
2584
2585 if (!idr_pre_get(&tconn->volumes, GFP_KERNEL))
2586 goto out_idr_remove_minor;
2587 if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got))
2588 goto out_idr_remove_minor;
2589 if (vnr_got != vnr) {
2590 err = ERR_INVALID_REQUEST;
2591 drbd_msg_put_info("requested volume exists already");
2592 goto out_idr_remove_vol;
2593 }
774b3055
PR
2594 add_disk(disk);
2595
2325eb66
PR
2596 /* inherit the connection state */
2597 mdev->state.conn = tconn->cstate;
2598 if (mdev->state.conn == C_WF_REPORT_PARAMS)
2599 drbd_connected(vnr, mdev, tconn);
2600
774b3055 2601 return NO_ERROR;
b411b363 2602
569083c0
LE
2603out_idr_remove_vol:
2604 idr_remove(&tconn->volumes, vnr_got);
8432b314
LE
2605out_idr_remove_minor:
2606 idr_remove(&minors, minor_got);
569083c0 2607 synchronize_rcu();
8432b314 2608out_no_minor_idr:
81a5d60e 2609 kfree(mdev->current_epoch);
b411b363 2610out_no_epoch:
b411b363
PR
2611 drbd_bm_cleanup(mdev);
2612out_no_bitmap:
2613 __free_page(mdev->md_io_page);
2614out_no_io_page:
2615 put_disk(disk);
2616out_no_disk:
2617 blk_cleanup_queue(q);
2618out_no_q:
b411b363 2619 kfree(mdev);
9dc9fbb3 2620 kref_put(&tconn->kref, &conn_destroy);
8432b314 2621 return err;
b411b363
PR
2622}
2623
b411b363
PR
2624int __init drbd_init(void)
2625{
2626 int err;
2627
2b8a90b5 2628 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
b411b363 2629 printk(KERN_ERR
81a5d60e 2630 "drbd: invalid minor_count (%d)\n", minor_count);
b411b363
PR
2631#ifdef MODULE
2632 return -EINVAL;
2633#else
2634 minor_count = 8;
2635#endif
2636 }
2637
b411b363
PR
2638 err = register_blkdev(DRBD_MAJOR, "drbd");
2639 if (err) {
2640 printk(KERN_ERR
2641 "drbd: unable to register block device major %d\n",
2642 DRBD_MAJOR);
2643 return err;
2644 }
2645
3b98c0c2
LE
2646 err = drbd_genl_register();
2647 if (err) {
2648 printk(KERN_ERR "drbd: unable to register generic netlink family\n");
2649 goto fail;
2650 }
2651
2652
b411b363
PR
2653 register_reboot_notifier(&drbd_notifier);
2654
2655 /*
2656 * allocate all necessary structs
2657 */
2658 err = -ENOMEM;
2659
2660 init_waitqueue_head(&drbd_pp_wait);
2661
2662 drbd_proc = NULL; /* play safe for drbd_cleanup */
81a5d60e 2663 idr_init(&minors);
b411b363
PR
2664
2665 err = drbd_create_mempools();
2666 if (err)
3b98c0c2 2667 goto fail;
b411b363 2668
8c484ee4 2669 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
2670 if (!drbd_proc) {
2671 printk(KERN_ERR "drbd: unable to register proc file\n");
3b98c0c2 2672 goto fail;
b411b363
PR
2673 }
2674
2675 rwlock_init(&global_state_lock);
2111438b 2676 INIT_LIST_HEAD(&drbd_tconns);
b411b363
PR
2677
2678 printk(KERN_INFO "drbd: initialized. "
2679 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
2680 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
2681 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
2682 printk(KERN_INFO "drbd: registered as block device major %d\n",
2683 DRBD_MAJOR);
b411b363
PR
2684
2685 return 0; /* Success! */
2686
3b98c0c2 2687fail:
b411b363
PR
2688 drbd_cleanup();
2689 if (err == -ENOMEM)
2690 /* currently always the case */
2691 printk(KERN_ERR "drbd: ran out of memory\n");
2692 else
2693 printk(KERN_ERR "drbd: initialization failure\n");
2694 return err;
2695}
2696
2697void drbd_free_bc(struct drbd_backing_dev *ldev)
2698{
2699 if (ldev == NULL)
2700 return;
2701
e525fd89
TH
2702 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2703 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
b411b363
PR
2704
2705 kfree(ldev);
2706}
2707
360cc740
PR
2708void drbd_free_sock(struct drbd_tconn *tconn)
2709{
2710 if (tconn->data.socket) {
2711 mutex_lock(&tconn->data.mutex);
2712 kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR);
2713 sock_release(tconn->data.socket);
2714 tconn->data.socket = NULL;
2715 mutex_unlock(&tconn->data.mutex);
b411b363 2716 }
360cc740
PR
2717 if (tconn->meta.socket) {
2718 mutex_lock(&tconn->meta.mutex);
2719 kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR);
2720 sock_release(tconn->meta.socket);
2721 tconn->meta.socket = NULL;
2722 mutex_unlock(&tconn->meta.mutex);
b411b363
PR
2723 }
2724}
2725
b411b363
PR
2726/* meta data management */
2727
2728struct meta_data_on_disk {
2729 u64 la_size; /* last agreed size. */
2730 u64 uuid[UI_SIZE]; /* UUIDs. */
2731 u64 device_uuid;
2732 u64 reserved_u64_1;
2733 u32 flags; /* MDF */
2734 u32 magic;
2735 u32 md_size_sect;
2736 u32 al_offset; /* offset to this block */
2737 u32 al_nr_extents; /* important for restoring the AL */
f399002e 2738 /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
b411b363
PR
2739 u32 bm_offset; /* offset to the bitmap, from here */
2740 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
99432fcc
PR
2741 u32 la_peer_max_bio_size; /* last peer max_bio_size */
2742 u32 reserved_u32[3];
b411b363
PR
2743
2744} __packed;
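/*
 * The superblock above is stored in big-endian byte order (all fields are
 * converted with cpu_to_be32()/cpu_to_be64() in drbd_md_sync() below) at
 * md_offset of the meta-data area; drbd_md_read() performs the reverse
 * conversion and sanity-checks the offsets, sizes and magic.
 */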
2745
2746/**
2747 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
2748 * @mdev: DRBD device.
2749 */
2750void drbd_md_sync(struct drbd_conf *mdev)
2751{
2752 struct meta_data_on_disk *buffer;
2753 sector_t sector;
2754 int i;
2755
ee15b038
LE
2756 del_timer(&mdev->md_sync_timer);
2757 /* timer may be rearmed by drbd_md_mark_dirty() now. */
b411b363
PR
2758 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
2759 return;
b411b363
PR
2760
2761 /* We use here D_FAILED and not D_ATTACHING because we try to write
2762 * metadata even if we detach due to a disk failure! */
2763 if (!get_ldev_if_state(mdev, D_FAILED))
2764 return;
2765
b411b363
PR
2766 mutex_lock(&mdev->md_io_mutex);
2767 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2768 memset(buffer, 0, 512);
2769
2770 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
2771 for (i = UI_CURRENT; i < UI_SIZE; i++)
2772 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
2773 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
2774 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
2775
2776 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
2777 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
2778 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
2779 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
2780 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
2781
2782 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 2783 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
b411b363
PR
2784
2785 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
2786 sector = mdev->ldev->md.md_offset;
2787
3fbf4d21 2788 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
b411b363
PR
2789 /* this was a try anyways ... */
2790 dev_err(DEV, "meta data update failed!\n");
81e84650 2791 drbd_chk_io_error(mdev, 1, true);
b411b363
PR
2792 }
2793
2794 /* Update mdev->ldev->md.la_size_sect,
2795 * since we updated it on metadata. */
2796 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
2797
2798 mutex_unlock(&mdev->md_io_mutex);
2799 put_ldev(mdev);
2800}
2801
2802/**
2803 * drbd_md_read() - Reads in the meta data super block
2804 * @mdev: DRBD device.
2805 * @bdev: Device from which the meta data should be read in.
2806 *
116676ca 2807 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
b411b363
PR
2808 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
2809 */
2810int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2811{
2812 struct meta_data_on_disk *buffer;
2813 int i, rv = NO_ERROR;
2814
2815 if (!get_ldev_if_state(mdev, D_ATTACHING))
2816 return ERR_IO_MD_DISK;
2817
b411b363
PR
2818 mutex_lock(&mdev->md_io_mutex);
2819 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2820
3fbf4d21 2821 if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 2822 /* NOTE: can't do normal error processing here as this is
b411b363
PR
2823 called BEFORE disk is attached */
2824 dev_err(DEV, "Error while reading metadata.\n");
2825 rv = ERR_IO_MD_DISK;
2826 goto err;
2827 }
2828
e7fad8af 2829 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
b411b363
PR
2830 dev_err(DEV, "Error while reading metadata, magic not found.\n");
2831 rv = ERR_MD_INVALID;
2832 goto err;
2833 }
2834 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
2835 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
2836 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
2837 rv = ERR_MD_INVALID;
2838 goto err;
2839 }
2840 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
2841 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
2842 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
2843 rv = ERR_MD_INVALID;
2844 goto err;
2845 }
2846 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
2847 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
2848 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
2849 rv = ERR_MD_INVALID;
2850 goto err;
2851 }
2852
2853 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
2854 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
2855 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
2856 rv = ERR_MD_INVALID;
2857 goto err;
2858 }
2859
2860 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
2861 for (i = UI_CURRENT; i < UI_SIZE; i++)
2862 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
2863 bdev->md.flags = be32_to_cpu(buffer->flags);
f399002e 2864 bdev->dc.al_extents = be32_to_cpu(buffer->al_nr_extents);
b411b363
PR
2865 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
2866
87eeee41 2867 spin_lock_irq(&mdev->tconn->req_lock);
99432fcc
PR
2868 if (mdev->state.conn < C_CONNECTED) {
2869 int peer;
2870 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
2871 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
2872 mdev->peer_max_bio_size = peer;
2873 }
87eeee41 2874 spin_unlock_irq(&mdev->tconn->req_lock);
99432fcc 2875
f399002e
LE
2876 if (bdev->dc.al_extents < 7)
2877 bdev->dc.al_extents = 127;
b411b363
PR
2878
2879 err:
2880 mutex_unlock(&mdev->md_io_mutex);
2881 put_ldev(mdev);
2882
2883 return rv;
2884}
2885
2886/**
2887 * drbd_md_mark_dirty() - Mark meta data super block as dirty
2888 * @mdev: DRBD device.
2889 *
2890 * Call this function if you change anything that should be written to
2891 * the meta-data super block. This function sets MD_DIRTY, and starts a
2892 * timer that ensures drbd_md_sync() gets called within five seconds.
2893 */
ca0e6098 2894#ifdef DEBUG
ee15b038
LE
2895void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
2896{
2897 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
2898 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
2899 mdev->last_md_mark_dirty.line = line;
2900 mdev->last_md_mark_dirty.func = func;
2901 }
2902}
2903#else
b411b363
PR
2904void drbd_md_mark_dirty(struct drbd_conf *mdev)
2905{
ee15b038 2906 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 2907 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 2908}
ee15b038 2909#endif
b411b363
PR
2910
2911static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
2912{
2913 int i;
2914
62b0da3a 2915 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 2916 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
2917}
2918
2919void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
2920{
2921 if (idx == UI_CURRENT) {
2922 if (mdev->state.role == R_PRIMARY)
2923 val |= 1;
2924 else
2925 val &= ~((u64)1);
2926
2927 drbd_set_ed_uuid(mdev, val);
2928 }
2929
2930 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
2931 drbd_md_mark_dirty(mdev);
2932}
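/*
 * Note on _drbd_uuid_set(): for UI_CURRENT the lowest bit of the UUID value
 * encodes the local role at the time it was set (forced to 1 on a primary,
 * cleared on a secondary) before the value is also propagated as the
 * exposed-data UUID via drbd_set_ed_uuid().
 */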
2933
2934
2935void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
2936{
2937 if (mdev->ldev->md.uuid[idx]) {
2938 drbd_uuid_move_history(mdev);
2939 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363
PR
2940 }
2941 _drbd_uuid_set(mdev, idx, val);
2942}
2943
2944/**
2945 * drbd_uuid_new_current() - Creates a new current UUID
2946 * @mdev: DRBD device.
2947 *
2948 * Creates a new current UUID, and rotates the old current UUID into
2949 * the bitmap slot. Causes an incremental resync upon next connect.
2950 */
2951void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
2952{
2953 u64 val;
62b0da3a
LE
2954 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
2955
2956 if (bm_uuid)
2957 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 2958
b411b363 2959 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
b411b363
PR
2960
2961 get_random_bytes(&val, sizeof(u64));
2962 _drbd_uuid_set(mdev, UI_CURRENT, val);
62b0da3a 2963 drbd_print_uuids(mdev, "new current UUID");
aaa8e2b3
LE
2964 /* get it to stable storage _now_ */
2965 drbd_md_sync(mdev);
b411b363
PR
2966}
2967
2968void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
2969{
2970 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
2971 return;
2972
2973 if (val == 0) {
2974 drbd_uuid_move_history(mdev);
2975 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
2976 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 2977 } else {
62b0da3a
LE
2978 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
2979 if (bm_uuid)
2980 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 2981
62b0da3a 2982 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
b411b363
PR
2983 }
2984 drbd_md_mark_dirty(mdev);
2985}
2986
2987/**
2988 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
2989 * @mdev: DRBD device.
2990 *
2991 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
2992 */
2993int drbd_bmio_set_n_write(struct drbd_conf *mdev)
2994{
2995 int rv = -EIO;
2996
2997 if (get_ldev_if_state(mdev, D_ATTACHING)) {
2998 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
2999 drbd_md_sync(mdev);
3000 drbd_bm_set_all(mdev);
3001
3002 rv = drbd_bm_write(mdev);
3003
3004 if (!rv) {
3005 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3006 drbd_md_sync(mdev);
3007 }
3008
3009 put_ldev(mdev);
3010 }
3011
3012 return rv;
3013}
3014
3015/**
3016 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3017 * @mdev: DRBD device.
3018 *
3019 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3020 */
3021int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3022{
3023 int rv = -EIO;
3024
0778286a 3025 drbd_resume_al(mdev);
b411b363
PR
3026 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3027 drbd_bm_clear_all(mdev);
3028 rv = drbd_bm_write(mdev);
3029 put_ldev(mdev);
3030 }
3031
3032 return rv;
3033}
3034
99920dc5 3035static int w_bitmap_io(struct drbd_work *w, int unused)
b411b363
PR
3036{
3037 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
00d56944 3038 struct drbd_conf *mdev = w->mdev;
02851e9f 3039 int rv = -EIO;
b411b363
PR
3040
3041 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3042
02851e9f 3043 if (get_ldev(mdev)) {
20ceb2b2 3044 drbd_bm_lock(mdev, work->why, work->flags);
02851e9f
LE
3045 rv = work->io_fn(mdev);
3046 drbd_bm_unlock(mdev);
3047 put_ldev(mdev);
3048 }
b411b363 3049
4738fa16 3050 clear_bit_unlock(BITMAP_IO, &mdev->flags);
b411b363
PR
3051 wake_up(&mdev->misc_wait);
3052
3053 if (work->done)
3054 work->done(mdev, rv);
3055
3056 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3057 work->why = NULL;
20ceb2b2 3058 work->flags = 0;
b411b363 3059
99920dc5 3060 return 0;
b411b363
PR
3061}
3062
82f59cc6
LE
3063void drbd_ldev_destroy(struct drbd_conf *mdev)
3064{
3065 lc_destroy(mdev->resync);
3066 mdev->resync = NULL;
3067 lc_destroy(mdev->act_log);
3068 mdev->act_log = NULL;
3069 __no_warn(local,
3070 drbd_free_bc(mdev->ldev);
3071 mdev->ldev = NULL;);
3072
82f59cc6
LE
3073 clear_bit(GO_DISKLESS, &mdev->flags);
3074}
3075
99920dc5 3076static int w_go_diskless(struct drbd_work *w, int unused)
e9e6f3ec 3077{
00d56944
PR
3078 struct drbd_conf *mdev = w->mdev;
3079
e9e6f3ec 3080 D_ASSERT(mdev->state.disk == D_FAILED);
9d282875
LE
3081 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3082 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
82f59cc6
LE
3083 * the protected members anymore, though, so once the refcount dropped by
3084 * put_ldev() reaches zero again, it will be safe to free them. */
e9e6f3ec 3085 drbd_force_state(mdev, NS(disk, D_DISKLESS));
99920dc5 3086 return 0;
e9e6f3ec
LE
3087}
3088
3089void drbd_go_diskless(struct drbd_conf *mdev)
3090{
3091 D_ASSERT(mdev->state.disk == D_FAILED);
3092 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
e42325a5 3093 drbd_queue_work(&mdev->tconn->data.work, &mdev->go_diskless);
e9e6f3ec
LE
3094}
3095
b411b363
PR
3096/**
3097 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3098 * @mdev: DRBD device.
3099 * @io_fn: IO callback to be called when bitmap IO is possible
3100 * @done: callback to be called after the bitmap IO was performed
3101 * @why: Descriptive text of the reason for doing the IO
3102 *
3103 * While IO on the bitmap happens we freeze application IO, thus ensuring
3104 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3105 * called from worker context. It MUST NOT be used while a previous such
3106 * work is still pending!
3107 */
3108void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3109 int (*io_fn)(struct drbd_conf *),
3110 void (*done)(struct drbd_conf *, int),
20ceb2b2 3111 char *why, enum bm_flag flags)
b411b363 3112{
e6b3ea83 3113 D_ASSERT(current == mdev->tconn->worker.task);
b411b363
PR
3114
3115 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3116 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3117 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3118 if (mdev->bm_io_work.why)
3119 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3120 why, mdev->bm_io_work.why);
3121
3122 mdev->bm_io_work.io_fn = io_fn;
3123 mdev->bm_io_work.done = done;
3124 mdev->bm_io_work.why = why;
20ceb2b2 3125 mdev->bm_io_work.flags = flags;
b411b363 3126
87eeee41 3127 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
3128 set_bit(BITMAP_IO, &mdev->flags);
3129 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 3130 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
e42325a5 3131 drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w);
b411b363 3132 }
87eeee41 3133 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
3134}
3135
3136/**
3137 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3138 * @mdev: DRBD device.
3139 * @io_fn: IO callback to be called when bitmap IO is possible
3140 * @why: Descriptive text of the reason for doing the IO
3141 *
3142 * Freezes application IO while the actual IO operation runs. This
3143 * function MAY NOT be called from worker context.
3144 */
20ceb2b2
LE
3145int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
3146 char *why, enum bm_flag flags)
b411b363
PR
3147{
3148 int rv;
3149
e6b3ea83 3150 D_ASSERT(current != mdev->tconn->worker.task);
b411b363 3151
20ceb2b2
LE
3152 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
3153 drbd_suspend_io(mdev);
b411b363 3154
20ceb2b2 3155 drbd_bm_lock(mdev, why, flags);
b411b363
PR
3156 rv = io_fn(mdev);
3157 drbd_bm_unlock(mdev);
3158
20ceb2b2
LE
3159 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
3160 drbd_resume_io(mdev);
b411b363
PR
3161
3162 return rv;
3163}
3164
3165void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3166{
3167 if ((mdev->ldev->md.flags & flag) != flag) {
3168 drbd_md_mark_dirty(mdev);
3169 mdev->ldev->md.flags |= flag;
3170 }
3171}
3172
3173void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3174{
3175 if ((mdev->ldev->md.flags & flag) != 0) {
3176 drbd_md_mark_dirty(mdev);
3177 mdev->ldev->md.flags &= ~flag;
3178 }
3179}
3180int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3181{
3182 return (bdev->md.flags & flag) != 0;
3183}
3184
3185static void md_sync_timer_fn(unsigned long data)
3186{
3187 struct drbd_conf *mdev = (struct drbd_conf *) data;
3188
e42325a5 3189 drbd_queue_work_front(&mdev->tconn->data.work, &mdev->md_sync_work);
b411b363
PR
3190}
3191
99920dc5 3192static int w_md_sync(struct drbd_work *w, int unused)
b411b363 3193{
00d56944
PR
3194 struct drbd_conf *mdev = w->mdev;
3195
b411b363 3196 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
ee15b038
LE
3197#ifdef DEBUG
3198 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3199 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3200#endif
b411b363 3201 drbd_md_sync(mdev);
99920dc5 3202 return 0;
b411b363
PR
3203}
3204
d8763023 3205const char *cmdname(enum drbd_packet cmd)
f2ad9063
AG
3206{
3207 /* THINK may need to become several global tables
3208 * when we want to support more than
3209 * one PRO_VERSION */
3210 static const char *cmdnames[] = {
3211 [P_DATA] = "Data",
3212 [P_DATA_REPLY] = "DataReply",
3213 [P_RS_DATA_REPLY] = "RSDataReply",
3214 [P_BARRIER] = "Barrier",
3215 [P_BITMAP] = "ReportBitMap",
3216 [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
3217 [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
3218 [P_UNPLUG_REMOTE] = "UnplugRemote",
3219 [P_DATA_REQUEST] = "DataRequest",
3220 [P_RS_DATA_REQUEST] = "RSDataRequest",
3221 [P_SYNC_PARAM] = "SyncParam",
3222 [P_SYNC_PARAM89] = "SyncParam89",
3223 [P_PROTOCOL] = "ReportProtocol",
3224 [P_UUIDS] = "ReportUUIDs",
3225 [P_SIZES] = "ReportSizes",
3226 [P_STATE] = "ReportState",
3227 [P_SYNC_UUID] = "ReportSyncUUID",
3228 [P_AUTH_CHALLENGE] = "AuthChallenge",
3229 [P_AUTH_RESPONSE] = "AuthResponse",
3230 [P_PING] = "Ping",
3231 [P_PING_ACK] = "PingAck",
3232 [P_RECV_ACK] = "RecvAck",
3233 [P_WRITE_ACK] = "WriteAck",
3234 [P_RS_WRITE_ACK] = "RSWriteAck",
7be8da07 3235 [P_DISCARD_WRITE] = "DiscardWrite",
f2ad9063
AG
3236 [P_NEG_ACK] = "NegAck",
3237 [P_NEG_DREPLY] = "NegDReply",
3238 [P_NEG_RS_DREPLY] = "NegRSDReply",
3239 [P_BARRIER_ACK] = "BarrierAck",
3240 [P_STATE_CHG_REQ] = "StateChgRequest",
3241 [P_STATE_CHG_REPLY] = "StateChgReply",
3242 [P_OV_REQUEST] = "OVRequest",
3243 [P_OV_REPLY] = "OVReply",
3244 [P_OV_RESULT] = "OVResult",
3245 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
3246 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
3247 [P_COMPRESSED_BITMAP] = "CBitmap",
3248 [P_DELAY_PROBE] = "DelayProbe",
3249 [P_OUT_OF_SYNC] = "OutOfSync",
7be8da07 3250 [P_RETRY_WRITE] = "RetryWrite",
ae25b336
LE
3251 [P_RS_CANCEL] = "RSCancel",
3252 [P_CONN_ST_CHG_REQ] = "conn_st_chg_req",
3253 [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
3254
3255 /* enum drbd_packet, but not commands - obsoleted flags:
3256 * P_MAY_IGNORE
3257 * P_MAX_OPT_CMD
3258 */
f2ad9063
AG
3259 };
3260
ae25b336 3261 /* too big for the array: 0xfffX */
e5d6f33a
AG
3262 if (cmd == P_INITIAL_META)
3263 return "InitialMeta";
3264 if (cmd == P_INITIAL_DATA)
3265 return "InitialData";
6038178e
AG
3266 if (cmd == P_CONNECTION_FEATURES)
3267 return "ConnectionFeatures";
6e849ce8 3268 if (cmd >= ARRAY_SIZE(cmdnames))
f2ad9063
AG
3269 return "Unknown";
3270 return cmdnames[cmd];
3271}
3272
7be8da07
AG
3273/**
3274 * drbd_wait_misc - wait for a request to make progress
3275 * @mdev: device associated with the request
3276 * @i: the struct drbd_interval embedded in struct drbd_request or
3277 * struct drbd_peer_request
3278 */
3279int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i)
3280{
44ed167d 3281 struct net_conf *nc;
7be8da07
AG
3282 DEFINE_WAIT(wait);
3283 long timeout;
3284
44ed167d
PR
3285 rcu_read_lock();
3286 nc = rcu_dereference(mdev->tconn->net_conf);
3287 if (!nc) {
3288 rcu_read_unlock();
7be8da07 3289 return -ETIMEDOUT;
44ed167d
PR
3290 }
3291 timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
3292 rcu_read_unlock();
7be8da07
AG
3293
3294 /* Indicate to wake up mdev->misc_wait on progress. */
3295 i->waiting = true;
3296 prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
3297 spin_unlock_irq(&mdev->tconn->req_lock);
3298 timeout = schedule_timeout(timeout);
3299 finish_wait(&mdev->misc_wait, &wait);
3300 spin_lock_irq(&mdev->tconn->req_lock);
3301 if (!timeout || mdev->state.conn < C_CONNECTED)
3302 return -ETIMEDOUT;
3303 if (signal_pending(current))
3304 return -ERESTARTSYS;
3305 return 0;
3306}
3307
b411b363
PR
3308#ifdef CONFIG_DRBD_FAULT_INJECTION
3309/* Fault insertion support including random number generator shamelessly
3310 * stolen from kernel/rcutorture.c */
3311struct fault_random_state {
3312 unsigned long state;
3313 unsigned long count;
3314};
3315
3316#define FAULT_RANDOM_MULT 39916801 /* prime */
3317#define FAULT_RANDOM_ADD 479001701 /* prime */
3318#define FAULT_RANDOM_REFRESH 10000
3319
3320/*
3321 * Crude but fast random-number generator. Uses a linear congruential
3322 * generator, with occasional help from get_random_bytes().
3323 */
3324static unsigned long
3325_drbd_fault_random(struct fault_random_state *rsp)
3326{
3327 long refresh;
3328
49829ea7 3329 if (!rsp->count--) {
b411b363
PR
3330 get_random_bytes(&refresh, sizeof(refresh));
3331 rsp->state += refresh;
3332 rsp->count = FAULT_RANDOM_REFRESH;
3333 }
3334 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3335 return swahw32(rsp->state);
3336}
3337
3338static char *
3339_drbd_fault_str(unsigned int type) {
3340 static char *_faults[] = {
3341 [DRBD_FAULT_MD_WR] = "Meta-data write",
3342 [DRBD_FAULT_MD_RD] = "Meta-data read",
3343 [DRBD_FAULT_RS_WR] = "Resync write",
3344 [DRBD_FAULT_RS_RD] = "Resync read",
3345 [DRBD_FAULT_DT_WR] = "Data write",
3346 [DRBD_FAULT_DT_RD] = "Data read",
3347 [DRBD_FAULT_DT_RA] = "Data read ahead",
3348 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
3349 [DRBD_FAULT_AL_EE] = "EE allocation",
3350 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
3351 };
3352
3353 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3354}
3355
3356unsigned int
3357_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3358{
3359 static struct fault_random_state rrs = {0, 0};
3360
3361 unsigned int ret = (
3362 (fault_devs == 0 ||
3363 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3364 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3365
3366 if (ret) {
3367 fault_count++;
3368
7383506c 3369 if (__ratelimit(&drbd_ratelimit_state))
b411b363
PR
3370 dev_warn(DEV, "***Simulating %s failure\n",
3371 _drbd_fault_str(type));
3372 }
3373
3374 return ret;
3375}
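/*
 * A fault is injected when the device's minor is selected by the fault_devs
 * bitmask (fault_devs == 0 selects all devices) and a pseudo-random draw in
 * the range 1..100 is at most fault_rate, so fault_rate is roughly the
 * percentage of operations of the instrumented type that will be failed.
 */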
3376#endif
3377
3378const char *drbd_buildtag(void)
3379{
3380 /* A DRBD build from external sources carries a reference to the
3381 git hash of the source code here. */
3382
3383 static char buildtag[38] = "\0uilt-in";
3384
3385 if (buildtag[0] == 0) {
3386#ifdef CONFIG_MODULES
3387 if (THIS_MODULE != NULL)
3388 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3389 else
3390#endif
3391 buildtag[0] = 'b';
3392 }
3393
3394 return buildtag;
3395}
3396
3397module_init(drbd_init)
3398module_exit(drbd_cleanup)
3399
b411b363
PR
3400EXPORT_SYMBOL(drbd_conn_str);
3401EXPORT_SYMBOL(drbd_role_str);
3402EXPORT_SYMBOL(drbd_disk_str);
3403EXPORT_SYMBOL(drbd_set_st_err_str);