b411b363
PR
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
b411b363 29#include <linux/module.h>
b411b363
PR
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
2a48fc0a 35#include <linux/mutex.h>
b411b363
PR
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
b411b363
PR
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
2a48fc0a 59static DEFINE_MUTEX(drbd_main_mutex);
b411b363
PR
60int drbdd_init(struct drbd_thread *);
61int drbd_worker(struct drbd_thread *);
62int drbd_asender(struct drbd_thread *);
63
64int drbd_init(void);
65static int drbd_open(struct block_device *bdev, fmode_t mode);
66static int drbd_release(struct gendisk *gd, fmode_t mode);
00d56944 67static int w_md_sync(struct drbd_work *w, int unused);
b411b363 68static void md_sync_timer_fn(unsigned long data);
00d56944
PR
69static int w_bitmap_io(struct drbd_work *w, int unused);
70static int w_go_diskless(struct drbd_work *w, int unused);
b411b363 71
b411b363
PR
72MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
73 "Lars Ellenberg <lars@linbit.com>");
74MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
75MODULE_VERSION(REL_VERSION);
76MODULE_LICENSE("GPL");
2b8a90b5
PR
77MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
78 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
b411b363
PR
79MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
80
81#include <linux/moduleparam.h>
82/* allow_open_on_secondary */
83MODULE_PARM_DESC(allow_oos, "DONT USE!");
84/* thanks to these macros, if compiled into the kernel (not-module),
85 * this becomes the boot parameter drbd.minor_count */
86module_param(minor_count, uint, 0444);
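/* e.g. when built into the kernel: boot with drbd.minor_count=8;
 * when built as a module: modprobe drbd minor_count=8 */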
87module_param(disable_sendpage, bool, 0644);
88module_param(allow_oos, bool, 0);
89module_param(cn_idx, uint, 0444);
90module_param(proc_details, int, 0644);
91
92#ifdef CONFIG_DRBD_FAULT_INJECTION
93int enable_faults;
94int fault_rate;
95static int fault_count;
96int fault_devs;
97/* bitmap of enabled faults */
98module_param(enable_faults, int, 0664);
99/* fault rate % value - applies to all enabled faults */
100module_param(fault_rate, int, 0664);
101/* count of faults inserted */
102module_param(fault_count, int, 0664);
103/* bitmap of devices to insert faults on */
104module_param(fault_devs, int, 0644);
105#endif
106
107/* module parameter, defined */
2b8a90b5 108unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
b411b363
PR
109int disable_sendpage;
110int allow_oos;
111unsigned int cn_idx = CN_IDX_DRBD;
112int proc_details; /* Detail level in /proc/drbd */
113
114/* Module parameter for setting the user mode helper program
115 * to run. Default is /sbin/drbdadm */
116char usermode_helper[80] = "/sbin/drbdadm";
117
118module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
119
120/* in 2.6.x, our device mapping and config info contains our virtual gendisks
121 * as member "struct gendisk *vdisk;"
122 */
123struct drbd_conf **minor_table;
2111438b 124struct list_head drbd_tconns; /* list of struct drbd_tconn */
b411b363
PR
125
126struct kmem_cache *drbd_request_cache;
6c852bec 127struct kmem_cache *drbd_ee_cache; /* peer requests */
b411b363
PR
128struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
129struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
130mempool_t *drbd_request_mempool;
131mempool_t *drbd_ee_mempool;
132
133/* I do not use a standard mempool, because:
134 1) I want to hand out the pre-allocated objects first.
135 2) I want to be able to interrupt sleeping allocation with a signal.
136 Note: This is a single linked list, the next pointer is the private
137 member of struct page.
138 */
139struct page *drbd_pp_pool;
140spinlock_t drbd_pp_lock;
141int drbd_pp_vacant;
142wait_queue_head_t drbd_pp_wait;
143
144DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
145
7d4e9d09 146static const struct block_device_operations drbd_ops = {
b411b363
PR
147 .owner = THIS_MODULE,
148 .open = drbd_open,
149 .release = drbd_release,
150};
151
152#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
153
154#ifdef __CHECKER__
155/* When checking with sparse, and this is an inline function, sparse will
156 give tons of false positives. When this is a real function, sparse works.
157 */
158int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
159{
160 int io_allowed;
161
162 atomic_inc(&mdev->local_cnt);
163 io_allowed = (mdev->state.disk >= mins);
164 if (!io_allowed) {
165 if (atomic_dec_and_test(&mdev->local_cnt))
166 wake_up(&mdev->misc_wait);
167 }
168 return io_allowed;
169}
170
171#endif
172
173/**
174 * DOC: The transfer log
175 *
176 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
87eeee41 177 * mdev->tconn->newest_tle points to the head, mdev->tconn->oldest_tle points to the tail
b411b363
PR
178 * of the list. There is always at least one &struct drbd_tl_epoch object.
179 *
180 * Each &struct drbd_tl_epoch has a circular double linked list of requests
181 * attached.
182 */
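/*
 * Illustration: with two epochs in flight the (singly linked) list is
 *
 *   tconn->oldest_tle -> epoch A -> epoch B == tconn->newest_tle -> NULL
 *
 * where each epoch carries the ring of requests issued before its barrier.
 */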
2f5cdd0b 183static int tl_init(struct drbd_tconn *tconn)
b411b363
PR
184{
185 struct drbd_tl_epoch *b;
186
187 /* during device minor initialization, we may well use GFP_KERNEL */
188 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
189 if (!b)
190 return 0;
191 INIT_LIST_HEAD(&b->requests);
192 INIT_LIST_HEAD(&b->w.list);
193 b->next = NULL;
194 b->br_number = 4711;
7e602c0a 195 b->n_writes = 0;
b411b363
PR
196 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
197
2f5cdd0b
PR
198 tconn->oldest_tle = b;
199 tconn->newest_tle = b;
200 INIT_LIST_HEAD(&tconn->out_of_sequence_requests);
b411b363 201
b411b363
PR
202 return 1;
203}
204
2f5cdd0b 205static void tl_cleanup(struct drbd_tconn *tconn)
b411b363 206{
2f5cdd0b
PR
207 if (tconn->oldest_tle != tconn->newest_tle)
208 conn_err(tconn, "ASSERT FAILED: oldest_tle == newest_tle\n");
209 if (!list_empty(&tconn->out_of_sequence_requests))
210 conn_err(tconn, "ASSERT FAILED: list_empty(out_of_sequence_requests)\n");
211 kfree(tconn->oldest_tle);
212 tconn->oldest_tle = NULL;
213 kfree(tconn->unused_spare_tle);
214 tconn->unused_spare_tle = NULL;
d628769b
AG
215}
216
b411b363
PR
217/**
218 * _tl_add_barrier() - Adds a barrier to the transfer log
219 * @tconn: DRBD connection.
220 * @new: Barrier to be added before the current head of the TL.
221 *
222 * The caller must hold the req_lock.
223 */
2f5cdd0b 224void _tl_add_barrier(struct drbd_tconn *tconn, struct drbd_tl_epoch *new)
b411b363
PR
225{
226 struct drbd_tl_epoch *newest_before;
227
228 INIT_LIST_HEAD(&new->requests);
229 INIT_LIST_HEAD(&new->w.list);
230 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
231 new->next = NULL;
7e602c0a 232 new->n_writes = 0;
b411b363 233
2f5cdd0b 234 newest_before = tconn->newest_tle;
b411b363
PR
235 /* never send a barrier number == 0, because that is special-cased
236 * when using TCQ for our write ordering code */
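	/* e.g. if br_number wraps past its maximum, +1 yields 0 and the
	 * "?: 1" below maps that back to 1 */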
237 new->br_number = (newest_before->br_number+1) ?: 1;
2f5cdd0b
PR
238 if (tconn->newest_tle != new) {
239 tconn->newest_tle->next = new;
240 tconn->newest_tle = new;
b411b363
PR
241 }
242}
243
244/**
245 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
246 * @tconn: DRBD connection.
247 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
248 * @set_size: Expected number of requests before that barrier.
249 *
250 * In case the passed barrier_nr or set_size does not match the oldest
251 * &struct drbd_tl_epoch objects this function will cause a termination
252 * of the connection.
253 */
2f5cdd0b
PR
254void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
255 unsigned int set_size)
b411b363 256{
2f5cdd0b 257 struct drbd_conf *mdev;
b411b363
PR
258 struct drbd_tl_epoch *b, *nob; /* next old barrier */
259 struct list_head *le, *tle;
260 struct drbd_request *r;
261
2f5cdd0b 262 spin_lock_irq(&tconn->req_lock);
b411b363 263
2f5cdd0b 264 b = tconn->oldest_tle;
b411b363
PR
265
266 /* first some paranoia code */
267 if (b == NULL) {
2f5cdd0b
PR
268 conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
269 barrier_nr);
b411b363
PR
270 goto bail;
271 }
272 if (b->br_number != barrier_nr) {
2f5cdd0b
PR
273 conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
274 barrier_nr, b->br_number);
b411b363
PR
275 goto bail;
276 }
7e602c0a 277 if (b->n_writes != set_size) {
2f5cdd0b
PR
278 conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
279 barrier_nr, set_size, b->n_writes);
b411b363
PR
280 goto bail;
281 }
282
283 /* Clean up list of requests processed during current epoch */
284 list_for_each_safe(le, tle, &b->requests) {
285 r = list_entry(le, struct drbd_request, tl_requests);
8554df1c 286 _req_mod(r, BARRIER_ACKED);
b411b363
PR
287 }
288 /* There could be requests on the list waiting for completion
289 of the write to the local disk. To avoid corruption of
290 slab's data structures we have to remove the list's head.
291
292 Also there could have been a barrier ack out of sequence, overtaking
293 the write acks - which would be a bug and violating write ordering.
294 To not deadlock in case we lose connection while such requests are
295 still pending, we need some way to find them for the
8554df1c 296 _req_mod(CONNECTION_LOST_WHILE_PENDING).
b411b363
PR
297
298 These have been list_move'd to the out_of_sequence_requests list in
8554df1c 299 _req_mod(, BARRIER_ACKED) above.
b411b363
PR
300 */
301 list_del_init(&b->requests);
2f5cdd0b 302 mdev = b->w.mdev;
b411b363
PR
303
304 nob = b->next;
305 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
2f5cdd0b 306 _tl_add_barrier(tconn, b);
b411b363 307 if (nob)
2f5cdd0b 308 tconn->oldest_tle = nob;
b411b363 309 /* if nob == NULL b was the only barrier, and becomes the new
2f5cdd0b 310 barrier. Therefore tconn->oldest_tle points already to b */
b411b363
PR
311 } else {
312 D_ASSERT(nob != NULL);
2f5cdd0b 313 tconn->oldest_tle = nob;
b411b363
PR
314 kfree(b);
315 }
316
2f5cdd0b 317 spin_unlock_irq(&tconn->req_lock);
b411b363
PR
318 dec_ap_pending(mdev);
319
320 return;
321
322bail:
2f5cdd0b
PR
323 spin_unlock_irq(&tconn->req_lock);
324 conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
b411b363
PR
325}
326
617049aa 327
b411b363 328/**
11b58e73 329 * _tl_restart() - Walks the transfer log, and applies an action to all requests
b411b363 330 * @tconn: DRBD connection.
11b58e73 331 * @what: The action/event to perform with all request objects
b411b363 332 *
8554df1c
AG
333 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
334 * RESTART_FROZEN_DISK_IO.
b411b363 335 */
2f5cdd0b 336void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
b411b363 337{
11b58e73 338 struct drbd_tl_epoch *b, *tmp, **pn;
b9b98716 339 struct list_head *le, *tle, carry_reads;
11b58e73
PR
340 struct drbd_request *req;
341 int rv, n_writes, n_reads;
b411b363 342
2f5cdd0b
PR
343 b = tconn->oldest_tle;
344 pn = &tconn->oldest_tle;
b411b363 345 while (b) {
11b58e73
PR
346 n_writes = 0;
347 n_reads = 0;
b9b98716 348 INIT_LIST_HEAD(&carry_reads);
b411b363 349 list_for_each_safe(le, tle, &b->requests) {
11b58e73
PR
350 req = list_entry(le, struct drbd_request, tl_requests);
351 rv = _req_mod(req, what);
352
353 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
354 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
b411b363
PR
355 }
356 tmp = b->next;
357
b9b98716 358 if (n_writes) {
8554df1c 359 if (what == RESEND) {
11b58e73
PR
360 b->n_writes = n_writes;
361 if (b->w.cb == NULL) {
362 b->w.cb = w_send_barrier;
2f5cdd0b
PR
363 inc_ap_pending(b->w.mdev);
364 set_bit(CREATE_BARRIER, &b->w.mdev->flags);
11b58e73
PR
365 }
366
2f5cdd0b 367 drbd_queue_work(&tconn->data.work, &b->w);
11b58e73
PR
368 }
369 pn = &b->next;
370 } else {
b9b98716
PR
371 if (n_reads)
372 list_add(&carry_reads, &b->requests);
11b58e73
PR
373 /* there could still be requests on that ring list,
374 * in case local io is still pending */
375 list_del(&b->requests);
376
377 /* dec_ap_pending corresponding to queue_barrier.
378 * the newest barrier may not have been queued yet,
379 * in which case w.cb is still NULL. */
380 if (b->w.cb != NULL)
2f5cdd0b 381 dec_ap_pending(b->w.mdev);
11b58e73 382
2f5cdd0b 383 if (b == tconn->newest_tle) {
11b58e73 384 /* recycle, but reinit! */
2f5cdd0b
PR
385 if (tmp != NULL)
386 conn_err(tconn, "ASSERT FAILED tmp == NULL");
11b58e73 387 INIT_LIST_HEAD(&b->requests);
b9b98716 388 list_splice(&carry_reads, &b->requests);
11b58e73
PR
389 INIT_LIST_HEAD(&b->w.list);
390 b->w.cb = NULL;
391 b->br_number = net_random();
392 b->n_writes = 0;
393
394 *pn = b;
395 break;
396 }
397 *pn = tmp;
398 kfree(b);
b411b363 399 }
b411b363 400 b = tmp;
b9b98716 401 list_splice(&carry_reads, &b->requests);
b411b363 402 }
11b58e73
PR
403}
404
b411b363
PR
405
406/**
407 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
408 * @tconn: DRBD connection.
409 *
410 * This is called after the connection to the peer was lost. The storage covered
411 * by the requests on the transfer log gets marked as out of sync. Called from the
412 * receiver thread and the worker thread.
413 */
2f5cdd0b 414void tl_clear(struct drbd_tconn *tconn)
b411b363 415{
2f5cdd0b 416 struct drbd_conf *mdev;
b411b363
PR
417 struct list_head *le, *tle;
418 struct drbd_request *r;
2f5cdd0b 419 int minor;
b411b363 420
2f5cdd0b 421 spin_lock_irq(&tconn->req_lock);
b411b363 422
2f5cdd0b 423 _tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
b411b363
PR
424
425 /* we expect this list to be empty. */
2f5cdd0b
PR
426 if (!list_empty(&tconn->out_of_sequence_requests))
427 conn_err(tconn, "ASSERT FAILED list_empty(&out_of_sequence_requests)\n");
b411b363
PR
428
429 /* but just in case, clean it up anyways! */
2f5cdd0b 430 list_for_each_safe(le, tle, &tconn->out_of_sequence_requests) {
b411b363
PR
431 r = list_entry(le, struct drbd_request, tl_requests);
432 /* It would be nice to complete outside of spinlock.
433 * But this is easier for now. */
8554df1c 434 _req_mod(r, CONNECTION_LOST_WHILE_PENDING);
b411b363
PR
435 }
436
437 /* ensure bit indicating barrier is required is clear */
2f5cdd0b
PR
438 idr_for_each_entry(&tconn->volumes, mdev, minor)
439 clear_bit(CREATE_BARRIER, &mdev->flags);
b411b363 440
2f5cdd0b 441 spin_unlock_irq(&tconn->req_lock);
b411b363
PR
442}
443
2f5cdd0b 444void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
11b58e73 445{
2f5cdd0b
PR
446 spin_lock_irq(&tconn->req_lock);
447 _tl_restart(tconn, what);
448 spin_unlock_irq(&tconn->req_lock);
b411b363
PR
449}
450
b411b363
PR
451static int drbd_thread_setup(void *arg)
452{
453 struct drbd_thread *thi = (struct drbd_thread *) arg;
392c8801 454 struct drbd_tconn *tconn = thi->tconn;
b411b363
PR
455 unsigned long flags;
456 int retval;
457
f1b3a6ec 458 snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
392c8801 459 thi->name[0], thi->tconn->name);
f1b3a6ec 460
b411b363
PR
461restart:
462 retval = thi->function(thi);
463
464 spin_lock_irqsave(&thi->t_lock, flags);
465
e77a0a5c 466 /* if the receiver has been "EXITING", the last thing it did
b411b363
PR
467 * was set the conn state to "StandAlone",
468 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
469 * and receiver thread will be "started".
e77a0a5c 470 * drbd_thread_start needs to set "RESTARTING" in that case.
b411b363 471 * t_state check and assignment needs to be within the same spinlock,
e77a0a5c
AG
472 * so either thread_start sees EXITING, and can remap to RESTARTING,
473 * or thread_start see NONE, and can proceed as normal.
b411b363
PR
474 */
475
e77a0a5c 476 if (thi->t_state == RESTARTING) {
392c8801 477 conn_info(tconn, "Restarting %s thread\n", thi->name);
e77a0a5c 478 thi->t_state = RUNNING;
b411b363
PR
479 spin_unlock_irqrestore(&thi->t_lock, flags);
480 goto restart;
481 }
482
483 thi->task = NULL;
e77a0a5c 484 thi->t_state = NONE;
b411b363
PR
485 smp_mb();
486 complete(&thi->stop);
487 spin_unlock_irqrestore(&thi->t_lock, flags);
488
392c8801 489 conn_info(tconn, "Terminating %s\n", current->comm);
b411b363
PR
490
491 /* Release mod reference taken when thread was started */
492 module_put(THIS_MODULE);
493 return retval;
494}
495
392c8801 496static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi,
bed879ae 497 int (*func) (struct drbd_thread *), char *name)
b411b363
PR
498{
499 spin_lock_init(&thi->t_lock);
500 thi->task = NULL;
e77a0a5c 501 thi->t_state = NONE;
b411b363 502 thi->function = func;
392c8801 503 thi->tconn = tconn;
bed879ae 504 strncpy(thi->name, name, ARRAY_SIZE(thi->name));
b411b363
PR
505}
506
507int drbd_thread_start(struct drbd_thread *thi)
508{
392c8801 509 struct drbd_tconn *tconn = thi->tconn;
b411b363
PR
510 struct task_struct *nt;
511 unsigned long flags;
512
b411b363
PR
513 /* is used from state engine doing drbd_thread_stop_nowait,
514 * while holding the req lock irqsave */
515 spin_lock_irqsave(&thi->t_lock, flags);
516
517 switch (thi->t_state) {
e77a0a5c 518 case NONE:
392c8801 519 conn_info(tconn, "Starting %s thread (from %s [%d])\n",
bed879ae 520 thi->name, current->comm, current->pid);
b411b363
PR
521
522 /* Get ref on module for thread - this is released when thread exits */
523 if (!try_module_get(THIS_MODULE)) {
392c8801 524 conn_err(tconn, "Failed to get module reference in drbd_thread_start\n");
b411b363 525 spin_unlock_irqrestore(&thi->t_lock, flags);
81e84650 526 return false;
b411b363
PR
527 }
528
529 init_completion(&thi->stop);
b411b363 530 thi->reset_cpu_mask = 1;
e77a0a5c 531 thi->t_state = RUNNING;
b411b363
PR
532 spin_unlock_irqrestore(&thi->t_lock, flags);
533 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
534
535 nt = kthread_create(drbd_thread_setup, (void *) thi,
392c8801 536 "drbd_%c_%s", thi->name[0], thi->tconn->name);
b411b363
PR
537
538 if (IS_ERR(nt)) {
392c8801 539 conn_err(tconn, "Couldn't start thread\n");
b411b363
PR
540
541 module_put(THIS_MODULE);
81e84650 542 return false;
b411b363
PR
543 }
544 spin_lock_irqsave(&thi->t_lock, flags);
545 thi->task = nt;
e77a0a5c 546 thi->t_state = RUNNING;
b411b363
PR
547 spin_unlock_irqrestore(&thi->t_lock, flags);
548 wake_up_process(nt);
549 break;
e77a0a5c
AG
550 case EXITING:
551 thi->t_state = RESTARTING;
392c8801 552 conn_info(tconn, "Restarting %s thread (from %s [%d])\n",
bed879ae 553 thi->name, current->comm, current->pid);
b411b363 554 /* fall through */
e77a0a5c
AG
555 case RUNNING:
556 case RESTARTING:
b411b363
PR
557 default:
558 spin_unlock_irqrestore(&thi->t_lock, flags);
559 break;
560 }
561
81e84650 562 return true;
b411b363
PR
563}
564
565
566void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
567{
568 unsigned long flags;
569
e77a0a5c 570 enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
b411b363
PR
571
572 /* may be called from state engine, holding the req lock irqsave */
573 spin_lock_irqsave(&thi->t_lock, flags);
574
e77a0a5c 575 if (thi->t_state == NONE) {
b411b363
PR
576 spin_unlock_irqrestore(&thi->t_lock, flags);
577 if (restart)
578 drbd_thread_start(thi);
579 return;
580 }
581
582 if (thi->t_state != ns) {
583 if (thi->task == NULL) {
584 spin_unlock_irqrestore(&thi->t_lock, flags);
585 return;
586 }
587
588 thi->t_state = ns;
589 smp_mb();
590 init_completion(&thi->stop);
591 if (thi->task != current)
592 force_sig(DRBD_SIGKILL, thi->task);
b411b363
PR
593 }
594
595 spin_unlock_irqrestore(&thi->t_lock, flags);
596
597 if (wait)
598 wait_for_completion(&thi->stop);
599}
600
392c8801 601static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task)
bed879ae 602{
bed879ae
PR
603 struct drbd_thread *thi =
604 task == tconn->receiver.task ? &tconn->receiver :
605 task == tconn->asender.task ? &tconn->asender :
606 task == tconn->worker.task ? &tconn->worker : NULL;
607
608 return thi;
609}
610
392c8801 611char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task)
bed879ae 612{
392c8801 613 struct drbd_thread *thi = drbd_task_to_thread(tconn, task);
bed879ae
PR
614 return thi ? thi->name : task->comm;
615}
616
80883197 617int conn_lowest_minor(struct drbd_tconn *tconn)
80822284
PR
618{
619 int minor = 0;
774b3055
PR
620
621 if (!idr_get_next(&tconn->volumes, &minor))
622 return -1;
80822284
PR
623 return minor;
624}
774b3055
PR
625
626#ifdef CONFIG_SMP
b411b363
PR
627/**
628 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
629 * @tconn: DRBD connection.
630 *
631 * Forces all threads of a device onto the same CPU. This is beneficial for
632 * DRBD's performance. May be overwritten by user's configuration.
633 */
80822284 634void drbd_calc_cpu_mask(struct drbd_tconn *tconn)
b411b363
PR
635{
636 int ord, cpu;
637
638 /* user override. */
80822284 639 if (cpumask_weight(tconn->cpu_mask))
b411b363
PR
640 return;
641
80822284 642 ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask);
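	/* e.g. with 4 CPUs online and a lowest minor of 5, ord is 1 and the
	 * connection's threads end up pinned to the second online CPU */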
b411b363
PR
643 for_each_online_cpu(cpu) {
644 if (ord-- == 0) {
80822284 645 cpumask_set_cpu(cpu, tconn->cpu_mask);
b411b363
PR
646 return;
647 }
648 }
649 /* should not be reached */
80822284 650 cpumask_setall(tconn->cpu_mask);
b411b363
PR
651}
652
653/**
654 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
bc31fe33 656 * @thi: drbd_thread object
b411b363
PR
657 *
658 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
659 * prematurely.
660 */
80822284 661void drbd_thread_current_set_cpu(struct drbd_thread *thi)
b411b363
PR
662{
663 struct task_struct *p = current;
bed879ae 664
b411b363
PR
665 if (!thi->reset_cpu_mask)
666 return;
667 thi->reset_cpu_mask = 0;
392c8801 668 set_cpus_allowed_ptr(p, thi->tconn->cpu_mask);
b411b363
PR
669}
670#endif
671
d38e787e 672static void prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
fd340c12
PR
673{
674 h->magic = cpu_to_be32(DRBD_MAGIC);
675 h->command = cpu_to_be16(cmd);
676 h->length = cpu_to_be16(size);
677}
678
d38e787e 679static void prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
fd340c12
PR
680{
681 h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
682 h->command = cpu_to_be16(cmd);
683 h->length = cpu_to_be32(size);
684}
685
d38e787e
PR
686static void _prepare_header(struct drbd_tconn *tconn, int vnr, struct p_header *h,
687 enum drbd_packet cmd, int size)
688{
689 if (tconn->agreed_pro_version >= 100 || size > DRBD_MAX_SIZE_H80_PACKET)
690 prepare_header95(&h->h95, cmd, size);
691 else
692 prepare_header80(&h->h80, cmd, size);
693}
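/* The h80 header carries only a 16-bit length field, so payloads that do not fit
 * (and peers speaking protocol version 100 or newer) are sent with the h95 header
 * and its 32-bit length; see prepare_header80()/prepare_header95() above. */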
694
fd340c12 695static void prepare_header(struct drbd_conf *mdev, struct p_header *h,
d8763023 696 enum drbd_packet cmd, int size)
fd340c12 697{
d38e787e 698 _prepare_header(mdev->tconn, mdev->vnr, h, cmd, size);
fd340c12
PR
699}
700
b411b363 701/* the appropriate socket mutex must be held already */
d38e787e 702int _conn_send_cmd(struct drbd_tconn *tconn, int vnr, struct socket *sock,
d8763023
AG
703 enum drbd_packet cmd, struct p_header *h, size_t size,
704 unsigned msg_flags)
b411b363
PR
705{
706 int sent, ok;
707
d38e787e 708 _prepare_header(tconn, vnr, h, cmd, size - sizeof(struct p_header));
b411b363 709
d38e787e 710 sent = drbd_send(tconn, sock, h, size, msg_flags);
b411b363
PR
711
712 ok = (sent == size);
0ddc5549 713 if (!ok && !signal_pending(current))
d38e787e
PR
714 conn_warn(tconn, "short sent %s size=%d sent=%d\n",
715 cmdname(cmd), (int)size, sent);
b411b363
PR
716 return ok;
717}
718
719/* don't pass the socket. we may only look at it
720 * when we hold the appropriate socket mutex.
721 */
2a67d8b9 722int conn_send_cmd(struct drbd_tconn *tconn, int vnr, int use_data_socket,
d8763023 723 enum drbd_packet cmd, struct p_header *h, size_t size)
b411b363
PR
724{
725 int ok = 0;
726 struct socket *sock;
727
728 if (use_data_socket) {
2a67d8b9
PR
729 mutex_lock(&tconn->data.mutex);
730 sock = tconn->data.socket;
b411b363 731 } else {
2a67d8b9
PR
732 mutex_lock(&tconn->meta.mutex);
733 sock = tconn->meta.socket;
b411b363
PR
734 }
735
736 /* drbd_disconnect() could have called drbd_free_sock()
737 * while we were waiting in down()... */
738 if (likely(sock != NULL))
2a67d8b9 739 ok = _conn_send_cmd(tconn, vnr, sock, cmd, h, size, 0);
b411b363
PR
740
741 if (use_data_socket)
2a67d8b9 742 mutex_unlock(&tconn->data.mutex);
b411b363 743 else
2a67d8b9 744 mutex_unlock(&tconn->meta.mutex);
b411b363
PR
745 return ok;
746}
747
61120870 748int conn_send_cmd2(struct drbd_tconn *tconn, enum drbd_packet cmd, char *data,
b411b363
PR
749 size_t size)
750{
61120870 751 struct p_header80 h;
b411b363
PR
752 int ok;
753
61120870 754 prepare_header80(&h, cmd, size);
b411b363 755
61120870 756 if (!drbd_get_data_sock(tconn))
b411b363
PR
757 return 0;
758
b411b363 759 ok = (sizeof(h) ==
61120870 760 drbd_send(tconn, tconn->data.socket, &h, sizeof(h), 0));
b411b363 761 ok = ok && (size ==
61120870 762 drbd_send(tconn, tconn->data.socket, data, size, 0));
b411b363 763
61120870 764 drbd_put_data_sock(tconn);
b411b363
PR
765
766 return ok;
767}
768
769int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
770{
8e26f9cc 771 struct p_rs_param_95 *p;
b411b363
PR
772 struct socket *sock;
773 int size, rv;
31890f4a 774 const int apv = mdev->tconn->agreed_pro_version;
b411b363
PR
775
776 size = apv <= 87 ? sizeof(struct p_rs_param)
777 : apv == 88 ? sizeof(struct p_rs_param)
778 + strlen(mdev->sync_conf.verify_alg) + 1
8e26f9cc
PR
779 : apv <= 94 ? sizeof(struct p_rs_param_89)
780 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363
PR
781
782 /* used from admin command context and receiver/worker context.
783 * to avoid kmalloc, grab the socket right here,
784 * then use the pre-allocated sbuf there */
e42325a5
PR
785 mutex_lock(&mdev->tconn->data.mutex);
786 sock = mdev->tconn->data.socket;
b411b363
PR
787
788 if (likely(sock != NULL)) {
d8763023
AG
789 enum drbd_packet cmd =
790 apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
b411b363 791
e42325a5 792 p = &mdev->tconn->data.sbuf.rs_param_95;
b411b363
PR
793
794 /* initialize verify_alg and csums_alg */
795 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
796
797 p->rate = cpu_to_be32(sc->rate);
8e26f9cc
PR
798 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
799 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
800 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
801 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
b411b363
PR
802
803 if (apv >= 88)
804 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
805 if (apv >= 89)
806 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
807
808 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
809 } else
810 rv = 0; /* not ok */
811
e42325a5 812 mutex_unlock(&mdev->tconn->data.mutex);
b411b363
PR
813
814 return rv;
815}
816
dc8228d1 817int drbd_send_protocol(struct drbd_tconn *tconn)
b411b363
PR
818{
819 struct p_protocol *p;
cf14c2e9 820 int size, cf, rv;
b411b363
PR
821
822 size = sizeof(struct p_protocol);
823
dc8228d1
PR
824 if (tconn->agreed_pro_version >= 87)
825 size += strlen(tconn->net_conf->integrity_alg) + 1;
b411b363
PR
826
827 /* we must not recurse into our own queue,
828 * as that is blocked during handshake */
829 p = kmalloc(size, GFP_NOIO);
830 if (p == NULL)
831 return 0;
832
dc8228d1
PR
833 p->protocol = cpu_to_be32(tconn->net_conf->wire_protocol);
834 p->after_sb_0p = cpu_to_be32(tconn->net_conf->after_sb_0p);
835 p->after_sb_1p = cpu_to_be32(tconn->net_conf->after_sb_1p);
836 p->after_sb_2p = cpu_to_be32(tconn->net_conf->after_sb_2p);
837 p->two_primaries = cpu_to_be32(tconn->net_conf->two_primaries);
b411b363 838
cf14c2e9 839 cf = 0;
dc8228d1 840 if (tconn->net_conf->want_lose)
cf14c2e9 841 cf |= CF_WANT_LOSE;
dc8228d1
PR
842 if (tconn->net_conf->dry_run) {
843 if (tconn->agreed_pro_version >= 92)
cf14c2e9
PR
844 cf |= CF_DRY_RUN;
845 else {
dc8228d1 846 conn_err(tconn, "--dry-run is not supported by peer");
7ac314c8 847 kfree(p);
148efa16 848 return -1;
cf14c2e9
PR
849 }
850 }
851 p->conn_flags = cpu_to_be32(cf);
852
dc8228d1
PR
853 if (tconn->agreed_pro_version >= 87)
854 strcpy(p->integrity_alg, tconn->net_conf->integrity_alg);
b411b363 855
dc8228d1 856 rv = conn_send_cmd2(tconn, P_PROTOCOL, p->head.payload, size - sizeof(struct p_header));
b411b363
PR
857 kfree(p);
858 return rv;
859}
860
861int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
862{
863 struct p_uuids p;
864 int i;
865
866 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
867 return 1;
868
869 for (i = UI_CURRENT; i < UI_SIZE; i++)
870 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
871
872 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
873 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
89e58e75 874 uuid_flags |= mdev->tconn->net_conf->want_lose ? 1 : 0;
b411b363
PR
875 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
876 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
877 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
878
879 put_ldev(mdev);
880
c012949a 881 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, &p.head, sizeof(p));
b411b363
PR
882}
883
884int drbd_send_uuids(struct drbd_conf *mdev)
885{
886 return _drbd_send_uuids(mdev, 0);
887}
888
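/* uuid_flags bits assembled above: 1 = net_conf->want_lose, 2 = CRASHED_PRIMARY
 * was set, 4 = local disk was D_INCONSISTENT; the helper below passes 8 to mark
 * "skip initial sync". */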
889int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
890{
891 return _drbd_send_uuids(mdev, 8);
892}
893
62b0da3a
LE
894void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
895{
896 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
897 u64 *uuid = mdev->ldev->md.uuid;
898 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
899 text,
900 (unsigned long long)uuid[UI_CURRENT],
901 (unsigned long long)uuid[UI_BITMAP],
902 (unsigned long long)uuid[UI_HISTORY_START],
903 (unsigned long long)uuid[UI_HISTORY_END]);
904 put_ldev(mdev);
905 } else {
906 dev_info(DEV, "%s effective data uuid: %016llX\n",
907 text,
908 (unsigned long long)mdev->ed_uuid);
909 }
910}
911
5a22db89 912int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
b411b363
PR
913{
914 struct p_rs_uuid p;
5a22db89
LE
915 u64 uuid;
916
917 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
b411b363 918
4a23f264 919 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
5a22db89 920 drbd_uuid_set(mdev, UI_BITMAP, uuid);
62b0da3a 921 drbd_print_uuids(mdev, "updated sync UUID");
5a22db89
LE
922 drbd_md_sync(mdev);
923 p.uuid = cpu_to_be64(uuid);
b411b363 924
c012949a 925 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, &p.head, sizeof(p));
b411b363
PR
926}
927
e89b591c 928int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
b411b363
PR
929{
930 struct p_sizes p;
931 sector_t d_size, u_size;
99432fcc 932 int q_order_type, max_bio_size;
b411b363
PR
933 int ok;
934
935 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
936 D_ASSERT(mdev->ldev->backing_bdev);
937 d_size = drbd_get_max_capacity(mdev->ldev);
938 u_size = mdev->ldev->dc.disk_size;
939 q_order_type = drbd_queue_order_type(mdev);
99432fcc
PR
940 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
941 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
b411b363
PR
942 put_ldev(mdev);
943 } else {
944 d_size = 0;
945 u_size = 0;
946 q_order_type = QUEUE_ORDERED_NONE;
99432fcc 947 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
b411b363
PR
948 }
949
950 p.d_size = cpu_to_be64(d_size);
951 p.u_size = cpu_to_be64(u_size);
952 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
99432fcc 953 p.max_bio_size = cpu_to_be32(max_bio_size);
e89b591c
PR
954 p.queue_order_type = cpu_to_be16(q_order_type);
955 p.dds_flags = cpu_to_be16(flags);
b411b363 956
c012949a 957 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, &p.head, sizeof(p));
b411b363
PR
958 return ok;
959}
960
961/**
962 * drbd_send_state() - Sends the drbd state to the peer
963 * @mdev: DRBD device.
964 */
965int drbd_send_state(struct drbd_conf *mdev)
966{
967 struct socket *sock;
968 struct p_state p;
969 int ok = 0;
970
e42325a5 971 mutex_lock(&mdev->tconn->data.mutex);
b411b363
PR
972
973 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
e42325a5 974 sock = mdev->tconn->data.socket;
b411b363
PR
975
976 if (likely(sock != NULL)) {
c012949a 977 ok = _drbd_send_cmd(mdev, sock, P_STATE, &p.head, sizeof(p), 0);
b411b363
PR
978 }
979
e42325a5 980 mutex_unlock(&mdev->tconn->data.mutex);
b411b363 981
b411b363
PR
982 return ok;
983}
984
cf29c9d8
PR
985int _conn_send_state_req(struct drbd_tconn *tconn, int vnr, enum drbd_packet cmd,
986 union drbd_state mask, union drbd_state val)
b411b363
PR
987{
988 struct p_req_state p;
989
990 p.mask = cpu_to_be32(mask.i);
991 p.val = cpu_to_be32(val.i);
992
cf29c9d8 993 return conn_send_cmd(tconn, vnr, USE_DATA_SOCKET, cmd, &p.head, sizeof(p));
b411b363
PR
994}
995
bf885f8a 996int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
b411b363
PR
997{
998 struct p_req_state_reply p;
999
1000 p.retcode = cpu_to_be32(retcode);
1001
c012949a 1002 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, &p.head, sizeof(p));
b411b363
PR
1003}
1004
047cd4a6
PR
1005int conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode)
1006{
1007 struct p_req_state_reply p;
1008 enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
1009
1010 p.retcode = cpu_to_be32(retcode);
1011
1012 return conn_send_cmd(tconn, 0, USE_META_SOCKET, cmd, &p.head, sizeof(p));
1013}
1014
b411b363
PR
1015int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1016 struct p_compressed_bm *p,
1017 struct bm_xfer_ctx *c)
1018{
1019 struct bitstream bs;
1020 unsigned long plain_bits;
1021 unsigned long tmp;
1022 unsigned long rl;
1023 unsigned len;
1024 unsigned toggle;
1025 int bits;
1026
1027 /* may we use this feature? */
1028 if ((mdev->sync_conf.use_rle == 0) ||
31890f4a 1029 (mdev->tconn->agreed_pro_version < 90))
b411b363
PR
1030 return 0;
1031
1032 if (c->bit_offset >= c->bm_bits)
1033 return 0; /* nothing to do. */
1034
1035 /* use at most this many bytes */
1036 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1037 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1038 /* plain bits covered in this code string */
1039 plain_bits = 0;
1040
1041 /* p->encoding & 0x80 stores whether the first run length is set.
1042 * bit offset is implicit.
1043 * start with toggle == 2 to be able to tell the first iteration */
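	/* e.g. a chunk starting 0 0 1 1 1 1 0 0 0 0 ... is sent as start flag 0
	 * (the first run counts cleared bits) followed by the VLI-encoded run
	 * lengths 2, 4, 4, ... */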
1044 toggle = 2;
1045
1046 /* see how many plain bits we can stuff into one packet
1047 * using RLE and VLI. */
1048 do {
1049 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1050 : _drbd_bm_find_next(mdev, c->bit_offset);
1051 if (tmp == -1UL)
1052 tmp = c->bm_bits;
1053 rl = tmp - c->bit_offset;
1054
1055 if (toggle == 2) { /* first iteration */
1056 if (rl == 0) {
1057 /* the first checked bit was set,
1058 * store start value, */
1059 DCBP_set_start(p, 1);
1060 /* but skip encoding of zero run length */
1061 toggle = !toggle;
1062 continue;
1063 }
1064 DCBP_set_start(p, 0);
1065 }
1066
1067 /* paranoia: catch zero runlength.
1068 * can only happen if bitmap is modified while we scan it. */
1069 if (rl == 0) {
1070 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1071 "t:%u bo:%lu\n", toggle, c->bit_offset);
1072 return -1;
1073 }
1074
1075 bits = vli_encode_bits(&bs, rl);
1076 if (bits == -ENOBUFS) /* buffer full */
1077 break;
1078 if (bits <= 0) {
1079 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1080 return 0;
1081 }
1082
1083 toggle = !toggle;
1084 plain_bits += rl;
1085 c->bit_offset = tmp;
1086 } while (c->bit_offset < c->bm_bits);
1087
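	/* number of code bytes used; a partially filled last byte counts as one */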
1088 len = bs.cur.b - p->code + !!bs.cur.bit;
1089
1090 if (plain_bits < (len << 3)) {
1091 /* incompressible with this method.
1092 * we need to rewind both word and bit position. */
1093 c->bit_offset -= plain_bits;
1094 bm_xfer_ctx_bit_to_word_offset(c);
1095 c->bit_offset = c->word_offset * BITS_PER_LONG;
1096 return 0;
1097 }
1098
1099 /* RLE + VLI was able to compress it just fine.
1100 * update c->word_offset. */
1101 bm_xfer_ctx_bit_to_word_offset(c);
1102
1103 /* store pad_bits */
1104 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1105
1106 return len;
1107}
1108
f70af118
AG
1109/**
1110 * send_bitmap_rle_or_plain
1111 *
1112 * Return 0 when done, 1 when another iteration is needed, and a negative error
1113 * code upon failure.
1114 */
1115static int
b411b363 1116send_bitmap_rle_or_plain(struct drbd_conf *mdev,
c012949a 1117 struct p_header *h, struct bm_xfer_ctx *c)
b411b363
PR
1118{
1119 struct p_compressed_bm *p = (void*)h;
1120 unsigned long num_words;
1121 int len;
1122 int ok;
1123
1124 len = fill_bitmap_rle_bits(mdev, p, c);
1125
1126 if (len < 0)
f70af118 1127 return -EIO;
b411b363
PR
1128
1129 if (len) {
1130 DCBP_set_code(p, RLE_VLI_Bits);
e42325a5 1131 ok = _drbd_send_cmd(mdev, mdev->tconn->data.socket, P_COMPRESSED_BITMAP, h,
b411b363
PR
1132 sizeof(*p) + len, 0);
1133
1134 c->packets[0]++;
1135 c->bytes[0] += sizeof(*p) + len;
1136
1137 if (c->bit_offset >= c->bm_bits)
1138 len = 0; /* DONE */
1139 } else {
1140 /* was not compressible.
1141 * send a buffer full of plain text bits instead. */
1142 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1143 len = num_words * sizeof(long);
1144 if (len)
1145 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
e42325a5 1146 ok = _drbd_send_cmd(mdev, mdev->tconn->data.socket, P_BITMAP,
0b70a13d 1147 h, sizeof(struct p_header80) + len, 0);
b411b363
PR
1148 c->word_offset += num_words;
1149 c->bit_offset = c->word_offset * BITS_PER_LONG;
1150
1151 c->packets[1]++;
0b70a13d 1152 c->bytes[1] += sizeof(struct p_header80) + len;
b411b363
PR
1153
1154 if (c->bit_offset > c->bm_bits)
1155 c->bit_offset = c->bm_bits;
1156 }
f70af118
AG
1157 if (ok) {
1158 if (len == 0) {
1159 INFO_bm_xfer_stats(mdev, "send", c);
1160 return 0;
1161 } else
1162 return 1;
1163 }
1164 return -EIO;
b411b363
PR
1165}
1166
1167/* See the comment at receive_bitmap() */
1168int _drbd_send_bitmap(struct drbd_conf *mdev)
1169{
1170 struct bm_xfer_ctx c;
c012949a 1171 struct p_header *p;
f70af118 1172 int err;
b411b363 1173
841ce241
AG
1174 if (!expect(mdev->bitmap))
1175 return false;
b411b363
PR
1176
1177 /* maybe we should use some per thread scratch page,
1178 * and allocate that during initial device creation? */
c012949a 1179 p = (struct p_header *) __get_free_page(GFP_NOIO);
b411b363
PR
1180 if (!p) {
1181 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
81e84650 1182 return false;
b411b363
PR
1183 }
1184
1185 if (get_ldev(mdev)) {
1186 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1187 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
1188 drbd_bm_set_all(mdev);
1189 if (drbd_bm_write(mdev)) {
1190 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
1191 * but otherwise process as per normal - need to tell other
1192 * side that a full resync is required! */
1193 dev_err(DEV, "Failed to write bitmap to disk!\n");
1194 } else {
1195 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
1196 drbd_md_sync(mdev);
1197 }
1198 }
1199 put_ldev(mdev);
1200 }
1201
1202 c = (struct bm_xfer_ctx) {
1203 .bm_bits = drbd_bm_bits(mdev),
1204 .bm_words = drbd_bm_words(mdev),
1205 };
1206
1207 do {
f70af118
AG
1208 err = send_bitmap_rle_or_plain(mdev, p, &c);
1209 } while (err > 0);
b411b363
PR
1210
1211 free_page((unsigned long) p);
f70af118 1212 return err == 0;
b411b363
PR
1213}
1214
1215int drbd_send_bitmap(struct drbd_conf *mdev)
1216{
1217 int err;
1218
61120870 1219 if (!drbd_get_data_sock(mdev->tconn))
b411b363
PR
1220 return -1;
1221 err = !_drbd_send_bitmap(mdev);
61120870 1222 drbd_put_data_sock(mdev->tconn);
b411b363
PR
1223 return err;
1224}
1225
1226int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
1227{
1228 int ok;
1229 struct p_barrier_ack p;
1230
1231 p.barrier = barrier_nr;
1232 p.set_size = cpu_to_be32(set_size);
1233
1234 if (mdev->state.conn < C_CONNECTED)
81e84650 1235 return false;
c012949a 1236 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, &p.head, sizeof(p));
b411b363
PR
1237 return ok;
1238}
1239
1240/**
1241 * _drbd_send_ack() - Sends an ack packet
1242 * @mdev: DRBD device.
1243 * @cmd: Packet command code.
1244 * @sector: sector, needs to be in big endian byte order
1245 * @blksize: size in byte, needs to be in big endian byte order
1246 * @block_id: Id, big endian byte order
1247 */
d8763023
AG
1248static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
1249 u64 sector, u32 blksize, u64 block_id)
b411b363
PR
1250{
1251 int ok;
1252 struct p_block_ack p;
1253
1254 p.sector = sector;
1255 p.block_id = block_id;
1256 p.blksize = blksize;
8ccf218e 1257 p.seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));
b411b363 1258
e42325a5 1259 if (!mdev->tconn->meta.socket || mdev->state.conn < C_CONNECTED)
81e84650 1260 return false;
c012949a 1261 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, &p.head, sizeof(p));
b411b363
PR
1262 return ok;
1263}
1264
2b2bf214
LE
1265/* dp->sector and dp->block_id already/still in network byte order,
1266 * data_size is payload size according to dp->head,
1267 * and may need to be corrected for digest size. */
d8763023 1268int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
2b2bf214 1269 struct p_data *dp, int data_size)
b411b363 1270{
a0638456
PR
1271 data_size -= (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_r_tfm) ?
1272 crypto_hash_digestsize(mdev->tconn->integrity_r_tfm) : 0;
b411b363
PR
1273 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
1274 dp->block_id);
1275}
1276
d8763023 1277int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
b411b363
PR
1278 struct p_block_req *rp)
1279{
1280 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
1281}
1282
1283/**
1284 * drbd_send_ack() - Sends an ack packet
db830c46
AG
1285 * @mdev: DRBD device
1286 * @cmd: packet command code
1287 * @peer_req: peer request
b411b363 1288 */
d8763023 1289int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
db830c46 1290 struct drbd_peer_request *peer_req)
b411b363
PR
1291{
1292 return _drbd_send_ack(mdev, cmd,
db830c46
AG
1293 cpu_to_be64(peer_req->i.sector),
1294 cpu_to_be32(peer_req->i.size),
1295 peer_req->block_id);
b411b363
PR
1296}
1297
1298/* This function misuses the block_id field to signal if the blocks
1299 * are in sync or not. */
d8763023 1300int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
b411b363
PR
1301 sector_t sector, int blksize, u64 block_id)
1302{
1303 return _drbd_send_ack(mdev, cmd,
1304 cpu_to_be64(sector),
1305 cpu_to_be32(blksize),
1306 cpu_to_be64(block_id));
1307}
1308
1309int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
1310 sector_t sector, int size, u64 block_id)
1311{
1312 int ok;
1313 struct p_block_req p;
1314
1315 p.sector = cpu_to_be64(sector);
1316 p.block_id = block_id;
1317 p.blksize = cpu_to_be32(size);
1318
c012949a 1319 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &p.head, sizeof(p));
b411b363
PR
1320 return ok;
1321}
1322
d8763023
AG
1323int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
1324 void *digest, int digest_size, enum drbd_packet cmd)
b411b363
PR
1325{
1326 int ok;
1327 struct p_block_req p;
1328
fd340c12 1329 prepare_header(mdev, &p.head, cmd, sizeof(p) - sizeof(struct p_header) + digest_size);
b411b363 1330 p.sector = cpu_to_be64(sector);
9a8e7753 1331 p.block_id = ID_SYNCER /* unused */;
b411b363
PR
1332 p.blksize = cpu_to_be32(size);
1333
e42325a5 1334 mutex_lock(&mdev->tconn->data.mutex);
b411b363 1335
bedbd2a5
PR
1336 ok = (sizeof(p) == drbd_send(mdev->tconn, mdev->tconn->data.socket, &p, sizeof(p), 0));
1337 ok = ok && (digest_size == drbd_send(mdev->tconn, mdev->tconn->data.socket, digest, digest_size, 0));
b411b363 1338
e42325a5 1339 mutex_unlock(&mdev->tconn->data.mutex);
b411b363
PR
1340
1341 return ok;
1342}
1343
1344int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
1345{
1346 int ok;
1347 struct p_block_req p;
1348
1349 p.sector = cpu_to_be64(sector);
9a8e7753 1350 p.block_id = ID_SYNCER /* unused */;
b411b363
PR
1351 p.blksize = cpu_to_be32(size);
1352
c012949a 1353 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, &p.head, sizeof(p));
b411b363
PR
1354 return ok;
1355}
1356
1357/* called on sndtimeo
81e84650
AG
1358 * returns false if we should retry,
1359 * true if we think connection is dead
b411b363 1360 */
1a7ba646 1361static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock)
b411b363
PR
1362{
1363 int drop_it;
1364 /* long elapsed = (long)(jiffies - mdev->last_received); */
1365
1a7ba646
PR
1366 drop_it = tconn->meta.socket == sock
1367 || !tconn->asender.task
1368 || get_t_state(&tconn->asender) != RUNNING
bbeb641c 1369 || tconn->cstate < C_WF_REPORT_PARAMS;
b411b363
PR
1370
1371 if (drop_it)
81e84650 1372 return true;
b411b363 1373
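	/* every send timeout ends up here and decrements ko_count; only when it
	 * reaches zero do we give up on the connection, until then we just log
	 * and ask the peer for a ping */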
1a7ba646 1374 drop_it = !--tconn->ko_count;
b411b363 1375 if (!drop_it) {
1a7ba646
PR
1376 conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
1377 current->comm, current->pid, tconn->ko_count);
1378 request_ping(tconn);
b411b363
PR
1379 }
1380
1381 return drop_it; /* && (mdev->state == R_PRIMARY) */;
1382}
1383
1a7ba646 1384static void drbd_update_congested(struct drbd_tconn *tconn)
9e204cdd 1385{
1a7ba646 1386 struct sock *sk = tconn->data.socket->sk;
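	/* consider the link congested once more than 4/5 of the socket's
	 * send buffer is queued up */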
9e204cdd 1387 if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
1a7ba646 1388 set_bit(NET_CONGESTED, &tconn->flags);
9e204cdd
AG
1389}
1390
b411b363
PR
1391/* The idea of sendpage seems to be to put some kind of reference
1392 * to the page into the skb, and to hand it over to the NIC. In
1393 * this process get_page() gets called.
1394 *
1395 * As soon as the page was really sent over the network put_page()
1396 * gets called by some part of the network layer. [ NIC driver? ]
1397 *
1398 * [ get_page() / put_page() increment/decrement the count. If count
1399 * reaches 0 the page will be freed. ]
1400 *
1401 * This works nicely with pages from FSs.
1402 * But this means that in protocol A we might signal IO completion too early!
1403 *
1404 * In order not to corrupt data during a resync we must make sure
1405 * that we do not reuse our own buffer pages (EEs) too early, therefore
1406 * we have the net_ee list.
1407 *
1408 * XFS seems to have problems, still, it submits pages with page_count == 0!
1409 * As a workaround, we disable sendpage on pages
1410 * with page_count == 0 or PageSlab.
1411 */
1412static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 1413 int offset, size_t size, unsigned msg_flags)
b411b363 1414{
bedbd2a5 1415 int sent = drbd_send(mdev->tconn, mdev->tconn->data.socket, kmap(page) + offset, size, msg_flags);
b411b363
PR
1416 kunmap(page);
1417 if (sent == size)
1418 mdev->send_cnt += size>>9;
1419 return sent == size;
1420}
1421
1422static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 1423 int offset, size_t size, unsigned msg_flags)
b411b363
PR
1424{
1425 mm_segment_t oldfs = get_fs();
1426 int sent, ok;
1427 int len = size;
1428
1429 /* e.g. XFS meta- & log-data is in slab pages, which have a
1430 * page_count of 0 and/or have PageSlab() set.
1431 * we cannot use send_page for those, as that does get_page();
1432 * put_page(); and would cause either a VM_BUG directly, or
1433 * __page_cache_release a page that would actually still be referenced
1434 * by someone, leading to some obscure delayed Oops somewhere else. */
1435 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
ba11ad9a 1436 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
b411b363 1437
ba11ad9a 1438 msg_flags |= MSG_NOSIGNAL;
1a7ba646 1439 drbd_update_congested(mdev->tconn);
b411b363
PR
1440 set_fs(KERNEL_DS);
1441 do {
e42325a5 1442 sent = mdev->tconn->data.socket->ops->sendpage(mdev->tconn->data.socket, page,
b411b363 1443 offset, len,
ba11ad9a 1444 msg_flags);
b411b363 1445 if (sent == -EAGAIN) {
1a7ba646 1446 if (we_should_drop_the_connection(mdev->tconn,
e42325a5 1447 mdev->tconn->data.socket))
b411b363
PR
1448 break;
1449 else
1450 continue;
1451 }
1452 if (sent <= 0) {
1453 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
1454 __func__, (int)size, len, sent);
1455 break;
1456 }
1457 len -= sent;
1458 offset += sent;
1459 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
1460 set_fs(oldfs);
01a311a5 1461 clear_bit(NET_CONGESTED, &mdev->tconn->flags);
b411b363
PR
1462
1463 ok = (len == 0);
1464 if (likely(ok))
1465 mdev->send_cnt += size>>9;
1466 return ok;
1467}
1468
1469static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
1470{
1471 struct bio_vec *bvec;
1472 int i;
ba11ad9a 1473 /* hint all but last page with MSG_MORE */
b411b363
PR
1474 __bio_for_each_segment(bvec, bio, i, 0) {
1475 if (!_drbd_no_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
1476 bvec->bv_offset, bvec->bv_len,
1477 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
1478 return 0;
1479 }
1480 return 1;
1481}
1482
1483static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
1484{
1485 struct bio_vec *bvec;
1486 int i;
ba11ad9a 1487 /* hint all but last page with MSG_MORE */
b411b363
PR
1488 __bio_for_each_segment(bvec, bio, i, 0) {
1489 if (!_drbd_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
1490 bvec->bv_offset, bvec->bv_len,
1491 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
1492 return 0;
1493 }
b411b363
PR
1494 return 1;
1495}
1496
db830c46
AG
1497static int _drbd_send_zc_ee(struct drbd_conf *mdev,
1498 struct drbd_peer_request *peer_req)
45bb912b 1499{
db830c46
AG
1500 struct page *page = peer_req->pages;
1501 unsigned len = peer_req->i.size;
1502
ba11ad9a 1503 /* hint all but last page with MSG_MORE */
45bb912b
LE
1504 page_chain_for_each(page) {
1505 unsigned l = min_t(unsigned, len, PAGE_SIZE);
ba11ad9a
LE
1506 if (!_drbd_send_page(mdev, page, 0, l,
1507 page_chain_next(page) ? MSG_MORE : 0))
45bb912b
LE
1508 return 0;
1509 len -= l;
1510 }
1511 return 1;
1512}
1513
76d2e7ec
PR
1514static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
1515{
31890f4a 1516 if (mdev->tconn->agreed_pro_version >= 95)
76d2e7ec 1517 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
76d2e7ec
PR
1518 (bi_rw & REQ_FUA ? DP_FUA : 0) |
1519 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
1520 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
1521 else
721a9602 1522 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
76d2e7ec
PR
1523}
1524
b411b363
PR
1525/* Used to send write requests
1526 * R_PRIMARY -> Peer (P_DATA)
1527 */
1528int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
1529{
1530 int ok = 1;
1531 struct p_data p;
1532 unsigned int dp_flags = 0;
1533 void *dgb;
1534 int dgs;
1535
61120870 1536 if (!drbd_get_data_sock(mdev->tconn))
b411b363
PR
1537 return 0;
1538
a0638456
PR
1539 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
1540 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
b411b363 1541
fd340c12 1542 prepare_header(mdev, &p.head, P_DATA, sizeof(p) - sizeof(struct p_header) + dgs + req->i.size);
ace652ac 1543 p.sector = cpu_to_be64(req->i.sector);
b411b363 1544 p.block_id = (unsigned long)req;
8ccf218e 1545 p.seq_num = cpu_to_be32(req->seq_num = atomic_inc_return(&mdev->packet_seq));
b411b363 1546
76d2e7ec
PR
1547 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
1548
b411b363
PR
1549 if (mdev->state.conn >= C_SYNC_SOURCE &&
1550 mdev->state.conn <= C_PAUSED_SYNC_T)
1551 dp_flags |= DP_MAY_SET_IN_SYNC;
1552
1553 p.dp_flags = cpu_to_be32(dp_flags);
b411b363
PR
1554 set_bit(UNPLUG_REMOTE, &mdev->flags);
1555 ok = (sizeof(p) ==
bedbd2a5 1556 drbd_send(mdev->tconn, mdev->tconn->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
b411b363 1557 if (ok && dgs) {
a0638456
PR
1558 dgb = mdev->tconn->int_dig_out;
1559 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, dgb);
bedbd2a5 1560 ok = dgs == drbd_send(mdev->tconn, mdev->tconn->data.socket, dgb, dgs, 0);
b411b363
PR
1561 }
1562 if (ok) {
470be44a
LE
1563 /* For protocol A, we have to memcpy the payload into
1564 * socket buffers, as we may complete right away
1565 * as soon as we handed it over to tcp, at which point the data
1566 * pages may become invalid.
1567 *
1568 * For data-integrity enabled, we copy it as well, so we can be
1569 * sure that even if the bio pages may still be modified, it
1570 * won't change the data on the wire, thus if the digest checks
1571 * out ok after sending on this side, but does not fit on the
1572 * receiving side, we sure have detected corruption elsewhere.
1573 */
89e58e75 1574 if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A || dgs)
b411b363
PR
1575 ok = _drbd_send_bio(mdev, req->master_bio);
1576 else
1577 ok = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
1578
1579 /* double check digest, sometimes buffers have been modified in flight. */
1580 if (dgs > 0 && dgs <= 64) {
24c4830c 1581 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
1582 * currently supported in kernel crypto. */
1583 unsigned char digest[64];
a0638456
PR
1584 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, digest);
1585 if (memcmp(mdev->tconn->int_dig_out, digest, dgs)) {
470be44a
LE
1586 dev_warn(DEV,
1587 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
ace652ac 1588 (unsigned long long)req->i.sector, req->i.size);
470be44a
LE
1589 }
1590 } /* else if (dgs > 64) {
1591 ... Be noisy about digest too large ...
1592 } */
b411b363
PR
1593 }
1594
61120870 1595 drbd_put_data_sock(mdev->tconn);
bd26bfc5 1596
b411b363
PR
1597 return ok;
1598}
1599
1600/* answer packet, used to send data back for read requests:
1601 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
1602 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
1603 */
d8763023 1604int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
db830c46 1605 struct drbd_peer_request *peer_req)
b411b363
PR
1606{
1607 int ok;
1608 struct p_data p;
1609 void *dgb;
1610 int dgs;
1611
a0638456
PR
1612 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
1613 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
b411b363 1614
db830c46
AG
1615 prepare_header(mdev, &p.head, cmd, sizeof(p) -
1616 sizeof(struct p_header80) +
1617 dgs + peer_req->i.size);
1618 p.sector = cpu_to_be64(peer_req->i.sector);
1619 p.block_id = peer_req->block_id;
cc378270 1620 p.seq_num = 0; /* unused */
b411b363
PR
1621
1622 /* Only called by our kernel thread.
1623 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
1624 * in response to admin command or module unload.
1625 */
61120870 1626 if (!drbd_get_data_sock(mdev->tconn))
b411b363
PR
1627 return 0;
1628
bedbd2a5 1629 ok = sizeof(p) == drbd_send(mdev->tconn, mdev->tconn->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
b411b363 1630 if (ok && dgs) {
a0638456 1631 dgb = mdev->tconn->int_dig_out;
db830c46 1632 drbd_csum_ee(mdev, mdev->tconn->integrity_w_tfm, peer_req, dgb);
bedbd2a5 1633 ok = dgs == drbd_send(mdev->tconn, mdev->tconn->data.socket, dgb, dgs, 0);
b411b363
PR
1634 }
1635 if (ok)
db830c46 1636 ok = _drbd_send_zc_ee(mdev, peer_req);
b411b363 1637
61120870 1638 drbd_put_data_sock(mdev->tconn);
bd26bfc5 1639
b411b363
PR
1640 return ok;
1641}
1642
73a01a18
PR
1643int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
1644{
1645 struct p_block_desc p;
1646
ace652ac
AG
1647 p.sector = cpu_to_be64(req->i.sector);
1648 p.blksize = cpu_to_be32(req->i.size);
73a01a18
PR
1649
1650 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
1651}
1652
b411b363
PR
1653/*
1654 drbd_send distinguishes two cases:
1655
1656 Packets sent via the data socket "sock"
1657 and packets sent via the meta data socket "msock"
1658
 1659                     sock                      msock
 1660   -----------------+-------------------------+------------------------------
 1661   timeout           conf.timeout / 2          conf.timeout / 2
 1662   timeout action    send a ping via msock     Abort communication
 1663                                               and close all sockets
1664*/
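/*
 * Illustrative sketch (not part of the driver): the typical data-socket
 * caller in this file, e.g. drbd_send_block() above, wraps drbd_send()
 * like this (error handling elided; names are the ones defined here):
 *
 *	if (!drbd_get_data_sock(mdev->tconn))
 *		return 0;
 *	ok = sizeof(p) == drbd_send(mdev->tconn, mdev->tconn->data.socket,
 *				    &p, sizeof(p), dgs ? MSG_MORE : 0);
 *	... optionally send the digest, then the payload ...
 *	drbd_put_data_sock(mdev->tconn);
 */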
1665
1666/*
1667 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
1668 */
bedbd2a5 1669int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
b411b363
PR
1670 void *buf, size_t size, unsigned msg_flags)
1671{
1672 struct kvec iov;
1673 struct msghdr msg;
1674 int rv, sent = 0;
1675
1676 if (!sock)
1677 return -1000;
1678
1679 /* THINK if (signal_pending) return ... ? */
1680
1681 iov.iov_base = buf;
1682 iov.iov_len = size;
1683
1684 msg.msg_name = NULL;
1685 msg.msg_namelen = 0;
1686 msg.msg_control = NULL;
1687 msg.msg_controllen = 0;
1688 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
1689
bedbd2a5
PR
1690 if (sock == tconn->data.socket) {
1691 tconn->ko_count = tconn->net_conf->ko_count;
1692 drbd_update_congested(tconn);
b411b363
PR
1693 }
1694 do {
1695 /* STRANGE
1696 * tcp_sendmsg does _not_ use its size parameter at all ?
1697 *
1698 * -EAGAIN on timeout, -EINTR on signal.
1699 */
1700/* THINK
1701 * do we need to block DRBD_SIG if sock == &meta.socket ??
1702 * otherwise wake_asender() might interrupt some send_*Ack !
1703 */
1704 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
1705 if (rv == -EAGAIN) {
bedbd2a5 1706 if (we_should_drop_the_connection(tconn, sock))
b411b363
PR
1707 break;
1708 else
1709 continue;
1710 }
b411b363
PR
1711 if (rv == -EINTR) {
1712 flush_signals(current);
1713 rv = 0;
1714 }
1715 if (rv < 0)
1716 break;
1717 sent += rv;
1718 iov.iov_base += rv;
1719 iov.iov_len -= rv;
1720 } while (sent < size);
1721
bedbd2a5
PR
1722 if (sock == tconn->data.socket)
1723 clear_bit(NET_CONGESTED, &tconn->flags);
b411b363
PR
1724
1725 if (rv <= 0) {
1726 if (rv != -EAGAIN) {
bedbd2a5
PR
1727 conn_err(tconn, "%s_sendmsg returned %d\n",
1728 sock == tconn->meta.socket ? "msock" : "sock",
1729 rv);
bbeb641c 1730 conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
b411b363 1731 } else
bbeb641c 1732 conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD);
b411b363
PR
1733 }
1734
1735 return sent;
1736}
1737
1738static int drbd_open(struct block_device *bdev, fmode_t mode)
1739{
1740 struct drbd_conf *mdev = bdev->bd_disk->private_data;
1741 unsigned long flags;
1742 int rv = 0;
1743
2a48fc0a 1744 mutex_lock(&drbd_main_mutex);
87eeee41 1745 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
b411b363
PR
1746 /* to have a stable mdev->state.role
1747 * and no race with updating open_cnt */
1748
1749 if (mdev->state.role != R_PRIMARY) {
1750 if (mode & FMODE_WRITE)
1751 rv = -EROFS;
1752 else if (!allow_oos)
1753 rv = -EMEDIUMTYPE;
1754 }
1755
1756 if (!rv)
1757 mdev->open_cnt++;
87eeee41 1758 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
2a48fc0a 1759 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1760
1761 return rv;
1762}
1763
1764static int drbd_release(struct gendisk *gd, fmode_t mode)
1765{
1766 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 1767 mutex_lock(&drbd_main_mutex);
b411b363 1768 mdev->open_cnt--;
2a48fc0a 1769 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1770 return 0;
1771}
1772
b411b363
PR
1773static void drbd_set_defaults(struct drbd_conf *mdev)
1774{
85f4cc17
PR
1775 /* This way we get a compile error when sync_conf grows,
 1776	   and forget to initialize it here */
1777 mdev->sync_conf = (struct syncer_conf) {
1778 /* .rate = */ DRBD_RATE_DEF,
1779 /* .after = */ DRBD_AFTER_DEF,
1780 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
85f4cc17
PR
1781 /* .verify_alg = */ {}, 0,
1782 /* .cpu_mask = */ {}, 0,
1783 /* .csums_alg = */ {}, 0,
e756414f 1784 /* .use_rle = */ 0,
9a31d716
PR
1785 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
1786 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
1787 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
1788 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
0f0601f4
LE
1789 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
1790 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
85f4cc17
PR
1791 };
1792
 1793	/* Have to do it this way, because the layout differs between
1794 big endian and little endian */
b411b363
PR
1795 mdev->state = (union drbd_state) {
1796 { .role = R_SECONDARY,
1797 .peer = R_UNKNOWN,
1798 .conn = C_STANDALONE,
1799 .disk = D_DISKLESS,
1800 .pdsk = D_UNKNOWN,
fb22c402
PR
1801 .susp = 0,
1802 .susp_nod = 0,
1803 .susp_fen = 0
b411b363
PR
1804 } };
1805}
1806
1807void drbd_init_set_defaults(struct drbd_conf *mdev)
1808{
1809 /* the memset(,0,) did most of this.
1810 * note: only assignments, no allocation in here */
1811
1812 drbd_set_defaults(mdev);
1813
b411b363
PR
1814 atomic_set(&mdev->ap_bio_cnt, 0);
1815 atomic_set(&mdev->ap_pending_cnt, 0);
1816 atomic_set(&mdev->rs_pending_cnt, 0);
1817 atomic_set(&mdev->unacked_cnt, 0);
1818 atomic_set(&mdev->local_cnt, 0);
b411b363 1819 atomic_set(&mdev->pp_in_use, 0);
435f0740 1820 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 1821 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 1822 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 1823 atomic_set(&mdev->ap_in_flight, 0);
b411b363
PR
1824
1825 mutex_init(&mdev->md_io_mutex);
8410da8f
PR
1826 mutex_init(&mdev->own_state_mutex);
1827 mdev->state_mutex = &mdev->own_state_mutex;
b411b363 1828
b411b363 1829 spin_lock_init(&mdev->al_lock);
b411b363
PR
1830 spin_lock_init(&mdev->peer_seq_lock);
1831 spin_lock_init(&mdev->epoch_lock);
1832
1833 INIT_LIST_HEAD(&mdev->active_ee);
1834 INIT_LIST_HEAD(&mdev->sync_ee);
1835 INIT_LIST_HEAD(&mdev->done_ee);
1836 INIT_LIST_HEAD(&mdev->read_ee);
1837 INIT_LIST_HEAD(&mdev->net_ee);
1838 INIT_LIST_HEAD(&mdev->resync_reads);
b411b363
PR
1839 INIT_LIST_HEAD(&mdev->resync_work.list);
1840 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 1841 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 1842 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 1843 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 1844 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 1845
794abb75 1846 mdev->resync_work.cb = w_resync_timer;
b411b363 1847 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 1848 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
1849 mdev->md_sync_work.cb = w_md_sync;
1850 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 1851 mdev->start_resync_work.cb = w_start_resync;
a21e9298
PR
1852
1853 mdev->resync_work.mdev = mdev;
1854 mdev->unplug_work.mdev = mdev;
1855 mdev->go_diskless.mdev = mdev;
1856 mdev->md_sync_work.mdev = mdev;
1857 mdev->bm_io_work.w.mdev = mdev;
1858 mdev->start_resync_work.mdev = mdev;
1859
b411b363
PR
1860 init_timer(&mdev->resync_timer);
1861 init_timer(&mdev->md_sync_timer);
370a43e7 1862 init_timer(&mdev->start_resync_timer);
7fde2be9 1863 init_timer(&mdev->request_timer);
b411b363
PR
1864 mdev->resync_timer.function = resync_timer_fn;
1865 mdev->resync_timer.data = (unsigned long) mdev;
1866 mdev->md_sync_timer.function = md_sync_timer_fn;
1867 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
1868 mdev->start_resync_timer.function = start_resync_timer_fn;
1869 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
1870 mdev->request_timer.function = request_timer_fn;
1871 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
1872
1873 init_waitqueue_head(&mdev->misc_wait);
1874 init_waitqueue_head(&mdev->state_wait);
1875 init_waitqueue_head(&mdev->ee_wait);
1876 init_waitqueue_head(&mdev->al_wait);
1877 init_waitqueue_head(&mdev->seq_wait);
1878
fd340c12 1879 /* mdev->tconn->agreed_pro_version gets initialized in drbd_connect() */
2451fc3b 1880 mdev->write_ordering = WO_bdev_flush;
b411b363 1881 mdev->resync_wenr = LC_FREE;
99432fcc
PR
1882 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
1883 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
1884}
1885
1886void drbd_mdev_cleanup(struct drbd_conf *mdev)
1887{
1d7734a0 1888 int i;
e6b3ea83 1889 if (mdev->tconn->receiver.t_state != NONE)
b411b363 1890 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
e6b3ea83 1891 mdev->tconn->receiver.t_state);
b411b363
PR
1892
1893 /* no need to lock it, I'm the only thread alive */
1894 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
1895 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
1896 mdev->al_writ_cnt =
1897 mdev->bm_writ_cnt =
1898 mdev->read_cnt =
1899 mdev->recv_cnt =
1900 mdev->send_cnt =
1901 mdev->writ_cnt =
1902 mdev->p_size =
1903 mdev->rs_start =
1904 mdev->rs_total =
1d7734a0
LE
1905 mdev->rs_failed = 0;
1906 mdev->rs_last_events = 0;
0f0601f4 1907 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
1908 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1909 mdev->rs_mark_left[i] = 0;
1910 mdev->rs_mark_time[i] = 0;
1911 }
89e58e75 1912 D_ASSERT(mdev->tconn->net_conf == NULL);
b411b363
PR
1913
1914 drbd_set_my_capacity(mdev, 0);
1915 if (mdev->bitmap) {
1916 /* maybe never allocated. */
02d9a94b 1917 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
1918 drbd_bm_cleanup(mdev);
1919 }
1920
1921 drbd_free_resources(mdev);
0778286a 1922 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363
PR
1923
1924 /*
 1925	 * currently we call drbd_init_ee only on module load, so
 1926	 * we may call drbd_release_ee only on module unload!
1927 */
1928 D_ASSERT(list_empty(&mdev->active_ee));
1929 D_ASSERT(list_empty(&mdev->sync_ee));
1930 D_ASSERT(list_empty(&mdev->done_ee));
1931 D_ASSERT(list_empty(&mdev->read_ee));
1932 D_ASSERT(list_empty(&mdev->net_ee));
1933 D_ASSERT(list_empty(&mdev->resync_reads));
e42325a5
PR
1934 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
1935 D_ASSERT(list_empty(&mdev->tconn->meta.work.q));
b411b363
PR
1936 D_ASSERT(list_empty(&mdev->resync_work.list));
1937 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 1938 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
1939
1940 drbd_set_defaults(mdev);
b411b363
PR
1941}
1942
1943
1944static void drbd_destroy_mempools(void)
1945{
1946 struct page *page;
1947
1948 while (drbd_pp_pool) {
1949 page = drbd_pp_pool;
1950 drbd_pp_pool = (struct page *)page_private(page);
1951 __free_page(page);
1952 drbd_pp_vacant--;
1953 }
1954
1955 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
1956
1957 if (drbd_ee_mempool)
1958 mempool_destroy(drbd_ee_mempool);
1959 if (drbd_request_mempool)
1960 mempool_destroy(drbd_request_mempool);
1961 if (drbd_ee_cache)
1962 kmem_cache_destroy(drbd_ee_cache);
1963 if (drbd_request_cache)
1964 kmem_cache_destroy(drbd_request_cache);
1965 if (drbd_bm_ext_cache)
1966 kmem_cache_destroy(drbd_bm_ext_cache);
1967 if (drbd_al_ext_cache)
1968 kmem_cache_destroy(drbd_al_ext_cache);
1969
1970 drbd_ee_mempool = NULL;
1971 drbd_request_mempool = NULL;
1972 drbd_ee_cache = NULL;
1973 drbd_request_cache = NULL;
1974 drbd_bm_ext_cache = NULL;
1975 drbd_al_ext_cache = NULL;
1976
1977 return;
1978}
1979
1980static int drbd_create_mempools(void)
1981{
1982 struct page *page;
1816a2b4 1983 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
1984 int i;
1985
1986 /* prepare our caches and mempools */
1987 drbd_request_mempool = NULL;
1988 drbd_ee_cache = NULL;
1989 drbd_request_cache = NULL;
1990 drbd_bm_ext_cache = NULL;
1991 drbd_al_ext_cache = NULL;
1992 drbd_pp_pool = NULL;
1993
1994 /* caches */
1995 drbd_request_cache = kmem_cache_create(
1996 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
1997 if (drbd_request_cache == NULL)
1998 goto Enomem;
1999
2000 drbd_ee_cache = kmem_cache_create(
f6ffca9f 2001 "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
b411b363
PR
2002 if (drbd_ee_cache == NULL)
2003 goto Enomem;
2004
2005 drbd_bm_ext_cache = kmem_cache_create(
2006 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2007 if (drbd_bm_ext_cache == NULL)
2008 goto Enomem;
2009
2010 drbd_al_ext_cache = kmem_cache_create(
2011 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2012 if (drbd_al_ext_cache == NULL)
2013 goto Enomem;
2014
2015 /* mempools */
2016 drbd_request_mempool = mempool_create(number,
2017 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2018 if (drbd_request_mempool == NULL)
2019 goto Enomem;
2020
2021 drbd_ee_mempool = mempool_create(number,
2022 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 2023 if (drbd_ee_mempool == NULL)
b411b363
PR
2024 goto Enomem;
2025
2026 /* drbd's page pool */
2027 spin_lock_init(&drbd_pp_lock);
2028
2029 for (i = 0; i < number; i++) {
2030 page = alloc_page(GFP_HIGHUSER);
2031 if (!page)
2032 goto Enomem;
2033 set_page_private(page, (unsigned long)drbd_pp_pool);
2034 drbd_pp_pool = page;
2035 }
2036 drbd_pp_vacant = number;
2037
2038 return 0;
2039
2040Enomem:
2041 drbd_destroy_mempools(); /* in case we allocated some */
2042 return -ENOMEM;
2043}
2044
2045static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2046 void *unused)
2047{
2048 /* just so we have it. you never know what interesting things we
2049 * might want to do here some day...
2050 */
2051
2052 return NOTIFY_DONE;
2053}
2054
2055static struct notifier_block drbd_notifier = {
2056 .notifier_call = drbd_notify_sys,
2057};
2058
2059static void drbd_release_ee_lists(struct drbd_conf *mdev)
2060{
2061 int rr;
2062
2063 rr = drbd_release_ee(mdev, &mdev->active_ee);
2064 if (rr)
2065 dev_err(DEV, "%d EEs in active list found!\n", rr);
2066
2067 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2068 if (rr)
2069 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2070
2071 rr = drbd_release_ee(mdev, &mdev->read_ee);
2072 if (rr)
2073 dev_err(DEV, "%d EEs in read list found!\n", rr);
2074
2075 rr = drbd_release_ee(mdev, &mdev->done_ee);
2076 if (rr)
2077 dev_err(DEV, "%d EEs in done list found!\n", rr);
2078
2079 rr = drbd_release_ee(mdev, &mdev->net_ee);
2080 if (rr)
2081 dev_err(DEV, "%d EEs in net list found!\n", rr);
2082}
2083
774b3055
PR
2084/* caution. no locking. */
2085void drbd_delete_device(unsigned int minor)
b411b363
PR
2086{
2087 struct drbd_conf *mdev = minor_to_mdev(minor);
2088
2089 if (!mdev)
2090 return;
2091
774b3055
PR
2092 idr_remove(&mdev->tconn->volumes, minor);
2093
b411b363 2094 /* paranoia asserts */
70dc65e1 2095 D_ASSERT(mdev->open_cnt == 0);
e42325a5 2096 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
b411b363
PR
2097 /* end paranoia asserts */
2098
2099 del_gendisk(mdev->vdisk);
2100
2101 /* cleanup stuff that may have been allocated during
2102 * device (re-)configuration or state changes */
2103
2104 if (mdev->this_bdev)
2105 bdput(mdev->this_bdev);
2106
2107 drbd_free_resources(mdev);
2108
2109 drbd_release_ee_lists(mdev);
2110
b411b363
PR
2111 lc_destroy(mdev->act_log);
2112 lc_destroy(mdev->resync);
2113
2114 kfree(mdev->p_uuid);
2115 /* mdev->p_uuid = NULL; */
2116
b411b363
PR
2117 /* cleanup the rest that has been
2118 * allocated from drbd_new_device
2119 * and actually free the mdev itself */
2120 drbd_free_mdev(mdev);
2121}
2122
2123static void drbd_cleanup(void)
2124{
2125 unsigned int i;
2126
2127 unregister_reboot_notifier(&drbd_notifier);
2128
17a93f30
LE
2129 /* first remove proc,
 2130	 * drbdsetup uses its presence to detect
 2131	 * whether DRBD is loaded.
 2132	 * If we got stuck in proc removal,
2133 * but have netlink already deregistered,
2134 * some drbdsetup commands may wait forever
2135 * for an answer.
2136 */
2137 if (drbd_proc)
2138 remove_proc_entry("drbd", NULL);
2139
b411b363
PR
2140 drbd_nl_cleanup();
2141
2142 if (minor_table) {
b411b363
PR
2143 i = minor_count;
2144 while (i--)
2145 drbd_delete_device(i);
2146 drbd_destroy_mempools();
2147 }
2148
2149 kfree(minor_table);
2150
2151 unregister_blkdev(DRBD_MAJOR, "drbd");
2152
2153 printk(KERN_INFO "drbd: module cleanup done.\n");
2154}
2155
2156/**
2157 * drbd_congested() - Callback for pdflush
2158 * @congested_data: User data
2159 * @bdi_bits: Bits pdflush is currently interested in
2160 *
2161 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2162 */
2163static int drbd_congested(void *congested_data, int bdi_bits)
2164{
2165 struct drbd_conf *mdev = congested_data;
2166 struct request_queue *q;
2167 char reason = '-';
2168 int r = 0;
2169
1b881ef7 2170 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
2171 /* DRBD has frozen IO */
2172 r = bdi_bits;
2173 reason = 'd';
2174 goto out;
2175 }
2176
2177 if (get_ldev(mdev)) {
2178 q = bdev_get_queue(mdev->ldev->backing_bdev);
2179 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2180 put_ldev(mdev);
2181 if (r)
2182 reason = 'b';
2183 }
2184
01a311a5 2185 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) {
b411b363
PR
2186 r |= (1 << BDI_async_congested);
2187 reason = reason == 'b' ? 'a' : 'n';
2188 }
2189
2190out:
2191 mdev->congestion_reason = reason;
2192 return r;
2193}
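/*
 * Illustrative note: the block layer calls the congested_fn above with the
 * BDI bits it cares about; a non-zero return, e.g. (1 << BDI_async_congested),
 * tells it to back off writeback for this device.  mdev->congestion_reason
 * merely records why ('d' frozen IO, 'b' backing device, 'n' network,
 * 'a' both) for diagnostic output.
 */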
2194
6699b655
PR
2195static void drbd_init_workqueue(struct drbd_work_queue* wq)
2196{
2197 sema_init(&wq->s, 0);
2198 spin_lock_init(&wq->q_lock);
2199 INIT_LIST_HEAD(&wq->q);
2200}
2201
1aba4d7f
PR
2202struct drbd_tconn *conn_by_name(const char *name)
2203{
2204 struct drbd_tconn *tconn;
2205
2206 write_lock_irq(&global_state_lock);
2207 list_for_each_entry(tconn, &drbd_tconns, all_tconn) {
2208 if (!strcmp(tconn->name, name))
2209 goto found;
2210 }
2211 tconn = NULL;
2212found:
2213 write_unlock_irq(&global_state_lock);
2214 return tconn;
2215}
2216
2111438b
PR
2217struct drbd_tconn *drbd_new_tconn(char *name)
2218{
2219 struct drbd_tconn *tconn;
2220
2221 tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
2222 if (!tconn)
2223 return NULL;
2224
2225 tconn->name = kstrdup(name, GFP_KERNEL);
2226 if (!tconn->name)
2227 goto fail;
2228
774b3055
PR
2229 if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL))
2230 goto fail;
2231
2f5cdd0b
PR
2232 if (!tl_init(tconn))
2233 goto fail;
2234
bbeb641c 2235 tconn->cstate = C_STANDALONE;
8410da8f 2236 mutex_init(&tconn->cstate_mutex);
6699b655 2237 spin_lock_init(&tconn->req_lock);
b2fb6dbe
PR
2238 atomic_set(&tconn->net_cnt, 0);
2239 init_waitqueue_head(&tconn->net_cnt_wait);
2a67d8b9 2240 init_waitqueue_head(&tconn->ping_wait);
062e879c 2241 idr_init(&tconn->volumes);
b2fb6dbe 2242
6699b655
PR
2243 drbd_init_workqueue(&tconn->data.work);
2244 mutex_init(&tconn->data.mutex);
2245
2246 drbd_init_workqueue(&tconn->meta.work);
2247 mutex_init(&tconn->meta.mutex);
2248
392c8801
PR
2249 drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver");
2250 drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker");
2251 drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender");
2252
2111438b
PR
2253 write_lock_irq(&global_state_lock);
2254 list_add(&tconn->all_tconn, &drbd_tconns);
2255 write_unlock_irq(&global_state_lock);
2256
2257 return tconn;
2258
2259fail:
2f5cdd0b 2260 tl_cleanup(tconn);
774b3055 2261 free_cpumask_var(tconn->cpu_mask);
2111438b
PR
2262 kfree(tconn->name);
2263 kfree(tconn);
2264
2265 return NULL;
2266}
2267
2268void drbd_free_tconn(struct drbd_tconn *tconn)
2269{
2270 write_lock_irq(&global_state_lock);
2271 list_del(&tconn->all_tconn);
2272 write_unlock_irq(&global_state_lock);
062e879c 2273 idr_destroy(&tconn->volumes);
2111438b 2274
774b3055 2275 free_cpumask_var(tconn->cpu_mask);
2111438b 2276 kfree(tconn->name);
b42a70ad
PR
2277 kfree(tconn->int_dig_out);
2278 kfree(tconn->int_dig_in);
2279 kfree(tconn->int_dig_vv);
2111438b
PR
2280 kfree(tconn);
2281}
2282
774b3055 2283enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
b411b363
PR
2284{
2285 struct drbd_conf *mdev;
2286 struct gendisk *disk;
2287 struct request_queue *q;
774b3055
PR
2288 int vnr_got = vnr;
2289
2290 mdev = minor_to_mdev(minor);
2291 if (mdev)
2292 return ERR_MINOR_EXISTS;
b411b363
PR
2293
2294 /* GFP_KERNEL, we are outside of all write-out paths */
2295 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2296 if (!mdev)
774b3055
PR
2297 return ERR_NOMEM;
2298
2299 mdev->tconn = tconn;
2300 if (!idr_pre_get(&tconn->volumes, GFP_KERNEL))
2301 goto out_no_idr;
2302 if (idr_get_new(&tconn->volumes, mdev, &vnr_got))
2303 goto out_no_idr;
2304 if (vnr_got != vnr) {
2305 dev_err(DEV, "vnr_got (%d) != vnr (%d)\n", vnr_got, vnr);
2306 goto out_no_q;
062e879c 2307 }
b411b363
PR
2308
2309 mdev->minor = minor;
2310
2311 drbd_init_set_defaults(mdev);
2312
2313 q = blk_alloc_queue(GFP_KERNEL);
2314 if (!q)
2315 goto out_no_q;
2316 mdev->rq_queue = q;
2317 q->queuedata = mdev;
b411b363
PR
2318
2319 disk = alloc_disk(1);
2320 if (!disk)
2321 goto out_no_disk;
2322 mdev->vdisk = disk;
2323
81e84650 2324 set_disk_ro(disk, true);
b411b363
PR
2325
2326 disk->queue = q;
2327 disk->major = DRBD_MAJOR;
2328 disk->first_minor = minor;
2329 disk->fops = &drbd_ops;
2330 sprintf(disk->disk_name, "drbd%d", minor);
2331 disk->private_data = mdev;
2332
2333 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
2334 /* we have no partitions. we contain only ourselves. */
2335 mdev->this_bdev->bd_contains = mdev->this_bdev;
2336
2337 q->backing_dev_info.congested_fn = drbd_congested;
2338 q->backing_dev_info.congested_data = mdev;
2339
2f58dcfc 2340 blk_queue_make_request(q, drbd_make_request);
99432fcc
PR
 2341	/* Setting the max_hw_sectors to an odd value of 8 KiB here.
2342 This triggers a max_bio_size message upon first attach or connect */
2343 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
b411b363
PR
2344 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
2345 blk_queue_merge_bvec(q, drbd_merge_bvec);
87eeee41 2346 q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
b411b363
PR
2347
2348 mdev->md_io_page = alloc_page(GFP_KERNEL);
2349 if (!mdev->md_io_page)
2350 goto out_no_io_page;
2351
2352 if (drbd_bm_init(mdev))
2353 goto out_no_bitmap;
dac1389c 2354 mdev->read_requests = RB_ROOT;
de696716 2355 mdev->write_requests = RB_ROOT;
b411b363 2356
b411b363
PR
2357 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
2358 if (!mdev->current_epoch)
2359 goto out_no_epoch;
2360
2361 INIT_LIST_HEAD(&mdev->current_epoch->list);
2362 mdev->epochs = 1;
2363
774b3055
PR
2364 minor_table[minor] = mdev;
2365 add_disk(disk);
2366
2367 return NO_ERROR;
b411b363
PR
2368
2369/* out_whatever_else:
2370 kfree(mdev->current_epoch); */
2371out_no_epoch:
b411b363
PR
2372 drbd_bm_cleanup(mdev);
2373out_no_bitmap:
2374 __free_page(mdev->md_io_page);
2375out_no_io_page:
2376 put_disk(disk);
2377out_no_disk:
2378 blk_cleanup_queue(q);
2379out_no_q:
774b3055
PR
2380 idr_remove(&tconn->volumes, vnr_got);
2381out_no_idr:
b411b363 2382 kfree(mdev);
774b3055 2383 return ERR_NOMEM;
b411b363
PR
2384}
2385
2386/* counterpart of drbd_new_device.
2387 * last part of drbd_delete_device. */
2388void drbd_free_mdev(struct drbd_conf *mdev)
2389{
2390 kfree(mdev->current_epoch);
b411b363
PR
2391 if (mdev->bitmap) /* should no longer be there. */
2392 drbd_bm_cleanup(mdev);
2393 __free_page(mdev->md_io_page);
2394 put_disk(mdev->vdisk);
2395 blk_cleanup_queue(mdev->rq_queue);
b411b363
PR
2396 kfree(mdev);
2397}
2398
2399
2400int __init drbd_init(void)
2401{
2402 int err;
2403
fd340c12
PR
2404 BUILD_BUG_ON(sizeof(struct p_header80) != sizeof(struct p_header95));
2405 BUILD_BUG_ON(sizeof(struct p_handshake) != 80);
b411b363 2406
2b8a90b5 2407 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
b411b363
PR
2408 printk(KERN_ERR
2409 "drbd: invalid minor_count (%d)\n", minor_count);
2410#ifdef MODULE
2411 return -EINVAL;
2412#else
2413 minor_count = 8;
2414#endif
2415 }
2416
2417 err = drbd_nl_init();
2418 if (err)
2419 return err;
2420
2421 err = register_blkdev(DRBD_MAJOR, "drbd");
2422 if (err) {
2423 printk(KERN_ERR
2424 "drbd: unable to register block device major %d\n",
2425 DRBD_MAJOR);
2426 return err;
2427 }
2428
2429 register_reboot_notifier(&drbd_notifier);
2430
2431 /*
2432 * allocate all necessary structs
2433 */
2434 err = -ENOMEM;
2435
2436 init_waitqueue_head(&drbd_pp_wait);
2437
2438 drbd_proc = NULL; /* play safe for drbd_cleanup */
2439 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
2440 GFP_KERNEL);
2441 if (!minor_table)
2442 goto Enomem;
2443
2444 err = drbd_create_mempools();
2445 if (err)
2446 goto Enomem;
2447
8c484ee4 2448 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
2449 if (!drbd_proc) {
2450 printk(KERN_ERR "drbd: unable to register proc file\n");
2451 goto Enomem;
2452 }
2453
2454 rwlock_init(&global_state_lock);
2111438b 2455 INIT_LIST_HEAD(&drbd_tconns);
b411b363
PR
2456
2457 printk(KERN_INFO "drbd: initialized. "
2458 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
2459 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
2460 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
2461 printk(KERN_INFO "drbd: registered as block device major %d\n",
2462 DRBD_MAJOR);
2463 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
2464
2465 return 0; /* Success! */
2466
2467Enomem:
2468 drbd_cleanup();
2469 if (err == -ENOMEM)
2470 /* currently always the case */
2471 printk(KERN_ERR "drbd: ran out of memory\n");
2472 else
2473 printk(KERN_ERR "drbd: initialization failure\n");
2474 return err;
2475}
2476
2477void drbd_free_bc(struct drbd_backing_dev *ldev)
2478{
2479 if (ldev == NULL)
2480 return;
2481
e525fd89
TH
2482 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2483 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
b411b363
PR
2484
2485 kfree(ldev);
2486}
2487
360cc740
PR
2488void drbd_free_sock(struct drbd_tconn *tconn)
2489{
2490 if (tconn->data.socket) {
2491 mutex_lock(&tconn->data.mutex);
2492 kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR);
2493 sock_release(tconn->data.socket);
2494 tconn->data.socket = NULL;
2495 mutex_unlock(&tconn->data.mutex);
b411b363 2496 }
360cc740
PR
2497 if (tconn->meta.socket) {
2498 mutex_lock(&tconn->meta.mutex);
2499 kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR);
2500 sock_release(tconn->meta.socket);
2501 tconn->meta.socket = NULL;
2502 mutex_unlock(&tconn->meta.mutex);
b411b363
PR
2503 }
2504}
2505
2506
2507void drbd_free_resources(struct drbd_conf *mdev)
2508{
2509 crypto_free_hash(mdev->csums_tfm);
2510 mdev->csums_tfm = NULL;
2511 crypto_free_hash(mdev->verify_tfm);
2512 mdev->verify_tfm = NULL;
a0638456
PR
2513 crypto_free_hash(mdev->tconn->cram_hmac_tfm);
2514 mdev->tconn->cram_hmac_tfm = NULL;
2515 crypto_free_hash(mdev->tconn->integrity_w_tfm);
2516 mdev->tconn->integrity_w_tfm = NULL;
2517 crypto_free_hash(mdev->tconn->integrity_r_tfm);
2518 mdev->tconn->integrity_r_tfm = NULL;
b411b363 2519
360cc740 2520 drbd_free_sock(mdev->tconn);
b411b363
PR
2521
2522 __no_warn(local,
2523 drbd_free_bc(mdev->ldev);
2524 mdev->ldev = NULL;);
2525}
2526
2527/* meta data management */
2528
2529struct meta_data_on_disk {
2530 u64 la_size; /* last agreed size. */
2531 u64 uuid[UI_SIZE]; /* UUIDs. */
2532 u64 device_uuid;
2533 u64 reserved_u64_1;
2534 u32 flags; /* MDF */
2535 u32 magic;
2536 u32 md_size_sect;
2537 u32 al_offset; /* offset to this block */
2538 u32 al_nr_extents; /* important for restoring the AL */
2539 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
2540 u32 bm_offset; /* offset to the bitmap, from here */
2541 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
99432fcc
PR
2542 u32 la_peer_max_bio_size; /* last peer max_bio_size */
2543 u32 reserved_u32[3];
b411b363
PR
2544
2545} __packed;
2546
2547/**
2548 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
2549 * @mdev: DRBD device.
2550 */
2551void drbd_md_sync(struct drbd_conf *mdev)
2552{
2553 struct meta_data_on_disk *buffer;
2554 sector_t sector;
2555 int i;
2556
ee15b038
LE
2557 del_timer(&mdev->md_sync_timer);
2558 /* timer may be rearmed by drbd_md_mark_dirty() now. */
b411b363
PR
2559 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
2560 return;
b411b363
PR
2561
 2562	/* We use D_FAILED here and not D_ATTACHING because we try to write
2563 * metadata even if we detach due to a disk failure! */
2564 if (!get_ldev_if_state(mdev, D_FAILED))
2565 return;
2566
b411b363
PR
2567 mutex_lock(&mdev->md_io_mutex);
2568 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2569 memset(buffer, 0, 512);
2570
2571 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
2572 for (i = UI_CURRENT; i < UI_SIZE; i++)
2573 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
2574 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
2575 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
2576
2577 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
2578 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
2579 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
2580 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
2581 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
2582
2583 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 2584 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
b411b363
PR
2585
2586 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
2587 sector = mdev->ldev->md.md_offset;
2588
3f3a9b84 2589 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
b411b363
PR
 2590		/* this was a try anyway ... */
2591 dev_err(DEV, "meta data update failed!\n");
81e84650 2592 drbd_chk_io_error(mdev, 1, true);
b411b363
PR
2593 }
2594
 2595	/* Update mdev->ldev->md.la_size_sect,
 2596	 * since we just wrote it to the on-disk meta data. */
2597 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
2598
2599 mutex_unlock(&mdev->md_io_mutex);
2600 put_ldev(mdev);
2601}
2602
2603/**
2604 * drbd_md_read() - Reads in the meta data super block
2605 * @mdev: DRBD device.
2606 * @bdev: Device from which the meta data should be read in.
2607 *
116676ca 2608 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
b411b363
PR
2609 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
2610 */
2611int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2612{
2613 struct meta_data_on_disk *buffer;
2614 int i, rv = NO_ERROR;
2615
2616 if (!get_ldev_if_state(mdev, D_ATTACHING))
2617 return ERR_IO_MD_DISK;
2618
b411b363
PR
2619 mutex_lock(&mdev->md_io_mutex);
2620 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2621
2622 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 2623 /* NOTE: can't do normal error processing here as this is
b411b363
PR
2624 called BEFORE disk is attached */
2625 dev_err(DEV, "Error while reading metadata.\n");
2626 rv = ERR_IO_MD_DISK;
2627 goto err;
2628 }
2629
e7fad8af 2630 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
b411b363
PR
2631 dev_err(DEV, "Error while reading metadata, magic not found.\n");
2632 rv = ERR_MD_INVALID;
2633 goto err;
2634 }
2635 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
2636 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
2637 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
2638 rv = ERR_MD_INVALID;
2639 goto err;
2640 }
2641 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
2642 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
2643 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
2644 rv = ERR_MD_INVALID;
2645 goto err;
2646 }
2647 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
2648 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
2649 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
2650 rv = ERR_MD_INVALID;
2651 goto err;
2652 }
2653
2654 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
2655 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
2656 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
2657 rv = ERR_MD_INVALID;
2658 goto err;
2659 }
2660
2661 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
2662 for (i = UI_CURRENT; i < UI_SIZE; i++)
2663 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
2664 bdev->md.flags = be32_to_cpu(buffer->flags);
2665 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
2666 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
2667
87eeee41 2668 spin_lock_irq(&mdev->tconn->req_lock);
99432fcc
PR
2669 if (mdev->state.conn < C_CONNECTED) {
2670 int peer;
2671 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
2672 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
2673 mdev->peer_max_bio_size = peer;
2674 }
87eeee41 2675 spin_unlock_irq(&mdev->tconn->req_lock);
99432fcc 2676
b411b363
PR
2677 if (mdev->sync_conf.al_extents < 7)
2678 mdev->sync_conf.al_extents = 127;
2679
2680 err:
2681 mutex_unlock(&mdev->md_io_mutex);
2682 put_ldev(mdev);
2683
2684 return rv;
2685}
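/*
 * Minimal usage sketch (illustrative, not part of the driver): a caller on
 * the attach path is expected to treat anything but NO_ERROR as fatal for
 * the backing device being read:
 *
 *	rv = drbd_md_read(mdev, bdev);
 *	if (rv != NO_ERROR)
 *		goto fail;	(ERR_IO_MD_DISK or ERR_MD_INVALID)
 */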
2686
2687/**
2688 * drbd_md_mark_dirty() - Mark meta data super block as dirty
2689 * @mdev: DRBD device.
2690 *
2691 * Call this function if you change anything that should be written to
2692 * the meta-data super block. This function sets MD_DIRTY, and starts a
 2693	 * timer that ensures drbd_md_sync() gets called within five seconds.
2694 */
ca0e6098 2695#ifdef DEBUG
ee15b038
LE
2696void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
2697{
2698 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
2699 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
2700 mdev->last_md_mark_dirty.line = line;
2701 mdev->last_md_mark_dirty.func = func;
2702 }
2703}
2704#else
b411b363
PR
2705void drbd_md_mark_dirty(struct drbd_conf *mdev)
2706{
ee15b038 2707 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 2708 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 2709}
ee15b038 2710#endif
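/*
 * Illustrative sketch of the dirty/sync protocol documented above (the
 * identifiers are the ones defined in this file): callers only mark the
 * meta data dirty; the actual write is batched, as in _drbd_uuid_set():
 *
 *	mdev->ldev->md.uuid[idx] = val;		change on-disk state
 *	drbd_md_mark_dirty(mdev);		sets MD_DIRTY, arms md_sync_timer
 *	...
 *	drbd_md_sync(mdev);			or let the timer/worker do it
 */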
b411b363
PR
2711
2712static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
2713{
2714 int i;
2715
62b0da3a 2716 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 2717 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
2718}
2719
2720void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
2721{
2722 if (idx == UI_CURRENT) {
2723 if (mdev->state.role == R_PRIMARY)
2724 val |= 1;
2725 else
2726 val &= ~((u64)1);
2727
2728 drbd_set_ed_uuid(mdev, val);
2729 }
2730
2731 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
2732 drbd_md_mark_dirty(mdev);
2733}
2734
2735
2736void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
2737{
2738 if (mdev->ldev->md.uuid[idx]) {
2739 drbd_uuid_move_history(mdev);
2740 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363
PR
2741 }
2742 _drbd_uuid_set(mdev, idx, val);
2743}
2744
2745/**
2746 * drbd_uuid_new_current() - Creates a new current UUID
2747 * @mdev: DRBD device.
2748 *
2749 * Creates a new current UUID, and rotates the old current UUID into
2750 * the bitmap slot. Causes an incremental resync upon next connect.
2751 */
2752void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
2753{
2754 u64 val;
62b0da3a
LE
2755 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
2756
2757 if (bm_uuid)
2758 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 2759
b411b363 2760 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
b411b363
PR
2761
2762 get_random_bytes(&val, sizeof(u64));
2763 _drbd_uuid_set(mdev, UI_CURRENT, val);
62b0da3a 2764 drbd_print_uuids(mdev, "new current UUID");
aaa8e2b3
LE
2765 /* get it to stable storage _now_ */
2766 drbd_md_sync(mdev);
b411b363
PR
2767}
2768
2769void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
2770{
2771 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
2772 return;
2773
2774 if (val == 0) {
2775 drbd_uuid_move_history(mdev);
2776 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
2777 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 2778 } else {
62b0da3a
LE
2779 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
2780 if (bm_uuid)
2781 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 2782
62b0da3a 2783 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
b411b363
PR
2784 }
2785 drbd_md_mark_dirty(mdev);
2786}
2787
2788/**
2789 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
2790 * @mdev: DRBD device.
2791 *
2792 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
2793 */
2794int drbd_bmio_set_n_write(struct drbd_conf *mdev)
2795{
2796 int rv = -EIO;
2797
2798 if (get_ldev_if_state(mdev, D_ATTACHING)) {
2799 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
2800 drbd_md_sync(mdev);
2801 drbd_bm_set_all(mdev);
2802
2803 rv = drbd_bm_write(mdev);
2804
2805 if (!rv) {
2806 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2807 drbd_md_sync(mdev);
2808 }
2809
2810 put_ldev(mdev);
2811 }
2812
2813 return rv;
2814}
2815
2816/**
2817 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
2818 * @mdev: DRBD device.
2819 *
2820 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
2821 */
2822int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
2823{
2824 int rv = -EIO;
2825
0778286a 2826 drbd_resume_al(mdev);
b411b363
PR
2827 if (get_ldev_if_state(mdev, D_ATTACHING)) {
2828 drbd_bm_clear_all(mdev);
2829 rv = drbd_bm_write(mdev);
2830 put_ldev(mdev);
2831 }
2832
2833 return rv;
2834}
2835
00d56944 2836static int w_bitmap_io(struct drbd_work *w, int unused)
b411b363
PR
2837{
2838 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
00d56944 2839 struct drbd_conf *mdev = w->mdev;
02851e9f 2840 int rv = -EIO;
b411b363
PR
2841
2842 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
2843
02851e9f 2844 if (get_ldev(mdev)) {
20ceb2b2 2845 drbd_bm_lock(mdev, work->why, work->flags);
02851e9f
LE
2846 rv = work->io_fn(mdev);
2847 drbd_bm_unlock(mdev);
2848 put_ldev(mdev);
2849 }
b411b363 2850
4738fa16 2851 clear_bit_unlock(BITMAP_IO, &mdev->flags);
b411b363
PR
2852 wake_up(&mdev->misc_wait);
2853
2854 if (work->done)
2855 work->done(mdev, rv);
2856
2857 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
2858 work->why = NULL;
20ceb2b2 2859 work->flags = 0;
b411b363
PR
2860
2861 return 1;
2862}
2863
82f59cc6
LE
2864void drbd_ldev_destroy(struct drbd_conf *mdev)
2865{
2866 lc_destroy(mdev->resync);
2867 mdev->resync = NULL;
2868 lc_destroy(mdev->act_log);
2869 mdev->act_log = NULL;
2870 __no_warn(local,
2871 drbd_free_bc(mdev->ldev);
2872 mdev->ldev = NULL;);
2873
82f59cc6
LE
2874 clear_bit(GO_DISKLESS, &mdev->flags);
2875}
2876
00d56944 2877static int w_go_diskless(struct drbd_work *w, int unused)
e9e6f3ec 2878{
00d56944
PR
2879 struct drbd_conf *mdev = w->mdev;
2880
e9e6f3ec 2881 D_ASSERT(mdev->state.disk == D_FAILED);
9d282875
LE
2882 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
2883 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
82f59cc6
LE
2884 * the protected members anymore, though, so once put_ldev reaches zero
2885 * again, it will be safe to free them. */
e9e6f3ec 2886 drbd_force_state(mdev, NS(disk, D_DISKLESS));
e9e6f3ec
LE
2887 return 1;
2888}
2889
2890void drbd_go_diskless(struct drbd_conf *mdev)
2891{
2892 D_ASSERT(mdev->state.disk == D_FAILED);
2893 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
e42325a5 2894 drbd_queue_work(&mdev->tconn->data.work, &mdev->go_diskless);
e9e6f3ec
LE
2895}
2896
b411b363
PR
2897/**
2898 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
2899 * @mdev: DRBD device.
2900 * @io_fn: IO callback to be called when bitmap IO is possible
2901 * @done: callback to be called after the bitmap IO was performed
2902 * @why: Descriptive text of the reason for doing the IO
2903 *
 2904 * While IO on the bitmap happens we freeze application IO, thus ensuring
 2905 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
2906 * called from worker context. It MUST NOT be used while a previous such
2907 * work is still pending!
2908 */
2909void drbd_queue_bitmap_io(struct drbd_conf *mdev,
2910 int (*io_fn)(struct drbd_conf *),
2911 void (*done)(struct drbd_conf *, int),
20ceb2b2 2912 char *why, enum bm_flag flags)
b411b363 2913{
e6b3ea83 2914 D_ASSERT(current == mdev->tconn->worker.task);
b411b363
PR
2915
2916 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
2917 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
2918 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
2919 if (mdev->bm_io_work.why)
2920 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
2921 why, mdev->bm_io_work.why);
2922
2923 mdev->bm_io_work.io_fn = io_fn;
2924 mdev->bm_io_work.done = done;
2925 mdev->bm_io_work.why = why;
20ceb2b2 2926 mdev->bm_io_work.flags = flags;
b411b363 2927
87eeee41 2928 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
2929 set_bit(BITMAP_IO, &mdev->flags);
2930 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 2931 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
e42325a5 2932 drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w);
b411b363 2933 }
87eeee41 2934 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
2935}
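/*
 * Minimal sketch of the queued variant (illustrative only; 'sketch_done' is
 * a hypothetical callback, and BM_LOCKED_MASK is assumed to be defined with
 * enum bm_flag in drbd_int.h):
 *
 *	static void sketch_done(struct drbd_conf *mdev, int rv)
 *	{
 *		if (rv)
 *			dev_err(DEV, "bitmap IO failed: %d\n", rv);
 *	}
 *	...
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, sketch_done,
 *			     "set_n_write sketch", BM_LOCKED_MASK);
 */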
2936
2937/**
2938 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
2939 * @mdev: DRBD device.
2940 * @io_fn: IO callback to be called when bitmap IO is possible
2941 * @why: Descriptive text of the reason for doing the IO
2942 *
 2943 * Freezes application IO while the actual IO operation runs. This
 2944 * function MAY NOT be called from worker context.
2945 */
20ceb2b2
LE
2946int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
2947 char *why, enum bm_flag flags)
b411b363
PR
2948{
2949 int rv;
2950
e6b3ea83 2951 D_ASSERT(current != mdev->tconn->worker.task);
b411b363 2952
20ceb2b2
LE
2953 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
2954 drbd_suspend_io(mdev);
b411b363 2955
20ceb2b2 2956 drbd_bm_lock(mdev, why, flags);
b411b363
PR
2957 rv = io_fn(mdev);
2958 drbd_bm_unlock(mdev);
2959
20ceb2b2
LE
2960 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
2961 drbd_resume_io(mdev);
b411b363
PR
2962
2963 return rv;
2964}
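/*
 * The synchronous variant, by contrast, may be used from any context except
 * the worker.  Minimal sketch (illustrative; the flag choice is an
 * assumption, BM_LOCKED_MASK as above):
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
 *			    "clear_n_write sketch", BM_LOCKED_MASK);
 *	if (rv)
 *		dev_err(DEV, "writing the bitmap failed: %d\n", rv);
 */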
2965
2966void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
2967{
2968 if ((mdev->ldev->md.flags & flag) != flag) {
2969 drbd_md_mark_dirty(mdev);
2970 mdev->ldev->md.flags |= flag;
2971 }
2972}
2973
2974void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
2975{
2976 if ((mdev->ldev->md.flags & flag) != 0) {
2977 drbd_md_mark_dirty(mdev);
2978 mdev->ldev->md.flags &= ~flag;
2979 }
2980}
2981int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
2982{
2983 return (bdev->md.flags & flag) != 0;
2984}
2985
2986static void md_sync_timer_fn(unsigned long data)
2987{
2988 struct drbd_conf *mdev = (struct drbd_conf *) data;
2989
e42325a5 2990 drbd_queue_work_front(&mdev->tconn->data.work, &mdev->md_sync_work);
b411b363
PR
2991}
2992
00d56944 2993static int w_md_sync(struct drbd_work *w, int unused)
b411b363 2994{
00d56944
PR
2995 struct drbd_conf *mdev = w->mdev;
2996
b411b363 2997 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
ee15b038
LE
2998#ifdef DEBUG
2999 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3000 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3001#endif
b411b363 3002 drbd_md_sync(mdev);
b411b363
PR
3003 return 1;
3004}
3005
d8763023 3006const char *cmdname(enum drbd_packet cmd)
f2ad9063
AG
3007{
3008 /* THINK may need to become several global tables
3009 * when we want to support more than
3010 * one PRO_VERSION */
3011 static const char *cmdnames[] = {
3012 [P_DATA] = "Data",
3013 [P_DATA_REPLY] = "DataReply",
3014 [P_RS_DATA_REPLY] = "RSDataReply",
3015 [P_BARRIER] = "Barrier",
3016 [P_BITMAP] = "ReportBitMap",
3017 [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
3018 [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
3019 [P_UNPLUG_REMOTE] = "UnplugRemote",
3020 [P_DATA_REQUEST] = "DataRequest",
3021 [P_RS_DATA_REQUEST] = "RSDataRequest",
3022 [P_SYNC_PARAM] = "SyncParam",
3023 [P_SYNC_PARAM89] = "SyncParam89",
3024 [P_PROTOCOL] = "ReportProtocol",
3025 [P_UUIDS] = "ReportUUIDs",
3026 [P_SIZES] = "ReportSizes",
3027 [P_STATE] = "ReportState",
3028 [P_SYNC_UUID] = "ReportSyncUUID",
3029 [P_AUTH_CHALLENGE] = "AuthChallenge",
3030 [P_AUTH_RESPONSE] = "AuthResponse",
3031 [P_PING] = "Ping",
3032 [P_PING_ACK] = "PingAck",
3033 [P_RECV_ACK] = "RecvAck",
3034 [P_WRITE_ACK] = "WriteAck",
3035 [P_RS_WRITE_ACK] = "RSWriteAck",
7be8da07 3036 [P_DISCARD_WRITE] = "DiscardWrite",
f2ad9063
AG
3037 [P_NEG_ACK] = "NegAck",
3038 [P_NEG_DREPLY] = "NegDReply",
3039 [P_NEG_RS_DREPLY] = "NegRSDReply",
3040 [P_BARRIER_ACK] = "BarrierAck",
3041 [P_STATE_CHG_REQ] = "StateChgRequest",
3042 [P_STATE_CHG_REPLY] = "StateChgReply",
3043 [P_OV_REQUEST] = "OVRequest",
3044 [P_OV_REPLY] = "OVReply",
3045 [P_OV_RESULT] = "OVResult",
3046 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
3047 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
3048 [P_COMPRESSED_BITMAP] = "CBitmap",
3049 [P_DELAY_PROBE] = "DelayProbe",
3050 [P_OUT_OF_SYNC] = "OutOfSync",
7be8da07 3051 [P_RETRY_WRITE] = "RetryWrite",
f2ad9063
AG
3052 };
3053
3054 if (cmd == P_HAND_SHAKE_M)
3055 return "HandShakeM";
3056 if (cmd == P_HAND_SHAKE_S)
3057 return "HandShakeS";
3058 if (cmd == P_HAND_SHAKE)
3059 return "HandShake";
6e849ce8 3060 if (cmd >= ARRAY_SIZE(cmdnames))
f2ad9063
AG
3061 return "Unknown";
3062 return cmdnames[cmd];
3063}
3064
7be8da07
AG
3065/**
3066 * drbd_wait_misc - wait for a request to make progress
3067 * @mdev: device associated with the request
3068 * @i: the struct drbd_interval embedded in struct drbd_request or
3069 * struct drbd_peer_request
3070 */
3071int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i)
3072{
3073 struct net_conf *net_conf = mdev->tconn->net_conf;
3074 DEFINE_WAIT(wait);
3075 long timeout;
3076
3077 if (!net_conf)
3078 return -ETIMEDOUT;
3079 timeout = MAX_SCHEDULE_TIMEOUT;
3080 if (net_conf->ko_count)
3081 timeout = net_conf->timeout * HZ / 10 * net_conf->ko_count;
3082
3083 /* Indicate to wake up mdev->misc_wait on progress. */
3084 i->waiting = true;
3085 prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
3086 spin_unlock_irq(&mdev->tconn->req_lock);
3087 timeout = schedule_timeout(timeout);
3088 finish_wait(&mdev->misc_wait, &wait);
3089 spin_lock_irq(&mdev->tconn->req_lock);
3090 if (!timeout || mdev->state.conn < C_CONNECTED)
3091 return -ETIMEDOUT;
3092 if (signal_pending(current))
3093 return -ERESTARTSYS;
3094 return 0;
3095}
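/*
 * Note on the timeout arithmetic above: net_conf->timeout is in tenths of a
 * second, so the wait is (timeout / 10) * ko_count seconds.  With, say,
 * timeout=60 and ko-count=7 that is 6 s * 7 = 42 s before -ETIMEDOUT.
 */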
3096
b411b363
PR
3097#ifdef CONFIG_DRBD_FAULT_INJECTION
3098/* Fault insertion support including random number generator shamelessly
3099 * stolen from kernel/rcutorture.c */
3100struct fault_random_state {
3101 unsigned long state;
3102 unsigned long count;
3103};
3104
3105#define FAULT_RANDOM_MULT 39916801 /* prime */
3106#define FAULT_RANDOM_ADD 479001701 /* prime */
3107#define FAULT_RANDOM_REFRESH 10000
3108
3109/*
3110 * Crude but fast random-number generator. Uses a linear congruential
3111 * generator, with occasional help from get_random_bytes().
3112 */
3113static unsigned long
3114_drbd_fault_random(struct fault_random_state *rsp)
3115{
3116 long refresh;
3117
49829ea7 3118 if (!rsp->count--) {
b411b363
PR
3119 get_random_bytes(&refresh, sizeof(refresh));
3120 rsp->state += refresh;
3121 rsp->count = FAULT_RANDOM_REFRESH;
3122 }
3123 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3124 return swahw32(rsp->state);
3125}
3126
3127static char *
3128_drbd_fault_str(unsigned int type) {
3129 static char *_faults[] = {
3130 [DRBD_FAULT_MD_WR] = "Meta-data write",
3131 [DRBD_FAULT_MD_RD] = "Meta-data read",
3132 [DRBD_FAULT_RS_WR] = "Resync write",
3133 [DRBD_FAULT_RS_RD] = "Resync read",
3134 [DRBD_FAULT_DT_WR] = "Data write",
3135 [DRBD_FAULT_DT_RD] = "Data read",
3136 [DRBD_FAULT_DT_RA] = "Data read ahead",
3137 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
3138 [DRBD_FAULT_AL_EE] = "EE allocation",
3139 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
3140 };
3141
3142 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3143}
3144
3145unsigned int
3146_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3147{
3148 static struct fault_random_state rrs = {0, 0};
3149
3150 unsigned int ret = (
3151 (fault_devs == 0 ||
3152 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3153 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3154
3155 if (ret) {
3156 fault_count++;
3157
7383506c 3158 if (__ratelimit(&drbd_ratelimit_state))
b411b363
PR
3159 dev_warn(DEV, "***Simulating %s failure\n",
3160 _drbd_fault_str(type));
3161 }
3162
3163 return ret;
3164}
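/*
 * Illustrative example of the gating logic above: with fault_rate=5 and
 * fault_devs=1, roughly 5% of the eligible operations on minor 0 (bit 0 of
 * the fault_devs mask) report a simulated failure of the given DRBD_FAULT_*
 * type; fault_devs=0 applies the rate to all minors.
 */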
3165#endif
3166
3167const char *drbd_buildtag(void)
3168{
 3169	/* When DRBD is built from external sources, this carries a reference
 3170	   to the git hash of that source code. */
3171
3172 static char buildtag[38] = "\0uilt-in";
3173
3174 if (buildtag[0] == 0) {
3175#ifdef CONFIG_MODULES
3176 if (THIS_MODULE != NULL)
3177 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3178 else
3179#endif
3180 buildtag[0] = 'b';
3181 }
3182
3183 return buildtag;
3184}
3185
3186module_init(drbd_init)
3187module_exit(drbd_cleanup)
3188
b411b363
PR
3189EXPORT_SYMBOL(drbd_conn_str);
3190EXPORT_SYMBOL(drbd_role_str);
3191EXPORT_SYMBOL(drbd_disk_str);
3192EXPORT_SYMBOL(drbd_set_st_err_str);