1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
29#include <linux/module.h>
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/mutex.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
67static DEFINE_MUTEX(drbd_main_mutex);
68int drbdd_init(struct drbd_thread *);
69int drbd_worker(struct drbd_thread *);
70int drbd_asender(struct drbd_thread *);
71
72int drbd_init(void);
73static int drbd_open(struct block_device *bdev, fmode_t mode);
74static int drbd_release(struct gendisk *gd, fmode_t mode);
75static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79static void md_sync_timer_fn(unsigned long data);
80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82static void _tl_clear(struct drbd_conf *mdev);
83
84MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
85 "Lars Ellenberg <lars@linbit.com>");
86MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
87MODULE_VERSION(REL_VERSION);
88MODULE_LICENSE("GPL");
89MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
90 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
91MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
92
93#include <linux/moduleparam.h>
94/* allow_open_on_secondary */
95MODULE_PARM_DESC(allow_oos, "DONT USE!");
96/* thanks to these macros, if compiled into the kernel (not-module),
97 * this becomes the boot parameter drbd.minor_count */
98module_param(minor_count, uint, 0444);
99module_param(disable_sendpage, bool, 0644);
100module_param(allow_oos, bool, 0);
101module_param(cn_idx, uint, 0444);
102module_param(proc_details, int, 0644);
103
104#ifdef CONFIG_DRBD_FAULT_INJECTION
105int enable_faults;
106int fault_rate;
107static int fault_count;
108int fault_devs;
109/* bitmap of enabled faults */
110module_param(enable_faults, int, 0664);
111/* fault rate % value - applies to all enabled faults */
112module_param(fault_rate, int, 0664);
113/* count of faults inserted */
114module_param(fault_count, int, 0664);
115/* bitmap of devices to insert faults on */
116module_param(fault_devs, int, 0644);
117#endif
118
119/* module parameter, defined */
120unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
121bool disable_sendpage;
122bool allow_oos;
123unsigned int cn_idx = CN_IDX_DRBD;
124int proc_details; /* Detail level in proc drbd*/
125
126/* Module parameter for setting the user mode helper program
127 * to run. Default is /sbin/drbdadm */
128char usermode_helper[80] = "/sbin/drbdadm";
129
130module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
131
132/* in 2.6.x, our device mapping and config info contains our virtual gendisks
133 * as member "struct gendisk *vdisk;"
134 */
135struct drbd_conf **minor_table;
136
137struct kmem_cache *drbd_request_cache;
138struct kmem_cache *drbd_ee_cache; /* epoch entries */
139struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
140struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
141mempool_t *drbd_request_mempool;
142mempool_t *drbd_ee_mempool;
143mempool_t *drbd_md_io_page_pool;
144struct bio_set *drbd_md_io_bio_set;
145
146/* I do not use a standard mempool, because:
147 1) I want to hand out the pre-allocated objects first.
148 2) I want to be able to interrupt sleeping allocation with a signal.
149 Note: This is a single linked list, the next pointer is the private
150 member of struct page.
151 */
152struct page *drbd_pp_pool;
153spinlock_t drbd_pp_lock;
154int drbd_pp_vacant;
155wait_queue_head_t drbd_pp_wait;
156
157DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
158
159static const struct block_device_operations drbd_ops = {
160 .owner = THIS_MODULE,
161 .open = drbd_open,
162 .release = drbd_release,
163};
164
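/* Allocate a bio for DRBD meta data IO from drbd_md_io_bio_set;
 * fall back to a plain bio_alloc() while the bio_set is not (yet) set up. */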
165struct bio *bio_alloc_drbd(gfp_t gfp_mask)
166{
167 if (!drbd_md_io_bio_set)
168 return bio_alloc(gfp_mask, 1);
169
170 return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
171}
172
173#ifdef __CHECKER__
174/* When checking with sparse, and this is an inline function, sparse will
175 give tons of false positives. When this is a real function, sparse works.
176 */
177int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
178{
179 int io_allowed;
180
181 atomic_inc(&mdev->local_cnt);
182 io_allowed = (mdev->state.disk >= mins);
183 if (!io_allowed) {
184 if (atomic_dec_and_test(&mdev->local_cnt))
185 wake_up(&mdev->misc_wait);
186 }
187 return io_allowed;
188}
189
190#endif
191
192/**
193 * DOC: The transfer log
194 *
195 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
196 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
197 * of the list. There is always at least one &struct drbd_tl_epoch object.
198 *
199 * Each &struct drbd_tl_epoch has a circular double linked list of requests
200 * attached.
201 */
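/**
 * tl_init() - Allocate and initialize the first, empty transfer log epoch
 * @mdev: DRBD device.
 *
 * Returns 1 on success, 0 if the initial &struct drbd_tl_epoch could not be
 * allocated.
 */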
202static int tl_init(struct drbd_conf *mdev)
203{
204 struct drbd_tl_epoch *b;
205
206 /* during device minor initialization, we may well use GFP_KERNEL */
207 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
208 if (!b)
209 return 0;
210 INIT_LIST_HEAD(&b->requests);
211 INIT_LIST_HEAD(&b->w.list);
212 b->next = NULL;
213 b->br_number = 4711;
214 b->n_writes = 0;
215 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
216
217 mdev->oldest_tle = b;
218 mdev->newest_tle = b;
219 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
220 INIT_LIST_HEAD(&mdev->barrier_acked_requests);
221
222 mdev->tl_hash = NULL;
223 mdev->tl_hash_s = 0;
224
225 return 1;
226}
227
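/**
 * tl_cleanup() - Free the transfer log allocations again
 * @mdev: DRBD device.
 *
 * Counterpart of tl_init(); expects only the single, empty epoch object to be
 * left in the transfer log.
 */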
228static void tl_cleanup(struct drbd_conf *mdev)
229{
230 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
231 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
232 kfree(mdev->oldest_tle);
233 mdev->oldest_tle = NULL;
234 kfree(mdev->unused_spare_tle);
235 mdev->unused_spare_tle = NULL;
236 kfree(mdev->tl_hash);
237 mdev->tl_hash = NULL;
238 mdev->tl_hash_s = 0;
239}
240
241/**
242 * _tl_add_barrier() - Adds a barrier to the transfer log
243 * @mdev: DRBD device.
244 * @new: Barrier to be added before the current head of the TL.
245 *
246 * The caller must hold the req_lock.
247 */
248void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
249{
250 struct drbd_tl_epoch *newest_before;
251
252 INIT_LIST_HEAD(&new->requests);
253 INIT_LIST_HEAD(&new->w.list);
254 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
255 new->next = NULL;
256 new->n_writes = 0;
257
258 newest_before = mdev->newest_tle;
259 new->br_number = newest_before->br_number+1;
260 if (mdev->newest_tle != new) {
261 mdev->newest_tle->next = new;
262 mdev->newest_tle = new;
263 }
264}
265
266/**
267 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
268 * @mdev: DRBD device.
269 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
270 * @set_size: Expected number of requests before that barrier.
271 *
272 * In case the passed barrier_nr or set_size does not match the oldest
273 * &struct drbd_tl_epoch objects this function will cause a termination
274 * of the connection.
275 */
276void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
277 unsigned int set_size)
278{
279 struct drbd_tl_epoch *b, *nob; /* next old barrier */
280 struct list_head *le, *tle;
281 struct drbd_request *r;
282
283 spin_lock_irq(&mdev->req_lock);
284
285 b = mdev->oldest_tle;
286
287 /* first some paranoia code */
288 if (b == NULL) {
289 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
290 barrier_nr);
291 goto bail;
292 }
293 if (b->br_number != barrier_nr) {
294 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
295 barrier_nr, b->br_number);
296 goto bail;
297 }
298 if (b->n_writes != set_size) {
299 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
300 barrier_nr, set_size, b->n_writes);
301 goto bail;
302 }
303
304 /* Clean up list of requests processed during current epoch */
305 list_for_each_safe(le, tle, &b->requests) {
306 r = list_entry(le, struct drbd_request, tl_requests);
307 _req_mod(r, barrier_acked);
308 }
309 /* There could be requests on the list waiting for completion
310 of the write to the local disk. To avoid corruptions of
311 slab's data structures we have to remove the list's head.
312
313 Also there could have been a barrier ack out of sequence, overtaking
314 the write acks - which would be a bug and violating write ordering.
315 To not deadlock in case we lose connection while such requests are
316 still pending, we need some way to find them for the
317 _req_mod(connection_lost_while_pending).
318
319 These have been list_move'd to the out_of_sequence_requests list in
320 _req_mod(, barrier_acked) above.
321 */
322 list_splice_init(&b->requests, &mdev->barrier_acked_requests);
323
324 nob = b->next;
325 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
326 _tl_add_barrier(mdev, b);
327 if (nob)
328 mdev->oldest_tle = nob;
329 /* if nob == NULL b was the only barrier, and becomes the new
330 barrier. Therefore mdev->oldest_tle points already to b */
331 } else {
332 D_ASSERT(nob != NULL);
333 mdev->oldest_tle = nob;
334 kfree(b);
335 }
336
337 spin_unlock_irq(&mdev->req_lock);
338 dec_ap_pending(mdev);
339
340 return;
341
342bail:
343 spin_unlock_irq(&mdev->req_lock);
344 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
345}
346
347
348/**
349 * _tl_restart() - Walks the transfer log, and applies an action to all requests
350 * @mdev: DRBD device.
351 * @what: The action/event to perform with all request objects
352 *
353 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
354 * restart_frozen_disk_io.
355 */
356static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
357{
358 struct drbd_tl_epoch *b, *tmp, **pn;
359 struct list_head *le, *tle, carry_reads;
360 struct drbd_request *req;
361 int rv, n_writes, n_reads;
362
363 b = mdev->oldest_tle;
364 pn = &mdev->oldest_tle;
365 while (b) {
366 n_writes = 0;
367 n_reads = 0;
368 INIT_LIST_HEAD(&carry_reads);
369 list_for_each_safe(le, tle, &b->requests) {
370 req = list_entry(le, struct drbd_request, tl_requests);
371 rv = _req_mod(req, what);
372
373 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
374 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
375 }
376 tmp = b->next;
377
378 if (n_writes) {
379 if (what == resend) {
380 b->n_writes = n_writes;
381 if (b->w.cb == NULL) {
382 b->w.cb = w_send_barrier;
383 inc_ap_pending(mdev);
384 set_bit(CREATE_BARRIER, &mdev->flags);
385 }
386
387 drbd_queue_work(&mdev->data.work, &b->w);
388 }
389 pn = &b->next;
390 } else {
391 if (n_reads)
392 list_add(&carry_reads, &b->requests);
393 /* there could still be requests on that ring list,
394 * in case local io is still pending */
395 list_del(&b->requests);
396
397 /* dec_ap_pending corresponding to queue_barrier.
398 * the newest barrier may not have been queued yet,
399 * in which case w.cb is still NULL. */
400 if (b->w.cb != NULL)
401 dec_ap_pending(mdev);
402
403 if (b == mdev->newest_tle) {
404 /* recycle, but reinit! */
405 D_ASSERT(tmp == NULL);
406 INIT_LIST_HEAD(&b->requests);
407 list_splice(&carry_reads, &b->requests);
408 INIT_LIST_HEAD(&b->w.list);
409 b->w.cb = NULL;
410 b->br_number = net_random();
411 b->n_writes = 0;
412
413 *pn = b;
414 break;
415 }
416 *pn = tmp;
417 kfree(b);
418 }
419 b = tmp;
420 list_splice(&carry_reads, &b->requests);
421 }
422
423 /* Actions operating on the disk state, also want to work on
424 requests that got barrier acked. */
425
426 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
427 req = list_entry(le, struct drbd_request, tl_requests);
428 _req_mod(req, what);
429 }
430}
431
432
433/**
434 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
435 * @mdev: DRBD device.
436 *
437 * This is called after the connection to the peer was lost. The storage covered
438 * by the requests on the transfer log gets marked as out of sync. Called from the
439 * receiver thread and the worker thread.
440 */
441void tl_clear(struct drbd_conf *mdev)
442{
443 spin_lock_irq(&mdev->req_lock);
444 _tl_clear(mdev);
445 spin_unlock_irq(&mdev->req_lock);
446}
447
448static void _tl_clear(struct drbd_conf *mdev)
449{
450 struct list_head *le, *tle;
451 struct drbd_request *r;
452
453 _tl_restart(mdev, connection_lost_while_pending);
454
455 /* we expect this list to be empty. */
456 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
457
458 /* but just in case, clean it up anyways! */
459 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
460 r = list_entry(le, struct drbd_request, tl_requests);
461 /* It would be nice to complete outside of spinlock.
462 * But this is easier for now. */
463 _req_mod(r, connection_lost_while_pending);
464 }
465
466 /* ensure bit indicating barrier is required is clear */
467 clear_bit(CREATE_BARRIER, &mdev->flags);
468
469 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
470
471}
472
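/**
 * tl_restart() - Like _tl_restart(), but takes and releases the req_lock itself
 * @mdev: DRBD device.
 * @what: The action/event to perform with all request objects
 */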
473void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
474{
475 spin_lock_irq(&mdev->req_lock);
476 _tl_restart(mdev, what);
477 spin_unlock_irq(&mdev->req_lock);
478}
479
480/**
481 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
482 * @mdev: DRBD device.
483 */
484void tl_abort_disk_io(struct drbd_conf *mdev)
485{
486 struct drbd_tl_epoch *b;
487 struct list_head *le, *tle;
488 struct drbd_request *req;
489
490 spin_lock_irq(&mdev->req_lock);
491 b = mdev->oldest_tle;
492 while (b) {
493 list_for_each_safe(le, tle, &b->requests) {
494 req = list_entry(le, struct drbd_request, tl_requests);
495 if (!(req->rq_state & RQ_LOCAL_PENDING))
496 continue;
497 _req_mod(req, abort_disk_io);
498 }
499 b = b->next;
500 }
501
502 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
503 req = list_entry(le, struct drbd_request, tl_requests);
504 if (!(req->rq_state & RQ_LOCAL_PENDING))
505 continue;
506 _req_mod(req, abort_disk_io);
507 }
508
509 spin_unlock_irq(&mdev->req_lock);
510}
511
512/**
513 * cl_wide_st_chg() - true if the state change is a cluster wide one
514 * @mdev: DRBD device.
515 * @os: old (current) state.
516 * @ns: new (wanted) state.
517 */
518static int cl_wide_st_chg(struct drbd_conf *mdev,
519 union drbd_state os, union drbd_state ns)
520{
521 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
522 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
523 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
524 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
525 (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
526 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
527 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
528}
529
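/**
 * drbd_change_state() - Change the DRBD state, protected by the req_lock
 * @mdev: DRBD device.
 * @f: state change flags, see &enum chg_state_flags.
 * @mask: mask of state bits to change.
 * @val: value of new state bits.
 */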
530enum drbd_state_rv
531drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
532 union drbd_state mask, union drbd_state val)
533{
534 unsigned long flags;
535 union drbd_state os, ns;
536 enum drbd_state_rv rv;
537
538 spin_lock_irqsave(&mdev->req_lock, flags);
539 os = mdev->state;
540 ns.i = (os.i & ~mask.i) | val.i;
541 rv = _drbd_set_state(mdev, ns, f, NULL);
542 ns = mdev->state;
543 spin_unlock_irqrestore(&mdev->req_lock, flags);
544
545 return rv;
546}
547
548/**
549 * drbd_force_state() - Impose a change which happens outside our control on our state
550 * @mdev: DRBD device.
551 * @mask: mask of state bits to change.
552 * @val: value of new state bits.
553 */
554void drbd_force_state(struct drbd_conf *mdev,
555 union drbd_state mask, union drbd_state val)
556{
557 drbd_change_state(mdev, CS_HARD, mask, val);
558}
559
560static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
561static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
562 union drbd_state,
563 union drbd_state);
564enum sanitize_state_warnings {
565 NO_WARNING,
566 ABORTED_ONLINE_VERIFY,
567 ABORTED_RESYNC,
568 CONNECTION_LOST_NEGOTIATING,
569 IMPLICITLY_UPGRADED_DISK,
570 IMPLICITLY_UPGRADED_PDSK,
571};
572static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
573 union drbd_state ns, enum sanitize_state_warnings *warn);
574int drbd_send_state_req(struct drbd_conf *,
575 union drbd_state, union drbd_state);
576
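/* Condition for the wait_event() in drbd_req_state() during a cluster wide
 * state change: evaluates to SS_CW_SUCCESS or SS_CW_FAILED_BY_PEER once the
 * peer has answered, SS_CW_NO_NEED if no cluster wide change is required,
 * an SS_ error code if the change became invalid in the meantime, and
 * SS_UNKNOWN_ERROR to keep waiting. */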
577static enum drbd_state_rv
578_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
579 union drbd_state val)
580{
581 union drbd_state os, ns;
582 unsigned long flags;
583 enum drbd_state_rv rv;
584
585 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
586 return SS_CW_SUCCESS;
587
588 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
589 return SS_CW_FAILED_BY_PEER;
590
591 rv = 0;
592 spin_lock_irqsave(&mdev->req_lock, flags);
593 os = mdev->state;
594 ns.i = (os.i & ~mask.i) | val.i;
595 ns = sanitize_state(mdev, os, ns, NULL);
596
597 if (!cl_wide_st_chg(mdev, os, ns))
598 rv = SS_CW_NO_NEED;
599 if (!rv) {
600 rv = is_valid_state(mdev, ns);
601 if (rv == SS_SUCCESS) {
602 rv = is_valid_state_transition(mdev, ns, os);
603 if (rv == SS_SUCCESS)
604 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
605 }
606 }
607 spin_unlock_irqrestore(&mdev->req_lock, flags);
608
609 return rv;
610}
611
612/**
613 * drbd_req_state() - Perform an eventually cluster wide state change
614 * @mdev: DRBD device.
615 * @mask: mask of state bits to change.
616 * @val: value of new state bits.
617 * @f: flags
618 *
619 * Should not be called directly, use drbd_request_state() or
620 * _drbd_request_state().
621 */
622static enum drbd_state_rv
623drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
624 union drbd_state val, enum chg_state_flags f)
625{
626 struct completion done;
627 unsigned long flags;
628 union drbd_state os, ns;
629 enum drbd_state_rv rv;
630
631 init_completion(&done);
632
633 if (f & CS_SERIALIZE)
634 mutex_lock(&mdev->state_mutex);
635
636 spin_lock_irqsave(&mdev->req_lock, flags);
637 os = mdev->state;
638 ns.i = (os.i & ~mask.i) | val.i;
639 ns = sanitize_state(mdev, os, ns, NULL);
640
641 if (cl_wide_st_chg(mdev, os, ns)) {
642 rv = is_valid_state(mdev, ns);
643 if (rv == SS_SUCCESS)
644 rv = is_valid_state_transition(mdev, ns, os);
645 spin_unlock_irqrestore(&mdev->req_lock, flags);
646
647 if (rv < SS_SUCCESS) {
648 if (f & CS_VERBOSE)
649 print_st_err(mdev, os, ns, rv);
650 goto abort;
651 }
652
653 drbd_state_lock(mdev);
654 if (!drbd_send_state_req(mdev, mask, val)) {
655 drbd_state_unlock(mdev);
656 rv = SS_CW_FAILED_BY_PEER;
657 if (f & CS_VERBOSE)
658 print_st_err(mdev, os, ns, rv);
659 goto abort;
660 }
661
662 wait_event(mdev->state_wait,
663 (rv = _req_st_cond(mdev, mask, val)));
664
665 if (rv < SS_SUCCESS) {
666 drbd_state_unlock(mdev);
667 if (f & CS_VERBOSE)
668 print_st_err(mdev, os, ns, rv);
669 goto abort;
670 }
671 spin_lock_irqsave(&mdev->req_lock, flags);
672 os = mdev->state;
673 ns.i = (os.i & ~mask.i) | val.i;
674 rv = _drbd_set_state(mdev, ns, f, &done);
675 drbd_state_unlock(mdev);
676 } else {
677 rv = _drbd_set_state(mdev, ns, f, &done);
678 }
679
680 spin_unlock_irqrestore(&mdev->req_lock, flags);
681
682 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
683 D_ASSERT(current != mdev->worker.task);
684 wait_for_completion(&done);
685 }
686
687abort:
688 if (f & CS_SERIALIZE)
689 mutex_unlock(&mdev->state_mutex);
690
691 return rv;
692}
693
694/**
695 * _drbd_request_state() - Request a state change (with flags)
696 * @mdev: DRBD device.
697 * @mask: mask of state bits to change.
698 * @val: value of new state bits.
699 * @f: flags
700 *
701 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
702 * flag, or when logging of failed state change requests is not desired.
703 */
704enum drbd_state_rv
705_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
706 union drbd_state val, enum chg_state_flags f)
707{
708 enum drbd_state_rv rv;
709
710 wait_event(mdev->state_wait,
711 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
712
713 return rv;
714}
715
716static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
717{
718 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
719 name,
720 drbd_conn_str(ns.conn),
721 drbd_role_str(ns.role),
722 drbd_role_str(ns.peer),
723 drbd_disk_str(ns.disk),
724 drbd_disk_str(ns.pdsk),
725 is_susp(ns) ? 's' : 'r',
726 ns.aftr_isp ? 'a' : '-',
727 ns.peer_isp ? 'p' : '-',
728 ns.user_isp ? 'u' : '-'
729 );
730}
731
732void print_st_err(struct drbd_conf *mdev, union drbd_state os,
733 union drbd_state ns, enum drbd_state_rv err)
734{
735 if (err == SS_IN_TRANSIENT_STATE)
736 return;
737 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
738 print_st(mdev, " state", os);
739 print_st(mdev, "wanted", ns);
740}
741
742
743/**
744 * is_valid_state() - Returns an SS_ error code if ns is not valid
745 * @mdev: DRBD device.
746 * @ns: State to consider.
747 */
748static enum drbd_state_rv
749is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
750{
751 /* See drbd_state_sw_errors in drbd_strings.c */
752
753 enum drbd_fencing_p fp;
754 enum drbd_state_rv rv = SS_SUCCESS;
755
756 fp = FP_DONT_CARE;
757 if (get_ldev(mdev)) {
758 fp = mdev->ldev->dc.fencing;
759 put_ldev(mdev);
760 }
761
762 if (get_net_conf(mdev)) {
763 if (!mdev->net_conf->two_primaries &&
764 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
765 rv = SS_TWO_PRIMARIES;
766 put_net_conf(mdev);
767 }
768
769 if (rv <= 0)
770 /* already found a reason to abort */;
771 else if (ns.role == R_SECONDARY && mdev->open_cnt)
772 rv = SS_DEVICE_IN_USE;
773
774 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
775 rv = SS_NO_UP_TO_DATE_DISK;
776
777 else if (fp >= FP_RESOURCE &&
778 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
779 rv = SS_PRIMARY_NOP;
780
781 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
782 rv = SS_NO_UP_TO_DATE_DISK;
783
784 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
785 rv = SS_NO_LOCAL_DISK;
786
787 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
788 rv = SS_NO_REMOTE_DISK;
789
790 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
791 rv = SS_NO_UP_TO_DATE_DISK;
792
793 else if ((ns.conn == C_CONNECTED ||
794 ns.conn == C_WF_BITMAP_S ||
795 ns.conn == C_SYNC_SOURCE ||
796 ns.conn == C_PAUSED_SYNC_S) &&
797 ns.disk == D_OUTDATED)
798 rv = SS_CONNECTED_OUTDATES;
799
800 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
801 (mdev->sync_conf.verify_alg[0] == 0))
802 rv = SS_NO_VERIFY_ALG;
803
804 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
805 mdev->agreed_pro_version < 88)
806 rv = SS_NOT_SUPPORTED;
807
808 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
809 rv = SS_CONNECTED_OUTDATES;
810
811 return rv;
812}
813
814/**
815 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
816 * @mdev: DRBD device.
817 * @ns: new state.
818 * @os: old state.
819 */
820static enum drbd_state_rv
821is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
822 union drbd_state os)
823{
824 enum drbd_state_rv rv = SS_SUCCESS;
825
826 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
827 os.conn > C_CONNECTED)
828 rv = SS_RESYNC_RUNNING;
829
830 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
831 rv = SS_ALREADY_STANDALONE;
832
833 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
834 rv = SS_IS_DISKLESS;
835
836 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
837 rv = SS_NO_NET_CONFIG;
838
839 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
840 rv = SS_LOWER_THAN_OUTDATED;
841
842 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
843 rv = SS_IN_TRANSIENT_STATE;
844
845 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
846 rv = SS_IN_TRANSIENT_STATE;
847
848 /* While establishing a connection only allow cstate to change.
849 Delay/refuse role changes, detach attach etc... */
850 if (test_bit(STATE_SENT, &mdev->flags) &&
851 !(os.conn == C_WF_REPORT_PARAMS ||
852 (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
853 rv = SS_IN_TRANSIENT_STATE;
854
855 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
856 rv = SS_NEED_CONNECTION;
857
858 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
859 ns.conn != os.conn && os.conn > C_CONNECTED)
860 rv = SS_RESYNC_RUNNING;
861
862 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
863 os.conn < C_CONNECTED)
864 rv = SS_NEED_CONNECTION;
865
866 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
867 && os.conn < C_WF_REPORT_PARAMS)
868 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
869
870 return rv;
871}
872
873static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
874{
875 static const char *msg_table[] = {
876 [NO_WARNING] = "",
877 [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
878 [ABORTED_RESYNC] = "Resync aborted.",
879 [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
880 [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
881 [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
882 };
883
884 if (warn != NO_WARNING)
885 dev_warn(DEV, "%s\n", msg_table[warn]);
886}
887
888/**
889 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
890 * @mdev: DRBD device.
891 * @os: old state.
892 * @ns: new state.
893 * @warn_sync_abort:
894 *
895 * When we lose connection, we have to set the state of the peer's disk (pdsk)
896 * to D_UNKNOWN. This rule and many more along those lines are in this function.
897 */
898static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
899 union drbd_state ns, enum sanitize_state_warnings *warn)
900{
901 enum drbd_fencing_p fp;
902 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
903
904 if (warn)
905 *warn = NO_WARNING;
906
b411b363
PR
907 fp = FP_DONT_CARE;
908 if (get_ldev(mdev)) {
909 fp = mdev->ldev->dc.fencing;
910 put_ldev(mdev);
911 }
912
913 /* Disallow Network errors to configure a device's network part */
914 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
915 os.conn <= C_DISCONNECTING)
916 ns.conn = os.conn;
917
918 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
919 * If you try to go into some Sync* state, that shall fail (elsewhere). */
920 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
921 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
922 ns.conn = os.conn;
923
924 /* we cannot fail (again) if we already detached */
925 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
926 ns.disk = D_DISKLESS;
927
928 /* After C_DISCONNECTING only C_STANDALONE may follow */
929 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
930 ns.conn = os.conn;
931
932 if (ns.conn < C_CONNECTED) {
933 ns.peer_isp = 0;
934 ns.peer = R_UNKNOWN;
935 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
936 ns.pdsk = D_UNKNOWN;
937 }
938
939 /* Clear the aftr_isp when becoming unconfigured */
940 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
941 ns.aftr_isp = 0;
942
943 /* Abort resync if a disk fails/detaches */
944 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
945 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
946 if (warn)
947 *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
948 ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
949 ns.conn = C_CONNECTED;
950 }
951
952 /* Connection breaks down before we finished "Negotiating" */
953 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
954 get_ldev_if_state(mdev, D_NEGOTIATING)) {
955 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
956 ns.disk = mdev->new_state_tmp.disk;
957 ns.pdsk = mdev->new_state_tmp.pdsk;
958 } else {
959 if (warn)
960 *warn = CONNECTION_LOST_NEGOTIATING;
961 ns.disk = D_DISKLESS;
962 ns.pdsk = D_UNKNOWN;
963 }
964 put_ldev(mdev);
965 }
966
967 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
968 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
969 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
970 ns.disk = D_UP_TO_DATE;
971 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
972 ns.pdsk = D_UP_TO_DATE;
973 }
974
975 /* Implications of the connection stat on the disk states */
976 disk_min = D_DISKLESS;
977 disk_max = D_UP_TO_DATE;
978 pdsk_min = D_INCONSISTENT;
979 pdsk_max = D_UNKNOWN;
980 switch ((enum drbd_conns)ns.conn) {
981 case C_WF_BITMAP_T:
982 case C_PAUSED_SYNC_T:
983 case C_STARTING_SYNC_T:
984 case C_WF_SYNC_UUID:
985 case C_BEHIND:
986 disk_min = D_INCONSISTENT;
987 disk_max = D_OUTDATED;
988 pdsk_min = D_UP_TO_DATE;
989 pdsk_max = D_UP_TO_DATE;
990 break;
991 case C_VERIFY_S:
992 case C_VERIFY_T:
993 disk_min = D_UP_TO_DATE;
994 disk_max = D_UP_TO_DATE;
995 pdsk_min = D_UP_TO_DATE;
996 pdsk_max = D_UP_TO_DATE;
997 break;
998 case C_CONNECTED:
999 disk_min = D_DISKLESS;
1000 disk_max = D_UP_TO_DATE;
1001 pdsk_min = D_DISKLESS;
1002 pdsk_max = D_UP_TO_DATE;
1003 break;
1004 case C_WF_BITMAP_S:
1005 case C_PAUSED_SYNC_S:
1006 case C_STARTING_SYNC_S:
1007 case C_AHEAD:
1008 disk_min = D_UP_TO_DATE;
1009 disk_max = D_UP_TO_DATE;
1010 pdsk_min = D_INCONSISTENT;
1011 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
1012 break;
1013 case C_SYNC_TARGET:
1014 disk_min = D_INCONSISTENT;
1015 disk_max = D_INCONSISTENT;
1016 pdsk_min = D_UP_TO_DATE;
1017 pdsk_max = D_UP_TO_DATE;
1018 break;
1019 case C_SYNC_SOURCE:
1020 disk_min = D_UP_TO_DATE;
1021 disk_max = D_UP_TO_DATE;
1022 pdsk_min = D_INCONSISTENT;
1023 pdsk_max = D_INCONSISTENT;
1024 break;
1025 case C_STANDALONE:
1026 case C_DISCONNECTING:
1027 case C_UNCONNECTED:
1028 case C_TIMEOUT:
1029 case C_BROKEN_PIPE:
1030 case C_NETWORK_FAILURE:
1031 case C_PROTOCOL_ERROR:
1032 case C_TEAR_DOWN:
1033 case C_WF_CONNECTION:
1034 case C_WF_REPORT_PARAMS:
1035 case C_MASK:
1036 break;
1037 }
1038 if (ns.disk > disk_max)
1039 ns.disk = disk_max;
1040
1041 if (ns.disk < disk_min) {
1042 if (warn)
1043 *warn = IMPLICITLY_UPGRADED_DISK;
1044 ns.disk = disk_min;
1045 }
1046 if (ns.pdsk > pdsk_max)
1047 ns.pdsk = pdsk_max;
1048
1049 if (ns.pdsk < pdsk_min) {
1050 if (warn)
1051 *warn = IMPLICITLY_UPGRADED_PDSK;
1052 ns.pdsk = pdsk_min;
1053 }
1054
1055 if (fp == FP_STONITH &&
1056 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
1057 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
1058 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
1059
1060 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
1061 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
1062 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
1063 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
1064
1065 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
1066 if (ns.conn == C_SYNC_SOURCE)
1067 ns.conn = C_PAUSED_SYNC_S;
1068 if (ns.conn == C_SYNC_TARGET)
1069 ns.conn = C_PAUSED_SYNC_T;
1070 } else {
1071 if (ns.conn == C_PAUSED_SYNC_S)
1072 ns.conn = C_SYNC_SOURCE;
1073 if (ns.conn == C_PAUSED_SYNC_T)
1074 ns.conn = C_SYNC_TARGET;
1075 }
1076
1077 return ns;
1078}
1079
1080/* helper for __drbd_set_state */
1081static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1082{
1083 if (mdev->agreed_pro_version < 90)
1084 mdev->ov_start_sector = 0;
1085 mdev->rs_total = drbd_bm_bits(mdev);
1086 mdev->ov_position = 0;
1087 if (cs == C_VERIFY_T) {
1088 /* starting online verify from an arbitrary position
1089 * does not fit well into the existing protocol.
1090 * on C_VERIFY_T, we initialize ov_left and friends
1091 * implicitly in receive_DataRequest once the
1092 * first P_OV_REQUEST is received */
1093 mdev->ov_start_sector = ~(sector_t)0;
1094 } else {
1095 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1096 if (bit >= mdev->rs_total) {
1097 mdev->ov_start_sector =
1098 BM_BIT_TO_SECT(mdev->rs_total - 1);
1099 mdev->rs_total = 1;
1100 } else
1101 mdev->rs_total -= bit;
1102 mdev->ov_position = mdev->ov_start_sector;
1103 }
1104 mdev->ov_left = mdev->rs_total;
1105}
1106
1107static void drbd_resume_al(struct drbd_conf *mdev)
1108{
1109 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1110 dev_info(DEV, "Resumed AL updates\n");
1111}
1112
1113/**
1114 * __drbd_set_state() - Set a new DRBD state
1115 * @mdev: DRBD device.
1116 * @ns: new state.
1117 * @flags: Flags
1118 * @done: Optional completion, that will get completed after the after_state_ch() finished
1119 *
1120 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1121 */
1122enum drbd_state_rv
1123__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1124 enum chg_state_flags flags, struct completion *done)
1125{
1126 union drbd_state os;
1127 enum drbd_state_rv rv = SS_SUCCESS;
1128 enum sanitize_state_warnings ssw;
1129 struct after_state_chg_work *ascw;
1130
1131 os = mdev->state;
1132
1133 ns = sanitize_state(mdev, os, ns, &ssw);
1134
1135 if (ns.i == os.i)
1136 return SS_NOTHING_TO_DO;
1137
1138 if (!(flags & CS_HARD)) {
1139 /* pre-state-change checks ; only look at ns */
1140 /* See drbd_state_sw_errors in drbd_strings.c */
1141
1142 rv = is_valid_state(mdev, ns);
1143 if (rv < SS_SUCCESS) {
1144 /* If the old state was illegal as well, then let
1145 this happen...*/
1146
1147 if (is_valid_state(mdev, os) == rv)
1148 rv = is_valid_state_transition(mdev, ns, os);
1149 } else
1150 rv = is_valid_state_transition(mdev, ns, os);
1151 }
1152
1153 if (rv < SS_SUCCESS) {
1154 if (flags & CS_VERBOSE)
1155 print_st_err(mdev, os, ns, rv);
1156 return rv;
1157 }
1158
1159 print_sanitize_warnings(mdev, ssw);
1160
1161 {
1162 char *pbp, pb[300];
1163 pbp = pb;
1164 *pbp = 0;
1165 if (ns.role != os.role)
1166 pbp += sprintf(pbp, "role( %s -> %s ) ",
1167 drbd_role_str(os.role),
1168 drbd_role_str(ns.role));
1169 if (ns.peer != os.peer)
1170 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1171 drbd_role_str(os.peer),
1172 drbd_role_str(ns.peer));
1173 if (ns.conn != os.conn)
1174 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1175 drbd_conn_str(os.conn),
1176 drbd_conn_str(ns.conn));
1177 if (ns.disk != os.disk)
1178 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1179 drbd_disk_str(os.disk),
1180 drbd_disk_str(ns.disk));
1181 if (ns.pdsk != os.pdsk)
1182 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1183 drbd_disk_str(os.pdsk),
1184 drbd_disk_str(ns.pdsk));
1185 if (is_susp(ns) != is_susp(os))
1186 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1187 is_susp(os),
1188 is_susp(ns));
1189 if (ns.aftr_isp != os.aftr_isp)
1190 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1191 os.aftr_isp,
1192 ns.aftr_isp);
1193 if (ns.peer_isp != os.peer_isp)
1194 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1195 os.peer_isp,
1196 ns.peer_isp);
1197 if (ns.user_isp != os.user_isp)
1198 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1199 os.user_isp,
1200 ns.user_isp);
1201 dev_info(DEV, "%s\n", pb);
1202 }
1203
1204 /* solve the race between becoming unconfigured,
1205 * worker doing the cleanup, and
1206 * admin reconfiguring us:
1207 * on (re)configure, first set CONFIG_PENDING,
1208 * then wait for a potentially exiting worker,
1209 * start the worker, and schedule one no_op.
1210 * then proceed with configuration.
1211 */
1212 if (ns.disk == D_DISKLESS &&
1213 ns.conn == C_STANDALONE &&
1214 ns.role == R_SECONDARY &&
1215 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1216 set_bit(DEVICE_DYING, &mdev->flags);
1217
1218 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1219 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1220 * drbd_ldev_destroy() won't happen before our corresponding
1221 * after_state_ch works run, where we put_ldev again. */
1222 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1223 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1224 atomic_inc(&mdev->local_cnt);
1225
1226 mdev->state = ns;
1227
1228 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1229 drbd_print_uuids(mdev, "attached to UUIDs");
1230
1231 wake_up(&mdev->misc_wait);
1232 wake_up(&mdev->state_wait);
1233
1234 /* aborted verify run. log the last position */
1235 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1236 ns.conn < C_CONNECTED) {
1237 mdev->ov_start_sector =
1238 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1239 dev_info(DEV, "Online Verify reached sector %llu\n",
1240 (unsigned long long)mdev->ov_start_sector);
1241 }
1242
1243 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1244 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1245 dev_info(DEV, "Syncer continues.\n");
1246 mdev->rs_paused += (long)jiffies
1247 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1248 if (ns.conn == C_SYNC_TARGET)
1249 mod_timer(&mdev->resync_timer, jiffies);
1250 }
1251
1252 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1253 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1254 dev_info(DEV, "Resync suspended\n");
1255 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1256 }
1257
1258 if (os.conn == C_CONNECTED &&
1259 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1260 unsigned long now = jiffies;
1261 int i;
1262
1263 set_ov_position(mdev, ns.conn);
1264 mdev->rs_start = now;
1265 mdev->rs_last_events = 0;
1266 mdev->rs_last_sect_ev = 0;
1267 mdev->ov_last_oos_size = 0;
1268 mdev->ov_last_oos_start = 0;
1269
1270 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1271 mdev->rs_mark_left[i] = mdev->ov_left;
1272 mdev->rs_mark_time[i] = now;
1273 }
1274
1275 drbd_rs_controller_reset(mdev);
1276
1277 if (ns.conn == C_VERIFY_S) {
1278 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1279 (unsigned long long)mdev->ov_position);
1280 mod_timer(&mdev->resync_timer, jiffies);
1281 }
1282 }
1283
1284 if (get_ldev(mdev)) {
1285 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1286 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1287 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1288
1289 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1290 mdf |= MDF_CRASHED_PRIMARY;
1291 if (mdev->state.role == R_PRIMARY ||
1292 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1293 mdf |= MDF_PRIMARY_IND;
1294 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1295 mdf |= MDF_CONNECTED_IND;
1296 if (mdev->state.disk > D_INCONSISTENT)
1297 mdf |= MDF_CONSISTENT;
1298 if (mdev->state.disk > D_OUTDATED)
1299 mdf |= MDF_WAS_UP_TO_DATE;
1300 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1301 mdf |= MDF_PEER_OUT_DATED;
1302 if (mdf != mdev->ldev->md.flags) {
1303 mdev->ldev->md.flags = mdf;
1304 drbd_md_mark_dirty(mdev);
1305 }
1306 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1307 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1308 put_ldev(mdev);
1309 }
1310
1311 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1312 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1313 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1314 set_bit(CONSIDER_RESYNC, &mdev->flags);
1315
1316 /* Receiver should clean up itself */
1317 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1318 drbd_thread_stop_nowait(&mdev->receiver);
1319
1320 /* Now the receiver finished cleaning up itself, it should die */
1321 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1322 drbd_thread_stop_nowait(&mdev->receiver);
1323
1324 /* Upon network failure, we need to restart the receiver. */
1325 if (os.conn > C_WF_CONNECTION &&
1326 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1327 drbd_thread_restart_nowait(&mdev->receiver);
1328
1329 /* Resume AL writing if we get a connection */
1330 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1331 drbd_resume_al(mdev);
1332
1333 /* remember last connect and attach times so request_timer_fn() won't
1334 * kill newly established sessions while we are still trying to thaw
1335 * previously frozen IO */
1336 if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
1337 mdev->last_reconnect_jif = jiffies;
1338 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1339 ns.disk > D_NEGOTIATING)
1340 mdev->last_reattach_jif = jiffies;
1341
1342 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1343 if (ascw) {
1344 ascw->os = os;
1345 ascw->ns = ns;
1346 ascw->flags = flags;
1347 ascw->w.cb = w_after_state_ch;
1348 ascw->done = done;
1349 drbd_queue_work(&mdev->data.work, &ascw->w);
1350 } else {
1351 dev_warn(DEV, "Could not kmalloc an ascw\n");
1352 }
1353
1354 return rv;
1355}
1356
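/* Worker callback: run the after_state_ch() actions queued by __drbd_set_state()
 * and, for CS_WAIT_COMPLETE, complete the waiter. */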
1357static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1358{
1359 struct after_state_chg_work *ascw =
1360 container_of(w, struct after_state_chg_work, w);
1361 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1362 if (ascw->flags & CS_WAIT_COMPLETE) {
1363 D_ASSERT(ascw->done != NULL);
1364 complete(ascw->done);
1365 }
1366 kfree(ascw);
1367
1368 return 1;
1369}
1370
1371static void abw_start_sync(struct drbd_conf *mdev, int rv)
1372{
1373 if (rv) {
1374 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1375 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1376 return;
1377 }
1378
1379 switch (mdev->state.conn) {
1380 case C_STARTING_SYNC_T:
1381 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1382 break;
1383 case C_STARTING_SYNC_S:
1384 drbd_start_resync(mdev, C_SYNC_SOURCE);
1385 break;
1386 }
1387}
1388
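/* Run a bitmap IO operation synchronously from worker context:
 * suspend application IO (non-blocking variant of drbd_suspend_io()),
 * take the bitmap lock, run io_fn(), unlock and resume IO again. */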
1389int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1390 int (*io_fn)(struct drbd_conf *),
1391 char *why, enum bm_flag flags)
1392{
1393 int rv;
1394
1395 D_ASSERT(current == mdev->worker.task);
1396
1397 /* open coded non-blocking drbd_suspend_io(mdev); */
1398 set_bit(SUSPEND_IO, &mdev->flags);
1399
1400 drbd_bm_lock(mdev, why, flags);
1401 rv = io_fn(mdev);
1402 drbd_bm_unlock(mdev);
1403
1404 drbd_resume_io(mdev);
1405
1406 return rv;
1407}
1408
1409/**
1410 * after_state_ch() - Perform after state change actions that may sleep
1411 * @mdev: DRBD device.
1412 * @os: old state.
1413 * @ns: new state.
1414 * @flags: Flags
1415 */
1416static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1417 union drbd_state ns, enum chg_state_flags flags)
1418{
1419 enum drbd_fencing_p fp;
1420 enum drbd_req_event what = nothing;
1421 union drbd_state nsm = (union drbd_state){ .i = -1 };
1422
1423 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1424 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1425 if (mdev->p_uuid)
1426 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1427 }
1428
1429 fp = FP_DONT_CARE;
1430 if (get_ldev(mdev)) {
1431 fp = mdev->ldev->dc.fencing;
1432 put_ldev(mdev);
1433 }
1434
1435 /* Inform userspace about the change... */
1436 drbd_bcast_state(mdev, ns);
1437
1438 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1439 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1440 drbd_khelper(mdev, "pri-on-incon-degr");
1441
1442 /* Here we have the actions that are performed after a
1443 state change. This function might sleep */
1444
1445 if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
1446 mod_timer(&mdev->request_timer, jiffies + HZ);
1447
1448 nsm.i = -1;
1449 if (ns.susp_nod) {
1450 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1451 what = resend;
1452
1453 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1454 ns.disk > D_NEGOTIATING)
1455 what = restart_frozen_disk_io;
1456
1457 if (what != nothing)
1458 nsm.susp_nod = 0;
1459 }
1460
1461 if (ns.susp_fen) {
1462 /* case1: The outdate peer handler is successful: */
1463 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
1464 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1465 drbd_uuid_new_current(mdev);
1466 clear_bit(NEW_CUR_UUID, &mdev->flags);
1467 }
1468 spin_lock_irq(&mdev->req_lock);
1469 _tl_clear(mdev);
1470 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1471 spin_unlock_irq(&mdev->req_lock);
1472 }
1473 /* case2: The connection was established again: */
1474 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1475 clear_bit(NEW_CUR_UUID, &mdev->flags);
1476 what = resend;
1477 nsm.susp_fen = 0;
1478 }
1479 }
1480
1481 if (what != nothing) {
1482 spin_lock_irq(&mdev->req_lock);
1483 _tl_restart(mdev, what);
1484 nsm.i &= mdev->state.i;
1485 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1486 spin_unlock_irq(&mdev->req_lock);
1487 }
1488
1489 /* Became sync source. With protocol >= 96, we still need to send out
1490 * the sync uuid now. Need to do that before any drbd_send_state, or
1491 * the other side may go "paused sync" before receiving the sync uuids,
1492 * which is unexpected. */
1493 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1494 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1495 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1496 drbd_gen_and_send_sync_uuid(mdev);
1497 put_ldev(mdev);
1498 }
1499
1500 /* Do not change the order of the if above and the two below... */
1501 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1502 /* we probably will start a resync soon.
1503 * make sure those things are properly reset. */
1504 mdev->rs_total = 0;
1505 mdev->rs_failed = 0;
1506 atomic_set(&mdev->rs_pending_cnt, 0);
1507 drbd_rs_cancel_all(mdev);
1508
1509 drbd_send_uuids(mdev);
1510 drbd_send_state(mdev, ns);
1511 }
1512 /* No point in queuing send_bitmap if we don't have a connection
1513 * anymore, so check also the _current_ state, not only the new state
1514 * at the time this work was queued. */
1515 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1516 mdev->state.conn == C_WF_BITMAP_S)
1517 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1518 "send_bitmap (WFBitMapS)",
1519 BM_LOCKED_TEST_ALLOWED);
1520
1521 /* Lost contact to peer's copy of the data */
1522 if ((os.pdsk >= D_INCONSISTENT &&
1523 os.pdsk != D_UNKNOWN &&
1524 os.pdsk != D_OUTDATED)
1525 && (ns.pdsk < D_INCONSISTENT ||
1526 ns.pdsk == D_UNKNOWN ||
1527 ns.pdsk == D_OUTDATED)) {
1528 if (get_ldev(mdev)) {
1529 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1530 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1531 if (is_susp(mdev->state)) {
1532 set_bit(NEW_CUR_UUID, &mdev->flags);
1533 } else {
1534 drbd_uuid_new_current(mdev);
1535 drbd_send_uuids(mdev);
1536 }
1537 }
1538 put_ldev(mdev);
1539 }
1540 }
1541
1542 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1543 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
1544 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1545 drbd_uuid_new_current(mdev);
1546 drbd_send_uuids(mdev);
1547 }
1548 /* D_DISKLESS Peer becomes secondary */
1549 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1550 /* We may still be Primary ourselves.
1551 * No harm done if the bitmap still changes,
1552 * redirtied pages will follow later. */
1553 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1554 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
1555 put_ldev(mdev);
1556 }
1557
1558 /* Write out all changed bits on demote.
1559 * Though, no need to do that just yet
1560 * if there is a resync going on still */
1561 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1562 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1563 /* No changes to the bitmap expected this time, so assert that,
1564 * even though no harm was done if it did change. */
1565 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1566 "demote", BM_LOCKED_TEST_ALLOWED);
1567 put_ldev(mdev);
1568 }
1569
1570 /* Last part of the attaching process ... */
1571 if (ns.conn >= C_CONNECTED &&
1572 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1573 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
1574 drbd_send_uuids(mdev);
1575 drbd_send_state(mdev, ns);
1576 }
1577
1578 /* We want to pause/continue resync, tell peer. */
1579 if (ns.conn >= C_CONNECTED &&
1580 ((os.aftr_isp != ns.aftr_isp) ||
1581 (os.user_isp != ns.user_isp)))
1582 drbd_send_state(mdev, ns);
1583
1584 /* In case one of the isp bits got set, suspend other devices. */
1585 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1586 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1587 suspend_other_sg(mdev);
1588
1589 /* Make sure the peer gets informed about eventual state
1590 changes (ISP bits) while we were in WFReportParams. */
1591 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1592 drbd_send_state(mdev, ns);
1593
1594 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1595 drbd_send_state(mdev, ns);
1596
1597 /* We are in the progress to start a full sync... */
1598 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1599 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1600 /* no other bitmap changes expected during this phase */
1601 drbd_queue_bitmap_io(mdev,
1602 &drbd_bmio_set_n_write, &abw_start_sync,
1603 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1604
1605 /* We are invalidating our self... */
1606 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1607 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1608 /* other bitmap operation expected during this phase */
1609 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1610 "set_n_write from invalidate", BM_LOCKED_MASK);
1611
1612 /* first half of local IO error, failure to attach,
1613 * or administrative detach */
1614 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1615 enum drbd_io_error_p eh = EP_PASS_ON;
1616 int was_io_error = 0;
1617 /* corresponding get_ldev was in __drbd_set_state, to serialize
1618 * our cleanup here with the transition to D_DISKLESS.
1619 * But it is still not safe to dereference ldev here, since
1620 * we might come from a failed Attach before ldev was set. */
1621 if (mdev->ldev) {
1622 eh = mdev->ldev->dc.on_io_error;
1623 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1624
1625 if (was_io_error && eh == EP_CALL_HELPER)
1626 drbd_khelper(mdev, "local-io-error");
1627
1628 /* Immediately allow completion of all application IO,
1629 * that waits for completion from the local disk,
1630 * if this was a force-detach due to disk_timeout
1631 * or administrator request (drbdsetup detach --force).
1632 * Do NOT abort otherwise.
1633 * Aborting local requests may cause serious problems,
1634 * if requests are completed to upper layers already,
1635 * and then later the already submitted local bio completes.
1636 * This can cause DMA into former bio pages that meanwhile
1637 * have been re-used for other things.
1638 * So aborting local requests may cause crashes,
1639 * or even worse, silent data corruption.
1640 */
1641 if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
1642 tl_abort_disk_io(mdev);
1643
1644 /* current state still has to be D_FAILED,
1645 * there is only one way out: to D_DISKLESS,
1646 * and that may only happen after our put_ldev below. */
1647 if (mdev->state.disk != D_FAILED)
1648 dev_err(DEV,
1649 "ASSERT FAILED: disk is %s during detach\n",
1650 drbd_disk_str(mdev->state.disk));
1651
1652 if (ns.conn >= C_CONNECTED)
1653 drbd_send_state(mdev, ns);
1654
1655 drbd_rs_cancel_all(mdev);
1656
1657 /* In case we want to get something to stable storage still,
1658 * this may be the last chance.
1659 * Following put_ldev may transition to D_DISKLESS. */
1660 drbd_md_sync(mdev);
1661 }
82f59cc6 1662 put_ldev(mdev);
e9e6f3ec 1663 }
b411b363 1664
82f59cc6
LE
1665 /* second half of local IO error, failure to attach,
1666 * or administrative detach,
1667 * after local_cnt references have reached zero again */
1668 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1669 /* We must still be diskless,
1670 * re-attach has to be serialized with this! */
1671 if (mdev->state.disk != D_DISKLESS)
1672 dev_err(DEV,
1673 "ASSERT FAILED: disk is %s while going diskless\n",
1674 drbd_disk_str(mdev->state.disk));
e9e6f3ec 1675
4afc433c
PR
1676 if (ns.conn >= C_CONNECTED)
1677 drbd_send_state(mdev, ns);
1678
82f59cc6 1679 /* corresponding get_ldev in __drbd_set_state
25985edc 1680 * this may finally trigger drbd_ldev_destroy. */
82f59cc6 1681 put_ldev(mdev);
b411b363
PR
1682 }
1683
738a84b2 1685	/* Notify peer that I had a local IO error and did not detach. */
4afc433c 1685 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
f479ea06 1686 drbd_send_state(mdev, ns);
738a84b2 1687
b411b363
PR
1688 /* Disks got bigger while they were detached */
1689 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1690 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1691 if (ns.conn == C_CONNECTED)
1692 resync_after_online_grow(mdev);
1693 }
1694
1695 /* A resync finished or aborted, wake paused devices... */
1696 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1697 (os.peer_isp && !ns.peer_isp) ||
1698 (os.user_isp && !ns.user_isp))
1699 resume_next_sg(mdev);
1700
af85e8e8
LE
1701 /* sync target done with resync. Explicitly notify peer, even though
1702 * it should (at least for non-empty resyncs) already know itself. */
1703 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
f479ea06 1704 drbd_send_state(mdev, ns);
af85e8e8 1705
197296ff
PR
 1706	/* Wake up role changes that were delayed while the connection was being established */
1707 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
1708 clear_bit(STATE_SENT, &mdev->flags);
1709 wake_up(&mdev->state_wait);
1710 }
1711
79a30d2d
LE
1712 /* This triggers bitmap writeout of potentially still unwritten pages
1713 * if the resync finished cleanly, or aborted because of peer disk
20ceb2b2 1714 * failure, or because of connection loss.
79a30d2d
LE
1715 * For resync aborted because of local disk failure, we cannot do
1716 * any bitmap writeout anymore.
20ceb2b2 1717 * No harm done if some bits change during this phase.
79a30d2d 1718 */
20ceb2b2 1719 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
0e8488ad
LE
1720 drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
1721 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
79a30d2d
LE
1722 put_ldev(mdev);
1723 }
02851e9f 1724
f70b3511 1725	/* free tl_hash if we got thawed and are C_STANDALONE */
fb22c402 1726 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
f70b3511
PR
1727 drbd_free_tl_hash(mdev);
1728
b411b363
PR
1729 /* Upon network connection, we need to start the receiver */
1730 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1731 drbd_thread_start(&mdev->receiver);
1732
1733 /* Terminate worker thread if we are unconfigured - it will be
1734 restarted as needed... */
1735 if (ns.disk == D_DISKLESS &&
1736 ns.conn == C_STANDALONE &&
1737 ns.role == R_SECONDARY) {
1738 if (os.aftr_isp != ns.aftr_isp)
1739 resume_next_sg(mdev);
1740 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1741 if (test_bit(DEVICE_DYING, &mdev->flags))
1742 drbd_thread_stop_nowait(&mdev->worker);
1743 }
1744
1745 drbd_md_sync(mdev);
1746}
1747
1748
1749static int drbd_thread_setup(void *arg)
1750{
1751 struct drbd_thread *thi = (struct drbd_thread *) arg;
1752 struct drbd_conf *mdev = thi->mdev;
1753 unsigned long flags;
1754 int retval;
1755
1756restart:
1757 retval = thi->function(thi);
1758
1759 spin_lock_irqsave(&thi->t_lock, flags);
1760
1761 /* if the receiver has been "Exiting", the last thing it did
1762 * was set the conn state to "StandAlone",
1763 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1764 * and receiver thread will be "started".
1765 * drbd_thread_start needs to set "Restarting" in that case.
1766 * t_state check and assignment needs to be within the same spinlock,
1767 * so either thread_start sees Exiting, and can remap to Restarting,
 1768	 * or thread_start sees None, and can proceed as normal.
1769 */
1770
1771 if (thi->t_state == Restarting) {
1772 dev_info(DEV, "Restarting %s\n", current->comm);
1773 thi->t_state = Running;
1774 spin_unlock_irqrestore(&thi->t_lock, flags);
1775 goto restart;
1776 }
1777
1778 thi->task = NULL;
1779 thi->t_state = None;
1780 smp_mb();
1781 complete(&thi->stop);
1782 spin_unlock_irqrestore(&thi->t_lock, flags);
1783
1784 dev_info(DEV, "Terminating %s\n", current->comm);
1785
1786 /* Release mod reference taken when thread was started */
1787 module_put(THIS_MODULE);
1788 return retval;
1789}
1790
1791static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1792 int (*func) (struct drbd_thread *))
1793{
1794 spin_lock_init(&thi->t_lock);
1795 thi->task = NULL;
1796 thi->t_state = None;
1797 thi->function = func;
1798 thi->mdev = mdev;
1799}
1800
1801int drbd_thread_start(struct drbd_thread *thi)
1802{
1803 struct drbd_conf *mdev = thi->mdev;
1804 struct task_struct *nt;
1805 unsigned long flags;
1806
1807 const char *me =
1808 thi == &mdev->receiver ? "receiver" :
1809 thi == &mdev->asender ? "asender" :
1810 thi == &mdev->worker ? "worker" : "NONSENSE";
1811
1812 /* is used from state engine doing drbd_thread_stop_nowait,
1813 * while holding the req lock irqsave */
1814 spin_lock_irqsave(&thi->t_lock, flags);
1815
1816 switch (thi->t_state) {
1817 case None:
1818 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1819 me, current->comm, current->pid);
1820
1821 /* Get ref on module for thread - this is released when thread exits */
1822 if (!try_module_get(THIS_MODULE)) {
1823 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1824 spin_unlock_irqrestore(&thi->t_lock, flags);
81e84650 1825 return false;
b411b363
PR
1826 }
1827
1828 init_completion(&thi->stop);
1829 D_ASSERT(thi->task == NULL);
1830 thi->reset_cpu_mask = 1;
1831 thi->t_state = Running;
1832 spin_unlock_irqrestore(&thi->t_lock, flags);
1833 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1834
1835 nt = kthread_create(drbd_thread_setup, (void *) thi,
1836 "drbd%d_%s", mdev_to_minor(mdev), me);
1837
1838 if (IS_ERR(nt)) {
1839 dev_err(DEV, "Couldn't start thread\n");
1840
1841 module_put(THIS_MODULE);
81e84650 1842 return false;
b411b363
PR
1843 }
1844 spin_lock_irqsave(&thi->t_lock, flags);
1845 thi->task = nt;
1846 thi->t_state = Running;
1847 spin_unlock_irqrestore(&thi->t_lock, flags);
1848 wake_up_process(nt);
1849 break;
1850 case Exiting:
1851 thi->t_state = Restarting;
1852 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1853 me, current->comm, current->pid);
1854 /* fall through */
1855 case Running:
1856 case Restarting:
1857 default:
1858 spin_unlock_irqrestore(&thi->t_lock, flags);
1859 break;
1860 }
1861
81e84650 1862 return true;
b411b363
PR
1863}
1864
1865
1866void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1867{
1868 unsigned long flags;
1869
1870 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1871
1872 /* may be called from state engine, holding the req lock irqsave */
1873 spin_lock_irqsave(&thi->t_lock, flags);
1874
1875 if (thi->t_state == None) {
1876 spin_unlock_irqrestore(&thi->t_lock, flags);
1877 if (restart)
1878 drbd_thread_start(thi);
1879 return;
1880 }
1881
1882 if (thi->t_state != ns) {
1883 if (thi->task == NULL) {
1884 spin_unlock_irqrestore(&thi->t_lock, flags);
1885 return;
1886 }
1887
1888 thi->t_state = ns;
1889 smp_mb();
1890 init_completion(&thi->stop);
1891 if (thi->task != current)
1892 force_sig(DRBD_SIGKILL, thi->task);
1893
1894 }
1895
1896 spin_unlock_irqrestore(&thi->t_lock, flags);
1897
1898 if (wait)
1899 wait_for_completion(&thi->stop);
1900}
1901
1902#ifdef CONFIG_SMP
1903/**
1904 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1905 * @mdev: DRBD device.
1906 *
1907 * Forces all threads of a device onto the same CPU. This is beneficial for
 1908	 * DRBD's performance. May be overridden by the user's configuration.
1909 */
1910void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1911{
1912 int ord, cpu;
1913
1914 /* user override. */
1915 if (cpumask_weight(mdev->cpu_mask))
1916 return;
1917
1918 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1919 for_each_online_cpu(cpu) {
1920 if (ord-- == 0) {
1921 cpumask_set_cpu(cpu, mdev->cpu_mask);
1922 return;
1923 }
1924 }
1925 /* should not be reached */
1926 cpumask_setall(mdev->cpu_mask);
1927}
1928
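As a rough illustration of the selection rule above (device minor modulo the number of online CPUs picks one CPU index), a userspace-only sketch; pick_cpu() and the CPU count of 4 are made up for the example:

#include <stdio.h>

/* illustrative only: same round-robin idea as drbd_calc_cpu_mask() */
static int pick_cpu(int minor, int ncpus)
{
	return ncpus > 0 ? minor % ncpus : 0;
}

int main(void)
{
	int minor;

	for (minor = 0; minor < 6; minor++)
		printf("drbd%d -> cpu %d\n", minor, pick_cpu(minor, 4));
	return 0;
}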
1929/**
1930 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1931 * @mdev: DRBD device.
1932 *
1933 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1934 * prematurely.
1935 */
1936void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1937{
1938 struct task_struct *p = current;
1939 struct drbd_thread *thi =
1940 p == mdev->asender.task ? &mdev->asender :
1941 p == mdev->receiver.task ? &mdev->receiver :
1942 p == mdev->worker.task ? &mdev->worker :
1943 NULL;
1944 ERR_IF(thi == NULL)
1945 return;
1946 if (!thi->reset_cpu_mask)
1947 return;
1948 thi->reset_cpu_mask = 0;
1949 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1950}
1951#endif
1952
1953/* the appropriate socket mutex must be held already */
1954int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
0b70a13d 1955 enum drbd_packets cmd, struct p_header80 *h,
b411b363
PR
1956 size_t size, unsigned msg_flags)
1957{
1958 int sent, ok;
1959
81e84650
AG
1960 ERR_IF(!h) return false;
1961 ERR_IF(!size) return false;
b411b363
PR
1962
1963 h->magic = BE_DRBD_MAGIC;
1964 h->command = cpu_to_be16(cmd);
0b70a13d 1965 h->length = cpu_to_be16(size-sizeof(struct p_header80));
b411b363 1966
b411b363
PR
1967 sent = drbd_send(mdev, sock, h, size, msg_flags);
1968
1969 ok = (sent == size);
0ddc5549
LE
1970 if (!ok && !signal_pending(current))
1971 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
b411b363
PR
1972 cmdname(cmd), (int)size, sent);
1973 return ok;
1974}
1975
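As a side note, a minimal userspace sketch of how the length field set above relates to the overall packet size; the struct layout (32-bit magic, 16-bit command, 16-bit payload length, all big endian) is inferred from the byte-order conversions in _drbd_send_cmd(), and the magic/command values below are placeholders, not the real constants:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

/* assumed on-wire layout for the sketch only */
struct hdr80 {
	uint32_t magic;
	uint16_t command;
	uint16_t length;	/* payload bytes following the header */
} __attribute__((packed));

int main(void)
{
	unsigned char packet[32];	/* header + payload */
	struct hdr80 h;

	h.magic   = htonl(0x12345678);	/* placeholder, not the real magic */
	h.command = htons(1);		/* placeholder command code */
	h.length  = htons((uint16_t)(sizeof(packet) - sizeof(h)));

	memcpy(packet, &h, sizeof(h));
	printf("total %zu bytes, header %zu, payload on the wire %u\n",
	       sizeof(packet), sizeof(h), ntohs(h.length));
	return 0;
}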
1976/* don't pass the socket. we may only look at it
1977 * when we hold the appropriate socket mutex.
1978 */
1979int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
0b70a13d 1980 enum drbd_packets cmd, struct p_header80 *h, size_t size)
b411b363
PR
1981{
1982 int ok = 0;
1983 struct socket *sock;
1984
1985 if (use_data_socket) {
1986 mutex_lock(&mdev->data.mutex);
1987 sock = mdev->data.socket;
1988 } else {
1989 mutex_lock(&mdev->meta.mutex);
1990 sock = mdev->meta.socket;
1991 }
1992
1993 /* drbd_disconnect() could have called drbd_free_sock()
1994 * while we were waiting in down()... */
1995 if (likely(sock != NULL))
1996 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1997
1998 if (use_data_socket)
1999 mutex_unlock(&mdev->data.mutex);
2000 else
2001 mutex_unlock(&mdev->meta.mutex);
2002 return ok;
2003}
2004
2005int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
2006 size_t size)
2007{
0b70a13d 2008 struct p_header80 h;
b411b363
PR
2009 int ok;
2010
2011 h.magic = BE_DRBD_MAGIC;
2012 h.command = cpu_to_be16(cmd);
2013 h.length = cpu_to_be16(size);
2014
2015 if (!drbd_get_data_sock(mdev))
2016 return 0;
2017
b411b363
PR
2018 ok = (sizeof(h) ==
2019 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
2020 ok = ok && (size ==
2021 drbd_send(mdev, mdev->data.socket, data, size, 0));
2022
2023 drbd_put_data_sock(mdev);
2024
2025 return ok;
2026}
2027
2028int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
2029{
8e26f9cc 2030 struct p_rs_param_95 *p;
b411b363
PR
2031 struct socket *sock;
2032 int size, rv;
2033 const int apv = mdev->agreed_pro_version;
2034
2035 size = apv <= 87 ? sizeof(struct p_rs_param)
2036 : apv == 88 ? sizeof(struct p_rs_param)
2037 + strlen(mdev->sync_conf.verify_alg) + 1
8e26f9cc
PR
2038 : apv <= 94 ? sizeof(struct p_rs_param_89)
2039 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363
PR
2040
2041 /* used from admin command context and receiver/worker context.
2042 * to avoid kmalloc, grab the socket right here,
2043 * then use the pre-allocated sbuf there */
2044 mutex_lock(&mdev->data.mutex);
2045 sock = mdev->data.socket;
2046
2047 if (likely(sock != NULL)) {
2048 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
2049
8e26f9cc 2050 p = &mdev->data.sbuf.rs_param_95;
b411b363
PR
2051
2052 /* initialize verify_alg and csums_alg */
2053 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2054
2055 p->rate = cpu_to_be32(sc->rate);
8e26f9cc
PR
2056 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
2057 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
2058 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
2059 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
b411b363
PR
2060
2061 if (apv >= 88)
2062 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
2063 if (apv >= 89)
2064 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
2065
2066 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
2067 } else
2068 rv = 0; /* not ok */
2069
2070 mutex_unlock(&mdev->data.mutex);
2071
2072 return rv;
2073}
2074
2075int drbd_send_protocol(struct drbd_conf *mdev)
2076{
2077 struct p_protocol *p;
cf14c2e9 2078 int size, cf, rv;
b411b363
PR
2079
2080 size = sizeof(struct p_protocol);
2081
2082 if (mdev->agreed_pro_version >= 87)
2083 size += strlen(mdev->net_conf->integrity_alg) + 1;
2084
2085 /* we must not recurse into our own queue,
2086 * as that is blocked during handshake */
2087 p = kmalloc(size, GFP_NOIO);
2088 if (p == NULL)
2089 return 0;
2090
2091 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2092 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2093 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2094 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
b411b363
PR
2095 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2096
cf14c2e9
PR
2097 cf = 0;
2098 if (mdev->net_conf->want_lose)
2099 cf |= CF_WANT_LOSE;
2100 if (mdev->net_conf->dry_run) {
2101 if (mdev->agreed_pro_version >= 92)
2102 cf |= CF_DRY_RUN;
2103 else {
2104 dev_err(DEV, "--dry-run is not supported by peer");
7ac314c8 2105 kfree(p);
148efa16 2106 return -1;
cf14c2e9
PR
2107 }
2108 }
2109 p->conn_flags = cpu_to_be32(cf);
2110
b411b363
PR
2111 if (mdev->agreed_pro_version >= 87)
2112 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2113
2114 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
0b70a13d 2115 (struct p_header80 *)p, size);
b411b363
PR
2116 kfree(p);
2117 return rv;
2118}
2119
2120int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2121{
2122 struct p_uuids p;
2123 int i;
2124
2125 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2126 return 1;
2127
9f2247bb 2128 spin_lock_irq(&mdev->ldev->md.uuid_lock);
b411b363
PR
2129 for (i = UI_CURRENT; i < UI_SIZE; i++)
2130 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
9f2247bb 2131 spin_unlock_irq(&mdev->ldev->md.uuid_lock);
b411b363
PR
2132
2133 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2134 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2135 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2136 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2137 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2138 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2139
2140 put_ldev(mdev);
2141
2142 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
0b70a13d 2143 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2144}
2145
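For reference, a toy decoder for the UI_FLAGS word assembled above; the bit meanings (1 = want_lose, 2 = crashed primary, 4 = disk was inconsistent) are taken directly from the assignments in _drbd_send_uuids(), and the example value is arbitrary:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t uuid_flags = 2 | 4;	/* arbitrary example value */

	printf("want_lose set:         %s\n", (uuid_flags & 1) ? "yes" : "no");
	printf("crashed primary:       %s\n", (uuid_flags & 2) ? "yes" : "no");
	printf("disk was inconsistent: %s\n", (uuid_flags & 4) ? "yes" : "no");
	return 0;
}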
2146int drbd_send_uuids(struct drbd_conf *mdev)
2147{
2148 return _drbd_send_uuids(mdev, 0);
2149}
2150
2151int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2152{
2153 return _drbd_send_uuids(mdev, 8);
2154}
2155
62b0da3a
LE
2156void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2157{
2158 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2159 u64 *uuid = mdev->ldev->md.uuid;
2160 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2161 text,
2162 (unsigned long long)uuid[UI_CURRENT],
2163 (unsigned long long)uuid[UI_BITMAP],
2164 (unsigned long long)uuid[UI_HISTORY_START],
2165 (unsigned long long)uuid[UI_HISTORY_END]);
2166 put_ldev(mdev);
2167 } else {
2168 dev_info(DEV, "%s effective data uuid: %016llX\n",
2169 text,
2170 (unsigned long long)mdev->ed_uuid);
2171 }
2172}
2173
5a22db89 2174int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
b411b363
PR
2175{
2176 struct p_rs_uuid p;
5a22db89
LE
2177 u64 uuid;
2178
2179 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
b411b363 2180
5ba3dac5
PR
2181 uuid = mdev->ldev->md.uuid[UI_BITMAP];
2182 if (uuid && uuid != UUID_JUST_CREATED)
2183 uuid = uuid + UUID_NEW_BM_OFFSET;
2184 else
2185 get_random_bytes(&uuid, sizeof(u64));
5a22db89 2186 drbd_uuid_set(mdev, UI_BITMAP, uuid);
62b0da3a 2187 drbd_print_uuids(mdev, "updated sync UUID");
5a22db89
LE
2188 drbd_md_sync(mdev);
2189 p.uuid = cpu_to_be64(uuid);
b411b363
PR
2190
2191 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
0b70a13d 2192 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2193}
2194
e89b591c 2195int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
b411b363
PR
2196{
2197 struct p_sizes p;
2198 sector_t d_size, u_size;
db141b2f
LE
2199 int q_order_type;
2200 unsigned int max_bio_size;
b411b363
PR
2201 int ok;
2202
2203 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2204 D_ASSERT(mdev->ldev->backing_bdev);
2205 d_size = drbd_get_max_capacity(mdev->ldev);
2206 u_size = mdev->ldev->dc.disk_size;
2207 q_order_type = drbd_queue_order_type(mdev);
99432fcc 2208 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
db141b2f 2209 max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
b411b363
PR
2210 put_ldev(mdev);
2211 } else {
2212 d_size = 0;
2213 u_size = 0;
2214 q_order_type = QUEUE_ORDERED_NONE;
99432fcc 2215 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
b411b363
PR
2216 }
2217
6809384c
PR
2218 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2219 if (mdev->agreed_pro_version <= 94)
db141b2f 2220 max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
6809384c 2221
b411b363
PR
2222 p.d_size = cpu_to_be64(d_size);
2223 p.u_size = cpu_to_be64(u_size);
2224 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
99432fcc 2225 p.max_bio_size = cpu_to_be32(max_bio_size);
e89b591c
PR
2226 p.queue_order_type = cpu_to_be16(q_order_type);
2227 p.dds_flags = cpu_to_be16(flags);
b411b363
PR
2228
2229 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
0b70a13d 2230 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2231 return ok;
2232}
2233
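A standalone sketch of the clamping chain above (backing-device limit, then the DRBD maximum, then the 32KiB cap when the agreed protocol version is 94 or older); the numeric limits used here are stand-ins, not the real DRBD_MAX_* constants:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned int hw_limit  = 256 * 1024;	/* example backing queue limit */
	unsigned int drbd_max  = 128 * 1024;	/* stand-in for DRBD_MAX_BIO_SIZE */
	unsigned int h80_limit = 32 * 1024;	/* stand-in for DRBD_MAX_SIZE_H80_PACKET */
	int agreed_pro_version = 94;
	unsigned int max_bio_size;

	max_bio_size = MIN(hw_limit, drbd_max);
	if (agreed_pro_version <= 94)		/* old peers only take 32KiB requests */
		max_bio_size = MIN(max_bio_size, h80_limit);

	printf("announced max_bio_size: %u bytes\n", max_bio_size);
	return 0;
}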
2234/**
f479ea06 2235 * drbd_send_current_state() - Sends the drbd state to the peer
b411b363
PR
2236 * @mdev: DRBD device.
2237 */
f479ea06 2238int drbd_send_current_state(struct drbd_conf *mdev)
b411b363
PR
2239{
2240 struct socket *sock;
2241 struct p_state p;
2242 int ok = 0;
2243
 2244	/* Grab state lock so we won't send state if we're in the middle
2245 * of a cluster wide state change on another thread */
2246 drbd_state_lock(mdev);
2247
2248 mutex_lock(&mdev->data.mutex);
2249
2250 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2251 sock = mdev->data.socket;
2252
2253 if (likely(sock != NULL)) {
2254 ok = _drbd_send_cmd(mdev, sock, P_STATE,
0b70a13d 2255 (struct p_header80 *)&p, sizeof(p), 0);
b411b363
PR
2256 }
2257
2258 mutex_unlock(&mdev->data.mutex);
2259
2260 drbd_state_unlock(mdev);
2261 return ok;
2262}
2263
f479ea06
LE
2264/**
2265 * drbd_send_state() - After a state change, sends the new state to the peer
2266 * @mdev: DRBD device.
2267 * @state: the state to send, not necessarily the current state.
2268 *
2269 * Each state change queues an "after_state_ch" work, which will eventually
2270 * send the resulting new state to the peer. If more state changes happen
2271 * between queuing and processing of the after_state_ch work, we still
2272 * want to send each intermediary state in the order it occurred.
2273 */
2274int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
2275{
2276 struct socket *sock;
2277 struct p_state p;
2278 int ok = 0;
2279
2280 mutex_lock(&mdev->data.mutex);
2281
2282 p.state = cpu_to_be32(state.i);
2283 sock = mdev->data.socket;
2284
2285 if (likely(sock != NULL)) {
2286 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2287 (struct p_header80 *)&p, sizeof(p), 0);
2288 }
2289
2290 mutex_unlock(&mdev->data.mutex);
2291
2292 return ok;
2293}
2294
b411b363
PR
2295int drbd_send_state_req(struct drbd_conf *mdev,
2296 union drbd_state mask, union drbd_state val)
2297{
2298 struct p_req_state p;
2299
2300 p.mask = cpu_to_be32(mask.i);
2301 p.val = cpu_to_be32(val.i);
2302
2303 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
0b70a13d 2304 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2305}
2306
bf885f8a 2307int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
b411b363
PR
2308{
2309 struct p_req_state_reply p;
2310
2311 p.retcode = cpu_to_be32(retcode);
2312
2313 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
0b70a13d 2314 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2315}
2316
2317int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2318 struct p_compressed_bm *p,
2319 struct bm_xfer_ctx *c)
2320{
2321 struct bitstream bs;
2322 unsigned long plain_bits;
2323 unsigned long tmp;
2324 unsigned long rl;
2325 unsigned len;
2326 unsigned toggle;
2327 int bits;
2328
2329 /* may we use this feature? */
2330 if ((mdev->sync_conf.use_rle == 0) ||
2331 (mdev->agreed_pro_version < 90))
2332 return 0;
2333
2334 if (c->bit_offset >= c->bm_bits)
2335 return 0; /* nothing to do. */
2336
2337 /* use at most thus many bytes */
2338 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2339 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2340 /* plain bits covered in this code string */
2341 plain_bits = 0;
2342
2343 /* p->encoding & 0x80 stores whether the first run length is set.
2344 * bit offset is implicit.
2345 * start with toggle == 2 to be able to tell the first iteration */
2346 toggle = 2;
2347
 2348	/* see how many plain bits we can stuff into one packet
2349 * using RLE and VLI. */
2350 do {
2351 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2352 : _drbd_bm_find_next(mdev, c->bit_offset);
2353 if (tmp == -1UL)
2354 tmp = c->bm_bits;
2355 rl = tmp - c->bit_offset;
2356
2357 if (toggle == 2) { /* first iteration */
2358 if (rl == 0) {
2359 /* the first checked bit was set,
2360 * store start value, */
2361 DCBP_set_start(p, 1);
2362 /* but skip encoding of zero run length */
2363 toggle = !toggle;
2364 continue;
2365 }
2366 DCBP_set_start(p, 0);
2367 }
2368
2369 /* paranoia: catch zero runlength.
2370 * can only happen if bitmap is modified while we scan it. */
2371 if (rl == 0) {
2372 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2373 "t:%u bo:%lu\n", toggle, c->bit_offset);
2374 return -1;
2375 }
2376
2377 bits = vli_encode_bits(&bs, rl);
2378 if (bits == -ENOBUFS) /* buffer full */
2379 break;
2380 if (bits <= 0) {
2381 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2382 return 0;
2383 }
2384
2385 toggle = !toggle;
2386 plain_bits += rl;
2387 c->bit_offset = tmp;
2388 } while (c->bit_offset < c->bm_bits);
2389
2390 len = bs.cur.b - p->code + !!bs.cur.bit;
2391
2392 if (plain_bits < (len << 3)) {
2393 /* incompressible with this method.
2394 * we need to rewind both word and bit position. */
2395 c->bit_offset -= plain_bits;
2396 bm_xfer_ctx_bit_to_word_offset(c);
2397 c->bit_offset = c->word_offset * BITS_PER_LONG;
2398 return 0;
2399 }
2400
2401 /* RLE + VLI was able to compress it just fine.
2402 * update c->word_offset. */
2403 bm_xfer_ctx_bit_to_word_offset(c);
2404
2405 /* store pad_bits */
2406 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2407
2408 return len;
2409}
2410
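To make the run-length idea above concrete, a small userspace-only sketch (not DRBD code) that walks a toy bitmap and prints the alternating run lengths which the VLI encoder would then compress; the helper name and the toy bitmap are made up for the example:

#include <stdio.h>

/* find the first bit at or after 'from' that differs from 'val' */
static unsigned long find_next_diff(const unsigned char *bits,
				    unsigned long nbits,
				    unsigned long from, int val)
{
	unsigned long i;

	for (i = from; i < nbits; i++)
		if (((bits[i / 8] >> (i % 8)) & 1) != val)
			return i;
	return nbits;
}

int main(void)
{
	unsigned char bm[2] = { 0x38, 0x0c };	/* bits 3..5 and 10..11 set */
	unsigned long nbits = 16, pos = 0;
	int cur = 0;				/* this sketch starts counting zero bits */

	while (pos < nbits) {
		unsigned long next = find_next_diff(bm, nbits, pos, cur);

		printf("run of %d-bits, length %lu\n", cur, next - pos);
		pos = next;
		cur = !cur;
	}
	return 0;
}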
f70af118
AG
2411/**
2412 * send_bitmap_rle_or_plain
2413 *
2414 * Return 0 when done, 1 when another iteration is needed, and a negative error
2415 * code upon failure.
2416 */
2417static int
b411b363 2418send_bitmap_rle_or_plain(struct drbd_conf *mdev,
f70af118 2419 struct p_header80 *h, struct bm_xfer_ctx *c)
b411b363
PR
2420{
2421 struct p_compressed_bm *p = (void*)h;
2422 unsigned long num_words;
2423 int len;
2424 int ok;
2425
2426 len = fill_bitmap_rle_bits(mdev, p, c);
2427
2428 if (len < 0)
f70af118 2429 return -EIO;
b411b363
PR
2430
2431 if (len) {
2432 DCBP_set_code(p, RLE_VLI_Bits);
2433 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2434 sizeof(*p) + len, 0);
2435
2436 c->packets[0]++;
2437 c->bytes[0] += sizeof(*p) + len;
2438
2439 if (c->bit_offset >= c->bm_bits)
2440 len = 0; /* DONE */
2441 } else {
2442 /* was not compressible.
2443 * send a buffer full of plain text bits instead. */
2444 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2445 len = num_words * sizeof(long);
2446 if (len)
2447 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2448 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
0b70a13d 2449 h, sizeof(struct p_header80) + len, 0);
b411b363
PR
2450 c->word_offset += num_words;
2451 c->bit_offset = c->word_offset * BITS_PER_LONG;
2452
2453 c->packets[1]++;
0b70a13d 2454 c->bytes[1] += sizeof(struct p_header80) + len;
b411b363
PR
2455
2456 if (c->bit_offset > c->bm_bits)
2457 c->bit_offset = c->bm_bits;
2458 }
f70af118
AG
2459 if (ok) {
2460 if (len == 0) {
2461 INFO_bm_xfer_stats(mdev, "send", c);
2462 return 0;
2463 } else
2464 return 1;
2465 }
2466 return -EIO;
b411b363
PR
2467}
2468
2469/* See the comment at receive_bitmap() */
2470int _drbd_send_bitmap(struct drbd_conf *mdev)
2471{
2472 struct bm_xfer_ctx c;
0b70a13d 2473 struct p_header80 *p;
f70af118 2474 int err;
b411b363 2475
81e84650 2476 ERR_IF(!mdev->bitmap) return false;
b411b363
PR
2477
2478 /* maybe we should use some per thread scratch page,
2479 * and allocate that during initial device creation? */
0b70a13d 2480 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
b411b363
PR
2481 if (!p) {
2482 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
81e84650 2483 return false;
b411b363
PR
2484 }
2485
2486 if (get_ldev(mdev)) {
2487 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2488 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2489 drbd_bm_set_all(mdev);
2490 if (drbd_bm_write(mdev)) {
2491 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2492 * but otherwise process as per normal - need to tell other
2493 * side that a full resync is required! */
2494 dev_err(DEV, "Failed to write bitmap to disk!\n");
2495 } else {
2496 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2497 drbd_md_sync(mdev);
2498 }
2499 }
2500 put_ldev(mdev);
2501 }
2502
2503 c = (struct bm_xfer_ctx) {
2504 .bm_bits = drbd_bm_bits(mdev),
2505 .bm_words = drbd_bm_words(mdev),
2506 };
2507
2508 do {
f70af118
AG
2509 err = send_bitmap_rle_or_plain(mdev, p, &c);
2510 } while (err > 0);
b411b363
PR
2511
2512 free_page((unsigned long) p);
f70af118 2513 return err == 0;
b411b363
PR
2514}
2515
2516int drbd_send_bitmap(struct drbd_conf *mdev)
2517{
2518 int err;
2519
2520 if (!drbd_get_data_sock(mdev))
2521 return -1;
2522 err = !_drbd_send_bitmap(mdev);
2523 drbd_put_data_sock(mdev);
2524 return err;
2525}
2526
2527int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2528{
2529 int ok;
2530 struct p_barrier_ack p;
2531
2532 p.barrier = barrier_nr;
2533 p.set_size = cpu_to_be32(set_size);
2534
2535 if (mdev->state.conn < C_CONNECTED)
81e84650 2536 return false;
b411b363 2537 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
0b70a13d 2538 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2539 return ok;
2540}
2541
2542/**
2543 * _drbd_send_ack() - Sends an ack packet
2544 * @mdev: DRBD device.
2545 * @cmd: Packet command code.
2546 * @sector: sector, needs to be in big endian byte order
2547 * @blksize: size in byte, needs to be in big endian byte order
2548 * @block_id: Id, big endian byte order
2549 */
2550static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2551 u64 sector,
2552 u32 blksize,
2553 u64 block_id)
2554{
2555 int ok;
2556 struct p_block_ack p;
2557
2558 p.sector = sector;
2559 p.block_id = block_id;
2560 p.blksize = blksize;
2561 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2562
2563 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
81e84650 2564 return false;
b411b363 2565 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
0b70a13d 2566 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2567 return ok;
2568}
2569
2b2bf214
LE
2570/* dp->sector and dp->block_id already/still in network byte order,
2571 * data_size is payload size according to dp->head,
2572 * and may need to be corrected for digest size. */
b411b363 2573int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2b2bf214 2574 struct p_data *dp, int data_size)
b411b363 2575{
2b2bf214
LE
2576 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2577 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
b411b363
PR
2578 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2579 dp->block_id);
2580}
2581
2582int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2583 struct p_block_req *rp)
2584{
2585 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2586}
2587
2588/**
2589 * drbd_send_ack() - Sends an ack packet
2590 * @mdev: DRBD device.
2591 * @cmd: Packet command code.
2592 * @e: Epoch entry.
2593 */
2594int drbd_send_ack(struct drbd_conf *mdev,
2595 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2596{
2597 return _drbd_send_ack(mdev, cmd,
2598 cpu_to_be64(e->sector),
2599 cpu_to_be32(e->size),
2600 e->block_id);
2601}
2602
2603/* This function misuses the block_id field to signal if the blocks
 2604	 * are in sync or not. */
2605int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2606 sector_t sector, int blksize, u64 block_id)
2607{
2608 return _drbd_send_ack(mdev, cmd,
2609 cpu_to_be64(sector),
2610 cpu_to_be32(blksize),
2611 cpu_to_be64(block_id));
2612}
2613
2614int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2615 sector_t sector, int size, u64 block_id)
2616{
2617 int ok;
2618 struct p_block_req p;
2619
2620 p.sector = cpu_to_be64(sector);
2621 p.block_id = block_id;
2622 p.blksize = cpu_to_be32(size);
2623
2624 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
0b70a13d 2625 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2626 return ok;
2627}
2628
2629int drbd_send_drequest_csum(struct drbd_conf *mdev,
2630 sector_t sector, int size,
2631 void *digest, int digest_size,
2632 enum drbd_packets cmd)
2633{
2634 int ok;
2635 struct p_block_req p;
2636
2637 p.sector = cpu_to_be64(sector);
2638 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2639 p.blksize = cpu_to_be32(size);
2640
2641 p.head.magic = BE_DRBD_MAGIC;
2642 p.head.command = cpu_to_be16(cmd);
0b70a13d 2643 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
b411b363
PR
2644
2645 mutex_lock(&mdev->data.mutex);
2646
2647 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2648 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2649
2650 mutex_unlock(&mdev->data.mutex);
2651
2652 return ok;
2653}
2654
2655int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2656{
2657 int ok;
2658 struct p_block_req p;
2659
2660 p.sector = cpu_to_be64(sector);
2661 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2662 p.blksize = cpu_to_be32(size);
2663
2664 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
0b70a13d 2665 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2666 return ok;
2667}
2668
2669/* called on sndtimeo
81e84650
AG
2670 * returns false if we should retry,
2671 * true if we think connection is dead
b411b363
PR
2672 */
2673static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2674{
2675 int drop_it;
2676 /* long elapsed = (long)(jiffies - mdev->last_received); */
2677
2678 drop_it = mdev->meta.socket == sock
2679 || !mdev->asender.task
2680 || get_t_state(&mdev->asender) != Running
2681 || mdev->state.conn < C_CONNECTED;
2682
2683 if (drop_it)
81e84650 2684 return true;
b411b363
PR
2685
2686 drop_it = !--mdev->ko_count;
2687 if (!drop_it) {
2688 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2689 current->comm, current->pid, mdev->ko_count);
2690 request_ping(mdev);
2691 }
2692
2693 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2694}
2695
2696/* The idea of sendpage seems to be to put some kind of reference
2697 * to the page into the skb, and to hand it over to the NIC. In
2698 * this process get_page() gets called.
2699 *
2700 * As soon as the page was really sent over the network put_page()
2701 * gets called by some part of the network layer. [ NIC driver? ]
2702 *
2703 * [ get_page() / put_page() increment/decrement the count. If count
2704 * reaches 0 the page will be freed. ]
2705 *
2706 * This works nicely with pages from FSs.
2707 * But this means that in protocol A we might signal IO completion too early!
2708 *
2709 * In order not to corrupt data during a resync we must make sure
 2710	 * that we do not reuse our own buffer pages (EEs) too early, therefore
2711 * we have the net_ee list.
2712 *
2713 * XFS seems to have problems, still, it submits pages with page_count == 0!
2714 * As a workaround, we disable sendpage on pages
2715 * with page_count == 0 or PageSlab.
2716 */
2717static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2718 int offset, size_t size, unsigned msg_flags)
b411b363 2719{
ba11ad9a 2720 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
b411b363
PR
2721 kunmap(page);
2722 if (sent == size)
2723 mdev->send_cnt += size>>9;
2724 return sent == size;
2725}
2726
2727static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2728 int offset, size_t size, unsigned msg_flags)
b411b363
PR
2729{
2730 mm_segment_t oldfs = get_fs();
2731 int sent, ok;
2732 int len = size;
2733
2734 /* e.g. XFS meta- & log-data is in slab pages, which have a
2735 * page_count of 0 and/or have PageSlab() set.
2736 * we cannot use send_page for those, as that does get_page();
2737 * put_page(); and would cause either a VM_BUG directly, or
2738 * __page_cache_release a page that would actually still be referenced
2739 * by someone, leading to some obscure delayed Oops somewhere else. */
2740 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
ba11ad9a 2741 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
b411b363 2742
ba11ad9a 2743 msg_flags |= MSG_NOSIGNAL;
b411b363
PR
2744 drbd_update_congested(mdev);
2745 set_fs(KERNEL_DS);
2746 do {
2747 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2748 offset, len,
ba11ad9a 2749 msg_flags);
b411b363
PR
2750 if (sent == -EAGAIN) {
2751 if (we_should_drop_the_connection(mdev,
2752 mdev->data.socket))
2753 break;
2754 else
2755 continue;
2756 }
2757 if (sent <= 0) {
2758 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2759 __func__, (int)size, len, sent);
2760 break;
2761 }
2762 len -= sent;
2763 offset += sent;
2764 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2765 set_fs(oldfs);
2766 clear_bit(NET_CONGESTED, &mdev->flags);
2767
2768 ok = (len == 0);
2769 if (likely(ok))
2770 mdev->send_cnt += size>>9;
2771 return ok;
2772}
2773
2774static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2775{
2776 struct bio_vec *bvec;
2777 int i;
ba11ad9a 2778 /* hint all but last page with MSG_MORE */
001a8868 2779 bio_for_each_segment(bvec, bio, i) {
b411b363 2780 if (!_drbd_no_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
2781 bvec->bv_offset, bvec->bv_len,
2782 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
2783 return 0;
2784 }
2785 return 1;
2786}
2787
2788static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2789{
2790 struct bio_vec *bvec;
2791 int i;
ba11ad9a 2792 /* hint all but last page with MSG_MORE */
001a8868 2793 bio_for_each_segment(bvec, bio, i) {
b411b363 2794 if (!_drbd_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
2795 bvec->bv_offset, bvec->bv_len,
2796 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
2797 return 0;
2798 }
b411b363
PR
2799 return 1;
2800}
2801
45bb912b
LE
2802static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2803{
2804 struct page *page = e->pages;
2805 unsigned len = e->size;
ba11ad9a 2806 /* hint all but last page with MSG_MORE */
45bb912b
LE
2807 page_chain_for_each(page) {
2808 unsigned l = min_t(unsigned, len, PAGE_SIZE);
ba11ad9a
LE
2809 if (!_drbd_send_page(mdev, page, 0, l,
2810 page_chain_next(page) ? MSG_MORE : 0))
45bb912b
LE
2811 return 0;
2812 len -= l;
2813 }
2814 return 1;
2815}
2816
76d2e7ec
PR
2817static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2818{
2819 if (mdev->agreed_pro_version >= 95)
2820 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
76d2e7ec
PR
2821 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2822 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2823 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2824 else
721a9602 2825 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
76d2e7ec
PR
2826}
2827
b411b363
PR
2828/* Used to send write requests
2829 * R_PRIMARY -> Peer (P_DATA)
2830 */
2831int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2832{
2833 int ok = 1;
2834 struct p_data p;
2835 unsigned int dp_flags = 0;
2836 void *dgb;
2837 int dgs;
2838
2839 if (!drbd_get_data_sock(mdev))
2840 return 0;
2841
2842 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2843 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2844
d5373389 2845 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
0b70a13d
PR
2846 p.head.h80.magic = BE_DRBD_MAGIC;
2847 p.head.h80.command = cpu_to_be16(P_DATA);
2848 p.head.h80.length =
2849 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2850 } else {
2851 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2852 p.head.h95.command = cpu_to_be16(P_DATA);
2853 p.head.h95.length =
2854 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2855 }
b411b363
PR
2856
2857 p.sector = cpu_to_be64(req->sector);
2858 p.block_id = (unsigned long)req;
671a74e7 2859 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
b411b363 2860
76d2e7ec
PR
2861 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2862
b411b363
PR
2863 if (mdev->state.conn >= C_SYNC_SOURCE &&
2864 mdev->state.conn <= C_PAUSED_SYNC_T)
2865 dp_flags |= DP_MAY_SET_IN_SYNC;
2866
2867 p.dp_flags = cpu_to_be32(dp_flags);
b411b363
PR
2868 set_bit(UNPLUG_REMOTE, &mdev->flags);
2869 ok = (sizeof(p) ==
ba11ad9a 2870 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
b411b363
PR
2871 if (ok && dgs) {
2872 dgb = mdev->int_dig_out;
45bb912b 2873 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
cab2f74b 2874 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
b411b363
PR
2875 }
2876 if (ok) {
470be44a
LE
2877 /* For protocol A, we have to memcpy the payload into
2878 * socket buffers, as we may complete right away
2879 * as soon as we handed it over to tcp, at which point the data
2880 * pages may become invalid.
2881 *
2882 * For data-integrity enabled, we copy it as well, so we can be
2883 * sure that even if the bio pages may still be modified, it
2884 * won't change the data on the wire, thus if the digest checks
2885 * out ok after sending on this side, but does not fit on the
2886 * receiving side, we sure have detected corruption elsewhere.
2887 */
2888 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
b411b363
PR
2889 ok = _drbd_send_bio(mdev, req->master_bio);
2890 else
2891 ok = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
2892
2893 /* double check digest, sometimes buffers have been modified in flight. */
2894 if (dgs > 0 && dgs <= 64) {
24c4830c 2895 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
2896 * currently supported in kernel crypto. */
2897 unsigned char digest[64];
2898 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2899 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2900 dev_warn(DEV,
2901 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2902 (unsigned long long)req->sector, req->size);
2903 }
2904 } /* else if (dgs > 64) {
2905 ... Be noisy about digest too large ...
2906 } */
b411b363
PR
2907 }
2908
2909 drbd_put_data_sock(mdev);
bd26bfc5 2910
b411b363
PR
2911 return ok;
2912}
2913
2914/* answer packet, used to send data back for read requests:
2915 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2916 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2917 */
2918int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2919 struct drbd_epoch_entry *e)
2920{
2921 int ok;
2922 struct p_data p;
2923 void *dgb;
2924 int dgs;
2925
2926 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2927 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2928
d5373389 2929 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
0b70a13d
PR
2930 p.head.h80.magic = BE_DRBD_MAGIC;
2931 p.head.h80.command = cpu_to_be16(cmd);
2932 p.head.h80.length =
2933 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2934 } else {
2935 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2936 p.head.h95.command = cpu_to_be16(cmd);
2937 p.head.h95.length =
2938 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2939 }
b411b363
PR
2940
2941 p.sector = cpu_to_be64(e->sector);
2942 p.block_id = e->block_id;
2943 /* p.seq_num = 0; No sequence numbers here.. */
2944
2945 /* Only called by our kernel thread.
2946 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2947 * in response to admin command or module unload.
2948 */
2949 if (!drbd_get_data_sock(mdev))
2950 return 0;
2951
0b70a13d 2952 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
b411b363
PR
2953 if (ok && dgs) {
2954 dgb = mdev->int_dig_out;
45bb912b 2955 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
cab2f74b 2956 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
b411b363
PR
2957 }
2958 if (ok)
45bb912b 2959 ok = _drbd_send_zc_ee(mdev, e);
b411b363
PR
2960
2961 drbd_put_data_sock(mdev);
bd26bfc5 2962
b411b363
PR
2963 return ok;
2964}
2965
73a01a18
PR
2966int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2967{
2968 struct p_block_desc p;
2969
2970 p.sector = cpu_to_be64(req->sector);
2971 p.blksize = cpu_to_be32(req->size);
2972
2973 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2974}
2975
b411b363
PR
2976/*
2977 drbd_send distinguishes two cases:
2978
2979 Packets sent via the data socket "sock"
2980 and packets sent via the meta data socket "msock"
2981
2982 sock msock
2983 -----------------+-------------------------+------------------------------
2984 timeout conf.timeout / 2 conf.timeout / 2
2985 timeout action send a ping via msock Abort communication
2986 and close all sockets
2987*/
2988
2989/*
2990 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2991 */
2992int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2993 void *buf, size_t size, unsigned msg_flags)
2994{
2995 struct kvec iov;
2996 struct msghdr msg;
2997 int rv, sent = 0;
2998
2999 if (!sock)
3000 return -1000;
3001
3002 /* THINK if (signal_pending) return ... ? */
3003
3004 iov.iov_base = buf;
3005 iov.iov_len = size;
3006
3007 msg.msg_name = NULL;
3008 msg.msg_namelen = 0;
3009 msg.msg_control = NULL;
3010 msg.msg_controllen = 0;
3011 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
3012
3013 if (sock == mdev->data.socket) {
3014 mdev->ko_count = mdev->net_conf->ko_count;
3015 drbd_update_congested(mdev);
3016 }
3017 do {
3018 /* STRANGE
3019 * tcp_sendmsg does _not_ use its size parameter at all ?
3020 *
3021 * -EAGAIN on timeout, -EINTR on signal.
3022 */
3023/* THINK
3024 * do we need to block DRBD_SIG if sock == &meta.socket ??
3025 * otherwise wake_asender() might interrupt some send_*Ack !
3026 */
3027 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
3028 if (rv == -EAGAIN) {
3029 if (we_should_drop_the_connection(mdev, sock))
3030 break;
3031 else
3032 continue;
3033 }
3034 D_ASSERT(rv != 0);
3035 if (rv == -EINTR) {
3036 flush_signals(current);
3037 rv = 0;
3038 }
3039 if (rv < 0)
3040 break;
3041 sent += rv;
3042 iov.iov_base += rv;
3043 iov.iov_len -= rv;
3044 } while (sent < size);
3045
3046 if (sock == mdev->data.socket)
3047 clear_bit(NET_CONGESTED, &mdev->flags);
3048
3049 if (rv <= 0) {
3050 if (rv != -EAGAIN) {
3051 dev_err(DEV, "%s_sendmsg returned %d\n",
3052 sock == mdev->meta.socket ? "msock" : "sock",
3053 rv);
3054 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
3055 } else
3056 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
3057 }
3058
3059 return sent;
3060}
3061
3062static int drbd_open(struct block_device *bdev, fmode_t mode)
3063{
3064 struct drbd_conf *mdev = bdev->bd_disk->private_data;
3065 unsigned long flags;
3066 int rv = 0;
3067
2a48fc0a 3068 mutex_lock(&drbd_main_mutex);
b411b363
PR
3069 spin_lock_irqsave(&mdev->req_lock, flags);
3070 /* to have a stable mdev->state.role
3071 * and no race with updating open_cnt */
3072
3073 if (mdev->state.role != R_PRIMARY) {
3074 if (mode & FMODE_WRITE)
3075 rv = -EROFS;
3076 else if (!allow_oos)
3077 rv = -EMEDIUMTYPE;
3078 }
3079
3080 if (!rv)
3081 mdev->open_cnt++;
3082 spin_unlock_irqrestore(&mdev->req_lock, flags);
2a48fc0a 3083 mutex_unlock(&drbd_main_mutex);
b411b363
PR
3084
3085 return rv;
3086}
3087
3088static int drbd_release(struct gendisk *gd, fmode_t mode)
3089{
3090 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 3091 mutex_lock(&drbd_main_mutex);
b411b363 3092 mdev->open_cnt--;
2a48fc0a 3093 mutex_unlock(&drbd_main_mutex);
b411b363
PR
3094 return 0;
3095}
3096
b411b363
PR
3097static void drbd_set_defaults(struct drbd_conf *mdev)
3098{
85f4cc17
PR
3099 /* This way we get a compile error when sync_conf grows,
 3100	   and we forget to initialize it here */
3101 mdev->sync_conf = (struct syncer_conf) {
3102 /* .rate = */ DRBD_RATE_DEF,
3103 /* .after = */ DRBD_AFTER_DEF,
3104 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
85f4cc17
PR
3105 /* .verify_alg = */ {}, 0,
3106 /* .cpu_mask = */ {}, 0,
3107 /* .csums_alg = */ {}, 0,
e756414f 3108 /* .use_rle = */ 0,
9a31d716
PR
3109 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
3110 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
3111 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
3112 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
0f0601f4
LE
3113 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3114 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
85f4cc17
PR
3115 };
3116
 3117	/* Have to do it this way, because the layout differs between
3118 big endian and little endian */
b411b363
PR
3119 mdev->state = (union drbd_state) {
3120 { .role = R_SECONDARY,
3121 .peer = R_UNKNOWN,
3122 .conn = C_STANDALONE,
3123 .disk = D_DISKLESS,
3124 .pdsk = D_UNKNOWN,
fb22c402
PR
3125 .susp = 0,
3126 .susp_nod = 0,
3127 .susp_fen = 0
b411b363
PR
3128 } };
3129}
3130
3131void drbd_init_set_defaults(struct drbd_conf *mdev)
3132{
3133 /* the memset(,0,) did most of this.
3134 * note: only assignments, no allocation in here */
3135
3136 drbd_set_defaults(mdev);
3137
b411b363
PR
3138 atomic_set(&mdev->ap_bio_cnt, 0);
3139 atomic_set(&mdev->ap_pending_cnt, 0);
3140 atomic_set(&mdev->rs_pending_cnt, 0);
3141 atomic_set(&mdev->unacked_cnt, 0);
3142 atomic_set(&mdev->local_cnt, 0);
3143 atomic_set(&mdev->net_cnt, 0);
3144 atomic_set(&mdev->packet_seq, 0);
3145 atomic_set(&mdev->pp_in_use, 0);
435f0740 3146 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 3147 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 3148 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 3149 atomic_set(&mdev->ap_in_flight, 0);
e1711731 3150 atomic_set(&mdev->md_io_in_use, 0);
b411b363 3151
b411b363
PR
3152 mutex_init(&mdev->data.mutex);
3153 mutex_init(&mdev->meta.mutex);
3154 sema_init(&mdev->data.work.s, 0);
3155 sema_init(&mdev->meta.work.s, 0);
3156 mutex_init(&mdev->state_mutex);
3157
3158 spin_lock_init(&mdev->data.work.q_lock);
3159 spin_lock_init(&mdev->meta.work.q_lock);
3160
3161 spin_lock_init(&mdev->al_lock);
3162 spin_lock_init(&mdev->req_lock);
3163 spin_lock_init(&mdev->peer_seq_lock);
3164 spin_lock_init(&mdev->epoch_lock);
3165
3166 INIT_LIST_HEAD(&mdev->active_ee);
3167 INIT_LIST_HEAD(&mdev->sync_ee);
3168 INIT_LIST_HEAD(&mdev->done_ee);
3169 INIT_LIST_HEAD(&mdev->read_ee);
3170 INIT_LIST_HEAD(&mdev->net_ee);
3171 INIT_LIST_HEAD(&mdev->resync_reads);
3172 INIT_LIST_HEAD(&mdev->data.work.q);
3173 INIT_LIST_HEAD(&mdev->meta.work.q);
3174 INIT_LIST_HEAD(&mdev->resync_work.list);
3175 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 3176 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 3177 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 3178 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 3179 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 3180
794abb75 3181 mdev->resync_work.cb = w_resync_timer;
b411b363 3182 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 3183 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
3184 mdev->md_sync_work.cb = w_md_sync;
3185 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 3186 mdev->start_resync_work.cb = w_start_resync;
b411b363
PR
3187 init_timer(&mdev->resync_timer);
3188 init_timer(&mdev->md_sync_timer);
370a43e7 3189 init_timer(&mdev->start_resync_timer);
7fde2be9 3190 init_timer(&mdev->request_timer);
b411b363
PR
3191 mdev->resync_timer.function = resync_timer_fn;
3192 mdev->resync_timer.data = (unsigned long) mdev;
3193 mdev->md_sync_timer.function = md_sync_timer_fn;
3194 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
3195 mdev->start_resync_timer.function = start_resync_timer_fn;
3196 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
3197 mdev->request_timer.function = request_timer_fn;
3198 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
3199
3200 init_waitqueue_head(&mdev->misc_wait);
3201 init_waitqueue_head(&mdev->state_wait);
84dfb9f5 3202 init_waitqueue_head(&mdev->net_cnt_wait);
b411b363
PR
3203 init_waitqueue_head(&mdev->ee_wait);
3204 init_waitqueue_head(&mdev->al_wait);
3205 init_waitqueue_head(&mdev->seq_wait);
3206
3207 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3208 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3209 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3210
3211 mdev->agreed_pro_version = PRO_VERSION_MAX;
2451fc3b 3212 mdev->write_ordering = WO_bdev_flush;
b411b363 3213 mdev->resync_wenr = LC_FREE;
99432fcc
PR
3214 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3215 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
3216}
3217
3218void drbd_mdev_cleanup(struct drbd_conf *mdev)
3219{
1d7734a0 3220 int i;
b411b363
PR
3221 if (mdev->receiver.t_state != None)
3222 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3223 mdev->receiver.t_state);
3224
3225 /* no need to lock it, I'm the only thread alive */
3226 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3227 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3228 mdev->al_writ_cnt =
3229 mdev->bm_writ_cnt =
3230 mdev->read_cnt =
3231 mdev->recv_cnt =
3232 mdev->send_cnt =
3233 mdev->writ_cnt =
3234 mdev->p_size =
3235 mdev->rs_start =
3236 mdev->rs_total =
1d7734a0
LE
3237 mdev->rs_failed = 0;
3238 mdev->rs_last_events = 0;
0f0601f4 3239 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
3240 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3241 mdev->rs_mark_left[i] = 0;
3242 mdev->rs_mark_time[i] = 0;
3243 }
b411b363
PR
3244 D_ASSERT(mdev->net_conf == NULL);
3245
3246 drbd_set_my_capacity(mdev, 0);
3247 if (mdev->bitmap) {
3248 /* maybe never allocated. */
02d9a94b 3249 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
3250 drbd_bm_cleanup(mdev);
3251 }
3252
3253 drbd_free_resources(mdev);
0778286a 3254 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363
PR
3255
3256 /*
3257 * currently we drbd_init_ee only on module load, so
3258 * we may do drbd_release_ee only on module unload!
3259 */
3260 D_ASSERT(list_empty(&mdev->active_ee));
3261 D_ASSERT(list_empty(&mdev->sync_ee));
3262 D_ASSERT(list_empty(&mdev->done_ee));
3263 D_ASSERT(list_empty(&mdev->read_ee));
3264 D_ASSERT(list_empty(&mdev->net_ee));
3265 D_ASSERT(list_empty(&mdev->resync_reads));
3266 D_ASSERT(list_empty(&mdev->data.work.q));
3267 D_ASSERT(list_empty(&mdev->meta.work.q));
3268 D_ASSERT(list_empty(&mdev->resync_work.list));
3269 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 3270 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
3271
3272 drbd_set_defaults(mdev);
b411b363
PR
3273}
3274
3275
3276static void drbd_destroy_mempools(void)
3277{
3278 struct page *page;
3279
3280 while (drbd_pp_pool) {
3281 page = drbd_pp_pool;
3282 drbd_pp_pool = (struct page *)page_private(page);
3283 __free_page(page);
3284 drbd_pp_vacant--;
3285 }
3286
3287 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3288
9476f39d
LE
3289 if (drbd_md_io_bio_set)
3290 bioset_free(drbd_md_io_bio_set);
4281808f
LE
3291 if (drbd_md_io_page_pool)
3292 mempool_destroy(drbd_md_io_page_pool);
b411b363
PR
3293 if (drbd_ee_mempool)
3294 mempool_destroy(drbd_ee_mempool);
3295 if (drbd_request_mempool)
3296 mempool_destroy(drbd_request_mempool);
3297 if (drbd_ee_cache)
3298 kmem_cache_destroy(drbd_ee_cache);
3299 if (drbd_request_cache)
3300 kmem_cache_destroy(drbd_request_cache);
3301 if (drbd_bm_ext_cache)
3302 kmem_cache_destroy(drbd_bm_ext_cache);
3303 if (drbd_al_ext_cache)
3304 kmem_cache_destroy(drbd_al_ext_cache);
3305
9476f39d 3306 drbd_md_io_bio_set = NULL;
4281808f 3307 drbd_md_io_page_pool = NULL;
b411b363
PR
3308 drbd_ee_mempool = NULL;
3309 drbd_request_mempool = NULL;
3310 drbd_ee_cache = NULL;
3311 drbd_request_cache = NULL;
3312 drbd_bm_ext_cache = NULL;
3313 drbd_al_ext_cache = NULL;
3314
3315 return;
3316}
3317
3318static int drbd_create_mempools(void)
3319{
3320 struct page *page;
1816a2b4 3321 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
3322 int i;
3323
3324 /* prepare our caches and mempools */
3325 drbd_request_mempool = NULL;
3326 drbd_ee_cache = NULL;
3327 drbd_request_cache = NULL;
3328 drbd_bm_ext_cache = NULL;
3329 drbd_al_ext_cache = NULL;
3330 drbd_pp_pool = NULL;
4281808f 3331 drbd_md_io_page_pool = NULL;
9476f39d 3332 drbd_md_io_bio_set = NULL;
b411b363
PR
3333
3334 /* caches */
3335 drbd_request_cache = kmem_cache_create(
3336 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3337 if (drbd_request_cache == NULL)
3338 goto Enomem;
3339
3340 drbd_ee_cache = kmem_cache_create(
3341 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3342 if (drbd_ee_cache == NULL)
3343 goto Enomem;
3344
3345 drbd_bm_ext_cache = kmem_cache_create(
3346 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3347 if (drbd_bm_ext_cache == NULL)
3348 goto Enomem;
3349
3350 drbd_al_ext_cache = kmem_cache_create(
3351 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3352 if (drbd_al_ext_cache == NULL)
3353 goto Enomem;
3354
3355 /* mempools */
9476f39d
LE
3356#ifdef COMPAT_HAVE_BIOSET_CREATE
3357 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
3358 if (drbd_md_io_bio_set == NULL)
3359 goto Enomem;
3360#endif
3361
4281808f
LE
3362 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
3363 if (drbd_md_io_page_pool == NULL)
3364 goto Enomem;
3365
b411b363
PR
3366 drbd_request_mempool = mempool_create(number,
3367 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3368 if (drbd_request_mempool == NULL)
3369 goto Enomem;
3370
3371 drbd_ee_mempool = mempool_create(number,
3372 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 3373 if (drbd_ee_mempool == NULL)
b411b363
PR
3374 goto Enomem;
3375
3376 /* drbd's page pool */
3377 spin_lock_init(&drbd_pp_lock);
3378
3379 for (i = 0; i < number; i++) {
3380 page = alloc_page(GFP_HIGHUSER);
3381 if (!page)
3382 goto Enomem;
3383 set_page_private(page, (unsigned long)drbd_pp_pool);
3384 drbd_pp_pool = page;
3385 }
3386 drbd_pp_vacant = number;
3387
3388 return 0;
3389
3390Enomem:
3391 drbd_destroy_mempools(); /* in case we allocated some */
3392 return -ENOMEM;
3393}
3394
3395static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3396 void *unused)
3397{
3398 /* just so we have it. you never know what interesting things we
3399 * might want to do here some day...
3400 */
3401
3402 return NOTIFY_DONE;
3403}
3404
3405static struct notifier_block drbd_notifier = {
3406 .notifier_call = drbd_notify_sys,
3407};
3408
3409static void drbd_release_ee_lists(struct drbd_conf *mdev)
3410{
3411 int rr;
3412
3413 rr = drbd_release_ee(mdev, &mdev->active_ee);
3414 if (rr)
3415 dev_err(DEV, "%d EEs in active list found!\n", rr);
3416
3417 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3418 if (rr)
3419 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3420
3421 rr = drbd_release_ee(mdev, &mdev->read_ee);
3422 if (rr)
3423 dev_err(DEV, "%d EEs in read list found!\n", rr);
3424
3425 rr = drbd_release_ee(mdev, &mdev->done_ee);
3426 if (rr)
3427 dev_err(DEV, "%d EEs in done list found!\n", rr);
3428
3429 rr = drbd_release_ee(mdev, &mdev->net_ee);
3430 if (rr)
3431 dev_err(DEV, "%d EEs in net list found!\n", rr);
3432}
3433
3434/* caution. no locking.
3435 * currently only used from module cleanup code. */
3436static void drbd_delete_device(unsigned int minor)
3437{
3438 struct drbd_conf *mdev = minor_to_mdev(minor);
3439
3440 if (!mdev)
3441 return;
3442
dfa8bedb
PR
3443 del_timer_sync(&mdev->request_timer);
3444
b411b363
PR
3445 /* paranoia asserts */
3446 if (mdev->open_cnt != 0)
3447 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3448 __FILE__ , __LINE__);
3449
3450 ERR_IF (!list_empty(&mdev->data.work.q)) {
3451 struct list_head *lp;
3452 list_for_each(lp, &mdev->data.work.q) {
3453 dev_err(DEV, "lp = %p\n", lp);
3454 }
3455 };
3456 /* end paranoia asserts */
3457
3458 del_gendisk(mdev->vdisk);
3459
3460 /* cleanup stuff that may have been allocated during
3461 * device (re-)configuration or state changes */
3462
3463 if (mdev->this_bdev)
3464 bdput(mdev->this_bdev);
3465
3466 drbd_free_resources(mdev);
3467
3468 drbd_release_ee_lists(mdev);
3469
24c4830c 3470 /* should be freed on disconnect? */
b411b363
PR
3471 kfree(mdev->ee_hash);
3472 /*
3473 mdev->ee_hash_s = 0;
3474 mdev->ee_hash = NULL;
3475 */
3476
3477 lc_destroy(mdev->act_log);
3478 lc_destroy(mdev->resync);
3479
3480 kfree(mdev->p_uuid);
3481 /* mdev->p_uuid = NULL; */
3482
3483 kfree(mdev->int_dig_out);
3484 kfree(mdev->int_dig_in);
3485 kfree(mdev->int_dig_vv);
3486
3487 /* cleanup the rest that has been
3488 * allocated from drbd_new_device
3489 * and actually free the mdev itself */
3490 drbd_free_mdev(mdev);
3491}
3492
3493static void drbd_cleanup(void)
3494{
3495 unsigned int i;
3496
3497 unregister_reboot_notifier(&drbd_notifier);
3498
17a93f30
LE
3499 /* first remove proc,
 3500 * drbdsetup uses its presence to detect
 3501 * whether DRBD is loaded.
 3502 * If we got stuck in proc removal
 3503 * while netlink was already deregistered,
 3504 * some drbdsetup commands might wait forever
 3505 * for an answer.
3506 */
3507 if (drbd_proc)
3508 remove_proc_entry("drbd", NULL);
3509
b411b363
PR
3510 drbd_nl_cleanup();
3511
3512 if (minor_table) {
b411b363
PR
3513 i = minor_count;
3514 while (i--)
3515 drbd_delete_device(i);
3516 drbd_destroy_mempools();
3517 }
3518
3519 kfree(minor_table);
3520
3521 unregister_blkdev(DRBD_MAJOR, "drbd");
3522
3523 printk(KERN_INFO "drbd: module cleanup done.\n");
3524}
3525
3526/**
d97482ed 3527 * drbd_congested() - Callback for the flusher thread
b411b363 3528 * @congested_data: User data
d97482ed 3529 * @bdi_bits: Bits the BDI flusher thread is currently interested in
b411b363
PR
3530 *
3531 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3532 */
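/* The single-character "reason" recorded below summarizes why congestion is
 * reported: 'd' = DRBD has frozen IO, 'c' = waiting for a usermode helper
 * callback, 'b' = backing device congested, 'n' = network send path
 * congested, 'a' = backing device and network both congested, '-' = not
 * congested. */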
3533static int drbd_congested(void *congested_data, int bdi_bits)
3534{
3535 struct drbd_conf *mdev = congested_data;
3536 struct request_queue *q;
3537 char reason = '-';
3538 int r = 0;
3539
1b881ef7 3540 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
3541 /* DRBD has frozen IO */
3542 r = bdi_bits;
3543 reason = 'd';
3544 goto out;
3545 }
3546
c2ba686f
LE
3547 if (test_bit(CALLBACK_PENDING, &mdev->flags)) {
3548 r |= (1 << BDI_async_congested);
3549 /* Without good local data, we would need to read from remote,
3550 * and that would need the worker thread as well, which is
3551 * currently blocked waiting for that usermode helper to
3552 * finish.
3553 */
3554 if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
3555 r |= (1 << BDI_sync_congested);
3556 else
3557 put_ldev(mdev);
3558 r &= bdi_bits;
3559 reason = 'c';
3560 goto out;
3561 }
3562
b411b363
PR
3563 if (get_ldev(mdev)) {
3564 q = bdev_get_queue(mdev->ldev->backing_bdev);
3565 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3566 put_ldev(mdev);
3567 if (r)
3568 reason = 'b';
3569 }
3570
3571 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3572 r |= (1 << BDI_async_congested);
3573 reason = reason == 'b' ? 'a' : 'n';
3574 }
3575
3576out:
3577 mdev->congestion_reason = reason;
3578 return r;
3579}
3580
3581struct drbd_conf *drbd_new_device(unsigned int minor)
3582{
3583 struct drbd_conf *mdev;
3584 struct gendisk *disk;
3585 struct request_queue *q;
3586
3587 /* GFP_KERNEL, we are outside of all write-out paths */
3588 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3589 if (!mdev)
3590 return NULL;
3591 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3592 goto out_no_cpumask;
3593
3594 mdev->minor = minor;
3595
3596 drbd_init_set_defaults(mdev);
3597
3598 q = blk_alloc_queue(GFP_KERNEL);
3599 if (!q)
3600 goto out_no_q;
3601 mdev->rq_queue = q;
3602 q->queuedata = mdev;
b411b363
PR
3603
3604 disk = alloc_disk(1);
3605 if (!disk)
3606 goto out_no_disk;
3607 mdev->vdisk = disk;
3608
81e84650 3609 set_disk_ro(disk, true);
b411b363
PR
3610
3611 disk->queue = q;
3612 disk->major = DRBD_MAJOR;
3613 disk->first_minor = minor;
3614 disk->fops = &drbd_ops;
3615 sprintf(disk->disk_name, "drbd%d", minor);
3616 disk->private_data = mdev;
3617
3618 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3619 /* we have no partitions. we contain only ourselves. */
3620 mdev->this_bdev->bd_contains = mdev->this_bdev;
3621
3622 q->backing_dev_info.congested_fn = drbd_congested;
3623 q->backing_dev_info.congested_data = mdev;
3624
2f58dcfc 3625 blk_queue_make_request(q, drbd_make_request);
a73ff323 3626 blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
99432fcc
PR
 3627 /* Setting max_hw_sectors to an odd value of 8 KiB here
 3628 triggers a max_bio_size message upon first attach or connect. */
3629 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
b411b363
PR
3630 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3631 blk_queue_merge_bvec(q, drbd_merge_bvec);
7eaceacc 3632 q->queue_lock = &mdev->req_lock;
b411b363
PR
3633
3634 mdev->md_io_page = alloc_page(GFP_KERNEL);
3635 if (!mdev->md_io_page)
3636 goto out_no_io_page;
3637
3638 if (drbd_bm_init(mdev))
3639 goto out_no_bitmap;
3640 /* no need to lock access, we are still initializing this minor device. */
3641 if (!tl_init(mdev))
3642 goto out_no_tl;
3643
3644 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3645 if (!mdev->app_reads_hash)
3646 goto out_no_app_reads;
3647
3648 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3649 if (!mdev->current_epoch)
3650 goto out_no_epoch;
3651
3652 INIT_LIST_HEAD(&mdev->current_epoch->list);
3653 mdev->epochs = 1;
3654
3655 return mdev;
3656
3657/* out_whatever_else:
3658 kfree(mdev->current_epoch); */
3659out_no_epoch:
3660 kfree(mdev->app_reads_hash);
3661out_no_app_reads:
3662 tl_cleanup(mdev);
3663out_no_tl:
3664 drbd_bm_cleanup(mdev);
3665out_no_bitmap:
3666 __free_page(mdev->md_io_page);
3667out_no_io_page:
3668 put_disk(disk);
3669out_no_disk:
3670 blk_cleanup_queue(q);
3671out_no_q:
3672 free_cpumask_var(mdev->cpu_mask);
3673out_no_cpumask:
3674 kfree(mdev);
3675 return NULL;
3676}
3677
3678/* counterpart of drbd_new_device.
3679 * last part of drbd_delete_device. */
3680void drbd_free_mdev(struct drbd_conf *mdev)
3681{
3682 kfree(mdev->current_epoch);
3683 kfree(mdev->app_reads_hash);
3684 tl_cleanup(mdev);
3685 if (mdev->bitmap) /* should no longer be there. */
3686 drbd_bm_cleanup(mdev);
3687 __free_page(mdev->md_io_page);
3688 put_disk(mdev->vdisk);
3689 blk_cleanup_queue(mdev->rq_queue);
3690 free_cpumask_var(mdev->cpu_mask);
3719094e 3691 drbd_free_tl_hash(mdev);
b411b363
PR
3692 kfree(mdev);
3693}
3694
3695
3696int __init drbd_init(void)
3697{
3698 int err;
3699
3700 if (sizeof(struct p_handshake) != 80) {
3701 printk(KERN_ERR
3702 "drbd: never change the size or layout "
3703 "of the HandShake packet.\n");
3704 return -EINVAL;
3705 }
3706
2b8a90b5 3707 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
b411b363
PR
3708 printk(KERN_ERR
3709 "drbd: invalid minor_count (%d)\n", minor_count);
3710#ifdef MODULE
3711 return -EINVAL;
3712#else
3713 minor_count = 8;
3714#endif
3715 }
3716
3717 err = drbd_nl_init();
3718 if (err)
3719 return err;
3720
3721 err = register_blkdev(DRBD_MAJOR, "drbd");
3722 if (err) {
3723 printk(KERN_ERR
3724 "drbd: unable to register block device major %d\n",
3725 DRBD_MAJOR);
3726 return err;
3727 }
3728
3729 register_reboot_notifier(&drbd_notifier);
3730
3731 /*
3732 * allocate all necessary structs
3733 */
3734 err = -ENOMEM;
3735
3736 init_waitqueue_head(&drbd_pp_wait);
3737
3738 drbd_proc = NULL; /* play safe for drbd_cleanup */
3739 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3740 GFP_KERNEL);
3741 if (!minor_table)
3742 goto Enomem;
3743
3744 err = drbd_create_mempools();
3745 if (err)
3746 goto Enomem;
3747
8c484ee4 3748 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
3749 if (!drbd_proc) {
3750 printk(KERN_ERR "drbd: unable to register proc file\n");
3751 goto Enomem;
3752 }
3753
3754 rwlock_init(&global_state_lock);
3755
3756 printk(KERN_INFO "drbd: initialized. "
3757 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3758 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3759 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3760 printk(KERN_INFO "drbd: registered as block device major %d\n",
3761 DRBD_MAJOR);
3762 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3763
3764 return 0; /* Success! */
3765
3766Enomem:
3767 drbd_cleanup();
3768 if (err == -ENOMEM)
3769 /* currently always the case */
3770 printk(KERN_ERR "drbd: ran out of memory\n");
3771 else
3772 printk(KERN_ERR "drbd: initialization failure\n");
3773 return err;
3774}
3775
3776void drbd_free_bc(struct drbd_backing_dev *ldev)
3777{
3778 if (ldev == NULL)
3779 return;
3780
e525fd89
TH
3781 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3782 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
b411b363
PR
3783
3784 kfree(ldev);
3785}
3786
3787void drbd_free_sock(struct drbd_conf *mdev)
3788{
3789 if (mdev->data.socket) {
4589d7f8 3790 mutex_lock(&mdev->data.mutex);
b411b363
PR
3791 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3792 sock_release(mdev->data.socket);
3793 mdev->data.socket = NULL;
4589d7f8 3794 mutex_unlock(&mdev->data.mutex);
b411b363
PR
3795 }
3796 if (mdev->meta.socket) {
4589d7f8 3797 mutex_lock(&mdev->meta.mutex);
b411b363
PR
3798 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3799 sock_release(mdev->meta.socket);
3800 mdev->meta.socket = NULL;
4589d7f8 3801 mutex_unlock(&mdev->meta.mutex);
b411b363
PR
3802 }
3803}
3804
3805
3806void drbd_free_resources(struct drbd_conf *mdev)
3807{
3808 crypto_free_hash(mdev->csums_tfm);
3809 mdev->csums_tfm = NULL;
3810 crypto_free_hash(mdev->verify_tfm);
3811 mdev->verify_tfm = NULL;
3812 crypto_free_hash(mdev->cram_hmac_tfm);
3813 mdev->cram_hmac_tfm = NULL;
3814 crypto_free_hash(mdev->integrity_w_tfm);
3815 mdev->integrity_w_tfm = NULL;
3816 crypto_free_hash(mdev->integrity_r_tfm);
3817 mdev->integrity_r_tfm = NULL;
3818
3819 drbd_free_sock(mdev);
3820
3821 __no_warn(local,
3822 drbd_free_bc(mdev->ldev);
3823 mdev->ldev = NULL;);
3824}
3825
3826/* meta data management */
3827
3828struct meta_data_on_disk {
3829 u64 la_size; /* last agreed size. */
3830 u64 uuid[UI_SIZE]; /* UUIDs. */
3831 u64 device_uuid;
3832 u64 reserved_u64_1;
3833 u32 flags; /* MDF */
3834 u32 magic;
3835 u32 md_size_sect;
3836 u32 al_offset; /* offset to this block */
3837 u32 al_nr_extents; /* important for restoring the AL */
3838 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3839 u32 bm_offset; /* offset to the bitmap, from here */
3840 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
99432fcc
PR
3841 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3842 u32 reserved_u32[3];
b411b363
PR
3843
3844} __packed;
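/*
 * Illustrative sketch, not part of the original file: drbd_md_sync() below
 * serializes this structure into a zeroed 512-byte sector buffer, so a
 * compile-time check along these lines would catch the struct growing past
 * one sector (BUILD_BUG_ON() being the usual kernel helper for that):
 *
 *	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) > 512);
 */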
3845
3846/**
3847 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3848 * @mdev: DRBD device.
3849 */
3850void drbd_md_sync(struct drbd_conf *mdev)
3851{
3852 struct meta_data_on_disk *buffer;
3853 sector_t sector;
3854 int i;
3855
ee15b038
LE
3856 del_timer(&mdev->md_sync_timer);
3857 /* timer may be rearmed by drbd_md_mark_dirty() now. */
b411b363
PR
3858 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3859 return;
b411b363
PR
3860
 3861 /* We use D_FAILED here, not D_ATTACHING, because we try to write
 3862 * metadata even if we detach due to a disk failure! */
3863 if (!get_ldev_if_state(mdev, D_FAILED))
3864 return;
3865
e1711731
PR
3866 buffer = drbd_md_get_buffer(mdev);
3867 if (!buffer)
3868 goto out;
3869
b411b363
PR
3870 memset(buffer, 0, 512);
3871
3872 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3873 for (i = UI_CURRENT; i < UI_SIZE; i++)
3874 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3875 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3876 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3877
3878 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3879 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3880 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3881 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3882 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3883
3884 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 3885 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
b411b363
PR
3886
3887 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3888 sector = mdev->ldev->md.md_offset;
3889
3f3a9b84 3890 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
b411b363
PR
3891 /* this was a try anyways ... */
3892 dev_err(DEV, "meta data update failed!\n");
383606e0 3893 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
b411b363
PR
3894 }
3895
3896 /* Update mdev->ldev->md.la_size_sect,
 3897 * to match what we just wrote to the meta data. */
3898 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3899
e1711731
PR
3900 drbd_md_put_buffer(mdev);
3901out:
b411b363
PR
3902 put_ldev(mdev);
3903}
3904
3905/**
3906 * drbd_md_read() - Reads in the meta data super block
3907 * @mdev: DRBD device.
3908 * @bdev: Device from which the meta data should be read in.
3909 *
116676ca 3910 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
b411b363
PR
3911 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3912 */
3913int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3914{
3915 struct meta_data_on_disk *buffer;
3916 int i, rv = NO_ERROR;
3917
3918 if (!get_ldev_if_state(mdev, D_ATTACHING))
3919 return ERR_IO_MD_DISK;
3920
e1711731
PR
3921 buffer = drbd_md_get_buffer(mdev);
3922 if (!buffer)
3923 goto out;
b411b363
PR
3924
3925 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 3926 /* NOTE: can't do normal error processing here as this is
b411b363
PR
3927 called BEFORE disk is attached */
3928 dev_err(DEV, "Error while reading metadata.\n");
3929 rv = ERR_IO_MD_DISK;
3930 goto err;
3931 }
3932
3933 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3934 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3935 rv = ERR_MD_INVALID;
3936 goto err;
3937 }
3938 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3939 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3940 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3941 rv = ERR_MD_INVALID;
3942 goto err;
3943 }
3944 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3945 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3946 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3947 rv = ERR_MD_INVALID;
3948 goto err;
3949 }
3950 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3951 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3952 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3953 rv = ERR_MD_INVALID;
3954 goto err;
3955 }
3956
3957 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3958 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3959 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3960 rv = ERR_MD_INVALID;
3961 goto err;
3962 }
3963
3964 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3965 for (i = UI_CURRENT; i < UI_SIZE; i++)
3966 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3967 bdev->md.flags = be32_to_cpu(buffer->flags);
3968 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3969 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3970
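	/* While not connected, fall back to the last agreed peer max BIO size
	 * recorded in the meta data, bounded below by the safe default; once
	 * a connection is established the peers negotiate a fresh value. */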
99432fcc
PR
3971 spin_lock_irq(&mdev->req_lock);
3972 if (mdev->state.conn < C_CONNECTED) {
db141b2f 3973 unsigned int peer;
99432fcc 3974 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
db141b2f 3975 peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
99432fcc
PR
3976 mdev->peer_max_bio_size = peer;
3977 }
3978 spin_unlock_irq(&mdev->req_lock);
3979
b411b363
PR
3980 if (mdev->sync_conf.al_extents < 7)
3981 mdev->sync_conf.al_extents = 127;
3982
3983 err:
e1711731
PR
3984 drbd_md_put_buffer(mdev);
3985 out:
b411b363
PR
3986 put_ldev(mdev);
3987
3988 return rv;
3989}
3990
3991/**
3992 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3993 * @mdev: DRBD device.
3994 *
3995 * Call this function if you change anything that should be written to
3996 * the meta-data super block. This function sets MD_DIRTY, and starts a
 3997 * timer that ensures drbd_md_sync() is called within five seconds.
3998 */
ca0e6098 3999#ifdef DEBUG
ee15b038
LE
4000void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
4001{
4002 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
4003 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
4004 mdev->last_md_mark_dirty.line = line;
4005 mdev->last_md_mark_dirty.func = func;
4006 }
4007}
4008#else
b411b363
PR
4009void drbd_md_mark_dirty(struct drbd_conf *mdev)
4010{
ee15b038 4011 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 4012 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 4013}
ee15b038 4014#endif
b411b363 4015
9f2247bb 4016void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
b411b363
PR
4017{
4018 int i;
4019
62b0da3a 4020 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 4021 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
4022}
4023
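/* Locking convention for the UUID helpers below: __drbd_uuid_set() expects
 * md.uuid_lock to be held by the caller, while _drbd_uuid_set() and
 * drbd_uuid_set() take the lock themselves; drbd_uuid_set() additionally
 * rotates a previously set value into the history slots. */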
9f2247bb 4024void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
b411b363
PR
4025{
4026 if (idx == UI_CURRENT) {
4027 if (mdev->state.role == R_PRIMARY)
4028 val |= 1;
4029 else
4030 val &= ~((u64)1);
4031
4032 drbd_set_ed_uuid(mdev, val);
4033 }
4034
4035 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
4036 drbd_md_mark_dirty(mdev);
4037}
4038
9f2247bb
PR
4039void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4040{
4041 unsigned long flags;
4042 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
4043 __drbd_uuid_set(mdev, idx, val);
4044 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
4045}
b411b363
PR
4046
4047void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4048{
9f2247bb
PR
4049 unsigned long flags;
4050 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
b411b363
PR
4051 if (mdev->ldev->md.uuid[idx]) {
4052 drbd_uuid_move_history(mdev);
4053 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363 4054 }
9f2247bb
PR
4055 __drbd_uuid_set(mdev, idx, val);
4056 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
b411b363
PR
4057}
4058
4059/**
4060 * drbd_uuid_new_current() - Creates a new current UUID
4061 * @mdev: DRBD device.
4062 *
4063 * Creates a new current UUID, and rotates the old current UUID into
4064 * the bitmap slot. Causes an incremental resync upon next connect.
4065 */
4066void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
4067{
4068 u64 val;
9f2247bb
PR
4069 unsigned long long bm_uuid;
4070
4071 get_random_bytes(&val, sizeof(u64));
4072
4073 spin_lock_irq(&mdev->ldev->md.uuid_lock);
4074 bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
62b0da3a
LE
4075
4076 if (bm_uuid)
4077 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 4078
b411b363 4079 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
9f2247bb
PR
4080 __drbd_uuid_set(mdev, UI_CURRENT, val);
4081 spin_unlock_irq(&mdev->ldev->md.uuid_lock);
b411b363 4082
62b0da3a 4083 drbd_print_uuids(mdev, "new current UUID");
aaa8e2b3
LE
4084 /* get it to stable storage _now_ */
4085 drbd_md_sync(mdev);
b411b363
PR
4086}
4087
4088void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
4089{
9f2247bb 4090 unsigned long flags;
b411b363
PR
4091 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
4092 return;
4093
9f2247bb 4094 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
b411b363
PR
4095 if (val == 0) {
4096 drbd_uuid_move_history(mdev);
4097 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
4098 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 4099 } else {
62b0da3a
LE
4100 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4101 if (bm_uuid)
4102 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 4103
62b0da3a 4104 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
b411b363 4105 }
9f2247bb
PR
4106 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
4107
b411b363
PR
4108 drbd_md_mark_dirty(mdev);
4109}
4110
4111/**
4112 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4113 * @mdev: DRBD device.
4114 *
4115 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
4116 */
4117int drbd_bmio_set_n_write(struct drbd_conf *mdev)
4118{
4119 int rv = -EIO;
4120
4121 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4122 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
4123 drbd_md_sync(mdev);
4124 drbd_bm_set_all(mdev);
4125
4126 rv = drbd_bm_write(mdev);
4127
4128 if (!rv) {
4129 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
4130 drbd_md_sync(mdev);
4131 }
4132
4133 put_ldev(mdev);
4134 }
4135
4136 return rv;
4137}
4138
4139/**
4140 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4141 * @mdev: DRBD device.
4142 *
4143 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
4144 */
4145int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
4146{
4147 int rv = -EIO;
4148
0778286a 4149 drbd_resume_al(mdev);
b411b363
PR
4150 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4151 drbd_bm_clear_all(mdev);
4152 rv = drbd_bm_write(mdev);
4153 put_ldev(mdev);
4154 }
4155
4156 return rv;
4157}
4158
4159static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4160{
4161 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
02851e9f 4162 int rv = -EIO;
b411b363
PR
4163
4164 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
4165
02851e9f 4166 if (get_ldev(mdev)) {
20ceb2b2 4167 drbd_bm_lock(mdev, work->why, work->flags);
02851e9f
LE
4168 rv = work->io_fn(mdev);
4169 drbd_bm_unlock(mdev);
4170 put_ldev(mdev);
4171 }
b411b363
PR
4172
4173 clear_bit(BITMAP_IO, &mdev->flags);
127b3178 4174 smp_mb__after_clear_bit();
b411b363
PR
4175 wake_up(&mdev->misc_wait);
4176
4177 if (work->done)
4178 work->done(mdev, rv);
4179
4180 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
4181 work->why = NULL;
20ceb2b2 4182 work->flags = 0;
b411b363
PR
4183
4184 return 1;
4185}
4186
82f59cc6
LE
4187void drbd_ldev_destroy(struct drbd_conf *mdev)
4188{
4189 lc_destroy(mdev->resync);
4190 mdev->resync = NULL;
4191 lc_destroy(mdev->act_log);
4192 mdev->act_log = NULL;
4193 __no_warn(local,
4194 drbd_free_bc(mdev->ldev);
4195 mdev->ldev = NULL;);
4196
4197 if (mdev->md_io_tmpp) {
4198 __free_page(mdev->md_io_tmpp);
4199 mdev->md_io_tmpp = NULL;
4200 }
4201 clear_bit(GO_DISKLESS, &mdev->flags);
4202}
4203
e9e6f3ec
LE
4204static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4205{
4206 D_ASSERT(mdev->state.disk == D_FAILED);
9d282875
LE
4207 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4208 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
82f59cc6
LE
4209 * the protected members anymore, though, so once put_ldev reaches zero
4210 * again, it will be safe to free them. */
e9e6f3ec 4211 drbd_force_state(mdev, NS(disk, D_DISKLESS));
e9e6f3ec
LE
4212 return 1;
4213}
4214
4215void drbd_go_diskless(struct drbd_conf *mdev)
4216{
4217 D_ASSERT(mdev->state.disk == D_FAILED);
4218 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
9d282875 4219 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
e9e6f3ec
LE
4220}
4221
b411b363
PR
4222/**
4223 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4224 * @mdev: DRBD device.
4225 * @io_fn: IO callback to be called when bitmap IO is possible
4226 * @done: callback to be called after the bitmap IO was performed
4227 * @why: Descriptive text of the reason for doing the IO
4228 *
 4229 * While IO on the bitmap happens, application IO is frozen, which ensures
 4230 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4231 * called from worker context. It MUST NOT be used while a previous such
4232 * work is still pending!
4233 */
4234void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4235 int (*io_fn)(struct drbd_conf *),
4236 void (*done)(struct drbd_conf *, int),
20ceb2b2 4237 char *why, enum bm_flag flags)
b411b363
PR
4238{
4239 D_ASSERT(current == mdev->worker.task);
4240
4241 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4242 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4243 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4244 if (mdev->bm_io_work.why)
4245 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4246 why, mdev->bm_io_work.why);
4247
4248 mdev->bm_io_work.io_fn = io_fn;
4249 mdev->bm_io_work.done = done;
4250 mdev->bm_io_work.why = why;
20ceb2b2 4251 mdev->bm_io_work.flags = flags;
b411b363 4252
22afd7ee 4253 spin_lock_irq(&mdev->req_lock);
b411b363
PR
4254 set_bit(BITMAP_IO, &mdev->flags);
4255 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 4256 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
b411b363 4257 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
b411b363 4258 }
22afd7ee 4259 spin_unlock_irq(&mdev->req_lock);
b411b363
PR
4260}
4261
4262/**
4263 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4264 * @mdev: DRBD device.
4265 * @io_fn: IO callback to be called when bitmap IO is possible
4266 * @why: Descriptive text of the reason for doing the IO
4267 *
 4268 * Freezes application IO while the actual IO operation runs. This
 4269 * function MAY NOT be called from worker context.
4270 */
20ceb2b2
LE
4271int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4272 char *why, enum bm_flag flags)
b411b363
PR
4273{
4274 int rv;
4275
4276 D_ASSERT(current != mdev->worker.task);
4277
20ceb2b2
LE
4278 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4279 drbd_suspend_io(mdev);
b411b363 4280
20ceb2b2 4281 drbd_bm_lock(mdev, why, flags);
b411b363
PR
4282 rv = io_fn(mdev);
4283 drbd_bm_unlock(mdev);
4284
20ceb2b2
LE
4285 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4286 drbd_resume_io(mdev);
b411b363
PR
4287
4288 return rv;
4289}
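/*
 * Illustrative usage sketch, not part of the original file: a non-worker
 * caller (the attach path, say) might invoke it roughly like
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			    "set_n_write from attaching", BM_LOCKED_MASK);
 *
 * while worker-context callers go through drbd_queue_bitmap_io() instead.
 * The flag shown is an assumption; the call site picks whichever bm_flag
 * value matches the bitmap operations it needs to allow while locked.
 */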
4290
4291void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4292{
4293 if ((mdev->ldev->md.flags & flag) != flag) {
4294 drbd_md_mark_dirty(mdev);
4295 mdev->ldev->md.flags |= flag;
4296 }
4297}
4298
4299void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4300{
4301 if ((mdev->ldev->md.flags & flag) != 0) {
4302 drbd_md_mark_dirty(mdev);
4303 mdev->ldev->md.flags &= ~flag;
4304 }
4305}
4306int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4307{
4308 return (bdev->md.flags & flag) != 0;
4309}
4310
4311static void md_sync_timer_fn(unsigned long data)
4312{
4313 struct drbd_conf *mdev = (struct drbd_conf *) data;
4314
4315 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4316}
4317
4318static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4319{
4320 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
ee15b038
LE
4321#ifdef DEBUG
4322 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4323 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4324#endif
b411b363 4325 drbd_md_sync(mdev);
b411b363
PR
4326 return 1;
4327}
4328
4329#ifdef CONFIG_DRBD_FAULT_INJECTION
4330/* Fault insertion support including random number generator shamelessly
4331 * stolen from kernel/rcutorture.c */
4332struct fault_random_state {
4333 unsigned long state;
4334 unsigned long count;
4335};
4336
4337#define FAULT_RANDOM_MULT 39916801 /* prime */
4338#define FAULT_RANDOM_ADD 479001701 /* prime */
4339#define FAULT_RANDOM_REFRESH 10000
4340
4341/*
4342 * Crude but fast random-number generator. Uses a linear congruential
4343 * generator, with occasional help from get_random_bytes().
4344 */
4345static unsigned long
4346_drbd_fault_random(struct fault_random_state *rsp)
4347{
4348 long refresh;
4349
49829ea7 4350 if (!rsp->count--) {
b411b363
PR
4351 get_random_bytes(&refresh, sizeof(refresh));
4352 rsp->state += refresh;
4353 rsp->count = FAULT_RANDOM_REFRESH;
4354 }
4355 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
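	/* The low-order bits of a linear congruential generator are the
	 * weakest, so swapping the 16-bit halfwords moves better-mixed bits
	 * into the low positions consumed by the "% 100" check in
	 * _drbd_insert_fault(). */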
4356 return swahw32(rsp->state);
4357}
4358
4359static char *
4360_drbd_fault_str(unsigned int type) {
4361 static char *_faults[] = {
4362 [DRBD_FAULT_MD_WR] = "Meta-data write",
4363 [DRBD_FAULT_MD_RD] = "Meta-data read",
4364 [DRBD_FAULT_RS_WR] = "Resync write",
4365 [DRBD_FAULT_RS_RD] = "Resync read",
4366 [DRBD_FAULT_DT_WR] = "Data write",
4367 [DRBD_FAULT_DT_RD] = "Data read",
4368 [DRBD_FAULT_DT_RA] = "Data read ahead",
4369 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
4370 [DRBD_FAULT_AL_EE] = "EE allocation",
4371 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
4372 };
4373
4374 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4375}
4376
4377unsigned int
4378_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4379{
4380 static struct fault_random_state rrs = {0, 0};
4381
4382 unsigned int ret = (
4383 (fault_devs == 0 ||
4384 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4385 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4386
4387 if (ret) {
4388 fault_count++;
4389
7383506c 4390 if (__ratelimit(&drbd_ratelimit_state))
b411b363
PR
4391 dev_warn(DEV, "***Simulating %s failure\n",
4392 _drbd_fault_str(type));
4393 }
4394
4395 return ret;
4396}
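/*
 * Illustrative sketch, not part of the original file: call sites typically
 * wrap this through a small drbd_insert_fault()-style helper (assumed here)
 * around bio submission, along the lines of
 *
 *	if (drbd_insert_fault(mdev, DRBD_FAULT_MD_WR))
 *		bio_endio(bio, -EIO);
 *	else
 *		submit_bio(rw, bio);
 *
 * so that a simulated failure completes the bio with an error instead of
 * touching the device.
 */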
4397#endif
4398
4399const char *drbd_buildtag(void)
4400{
 4401 /* When DRBD is built from external sources, this holds a reference to
 4402 the git hash of the source code. */
4403
4404 static char buildtag[38] = "\0uilt-in";
4405
4406 if (buildtag[0] == 0) {
bc4854bc
CW
4407#ifdef MODULE
4408 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4409#else
4410 buildtag[0] = 'b';
b411b363 4411#endif
b411b363
PR
4412 }
4413
4414 return buildtag;
4415}
4416
4417module_init(drbd_init)
4418module_exit(drbd_cleanup)
4419
b411b363
PR
4420EXPORT_SYMBOL(drbd_conn_str);
4421EXPORT_SYMBOL(drbd_role_str);
4422EXPORT_SYMBOL(drbd_disk_str);
4423EXPORT_SYMBOL(drbd_set_st_err_str);