1 /*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/mutex.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57 #include "drbd_vli.h"
58
59 struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65 };
66
67 static DEFINE_MUTEX(drbd_main_mutex);
68 int drbdd_init(struct drbd_thread *);
69 int drbd_worker(struct drbd_thread *);
70 int drbd_asender(struct drbd_thread *);
71
72 int drbd_init(void);
73 static int drbd_open(struct block_device *bdev, fmode_t mode);
74 static int drbd_release(struct gendisk *gd, fmode_t mode);
75 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79 static void md_sync_timer_fn(unsigned long data);
80 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82
83 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84 "Lars Ellenberg <lars@linbit.com>");
85 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86 MODULE_VERSION(REL_VERSION);
87 MODULE_LICENSE("GPL");
88 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
89 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
90 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
91
92 #include <linux/moduleparam.h>
93 /* allow_open_on_secondary */
94 MODULE_PARM_DESC(allow_oos, "DONT USE!");
95 /* thanks to these macros, if compiled into the kernel (not-module),
96 * this becomes the boot parameter drbd.minor_count */
97 module_param(minor_count, uint, 0444);
98 module_param(disable_sendpage, bool, 0644);
99 module_param(allow_oos, bool, 0);
100 module_param(cn_idx, uint, 0444);
101 module_param(proc_details, int, 0644);
102
103 #ifdef CONFIG_DRBD_FAULT_INJECTION
104 int enable_faults;
105 int fault_rate;
106 static int fault_count;
107 int fault_devs;
108 /* bitmap of enabled faults */
109 module_param(enable_faults, int, 0664);
110 /* fault rate % value - applies to all enabled faults */
111 module_param(fault_rate, int, 0664);
112 /* count of faults inserted */
113 module_param(fault_count, int, 0664);
114 /* bitmap of devices to insert faults on */
115 module_param(fault_devs, int, 0644);
116 #endif
117
118 /* module parameter, defined */
119 unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
120 bool disable_sendpage;
121 bool allow_oos;
122 unsigned int cn_idx = CN_IDX_DRBD;
123 int proc_details;       /* Detail level in proc drbd */
124
125 /* Module parameter for setting the user mode helper program
126 * to run. Default is /sbin/drbdadm */
127 char usermode_helper[80] = "/sbin/drbdadm";
128
129 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
130
131 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
132 * as member "struct gendisk *vdisk;"
133 */
134 struct drbd_conf **minor_table;
135
136 struct kmem_cache *drbd_request_cache;
137 struct kmem_cache *drbd_ee_cache; /* epoch entries */
138 struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
139 struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
140 mempool_t *drbd_request_mempool;
141 mempool_t *drbd_ee_mempool;
142
143 /* I do not use a standard mempool, because:
144 1) I want to hand out the pre-allocated objects first.
145 2) I want to be able to interrupt sleeping allocation with a signal.
146    Note: This is a singly linked list, the next pointer is the private
147 member of struct page.
148 */
149 struct page *drbd_pp_pool;
150 spinlock_t drbd_pp_lock;
151 int drbd_pp_vacant;
152 wait_queue_head_t drbd_pp_wait;
153
154 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
155
156 static const struct block_device_operations drbd_ops = {
157 .owner = THIS_MODULE,
158 .open = drbd_open,
159 .release = drbd_release,
160 };
161
162 #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
163
164 #ifdef __CHECKER__
165 /* When checking with sparse, and this is an inline function, sparse will
166    give tons of false positives. When this is a real function, sparse works.
167 */
168 int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
169 {
170 int io_allowed;
171
172 atomic_inc(&mdev->local_cnt);
173 io_allowed = (mdev->state.disk >= mins);
174 if (!io_allowed) {
175 if (atomic_dec_and_test(&mdev->local_cnt))
176 wake_up(&mdev->misc_wait);
177 }
178 return io_allowed;
179 }
180
181 #endif
182
183 /**
184 * DOC: The transfer log
185 *
186  * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
187 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
188 * of the list. There is always at least one &struct drbd_tl_epoch object.
189 *
190  * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
191 * attached.
192 */
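/**
 * tl_init() - Allocate and set up the initial transfer log epoch
 * @mdev: DRBD device.
 *
 * Allocates the first &struct drbd_tl_epoch and makes it both the oldest
 * and the newest element of the transfer log. Returns 1 on success, 0 if
 * the allocation fails.
 */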
193 static int tl_init(struct drbd_conf *mdev)
194 {
195 struct drbd_tl_epoch *b;
196
197 /* during device minor initialization, we may well use GFP_KERNEL */
198 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
199 if (!b)
200 return 0;
201 INIT_LIST_HEAD(&b->requests);
202 INIT_LIST_HEAD(&b->w.list);
203 b->next = NULL;
204 b->br_number = 4711;
205 b->n_writes = 0;
206 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
207
208 mdev->oldest_tle = b;
209 mdev->newest_tle = b;
210 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
211 INIT_LIST_HEAD(&mdev->barrier_acked_requests);
212
213 mdev->tl_hash = NULL;
214 mdev->tl_hash_s = 0;
215
216 return 1;
217 }
218
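/**
 * tl_cleanup() - Free the transfer log structures of a device
 * @mdev: DRBD device.
 *
 * Expects the transfer log to be down to its single remaining epoch and
 * frees that epoch, a possibly unused spare epoch, and the tl_hash table.
 */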
219 static void tl_cleanup(struct drbd_conf *mdev)
220 {
221 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
222 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
223 kfree(mdev->oldest_tle);
224 mdev->oldest_tle = NULL;
225 kfree(mdev->unused_spare_tle);
226 mdev->unused_spare_tle = NULL;
227 kfree(mdev->tl_hash);
228 mdev->tl_hash = NULL;
229 mdev->tl_hash_s = 0;
230 }
231
232 /**
233 * _tl_add_barrier() - Adds a barrier to the transfer log
234 * @mdev: DRBD device.
235 * @new: Barrier to be added before the current head of the TL.
236 *
237 * The caller must hold the req_lock.
238 */
239 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
240 {
241 struct drbd_tl_epoch *newest_before;
242
243 INIT_LIST_HEAD(&new->requests);
244 INIT_LIST_HEAD(&new->w.list);
245 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
246 new->next = NULL;
247 new->n_writes = 0;
248
249 newest_before = mdev->newest_tle;
250 new->br_number = newest_before->br_number+1;
251 if (mdev->newest_tle != new) {
252 mdev->newest_tle->next = new;
253 mdev->newest_tle = new;
254 }
255 }
256
257 /**
258 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
259 * @mdev: DRBD device.
260 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
261 * @set_size: Expected number of requests before that barrier.
262 *
263 * In case the passed barrier_nr or set_size does not match the oldest
264 * &struct drbd_tl_epoch objects this function will cause a termination
265 * of the connection.
266 */
267 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
268 unsigned int set_size)
269 {
270 struct drbd_tl_epoch *b, *nob; /* next old barrier */
271 struct list_head *le, *tle;
272 struct drbd_request *r;
273
274 spin_lock_irq(&mdev->req_lock);
275
276 b = mdev->oldest_tle;
277
278 /* first some paranoia code */
279 if (b == NULL) {
280 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
281 barrier_nr);
282 goto bail;
283 }
284 if (b->br_number != barrier_nr) {
285 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
286 barrier_nr, b->br_number);
287 goto bail;
288 }
289 if (b->n_writes != set_size) {
290 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
291 barrier_nr, set_size, b->n_writes);
292 goto bail;
293 }
294
295 /* Clean up list of requests processed during current epoch */
296 list_for_each_safe(le, tle, &b->requests) {
297 r = list_entry(le, struct drbd_request, tl_requests);
298 _req_mod(r, barrier_acked);
299 }
300 /* There could be requests on the list waiting for completion
301    of the write to the local disk. To avoid corruption of
302    slab's data structures we have to remove the list's head.
303
304 Also there could have been a barrier ack out of sequence, overtaking
305    the write acks - which would be a bug and violate write ordering.
306 To not deadlock in case we lose connection while such requests are
307 still pending, we need some way to find them for the
308    _req_mod(connection_lost_while_pending).
309
310 These have been list_move'd to the out_of_sequence_requests list in
311 _req_mod(, barrier_acked) above.
312 */
313 list_splice_init(&b->requests, &mdev->barrier_acked_requests);
314
315 nob = b->next;
316 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
317 _tl_add_barrier(mdev, b);
318 if (nob)
319 mdev->oldest_tle = nob;
320 /* if nob == NULL b was the only barrier, and becomes the new
321    barrier. Therefore mdev->oldest_tle already points to b */
322 } else {
323 D_ASSERT(nob != NULL);
324 mdev->oldest_tle = nob;
325 kfree(b);
326 }
327
328 spin_unlock_irq(&mdev->req_lock);
329 dec_ap_pending(mdev);
330
331 return;
332
333 bail:
334 spin_unlock_irq(&mdev->req_lock);
335 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
336 }
337
338
339 /**
340 * _tl_restart() - Walks the transfer log, and applies an action to all requests
341 * @mdev: DRBD device.
342 * @what: The action/event to perform with all request objects
343 *
344 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
345 * restart_frozen_disk_io.
346 */
347 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
348 {
349 struct drbd_tl_epoch *b, *tmp, **pn;
350 struct list_head *le, *tle, carry_reads;
351 struct drbd_request *req;
352 int rv, n_writes, n_reads;
353
354 b = mdev->oldest_tle;
355 pn = &mdev->oldest_tle;
356 while (b) {
357 n_writes = 0;
358 n_reads = 0;
359 INIT_LIST_HEAD(&carry_reads);
360 list_for_each_safe(le, tle, &b->requests) {
361 req = list_entry(le, struct drbd_request, tl_requests);
362 rv = _req_mod(req, what);
363
364 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
365 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
366 }
367 tmp = b->next;
368
369 if (n_writes) {
370 if (what == resend) {
371 b->n_writes = n_writes;
372 if (b->w.cb == NULL) {
373 b->w.cb = w_send_barrier;
374 inc_ap_pending(mdev);
375 set_bit(CREATE_BARRIER, &mdev->flags);
376 }
377
378 drbd_queue_work(&mdev->data.work, &b->w);
379 }
380 pn = &b->next;
381 } else {
382 if (n_reads)
383 list_add(&carry_reads, &b->requests);
384 /* there could still be requests on that ring list,
385 * in case local io is still pending */
386 list_del(&b->requests);
387
388 /* dec_ap_pending corresponding to queue_barrier.
389 * the newest barrier may not have been queued yet,
390 * in which case w.cb is still NULL. */
391 if (b->w.cb != NULL)
392 dec_ap_pending(mdev);
393
394 if (b == mdev->newest_tle) {
395 /* recycle, but reinit! */
396 D_ASSERT(tmp == NULL);
397 INIT_LIST_HEAD(&b->requests);
398 list_splice(&carry_reads, &b->requests);
399 INIT_LIST_HEAD(&b->w.list);
400 b->w.cb = NULL;
401 b->br_number = net_random();
402 b->n_writes = 0;
403
404 *pn = b;
405 break;
406 }
407 *pn = tmp;
408 kfree(b);
409 }
410 b = tmp;
411 list_splice(&carry_reads, &b->requests);
412 }
413
414     /* Actions operating on the disk state also want to work on
415 requests that got barrier acked. */
416 switch (what) {
417 case fail_frozen_disk_io:
418 case restart_frozen_disk_io:
419 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
420 req = list_entry(le, struct drbd_request, tl_requests);
421 _req_mod(req, what);
422 }
423
424 case connection_lost_while_pending:
425 case resend:
426 break;
427 default:
428 dev_err(DEV, "what = %d in _tl_restart()\n", what);
429 }
430 }
431
432
433 /**
434 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
435 * @mdev: DRBD device.
436 *
437 * This is called after the connection to the peer was lost. The storage covered
438  * by the requests on the transfer log gets marked as out of sync. Called from the
439 * receiver thread and the worker thread.
440 */
441 void tl_clear(struct drbd_conf *mdev)
442 {
443 struct list_head *le, *tle;
444 struct drbd_request *r;
445
446 spin_lock_irq(&mdev->req_lock);
447
448 _tl_restart(mdev, connection_lost_while_pending);
449
450 /* we expect this list to be empty. */
451 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
452
453 /* but just in case, clean it up anyways! */
454 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
455 r = list_entry(le, struct drbd_request, tl_requests);
456 /* It would be nice to complete outside of spinlock.
457 * But this is easier for now. */
458 _req_mod(r, connection_lost_while_pending);
459 }
460
461 /* ensure bit indicating barrier is required is clear */
462 clear_bit(CREATE_BARRIER, &mdev->flags);
463
464 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
465
466 spin_unlock_irq(&mdev->req_lock);
467 }
468
469 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
470 {
471 spin_lock_irq(&mdev->req_lock);
472 _tl_restart(mdev, what);
473 spin_unlock_irq(&mdev->req_lock);
474 }
475
476 /**
477 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
478 * @mdev: DRBD device.
479 */
480 void tl_abort_disk_io(struct drbd_conf *mdev)
481 {
482 struct drbd_tl_epoch *b;
483 struct list_head *le, *tle;
484 struct drbd_request *req;
485
486 spin_lock_irq(&mdev->req_lock);
487 b = mdev->oldest_tle;
488 while (b) {
489 list_for_each_safe(le, tle, &b->requests) {
490 req = list_entry(le, struct drbd_request, tl_requests);
491 if (!(req->rq_state & RQ_LOCAL_PENDING))
492 continue;
493 _req_mod(req, abort_disk_io);
494 }
495 b = b->next;
496 }
497
498 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
499 req = list_entry(le, struct drbd_request, tl_requests);
500 if (!(req->rq_state & RQ_LOCAL_PENDING))
501 continue;
502 _req_mod(req, abort_disk_io);
503 }
504
505 spin_unlock_irq(&mdev->req_lock);
506 }
507
508 /**
509 * cl_wide_st_chg() - true if the state change is a cluster wide one
510 * @mdev: DRBD device.
511 * @os: old (current) state.
512 * @ns: new (wanted) state.
513 */
514 static int cl_wide_st_chg(struct drbd_conf *mdev,
515 union drbd_state os, union drbd_state ns)
516 {
517 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
518 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
519 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
520 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
521 (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
522 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
523 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
524 }
525
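/**
 * drbd_change_state() - Change the state under req_lock, without peer interaction
 * @mdev: DRBD device.
 * @f:    state change flags, see &enum chg_state_flags.
 * @mask: mask of state bits to change.
 * @val:  value of new state bits.
 *
 * Computes the new state from the current one and applies it via
 * _drbd_set_state() while holding req_lock.
 */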
526 enum drbd_state_rv
527 drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
528 union drbd_state mask, union drbd_state val)
529 {
530 unsigned long flags;
531 union drbd_state os, ns;
532 enum drbd_state_rv rv;
533
534 spin_lock_irqsave(&mdev->req_lock, flags);
535 os = mdev->state;
536 ns.i = (os.i & ~mask.i) | val.i;
537 rv = _drbd_set_state(mdev, ns, f, NULL);
538 ns = mdev->state;
539 spin_unlock_irqrestore(&mdev->req_lock, flags);
540
541 return rv;
542 }
543
544 /**
545 * drbd_force_state() - Impose a change which happens outside our control on our state
546 * @mdev: DRBD device.
547 * @mask: mask of state bits to change.
548 * @val: value of new state bits.
549 */
550 void drbd_force_state(struct drbd_conf *mdev,
551 union drbd_state mask, union drbd_state val)
552 {
553 drbd_change_state(mdev, CS_HARD, mask, val);
554 }
555
556 static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
557 static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
558 union drbd_state,
559 union drbd_state);
560 enum sanitize_state_warnings {
561 NO_WARNING,
562 ABORTED_ONLINE_VERIFY,
563 ABORTED_RESYNC,
564 CONNECTION_LOST_NEGOTIATING,
565 IMPLICITLY_UPGRADED_DISK,
566 IMPLICITLY_UPGRADED_PDSK,
567 };
568 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
569 union drbd_state ns, enum sanitize_state_warnings *warn);
570 int drbd_send_state_req(struct drbd_conf *,
571 union drbd_state, union drbd_state);
572
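/**
 * _req_st_cond() - Condition evaluated while waiting for a cluster wide state change
 * @mdev: DRBD device.
 * @mask: mask of state bits to change.
 * @val:  value of new state bits.
 *
 * Used as the wait_event() condition in drbd_req_state(). Returns
 * SS_CW_SUCCESS or SS_CW_FAILED_BY_PEER once the peer has answered,
 * SS_CW_NO_NEED if no cluster wide change is required, a failure code if
 * the requested change became invalid in the meantime, or SS_UNKNOWN_ERROR
 * (zero) to keep waiting.
 */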
573 static enum drbd_state_rv
574 _req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
575 union drbd_state val)
576 {
577 union drbd_state os, ns;
578 unsigned long flags;
579 enum drbd_state_rv rv;
580
581 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
582 return SS_CW_SUCCESS;
583
584 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
585 return SS_CW_FAILED_BY_PEER;
586
587 rv = 0;
588 spin_lock_irqsave(&mdev->req_lock, flags);
589 os = mdev->state;
590 ns.i = (os.i & ~mask.i) | val.i;
591 ns = sanitize_state(mdev, os, ns, NULL);
592
593 if (!cl_wide_st_chg(mdev, os, ns))
594 rv = SS_CW_NO_NEED;
595 if (!rv) {
596 rv = is_valid_state(mdev, ns);
597 if (rv == SS_SUCCESS) {
598 rv = is_valid_state_transition(mdev, ns, os);
599 if (rv == SS_SUCCESS)
600 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
601 }
602 }
603 spin_unlock_irqrestore(&mdev->req_lock, flags);
604
605 return rv;
606 }
607
608 /**
609  * drbd_req_state() - Perform a possibly cluster-wide state change
610 * @mdev: DRBD device.
611 * @mask: mask of state bits to change.
612 * @val: value of new state bits.
613 * @f: flags
614 *
615 * Should not be called directly, use drbd_request_state() or
616 * _drbd_request_state().
617 */
618 static enum drbd_state_rv
619 drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
620 union drbd_state val, enum chg_state_flags f)
621 {
622 struct completion done;
623 unsigned long flags;
624 union drbd_state os, ns;
625 enum drbd_state_rv rv;
626
627 init_completion(&done);
628
629 if (f & CS_SERIALIZE)
630 mutex_lock(&mdev->state_mutex);
631
632 spin_lock_irqsave(&mdev->req_lock, flags);
633 os = mdev->state;
634 ns.i = (os.i & ~mask.i) | val.i;
635 ns = sanitize_state(mdev, os, ns, NULL);
636
637 if (cl_wide_st_chg(mdev, os, ns)) {
638 rv = is_valid_state(mdev, ns);
639 if (rv == SS_SUCCESS)
640 rv = is_valid_state_transition(mdev, ns, os);
641 spin_unlock_irqrestore(&mdev->req_lock, flags);
642
643 if (rv < SS_SUCCESS) {
644 if (f & CS_VERBOSE)
645 print_st_err(mdev, os, ns, rv);
646 goto abort;
647 }
648
649 drbd_state_lock(mdev);
650 if (!drbd_send_state_req(mdev, mask, val)) {
651 drbd_state_unlock(mdev);
652 rv = SS_CW_FAILED_BY_PEER;
653 if (f & CS_VERBOSE)
654 print_st_err(mdev, os, ns, rv);
655 goto abort;
656 }
657
658 wait_event(mdev->state_wait,
659 (rv = _req_st_cond(mdev, mask, val)));
660
661 if (rv < SS_SUCCESS) {
662 drbd_state_unlock(mdev);
663 if (f & CS_VERBOSE)
664 print_st_err(mdev, os, ns, rv);
665 goto abort;
666 }
667 spin_lock_irqsave(&mdev->req_lock, flags);
668 os = mdev->state;
669 ns.i = (os.i & ~mask.i) | val.i;
670 rv = _drbd_set_state(mdev, ns, f, &done);
671 drbd_state_unlock(mdev);
672 } else {
673 rv = _drbd_set_state(mdev, ns, f, &done);
674 }
675
676 spin_unlock_irqrestore(&mdev->req_lock, flags);
677
678 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
679 D_ASSERT(current != mdev->worker.task);
680 wait_for_completion(&done);
681 }
682
683 abort:
684 if (f & CS_SERIALIZE)
685 mutex_unlock(&mdev->state_mutex);
686
687 return rv;
688 }
689
690 /**
691 * _drbd_request_state() - Request a state change (with flags)
692 * @mdev: DRBD device.
693 * @mask: mask of state bits to change.
694 * @val: value of new state bits.
695 * @f: flags
696 *
697 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
698 * flag, or when logging of failed state change requests is not desired.
699 */
700 enum drbd_state_rv
701 _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
702 union drbd_state val, enum chg_state_flags f)
703 {
704 enum drbd_state_rv rv;
705
706 wait_event(mdev->state_wait,
707 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
708
709 return rv;
710 }
711
712 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
713 {
714 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
715 name,
716 drbd_conn_str(ns.conn),
717 drbd_role_str(ns.role),
718 drbd_role_str(ns.peer),
719 drbd_disk_str(ns.disk),
720 drbd_disk_str(ns.pdsk),
721 is_susp(ns) ? 's' : 'r',
722 ns.aftr_isp ? 'a' : '-',
723 ns.peer_isp ? 'p' : '-',
724 ns.user_isp ? 'u' : '-'
725 );
726 }
727
728 void print_st_err(struct drbd_conf *mdev, union drbd_state os,
729 union drbd_state ns, enum drbd_state_rv err)
730 {
731 if (err == SS_IN_TRANSIENT_STATE)
732 return;
733 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
734 print_st(mdev, " state", os);
735 print_st(mdev, "wanted", ns);
736 }
737
738
739 /**
740 * is_valid_state() - Returns an SS_ error code if ns is not valid
741 * @mdev: DRBD device.
742 * @ns: State to consider.
743 */
744 static enum drbd_state_rv
745 is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
746 {
747 /* See drbd_state_sw_errors in drbd_strings.c */
748
749 enum drbd_fencing_p fp;
750 enum drbd_state_rv rv = SS_SUCCESS;
751
752 fp = FP_DONT_CARE;
753 if (get_ldev(mdev)) {
754 fp = mdev->ldev->dc.fencing;
755 put_ldev(mdev);
756 }
757
758 if (get_net_conf(mdev)) {
759 if (!mdev->net_conf->two_primaries &&
760 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
761 rv = SS_TWO_PRIMARIES;
762 put_net_conf(mdev);
763 }
764
765 if (rv <= 0)
766 /* already found a reason to abort */;
767 else if (ns.role == R_SECONDARY && mdev->open_cnt)
768 rv = SS_DEVICE_IN_USE;
769
770 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
771 rv = SS_NO_UP_TO_DATE_DISK;
772
773 else if (fp >= FP_RESOURCE &&
774 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
775 rv = SS_PRIMARY_NOP;
776
777 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
778 rv = SS_NO_UP_TO_DATE_DISK;
779
780 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
781 rv = SS_NO_LOCAL_DISK;
782
783 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
784 rv = SS_NO_REMOTE_DISK;
785
786 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
787 rv = SS_NO_UP_TO_DATE_DISK;
788
789 else if ((ns.conn == C_CONNECTED ||
790 ns.conn == C_WF_BITMAP_S ||
791 ns.conn == C_SYNC_SOURCE ||
792 ns.conn == C_PAUSED_SYNC_S) &&
793 ns.disk == D_OUTDATED)
794 rv = SS_CONNECTED_OUTDATES;
795
796 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
797 (mdev->sync_conf.verify_alg[0] == 0))
798 rv = SS_NO_VERIFY_ALG;
799
800 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
801 mdev->agreed_pro_version < 88)
802 rv = SS_NOT_SUPPORTED;
803
804 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
805 rv = SS_CONNECTED_OUTDATES;
806
807 return rv;
808 }
809
810 /**
811 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
812 * @mdev: DRBD device.
813 * @ns: new state.
814 * @os: old state.
815 */
816 static enum drbd_state_rv
817 is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
818 union drbd_state os)
819 {
820 enum drbd_state_rv rv = SS_SUCCESS;
821
822 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
823 os.conn > C_CONNECTED)
824 rv = SS_RESYNC_RUNNING;
825
826 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
827 rv = SS_ALREADY_STANDALONE;
828
829 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
830 rv = SS_IS_DISKLESS;
831
832 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
833 rv = SS_NO_NET_CONFIG;
834
835 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
836 rv = SS_LOWER_THAN_OUTDATED;
837
838 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
839 rv = SS_IN_TRANSIENT_STATE;
840
841 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
842 rv = SS_IN_TRANSIENT_STATE;
843
844 /* While establishing a connection only allow cstate to change.
845 Delay/refuse role changes, detach attach etc... */
846 if (test_bit(STATE_SENT, &mdev->flags) &&
847 !(os.conn == C_WF_REPORT_PARAMS ||
848 (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
849 rv = SS_IN_TRANSIENT_STATE;
850
851 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
852 rv = SS_NEED_CONNECTION;
853
854 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
855 ns.conn != os.conn && os.conn > C_CONNECTED)
856 rv = SS_RESYNC_RUNNING;
857
858 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
859 os.conn < C_CONNECTED)
860 rv = SS_NEED_CONNECTION;
861
862 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
863 && os.conn < C_WF_REPORT_PARAMS)
864 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
865
866 return rv;
867 }
868
869 static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
870 {
871 static const char *msg_table[] = {
872 [NO_WARNING] = "",
873 [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
874 [ABORTED_RESYNC] = "Resync aborted.",
875 [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
876 [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
877 [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
878 };
879
880 if (warn != NO_WARNING)
881 dev_warn(DEV, "%s\n", msg_table[warn]);
882 }
883
884 /**
885 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
886 * @mdev: DRBD device.
887 * @os: old state.
888 * @ns: new state.
889  * @warn: where to store an implicit-change warning; may be NULL.
890 *
891  * When we lose the connection, we have to set the state of the peer's disk (pdsk)
892 * to D_UNKNOWN. This rule and many more along those lines are in this function.
893 */
894 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
895 union drbd_state ns, enum sanitize_state_warnings *warn)
896 {
897 enum drbd_fencing_p fp;
898 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
899
900 if (warn)
901 *warn = NO_WARNING;
902
903 fp = FP_DONT_CARE;
904 if (get_ldev(mdev)) {
905 fp = mdev->ldev->dc.fencing;
906 put_ldev(mdev);
907 }
908
909     /* Do not let a network error state be entered while the network part is not configured */
910 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
911 os.conn <= C_DISCONNECTING)
912 ns.conn = os.conn;
913
914 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
915 * If you try to go into some Sync* state, that shall fail (elsewhere). */
916 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
917 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
918 ns.conn = os.conn;
919
920 /* we cannot fail (again) if we already detached */
921 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
922 ns.disk = D_DISKLESS;
923
924 /* After C_DISCONNECTING only C_STANDALONE may follow */
925 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
926 ns.conn = os.conn;
927
928 if (ns.conn < C_CONNECTED) {
929 ns.peer_isp = 0;
930 ns.peer = R_UNKNOWN;
931 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
932 ns.pdsk = D_UNKNOWN;
933 }
934
935 /* Clear the aftr_isp when becoming unconfigured */
936 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
937 ns.aftr_isp = 0;
938
939 /* Abort resync if a disk fails/detaches */
940 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
941 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
942 if (warn)
943 *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
944 ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
945 ns.conn = C_CONNECTED;
946 }
947
948 /* Connection breaks down before we finished "Negotiating" */
949 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
950 get_ldev_if_state(mdev, D_NEGOTIATING)) {
951 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
952 ns.disk = mdev->new_state_tmp.disk;
953 ns.pdsk = mdev->new_state_tmp.pdsk;
954 } else {
955 if (warn)
956 *warn = CONNECTION_LOST_NEGOTIATING;
957 ns.disk = D_DISKLESS;
958 ns.pdsk = D_UNKNOWN;
959 }
960 put_ldev(mdev);
961 }
962
963 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
964 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
965 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
966 ns.disk = D_UP_TO_DATE;
967 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
968 ns.pdsk = D_UP_TO_DATE;
969 }
970
971     /* Implications of the connection state on the disk states */
972 disk_min = D_DISKLESS;
973 disk_max = D_UP_TO_DATE;
974 pdsk_min = D_INCONSISTENT;
975 pdsk_max = D_UNKNOWN;
976 switch ((enum drbd_conns)ns.conn) {
977 case C_WF_BITMAP_T:
978 case C_PAUSED_SYNC_T:
979 case C_STARTING_SYNC_T:
980 case C_WF_SYNC_UUID:
981 case C_BEHIND:
982 disk_min = D_INCONSISTENT;
983 disk_max = D_OUTDATED;
984 pdsk_min = D_UP_TO_DATE;
985 pdsk_max = D_UP_TO_DATE;
986 break;
987 case C_VERIFY_S:
988 case C_VERIFY_T:
989 disk_min = D_UP_TO_DATE;
990 disk_max = D_UP_TO_DATE;
991 pdsk_min = D_UP_TO_DATE;
992 pdsk_max = D_UP_TO_DATE;
993 break;
994 case C_CONNECTED:
995 disk_min = D_DISKLESS;
996 disk_max = D_UP_TO_DATE;
997 pdsk_min = D_DISKLESS;
998 pdsk_max = D_UP_TO_DATE;
999 break;
1000 case C_WF_BITMAP_S:
1001 case C_PAUSED_SYNC_S:
1002 case C_STARTING_SYNC_S:
1003 case C_AHEAD:
1004 disk_min = D_UP_TO_DATE;
1005 disk_max = D_UP_TO_DATE;
1006 pdsk_min = D_INCONSISTENT;
1007 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
1008 break;
1009 case C_SYNC_TARGET:
1010 disk_min = D_INCONSISTENT;
1011 disk_max = D_INCONSISTENT;
1012 pdsk_min = D_UP_TO_DATE;
1013 pdsk_max = D_UP_TO_DATE;
1014 break;
1015 case C_SYNC_SOURCE:
1016 disk_min = D_UP_TO_DATE;
1017 disk_max = D_UP_TO_DATE;
1018 pdsk_min = D_INCONSISTENT;
1019 pdsk_max = D_INCONSISTENT;
1020 break;
1021 case C_STANDALONE:
1022 case C_DISCONNECTING:
1023 case C_UNCONNECTED:
1024 case C_TIMEOUT:
1025 case C_BROKEN_PIPE:
1026 case C_NETWORK_FAILURE:
1027 case C_PROTOCOL_ERROR:
1028 case C_TEAR_DOWN:
1029 case C_WF_CONNECTION:
1030 case C_WF_REPORT_PARAMS:
1031 case C_MASK:
1032 break;
1033 }
1034 if (ns.disk > disk_max)
1035 ns.disk = disk_max;
1036
1037 if (ns.disk < disk_min) {
1038 if (warn)
1039 *warn = IMPLICITLY_UPGRADED_DISK;
1040 ns.disk = disk_min;
1041 }
1042 if (ns.pdsk > pdsk_max)
1043 ns.pdsk = pdsk_max;
1044
1045 if (ns.pdsk < pdsk_min) {
1046 if (warn)
1047 *warn = IMPLICITLY_UPGRADED_PDSK;
1048 ns.pdsk = pdsk_min;
1049 }
1050
1051 if (fp == FP_STONITH &&
1052 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
1053 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
1054 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
1055
1056 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
1057 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
1058 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
1059            ns.susp_nod = 1; /* Suspend IO while no up-to-date data is accessible */
1060
1061 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
1062 if (ns.conn == C_SYNC_SOURCE)
1063 ns.conn = C_PAUSED_SYNC_S;
1064 if (ns.conn == C_SYNC_TARGET)
1065 ns.conn = C_PAUSED_SYNC_T;
1066 } else {
1067 if (ns.conn == C_PAUSED_SYNC_S)
1068 ns.conn = C_SYNC_SOURCE;
1069 if (ns.conn == C_PAUSED_SYNC_T)
1070 ns.conn = C_SYNC_TARGET;
1071 }
1072
1073 return ns;
1074 }
1075
1076 /* helper for __drbd_set_state */
1077 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1078 {
1079 if (mdev->agreed_pro_version < 90)
1080 mdev->ov_start_sector = 0;
1081 mdev->rs_total = drbd_bm_bits(mdev);
1082 mdev->ov_position = 0;
1083 if (cs == C_VERIFY_T) {
1084 /* starting online verify from an arbitrary position
1085 * does not fit well into the existing protocol.
1086 * on C_VERIFY_T, we initialize ov_left and friends
1087 * implicitly in receive_DataRequest once the
1088 * first P_OV_REQUEST is received */
1089 mdev->ov_start_sector = ~(sector_t)0;
1090 } else {
1091 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1092 if (bit >= mdev->rs_total) {
1093 mdev->ov_start_sector =
1094 BM_BIT_TO_SECT(mdev->rs_total - 1);
1095 mdev->rs_total = 1;
1096 } else
1097 mdev->rs_total -= bit;
1098 mdev->ov_position = mdev->ov_start_sector;
1099 }
1100 mdev->ov_left = mdev->rs_total;
1101 }
1102
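/* Re-enable activity log updates after they were suspended (AL_SUSPENDED),
 * e.g. once the connection to the peer has been re-established. */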
1103 static void drbd_resume_al(struct drbd_conf *mdev)
1104 {
1105 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1106 dev_info(DEV, "Resumed AL updates\n");
1107 }
1108
1109 /**
1110 * __drbd_set_state() - Set a new DRBD state
1111 * @mdev: DRBD device.
1112 * @ns: new state.
1113 * @flags: Flags
1114  * @done: Optional completion that is completed after after_state_ch() has finished
1115 *
1116 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1117 */
1118 enum drbd_state_rv
1119 __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1120 enum chg_state_flags flags, struct completion *done)
1121 {
1122 union drbd_state os;
1123 enum drbd_state_rv rv = SS_SUCCESS;
1124 enum sanitize_state_warnings ssw;
1125 struct after_state_chg_work *ascw;
1126
1127 os = mdev->state;
1128
1129 ns = sanitize_state(mdev, os, ns, &ssw);
1130
1131 if (ns.i == os.i)
1132 return SS_NOTHING_TO_DO;
1133
1134 if (!(flags & CS_HARD)) {
1135 /* pre-state-change checks ; only look at ns */
1136 /* See drbd_state_sw_errors in drbd_strings.c */
1137
1138 rv = is_valid_state(mdev, ns);
1139 if (rv < SS_SUCCESS) {
1140 /* If the old state was illegal as well, then let
1141 this happen...*/
1142
1143 if (is_valid_state(mdev, os) == rv)
1144 rv = is_valid_state_transition(mdev, ns, os);
1145 } else
1146 rv = is_valid_state_transition(mdev, ns, os);
1147 }
1148
1149 if (rv < SS_SUCCESS) {
1150 if (flags & CS_VERBOSE)
1151 print_st_err(mdev, os, ns, rv);
1152 return rv;
1153 }
1154
1155 print_sanitize_warnings(mdev, ssw);
1156
1157 {
1158 char *pbp, pb[300];
1159 pbp = pb;
1160 *pbp = 0;
1161 if (ns.role != os.role)
1162 pbp += sprintf(pbp, "role( %s -> %s ) ",
1163 drbd_role_str(os.role),
1164 drbd_role_str(ns.role));
1165 if (ns.peer != os.peer)
1166 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1167 drbd_role_str(os.peer),
1168 drbd_role_str(ns.peer));
1169 if (ns.conn != os.conn)
1170 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1171 drbd_conn_str(os.conn),
1172 drbd_conn_str(ns.conn));
1173 if (ns.disk != os.disk)
1174 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1175 drbd_disk_str(os.disk),
1176 drbd_disk_str(ns.disk));
1177 if (ns.pdsk != os.pdsk)
1178 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1179 drbd_disk_str(os.pdsk),
1180 drbd_disk_str(ns.pdsk));
1181 if (is_susp(ns) != is_susp(os))
1182 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1183 is_susp(os),
1184 is_susp(ns));
1185 if (ns.aftr_isp != os.aftr_isp)
1186 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1187 os.aftr_isp,
1188 ns.aftr_isp);
1189 if (ns.peer_isp != os.peer_isp)
1190 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1191 os.peer_isp,
1192 ns.peer_isp);
1193 if (ns.user_isp != os.user_isp)
1194 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1195 os.user_isp,
1196 ns.user_isp);
1197 dev_info(DEV, "%s\n", pb);
1198 }
1199
1200 /* solve the race between becoming unconfigured,
1201 * worker doing the cleanup, and
1202 * admin reconfiguring us:
1203 * on (re)configure, first set CONFIG_PENDING,
1204 * then wait for a potentially exiting worker,
1205 * start the worker, and schedule one no_op.
1206 * then proceed with configuration.
1207 */
1208 if (ns.disk == D_DISKLESS &&
1209 ns.conn == C_STANDALONE &&
1210 ns.role == R_SECONDARY &&
1211 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1212 set_bit(DEVICE_DYING, &mdev->flags);
1213
1214 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1215 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1216 * drbd_ldev_destroy() won't happen before our corresponding
1217 * after_state_ch works run, where we put_ldev again. */
1218 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1219 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1220 atomic_inc(&mdev->local_cnt);
1221
1222 mdev->state = ns;
1223
1224 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1225 drbd_print_uuids(mdev, "attached to UUIDs");
1226
1227 wake_up(&mdev->misc_wait);
1228 wake_up(&mdev->state_wait);
1229
1230 /* aborted verify run. log the last position */
1231 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1232 ns.conn < C_CONNECTED) {
1233 mdev->ov_start_sector =
1234 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1235 dev_info(DEV, "Online Verify reached sector %llu\n",
1236 (unsigned long long)mdev->ov_start_sector);
1237 }
1238
1239 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1240 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1241 dev_info(DEV, "Syncer continues.\n");
1242 mdev->rs_paused += (long)jiffies
1243 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1244 if (ns.conn == C_SYNC_TARGET)
1245 mod_timer(&mdev->resync_timer, jiffies);
1246 }
1247
1248 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1249 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1250 dev_info(DEV, "Resync suspended\n");
1251 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1252 }
1253
1254 if (os.conn == C_CONNECTED &&
1255 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1256 unsigned long now = jiffies;
1257 int i;
1258
1259 set_ov_position(mdev, ns.conn);
1260 mdev->rs_start = now;
1261 mdev->rs_last_events = 0;
1262 mdev->rs_last_sect_ev = 0;
1263 mdev->ov_last_oos_size = 0;
1264 mdev->ov_last_oos_start = 0;
1265
1266 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1267 mdev->rs_mark_left[i] = mdev->ov_left;
1268 mdev->rs_mark_time[i] = now;
1269 }
1270
1271 drbd_rs_controller_reset(mdev);
1272
1273 if (ns.conn == C_VERIFY_S) {
1274 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1275 (unsigned long long)mdev->ov_position);
1276 mod_timer(&mdev->resync_timer, jiffies);
1277 }
1278 }
1279
1280 if (get_ldev(mdev)) {
1281 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1282 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1283 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1284
1285 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1286 mdf |= MDF_CRASHED_PRIMARY;
1287 if (mdev->state.role == R_PRIMARY ||
1288 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1289 mdf |= MDF_PRIMARY_IND;
1290 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1291 mdf |= MDF_CONNECTED_IND;
1292 if (mdev->state.disk > D_INCONSISTENT)
1293 mdf |= MDF_CONSISTENT;
1294 if (mdev->state.disk > D_OUTDATED)
1295 mdf |= MDF_WAS_UP_TO_DATE;
1296 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1297 mdf |= MDF_PEER_OUT_DATED;
1298 if (mdf != mdev->ldev->md.flags) {
1299 mdev->ldev->md.flags = mdf;
1300 drbd_md_mark_dirty(mdev);
1301 }
1302 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1303 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1304 put_ldev(mdev);
1305 }
1306
1307 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1308 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1309 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1310 set_bit(CONSIDER_RESYNC, &mdev->flags);
1311
1312 /* Receiver should clean up itself */
1313 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1314 drbd_thread_stop_nowait(&mdev->receiver);
1315
1316 /* Now the receiver finished cleaning up itself, it should die */
1317 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1318 drbd_thread_stop_nowait(&mdev->receiver);
1319
1320 /* Upon network failure, we need to restart the receiver. */
1321 if (os.conn > C_WF_CONNECTION &&
1322 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1323 drbd_thread_restart_nowait(&mdev->receiver);
1324
1325 /* Resume AL writing if we get a connection */
1326 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1327 drbd_resume_al(mdev);
1328
1329 /* remember last connect and attach times so request_timer_fn() won't
1330 * kill newly established sessions while we are still trying to thaw
1331 * previously frozen IO */
1332 if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
1333 mdev->last_reconnect_jif = jiffies;
1334 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1335 ns.disk > D_NEGOTIATING)
1336 mdev->last_reattach_jif = jiffies;
1337
1338 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1339 if (ascw) {
1340 ascw->os = os;
1341 ascw->ns = ns;
1342 ascw->flags = flags;
1343 ascw->w.cb = w_after_state_ch;
1344 ascw->done = done;
1345 drbd_queue_work(&mdev->data.work, &ascw->w);
1346 } else {
1347 dev_warn(DEV, "Could not kmalloc an ascw\n");
1348 }
1349
1350 return rv;
1351 }
1352
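/* Worker callback: run the potentially sleeping after_state_ch() work
 * queued by __drbd_set_state(), complete an optional waiter, and free the
 * work item. */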
1353 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1354 {
1355 struct after_state_chg_work *ascw =
1356 container_of(w, struct after_state_chg_work, w);
1357 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1358 if (ascw->flags & CS_WAIT_COMPLETE) {
1359 D_ASSERT(ascw->done != NULL);
1360 complete(ascw->done);
1361 }
1362 kfree(ascw);
1363
1364 return 1;
1365 }
1366
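/* Completion callback ("after bitmap write, start sync"): called when the
 * "set_n_write from StartingSync" bitmap IO has finished. On failure we fall
 * back to C_CONNECTED; otherwise continue towards C_WF_SYNC_UUID (sync
 * target) or start the resync directly (sync source). */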
1367 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1368 {
1369 if (rv) {
1370 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1371 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1372 return;
1373 }
1374
1375 switch (mdev->state.conn) {
1376 case C_STARTING_SYNC_T:
1377 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1378 break;
1379 case C_STARTING_SYNC_S:
1380 drbd_start_resync(mdev, C_SYNC_SOURCE);
1381 break;
1382 }
1383 }
1384
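/**
 * drbd_bitmap_io_from_worker() - Run a bitmap IO function from worker context
 * @mdev:  DRBD device.
 * @io_fn: bitmap IO function to run while the bitmap is locked.
 * @why:   text describing the reason, used for the bitmap lock.
 * @flags: locking flags, see &enum bm_flag.
 *
 * Must be called from the worker thread. Suspends application IO, takes
 * the bitmap lock, runs @io_fn, and resumes IO again.
 */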
1385 int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1386 int (*io_fn)(struct drbd_conf *),
1387 char *why, enum bm_flag flags)
1388 {
1389 int rv;
1390
1391 D_ASSERT(current == mdev->worker.task);
1392
1393 /* open coded non-blocking drbd_suspend_io(mdev); */
1394 set_bit(SUSPEND_IO, &mdev->flags);
1395
1396 drbd_bm_lock(mdev, why, flags);
1397 rv = io_fn(mdev);
1398 drbd_bm_unlock(mdev);
1399
1400 drbd_resume_io(mdev);
1401
1402 return rv;
1403 }
1404
1405 /**
1406 * after_state_ch() - Perform after state change actions that may sleep
1407 * @mdev: DRBD device.
1408 * @os: old state.
1409 * @ns: new state.
1410 * @flags: Flags
1411 */
1412 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1413 union drbd_state ns, enum chg_state_flags flags)
1414 {
1415 enum drbd_fencing_p fp;
1416 enum drbd_req_event what = nothing;
1417 union drbd_state nsm = (union drbd_state){ .i = -1 };
1418
1419 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1420 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1421 if (mdev->p_uuid)
1422 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1423 }
1424
1425 fp = FP_DONT_CARE;
1426 if (get_ldev(mdev)) {
1427 fp = mdev->ldev->dc.fencing;
1428 put_ldev(mdev);
1429 }
1430
1431 /* Inform userspace about the change... */
1432 drbd_bcast_state(mdev, ns);
1433
1434 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1435 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1436 drbd_khelper(mdev, "pri-on-incon-degr");
1437
1438 /* Here we have the actions that are performed after a
1439 state change. This function might sleep */
1440
1441 if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
1442 mod_timer(&mdev->request_timer, jiffies + HZ);
1443
1444 nsm.i = -1;
1445 if (ns.susp_nod) {
1446 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1447 what = resend;
1448
1449 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1450 ns.disk > D_NEGOTIATING)
1451 what = restart_frozen_disk_io;
1452
1453 if (what != nothing)
1454 nsm.susp_nod = 0;
1455 }
1456
1457 if (ns.susp_fen) {
1458 /* case1: The outdate peer handler is successful: */
1459 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
1460 tl_clear(mdev);
1461 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1462 drbd_uuid_new_current(mdev);
1463 clear_bit(NEW_CUR_UUID, &mdev->flags);
1464 }
1465 spin_lock_irq(&mdev->req_lock);
1466 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1467 spin_unlock_irq(&mdev->req_lock);
1468 }
1469 /* case2: The connection was established again: */
1470 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1471 clear_bit(NEW_CUR_UUID, &mdev->flags);
1472 what = resend;
1473 nsm.susp_fen = 0;
1474 }
1475 }
1476
1477 if (what != nothing) {
1478 spin_lock_irq(&mdev->req_lock);
1479 _tl_restart(mdev, what);
1480 nsm.i &= mdev->state.i;
1481 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1482 spin_unlock_irq(&mdev->req_lock);
1483 }
1484
1485 /* Became sync source. With protocol >= 96, we still need to send out
1486 * the sync uuid now. Need to do that before any drbd_send_state, or
1487 * the other side may go "paused sync" before receiving the sync uuids,
1488 * which is unexpected. */
1489 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1490 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1491 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1492 drbd_gen_and_send_sync_uuid(mdev);
1493 put_ldev(mdev);
1494 }
1495
1496 /* Do not change the order of the if above and the two below... */
1497 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1498 drbd_send_uuids(mdev);
1499 drbd_send_state(mdev, ns);
1500 }
1501 /* No point in queuing send_bitmap if we don't have a connection
1502 * anymore, so check also the _current_ state, not only the new state
1503 * at the time this work was queued. */
1504 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1505 mdev->state.conn == C_WF_BITMAP_S)
1506 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1507 "send_bitmap (WFBitMapS)",
1508 BM_LOCKED_TEST_ALLOWED);
1509
1510 /* Lost contact to peer's copy of the data */
1511 if ((os.pdsk >= D_INCONSISTENT &&
1512 os.pdsk != D_UNKNOWN &&
1513 os.pdsk != D_OUTDATED)
1514 && (ns.pdsk < D_INCONSISTENT ||
1515 ns.pdsk == D_UNKNOWN ||
1516 ns.pdsk == D_OUTDATED)) {
1517 if (get_ldev(mdev)) {
1518 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1519 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1520 if (is_susp(mdev->state)) {
1521 set_bit(NEW_CUR_UUID, &mdev->flags);
1522 } else {
1523 drbd_uuid_new_current(mdev);
1524 drbd_send_uuids(mdev);
1525 }
1526 }
1527 put_ldev(mdev);
1528 }
1529 }
1530
1531 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1532 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
1533 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1534 drbd_uuid_new_current(mdev);
1535 drbd_send_uuids(mdev);
1536 }
1537 /* D_DISKLESS Peer becomes secondary */
1538 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1539 /* We may still be Primary ourselves.
1540 * No harm done if the bitmap still changes,
1541 * redirtied pages will follow later. */
1542 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1543 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
1544 put_ldev(mdev);
1545 }
1546
1547 /* Write out all changed bits on demote.
1548    * Though, no need to do that just yet
1549      if there is still a resync going on */
1550 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1551 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1552 /* No changes to the bitmap expected this time, so assert that,
1553 * even though no harm was done if it did change. */
1554 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1555 "demote", BM_LOCKED_TEST_ALLOWED);
1556 put_ldev(mdev);
1557 }
1558
1559 /* Last part of the attaching process ... */
1560 if (ns.conn >= C_CONNECTED &&
1561 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1562 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
1563 drbd_send_uuids(mdev);
1564 drbd_send_state(mdev, ns);
1565 }
1566
1567 /* We want to pause/continue resync, tell peer. */
1568 if (ns.conn >= C_CONNECTED &&
1569 ((os.aftr_isp != ns.aftr_isp) ||
1570 (os.user_isp != ns.user_isp)))
1571 drbd_send_state(mdev, ns);
1572
1573 /* In case one of the isp bits got set, suspend other devices. */
1574 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1575 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1576 suspend_other_sg(mdev);
1577
1578    /* Make sure the peer gets informed about any state
1579       changes (ISP bits) that happened while we were in WFReportParams. */
1580 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1581 drbd_send_state(mdev, ns);
1582
1583 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1584 drbd_send_state(mdev, ns);
1585
1586    /* We are in the process of starting a full sync... */
1587 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1588 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1589 /* no other bitmap changes expected during this phase */
1590 drbd_queue_bitmap_io(mdev,
1591 &drbd_bmio_set_n_write, &abw_start_sync,
1592 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1593
1594    /* We are invalidating ourselves... */
1595 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1596 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1597 /* other bitmap operation expected during this phase */
1598 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1599 "set_n_write from invalidate", BM_LOCKED_MASK);
1600
1601 /* first half of local IO error, failure to attach,
1602 * or administrative detach */
1603 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1604 enum drbd_io_error_p eh = EP_PASS_ON;
1605 int was_io_error = 0;
1606 /* corresponding get_ldev was in __drbd_set_state, to serialize
1607 * our cleanup here with the transition to D_DISKLESS.
1608             * But it is still not safe to dereference ldev here, since
1609             * we might come from a failed Attach before ldev was set. */
1610 if (mdev->ldev) {
1611 eh = mdev->ldev->dc.on_io_error;
1612 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1613
1614                    /* Immediately allow completion of all application IO that waits
1615                       for completion from the local disk. */
1616 tl_abort_disk_io(mdev);
1617
1618 /* current state still has to be D_FAILED,
1619 * there is only one way out: to D_DISKLESS,
1620 * and that may only happen after our put_ldev below. */
1621 if (mdev->state.disk != D_FAILED)
1622 dev_err(DEV,
1623 "ASSERT FAILED: disk is %s during detach\n",
1624 drbd_disk_str(mdev->state.disk));
1625
1626 if (ns.conn >= C_CONNECTED)
1627 drbd_send_state(mdev, ns);
1628
1629 drbd_rs_cancel_all(mdev);
1630
1631 /* In case we want to get something to stable storage still,
1632 * this may be the last chance.
1633 * Following put_ldev may transition to D_DISKLESS. */
1634 drbd_md_sync(mdev);
1635 }
1636 put_ldev(mdev);
1637
1638 if (was_io_error && eh == EP_CALL_HELPER)
1639 drbd_khelper(mdev, "local-io-error");
1640 }
1641
1642 /* second half of local IO error, failure to attach,
1643 * or administrative detach,
1644 * after local_cnt references have reached zero again */
1645 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1646 /* We must still be diskless,
1647 * re-attach has to be serialized with this! */
1648 if (mdev->state.disk != D_DISKLESS)
1649 dev_err(DEV,
1650 "ASSERT FAILED: disk is %s while going diskless\n",
1651 drbd_disk_str(mdev->state.disk));
1652
1653 mdev->rs_total = 0;
1654 mdev->rs_failed = 0;
1655 atomic_set(&mdev->rs_pending_cnt, 0);
1656
1657 if (ns.conn >= C_CONNECTED)
1658 drbd_send_state(mdev, ns);
1659
1660 /* corresponding get_ldev in __drbd_set_state
1661 * this may finally trigger drbd_ldev_destroy. */
1662 put_ldev(mdev);
1663 }
1664
1665    /* Notify peer that I had a local IO error and did not detach. */
1666 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
1667 drbd_send_state(mdev, ns);
1668
1669 /* Disks got bigger while they were detached */
1670 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1671 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1672 if (ns.conn == C_CONNECTED)
1673 resync_after_online_grow(mdev);
1674 }
1675
1676 /* A resync finished or aborted, wake paused devices... */
1677 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1678 (os.peer_isp && !ns.peer_isp) ||
1679 (os.user_isp && !ns.user_isp))
1680 resume_next_sg(mdev);
1681
1682 /* sync target done with resync. Explicitly notify peer, even though
1683 * it should (at least for non-empty resyncs) already know itself. */
1684 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1685 drbd_send_state(mdev, ns);
1686
1687 /* Wake up role changes, that were delayed because of connection establishing */
1688 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
1689 clear_bit(STATE_SENT, &mdev->flags);
1690 wake_up(&mdev->state_wait);
1691 }
1692
1693 /* This triggers bitmap writeout of potentially still unwritten pages
1694 * if the resync finished cleanly, or aborted because of peer disk
1695 * failure, or because of connection loss.
1696 * For resync aborted because of local disk failure, we cannot do
1697 * any bitmap writeout anymore.
1698 * No harm done if some bits change during this phase.
1699 */
1700 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1701 drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
1702 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
1703 put_ldev(mdev);
1704 }
1705
1706    /* free tl_hash if we got thawed and are C_STANDALONE */
1707 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1708 drbd_free_tl_hash(mdev);
1709
1710 /* Upon network connection, we need to start the receiver */
1711 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1712 drbd_thread_start(&mdev->receiver);
1713
1714 /* Terminate worker thread if we are unconfigured - it will be
1715 restarted as needed... */
1716 if (ns.disk == D_DISKLESS &&
1717 ns.conn == C_STANDALONE &&
1718 ns.role == R_SECONDARY) {
1719 if (os.aftr_isp != ns.aftr_isp)
1720 resume_next_sg(mdev);
1721 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1722 if (test_bit(DEVICE_DYING, &mdev->flags))
1723 drbd_thread_stop_nowait(&mdev->worker);
1724 }
1725
1726 drbd_md_sync(mdev);
1727 }
1728
1729
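/* Common kthread body for the receiver, worker and asender threads: runs
 * the thread's function, restarts it if t_state was set to Restarting, and
 * otherwise tears the thread down and drops the module reference taken in
 * drbd_thread_start(). */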
1730 static int drbd_thread_setup(void *arg)
1731 {
1732 struct drbd_thread *thi = (struct drbd_thread *) arg;
1733 struct drbd_conf *mdev = thi->mdev;
1734 unsigned long flags;
1735 int retval;
1736
1737 restart:
1738 retval = thi->function(thi);
1739
1740 spin_lock_irqsave(&thi->t_lock, flags);
1741
1742 /* if the receiver has been "Exiting", the last thing it did
1743 * was set the conn state to "StandAlone",
1744 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1745 * and receiver thread will be "started".
1746 * drbd_thread_start needs to set "Restarting" in that case.
1747 * t_state check and assignment needs to be within the same spinlock,
1748 * so either thread_start sees Exiting, and can remap to Restarting,
1749 * or thread_start sees None, and can proceed as normal.
1750 */
1751
1752 if (thi->t_state == Restarting) {
1753 dev_info(DEV, "Restarting %s\n", current->comm);
1754 thi->t_state = Running;
1755 spin_unlock_irqrestore(&thi->t_lock, flags);
1756 goto restart;
1757 }
1758
1759 thi->task = NULL;
1760 thi->t_state = None;
1761 smp_mb();
1762 complete(&thi->stop);
1763 spin_unlock_irqrestore(&thi->t_lock, flags);
1764
1765 dev_info(DEV, "Terminating %s\n", current->comm);
1766
1767 /* Release mod reference taken when thread was started */
1768 module_put(THIS_MODULE);
1769 return retval;
1770 }
1771
1772 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1773 int (*func) (struct drbd_thread *))
1774 {
1775 spin_lock_init(&thi->t_lock);
1776 thi->task = NULL;
1777 thi->t_state = None;
1778 thi->function = func;
1779 thi->mdev = mdev;
1780 }
1781
1782 int drbd_thread_start(struct drbd_thread *thi)
1783 {
1784 struct drbd_conf *mdev = thi->mdev;
1785 struct task_struct *nt;
1786 unsigned long flags;
1787
1788 const char *me =
1789 thi == &mdev->receiver ? "receiver" :
1790 thi == &mdev->asender ? "asender" :
1791 thi == &mdev->worker ? "worker" : "NONSENSE";
1792
1793 /* is used from state engine doing drbd_thread_stop_nowait,
1794 * while holding the req lock irqsave */
1795 spin_lock_irqsave(&thi->t_lock, flags);
1796
1797 switch (thi->t_state) {
1798 case None:
1799 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1800 me, current->comm, current->pid);
1801
1802 /* Get ref on module for thread - this is released when thread exits */
1803 if (!try_module_get(THIS_MODULE)) {
1804 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1805 spin_unlock_irqrestore(&thi->t_lock, flags);
1806 return false;
1807 }
1808
1809 init_completion(&thi->stop);
1810 D_ASSERT(thi->task == NULL);
1811 thi->reset_cpu_mask = 1;
1812 thi->t_state = Running;
1813 spin_unlock_irqrestore(&thi->t_lock, flags);
1814 flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1815
1816 nt = kthread_create(drbd_thread_setup, (void *) thi,
1817 "drbd%d_%s", mdev_to_minor(mdev), me);
1818
1819 if (IS_ERR(nt)) {
1820 dev_err(DEV, "Couldn't start thread\n");
1821
1822 module_put(THIS_MODULE);
1823 return false;
1824 }
1825 spin_lock_irqsave(&thi->t_lock, flags);
1826 thi->task = nt;
1827 thi->t_state = Running;
1828 spin_unlock_irqrestore(&thi->t_lock, flags);
1829 wake_up_process(nt);
1830 break;
1831 case Exiting:
1832 thi->t_state = Restarting;
1833 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1834 me, current->comm, current->pid);
1835 /* fall through */
1836 case Running:
1837 case Restarting:
1838 default:
1839 spin_unlock_irqrestore(&thi->t_lock, flags);
1840 break;
1841 }
1842
1843 return true;
1844 }
1845
1846
1847 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1848 {
1849 unsigned long flags;
1850
1851 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1852
1853 /* may be called from state engine, holding the req lock irqsave */
1854 spin_lock_irqsave(&thi->t_lock, flags);
1855
1856 if (thi->t_state == None) {
1857 spin_unlock_irqrestore(&thi->t_lock, flags);
1858 if (restart)
1859 drbd_thread_start(thi);
1860 return;
1861 }
1862
1863 if (thi->t_state != ns) {
1864 if (thi->task == NULL) {
1865 spin_unlock_irqrestore(&thi->t_lock, flags);
1866 return;
1867 }
1868
1869 thi->t_state = ns;
1870 smp_mb();
1871 init_completion(&thi->stop);
1872 if (thi->task != current)
1873 force_sig(DRBD_SIGKILL, thi->task);
1874
1875 }
1876
1877 spin_unlock_irqrestore(&thi->t_lock, flags);
1878
1879 if (wait)
1880 wait_for_completion(&thi->stop);
1881 }
1882
1883 #ifdef CONFIG_SMP
1884 /**
1885 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1886 * @mdev: DRBD device.
1887 *
1888 * Forces all threads of a device onto the same CPU. This is beneficial for
1889 * DRBD's performance. May be overridden by the user's configuration.
1890 */
1891 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1892 {
1893 int ord, cpu;
1894
1895 /* user override. */
1896 if (cpumask_weight(mdev->cpu_mask))
1897 return;
1898
1899 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1900 for_each_online_cpu(cpu) {
1901 if (ord-- == 0) {
1902 cpumask_set_cpu(cpu, mdev->cpu_mask);
1903 return;
1904 }
1905 }
1906 /* should not be reached */
1907 cpumask_setall(mdev->cpu_mask);
1908 }
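/* Illustrative example (not in the original source): with 4 online CPUs the
 * modulo above pins devices round-robin, e.g. minor 0 -> CPU 0,
 * minor 1 -> CPU 1, ..., minor 4 -> CPU 0 again.  All threads of one device
 * (receiver, worker, asender; see drbd_thread_current_set_cpu() below) then
 * share that single CPU, while different minors spread over the machine. */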
1909
1910 /**
1911 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1912 * @mdev: DRBD device.
1913 *
1914 * Call this in the "main loop" of _all_ threads; no mutex is needed because
1915 * current won't die prematurely.
1916 */
1917 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1918 {
1919 struct task_struct *p = current;
1920 struct drbd_thread *thi =
1921 p == mdev->asender.task ? &mdev->asender :
1922 p == mdev->receiver.task ? &mdev->receiver :
1923 p == mdev->worker.task ? &mdev->worker :
1924 NULL;
1925 ERR_IF(thi == NULL)
1926 return;
1927 if (!thi->reset_cpu_mask)
1928 return;
1929 thi->reset_cpu_mask = 0;
1930 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1931 }
1932 #endif
1933
1934 /* the appropriate socket mutex must be held already */
1935 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1936 enum drbd_packets cmd, struct p_header80 *h,
1937 size_t size, unsigned msg_flags)
1938 {
1939 int sent, ok;
1940
1941 ERR_IF(!h) return false;
1942 ERR_IF(!size) return false;
1943
1944 h->magic = BE_DRBD_MAGIC;
1945 h->command = cpu_to_be16(cmd);
1946 h->length = cpu_to_be16(size-sizeof(struct p_header80));
1947
1948 sent = drbd_send(mdev, sock, h, size, msg_flags);
1949
1950 ok = (sent == size);
1951 if (!ok && !signal_pending(current))
1952 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
1953 cmdname(cmd), (int)size, sent);
1954 return ok;
1955 }
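/* Framing sketch (illustrative, not part of the original file): the length
 * field carries only the payload size, the header itself is not counted,
 * while drbd_send() still pushes header plus payload in one go.  A caller
 * whose struct starts with a p_header80 therefore simply does e.g.
 *
 *   _drbd_send_cmd(mdev, sock, P_STATE,
 *                  (struct p_header80 *)&p, sizeof(p), 0);
 *
 * and the receiving side can reconstruct the full packet size as
 * header size + length. */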
1956
1957 /* don't pass the socket. we may only look at it
1958 * when we hold the appropriate socket mutex.
1959 */
1960 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1961 enum drbd_packets cmd, struct p_header80 *h, size_t size)
1962 {
1963 int ok = 0;
1964 struct socket *sock;
1965
1966 if (use_data_socket) {
1967 mutex_lock(&mdev->data.mutex);
1968 sock = mdev->data.socket;
1969 } else {
1970 mutex_lock(&mdev->meta.mutex);
1971 sock = mdev->meta.socket;
1972 }
1973
1974 /* drbd_disconnect() could have called drbd_free_sock()
1975 * while we were waiting in down()... */
1976 if (likely(sock != NULL))
1977 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1978
1979 if (use_data_socket)
1980 mutex_unlock(&mdev->data.mutex);
1981 else
1982 mutex_unlock(&mdev->meta.mutex);
1983 return ok;
1984 }
1985
1986 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1987 size_t size)
1988 {
1989 struct p_header80 h;
1990 int ok;
1991
1992 h.magic = BE_DRBD_MAGIC;
1993 h.command = cpu_to_be16(cmd);
1994 h.length = cpu_to_be16(size);
1995
1996 if (!drbd_get_data_sock(mdev))
1997 return 0;
1998
1999 ok = (sizeof(h) ==
2000 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
2001 ok = ok && (size ==
2002 drbd_send(mdev, mdev->data.socket, data, size, 0));
2003
2004 drbd_put_data_sock(mdev);
2005
2006 return ok;
2007 }
2008
2009 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
2010 {
2011 struct p_rs_param_95 *p;
2012 struct socket *sock;
2013 int size, rv;
2014 const int apv = mdev->agreed_pro_version;
2015
2016 size = apv <= 87 ? sizeof(struct p_rs_param)
2017 : apv == 88 ? sizeof(struct p_rs_param)
2018 + strlen(mdev->sync_conf.verify_alg) + 1
2019 : apv <= 94 ? sizeof(struct p_rs_param_89)
2020 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
2021
2022 /* used from admin command context and receiver/worker context.
2023 * to avoid kmalloc, grab the socket right here,
2024 * then use the pre-allocated sbuf there */
2025 mutex_lock(&mdev->data.mutex);
2026 sock = mdev->data.socket;
2027
2028 if (likely(sock != NULL)) {
2029 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
2030
2031 p = &mdev->data.sbuf.rs_param_95;
2032
2033 /* initialize verify_alg and csums_alg */
2034 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2035
2036 p->rate = cpu_to_be32(sc->rate);
2037 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
2038 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
2039 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
2040 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
2041
2042 if (apv >= 88)
2043 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
2044 if (apv >= 89)
2045 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
2046
2047 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
2048 } else
2049 rv = 0; /* not ok */
2050
2051 mutex_unlock(&mdev->data.mutex);
2052
2053 return rv;
2054 }
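/* Packet/size selection above, summarized from the code (not normative):
 *   apv <= 87 : struct p_rs_param only,                 cmd P_SYNC_PARAM
 *   apv == 88 : p_rs_param + trailing verify_alg,       cmd P_SYNC_PARAM
 *   apv 89-94 : p_rs_param_89 (verify_alg, csums_alg),  cmd P_SYNC_PARAM89
 *   apv >= 95 : p_rs_param_95 (adds the c_* fields),    cmd P_SYNC_PARAM89
 */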
2055
2056 int drbd_send_protocol(struct drbd_conf *mdev)
2057 {
2058 struct p_protocol *p;
2059 int size, cf, rv;
2060
2061 size = sizeof(struct p_protocol);
2062
2063 if (mdev->agreed_pro_version >= 87)
2064 size += strlen(mdev->net_conf->integrity_alg) + 1;
2065
2066 /* we must not recurse into our own queue,
2067 * as that is blocked during handshake */
2068 p = kmalloc(size, GFP_NOIO);
2069 if (p == NULL)
2070 return 0;
2071
2072 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2073 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2074 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2075 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
2076 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2077
2078 cf = 0;
2079 if (mdev->net_conf->want_lose)
2080 cf |= CF_WANT_LOSE;
2081 if (mdev->net_conf->dry_run) {
2082 if (mdev->agreed_pro_version >= 92)
2083 cf |= CF_DRY_RUN;
2084 else {
2085 dev_err(DEV, "--dry-run is not supported by peer\n");
2086 kfree(p);
2087 return -1;
2088 }
2089 }
2090 p->conn_flags = cpu_to_be32(cf);
2091
2092 if (mdev->agreed_pro_version >= 87)
2093 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2094
2095 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
2096 (struct p_header80 *)p, size);
2097 kfree(p);
2098 return rv;
2099 }
2100
2101 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2102 {
2103 struct p_uuids p;
2104 int i;
2105
2106 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2107 return 1;
2108
2109 for (i = UI_CURRENT; i < UI_SIZE; i++)
2110 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2111
2112 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2113 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2114 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2115 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2116 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2117 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2118
2119 put_ldev(mdev);
2120
2121 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
2122 (struct p_header80 *)&p, sizeof(p));
2123 }
2124
2125 int drbd_send_uuids(struct drbd_conf *mdev)
2126 {
2127 return _drbd_send_uuids(mdev, 0);
2128 }
2129
2130 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2131 {
2132 return _drbd_send_uuids(mdev, 8);
2133 }
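/* uuid_flags bits as set in _drbd_send_uuids() above (summary only):
 *   1 - net_conf->want_lose is set
 *   2 - the CRASHED_PRIMARY flag is set
 *   4 - new_state_tmp.disk was D_INCONSISTENT
 *   8 - passed in by drbd_send_uuids_skip_initial_sync()
 * The decoding of these bits happens on the receiving side. */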
2134
2135 void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2136 {
2137 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2138 u64 *uuid = mdev->ldev->md.uuid;
2139 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2140 text,
2141 (unsigned long long)uuid[UI_CURRENT],
2142 (unsigned long long)uuid[UI_BITMAP],
2143 (unsigned long long)uuid[UI_HISTORY_START],
2144 (unsigned long long)uuid[UI_HISTORY_END]);
2145 put_ldev(mdev);
2146 } else {
2147 dev_info(DEV, "%s effective data uuid: %016llX\n",
2148 text,
2149 (unsigned long long)mdev->ed_uuid);
2150 }
2151 }
2152
2153 int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
2154 {
2155 struct p_rs_uuid p;
2156 u64 uuid;
2157
2158 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2159
2160 uuid = mdev->ldev->md.uuid[UI_BITMAP];
2161 if (uuid && uuid != UUID_JUST_CREATED)
2162 uuid = uuid + UUID_NEW_BM_OFFSET;
2163 else
2164 get_random_bytes(&uuid, sizeof(u64));
2165 drbd_uuid_set(mdev, UI_BITMAP, uuid);
2166 drbd_print_uuids(mdev, "updated sync UUID");
2167 drbd_md_sync(mdev);
2168 p.uuid = cpu_to_be64(uuid);
2169
2170 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
2171 (struct p_header80 *)&p, sizeof(p));
2172 }
2173
2174 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
2175 {
2176 struct p_sizes p;
2177 sector_t d_size, u_size;
2178 int q_order_type, max_bio_size;
2179 int ok;
2180
2181 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2182 D_ASSERT(mdev->ldev->backing_bdev);
2183 d_size = drbd_get_max_capacity(mdev->ldev);
2184 u_size = mdev->ldev->dc.disk_size;
2185 q_order_type = drbd_queue_order_type(mdev);
2186 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2187 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
2188 put_ldev(mdev);
2189 } else {
2190 d_size = 0;
2191 u_size = 0;
2192 q_order_type = QUEUE_ORDERED_NONE;
2193 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
2194 }
2195
2196 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2197 if (mdev->agreed_pro_version <= 94)
2198 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
2199
2200 p.d_size = cpu_to_be64(d_size);
2201 p.u_size = cpu_to_be64(u_size);
2202 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
2203 p.max_bio_size = cpu_to_be32(max_bio_size);
2204 p.queue_order_type = cpu_to_be16(q_order_type);
2205 p.dds_flags = cpu_to_be16(flags);
2206
2207 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
2208 (struct p_header80 *)&p, sizeof(p));
2209 return ok;
2210 }
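/* Worked example with hypothetical numbers: a backing queue advertising
 * max_hw_sectors == 512 gives 512 << 9 = 256 KiB, which is clamped to
 * DRBD_MAX_BIO_SIZE; if the peer speaks protocol version <= 94 it is
 * further clamped to DRBD_MAX_SIZE_H80_PACKET (the 32 KiB mentioned in the
 * comment above). */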
2211
2212 /**
2213 * drbd_send_current_state() - Sends the drbd state to the peer
2214 * @mdev: DRBD device.
2215 */
2216 int drbd_send_current_state(struct drbd_conf *mdev)
2217 {
2218 struct socket *sock;
2219 struct p_state p;
2220 int ok = 0;
2221
2222 /* Grab the state lock so we won't send state if we're in the middle
2223 * of a cluster-wide state change on another thread */
2224 drbd_state_lock(mdev);
2225
2226 mutex_lock(&mdev->data.mutex);
2227
2228 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2229 sock = mdev->data.socket;
2230
2231 if (likely(sock != NULL)) {
2232 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2233 (struct p_header80 *)&p, sizeof(p), 0);
2234 }
2235
2236 mutex_unlock(&mdev->data.mutex);
2237
2238 drbd_state_unlock(mdev);
2239 return ok;
2240 }
2241
2242 /**
2243 * drbd_send_state() - After a state change, sends the new state to the peer
2244 * @mdev: DRBD device.
2245 * @state: the state to send, not necessarily the current state.
2246 *
2247 * Each state change queues an "after_state_ch" work, which will eventually
2248 * send the resulting new state to the peer. If more state changes happen
2249 * between queuing and processing of the after_state_ch work, we still
2250 * want to send each intermediary state in the order it occurred.
2251 */
2252 int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
2253 {
2254 struct socket *sock;
2255 struct p_state p;
2256 int ok = 0;
2257
2258 mutex_lock(&mdev->data.mutex);
2259
2260 p.state = cpu_to_be32(state.i);
2261 sock = mdev->data.socket;
2262
2263 if (likely(sock != NULL)) {
2264 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2265 (struct p_header80 *)&p, sizeof(p), 0);
2266 }
2267
2268 mutex_unlock(&mdev->data.mutex);
2269
2270 return ok;
2271 }
2272
2273 int drbd_send_state_req(struct drbd_conf *mdev,
2274 union drbd_state mask, union drbd_state val)
2275 {
2276 struct p_req_state p;
2277
2278 p.mask = cpu_to_be32(mask.i);
2279 p.val = cpu_to_be32(val.i);
2280
2281 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2282 (struct p_header80 *)&p, sizeof(p));
2283 }
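/* Sketch of the mask/val convention (derived from how NS() is used
 * elsewhere in this file): mask marks the state fields the peer should
 * change, val holds the new values for exactly those fields; fields
 * outside the mask are left untouched by the peer. */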
2284
2285 int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
2286 {
2287 struct p_req_state_reply p;
2288
2289 p.retcode = cpu_to_be32(retcode);
2290
2291 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2292 (struct p_header80 *)&p, sizeof(p));
2293 }
2294
2295 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2296 struct p_compressed_bm *p,
2297 struct bm_xfer_ctx *c)
2298 {
2299 struct bitstream bs;
2300 unsigned long plain_bits;
2301 unsigned long tmp;
2302 unsigned long rl;
2303 unsigned len;
2304 unsigned toggle;
2305 int bits;
2306
2307 /* may we use this feature? */
2308 if ((mdev->sync_conf.use_rle == 0) ||
2309 (mdev->agreed_pro_version < 90))
2310 return 0;
2311
2312 if (c->bit_offset >= c->bm_bits)
2313 return 0; /* nothing to do. */
2314
2315 /* use at most this many bytes */
2316 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2317 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2318 /* plain bits covered in this code string */
2319 plain_bits = 0;
2320
2321 /* p->encoding & 0x80 stores whether the first run length is set.
2322 * bit offset is implicit.
2323 * start with toggle == 2 to be able to tell the first iteration */
2324 toggle = 2;
2325
2326 /* see how many plain bits we can stuff into one packet
2327 * using RLE and VLI. */
2328 do {
2329 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2330 : _drbd_bm_find_next(mdev, c->bit_offset);
2331 if (tmp == -1UL)
2332 tmp = c->bm_bits;
2333 rl = tmp - c->bit_offset;
2334
2335 if (toggle == 2) { /* first iteration */
2336 if (rl == 0) {
2337 /* the first checked bit was set,
2338 * store start value, */
2339 DCBP_set_start(p, 1);
2340 /* but skip encoding of zero run length */
2341 toggle = !toggle;
2342 continue;
2343 }
2344 DCBP_set_start(p, 0);
2345 }
2346
2347 /* paranoia: catch zero runlength.
2348 * can only happen if bitmap is modified while we scan it. */
2349 if (rl == 0) {
2350 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2351 "t:%u bo:%lu\n", toggle, c->bit_offset);
2352 return -1;
2353 }
2354
2355 bits = vli_encode_bits(&bs, rl);
2356 if (bits == -ENOBUFS) /* buffer full */
2357 break;
2358 if (bits <= 0) {
2359 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2360 return 0;
2361 }
2362
2363 toggle = !toggle;
2364 plain_bits += rl;
2365 c->bit_offset = tmp;
2366 } while (c->bit_offset < c->bm_bits);
2367
2368 len = bs.cur.b - p->code + !!bs.cur.bit;
2369
2370 if (plain_bits < (len << 3)) {
2371 /* incompressible with this method.
2372 * we need to rewind both word and bit position. */
2373 c->bit_offset -= plain_bits;
2374 bm_xfer_ctx_bit_to_word_offset(c);
2375 c->bit_offset = c->word_offset * BITS_PER_LONG;
2376 return 0;
2377 }
2378
2379 /* RLE + VLI was able to compress it just fine.
2380 * update c->word_offset. */
2381 bm_xfer_ctx_bit_to_word_offset(c);
2382
2383 /* store pad_bits */
2384 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2385
2386 return len;
2387 }
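/* Worked example (illustrative only): suppose the code buffer ends up
 * holding len == 20 bytes of VLI-encoded run lengths that together cover
 * plain_bits == 300 bitmap bits.  Since 300 >= 20 << 3 (== 160), the
 * compressed form is kept.  Had it covered only 100 bits, 100 < 160 and
 * the function rewinds bit_offset and returns 0, so the caller falls back
 * to a plain-text bitmap packet. */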
2388
2389 /**
2390 * send_bitmap_rle_or_plain
2391 *
2392 * Return 0 when done, 1 when another iteration is needed, and a negative error
2393 * code upon failure.
2394 */
2395 static int
2396 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2397 struct p_header80 *h, struct bm_xfer_ctx *c)
2398 {
2399 struct p_compressed_bm *p = (void*)h;
2400 unsigned long num_words;
2401 int len;
2402 int ok;
2403
2404 len = fill_bitmap_rle_bits(mdev, p, c);
2405
2406 if (len < 0)
2407 return -EIO;
2408
2409 if (len) {
2410 DCBP_set_code(p, RLE_VLI_Bits);
2411 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2412 sizeof(*p) + len, 0);
2413
2414 c->packets[0]++;
2415 c->bytes[0] += sizeof(*p) + len;
2416
2417 if (c->bit_offset >= c->bm_bits)
2418 len = 0; /* DONE */
2419 } else {
2420 /* was not compressible.
2421 * send a buffer full of plain text bits instead. */
2422 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2423 len = num_words * sizeof(long);
2424 if (len)
2425 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2426 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2427 h, sizeof(struct p_header80) + len, 0);
2428 c->word_offset += num_words;
2429 c->bit_offset = c->word_offset * BITS_PER_LONG;
2430
2431 c->packets[1]++;
2432 c->bytes[1] += sizeof(struct p_header80) + len;
2433
2434 if (c->bit_offset > c->bm_bits)
2435 c->bit_offset = c->bm_bits;
2436 }
2437 if (ok) {
2438 if (len == 0) {
2439 INFO_bm_xfer_stats(mdev, "send", c);
2440 return 0;
2441 } else
2442 return 1;
2443 }
2444 return -EIO;
2445 }
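/* Return-value contract in practice: the caller loops
 *     do { err = send_bitmap_rle_or_plain(...); } while (err > 0);
 * (see _drbd_send_bitmap below), so 1 means "more to send", 0 means the
 * transfer is complete, and a negative value aborts it.  The statistics
 * arrays use index 0 for compressed packets and index 1 for plain ones. */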
2446
2447 /* See the comment at receive_bitmap() */
2448 int _drbd_send_bitmap(struct drbd_conf *mdev)
2449 {
2450 struct bm_xfer_ctx c;
2451 struct p_header80 *p;
2452 int err;
2453
2454 ERR_IF(!mdev->bitmap) return false;
2455
2456 /* maybe we should use some per thread scratch page,
2457 * and allocate that during initial device creation? */
2458 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2459 if (!p) {
2460 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2461 return false;
2462 }
2463
2464 if (get_ldev(mdev)) {
2465 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2466 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2467 drbd_bm_set_all(mdev);
2468 if (drbd_bm_write(mdev)) {
2469 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2470 * but otherwise process as per normal - need to tell other
2471 * side that a full resync is required! */
2472 dev_err(DEV, "Failed to write bitmap to disk!\n");
2473 } else {
2474 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2475 drbd_md_sync(mdev);
2476 }
2477 }
2478 put_ldev(mdev);
2479 }
2480
2481 c = (struct bm_xfer_ctx) {
2482 .bm_bits = drbd_bm_bits(mdev),
2483 .bm_words = drbd_bm_words(mdev),
2484 };
2485
2486 do {
2487 err = send_bitmap_rle_or_plain(mdev, p, &c);
2488 } while (err > 0);
2489
2490 free_page((unsigned long) p);
2491 return err == 0;
2492 }
2493
2494 int drbd_send_bitmap(struct drbd_conf *mdev)
2495 {
2496 int err;
2497
2498 if (!drbd_get_data_sock(mdev))
2499 return -1;
2500 err = !_drbd_send_bitmap(mdev);
2501 drbd_put_data_sock(mdev);
2502 return err;
2503 }
2504
2505 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2506 {
2507 int ok;
2508 struct p_barrier_ack p;
2509
2510 p.barrier = barrier_nr;
2511 p.set_size = cpu_to_be32(set_size);
2512
2513 if (mdev->state.conn < C_CONNECTED)
2514 return false;
2515 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2516 (struct p_header80 *)&p, sizeof(p));
2517 return ok;
2518 }
2519
2520 /**
2521 * _drbd_send_ack() - Sends an ack packet
2522 * @mdev: DRBD device.
2523 * @cmd: Packet command code.
2524 * @sector: sector, needs to be in big endian byte order
2525 * @blksize: size in bytes, needs to be in big endian byte order
2526 * @block_id: Id, big endian byte order
2527 */
2528 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2529 u64 sector,
2530 u32 blksize,
2531 u64 block_id)
2532 {
2533 int ok;
2534 struct p_block_ack p;
2535
2536 p.sector = sector;
2537 p.block_id = block_id;
2538 p.blksize = blksize;
2539 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2540
2541 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2542 return false;
2543 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2544 (struct p_header80 *)&p, sizeof(p));
2545 return ok;
2546 }
2547
2548 /* dp->sector and dp->block_id already/still in network byte order,
2549 * data_size is payload size according to dp->head,
2550 * and may need to be corrected for digest size. */
2551 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2552 struct p_data *dp, int data_size)
2553 {
2554 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2555 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2556 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2557 dp->block_id);
2558 }
2559
2560 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2561 struct p_block_req *rp)
2562 {
2563 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2564 }
2565
2566 /**
2567 * drbd_send_ack() - Sends an ack packet
2568 * @mdev: DRBD device.
2569 * @cmd: Packet command code.
2570 * @e: Epoch entry.
2571 */
2572 int drbd_send_ack(struct drbd_conf *mdev,
2573 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2574 {
2575 return _drbd_send_ack(mdev, cmd,
2576 cpu_to_be64(e->sector),
2577 cpu_to_be32(e->size),
2578 e->block_id);
2579 }
2580
2581 /* This function misuses the block_id field to signal if the blocks
2582 * are in sync or not. */
2583 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2584 sector_t sector, int blksize, u64 block_id)
2585 {
2586 return _drbd_send_ack(mdev, cmd,
2587 cpu_to_be64(sector),
2588 cpu_to_be32(blksize),
2589 cpu_to_be64(block_id));
2590 }
2591
2592 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2593 sector_t sector, int size, u64 block_id)
2594 {
2595 int ok;
2596 struct p_block_req p;
2597
2598 p.sector = cpu_to_be64(sector);
2599 p.block_id = block_id;
2600 p.blksize = cpu_to_be32(size);
2601
2602 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2603 (struct p_header80 *)&p, sizeof(p));
2604 return ok;
2605 }
2606
2607 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2608 sector_t sector, int size,
2609 void *digest, int digest_size,
2610 enum drbd_packets cmd)
2611 {
2612 int ok;
2613 struct p_block_req p;
2614
2615 p.sector = cpu_to_be64(sector);
2616 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2617 p.blksize = cpu_to_be32(size);
2618
2619 p.head.magic = BE_DRBD_MAGIC;
2620 p.head.command = cpu_to_be16(cmd);
2621 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2622
2623 mutex_lock(&mdev->data.mutex);
2624
2625 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2626 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2627
2628 mutex_unlock(&mdev->data.mutex);
2629
2630 return ok;
2631 }
2632
2633 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2634 {
2635 int ok;
2636 struct p_block_req p;
2637
2638 p.sector = cpu_to_be64(sector);
2639 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2640 p.blksize = cpu_to_be32(size);
2641
2642 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2643 (struct p_header80 *)&p, sizeof(p));
2644 return ok;
2645 }
2646
2647 /* called on sndtimeo
2648 * returns false if we should retry,
2649 * true if we think the connection is dead
2650 */
2651 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2652 {
2653 int drop_it;
2654 /* long elapsed = (long)(jiffies - mdev->last_received); */
2655
2656 drop_it = mdev->meta.socket == sock
2657 || !mdev->asender.task
2658 || get_t_state(&mdev->asender) != Running
2659 || mdev->state.conn < C_CONNECTED;
2660
2661 if (drop_it)
2662 return true;
2663
2664 drop_it = !--mdev->ko_count;
2665 if (!drop_it) {
2666 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2667 current->comm, current->pid, mdev->ko_count);
2668 request_ping(mdev);
2669 }
2670
2671 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2672 }
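/* ko_count sketch: every send timeout that lands here decrements
 * mdev->ko_count (re-armed from net_conf->ko_count in drbd_send()); while
 * it is still non-zero we only log, request a ping and let the caller
 * retry, and once it reaches zero we give up on the connection. */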
2673
2674 /* The idea of sendpage seems to be to put some kind of reference
2675 * to the page into the skb, and to hand it over to the NIC. In
2676 * this process get_page() gets called.
2677 *
2678 * As soon as the page was really sent over the network put_page()
2679 * gets called by some part of the network layer. [ NIC driver? ]
2680 *
2681 * [ get_page() / put_page() increment/decrement the count. If count
2682 * reaches 0 the page will be freed. ]
2683 *
2684 * This works nicely with pages from FSs.
2685 * But this means that in protocol A we might signal IO completion too early!
2686 *
2687 * In order not to corrupt data during a resync we must make sure
2688 * that we do not reuse our own buffer pages (EEs) too early, therefore
2689 * we have the net_ee list.
2690 *
2691 * XFS seems to have problems, still, it submits pages with page_count == 0!
2692 * As a workaround, we disable sendpage on pages
2693 * with page_count == 0 or PageSlab.
2694 */
2695 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2696 int offset, size_t size, unsigned msg_flags)
2697 {
2698 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2699 kunmap(page);
2700 if (sent == size)
2701 mdev->send_cnt += size>>9;
2702 return sent == size;
2703 }
2704
2705 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2706 int offset, size_t size, unsigned msg_flags)
2707 {
2708 mm_segment_t oldfs = get_fs();
2709 int sent, ok;
2710 int len = size;
2711
2712 /* e.g. XFS meta- & log-data is in slab pages, which have a
2713 * page_count of 0 and/or have PageSlab() set.
2714 * we cannot use send_page for those, as that does get_page();
2715 * put_page(); and would cause either a VM_BUG directly, or
2716 * __page_cache_release a page that would actually still be referenced
2717 * by someone, leading to some obscure delayed Oops somewhere else. */
2718 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2719 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2720
2721 msg_flags |= MSG_NOSIGNAL;
2722 drbd_update_congested(mdev);
2723 set_fs(KERNEL_DS);
2724 do {
2725 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2726 offset, len,
2727 msg_flags);
2728 if (sent == -EAGAIN) {
2729 if (we_should_drop_the_connection(mdev,
2730 mdev->data.socket))
2731 break;
2732 else
2733 continue;
2734 }
2735 if (sent <= 0) {
2736 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2737 __func__, (int)size, len, sent);
2738 break;
2739 }
2740 len -= sent;
2741 offset += sent;
2742 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2743 set_fs(oldfs);
2744 clear_bit(NET_CONGESTED, &mdev->flags);
2745
2746 ok = (len == 0);
2747 if (likely(ok))
2748 mdev->send_cnt += size>>9;
2749 return ok;
2750 }
2751
2752 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2753 {
2754 struct bio_vec *bvec;
2755 int i;
2756 /* hint all but last page with MSG_MORE */
2757 bio_for_each_segment(bvec, bio, i) {
2758 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2759 bvec->bv_offset, bvec->bv_len,
2760 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2761 return 0;
2762 }
2763 return 1;
2764 }
2765
2766 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2767 {
2768 struct bio_vec *bvec;
2769 int i;
2770 /* hint all but last page with MSG_MORE */
2771 bio_for_each_segment(bvec, bio, i) {
2772 if (!_drbd_send_page(mdev, bvec->bv_page,
2773 bvec->bv_offset, bvec->bv_len,
2774 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2775 return 0;
2776 }
2777 return 1;
2778 }
2779
2780 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2781 {
2782 struct page *page = e->pages;
2783 unsigned len = e->size;
2784 /* hint all but last page with MSG_MORE */
2785 page_chain_for_each(page) {
2786 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2787 if (!_drbd_send_page(mdev, page, 0, l,
2788 page_chain_next(page) ? MSG_MORE : 0))
2789 return 0;
2790 len -= l;
2791 }
2792 return 1;
2793 }
2794
2795 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2796 {
2797 if (mdev->agreed_pro_version >= 95)
2798 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2799 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2800 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2801 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2802 else
2803 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2804 }
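/* Example mapping (derived from the code above): for a peer with
 * agreed_pro_version >= 95, a bio submitted with REQ_FLUSH | REQ_FUA is
 * sent with dp_flags = DP_FLUSH | DP_FUA; an older peer only ever sees
 * the DP_RW_SYNC bit, the FUA/FLUSH/DISCARD hints are simply not
 * transmitted. */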
2805
2806 /* Used to send write requests
2807 * R_PRIMARY -> Peer (P_DATA)
2808 */
2809 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2810 {
2811 int ok = 1;
2812 struct p_data p;
2813 unsigned int dp_flags = 0;
2814 void *dgb;
2815 int dgs;
2816
2817 if (!drbd_get_data_sock(mdev))
2818 return 0;
2819
2820 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2821 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2822
2823 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2824 p.head.h80.magic = BE_DRBD_MAGIC;
2825 p.head.h80.command = cpu_to_be16(P_DATA);
2826 p.head.h80.length =
2827 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2828 } else {
2829 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2830 p.head.h95.command = cpu_to_be16(P_DATA);
2831 p.head.h95.length =
2832 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2833 }
2834
2835 p.sector = cpu_to_be64(req->sector);
2836 p.block_id = (unsigned long)req;
2837 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2838
2839 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2840
2841 if (mdev->state.conn >= C_SYNC_SOURCE &&
2842 mdev->state.conn <= C_PAUSED_SYNC_T)
2843 dp_flags |= DP_MAY_SET_IN_SYNC;
2844
2845 p.dp_flags = cpu_to_be32(dp_flags);
2846 set_bit(UNPLUG_REMOTE, &mdev->flags);
2847 ok = (sizeof(p) ==
2848 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2849 if (ok && dgs) {
2850 dgb = mdev->int_dig_out;
2851 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2852 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2853 }
2854 if (ok) {
2855 /* For protocol A, we have to memcpy the payload into
2856 * socket buffers, as we may complete right away
2857 * as soon as we handed it over to tcp, at which point the data
2858 * pages may become invalid.
2859 *
2860 * For data-integrity enabled, we copy it as well, so we can be
2861 * sure that even if the bio pages may still be modified, it
2862 * won't change the data on the wire, thus if the digest checks
2863 * out ok after sending on this side, but does not fit on the
2864 * receiving side, we can be sure the corruption happened elsewhere.
2865 */
2866 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2867 ok = _drbd_send_bio(mdev, req->master_bio);
2868 else
2869 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2870
2871 /* double check digest, sometimes buffers have been modified in flight. */
2872 if (dgs > 0 && dgs <= 64) {
2873 /* 64 byte, 512 bit, is the largest digest size
2874 * currently supported in kernel crypto. */
2875 unsigned char digest[64];
2876 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2877 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2878 dev_warn(DEV,
2879 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2880 (unsigned long long)req->sector, req->size);
2881 }
2882 } /* else if (dgs > 64) {
2883 ... Be noisy about digest too large ...
2884 } */
2885 }
2886
2887 drbd_put_data_sock(mdev);
2888
2889 return ok;
2890 }
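/* Note on the digest re-check above: it is purely diagnostic.  When upper
 * layers modify the bio pages while the write is in flight, the warning
 * fires, but the packet has already been sent and is not retried here;
 * with data integrity enabled, copying via _drbd_send_bio() (instead of
 * the zero-copy path) keeps the wire data consistent with the digest. */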
2891
2892 /* answer packet, used to send data back for read requests:
2893 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2894 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2895 */
2896 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2897 struct drbd_epoch_entry *e)
2898 {
2899 int ok;
2900 struct p_data p;
2901 void *dgb;
2902 int dgs;
2903
2904 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2905 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2906
2907 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2908 p.head.h80.magic = BE_DRBD_MAGIC;
2909 p.head.h80.command = cpu_to_be16(cmd);
2910 p.head.h80.length =
2911 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2912 } else {
2913 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2914 p.head.h95.command = cpu_to_be16(cmd);
2915 p.head.h95.length =
2916 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2917 }
2918
2919 p.sector = cpu_to_be64(e->sector);
2920 p.block_id = e->block_id;
2921 /* p.seq_num = 0; No sequence numbers here.. */
2922
2923 /* Only called by our kernel thread.
2924 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2925 * in response to admin command or module unload.
2926 */
2927 if (!drbd_get_data_sock(mdev))
2928 return 0;
2929
2930 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2931 if (ok && dgs) {
2932 dgb = mdev->int_dig_out;
2933 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2934 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2935 }
2936 if (ok)
2937 ok = _drbd_send_zc_ee(mdev, e);
2938
2939 drbd_put_data_sock(mdev);
2940
2941 return ok;
2942 }
2943
2944 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2945 {
2946 struct p_block_desc p;
2947
2948 p.sector = cpu_to_be64(req->sector);
2949 p.blksize = cpu_to_be32(req->size);
2950
2951 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2952 }
2953
2954 /*
2955 drbd_send distinguishes two cases:
2956
2957 Packets sent via the data socket "sock"
2958 and packets sent via the meta data socket "msock"
2959
2960                     sock                      msock
2961   -----------------+-------------------------+------------------------------
2962   timeout           conf.timeout / 2          conf.timeout / 2
2963   timeout action    send a ping via msock     Abort communication
2964                                               and close all sockets
2965 */
2966
2967 /*
2968 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2969 */
2970 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2971 void *buf, size_t size, unsigned msg_flags)
2972 {
2973 struct kvec iov;
2974 struct msghdr msg;
2975 int rv, sent = 0;
2976
2977 if (!sock)
2978 return -1000;
2979
2980 /* THINK if (signal_pending) return ... ? */
2981
2982 iov.iov_base = buf;
2983 iov.iov_len = size;
2984
2985 msg.msg_name = NULL;
2986 msg.msg_namelen = 0;
2987 msg.msg_control = NULL;
2988 msg.msg_controllen = 0;
2989 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2990
2991 if (sock == mdev->data.socket) {
2992 mdev->ko_count = mdev->net_conf->ko_count;
2993 drbd_update_congested(mdev);
2994 }
2995 do {
2996 /* STRANGE
2997 * tcp_sendmsg does _not_ use its size parameter at all ?
2998 *
2999 * -EAGAIN on timeout, -EINTR on signal.
3000 */
3001 /* THINK
3002 * do we need to block DRBD_SIG if sock == &meta.socket ??
3003 * otherwise wake_asender() might interrupt some send_*Ack !
3004 */
3005 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
3006 if (rv == -EAGAIN) {
3007 if (we_should_drop_the_connection(mdev, sock))
3008 break;
3009 else
3010 continue;
3011 }
3012 D_ASSERT(rv != 0);
3013 if (rv == -EINTR) {
3014 flush_signals(current);
3015 rv = 0;
3016 }
3017 if (rv < 0)
3018 break;
3019 sent += rv;
3020 iov.iov_base += rv;
3021 iov.iov_len -= rv;
3022 } while (sent < size);
3023
3024 if (sock == mdev->data.socket)
3025 clear_bit(NET_CONGESTED, &mdev->flags);
3026
3027 if (rv <= 0) {
3028 if (rv != -EAGAIN) {
3029 dev_err(DEV, "%s_sendmsg returned %d\n",
3030 sock == mdev->meta.socket ? "msock" : "sock",
3031 rv);
3032 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
3033 } else
3034 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
3035 }
3036
3037 return sent;
3038 }
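/* Caller contract (as used throughout this file): drbd_send() returns the
 * number of bytes actually handed to the socket (possibly short, or -1000
 * when there is no socket), so callers compare against the size they
 * asked for, e.g.
 *   ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
 * A short send therefore shows up as "not ok" rather than as an errno. */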
3039
3040 static int drbd_open(struct block_device *bdev, fmode_t mode)
3041 {
3042 struct drbd_conf *mdev = bdev->bd_disk->private_data;
3043 unsigned long flags;
3044 int rv = 0;
3045
3046 mutex_lock(&drbd_main_mutex);
3047 spin_lock_irqsave(&mdev->req_lock, flags);
3048 /* to have a stable mdev->state.role
3049 * and no race with updating open_cnt */
3050
3051 if (mdev->state.role != R_PRIMARY) {
3052 if (mode & FMODE_WRITE)
3053 rv = -EROFS;
3054 else if (!allow_oos)
3055 rv = -EMEDIUMTYPE;
3056 }
3057
3058 if (!rv)
3059 mdev->open_cnt++;
3060 spin_unlock_irqrestore(&mdev->req_lock, flags);
3061 mutex_unlock(&drbd_main_mutex);
3062
3063 return rv;
3064 }
3065
3066 static int drbd_release(struct gendisk *gd, fmode_t mode)
3067 {
3068 struct drbd_conf *mdev = gd->private_data;
3069 mutex_lock(&drbd_main_mutex);
3070 mdev->open_cnt--;
3071 mutex_unlock(&drbd_main_mutex);
3072 return 0;
3073 }
3074
3075 static void drbd_set_defaults(struct drbd_conf *mdev)
3076 {
3077 /* This way we get a compile error when sync_conf grows,
3078 and we forget to initialize it here */
3079 mdev->sync_conf = (struct syncer_conf) {
3080 /* .rate = */ DRBD_RATE_DEF,
3081 /* .after = */ DRBD_AFTER_DEF,
3082 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
3083 /* .verify_alg = */ {}, 0,
3084 /* .cpu_mask = */ {}, 0,
3085 /* .csums_alg = */ {}, 0,
3086 /* .use_rle = */ 0,
3087 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
3088 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
3089 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
3090 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
3091 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3092 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
3093 };
3094
3095 /* Have to do it this way, because the layout differs between
3096 big-endian and little-endian machines */
3097 mdev->state = (union drbd_state) {
3098 { .role = R_SECONDARY,
3099 .peer = R_UNKNOWN,
3100 .conn = C_STANDALONE,
3101 .disk = D_DISKLESS,
3102 .pdsk = D_UNKNOWN,
3103 .susp = 0,
3104 .susp_nod = 0,
3105 .susp_fen = 0
3106 } };
3107 }
3108
3109 void drbd_init_set_defaults(struct drbd_conf *mdev)
3110 {
3111 /* the memset(,0,) did most of this.
3112 * note: only assignments, no allocation in here */
3113
3114 drbd_set_defaults(mdev);
3115
3116 atomic_set(&mdev->ap_bio_cnt, 0);
3117 atomic_set(&mdev->ap_pending_cnt, 0);
3118 atomic_set(&mdev->rs_pending_cnt, 0);
3119 atomic_set(&mdev->unacked_cnt, 0);
3120 atomic_set(&mdev->local_cnt, 0);
3121 atomic_set(&mdev->net_cnt, 0);
3122 atomic_set(&mdev->packet_seq, 0);
3123 atomic_set(&mdev->pp_in_use, 0);
3124 atomic_set(&mdev->pp_in_use_by_net, 0);
3125 atomic_set(&mdev->rs_sect_in, 0);
3126 atomic_set(&mdev->rs_sect_ev, 0);
3127 atomic_set(&mdev->ap_in_flight, 0);
3128 atomic_set(&mdev->md_io_in_use, 0);
3129
3130 mutex_init(&mdev->data.mutex);
3131 mutex_init(&mdev->meta.mutex);
3132 sema_init(&mdev->data.work.s, 0);
3133 sema_init(&mdev->meta.work.s, 0);
3134 mutex_init(&mdev->state_mutex);
3135
3136 spin_lock_init(&mdev->data.work.q_lock);
3137 spin_lock_init(&mdev->meta.work.q_lock);
3138
3139 spin_lock_init(&mdev->al_lock);
3140 spin_lock_init(&mdev->req_lock);
3141 spin_lock_init(&mdev->peer_seq_lock);
3142 spin_lock_init(&mdev->epoch_lock);
3143
3144 INIT_LIST_HEAD(&mdev->active_ee);
3145 INIT_LIST_HEAD(&mdev->sync_ee);
3146 INIT_LIST_HEAD(&mdev->done_ee);
3147 INIT_LIST_HEAD(&mdev->read_ee);
3148 INIT_LIST_HEAD(&mdev->net_ee);
3149 INIT_LIST_HEAD(&mdev->resync_reads);
3150 INIT_LIST_HEAD(&mdev->data.work.q);
3151 INIT_LIST_HEAD(&mdev->meta.work.q);
3152 INIT_LIST_HEAD(&mdev->resync_work.list);
3153 INIT_LIST_HEAD(&mdev->unplug_work.list);
3154 INIT_LIST_HEAD(&mdev->go_diskless.list);
3155 INIT_LIST_HEAD(&mdev->md_sync_work.list);
3156 INIT_LIST_HEAD(&mdev->start_resync_work.list);
3157 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
3158
3159 mdev->resync_work.cb = w_resync_timer;
3160 mdev->unplug_work.cb = w_send_write_hint;
3161 mdev->go_diskless.cb = w_go_diskless;
3162 mdev->md_sync_work.cb = w_md_sync;
3163 mdev->bm_io_work.w.cb = w_bitmap_io;
3164 mdev->start_resync_work.cb = w_start_resync;
3165 init_timer(&mdev->resync_timer);
3166 init_timer(&mdev->md_sync_timer);
3167 init_timer(&mdev->start_resync_timer);
3168 init_timer(&mdev->request_timer);
3169 mdev->resync_timer.function = resync_timer_fn;
3170 mdev->resync_timer.data = (unsigned long) mdev;
3171 mdev->md_sync_timer.function = md_sync_timer_fn;
3172 mdev->md_sync_timer.data = (unsigned long) mdev;
3173 mdev->start_resync_timer.function = start_resync_timer_fn;
3174 mdev->start_resync_timer.data = (unsigned long) mdev;
3175 mdev->request_timer.function = request_timer_fn;
3176 mdev->request_timer.data = (unsigned long) mdev;
3177
3178 init_waitqueue_head(&mdev->misc_wait);
3179 init_waitqueue_head(&mdev->state_wait);
3180 init_waitqueue_head(&mdev->net_cnt_wait);
3181 init_waitqueue_head(&mdev->ee_wait);
3182 init_waitqueue_head(&mdev->al_wait);
3183 init_waitqueue_head(&mdev->seq_wait);
3184
3185 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3186 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3187 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3188
3189 mdev->agreed_pro_version = PRO_VERSION_MAX;
3190 mdev->write_ordering = WO_bdev_flush;
3191 mdev->resync_wenr = LC_FREE;
3192 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3193 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3194 }
3195
3196 void drbd_mdev_cleanup(struct drbd_conf *mdev)
3197 {
3198 int i;
3199 if (mdev->receiver.t_state != None)
3200 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3201 mdev->receiver.t_state);
3202
3203 /* no need to lock it, I'm the only thread alive */
3204 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3205 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3206 mdev->al_writ_cnt =
3207 mdev->bm_writ_cnt =
3208 mdev->read_cnt =
3209 mdev->recv_cnt =
3210 mdev->send_cnt =
3211 mdev->writ_cnt =
3212 mdev->p_size =
3213 mdev->rs_start =
3214 mdev->rs_total =
3215 mdev->rs_failed = 0;
3216 mdev->rs_last_events = 0;
3217 mdev->rs_last_sect_ev = 0;
3218 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3219 mdev->rs_mark_left[i] = 0;
3220 mdev->rs_mark_time[i] = 0;
3221 }
3222 D_ASSERT(mdev->net_conf == NULL);
3223
3224 drbd_set_my_capacity(mdev, 0);
3225 if (mdev->bitmap) {
3226 /* maybe never allocated. */
3227 drbd_bm_resize(mdev, 0, 1);
3228 drbd_bm_cleanup(mdev);
3229 }
3230
3231 drbd_free_resources(mdev);
3232 clear_bit(AL_SUSPENDED, &mdev->flags);
3233
3234 /*
3235 * currently we drbd_init_ee only on module load, so
3236 * we may do drbd_release_ee only on module unload!
3237 */
3238 D_ASSERT(list_empty(&mdev->active_ee));
3239 D_ASSERT(list_empty(&mdev->sync_ee));
3240 D_ASSERT(list_empty(&mdev->done_ee));
3241 D_ASSERT(list_empty(&mdev->read_ee));
3242 D_ASSERT(list_empty(&mdev->net_ee));
3243 D_ASSERT(list_empty(&mdev->resync_reads));
3244 D_ASSERT(list_empty(&mdev->data.work.q));
3245 D_ASSERT(list_empty(&mdev->meta.work.q));
3246 D_ASSERT(list_empty(&mdev->resync_work.list));
3247 D_ASSERT(list_empty(&mdev->unplug_work.list));
3248 D_ASSERT(list_empty(&mdev->go_diskless.list));
3249
3250 drbd_set_defaults(mdev);
3251 }
3252
3253
3254 static void drbd_destroy_mempools(void)
3255 {
3256 struct page *page;
3257
3258 while (drbd_pp_pool) {
3259 page = drbd_pp_pool;
3260 drbd_pp_pool = (struct page *)page_private(page);
3261 __free_page(page);
3262 drbd_pp_vacant--;
3263 }
3264
3265 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3266
3267 if (drbd_ee_mempool)
3268 mempool_destroy(drbd_ee_mempool);
3269 if (drbd_request_mempool)
3270 mempool_destroy(drbd_request_mempool);
3271 if (drbd_ee_cache)
3272 kmem_cache_destroy(drbd_ee_cache);
3273 if (drbd_request_cache)
3274 kmem_cache_destroy(drbd_request_cache);
3275 if (drbd_bm_ext_cache)
3276 kmem_cache_destroy(drbd_bm_ext_cache);
3277 if (drbd_al_ext_cache)
3278 kmem_cache_destroy(drbd_al_ext_cache);
3279
3280 drbd_ee_mempool = NULL;
3281 drbd_request_mempool = NULL;
3282 drbd_ee_cache = NULL;
3283 drbd_request_cache = NULL;
3284 drbd_bm_ext_cache = NULL;
3285 drbd_al_ext_cache = NULL;
3286
3287 return;
3288 }
3289
3290 static int drbd_create_mempools(void)
3291 {
3292 struct page *page;
3293 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3294 int i;
3295
3296 /* prepare our caches and mempools */
3297 drbd_request_mempool = NULL;
3298 drbd_ee_cache = NULL;
3299 drbd_request_cache = NULL;
3300 drbd_bm_ext_cache = NULL;
3301 drbd_al_ext_cache = NULL;
3302 drbd_pp_pool = NULL;
3303
3304 /* caches */
3305 drbd_request_cache = kmem_cache_create(
3306 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3307 if (drbd_request_cache == NULL)
3308 goto Enomem;
3309
3310 drbd_ee_cache = kmem_cache_create(
3311 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3312 if (drbd_ee_cache == NULL)
3313 goto Enomem;
3314
3315 drbd_bm_ext_cache = kmem_cache_create(
3316 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3317 if (drbd_bm_ext_cache == NULL)
3318 goto Enomem;
3319
3320 drbd_al_ext_cache = kmem_cache_create(
3321 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3322 if (drbd_al_ext_cache == NULL)
3323 goto Enomem;
3324
3325 /* mempools */
3326 drbd_request_mempool = mempool_create(number,
3327 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3328 if (drbd_request_mempool == NULL)
3329 goto Enomem;
3330
3331 drbd_ee_mempool = mempool_create(number,
3332 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3333 if (drbd_ee_mempool == NULL)
3334 goto Enomem;
3335
3336 /* drbd's page pool */
3337 spin_lock_init(&drbd_pp_lock);
3338
3339 for (i = 0; i < number; i++) {
3340 page = alloc_page(GFP_HIGHUSER);
3341 if (!page)
3342 goto Enomem;
3343 set_page_private(page, (unsigned long)drbd_pp_pool);
3344 drbd_pp_pool = page;
3345 }
3346 drbd_pp_vacant = number;
3347
3348 return 0;
3349
3350 Enomem:
3351 drbd_destroy_mempools(); /* in case we allocated some */
3352 return -ENOMEM;
3353 }
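/* Sizing sketch (hypothetical values): if DRBD_MAX_BIO_SIZE were 1 MiB,
 * PAGE_SIZE 4 KiB and minor_count 16, then number = (1M / 4K) * 16 = 4096,
 * i.e. the page pool, the request mempool and the EE mempool are all
 * provisioned so that every configured minor can keep one maximally sized
 * bio worth of pages in flight. */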
3354
3355 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3356 void *unused)
3357 {
3358 /* just so we have it. you never know what interesting things we
3359 * might want to do here some day...
3360 */
3361
3362 return NOTIFY_DONE;
3363 }
3364
3365 static struct notifier_block drbd_notifier = {
3366 .notifier_call = drbd_notify_sys,
3367 };
3368
3369 static void drbd_release_ee_lists(struct drbd_conf *mdev)
3370 {
3371 int rr;
3372
3373 rr = drbd_release_ee(mdev, &mdev->active_ee);
3374 if (rr)
3375 dev_err(DEV, "%d EEs in active list found!\n", rr);
3376
3377 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3378 if (rr)
3379 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3380
3381 rr = drbd_release_ee(mdev, &mdev->read_ee);
3382 if (rr)
3383 dev_err(DEV, "%d EEs in read list found!\n", rr);
3384
3385 rr = drbd_release_ee(mdev, &mdev->done_ee);
3386 if (rr)
3387 dev_err(DEV, "%d EEs in done list found!\n", rr);
3388
3389 rr = drbd_release_ee(mdev, &mdev->net_ee);
3390 if (rr)
3391 dev_err(DEV, "%d EEs in net list found!\n", rr);
3392 }
3393
3394 /* caution. no locking.
3395 * currently only used from module cleanup code. */
3396 static void drbd_delete_device(unsigned int minor)
3397 {
3398 struct drbd_conf *mdev = minor_to_mdev(minor);
3399
3400 if (!mdev)
3401 return;
3402
3403 del_timer_sync(&mdev->request_timer);
3404
3405 /* paranoia asserts */
3406 if (mdev->open_cnt != 0)
3407 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3408 __FILE__ , __LINE__);
3409
3410 ERR_IF (!list_empty(&mdev->data.work.q)) {
3411 struct list_head *lp;
3412 list_for_each(lp, &mdev->data.work.q) {
3413 dev_err(DEV, "lp = %p\n", lp);
3414 }
3415 };
3416 /* end paranoia asserts */
3417
3418 del_gendisk(mdev->vdisk);
3419
3420 /* cleanup stuff that may have been allocated during
3421 * device (re-)configuration or state changes */
3422
3423 if (mdev->this_bdev)
3424 bdput(mdev->this_bdev);
3425
3426 drbd_free_resources(mdev);
3427
3428 drbd_release_ee_lists(mdev);
3429
3430 /* should be freed on disconnect? */
3431 kfree(mdev->ee_hash);
3432 /*
3433 mdev->ee_hash_s = 0;
3434 mdev->ee_hash = NULL;
3435 */
3436
3437 lc_destroy(mdev->act_log);
3438 lc_destroy(mdev->resync);
3439
3440 kfree(mdev->p_uuid);
3441 /* mdev->p_uuid = NULL; */
3442
3443 kfree(mdev->int_dig_out);
3444 kfree(mdev->int_dig_in);
3445 kfree(mdev->int_dig_vv);
3446
3447 /* cleanup the rest that has been
3448 * allocated from drbd_new_device
3449 * and actually free the mdev itself */
3450 drbd_free_mdev(mdev);
3451 }
3452
3453 static void drbd_cleanup(void)
3454 {
3455 unsigned int i;
3456
3457 unregister_reboot_notifier(&drbd_notifier);
3458
3459 /* first remove proc,
3460 * drbdsetup uses its presence to detect
3461 * whether DRBD is loaded.
3462 * If we were to get stuck in proc removal
3463 * while netlink was already deregistered,
3464 * some drbdsetup commands might wait forever
3465 * for an answer.
3466 */
3467 if (drbd_proc)
3468 remove_proc_entry("drbd", NULL);
3469
3470 drbd_nl_cleanup();
3471
3472 if (minor_table) {
3473 i = minor_count;
3474 while (i--)
3475 drbd_delete_device(i);
3476 drbd_destroy_mempools();
3477 }
3478
3479 kfree(minor_table);
3480
3481 unregister_blkdev(DRBD_MAJOR, "drbd");
3482
3483 printk(KERN_INFO "drbd: module cleanup done.\n");
3484 }
3485
3486 /**
3487 * drbd_congested() - Callback for pdflush
3488 * @congested_data: User data
3489 * @bdi_bits: Bits pdflush is currently interested in
3490 *
3491 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3492 */
3493 static int drbd_congested(void *congested_data, int bdi_bits)
3494 {
3495 struct drbd_conf *mdev = congested_data;
3496 struct request_queue *q;
3497 char reason = '-';
3498 int r = 0;
3499
3500 if (!may_inc_ap_bio(mdev)) {
3501 /* DRBD has frozen IO */
3502 r = bdi_bits;
3503 reason = 'd';
3504 goto out;
3505 }
3506
3507 if (get_ldev(mdev)) {
3508 q = bdev_get_queue(mdev->ldev->backing_bdev);
3509 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3510 put_ldev(mdev);
3511 if (r)
3512 reason = 'b';
3513 }
3514
3515 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3516 r |= (1 << BDI_async_congested);
3517 reason = reason == 'b' ? 'a' : 'n';
3518 }
3519
3520 out:
3521 mdev->congestion_reason = reason;
3522 return r;
3523 }
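/* congestion_reason legend (from the code above): 'd' = DRBD itself froze
 * IO, 'b' = backing device congested, 'n' = network (meta socket)
 * congested, 'a' = both backing device and network, '-' = not congested. */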
3524
3525 struct drbd_conf *drbd_new_device(unsigned int minor)
3526 {
3527 struct drbd_conf *mdev;
3528 struct gendisk *disk;
3529 struct request_queue *q;
3530
3531 /* GFP_KERNEL, we are outside of all write-out paths */
3532 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3533 if (!mdev)
3534 return NULL;
3535 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3536 goto out_no_cpumask;
3537
3538 mdev->minor = minor;
3539
3540 drbd_init_set_defaults(mdev);
3541
3542 q = blk_alloc_queue(GFP_KERNEL);
3543 if (!q)
3544 goto out_no_q;
3545 mdev->rq_queue = q;
3546 q->queuedata = mdev;
3547
3548 disk = alloc_disk(1);
3549 if (!disk)
3550 goto out_no_disk;
3551 mdev->vdisk = disk;
3552
3553 set_disk_ro(disk, true);
3554
3555 disk->queue = q;
3556 disk->major = DRBD_MAJOR;
3557 disk->first_minor = minor;
3558 disk->fops = &drbd_ops;
3559 sprintf(disk->disk_name, "drbd%d", minor);
3560 disk->private_data = mdev;
3561
3562 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3563 /* we have no partitions. we contain only ourselves. */
3564 mdev->this_bdev->bd_contains = mdev->this_bdev;
3565
3566 q->backing_dev_info.congested_fn = drbd_congested;
3567 q->backing_dev_info.congested_data = mdev;
3568
3569 blk_queue_make_request(q, drbd_make_request);
3570 /* Setting max_hw_sectors to the odd value of 8 KiB here
3571 triggers a max_bio_size message upon first attach or connect */
3572 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
3573 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3574 blk_queue_merge_bvec(q, drbd_merge_bvec);
3575 q->queue_lock = &mdev->req_lock;
3576
3577 mdev->md_io_page = alloc_page(GFP_KERNEL);
3578 if (!mdev->md_io_page)
3579 goto out_no_io_page;
3580
3581 if (drbd_bm_init(mdev))
3582 goto out_no_bitmap;
3583 /* no need to lock access, we are still initializing this minor device. */
3584 if (!tl_init(mdev))
3585 goto out_no_tl;
3586
3587 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3588 if (!mdev->app_reads_hash)
3589 goto out_no_app_reads;
3590
3591 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3592 if (!mdev->current_epoch)
3593 goto out_no_epoch;
3594
3595 INIT_LIST_HEAD(&mdev->current_epoch->list);
3596 mdev->epochs = 1;
3597
3598 return mdev;
3599
3600 /* out_whatever_else:
3601 kfree(mdev->current_epoch); */
3602 out_no_epoch:
3603 kfree(mdev->app_reads_hash);
3604 out_no_app_reads:
3605 tl_cleanup(mdev);
3606 out_no_tl:
3607 drbd_bm_cleanup(mdev);
3608 out_no_bitmap:
3609 __free_page(mdev->md_io_page);
3610 out_no_io_page:
3611 put_disk(disk);
3612 out_no_disk:
3613 blk_cleanup_queue(q);
3614 out_no_q:
3615 free_cpumask_var(mdev->cpu_mask);
3616 out_no_cpumask:
3617 kfree(mdev);
3618 return NULL;
3619 }
3620
3621 /* counterpart of drbd_new_device.
3622 * last part of drbd_delete_device. */
3623 void drbd_free_mdev(struct drbd_conf *mdev)
3624 {
3625 kfree(mdev->current_epoch);
3626 kfree(mdev->app_reads_hash);
3627 tl_cleanup(mdev);
3628 if (mdev->bitmap) /* should no longer be there. */
3629 drbd_bm_cleanup(mdev);
3630 __free_page(mdev->md_io_page);
3631 put_disk(mdev->vdisk);
3632 blk_cleanup_queue(mdev->rq_queue);
3633 free_cpumask_var(mdev->cpu_mask);
3634 drbd_free_tl_hash(mdev);
3635 kfree(mdev);
3636 }
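/*
 * Usage sketch (illustrative only, not part of the original source):
 * the configuration code is expected to pair the two functions above,
 * roughly along the lines of
 *
 *	mdev = drbd_new_device(minor);
 *	if (!mdev)
 *		goto fail;			allocation failed
 *	...
 *	drbd_free_mdev(mdev);			later, from drbd_delete_device()
 */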
3637
3638
3639 int __init drbd_init(void)
3640 {
3641 int err;
3642
3643 if (sizeof(struct p_handshake) != 80) {
3644 printk(KERN_ERR
3645 "drbd: never change the size or layout "
3646 "of the HandShake packet.\n");
3647 return -EINVAL;
3648 }
3649
3650 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
3651 printk(KERN_ERR
3652 "drbd: invalid minor_count (%d)\n", minor_count);
3653 #ifdef MODULE
3654 return -EINVAL;
3655 #else
3656 minor_count = 8;
3657 #endif
3658 }
3659
3660 err = drbd_nl_init();
3661 if (err)
3662 return err;
3663
3664 err = register_blkdev(DRBD_MAJOR, "drbd");
3665 if (err) {
3666 printk(KERN_ERR
3667 "drbd: unable to register block device major %d\n",
3668 DRBD_MAJOR);
3669 return err;
3670 }
3671
3672 register_reboot_notifier(&drbd_notifier);
3673
3674 /*
3675 * allocate all necessary structs
3676 */
3677 err = -ENOMEM;
3678
3679 init_waitqueue_head(&drbd_pp_wait);
3680
3681 drbd_proc = NULL; /* play safe for drbd_cleanup */
3682 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3683 GFP_KERNEL);
3684 if (!minor_table)
3685 goto Enomem;
3686
3687 err = drbd_create_mempools();
3688 if (err)
3689 goto Enomem;
3690
3691 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3692 if (!drbd_proc) {
3693 printk(KERN_ERR "drbd: unable to register proc file\n");
3694 goto Enomem;
3695 }
3696
3697 rwlock_init(&global_state_lock);
3698
3699 printk(KERN_INFO "drbd: initialized. "
3700 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3701 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3702 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3703 printk(KERN_INFO "drbd: registered as block device major %d\n",
3704 DRBD_MAJOR);
3705 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3706
3707 return 0; /* Success! */
3708
3709 Enomem:
3710 drbd_cleanup();
3711 if (err == -ENOMEM)
3712 /* currently always the case */
3713 printk(KERN_ERR "drbd: ran out of memory\n");
3714 else
3715 printk(KERN_ERR "drbd: initialization failure\n");
3716 return err;
3717 }
3718
3719 void drbd_free_bc(struct drbd_backing_dev *ldev)
3720 {
3721 if (ldev == NULL)
3722 return;
3723
3724 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3725 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3726
3727 kfree(ldev);
3728 }
3729
3730 void drbd_free_sock(struct drbd_conf *mdev)
3731 {
3732 if (mdev->data.socket) {
3733 mutex_lock(&mdev->data.mutex);
3734 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3735 sock_release(mdev->data.socket);
3736 mdev->data.socket = NULL;
3737 mutex_unlock(&mdev->data.mutex);
3738 }
3739 if (mdev->meta.socket) {
3740 mutex_lock(&mdev->meta.mutex);
3741 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3742 sock_release(mdev->meta.socket);
3743 mdev->meta.socket = NULL;
3744 mutex_unlock(&mdev->meta.mutex);
3745 }
3746 }
3747
3748
3749 void drbd_free_resources(struct drbd_conf *mdev)
3750 {
3751 crypto_free_hash(mdev->csums_tfm);
3752 mdev->csums_tfm = NULL;
3753 crypto_free_hash(mdev->verify_tfm);
3754 mdev->verify_tfm = NULL;
3755 crypto_free_hash(mdev->cram_hmac_tfm);
3756 mdev->cram_hmac_tfm = NULL;
3757 crypto_free_hash(mdev->integrity_w_tfm);
3758 mdev->integrity_w_tfm = NULL;
3759 crypto_free_hash(mdev->integrity_r_tfm);
3760 mdev->integrity_r_tfm = NULL;
3761
3762 drbd_free_sock(mdev);
3763
3764 __no_warn(local,
3765 drbd_free_bc(mdev->ldev);
3766 mdev->ldev = NULL;);
3767 }
3768
3769 /* meta data management */
3770
3771 struct meta_data_on_disk {
3772 u64 la_size; /* last agreed size. */
3773 u64 uuid[UI_SIZE]; /* UUIDs. */
3774 u64 device_uuid;
3775 u64 reserved_u64_1;
3776 u32 flags; /* MDF */
3777 u32 magic;
3778 u32 md_size_sect;
3779 u32 al_offset; /* offset to this block */
3780 u32 al_nr_extents; /* important for restoring the AL */
3781 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3782 u32 bm_offset; /* offset to the bitmap, from here */
3783 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3784 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3785 u32 reserved_u32[3];
3786
3787 } __packed;
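/*
 * Note (added, not in the original source): drbd_md_sync() below zeroes and
 * writes exactly 512 bytes, so this on-disk layout must fit into one sector.
 * A hypothetical compile-time check could look like
 *
 *	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) > 512);
 */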
3788
3789 /**
3790 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3791 * @mdev: DRBD device.
3792 */
3793 void drbd_md_sync(struct drbd_conf *mdev)
3794 {
3795 struct meta_data_on_disk *buffer;
3796 sector_t sector;
3797 int i;
3798
3799 del_timer(&mdev->md_sync_timer);
3800 /* timer may be rearmed by drbd_md_mark_dirty() now. */
3801 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3802 return;
3803
3804 /* We use D_FAILED here and not D_ATTACHING, because we try to write
3805 * metadata even if we detach due to a disk failure! */
3806 if (!get_ldev_if_state(mdev, D_FAILED))
3807 return;
3808
3809 buffer = drbd_md_get_buffer(mdev);
3810 if (!buffer)
3811 goto out;
3812
3813 memset(buffer, 0, 512);
3814
3815 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3816 for (i = UI_CURRENT; i < UI_SIZE; i++)
3817 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3818 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3819 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3820
3821 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3822 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3823 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3824 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3825 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3826
3827 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3828 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
3829
3830 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3831 sector = mdev->ldev->md.md_offset;
3832
3833 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3834 /* this was just a try anyway ... */
3835 dev_err(DEV, "meta data update failed!\n");
3836 drbd_chk_io_error(mdev, 1, true);
3837 }
3838
3839 /* Update mdev->ldev->md.la_size_sect,
3840 * since we just committed that value to the on-disk meta data. */
3841 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3842
3843 drbd_md_put_buffer(mdev);
3844 out:
3845 put_ldev(mdev);
3846 }
3847
3848 /**
3849 * drbd_md_read() - Reads in the meta data super block
3850 * @mdev: DRBD device.
3851 * @bdev: Device from which the meta data should be read in.
3852 *
3853 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3854 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3855 */
3856 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3857 {
3858 struct meta_data_on_disk *buffer;
3859 int i, rv = NO_ERROR;
3860
3861 if (!get_ldev_if_state(mdev, D_ATTACHING))
3862 return ERR_IO_MD_DISK;
3863
3864 buffer = drbd_md_get_buffer(mdev);
3865 if (!buffer)
3866 goto out;
3867
3868 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3869 /* NOTE: can't do normal error processing here as this is
3870 called BEFORE disk is attached */
3871 dev_err(DEV, "Error while reading metadata.\n");
3872 rv = ERR_IO_MD_DISK;
3873 goto err;
3874 }
3875
3876 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3877 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3878 rv = ERR_MD_INVALID;
3879 goto err;
3880 }
3881 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3882 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3883 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3884 rv = ERR_MD_INVALID;
3885 goto err;
3886 }
3887 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3888 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3889 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3890 rv = ERR_MD_INVALID;
3891 goto err;
3892 }
3893 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3894 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3895 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3896 rv = ERR_MD_INVALID;
3897 goto err;
3898 }
3899
3900 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3901 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3902 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3903 rv = ERR_MD_INVALID;
3904 goto err;
3905 }
3906
3907 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3908 for (i = UI_CURRENT; i < UI_SIZE; i++)
3909 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3910 bdev->md.flags = be32_to_cpu(buffer->flags);
3911 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3912 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3913
3914 spin_lock_irq(&mdev->req_lock);
3915 if (mdev->state.conn < C_CONNECTED) {
3916 int peer;
3917 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3918 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3919 mdev->peer_max_bio_size = peer;
3920 }
3921 spin_unlock_irq(&mdev->req_lock);
3922
3923 if (mdev->sync_conf.al_extents < 7)
3924 mdev->sync_conf.al_extents = 127;
3925
3926 err:
3927 drbd_md_put_buffer(mdev);
3928 out:
3929 put_ldev(mdev);
3930
3931 return rv;
3932 }
3933
3934 /**
3935 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3936 * @mdev: DRBD device.
3937 *
3938 * Call this function if you change anything that should be written to
3939 * the meta-data super block. This function sets MD_DIRTY, and starts a
3940 * timer that ensures drbd_md_sync() gets called within five seconds.
3941 */
3942 #ifdef DEBUG
3943 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3944 {
3945 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3946 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3947 mdev->last_md_mark_dirty.line = line;
3948 mdev->last_md_mark_dirty.func = func;
3949 }
3950 }
3951 #else
3952 void drbd_md_mark_dirty(struct drbd_conf *mdev)
3953 {
3954 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3955 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3956 }
3957 #endif
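/*
 * Deferred writeout chain (descriptive sketch of the code in this file):
 * drbd_md_mark_dirty() sets MD_DIRTY and arms md_sync_timer; if nobody calls
 * drbd_md_sync() within five seconds (one second in DEBUG builds),
 * md_sync_timer_fn() below queues w_md_sync() on the worker, which warns and
 * then calls drbd_md_sync() itself.
 */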
3958
3959 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3960 {
3961 int i;
3962
3963 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3964 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3965 }
3966
3967 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3968 {
3969 if (idx == UI_CURRENT) {
3970 if (mdev->state.role == R_PRIMARY)
3971 val |= 1;
3972 else
3973 val &= ~((u64)1);
3974
3975 drbd_set_ed_uuid(mdev, val);
3976 }
3977
3978 mdev->ldev->md.uuid[idx] = val;
3979 drbd_md_mark_dirty(mdev);
3980 }
3981
3982
3983 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3984 {
3985 if (mdev->ldev->md.uuid[idx]) {
3986 drbd_uuid_move_history(mdev);
3987 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3988 }
3989 _drbd_uuid_set(mdev, idx, val);
3990 }
3991
3992 /**
3993 * drbd_uuid_new_current() - Creates a new current UUID
3994 * @mdev: DRBD device.
3995 *
3996 * Creates a new current UUID, and rotates the old current UUID into
3997 * the bitmap slot. Causes an incremental resync upon next connect.
3998 */
3999 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
4000 {
4001 u64 val;
4002 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4003
4004 if (bm_uuid)
4005 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
4006
4007 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
4008
4009 get_random_bytes(&val, sizeof(u64));
4010 _drbd_uuid_set(mdev, UI_CURRENT, val);
4011 drbd_print_uuids(mdev, "new current UUID");
4012 /* get it to stable storage _now_ */
4013 drbd_md_sync(mdev);
4014 }
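/*
 * Illustrative sketch (not in the original source) of the rotation performed
 * by drbd_uuid_new_current() above:
 *
 *	before:  UI_CURRENT = C       UI_BITMAP = 0
 *	after:   UI_CURRENT = random  UI_BITMAP = C
 *
 * On the next connect the peers can recognize C as their common ancestor and
 * fall back to an incremental resync based on the bitmap.
 */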
4015
4016 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
4017 {
4018 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
4019 return;
4020
4021 if (val == 0) {
4022 drbd_uuid_move_history(mdev);
4023 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
4024 mdev->ldev->md.uuid[UI_BITMAP] = 0;
4025 } else {
4026 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4027 if (bm_uuid)
4028 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
4029
4030 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
4031 }
4032 drbd_md_mark_dirty(mdev);
4033 }
4034
4035 /**
4036 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4037 * @mdev: DRBD device.
4038 *
4039 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
4040 */
4041 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
4042 {
4043 int rv = -EIO;
4044
4045 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4046 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
4047 drbd_md_sync(mdev);
4048 drbd_bm_set_all(mdev);
4049
4050 rv = drbd_bm_write(mdev);
4051
4052 if (!rv) {
4053 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
4054 drbd_md_sync(mdev);
4055 }
4056
4057 put_ldev(mdev);
4058 }
4059
4060 return rv;
4061 }
4062
4063 /**
4064 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4065 * @mdev: DRBD device.
4066 *
4067 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
4068 */
4069 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
4070 {
4071 int rv = -EIO;
4072
4073 drbd_resume_al(mdev);
4074 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4075 drbd_bm_clear_all(mdev);
4076 rv = drbd_bm_write(mdev);
4077 put_ldev(mdev);
4078 }
4079
4080 return rv;
4081 }
4082
4083 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4084 {
4085 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
4086 int rv = -EIO;
4087
4088 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
4089
4090 if (get_ldev(mdev)) {
4091 drbd_bm_lock(mdev, work->why, work->flags);
4092 rv = work->io_fn(mdev);
4093 drbd_bm_unlock(mdev);
4094 put_ldev(mdev);
4095 }
4096
4097 clear_bit(BITMAP_IO, &mdev->flags);
4098 smp_mb__after_clear_bit();
4099 wake_up(&mdev->misc_wait);
4100
4101 if (work->done)
4102 work->done(mdev, rv);
4103
4104 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
4105 work->why = NULL;
4106 work->flags = 0;
4107
4108 return 1;
4109 }
4110
4111 void drbd_ldev_destroy(struct drbd_conf *mdev)
4112 {
4113 lc_destroy(mdev->resync);
4114 mdev->resync = NULL;
4115 lc_destroy(mdev->act_log);
4116 mdev->act_log = NULL;
4117 __no_warn(local,
4118 drbd_free_bc(mdev->ldev);
4119 mdev->ldev = NULL;);
4120
4121 if (mdev->md_io_tmpp) {
4122 __free_page(mdev->md_io_tmpp);
4123 mdev->md_io_tmpp = NULL;
4124 }
4125 clear_bit(GO_DISKLESS, &mdev->flags);
4126 }
4127
4128 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4129 {
4130 D_ASSERT(mdev->state.disk == D_FAILED);
4131 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4132 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
4133 * the protected members anymore, though, so once put_ldev reaches zero
4134 * again, it will be safe to free them. */
4135 drbd_force_state(mdev, NS(disk, D_DISKLESS));
4136 return 1;
4137 }
4138
4139 void drbd_go_diskless(struct drbd_conf *mdev)
4140 {
4141 D_ASSERT(mdev->state.disk == D_FAILED);
4142 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
4143 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
4144 }
4145
4146 /**
4147 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4148 * @mdev: DRBD device.
4149 * @io_fn: IO callback to be called when bitmap IO is possible
4150 * @done: callback to be called after the bitmap IO was performed
4151 * @why: Descriptive text of the reason for doing the IO
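 * @flags: locking flags passed on to drbd_bm_lock(), see enum bm_flag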
4152 *
4153 * While IO on the bitmap happens we freeze application IO, thus ensuring
4154 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4155 * called from worker context. It MUST NOT be used while a previous such
4156 * work is still pending!
4157 */
4158 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4159 int (*io_fn)(struct drbd_conf *),
4160 void (*done)(struct drbd_conf *, int),
4161 char *why, enum bm_flag flags)
4162 {
4163 D_ASSERT(current == mdev->worker.task);
4164
4165 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4166 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4167 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4168 if (mdev->bm_io_work.why)
4169 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4170 why, mdev->bm_io_work.why);
4171
4172 mdev->bm_io_work.io_fn = io_fn;
4173 mdev->bm_io_work.done = done;
4174 mdev->bm_io_work.why = why;
4175 mdev->bm_io_work.flags = flags;
4176
4177 spin_lock_irq(&mdev->req_lock);
4178 set_bit(BITMAP_IO, &mdev->flags);
4179 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
4180 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
4181 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
4182 }
4183 spin_unlock_irq(&mdev->req_lock);
4184 }
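/*
 * Usage sketch (illustrative only; the "why" string and flags value are made
 * up): from worker context, e.g. to set all bits and flush the whole bitmap,
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
 *			     "set_n_write from example", BM_LOCKED_SET_ALLOWED);
 *
 * The actual IO then happens in w_bitmap_io() above, once application IO
 * (ap_bio_cnt) has drained to zero.
 */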
4185
4186 /**
4187 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4188 * @mdev: DRBD device.
4189 * @io_fn: IO callback to be called when bitmap IO is possible
4190 * @why: Descriptive text of the reason for doing the IO
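 * @flags: locking flags, see enum bm_flag; with BM_LOCKED_SET_ALLOWED the
 *	application IO freeze is skipped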
4191 *
4192 * Freezes application IO while the actual IO operation runs. This
4193 * function MAY NOT be called from worker context.
4194 */
4195 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4196 char *why, enum bm_flag flags)
4197 {
4198 int rv;
4199
4200 D_ASSERT(current != mdev->worker.task);
4201
4202 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4203 drbd_suspend_io(mdev);
4204
4205 drbd_bm_lock(mdev, why, flags);
4206 rv = io_fn(mdev);
4207 drbd_bm_unlock(mdev);
4208
4209 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4210 drbd_resume_io(mdev);
4211
4212 return rv;
4213 }
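/*
 * Usage sketch (illustrative only; the "why" string is made up): from any
 * context other than the worker, the synchronous variant is used instead:
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
 *			    "clear_n_write from example", BM_LOCKED_SET_ALLOWED);
 *
 * Any flags value without BM_LOCKED_SET_ALLOWED additionally suspends
 * application IO around io_fn().
 */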
4214
4215 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4216 {
4217 if ((mdev->ldev->md.flags & flag) != flag) {
4218 drbd_md_mark_dirty(mdev);
4219 mdev->ldev->md.flags |= flag;
4220 }
4221 }
4222
4223 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4224 {
4225 if ((mdev->ldev->md.flags & flag) != 0) {
4226 drbd_md_mark_dirty(mdev);
4227 mdev->ldev->md.flags &= ~flag;
4228 }
4229 }
4230 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4231 {
4232 return (bdev->md.flags & flag) != 0;
4233 }
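/*
 * Usage sketch for the three flag helpers above (mirrors what
 * drbd_bmio_set_n_write() does; illustrative only):
 *
 *	if (get_ldev_if_state(mdev, D_ATTACHING)) {
 *		drbd_md_set_flag(mdev, MDF_FULL_SYNC);
 *		drbd_md_sync(mdev);
 *		...
 *		drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
 *		drbd_md_sync(mdev);
 *		put_ldev(mdev);
 *	}
 *
 * The __must_hold(local) annotations mean the caller must hold a local disk
 * reference (get_ldev()/put_ldev()) while touching mdev->ldev->md.flags.
 */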
4234
4235 static void md_sync_timer_fn(unsigned long data)
4236 {
4237 struct drbd_conf *mdev = (struct drbd_conf *) data;
4238
4239 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4240 }
4241
4242 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4243 {
4244 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
4245 #ifdef DEBUG
4246 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4247 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4248 #endif
4249 drbd_md_sync(mdev);
4250 return 1;
4251 }
4252
4253 #ifdef CONFIG_DRBD_FAULT_INJECTION
4254 /* Fault insertion support including random number generator shamelessly
4255 * stolen from kernel/rcutorture.c */
4256 struct fault_random_state {
4257 unsigned long state;
4258 unsigned long count;
4259 };
4260
4261 #define FAULT_RANDOM_MULT 39916801 /* prime */
4262 #define FAULT_RANDOM_ADD 479001701 /* prime */
4263 #define FAULT_RANDOM_REFRESH 10000
4264
4265 /*
4266 * Crude but fast random-number generator. Uses a linear congruential
4267 * generator, with occasional help from get_random_bytes().
4268 */
4269 static unsigned long
4270 _drbd_fault_random(struct fault_random_state *rsp)
4271 {
4272 long refresh;
4273
4274 if (!rsp->count--) {
4275 get_random_bytes(&refresh, sizeof(refresh));
4276 rsp->state += refresh;
4277 rsp->count = FAULT_RANDOM_REFRESH;
4278 }
4279 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4280 return swahw32(rsp->state);
4281 }
4282
4283 static char *
4284 _drbd_fault_str(unsigned int type) {
4285 static char *_faults[] = {
4286 [DRBD_FAULT_MD_WR] = "Meta-data write",
4287 [DRBD_FAULT_MD_RD] = "Meta-data read",
4288 [DRBD_FAULT_RS_WR] = "Resync write",
4289 [DRBD_FAULT_RS_RD] = "Resync read",
4290 [DRBD_FAULT_DT_WR] = "Data write",
4291 [DRBD_FAULT_DT_RD] = "Data read",
4292 [DRBD_FAULT_DT_RA] = "Data read ahead",
4293 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
4294 [DRBD_FAULT_AL_EE] = "EE allocation",
4295 [DRBD_FAULT_RECEIVE] = "receive data corruption",
4296 };
4297
4298 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4299 }
4300
4301 unsigned int
4302 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4303 {
4304 static struct fault_random_state rrs = {0, 0};
4305
4306 unsigned int ret = (
4307 (fault_devs == 0 ||
4308 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4309 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4310
4311 if (ret) {
4312 fault_count++;
4313
4314 if (__ratelimit(&drbd_ratelimit_state))
4315 dev_warn(DEV, "***Simulating %s failure\n",
4316 _drbd_fault_str(type));
4317 }
4318
4319 return ret;
4320 }
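/*
 * Usage sketch (illustrative only; call sites normally go through a
 * drbd_insert_fault() wrapper in drbd_int.h rather than calling this
 * directly):
 *
 *	if (_drbd_insert_fault(mdev, DRBD_FAULT_DT_WR))
 *		pretend the data write failed, e.g. complete the bio with -EIO
 *
 * fault_rate is interpreted as a percentage, fault_devs as a bitmask of
 * minor numbers to target (0 means all devices).
 */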
4321 #endif
4322
4323 const char *drbd_buildtag(void)
4324 {
4325 /* DRBD built from external sources carries a reference to the
4326 git hash of the source code here. */
4327
4328 static char buildtag[38] = "\0uilt-in";
4329
4330 if (buildtag[0] == 0) {
4331 #ifdef CONFIG_MODULES
4332 if (THIS_MODULE != NULL)
4333 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4334 else
4335 #endif
4336 buildtag[0] = 'b';
4337 }
4338
4339 return buildtag;
4340 }
4341
4342 module_init(drbd_init)
4343 module_exit(drbd_cleanup)
4344
4345 EXPORT_SYMBOL(drbd_conn_str);
4346 EXPORT_SYMBOL(drbd_role_str);
4347 EXPORT_SYMBOL(drbd_disk_str);
4348 EXPORT_SYMBOL(drbd_set_st_err_str);