drivers/block/drbd/drbd_main.c
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
29#include <linux/module.h>
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/mutex.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
67static DEFINE_MUTEX(drbd_main_mutex);
68int drbdd_init(struct drbd_thread *);
69int drbd_worker(struct drbd_thread *);
70int drbd_asender(struct drbd_thread *);
71
72int drbd_init(void);
73static int drbd_open(struct block_device *bdev, fmode_t mode);
74static int drbd_release(struct gendisk *gd, fmode_t mode);
75static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79static void md_sync_timer_fn(unsigned long data);
80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82
83MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84 "Lars Ellenberg <lars@linbit.com>");
85MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86MODULE_VERSION(REL_VERSION);
87MODULE_LICENSE("GPL");
88MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
89 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
90MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
91
92#include <linux/moduleparam.h>
93/* allow_open_on_secondary */
94MODULE_PARM_DESC(allow_oos, "DONT USE!");
95/* thanks to these macros, if compiled into the kernel (not-module),
96 * this becomes the boot parameter drbd.minor_count */
97module_param(minor_count, uint, 0444);
98module_param(disable_sendpage, bool, 0644);
99module_param(allow_oos, bool, 0);
100module_param(cn_idx, uint, 0444);
101module_param(proc_details, int, 0644);
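/* When drbd is built into the kernel instead of as a module, each of the
 * parameters above can also be given on the kernel command line with the
 * usual "drbd." prefix, e.g. drbd.minor_count=16 (illustrative value). */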
102
103#ifdef CONFIG_DRBD_FAULT_INJECTION
104int enable_faults;
105int fault_rate;
106static int fault_count;
107int fault_devs;
108/* bitmap of enabled faults */
109module_param(enable_faults, int, 0664);
110/* fault rate % value - applies to all enabled faults */
111module_param(fault_rate, int, 0664);
112/* count of faults inserted */
113module_param(fault_count, int, 0664);
114/* bitmap of devices to insert faults on */
115module_param(fault_devs, int, 0644);
116#endif
117
118/* module parameter, defined */
119unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
120int disable_sendpage;
121int allow_oos;
122unsigned int cn_idx = CN_IDX_DRBD;
123int proc_details; /* Detail level in proc drbd*/
124
125/* Module parameter for setting the user mode helper program
126 * to run. Default is /sbin/drbdadm */
127char usermode_helper[80] = "/sbin/drbdadm";
128
129module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
130
131/* in 2.6.x, our device mapping and config info contains our virtual gendisks
132 * as member "struct gendisk *vdisk;"
133 */
134struct drbd_conf **minor_table;
135
136struct kmem_cache *drbd_request_cache;
137struct kmem_cache *drbd_ee_cache; /* epoch entries */
138struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
139struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
140mempool_t *drbd_request_mempool;
141mempool_t *drbd_ee_mempool;
142
143/* I do not use a standard mempool, because:
144 1) I want to hand out the pre-allocated objects first.
145 2) I want to be able to interrupt sleeping allocation with a signal.
146 Note: This is a single linked list, the next pointer is the private
147 member of struct page.
148 */
149struct page *drbd_pp_pool;
150spinlock_t drbd_pp_lock;
151int drbd_pp_vacant;
152wait_queue_head_t drbd_pp_wait;
153
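/* DEFINE_RATELIMIT_STATE(name, interval, burst): with an interval of 5 * HZ
 * and a burst of 5 this allows at most five rate-limited messages per
 * five-second window. */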
154DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
155
156static const struct block_device_operations drbd_ops = {
157 .owner = THIS_MODULE,
158 .open = drbd_open,
159 .release = drbd_release,
160};
161
162#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
163
164#ifdef __CHECKER__
165/* When checking with sparse, and this is an inline function, sparse will
 166 give tons of false positives. When this is a real function, sparse works.
167 */
168int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
169{
170 int io_allowed;
171
172 atomic_inc(&mdev->local_cnt);
173 io_allowed = (mdev->state.disk >= mins);
174 if (!io_allowed) {
175 if (atomic_dec_and_test(&mdev->local_cnt))
176 wake_up(&mdev->misc_wait);
177 }
178 return io_allowed;
179}
180
181#endif
182
183/**
184 * DOC: The transfer log
185 *
186 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
187 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
188 * of the list. There is always at least one &struct drbd_tl_epoch object.
189 *
190 * Each &struct drbd_tl_epoch has a circular double linked list of requests
191 * attached.
192 */
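/* In memory the transfer log therefore looks like
 *
 *   mdev->oldest_tle -> epoch -> ... -> mdev->newest_tle -> NULL
 *
 * where each drbd_tl_epoch keeps the write requests of its epoch on its
 * ->requests list until the corresponding barrier ack arrives (tl_release). */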
193static int tl_init(struct drbd_conf *mdev)
194{
195 struct drbd_tl_epoch *b;
196
197 /* during device minor initialization, we may well use GFP_KERNEL */
198 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
199 if (!b)
200 return 0;
201 INIT_LIST_HEAD(&b->requests);
202 INIT_LIST_HEAD(&b->w.list);
203 b->next = NULL;
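/* 4711 is, as far as the protocol is concerned, just an arbitrary non-zero
 * start value; the only hard requirement (see _tl_add_barrier) is that a
 * barrier number of 0 is never sent. */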
204 b->br_number = 4711;
205 b->n_writes = 0;
206 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
207
208 mdev->oldest_tle = b;
209 mdev->newest_tle = b;
210 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
211
212 mdev->tl_hash = NULL;
213 mdev->tl_hash_s = 0;
214
215 return 1;
216}
217
218static void tl_cleanup(struct drbd_conf *mdev)
219{
220 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
221 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
222 kfree(mdev->oldest_tle);
223 mdev->oldest_tle = NULL;
224 kfree(mdev->unused_spare_tle);
225 mdev->unused_spare_tle = NULL;
226 kfree(mdev->tl_hash);
227 mdev->tl_hash = NULL;
228 mdev->tl_hash_s = 0;
229}
230
231static void drbd_free_tl_hash(struct drbd_conf *mdev)
232{
233 struct hlist_head *h;
234
235 spin_lock_irq(&mdev->req_lock);
236
237 if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
238 spin_unlock_irq(&mdev->req_lock);
239 return;
240 }
241 /* paranoia code */
242 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
243 if (h->first)
244 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
245 (int)(h - mdev->ee_hash), h->first);
246 kfree(mdev->ee_hash);
247 mdev->ee_hash = NULL;
248 mdev->ee_hash_s = 0;
249
250 /* paranoia code */
251 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
252 if (h->first)
253 dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
254 (int)(h - mdev->tl_hash), h->first);
255 kfree(mdev->tl_hash);
256 mdev->tl_hash = NULL;
257 mdev->tl_hash_s = 0;
258 spin_unlock_irq(&mdev->req_lock);
259}
260
261/**
262 * _tl_add_barrier() - Adds a barrier to the transfer log
263 * @mdev: DRBD device.
264 * @new: Barrier to be added before the current head of the TL.
265 *
266 * The caller must hold the req_lock.
267 */
268void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
269{
270 struct drbd_tl_epoch *newest_before;
271
272 INIT_LIST_HEAD(&new->requests);
273 INIT_LIST_HEAD(&new->w.list);
274 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
275 new->next = NULL;
276 new->n_writes = 0;
277
278 newest_before = mdev->newest_tle;
279 /* never send a barrier number == 0, because that is special-cased
280 * when using TCQ for our write ordering code */
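/* GCC's "a ?: b" extension: use br_number + 1 unless that just wrapped
 * around to 0, in which case fall back to 1. */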
281 new->br_number = (newest_before->br_number+1) ?: 1;
282 if (mdev->newest_tle != new) {
283 mdev->newest_tle->next = new;
284 mdev->newest_tle = new;
285 }
286}
287
288/**
289 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
290 * @mdev: DRBD device.
291 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
292 * @set_size: Expected number of requests before that barrier.
293 *
294 * In case the passed barrier_nr or set_size does not match the oldest
295 * &struct drbd_tl_epoch objects this function will cause a termination
296 * of the connection.
297 */
298void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
299 unsigned int set_size)
300{
301 struct drbd_tl_epoch *b, *nob; /* next old barrier */
302 struct list_head *le, *tle;
303 struct drbd_request *r;
304
305 spin_lock_irq(&mdev->req_lock);
306
307 b = mdev->oldest_tle;
308
309 /* first some paranoia code */
310 if (b == NULL) {
311 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
312 barrier_nr);
313 goto bail;
314 }
315 if (b->br_number != barrier_nr) {
316 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
317 barrier_nr, b->br_number);
318 goto bail;
319 }
320 if (b->n_writes != set_size) {
321 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
322 barrier_nr, set_size, b->n_writes);
323 goto bail;
324 }
325
326 /* Clean up list of requests processed during current epoch */
327 list_for_each_safe(le, tle, &b->requests) {
328 r = list_entry(le, struct drbd_request, tl_requests);
329 _req_mod(r, barrier_acked);
330 }
331 /* There could be requests on the list waiting for completion
332 of the write to the local disk. To avoid corruptions of
 333 slab's data structures we have to remove the list's head.
334
335 Also there could have been a barrier ack out of sequence, overtaking
336 the write acks - which would be a bug and violating write ordering.
337 To not deadlock in case we lose connection while such requests are
338 still pending, we need some way to find them for the
 339 _req_mod(connection_lost_while_pending).
340
341 These have been list_move'd to the out_of_sequence_requests list in
342 _req_mod(, barrier_acked) above.
343 */
344 list_del_init(&b->requests);
345
346 nob = b->next;
347 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
348 _tl_add_barrier(mdev, b);
349 if (nob)
350 mdev->oldest_tle = nob;
351 /* if nob == NULL b was the only barrier, and becomes the new
352 barrier. Therefore mdev->oldest_tle points already to b */
353 } else {
354 D_ASSERT(nob != NULL);
355 mdev->oldest_tle = nob;
356 kfree(b);
357 }
358
359 spin_unlock_irq(&mdev->req_lock);
360 dec_ap_pending(mdev);
361
362 return;
363
364bail:
365 spin_unlock_irq(&mdev->req_lock);
366 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
367}
368
369
370/**
371 * _tl_restart() - Walks the transfer log, and applies an action to all requests
372 * @mdev: DRBD device.
373 * @what: The action/event to perform with all request objects
374 *
375 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
376 * restart_frozen_disk_io.
377 */
378static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
379{
380 struct drbd_tl_epoch *b, *tmp, **pn;
381 struct list_head *le, *tle, carry_reads;
382 struct drbd_request *req;
383 int rv, n_writes, n_reads;
384
385 b = mdev->oldest_tle;
386 pn = &mdev->oldest_tle;
387 while (b) {
388 n_writes = 0;
389 n_reads = 0;
390 INIT_LIST_HEAD(&carry_reads);
391 list_for_each_safe(le, tle, &b->requests) {
392 req = list_entry(le, struct drbd_request, tl_requests);
393 rv = _req_mod(req, what);
394
395 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
396 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
397 }
398 tmp = b->next;
399
400 if (n_writes) {
401 if (what == resend) {
402 b->n_writes = n_writes;
403 if (b->w.cb == NULL) {
404 b->w.cb = w_send_barrier;
405 inc_ap_pending(mdev);
406 set_bit(CREATE_BARRIER, &mdev->flags);
407 }
408
409 drbd_queue_work(&mdev->data.work, &b->w);
410 }
411 pn = &b->next;
412 } else {
413 if (n_reads)
414 list_add(&carry_reads, &b->requests);
415 /* there could still be requests on that ring list,
416 * in case local io is still pending */
417 list_del(&b->requests);
418
419 /* dec_ap_pending corresponding to queue_barrier.
420 * the newest barrier may not have been queued yet,
421 * in which case w.cb is still NULL. */
422 if (b->w.cb != NULL)
423 dec_ap_pending(mdev);
424
425 if (b == mdev->newest_tle) {
426 /* recycle, but reinit! */
427 D_ASSERT(tmp == NULL);
428 INIT_LIST_HEAD(&b->requests);
429 list_splice(&carry_reads, &b->requests);
430 INIT_LIST_HEAD(&b->w.list);
431 b->w.cb = NULL;
432 b->br_number = net_random();
433 b->n_writes = 0;
434
435 *pn = b;
436 break;
437 }
438 *pn = tmp;
439 kfree(b);
440 }
441 b = tmp;
442 list_splice(&carry_reads, &b->requests);
443 }
444}
445
446
447/**
448 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
449 * @mdev: DRBD device.
450 *
451 * This is called after the connection to the peer was lost. The storage covered
 452 * by the requests on the transfer log gets marked as out of sync. Called from the
453 * receiver thread and the worker thread.
454 */
455void tl_clear(struct drbd_conf *mdev)
456{
457 struct list_head *le, *tle;
458 struct drbd_request *r;
459
460 spin_lock_irq(&mdev->req_lock);
461
462 _tl_restart(mdev, connection_lost_while_pending);
463
464 /* we expect this list to be empty. */
465 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
466
467 /* but just in case, clean it up anyways! */
468 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
469 r = list_entry(le, struct drbd_request, tl_requests);
470 /* It would be nice to complete outside of spinlock.
471 * But this is easier for now. */
472 _req_mod(r, connection_lost_while_pending);
473 }
474
475 /* ensure bit indicating barrier is required is clear */
476 clear_bit(CREATE_BARRIER, &mdev->flags);
477
478 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
479
480 spin_unlock_irq(&mdev->req_lock);
481}
482
483void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
484{
485 spin_lock_irq(&mdev->req_lock);
486 _tl_restart(mdev, what);
487 spin_unlock_irq(&mdev->req_lock);
488}
489
490/**
491 * cl_wide_st_chg() - true if the state change is a cluster wide one
492 * @mdev: DRBD device.
493 * @os: old (current) state.
494 * @ns: new (wanted) state.
495 */
496static int cl_wide_st_chg(struct drbd_conf *mdev,
497 union drbd_state os, union drbd_state ns)
498{
499 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
500 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
501 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
502 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
503 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
504 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
505 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
506}
507
508enum drbd_state_rv
509drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
510 union drbd_state mask, union drbd_state val)
511{
512 unsigned long flags;
513 union drbd_state os, ns;
514 enum drbd_state_rv rv;
515
516 spin_lock_irqsave(&mdev->req_lock, flags);
517 os = mdev->state;
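/* mask/val describe a partial state update: only the fields selected by
 * mask are replaced with the bits from val, everything else is carried
 * over from os. E.g. NS(conn, C_DISCONNECTING) builds a mask/val pair
 * that touches nothing but the conn field. */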
518 ns.i = (os.i & ~mask.i) | val.i;
519 rv = _drbd_set_state(mdev, ns, f, NULL);
520 ns = mdev->state;
521 spin_unlock_irqrestore(&mdev->req_lock, flags);
522
523 return rv;
524}
525
526/**
527 * drbd_force_state() - Impose a change which happens outside our control on our state
528 * @mdev: DRBD device.
529 * @mask: mask of state bits to change.
530 * @val: value of new state bits.
531 */
532void drbd_force_state(struct drbd_conf *mdev,
533 union drbd_state mask, union drbd_state val)
534{
535 drbd_change_state(mdev, CS_HARD, mask, val);
536}
537
538static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
539static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
540 union drbd_state,
541 union drbd_state);
542static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
543 union drbd_state ns, const char **warn_sync_abort);
544int drbd_send_state_req(struct drbd_conf *,
545 union drbd_state, union drbd_state);
546
547static enum drbd_state_rv
548_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
549 union drbd_state val)
550{
551 union drbd_state os, ns;
552 unsigned long flags;
553 enum drbd_state_rv rv;
554
555 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
556 return SS_CW_SUCCESS;
557
558 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
559 return SS_CW_FAILED_BY_PEER;
560
561 rv = 0;
562 spin_lock_irqsave(&mdev->req_lock, flags);
563 os = mdev->state;
564 ns.i = (os.i & ~mask.i) | val.i;
565 ns = sanitize_state(mdev, os, ns, NULL);
566
567 if (!cl_wide_st_chg(mdev, os, ns))
568 rv = SS_CW_NO_NEED;
569 if (!rv) {
570 rv = is_valid_state(mdev, ns);
571 if (rv == SS_SUCCESS) {
572 rv = is_valid_state_transition(mdev, ns, os);
573 if (rv == SS_SUCCESS)
574 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
575 }
576 }
577 spin_unlock_irqrestore(&mdev->req_lock, flags);
578
579 return rv;
580}
581
582/**
 583 * drbd_req_state() - Perform a possibly cluster-wide state change
584 * @mdev: DRBD device.
585 * @mask: mask of state bits to change.
586 * @val: value of new state bits.
587 * @f: flags
588 *
589 * Should not be called directly, use drbd_request_state() or
590 * _drbd_request_state().
591 */
592static enum drbd_state_rv
593drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
594 union drbd_state val, enum chg_state_flags f)
595{
596 struct completion done;
597 unsigned long flags;
598 union drbd_state os, ns;
599 enum drbd_state_rv rv;
600
601 init_completion(&done);
602
603 if (f & CS_SERIALIZE)
604 mutex_lock(&mdev->state_mutex);
605
606 spin_lock_irqsave(&mdev->req_lock, flags);
607 os = mdev->state;
608 ns.i = (os.i & ~mask.i) | val.i;
609 ns = sanitize_state(mdev, os, ns, NULL);
610
611 if (cl_wide_st_chg(mdev, os, ns)) {
612 rv = is_valid_state(mdev, ns);
613 if (rv == SS_SUCCESS)
614 rv = is_valid_state_transition(mdev, ns, os);
615 spin_unlock_irqrestore(&mdev->req_lock, flags);
616
617 if (rv < SS_SUCCESS) {
618 if (f & CS_VERBOSE)
619 print_st_err(mdev, os, ns, rv);
620 goto abort;
621 }
622
623 drbd_state_lock(mdev);
624 if (!drbd_send_state_req(mdev, mask, val)) {
625 drbd_state_unlock(mdev);
626 rv = SS_CW_FAILED_BY_PEER;
627 if (f & CS_VERBOSE)
628 print_st_err(mdev, os, ns, rv);
629 goto abort;
630 }
631
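/* _req_st_cond() keeps returning SS_UNKNOWN_ERROR (0) while the peer's
 * answer to the cluster wide state change is still outstanding, so this
 * wait_event() sleeps until a definite positive or negative result is in. */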
632 wait_event(mdev->state_wait,
633 (rv = _req_st_cond(mdev, mask, val)));
634
635 if (rv < SS_SUCCESS) {
636 drbd_state_unlock(mdev);
637 if (f & CS_VERBOSE)
638 print_st_err(mdev, os, ns, rv);
639 goto abort;
640 }
641 spin_lock_irqsave(&mdev->req_lock, flags);
642 os = mdev->state;
643 ns.i = (os.i & ~mask.i) | val.i;
644 rv = _drbd_set_state(mdev, ns, f, &done);
645 drbd_state_unlock(mdev);
646 } else {
647 rv = _drbd_set_state(mdev, ns, f, &done);
648 }
649
650 spin_unlock_irqrestore(&mdev->req_lock, flags);
651
652 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
653 D_ASSERT(current != mdev->worker.task);
654 wait_for_completion(&done);
655 }
656
657abort:
658 if (f & CS_SERIALIZE)
659 mutex_unlock(&mdev->state_mutex);
660
661 return rv;
662}
663
664/**
665 * _drbd_request_state() - Request a state change (with flags)
666 * @mdev: DRBD device.
667 * @mask: mask of state bits to change.
668 * @val: value of new state bits.
669 * @f: flags
670 *
671 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
672 * flag, or when logging of failed state change requests is not desired.
673 */
674enum drbd_state_rv
675_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
676 union drbd_state val, enum chg_state_flags f)
677{
678 enum drbd_state_rv rv;
679
680 wait_event(mdev->state_wait,
681 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
682
683 return rv;
684}
685
686static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
687{
688 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
689 name,
690 drbd_conn_str(ns.conn),
691 drbd_role_str(ns.role),
692 drbd_role_str(ns.peer),
693 drbd_disk_str(ns.disk),
694 drbd_disk_str(ns.pdsk),
695 is_susp(ns) ? 's' : 'r',
696 ns.aftr_isp ? 'a' : '-',
697 ns.peer_isp ? 'p' : '-',
698 ns.user_isp ? 'u' : '-'
699 );
700}
701
702void print_st_err(struct drbd_conf *mdev, union drbd_state os,
703 union drbd_state ns, enum drbd_state_rv err)
704{
705 if (err == SS_IN_TRANSIENT_STATE)
706 return;
707 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
708 print_st(mdev, " state", os);
709 print_st(mdev, "wanted", ns);
710}
711
712
713/**
714 * is_valid_state() - Returns an SS_ error code if ns is not valid
715 * @mdev: DRBD device.
716 * @ns: State to consider.
717 */
718static enum drbd_state_rv
719is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
720{
721 /* See drbd_state_sw_errors in drbd_strings.c */
722
723 enum drbd_fencing_p fp;
724 enum drbd_state_rv rv = SS_SUCCESS;
725
726 fp = FP_DONT_CARE;
727 if (get_ldev(mdev)) {
728 fp = mdev->ldev->dc.fencing;
729 put_ldev(mdev);
730 }
731
732 if (get_net_conf(mdev)) {
733 if (!mdev->net_conf->two_primaries &&
734 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
735 rv = SS_TWO_PRIMARIES;
736 put_net_conf(mdev);
737 }
738
739 if (rv <= 0)
740 /* already found a reason to abort */;
741 else if (ns.role == R_SECONDARY && mdev->open_cnt)
742 rv = SS_DEVICE_IN_USE;
743
744 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
745 rv = SS_NO_UP_TO_DATE_DISK;
746
747 else if (fp >= FP_RESOURCE &&
748 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
749 rv = SS_PRIMARY_NOP;
750
751 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
752 rv = SS_NO_UP_TO_DATE_DISK;
753
754 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
755 rv = SS_NO_LOCAL_DISK;
756
757 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
758 rv = SS_NO_REMOTE_DISK;
759
760 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
761 rv = SS_NO_UP_TO_DATE_DISK;
762
763 else if ((ns.conn == C_CONNECTED ||
764 ns.conn == C_WF_BITMAP_S ||
765 ns.conn == C_SYNC_SOURCE ||
766 ns.conn == C_PAUSED_SYNC_S) &&
767 ns.disk == D_OUTDATED)
768 rv = SS_CONNECTED_OUTDATES;
769
770 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
771 (mdev->sync_conf.verify_alg[0] == 0))
772 rv = SS_NO_VERIFY_ALG;
773
774 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
775 mdev->agreed_pro_version < 88)
776 rv = SS_NOT_SUPPORTED;
777
778 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
779 rv = SS_CONNECTED_OUTDATES;
780
781 return rv;
782}
783
784/**
785 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
786 * @mdev: DRBD device.
787 * @ns: new state.
788 * @os: old state.
789 */
790static enum drbd_state_rv
791is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
792 union drbd_state os)
793{
794 enum drbd_state_rv rv = SS_SUCCESS;
795
796 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
797 os.conn > C_CONNECTED)
798 rv = SS_RESYNC_RUNNING;
799
800 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
801 rv = SS_ALREADY_STANDALONE;
802
803 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
804 rv = SS_IS_DISKLESS;
805
806 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
807 rv = SS_NO_NET_CONFIG;
808
809 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
810 rv = SS_LOWER_THAN_OUTDATED;
811
812 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
813 rv = SS_IN_TRANSIENT_STATE;
814
815 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
816 rv = SS_IN_TRANSIENT_STATE;
817
818 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
819 rv = SS_NEED_CONNECTION;
820
821 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
822 ns.conn != os.conn && os.conn > C_CONNECTED)
823 rv = SS_RESYNC_RUNNING;
824
825 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
826 os.conn < C_CONNECTED)
827 rv = SS_NEED_CONNECTION;
828
829 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
830 && os.conn < C_WF_REPORT_PARAMS)
831 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
832
833 return rv;
834}
835
836/**
837 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
838 * @mdev: DRBD device.
839 * @os: old state.
840 * @ns: new state.
841 * @warn_sync_abort:
842 *
 843 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
844 * to D_UNKNOWN. This rule and many more along those lines are in this function.
845 */
846static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
847 union drbd_state ns, const char **warn_sync_abort)
848{
849 enum drbd_fencing_p fp;
850 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
851
852 fp = FP_DONT_CARE;
853 if (get_ldev(mdev)) {
854 fp = mdev->ldev->dc.fencing;
855 put_ldev(mdev);
856 }
857
858 /* Disallow Network errors to configure a device's network part */
859 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
860 os.conn <= C_DISCONNECTING)
861 ns.conn = os.conn;
862
863 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
864 * If you try to go into some Sync* state, that shall fail (elsewhere). */
865 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
866 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
867 ns.conn = os.conn;
868
869 /* we cannot fail (again) if we already detached */
870 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
871 ns.disk = D_DISKLESS;
872
873 /* if we are only D_ATTACHING yet,
874 * we can (and should) go directly to D_DISKLESS. */
875 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
876 ns.disk = D_DISKLESS;
877
878 /* After C_DISCONNECTING only C_STANDALONE may follow */
879 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
880 ns.conn = os.conn;
881
882 if (ns.conn < C_CONNECTED) {
883 ns.peer_isp = 0;
884 ns.peer = R_UNKNOWN;
885 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
886 ns.pdsk = D_UNKNOWN;
887 }
888
889 /* Clear the aftr_isp when becoming unconfigured */
890 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
891 ns.aftr_isp = 0;
892
b411b363
PR
893 /* Abort resync if a disk fails/detaches */
894 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
895 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
896 if (warn_sync_abort)
897 *warn_sync_abort =
898 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
899 "Online-verify" : "Resync";
900 ns.conn = C_CONNECTED;
901 }
902
903 /* Connection breaks down before we finished "Negotiating" */
904 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
905 get_ldev_if_state(mdev, D_NEGOTIATING)) {
906 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
907 ns.disk = mdev->new_state_tmp.disk;
908 ns.pdsk = mdev->new_state_tmp.pdsk;
909 } else {
910 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
911 ns.disk = D_DISKLESS;
912 ns.pdsk = D_UNKNOWN;
913 }
914 put_ldev(mdev);
915 }
916
917 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
918 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
919 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
920 ns.disk = D_UP_TO_DATE;
921 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
922 ns.pdsk = D_UP_TO_DATE;
923 }
924
 925 /* Implications of the connection state on the disk states */
926 disk_min = D_DISKLESS;
927 disk_max = D_UP_TO_DATE;
928 pdsk_min = D_INCONSISTENT;
929 pdsk_max = D_UNKNOWN;
930 switch ((enum drbd_conns)ns.conn) {
931 case C_WF_BITMAP_T:
932 case C_PAUSED_SYNC_T:
933 case C_STARTING_SYNC_T:
934 case C_WF_SYNC_UUID:
935 case C_BEHIND:
936 disk_min = D_INCONSISTENT;
937 disk_max = D_OUTDATED;
938 pdsk_min = D_UP_TO_DATE;
939 pdsk_max = D_UP_TO_DATE;
940 break;
941 case C_VERIFY_S:
942 case C_VERIFY_T:
943 disk_min = D_UP_TO_DATE;
944 disk_max = D_UP_TO_DATE;
945 pdsk_min = D_UP_TO_DATE;
946 pdsk_max = D_UP_TO_DATE;
947 break;
948 case C_CONNECTED:
949 disk_min = D_DISKLESS;
950 disk_max = D_UP_TO_DATE;
951 pdsk_min = D_DISKLESS;
952 pdsk_max = D_UP_TO_DATE;
953 break;
954 case C_WF_BITMAP_S:
955 case C_PAUSED_SYNC_S:
956 case C_STARTING_SYNC_S:
957 case C_AHEAD:
958 disk_min = D_UP_TO_DATE;
959 disk_max = D_UP_TO_DATE;
960 pdsk_min = D_INCONSISTENT;
961 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
962 break;
963 case C_SYNC_TARGET:
964 disk_min = D_INCONSISTENT;
965 disk_max = D_INCONSISTENT;
966 pdsk_min = D_UP_TO_DATE;
967 pdsk_max = D_UP_TO_DATE;
968 break;
969 case C_SYNC_SOURCE:
970 disk_min = D_UP_TO_DATE;
971 disk_max = D_UP_TO_DATE;
972 pdsk_min = D_INCONSISTENT;
973 pdsk_max = D_INCONSISTENT;
974 break;
975 case C_STANDALONE:
976 case C_DISCONNECTING:
977 case C_UNCONNECTED:
978 case C_TIMEOUT:
979 case C_BROKEN_PIPE:
980 case C_NETWORK_FAILURE:
981 case C_PROTOCOL_ERROR:
982 case C_TEAR_DOWN:
983 case C_WF_CONNECTION:
984 case C_WF_REPORT_PARAMS:
985 case C_MASK:
986 break;
987 }
988 if (ns.disk > disk_max)
989 ns.disk = disk_max;
990
991 if (ns.disk < disk_min) {
992 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
993 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
994 ns.disk = disk_min;
995 }
996 if (ns.pdsk > pdsk_max)
997 ns.pdsk = pdsk_max;
998
999 if (ns.pdsk < pdsk_min) {
1000 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
1001 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
1002 ns.pdsk = pdsk_min;
1003 }
1004
1005 if (fp == FP_STONITH &&
1006 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
1007 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
1008 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
1009
1010 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
1011 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
1012 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
1013 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
1014
1015 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
1016 if (ns.conn == C_SYNC_SOURCE)
1017 ns.conn = C_PAUSED_SYNC_S;
1018 if (ns.conn == C_SYNC_TARGET)
1019 ns.conn = C_PAUSED_SYNC_T;
1020 } else {
1021 if (ns.conn == C_PAUSED_SYNC_S)
1022 ns.conn = C_SYNC_SOURCE;
1023 if (ns.conn == C_PAUSED_SYNC_T)
1024 ns.conn = C_SYNC_TARGET;
1025 }
1026
1027 return ns;
1028}
1029
1030/* helper for __drbd_set_state */
1031static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1032{
1033 if (mdev->agreed_pro_version < 90)
1034 mdev->ov_start_sector = 0;
1035 mdev->rs_total = drbd_bm_bits(mdev);
1036 mdev->ov_position = 0;
1037 if (cs == C_VERIFY_T) {
1038 /* starting online verify from an arbitrary position
1039 * does not fit well into the existing protocol.
1040 * on C_VERIFY_T, we initialize ov_left and friends
1041 * implicitly in receive_DataRequest once the
1042 * first P_OV_REQUEST is received */
1043 mdev->ov_start_sector = ~(sector_t)0;
1044 } else {
1045 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1046 if (bit >= mdev->rs_total) {
1047 mdev->ov_start_sector =
1048 BM_BIT_TO_SECT(mdev->rs_total - 1);
1049 mdev->rs_total = 1;
1050 } else
1051 mdev->rs_total -= bit;
1052 mdev->ov_position = mdev->ov_start_sector;
1053 }
1054 mdev->ov_left = mdev->rs_total;
1055}
1056
1057static void drbd_resume_al(struct drbd_conf *mdev)
1058{
1059 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1060 dev_info(DEV, "Resumed AL updates\n");
1061}
1062
1063/**
1064 * __drbd_set_state() - Set a new DRBD state
1065 * @mdev: DRBD device.
1066 * @ns: new state.
1067 * @flags: Flags
1068 * @done: Optional completion, that will get completed after the after_state_ch() finished
1069 *
1070 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1071 */
1072enum drbd_state_rv
1073__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1074 enum chg_state_flags flags, struct completion *done)
1075{
1076 union drbd_state os;
1077 enum drbd_state_rv rv = SS_SUCCESS;
1078 const char *warn_sync_abort = NULL;
1079 struct after_state_chg_work *ascw;
1080
1081 os = mdev->state;
1082
1083 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1084
1085 if (ns.i == os.i)
1086 return SS_NOTHING_TO_DO;
1087
1088 if (!(flags & CS_HARD)) {
1089 /* pre-state-change checks ; only look at ns */
1090 /* See drbd_state_sw_errors in drbd_strings.c */
1091
1092 rv = is_valid_state(mdev, ns);
1093 if (rv < SS_SUCCESS) {
1094 /* If the old state was illegal as well, then let
1095 this happen...*/
1096
1097 if (is_valid_state(mdev, os) == rv)
1098 rv = is_valid_state_transition(mdev, ns, os);
1099 } else
1100 rv = is_valid_state_transition(mdev, ns, os);
1101 }
1102
1103 if (rv < SS_SUCCESS) {
1104 if (flags & CS_VERBOSE)
1105 print_st_err(mdev, os, ns, rv);
1106 return rv;
1107 }
1108
1109 if (warn_sync_abort)
1110 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1111
1112 {
1113 char *pbp, pb[300];
1114 pbp = pb;
1115 *pbp = 0;
1116 if (ns.role != os.role)
1117 pbp += sprintf(pbp, "role( %s -> %s ) ",
1118 drbd_role_str(os.role),
1119 drbd_role_str(ns.role));
1120 if (ns.peer != os.peer)
1121 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1122 drbd_role_str(os.peer),
1123 drbd_role_str(ns.peer));
1124 if (ns.conn != os.conn)
1125 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1126 drbd_conn_str(os.conn),
1127 drbd_conn_str(ns.conn));
1128 if (ns.disk != os.disk)
1129 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1130 drbd_disk_str(os.disk),
1131 drbd_disk_str(ns.disk));
1132 if (ns.pdsk != os.pdsk)
1133 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1134 drbd_disk_str(os.pdsk),
1135 drbd_disk_str(ns.pdsk));
1136 if (is_susp(ns) != is_susp(os))
1137 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1138 is_susp(os),
1139 is_susp(ns));
1140 if (ns.aftr_isp != os.aftr_isp)
1141 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1142 os.aftr_isp,
1143 ns.aftr_isp);
1144 if (ns.peer_isp != os.peer_isp)
1145 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1146 os.peer_isp,
1147 ns.peer_isp);
1148 if (ns.user_isp != os.user_isp)
1149 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1150 os.user_isp,
1151 ns.user_isp);
1152 dev_info(DEV, "%s\n", pb);
1153 }
1154
1155 /* solve the race between becoming unconfigured,
1156 * worker doing the cleanup, and
1157 * admin reconfiguring us:
1158 * on (re)configure, first set CONFIG_PENDING,
1159 * then wait for a potentially exiting worker,
1160 * start the worker, and schedule one no_op.
1161 * then proceed with configuration.
1162 */
1163 if (ns.disk == D_DISKLESS &&
1164 ns.conn == C_STANDALONE &&
1165 ns.role == R_SECONDARY &&
1166 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1167 set_bit(DEVICE_DYING, &mdev->flags);
1168
1169 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1170 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1171 * drbd_ldev_destroy() won't happen before our corresponding
1172 * after_state_ch works run, where we put_ldev again. */
1173 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1174 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1175 atomic_inc(&mdev->local_cnt);
1176
1177 mdev->state = ns;
1178
1179 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1180 drbd_print_uuids(mdev, "attached to UUIDs");
1181
1182 wake_up(&mdev->misc_wait);
1183 wake_up(&mdev->state_wait);
1184
1185 /* aborted verify run. log the last position */
1186 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1187 ns.conn < C_CONNECTED) {
1188 mdev->ov_start_sector =
1189 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1190 dev_info(DEV, "Online Verify reached sector %llu\n",
1191 (unsigned long long)mdev->ov_start_sector);
1192 }
1193
1194 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1195 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1196 dev_info(DEV, "Syncer continues.\n");
1197 mdev->rs_paused += (long)jiffies
1198 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1199 if (ns.conn == C_SYNC_TARGET)
1200 mod_timer(&mdev->resync_timer, jiffies);
1201 }
1202
1203 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1204 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1205 dev_info(DEV, "Resync suspended\n");
1206 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1207 }
1208
1209 if (os.conn == C_CONNECTED &&
1210 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1211 unsigned long now = jiffies;
1212 int i;
1213
1214 set_ov_position(mdev, ns.conn);
1215 mdev->rs_start = now;
1216 mdev->rs_last_events = 0;
1217 mdev->rs_last_sect_ev = 0;
1218 mdev->ov_last_oos_size = 0;
1219 mdev->ov_last_oos_start = 0;
1220
1221 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1222 mdev->rs_mark_left[i] = mdev->ov_left;
1223 mdev->rs_mark_time[i] = now;
1224 }
1225
1226 drbd_rs_controller_reset(mdev);
1227
1228 if (ns.conn == C_VERIFY_S) {
1229 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1230 (unsigned long long)mdev->ov_position);
1231 mod_timer(&mdev->resync_timer, jiffies);
1232 }
1233 }
1234
1235 if (get_ldev(mdev)) {
1236 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1237 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1238 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1239
1240 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1241 mdf |= MDF_CRASHED_PRIMARY;
1242 if (mdev->state.role == R_PRIMARY ||
1243 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1244 mdf |= MDF_PRIMARY_IND;
1245 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1246 mdf |= MDF_CONNECTED_IND;
1247 if (mdev->state.disk > D_INCONSISTENT)
1248 mdf |= MDF_CONSISTENT;
1249 if (mdev->state.disk > D_OUTDATED)
1250 mdf |= MDF_WAS_UP_TO_DATE;
1251 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1252 mdf |= MDF_PEER_OUT_DATED;
1253 if (mdf != mdev->ldev->md.flags) {
1254 mdev->ldev->md.flags = mdf;
1255 drbd_md_mark_dirty(mdev);
1256 }
1257 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1258 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1259 put_ldev(mdev);
1260 }
1261
1262 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1263 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1264 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1265 set_bit(CONSIDER_RESYNC, &mdev->flags);
1266
1267 /* Receiver should clean up itself */
1268 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1269 drbd_thread_stop_nowait(&mdev->receiver);
1270
1271 /* Now the receiver finished cleaning up itself, it should die */
1272 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1273 drbd_thread_stop_nowait(&mdev->receiver);
1274
1275 /* Upon network failure, we need to restart the receiver. */
1276 if (os.conn > C_TEAR_DOWN &&
1277 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1278 drbd_thread_restart_nowait(&mdev->receiver);
1279
1280 /* Resume AL writing if we get a connection */
1281 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1282 drbd_resume_al(mdev);
1283
1284 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1285 if (ascw) {
1286 ascw->os = os;
1287 ascw->ns = ns;
1288 ascw->flags = flags;
1289 ascw->w.cb = w_after_state_ch;
1290 ascw->done = done;
1291 drbd_queue_work(&mdev->data.work, &ascw->w);
1292 } else {
1293 dev_warn(DEV, "Could not kmalloc an ascw\n");
1294 }
1295
1296 return rv;
1297}
1298
1299static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1300{
1301 struct after_state_chg_work *ascw =
1302 container_of(w, struct after_state_chg_work, w);
1303 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1304 if (ascw->flags & CS_WAIT_COMPLETE) {
1305 D_ASSERT(ascw->done != NULL);
1306 complete(ascw->done);
1307 }
1308 kfree(ascw);
1309
1310 return 1;
1311}
1312
1313static void abw_start_sync(struct drbd_conf *mdev, int rv)
1314{
1315 if (rv) {
1316 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1317 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1318 return;
1319 }
1320
1321 switch (mdev->state.conn) {
1322 case C_STARTING_SYNC_T:
1323 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1324 break;
1325 case C_STARTING_SYNC_S:
1326 drbd_start_resync(mdev, C_SYNC_SOURCE);
1327 break;
1328 }
1329}
1330
1331int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1332 int (*io_fn)(struct drbd_conf *),
1333 char *why, enum bm_flag flags)
1334{
1335 int rv;
1336
1337 D_ASSERT(current == mdev->worker.task);
1338
1339 /* open coded non-blocking drbd_suspend_io(mdev); */
1340 set_bit(SUSPEND_IO, &mdev->flags);
1341
1342 drbd_bm_lock(mdev, why, flags);
1343 rv = io_fn(mdev);
1344 drbd_bm_unlock(mdev);
1345
1346 drbd_resume_io(mdev);
1347
1348 return rv;
1349}
1350
1351/**
1352 * after_state_ch() - Perform after state change actions that may sleep
1353 * @mdev: DRBD device.
1354 * @os: old state.
1355 * @ns: new state.
1356 * @flags: Flags
1357 */
1358static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1359 union drbd_state ns, enum chg_state_flags flags)
1360{
1361 enum drbd_fencing_p fp;
1362 enum drbd_req_event what = nothing;
1363 union drbd_state nsm = (union drbd_state){ .i = -1 };
1364
1365 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1366 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1367 if (mdev->p_uuid)
1368 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1369 }
1370
1371 fp = FP_DONT_CARE;
1372 if (get_ldev(mdev)) {
1373 fp = mdev->ldev->dc.fencing;
1374 put_ldev(mdev);
1375 }
1376
1377 /* Inform userspace about the change... */
1378 drbd_bcast_state(mdev, ns);
1379
1380 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1381 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1382 drbd_khelper(mdev, "pri-on-incon-degr");
1383
1384 /* Here we have the actions that are performed after a
1385 state change. This function might sleep */
1386
1387 nsm.i = -1;
1388 if (ns.susp_nod) {
1389 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1390 what = resend;
1391
1392 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1393 what = restart_frozen_disk_io;
1394
1395 if (what != nothing)
1396 nsm.susp_nod = 0;
1397 }
1398
1399 if (ns.susp_fen) {
1400 /* case1: The outdate peer handler is successful: */
1401 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
1402 tl_clear(mdev);
1403 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1404 drbd_uuid_new_current(mdev);
1405 clear_bit(NEW_CUR_UUID, &mdev->flags);
1406 }
1407 spin_lock_irq(&mdev->req_lock);
1408 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1409 spin_unlock_irq(&mdev->req_lock);
1410 }
1411 /* case2: The connection was established again: */
1412 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1413 clear_bit(NEW_CUR_UUID, &mdev->flags);
1414 what = resend;
1415 nsm.susp_fen = 0;
1416 }
1417 }
1418
1419 if (what != nothing) {
1420 spin_lock_irq(&mdev->req_lock);
1421 _tl_restart(mdev, what);
1422 nsm.i &= mdev->state.i;
1423 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1424 spin_unlock_irq(&mdev->req_lock);
1425 }
1426
1427 /* Became sync source. With protocol >= 96, we still need to send out
1428 * the sync uuid now. Need to do that before any drbd_send_state, or
1429 * the other side may go "paused sync" before receiving the sync uuids,
1430 * which is unexpected. */
1431 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1432 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1433 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1434 drbd_gen_and_send_sync_uuid(mdev);
1435 put_ldev(mdev);
1436 }
1437
1438 /* Do not change the order of the if above and the two below... */
1439 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1440 drbd_send_uuids(mdev);
1441 drbd_send_state(mdev);
1442 }
1443 /* No point in queuing send_bitmap if we don't have a connection
1444 * anymore, so check also the _current_ state, not only the new state
1445 * at the time this work was queued. */
1446 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1447 mdev->state.conn == C_WF_BITMAP_S)
1448 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1449 "send_bitmap (WFBitMapS)",
1450 BM_LOCKED_TEST_ALLOWED);
1451
1452 /* Lost contact to peer's copy of the data */
1453 if ((os.pdsk >= D_INCONSISTENT &&
1454 os.pdsk != D_UNKNOWN &&
1455 os.pdsk != D_OUTDATED)
1456 && (ns.pdsk < D_INCONSISTENT ||
1457 ns.pdsk == D_UNKNOWN ||
1458 ns.pdsk == D_OUTDATED)) {
1459 if (get_ldev(mdev)) {
1460 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1461 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1462 if (is_susp(mdev->state)) {
1463 set_bit(NEW_CUR_UUID, &mdev->flags);
1464 } else {
1465 drbd_uuid_new_current(mdev);
1466 drbd_send_uuids(mdev);
1467 }
1468 }
1469 put_ldev(mdev);
1470 }
1471 }
1472
1473 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1474 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1475 drbd_uuid_new_current(mdev);
1476 drbd_send_uuids(mdev);
1477 }
1478
1479 /* D_DISKLESS Peer becomes secondary */
1480 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1481 /* We may still be Primary ourselves.
1482 * No harm done if the bitmap still changes,
1483 * redirtied pages will follow later. */
1484 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1485 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
1486 put_ldev(mdev);
1487 }
1488
1489 /* Write out all changed bits on demote.
 1490 * Though, no need to do that just yet
1491 * if there is a resync going on still */
1492 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1493 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1494 /* No changes to the bitmap expected this time, so assert that,
1495 * even though no harm was done if it did change. */
1496 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1497 "demote", BM_LOCKED_TEST_ALLOWED);
1498 put_ldev(mdev);
1499 }
1500
1501 /* Last part of the attaching process ... */
1502 if (ns.conn >= C_CONNECTED &&
1503 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1504 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
1505 drbd_send_uuids(mdev);
1506 drbd_send_state(mdev);
1507 }
1508
1509 /* We want to pause/continue resync, tell peer. */
1510 if (ns.conn >= C_CONNECTED &&
1511 ((os.aftr_isp != ns.aftr_isp) ||
1512 (os.user_isp != ns.user_isp)))
1513 drbd_send_state(mdev);
1514
1515 /* In case one of the isp bits got set, suspend other devices. */
1516 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1517 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1518 suspend_other_sg(mdev);
1519
1520 /* Make sure the peer gets informed about eventual state
1521 changes (ISP bits) while we were in WFReportParams. */
1522 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1523 drbd_send_state(mdev);
1524
1525 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1526 drbd_send_state(mdev);
1527
 1528 /* We are in the process of starting a full sync... */
1529 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1530 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1531 /* no other bitmap changes expected during this phase */
1532 drbd_queue_bitmap_io(mdev,
1533 &drbd_bmio_set_n_write, &abw_start_sync,
1534 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1535
 1536 /* We are invalidating ourselves... */
1537 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1538 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1539 /* other bitmap operation expected during this phase */
1540 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1541 "set_n_write from invalidate", BM_LOCKED_MASK);
1542
1543 /* first half of local IO error, failure to attach,
1544 * or administrative detach */
1545 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1546 enum drbd_io_error_p eh;
1547 int was_io_error;
1548 /* corresponding get_ldev was in __drbd_set_state, to serialize
1549 * our cleanup here with the transition to D_DISKLESS,
 1550 * so it is safe to dereference ldev here. */
1551 eh = mdev->ldev->dc.on_io_error;
1552 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1553
1554 /* current state still has to be D_FAILED,
1555 * there is only one way out: to D_DISKLESS,
1556 * and that may only happen after our put_ldev below. */
1557 if (mdev->state.disk != D_FAILED)
1558 dev_err(DEV,
1559 "ASSERT FAILED: disk is %s during detach\n",
1560 drbd_disk_str(mdev->state.disk));
1561
1562 if (drbd_send_state(mdev))
1563 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1564 else
1565 dev_err(DEV, "Sending state for detaching disk failed\n");
1566
1567 drbd_rs_cancel_all(mdev);
1568
1569 /* In case we want to get something to stable storage still,
1570 * this may be the last chance.
1571 * Following put_ldev may transition to D_DISKLESS. */
1572 drbd_md_sync(mdev);
1573 put_ldev(mdev);
1574
1575 if (was_io_error && eh == EP_CALL_HELPER)
1576 drbd_khelper(mdev, "local-io-error");
1577 }
1578
1579 /* second half of local IO error, failure to attach,
1580 * or administrative detach,
1581 * after local_cnt references have reached zero again */
1582 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1583 /* We must still be diskless,
1584 * re-attach has to be serialized with this! */
1585 if (mdev->state.disk != D_DISKLESS)
1586 dev_err(DEV,
1587 "ASSERT FAILED: disk is %s while going diskless\n",
1588 drbd_disk_str(mdev->state.disk));
1589
1590 mdev->rs_total = 0;
1591 mdev->rs_failed = 0;
1592 atomic_set(&mdev->rs_pending_cnt, 0);
1593
1594 if (drbd_send_state(mdev))
1595 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1596 /* corresponding get_ldev in __drbd_set_state
1597 * this may finally trigger drbd_ldev_destroy. */
1598 put_ldev(mdev);
1599 }
1600
 1601 /* Notify peer that I had a local IO error, and did not detach. */
1602 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1603 drbd_send_state(mdev);
1604
1605 /* Disks got bigger while they were detached */
1606 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1607 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1608 if (ns.conn == C_CONNECTED)
1609 resync_after_online_grow(mdev);
1610 }
1611
1612 /* A resync finished or aborted, wake paused devices... */
1613 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1614 (os.peer_isp && !ns.peer_isp) ||
1615 (os.user_isp && !ns.user_isp))
1616 resume_next_sg(mdev);
1617
1618 /* sync target done with resync. Explicitly notify peer, even though
1619 * it should (at least for non-empty resyncs) already know itself. */
1620 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1621 drbd_send_state(mdev);
1622
1623 /* This triggers bitmap writeout of potentially still unwritten pages
1624 * if the resync finished cleanly, or aborted because of peer disk
1625 * failure, or because of connection loss.
1626 * For resync aborted because of local disk failure, we cannot do
1627 * any bitmap writeout anymore.
1628 * No harm done if some bits change during this phase.
1629 */
1630 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1631 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1632 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
1633 put_ldev(mdev);
1634 }
1635
1636 /* free tl_hash if we got thawed and are C_STANDALONE */
1637 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1638 drbd_free_tl_hash(mdev);
1639
1640 /* Upon network connection, we need to start the receiver */
1641 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1642 drbd_thread_start(&mdev->receiver);
1643
1644 /* Terminate worker thread if we are unconfigured - it will be
1645 restarted as needed... */
1646 if (ns.disk == D_DISKLESS &&
1647 ns.conn == C_STANDALONE &&
1648 ns.role == R_SECONDARY) {
1649 if (os.aftr_isp != ns.aftr_isp)
1650 resume_next_sg(mdev);
1651 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1652 if (test_bit(DEVICE_DYING, &mdev->flags))
1653 drbd_thread_stop_nowait(&mdev->worker);
1654 }
1655
1656 drbd_md_sync(mdev);
1657}
1658
1659
1660static int drbd_thread_setup(void *arg)
1661{
1662 struct drbd_thread *thi = (struct drbd_thread *) arg;
1663 struct drbd_conf *mdev = thi->mdev;
1664 unsigned long flags;
1665 int retval;
1666
1667restart:
1668 retval = thi->function(thi);
1669
1670 spin_lock_irqsave(&thi->t_lock, flags);
1671
1672 /* if the receiver has been "Exiting", the last thing it did
1673 * was set the conn state to "StandAlone",
1674 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
 1675 * and the receiver thread will be "started".
 1676 * drbd_thread_start needs to set "Restarting" in that case.
 1677 * t_state check and assignment need to be within the same spinlock,
 1678 * so either thread_start sees Exiting, and can remap to Restarting,
 1679 * or thread_start sees None, and can proceed as normal.
1680 */
1681
1682 if (thi->t_state == Restarting) {
1683 dev_info(DEV, "Restarting %s\n", current->comm);
1684 thi->t_state = Running;
1685 spin_unlock_irqrestore(&thi->t_lock, flags);
1686 goto restart;
1687 }
1688
1689 thi->task = NULL;
1690 thi->t_state = None;
1691 smp_mb();
1692 complete(&thi->stop);
1693 spin_unlock_irqrestore(&thi->t_lock, flags);
1694
1695 dev_info(DEV, "Terminating %s\n", current->comm);
1696
1697 /* Release mod reference taken when thread was started */
1698 module_put(THIS_MODULE);
1699 return retval;
1700}
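
The Exiting/Restarting handshake above is easier to see outside the kernel. Below is a minimal userspace pthread sketch of the same shape, not drbd code: the struct, the state names and the forced restart in main() are made up for illustration. The point it shows is the one the comment in drbd_thread_setup() makes: after the thread function returns, the state is re-checked under the same lock a starter would use, and the thread loops instead of terminating if a restart was requested in the meantime. Compile with -pthread.

#include <pthread.h>
#include <stdio.h>

enum t_state { T_NONE, T_RUNNING, T_EXITING, T_RESTARTING };

struct thr {
        pthread_mutex_t lock;
        enum t_state state;
        int runs;
};

static int thread_fn(struct thr *t)
{
        t->runs++;              /* pretend the receiver main loop ran and ended */
        return 0;
}

static void *thread_setup(void *arg)
{
        struct thr *t = arg;
restart:
        thread_fn(t);
        pthread_mutex_lock(&t->lock);
        if (t->state == T_RESTARTING) {
                /* someone asked for a restart while we were on the way out */
                t->state = T_RUNNING;
                pthread_mutex_unlock(&t->lock);
                goto restart;
        }
        t->state = T_NONE;
        pthread_mutex_unlock(&t->lock);
        return NULL;
}

int main(void)
{
        struct thr t = { PTHREAD_MUTEX_INITIALIZER, T_RUNNING, 0 };
        pthread_t tid;

        t.state = T_RESTARTING;  /* request one restart up front */
        pthread_create(&tid, NULL, thread_setup, &t);
        pthread_join(tid, NULL);
        printf("thread body ran %d time(s), final state %d\n", t.runs, t.state);
        return 0;
}
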
1701
1702static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1703 int (*func) (struct drbd_thread *))
1704{
1705 spin_lock_init(&thi->t_lock);
1706 thi->task = NULL;
1707 thi->t_state = None;
1708 thi->function = func;
1709 thi->mdev = mdev;
1710}
1711
1712int drbd_thread_start(struct drbd_thread *thi)
1713{
1714 struct drbd_conf *mdev = thi->mdev;
1715 struct task_struct *nt;
1716 unsigned long flags;
1717
1718 const char *me =
1719 thi == &mdev->receiver ? "receiver" :
1720 thi == &mdev->asender ? "asender" :
1721 thi == &mdev->worker ? "worker" : "NONSENSE";
1722
1723 /* is used from state engine doing drbd_thread_stop_nowait,
1724 * while holding the req lock irqsave */
1725 spin_lock_irqsave(&thi->t_lock, flags);
1726
1727 switch (thi->t_state) {
1728 case None:
1729 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1730 me, current->comm, current->pid);
1731
1732 /* Get ref on module for thread - this is released when thread exits */
1733 if (!try_module_get(THIS_MODULE)) {
1734 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1735 spin_unlock_irqrestore(&thi->t_lock, flags);
81e84650 1736 return false;
b411b363
PR
1737 }
1738
1739 init_completion(&thi->stop);
1740 D_ASSERT(thi->task == NULL);
1741 thi->reset_cpu_mask = 1;
1742 thi->t_state = Running;
1743 spin_unlock_irqrestore(&thi->t_lock, flags);
 1744 flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1745
1746 nt = kthread_create(drbd_thread_setup, (void *) thi,
1747 "drbd%d_%s", mdev_to_minor(mdev), me);
1748
1749 if (IS_ERR(nt)) {
1750 dev_err(DEV, "Couldn't start thread\n");
1751
1752 module_put(THIS_MODULE);
81e84650 1753 return false;
b411b363
PR
1754 }
1755 spin_lock_irqsave(&thi->t_lock, flags);
1756 thi->task = nt;
1757 thi->t_state = Running;
1758 spin_unlock_irqrestore(&thi->t_lock, flags);
1759 wake_up_process(nt);
1760 break;
1761 case Exiting:
1762 thi->t_state = Restarting;
1763 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1764 me, current->comm, current->pid);
1765 /* fall through */
1766 case Running:
1767 case Restarting:
1768 default:
1769 spin_unlock_irqrestore(&thi->t_lock, flags);
1770 break;
1771 }
1772
81e84650 1773 return true;
b411b363
PR
1774}
1775
1776
1777void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1778{
1779 unsigned long flags;
1780
1781 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1782
1783 /* may be called from state engine, holding the req lock irqsave */
1784 spin_lock_irqsave(&thi->t_lock, flags);
1785
1786 if (thi->t_state == None) {
1787 spin_unlock_irqrestore(&thi->t_lock, flags);
1788 if (restart)
1789 drbd_thread_start(thi);
1790 return;
1791 }
1792
1793 if (thi->t_state != ns) {
1794 if (thi->task == NULL) {
1795 spin_unlock_irqrestore(&thi->t_lock, flags);
1796 return;
1797 }
1798
1799 thi->t_state = ns;
1800 smp_mb();
1801 init_completion(&thi->stop);
1802 if (thi->task != current)
1803 force_sig(DRBD_SIGKILL, thi->task);
1804
1805 }
1806
1807 spin_unlock_irqrestore(&thi->t_lock, flags);
1808
1809 if (wait)
1810 wait_for_completion(&thi->stop);
1811}
1812
1813#ifdef CONFIG_SMP
1814/**
1815 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1816 * @mdev: DRBD device.
1817 *
1818 * Forces all threads of a device onto the same CPU. This is beneficial for
 1819 * DRBD's performance. May be overridden by the user's configuration.
1820 */
1821void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1822{
1823 int ord, cpu;
1824
1825 /* user override. */
1826 if (cpumask_weight(mdev->cpu_mask))
1827 return;
1828
1829 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1830 for_each_online_cpu(cpu) {
1831 if (ord-- == 0) {
1832 cpumask_set_cpu(cpu, mdev->cpu_mask);
1833 return;
1834 }
1835 }
1836 /* should not be reached */
1837 cpumask_setall(mdev->cpu_mask);
1838}
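
The modulo-plus-walk above is round-robin assignment that tolerates sparse CPU numbering: the minor modulo the number of online CPUs picks an ordinal, and the loop counts that ordinal down while walking the online mask. A minimal userspace sketch of the same arithmetic, assuming a dense online set 0..ncpus-1 (exactly the assumption the kernel code cannot make):

#include <stdio.h>

/* pick a CPU for a device the way drbd_calc_cpu_mask() does */
static int pick_cpu(unsigned int minor, unsigned int ncpus)
{
        unsigned int ord = minor % ncpus;

        for (unsigned int cpu = 0; cpu < ncpus; cpu++)
                if (ord-- == 0)
                        return cpu;
        return -1;      /* not reached */
}

int main(void)
{
        for (unsigned int minor = 0; minor < 8; minor++)
                printf("drbd%u -> cpu %d (of 4 online)\n",
                       minor, pick_cpu(minor, 4));
        return 0;
}
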
1839
1840/**
1841 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1842 * @mdev: DRBD device.
1843 *
1844 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1845 * prematurely.
1846 */
1847void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1848{
1849 struct task_struct *p = current;
1850 struct drbd_thread *thi =
1851 p == mdev->asender.task ? &mdev->asender :
1852 p == mdev->receiver.task ? &mdev->receiver :
1853 p == mdev->worker.task ? &mdev->worker :
1854 NULL;
1855 ERR_IF(thi == NULL)
1856 return;
1857 if (!thi->reset_cpu_mask)
1858 return;
1859 thi->reset_cpu_mask = 0;
1860 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1861}
1862#endif
1863
1864/* the appropriate socket mutex must be held already */
1865int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
0b70a13d 1866 enum drbd_packets cmd, struct p_header80 *h,
b411b363
PR
1867 size_t size, unsigned msg_flags)
1868{
1869 int sent, ok;
1870
81e84650
AG
1871 ERR_IF(!h) return false;
1872 ERR_IF(!size) return false;
b411b363 1873
ca9bc12b 1874 h->magic = cpu_to_be32(DRBD_MAGIC);
b411b363 1875 h->command = cpu_to_be16(cmd);
0b70a13d 1876 h->length = cpu_to_be16(size-sizeof(struct p_header80));
b411b363 1877
b411b363
PR
1878 sent = drbd_send(mdev, sock, h, size, msg_flags);
1879
1880 ok = (sent == size);
0ddc5549
LE
1881 if (!ok && !signal_pending(current))
1882 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
b411b363
PR
1883 cmdname(cmd), (int)size, sent);
1884 return ok;
1885}
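
What _drbd_send_cmd() puts on the wire is a small fixed header followed by the payload: magic as big-endian 32 bit, command and length as big-endian 16 bit, with the length counting only the bytes after the header. The userspace sketch below shows that layout; the struct, the magic constant and the command value are illustrative stand-ins rather than the definitions from the drbd headers.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct hdr80 {                    /* same shape of idea as struct p_header80 */
        uint32_t magic;
        uint16_t command;
        uint16_t length;          /* payload bytes following the header */
} __attribute__((packed));

int main(void)
{
        unsigned char payload[16] = { 0 };      /* pretend packet body */
        struct hdr80 h;
        unsigned char buf[sizeof(h) + sizeof(payload)];

        h.magic   = htonl(0x83740267u);         /* assumed magic value, for illustration */
        h.command = htons(5);                   /* assumed command code */
        h.length  = htons(sizeof(payload));     /* size minus header, as above */

        memcpy(buf, &h, sizeof(h));
        memcpy(buf + sizeof(h), payload, sizeof(payload));

        for (size_t i = 0; i < sizeof(h); i++)  /* dump the header as sent */
                printf("%02x ", buf[i]);
        printf("\n");
        return 0;
}
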
1886
1887/* don't pass the socket. we may only look at it
1888 * when we hold the appropriate socket mutex.
1889 */
1890int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
0b70a13d 1891 enum drbd_packets cmd, struct p_header80 *h, size_t size)
b411b363
PR
1892{
1893 int ok = 0;
1894 struct socket *sock;
1895
1896 if (use_data_socket) {
1897 mutex_lock(&mdev->data.mutex);
1898 sock = mdev->data.socket;
1899 } else {
1900 mutex_lock(&mdev->meta.mutex);
1901 sock = mdev->meta.socket;
1902 }
1903
1904 /* drbd_disconnect() could have called drbd_free_sock()
1905 * while we were waiting in down()... */
1906 if (likely(sock != NULL))
1907 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1908
1909 if (use_data_socket)
1910 mutex_unlock(&mdev->data.mutex);
1911 else
1912 mutex_unlock(&mdev->meta.mutex);
1913 return ok;
1914}
1915
1916int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1917 size_t size)
1918{
0b70a13d 1919 struct p_header80 h;
b411b363
PR
1920 int ok;
1921
ca9bc12b 1922 h.magic = cpu_to_be32(DRBD_MAGIC);
b411b363
PR
1923 h.command = cpu_to_be16(cmd);
1924 h.length = cpu_to_be16(size);
1925
1926 if (!drbd_get_data_sock(mdev))
1927 return 0;
1928
b411b363
PR
1929 ok = (sizeof(h) ==
1930 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1931 ok = ok && (size ==
1932 drbd_send(mdev, mdev->data.socket, data, size, 0));
1933
1934 drbd_put_data_sock(mdev);
1935
1936 return ok;
1937}
1938
1939int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1940{
8e26f9cc 1941 struct p_rs_param_95 *p;
b411b363
PR
1942 struct socket *sock;
1943 int size, rv;
1944 const int apv = mdev->agreed_pro_version;
1945
1946 size = apv <= 87 ? sizeof(struct p_rs_param)
1947 : apv == 88 ? sizeof(struct p_rs_param)
1948 + strlen(mdev->sync_conf.verify_alg) + 1
8e26f9cc
PR
1949 : apv <= 94 ? sizeof(struct p_rs_param_89)
1950 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363
PR
1951
1952 /* used from admin command context and receiver/worker context.
1953 * to avoid kmalloc, grab the socket right here,
1954 * then use the pre-allocated sbuf there */
1955 mutex_lock(&mdev->data.mutex);
1956 sock = mdev->data.socket;
1957
1958 if (likely(sock != NULL)) {
1959 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1960
8e26f9cc 1961 p = &mdev->data.sbuf.rs_param_95;
b411b363
PR
1962
1963 /* initialize verify_alg and csums_alg */
1964 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1965
1966 p->rate = cpu_to_be32(sc->rate);
8e26f9cc
PR
1967 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1968 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1969 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1970 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
b411b363
PR
1971
1972 if (apv >= 88)
1973 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1974 if (apv >= 89)
1975 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1976
1977 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1978 } else
1979 rv = 0; /* not ok */
1980
1981 mutex_unlock(&mdev->data.mutex);
1982
1983 return rv;
1984}
1985
1986int drbd_send_protocol(struct drbd_conf *mdev)
1987{
1988 struct p_protocol *p;
cf14c2e9 1989 int size, cf, rv;
b411b363
PR
1990
1991 size = sizeof(struct p_protocol);
1992
1993 if (mdev->agreed_pro_version >= 87)
1994 size += strlen(mdev->net_conf->integrity_alg) + 1;
1995
1996 /* we must not recurse into our own queue,
1997 * as that is blocked during handshake */
1998 p = kmalloc(size, GFP_NOIO);
1999 if (p == NULL)
2000 return 0;
2001
2002 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2003 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2004 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2005 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
b411b363
PR
2006 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2007
cf14c2e9
PR
2008 cf = 0;
2009 if (mdev->net_conf->want_lose)
2010 cf |= CF_WANT_LOSE;
2011 if (mdev->net_conf->dry_run) {
2012 if (mdev->agreed_pro_version >= 92)
2013 cf |= CF_DRY_RUN;
2014 else {
2015 dev_err(DEV, "--dry-run is not supported by peer");
7ac314c8 2016 kfree(p);
148efa16 2017 return -1;
cf14c2e9
PR
2018 }
2019 }
2020 p->conn_flags = cpu_to_be32(cf);
2021
b411b363
PR
2022 if (mdev->agreed_pro_version >= 87)
2023 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2024
2025 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
0b70a13d 2026 (struct p_header80 *)p, size);
b411b363
PR
2027 kfree(p);
2028 return rv;
2029}
2030
2031int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2032{
2033 struct p_uuids p;
2034 int i;
2035
2036 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2037 return 1;
2038
2039 for (i = UI_CURRENT; i < UI_SIZE; i++)
2040 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2041
2042 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2043 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2044 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2045 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2046 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2047 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2048
2049 put_ldev(mdev);
2050
2051 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
0b70a13d 2052 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2053}
2054
2055int drbd_send_uuids(struct drbd_conf *mdev)
2056{
2057 return _drbd_send_uuids(mdev, 0);
2058}
2059
2060int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2061{
2062 return _drbd_send_uuids(mdev, 8);
2063}
2064
62b0da3a
LE
2065void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2066{
2067 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2068 u64 *uuid = mdev->ldev->md.uuid;
2069 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2070 text,
2071 (unsigned long long)uuid[UI_CURRENT],
2072 (unsigned long long)uuid[UI_BITMAP],
2073 (unsigned long long)uuid[UI_HISTORY_START],
2074 (unsigned long long)uuid[UI_HISTORY_END]);
2075 put_ldev(mdev);
2076 } else {
2077 dev_info(DEV, "%s effective data uuid: %016llX\n",
2078 text,
2079 (unsigned long long)mdev->ed_uuid);
2080 }
2081}
2082
5a22db89 2083int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
b411b363
PR
2084{
2085 struct p_rs_uuid p;
5a22db89
LE
2086 u64 uuid;
2087
2088 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
b411b363 2089
4a23f264 2090 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
5a22db89 2091 drbd_uuid_set(mdev, UI_BITMAP, uuid);
62b0da3a 2092 drbd_print_uuids(mdev, "updated sync UUID");
5a22db89
LE
2093 drbd_md_sync(mdev);
2094 p.uuid = cpu_to_be64(uuid);
b411b363
PR
2095
2096 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
0b70a13d 2097 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2098}
2099
e89b591c 2100int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
b411b363
PR
2101{
2102 struct p_sizes p;
2103 sector_t d_size, u_size;
99432fcc 2104 int q_order_type, max_bio_size;
b411b363
PR
2105 int ok;
2106
2107 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2108 D_ASSERT(mdev->ldev->backing_bdev);
2109 d_size = drbd_get_max_capacity(mdev->ldev);
2110 u_size = mdev->ldev->dc.disk_size;
2111 q_order_type = drbd_queue_order_type(mdev);
99432fcc
PR
2112 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2113 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
b411b363
PR
2114 put_ldev(mdev);
2115 } else {
2116 d_size = 0;
2117 u_size = 0;
2118 q_order_type = QUEUE_ORDERED_NONE;
99432fcc 2119 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
b411b363
PR
2120 }
2121
2122 p.d_size = cpu_to_be64(d_size);
2123 p.u_size = cpu_to_be64(u_size);
2124 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
99432fcc 2125 p.max_bio_size = cpu_to_be32(max_bio_size);
e89b591c
PR
2126 p.queue_order_type = cpu_to_be16(q_order_type);
2127 p.dds_flags = cpu_to_be16(flags);
b411b363
PR
2128
2129 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
0b70a13d 2130 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2131 return ok;
2132}
2133
2134/**
2135 * drbd_send_state() - Sends the drbd state to the peer
2136 * @mdev: DRBD device.
2137 */
2138int drbd_send_state(struct drbd_conf *mdev)
2139{
2140 struct socket *sock;
2141 struct p_state p;
2142 int ok = 0;
2143
 2144 /* Grab state lock so we won't send state if we're in the middle
 2145 * of a cluster-wide state change on another thread */
2146 drbd_state_lock(mdev);
2147
2148 mutex_lock(&mdev->data.mutex);
2149
2150 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2151 sock = mdev->data.socket;
2152
2153 if (likely(sock != NULL)) {
2154 ok = _drbd_send_cmd(mdev, sock, P_STATE,
0b70a13d 2155 (struct p_header80 *)&p, sizeof(p), 0);
b411b363
PR
2156 }
2157
2158 mutex_unlock(&mdev->data.mutex);
2159
2160 drbd_state_unlock(mdev);
2161 return ok;
2162}
2163
2164int drbd_send_state_req(struct drbd_conf *mdev,
2165 union drbd_state mask, union drbd_state val)
2166{
2167 struct p_req_state p;
2168
2169 p.mask = cpu_to_be32(mask.i);
2170 p.val = cpu_to_be32(val.i);
2171
2172 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
0b70a13d 2173 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2174}
2175
bf885f8a 2176int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
b411b363
PR
2177{
2178 struct p_req_state_reply p;
2179
2180 p.retcode = cpu_to_be32(retcode);
2181
2182 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
0b70a13d 2183 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2184}
2185
2186int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2187 struct p_compressed_bm *p,
2188 struct bm_xfer_ctx *c)
2189{
2190 struct bitstream bs;
2191 unsigned long plain_bits;
2192 unsigned long tmp;
2193 unsigned long rl;
2194 unsigned len;
2195 unsigned toggle;
2196 int bits;
2197
2198 /* may we use this feature? */
2199 if ((mdev->sync_conf.use_rle == 0) ||
2200 (mdev->agreed_pro_version < 90))
2201 return 0;
2202
2203 if (c->bit_offset >= c->bm_bits)
2204 return 0; /* nothing to do. */
2205
 2206 /* use at most this many bytes */
2207 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2208 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2209 /* plain bits covered in this code string */
2210 plain_bits = 0;
2211
2212 /* p->encoding & 0x80 stores whether the first run length is set.
2213 * bit offset is implicit.
2214 * start with toggle == 2 to be able to tell the first iteration */
2215 toggle = 2;
2216
 2217 /* see how many plain bits we can stuff into one packet
2218 * using RLE and VLI. */
2219 do {
2220 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2221 : _drbd_bm_find_next(mdev, c->bit_offset);
2222 if (tmp == -1UL)
2223 tmp = c->bm_bits;
2224 rl = tmp - c->bit_offset;
2225
2226 if (toggle == 2) { /* first iteration */
2227 if (rl == 0) {
2228 /* the first checked bit was set,
2229 * store start value, */
2230 DCBP_set_start(p, 1);
2231 /* but skip encoding of zero run length */
2232 toggle = !toggle;
2233 continue;
2234 }
2235 DCBP_set_start(p, 0);
2236 }
2237
2238 /* paranoia: catch zero runlength.
2239 * can only happen if bitmap is modified while we scan it. */
2240 if (rl == 0) {
2241 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2242 "t:%u bo:%lu\n", toggle, c->bit_offset);
2243 return -1;
2244 }
2245
2246 bits = vli_encode_bits(&bs, rl);
2247 if (bits == -ENOBUFS) /* buffer full */
2248 break;
2249 if (bits <= 0) {
2250 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2251 return 0;
2252 }
2253
2254 toggle = !toggle;
2255 plain_bits += rl;
2256 c->bit_offset = tmp;
2257 } while (c->bit_offset < c->bm_bits);
2258
2259 len = bs.cur.b - p->code + !!bs.cur.bit;
2260
2261 if (plain_bits < (len << 3)) {
2262 /* incompressible with this method.
2263 * we need to rewind both word and bit position. */
2264 c->bit_offset -= plain_bits;
2265 bm_xfer_ctx_bit_to_word_offset(c);
2266 c->bit_offset = c->word_offset * BITS_PER_LONG;
2267 return 0;
2268 }
2269
2270 /* RLE + VLI was able to compress it just fine.
2271 * update c->word_offset. */
2272 bm_xfer_ctx_bit_to_word_offset(c);
2273
2274 /* store pad_bits */
2275 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2276
2277 return len;
2278}
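
Two things are interleaved in the encoder above: the bitmap is walked as alternating runs of cleared and set bits (the toggle), and each run length goes through the VLI encoder until the packet buffer is full; afterwards, if the encoded bytes cover fewer plain bits than eight times their own size, the attempt is thrown away as incompressible and the caller falls back to sending plain bitmap words. The userspace sketch below shows only the run extraction, on a toy 64-bit bitmap, with the start-of-stream special case reduced to a printout; the bit helpers are stand-ins, not the drbd bitmap operations.

#include <stdio.h>
#include <stdint.h>

#define NBITS 64

static int test_bit_(const uint64_t *bm, unsigned int bit)
{
        return (bm[bit / 64] >> (bit % 64)) & 1;
}

int main(void)
{
        uint64_t bm[NBITS / 64] = { 0x00000000ffff0f00ULL };
        unsigned int bit = 0;

        /* the real encoder stores this as the DCBP "start" flag */
        printf("first run is of %s bits\n", test_bit_(bm, 0) ? "set" : "cleared");

        while (bit < NBITS) {
                int cur = test_bit_(bm, bit);
                unsigned int run = 0;

                while (bit < NBITS && test_bit_(bm, bit) == cur) {
                        run++;
                        bit++;
                }
                /* this run length would be handed to vli_encode_bits() */
                printf("run of %u %d-bits\n", run, cur);
        }
        return 0;
}
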
2279
f70af118
AG
2280/**
2281 * send_bitmap_rle_or_plain
2282 *
2283 * Return 0 when done, 1 when another iteration is needed, and a negative error
2284 * code upon failure.
2285 */
2286static int
b411b363 2287send_bitmap_rle_or_plain(struct drbd_conf *mdev,
f70af118 2288 struct p_header80 *h, struct bm_xfer_ctx *c)
b411b363
PR
2289{
2290 struct p_compressed_bm *p = (void*)h;
2291 unsigned long num_words;
2292 int len;
2293 int ok;
2294
2295 len = fill_bitmap_rle_bits(mdev, p, c);
2296
2297 if (len < 0)
f70af118 2298 return -EIO;
b411b363
PR
2299
2300 if (len) {
2301 DCBP_set_code(p, RLE_VLI_Bits);
2302 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2303 sizeof(*p) + len, 0);
2304
2305 c->packets[0]++;
2306 c->bytes[0] += sizeof(*p) + len;
2307
2308 if (c->bit_offset >= c->bm_bits)
2309 len = 0; /* DONE */
2310 } else {
2311 /* was not compressible.
2312 * send a buffer full of plain text bits instead. */
2313 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2314 len = num_words * sizeof(long);
2315 if (len)
2316 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2317 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
0b70a13d 2318 h, sizeof(struct p_header80) + len, 0);
b411b363
PR
2319 c->word_offset += num_words;
2320 c->bit_offset = c->word_offset * BITS_PER_LONG;
2321
2322 c->packets[1]++;
0b70a13d 2323 c->bytes[1] += sizeof(struct p_header80) + len;
b411b363
PR
2324
2325 if (c->bit_offset > c->bm_bits)
2326 c->bit_offset = c->bm_bits;
2327 }
f70af118
AG
2328 if (ok) {
2329 if (len == 0) {
2330 INFO_bm_xfer_stats(mdev, "send", c);
2331 return 0;
2332 } else
2333 return 1;
2334 }
2335 return -EIO;
b411b363
PR
2336}
2337
2338/* See the comment at receive_bitmap() */
2339int _drbd_send_bitmap(struct drbd_conf *mdev)
2340{
2341 struct bm_xfer_ctx c;
0b70a13d 2342 struct p_header80 *p;
f70af118 2343 int err;
b411b363 2344
81e84650 2345 ERR_IF(!mdev->bitmap) return false;
b411b363
PR
2346
2347 /* maybe we should use some per thread scratch page,
2348 * and allocate that during initial device creation? */
0b70a13d 2349 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
b411b363
PR
2350 if (!p) {
2351 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
81e84650 2352 return false;
b411b363
PR
2353 }
2354
2355 if (get_ldev(mdev)) {
2356 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2357 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2358 drbd_bm_set_all(mdev);
2359 if (drbd_bm_write(mdev)) {
2360 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2361 * but otherwise process as per normal - need to tell other
2362 * side that a full resync is required! */
2363 dev_err(DEV, "Failed to write bitmap to disk!\n");
2364 } else {
2365 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2366 drbd_md_sync(mdev);
2367 }
2368 }
2369 put_ldev(mdev);
2370 }
2371
2372 c = (struct bm_xfer_ctx) {
2373 .bm_bits = drbd_bm_bits(mdev),
2374 .bm_words = drbd_bm_words(mdev),
2375 };
2376
2377 do {
f70af118
AG
2378 err = send_bitmap_rle_or_plain(mdev, p, &c);
2379 } while (err > 0);
b411b363
PR
2380
2381 free_page((unsigned long) p);
f70af118 2382 return err == 0;
b411b363
PR
2383}
2384
2385int drbd_send_bitmap(struct drbd_conf *mdev)
2386{
2387 int err;
2388
2389 if (!drbd_get_data_sock(mdev))
2390 return -1;
2391 err = !_drbd_send_bitmap(mdev);
2392 drbd_put_data_sock(mdev);
2393 return err;
2394}
2395
2396int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2397{
2398 int ok;
2399 struct p_barrier_ack p;
2400
2401 p.barrier = barrier_nr;
2402 p.set_size = cpu_to_be32(set_size);
2403
2404 if (mdev->state.conn < C_CONNECTED)
81e84650 2405 return false;
b411b363 2406 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
0b70a13d 2407 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2408 return ok;
2409}
2410
2411/**
2412 * _drbd_send_ack() - Sends an ack packet
2413 * @mdev: DRBD device.
2414 * @cmd: Packet command code.
2415 * @sector: sector, needs to be in big endian byte order
2416 * @blksize: size in byte, needs to be in big endian byte order
2417 * @block_id: Id, big endian byte order
2418 */
2419static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2420 u64 sector,
2421 u32 blksize,
2422 u64 block_id)
2423{
2424 int ok;
2425 struct p_block_ack p;
2426
2427 p.sector = sector;
2428 p.block_id = block_id;
2429 p.blksize = blksize;
2430 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2431
2432 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
81e84650 2433 return false;
b411b363 2434 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
0b70a13d 2435 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2436 return ok;
2437}
2438
2b2bf214
LE
2439/* dp->sector and dp->block_id already/still in network byte order,
2440 * data_size is payload size according to dp->head,
2441 * and may need to be corrected for digest size. */
b411b363 2442int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2b2bf214 2443 struct p_data *dp, int data_size)
b411b363 2444{
2b2bf214
LE
2445 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2446 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
b411b363
PR
2447 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2448 dp->block_id);
2449}
2450
2451int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2452 struct p_block_req *rp)
2453{
2454 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2455}
2456
2457/**
2458 * drbd_send_ack() - Sends an ack packet
2459 * @mdev: DRBD device.
2460 * @cmd: Packet command code.
2461 * @e: Epoch entry.
2462 */
2463int drbd_send_ack(struct drbd_conf *mdev,
2464 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2465{
2466 return _drbd_send_ack(mdev, cmd,
2467 cpu_to_be64(e->sector),
2468 cpu_to_be32(e->size),
2469 e->block_id);
2470}
2471
2472/* This function misuses the block_id field to signal if the blocks
 2473 * are in sync or not. */
2474int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2475 sector_t sector, int blksize, u64 block_id)
2476{
2477 return _drbd_send_ack(mdev, cmd,
2478 cpu_to_be64(sector),
2479 cpu_to_be32(blksize),
2480 cpu_to_be64(block_id));
2481}
2482
2483int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2484 sector_t sector, int size, u64 block_id)
2485{
2486 int ok;
2487 struct p_block_req p;
2488
2489 p.sector = cpu_to_be64(sector);
2490 p.block_id = block_id;
2491 p.blksize = cpu_to_be32(size);
2492
2493 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
0b70a13d 2494 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2495 return ok;
2496}
2497
2498int drbd_send_drequest_csum(struct drbd_conf *mdev,
2499 sector_t sector, int size,
2500 void *digest, int digest_size,
2501 enum drbd_packets cmd)
2502{
2503 int ok;
2504 struct p_block_req p;
2505
2506 p.sector = cpu_to_be64(sector);
9a8e7753 2507 p.block_id = ID_SYNCER /* unused */;
b411b363
PR
2508 p.blksize = cpu_to_be32(size);
2509
ca9bc12b 2510 p.head.magic = cpu_to_be32(DRBD_MAGIC);
b411b363 2511 p.head.command = cpu_to_be16(cmd);
0b70a13d 2512 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
b411b363
PR
2513
2514 mutex_lock(&mdev->data.mutex);
2515
2516 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2517 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2518
2519 mutex_unlock(&mdev->data.mutex);
2520
2521 return ok;
2522}
2523
2524int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2525{
2526 int ok;
2527 struct p_block_req p;
2528
2529 p.sector = cpu_to_be64(sector);
9a8e7753 2530 p.block_id = ID_SYNCER /* unused */;
b411b363
PR
2531 p.blksize = cpu_to_be32(size);
2532
2533 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
0b70a13d 2534 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2535 return ok;
2536}
2537
2538/* called on sndtimeo
81e84650
AG
2539 * returns false if we should retry,
 2540 * true if we think the connection is dead
b411b363
PR
2541 */
2542static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2543{
2544 int drop_it;
2545 /* long elapsed = (long)(jiffies - mdev->last_received); */
2546
2547 drop_it = mdev->meta.socket == sock
2548 || !mdev->asender.task
2549 || get_t_state(&mdev->asender) != Running
2550 || mdev->state.conn < C_CONNECTED;
2551
2552 if (drop_it)
81e84650 2553 return true;
b411b363
PR
2554
2555 drop_it = !--mdev->ko_count;
2556 if (!drop_it) {
2557 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2558 current->comm, current->pid, mdev->ko_count);
2559 request_ping(mdev);
2560 }
2561
2562 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2563}
2564
2565/* The idea of sendpage seems to be to put some kind of reference
2566 * to the page into the skb, and to hand it over to the NIC. In
2567 * this process get_page() gets called.
2568 *
2569 * As soon as the page was really sent over the network put_page()
2570 * gets called by some part of the network layer. [ NIC driver? ]
2571 *
2572 * [ get_page() / put_page() increment/decrement the count. If count
2573 * reaches 0 the page will be freed. ]
2574 *
2575 * This works nicely with pages from FSs.
2576 * But this means that in protocol A we might signal IO completion too early!
2577 *
2578 * In order not to corrupt data during a resync we must make sure
 2579 * that we do not reuse our own buffer pages (EEs) too early, therefore
2580 * we have the net_ee list.
2581 *
2582 * XFS seems to have problems, still, it submits pages with page_count == 0!
2583 * As a workaround, we disable sendpage on pages
2584 * with page_count == 0 or PageSlab.
2585 */
2586static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2587 int offset, size_t size, unsigned msg_flags)
b411b363 2588{
ba11ad9a 2589 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
b411b363
PR
2590 kunmap(page);
2591 if (sent == size)
2592 mdev->send_cnt += size>>9;
2593 return sent == size;
2594}
2595
2596static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2597 int offset, size_t size, unsigned msg_flags)
b411b363
PR
2598{
2599 mm_segment_t oldfs = get_fs();
2600 int sent, ok;
2601 int len = size;
2602
2603 /* e.g. XFS meta- & log-data is in slab pages, which have a
2604 * page_count of 0 and/or have PageSlab() set.
2605 * we cannot use send_page for those, as that does get_page();
2606 * put_page(); and would cause either a VM_BUG directly, or
 2607 * __page_cache_release of a page that would actually still be referenced
2608 * by someone, leading to some obscure delayed Oops somewhere else. */
2609 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
ba11ad9a 2610 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
b411b363 2611
ba11ad9a 2612 msg_flags |= MSG_NOSIGNAL;
b411b363
PR
2613 drbd_update_congested(mdev);
2614 set_fs(KERNEL_DS);
2615 do {
2616 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2617 offset, len,
ba11ad9a 2618 msg_flags);
b411b363
PR
2619 if (sent == -EAGAIN) {
2620 if (we_should_drop_the_connection(mdev,
2621 mdev->data.socket))
2622 break;
2623 else
2624 continue;
2625 }
2626 if (sent <= 0) {
2627 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2628 __func__, (int)size, len, sent);
2629 break;
2630 }
2631 len -= sent;
2632 offset += sent;
2633 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2634 set_fs(oldfs);
2635 clear_bit(NET_CONGESTED, &mdev->flags);
2636
2637 ok = (len == 0);
2638 if (likely(ok))
2639 mdev->send_cnt += size>>9;
2640 return ok;
2641}
2642
2643static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2644{
2645 struct bio_vec *bvec;
2646 int i;
ba11ad9a 2647 /* hint all but last page with MSG_MORE */
b411b363
PR
2648 __bio_for_each_segment(bvec, bio, i, 0) {
2649 if (!_drbd_no_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
2650 bvec->bv_offset, bvec->bv_len,
2651 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
2652 return 0;
2653 }
2654 return 1;
2655}
2656
2657static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2658{
2659 struct bio_vec *bvec;
2660 int i;
ba11ad9a 2661 /* hint all but last page with MSG_MORE */
b411b363
PR
2662 __bio_for_each_segment(bvec, bio, i, 0) {
2663 if (!_drbd_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
2664 bvec->bv_offset, bvec->bv_len,
2665 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
2666 return 0;
2667 }
b411b363
PR
2668 return 1;
2669}
2670
45bb912b
LE
2671static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2672{
2673 struct page *page = e->pages;
2674 unsigned len = e->size;
ba11ad9a 2675 /* hint all but last page with MSG_MORE */
45bb912b
LE
2676 page_chain_for_each(page) {
2677 unsigned l = min_t(unsigned, len, PAGE_SIZE);
ba11ad9a
LE
2678 if (!_drbd_send_page(mdev, page, 0, l,
2679 page_chain_next(page) ? MSG_MORE : 0))
45bb912b
LE
2680 return 0;
2681 len -= l;
2682 }
2683 return 1;
2684}
2685
76d2e7ec
PR
2686static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2687{
2688 if (mdev->agreed_pro_version >= 95)
2689 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
76d2e7ec
PR
2690 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2691 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2692 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2693 else
721a9602 2694 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
76d2e7ec
PR
2695}
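
The protocol-version gate above exists because the FUA/FLUSH/DISCARD wire flags were only introduced with protocol 95; towards older peers only the SYNC hint is forwarded. The sketch below shows the same translation with made-up flag values; the real REQ_* and DP_* constants live in the block layer headers and in drbd.h.

#include <stdio.h>
#include <stdint.h>

/* illustrative stand-ins for the REQ_* / DP_* bits */
#define EX_REQ_SYNC     (1u << 0)
#define EX_REQ_FUA      (1u << 1)
#define EX_REQ_FLUSH    (1u << 2)
#define EX_REQ_DISCARD  (1u << 3)

#define EX_DP_RW_SYNC   (1u << 0)
#define EX_DP_FUA       (1u << 1)
#define EX_DP_FLUSH     (1u << 2)
#define EX_DP_DISCARD   (1u << 3)

static uint32_t flags_to_wire(unsigned long bi_rw, int agreed_pro_version)
{
        if (agreed_pro_version >= 95)
                return (bi_rw & EX_REQ_SYNC    ? EX_DP_RW_SYNC : 0) |
                       (bi_rw & EX_REQ_FUA     ? EX_DP_FUA     : 0) |
                       (bi_rw & EX_REQ_FLUSH   ? EX_DP_FLUSH   : 0) |
                       (bi_rw & EX_REQ_DISCARD ? EX_DP_DISCARD : 0);
        /* older peers only understand the SYNC hint */
        return bi_rw & EX_REQ_SYNC ? EX_DP_RW_SYNC : 0;
}

int main(void)
{
        unsigned long rw = EX_REQ_SYNC | EX_REQ_FUA;

        printf("agreed proto 94: dp_flags 0x%x\n", flags_to_wire(rw, 94));
        printf("agreed proto 96: dp_flags 0x%x\n", flags_to_wire(rw, 96));
        return 0;
}
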
2696
b411b363
PR
2697/* Used to send write requests
2698 * R_PRIMARY -> Peer (P_DATA)
2699 */
2700int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2701{
2702 int ok = 1;
2703 struct p_data p;
2704 unsigned int dp_flags = 0;
2705 void *dgb;
2706 int dgs;
2707
2708 if (!drbd_get_data_sock(mdev))
2709 return 0;
2710
2711 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2712 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2713
d5373389 2714 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
ca9bc12b 2715 p.head.h80.magic = cpu_to_be32(DRBD_MAGIC);
0b70a13d
PR
2716 p.head.h80.command = cpu_to_be16(P_DATA);
2717 p.head.h80.length =
2718 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2719 } else {
ca9bc12b 2720 p.head.h95.magic = cpu_to_be16(DRBD_MAGIC_BIG);
0b70a13d
PR
2721 p.head.h95.command = cpu_to_be16(P_DATA);
2722 p.head.h95.length =
2723 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2724 }
b411b363
PR
2725
2726 p.sector = cpu_to_be64(req->sector);
2727 p.block_id = (unsigned long)req;
2728 p.seq_num = cpu_to_be32(req->seq_num =
2729 atomic_add_return(1, &mdev->packet_seq));
b411b363 2730
76d2e7ec
PR
2731 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2732
b411b363
PR
2733 if (mdev->state.conn >= C_SYNC_SOURCE &&
2734 mdev->state.conn <= C_PAUSED_SYNC_T)
2735 dp_flags |= DP_MAY_SET_IN_SYNC;
2736
2737 p.dp_flags = cpu_to_be32(dp_flags);
b411b363
PR
2738 set_bit(UNPLUG_REMOTE, &mdev->flags);
2739 ok = (sizeof(p) ==
ba11ad9a 2740 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
b411b363
PR
2741 if (ok && dgs) {
2742 dgb = mdev->int_dig_out;
45bb912b 2743 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
cab2f74b 2744 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
b411b363
PR
2745 }
2746 if (ok) {
470be44a
LE
2747 /* For protocol A, we have to memcpy the payload into
2748 * socket buffers, as we may complete right away
2749 * as soon as we handed it over to tcp, at which point the data
2750 * pages may become invalid.
2751 *
2752 * For data-integrity enabled, we copy it as well, so we can be
2753 * sure that even if the bio pages may still be modified, it
2754 * won't change the data on the wire, thus if the digest checks
2755 * out ok after sending on this side, but does not fit on the
2756 * receiving side, we sure have detected corruption elsewhere.
2757 */
2758 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
b411b363
PR
2759 ok = _drbd_send_bio(mdev, req->master_bio);
2760 else
2761 ok = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
2762
2763 /* double check digest, sometimes buffers have been modified in flight. */
2764 if (dgs > 0 && dgs <= 64) {
24c4830c 2765 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
2766 * currently supported in kernel crypto. */
2767 unsigned char digest[64];
2768 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2769 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2770 dev_warn(DEV,
2771 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2772 (unsigned long long)req->sector, req->size);
2773 }
2774 } /* else if (dgs > 64) {
2775 ... Be noisy about digest too large ...
2776 } */
b411b363
PR
2777 }
2778
2779 drbd_put_data_sock(mdev);
bd26bfc5 2780
b411b363
PR
2781 return ok;
2782}
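
The digest double-check at the end of drbd_send_dblock() exists because the bio pages may legitimately change while they are being sent (think of mmap'ed file data being rewritten); comparing a digest taken before and after the send tells us whether a checksum complaint from the peer was caused by the upper layers, not by the network. The shape of that check, reduced to a userspace sketch with a toy checksum standing in for the configured crypto hash:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* toy stand-in for drbd_csum_bio(); any stable digest works for this check */
static uint32_t toy_csum(const unsigned char *buf, size_t len)
{
        uint32_t h = 2166136261u;               /* FNV-1a, for illustration only */

        while (len--)
                h = (h ^ *buf++) * 16777619u;
        return h;
}

int main(void)
{
        unsigned char page[4096];
        uint32_t before, after;

        memset(page, 0xaa, sizeof(page));
        before = toy_csum(page, sizeof(page));

        /* ... buffer handed to the network stack here; upper layers may
         * still be writing into it ... */
        page[100] ^= 1;                         /* simulate a modification in flight */

        after = toy_csum(page, sizeof(page));
        if (before != after)
                printf("Digest mismatch, buffer modified by upper layers during write\n");
        return 0;
}
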
2783
2784/* answer packet, used to send data back for read requests:
2785 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2786 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2787 */
2788int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2789 struct drbd_epoch_entry *e)
2790{
2791 int ok;
2792 struct p_data p;
2793 void *dgb;
2794 int dgs;
2795
2796 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2797 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2798
d5373389 2799 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
ca9bc12b 2800 p.head.h80.magic = cpu_to_be32(DRBD_MAGIC);
0b70a13d
PR
2801 p.head.h80.command = cpu_to_be16(cmd);
2802 p.head.h80.length =
2803 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2804 } else {
ca9bc12b 2805 p.head.h95.magic = cpu_to_be16(DRBD_MAGIC_BIG);
0b70a13d
PR
2806 p.head.h95.command = cpu_to_be16(cmd);
2807 p.head.h95.length =
2808 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2809 }
b411b363
PR
2810
2811 p.sector = cpu_to_be64(e->sector);
2812 p.block_id = e->block_id;
2813 /* p.seq_num = 0; No sequence numbers here.. */
2814
2815 /* Only called by our kernel thread.
2816 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2817 * in response to admin command or module unload.
2818 */
2819 if (!drbd_get_data_sock(mdev))
2820 return 0;
2821
0b70a13d 2822 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
b411b363
PR
2823 if (ok && dgs) {
2824 dgb = mdev->int_dig_out;
45bb912b 2825 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
cab2f74b 2826 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
b411b363
PR
2827 }
2828 if (ok)
45bb912b 2829 ok = _drbd_send_zc_ee(mdev, e);
b411b363
PR
2830
2831 drbd_put_data_sock(mdev);
bd26bfc5 2832
b411b363
PR
2833 return ok;
2834}
2835
73a01a18
PR
2836int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2837{
2838 struct p_block_desc p;
2839
2840 p.sector = cpu_to_be64(req->sector);
2841 p.blksize = cpu_to_be32(req->size);
2842
2843 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2844}
2845
b411b363
PR
2846/*
2847 drbd_send distinguishes two cases:
2848
2849 Packets sent via the data socket "sock"
2850 and packets sent via the meta data socket "msock"
2851
2852 sock msock
2853 -----------------+-------------------------+------------------------------
2854 timeout conf.timeout / 2 conf.timeout / 2
2855 timeout action send a ping via msock Abort communication
2856 and close all sockets
2857*/
2858
2859/*
2860 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2861 */
2862int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2863 void *buf, size_t size, unsigned msg_flags)
2864{
2865 struct kvec iov;
2866 struct msghdr msg;
2867 int rv, sent = 0;
2868
2869 if (!sock)
2870 return -1000;
2871
2872 /* THINK if (signal_pending) return ... ? */
2873
2874 iov.iov_base = buf;
2875 iov.iov_len = size;
2876
2877 msg.msg_name = NULL;
2878 msg.msg_namelen = 0;
2879 msg.msg_control = NULL;
2880 msg.msg_controllen = 0;
2881 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2882
2883 if (sock == mdev->data.socket) {
2884 mdev->ko_count = mdev->net_conf->ko_count;
2885 drbd_update_congested(mdev);
2886 }
2887 do {
2888 /* STRANGE
2889 * tcp_sendmsg does _not_ use its size parameter at all ?
2890 *
2891 * -EAGAIN on timeout, -EINTR on signal.
2892 */
2893/* THINK
2894 * do we need to block DRBD_SIG if sock == &meta.socket ??
2895 * otherwise wake_asender() might interrupt some send_*Ack !
2896 */
2897 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2898 if (rv == -EAGAIN) {
2899 if (we_should_drop_the_connection(mdev, sock))
2900 break;
2901 else
2902 continue;
2903 }
2904 D_ASSERT(rv != 0);
2905 if (rv == -EINTR) {
2906 flush_signals(current);
2907 rv = 0;
2908 }
2909 if (rv < 0)
2910 break;
2911 sent += rv;
2912 iov.iov_base += rv;
2913 iov.iov_len -= rv;
2914 } while (sent < size);
2915
2916 if (sock == mdev->data.socket)
2917 clear_bit(NET_CONGESTED, &mdev->flags);
2918
2919 if (rv <= 0) {
2920 if (rv != -EAGAIN) {
2921 dev_err(DEV, "%s_sendmsg returned %d\n",
2922 sock == mdev->meta.socket ? "msock" : "sock",
2923 rv);
2924 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2925 } else
2926 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2927 }
2928
2929 return sent;
2930}
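
Stripped of the congestion bookkeeping, drbd_send() is a "send everything or report how far we got" loop with two special cases: -EINTR is swallowed after flushing signals, and -EAGAIN (the send timeout firing) becomes a policy question answered by we_should_drop_the_connection(). A userspace analogue over plain sockets; the callback parameter and the socketpair demo are just for illustration:

#include <errno.h>
#include <stdio.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>

/* keep sending until the whole buffer is out, or until the caller's
 * give-up callback says the peer is probably dead */
static ssize_t send_all(int fd, const void *buf, size_t size,
                        int (*should_give_up)(void))
{
        const char *p = buf;
        size_t sent = 0;

        while (sent < size) {
                ssize_t rv = send(fd, p + sent, size - sent, MSG_NOSIGNAL);

                if (rv == -1 && errno == EINTR)
                        continue;               /* like flush_signals(); retry */
                if (rv == -1 && errno == EAGAIN) {
                        if (should_give_up && should_give_up())
                                break;          /* connection considered dead */
                        continue;               /* else keep trying */
                }
                if (rv <= 0)
                        break;                  /* hard error; report partial count */
                sent += rv;
        }
        return sent;
}

int main(void)
{
        int sv[2];
        const char msg[] = "hello";

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv))
                return 1;
        printf("sent %zd of %zu bytes\n",
               send_all(sv[0], msg, sizeof(msg), NULL), sizeof(msg));
        close(sv[0]);
        close(sv[1]);
        return 0;
}
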
2931
2932static int drbd_open(struct block_device *bdev, fmode_t mode)
2933{
2934 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2935 unsigned long flags;
2936 int rv = 0;
2937
2a48fc0a 2938 mutex_lock(&drbd_main_mutex);
b411b363
PR
2939 spin_lock_irqsave(&mdev->req_lock, flags);
2940 /* to have a stable mdev->state.role
2941 * and no race with updating open_cnt */
2942
2943 if (mdev->state.role != R_PRIMARY) {
2944 if (mode & FMODE_WRITE)
2945 rv = -EROFS;
2946 else if (!allow_oos)
2947 rv = -EMEDIUMTYPE;
2948 }
2949
2950 if (!rv)
2951 mdev->open_cnt++;
2952 spin_unlock_irqrestore(&mdev->req_lock, flags);
2a48fc0a 2953 mutex_unlock(&drbd_main_mutex);
b411b363
PR
2954
2955 return rv;
2956}
2957
2958static int drbd_release(struct gendisk *gd, fmode_t mode)
2959{
2960 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 2961 mutex_lock(&drbd_main_mutex);
b411b363 2962 mdev->open_cnt--;
2a48fc0a 2963 mutex_unlock(&drbd_main_mutex);
b411b363
PR
2964 return 0;
2965}
2966
b411b363
PR
2967static void drbd_set_defaults(struct drbd_conf *mdev)
2968{
85f4cc17
PR
2969 /* This way we get a compile error when sync_conf grows,
2970 and we forgot to initialize it here */
2971 mdev->sync_conf = (struct syncer_conf) {
2972 /* .rate = */ DRBD_RATE_DEF,
2973 /* .after = */ DRBD_AFTER_DEF,
2974 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
85f4cc17
PR
2975 /* .verify_alg = */ {}, 0,
2976 /* .cpu_mask = */ {}, 0,
2977 /* .csums_alg = */ {}, 0,
e756414f 2978 /* .use_rle = */ 0,
9a31d716
PR
2979 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2980 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2981 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2982 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
0f0601f4
LE
2983 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2984 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
85f4cc17
PR
2985 };
2986
2987 /* Have to use that way, because the layout differs between
2988 big endian and little endian */
b411b363
PR
2989 mdev->state = (union drbd_state) {
2990 { .role = R_SECONDARY,
2991 .peer = R_UNKNOWN,
2992 .conn = C_STANDALONE,
2993 .disk = D_DISKLESS,
2994 .pdsk = D_UNKNOWN,
fb22c402
PR
2995 .susp = 0,
2996 .susp_nod = 0,
2997 .susp_fen = 0
b411b363
PR
2998 } };
2999}
3000
3001void drbd_init_set_defaults(struct drbd_conf *mdev)
3002{
3003 /* the memset(,0,) did most of this.
3004 * note: only assignments, no allocation in here */
3005
3006 drbd_set_defaults(mdev);
3007
b411b363
PR
3008 atomic_set(&mdev->ap_bio_cnt, 0);
3009 atomic_set(&mdev->ap_pending_cnt, 0);
3010 atomic_set(&mdev->rs_pending_cnt, 0);
3011 atomic_set(&mdev->unacked_cnt, 0);
3012 atomic_set(&mdev->local_cnt, 0);
3013 atomic_set(&mdev->net_cnt, 0);
3014 atomic_set(&mdev->packet_seq, 0);
3015 atomic_set(&mdev->pp_in_use, 0);
435f0740 3016 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 3017 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 3018 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 3019 atomic_set(&mdev->ap_in_flight, 0);
b411b363
PR
3020
3021 mutex_init(&mdev->md_io_mutex);
3022 mutex_init(&mdev->data.mutex);
3023 mutex_init(&mdev->meta.mutex);
3024 sema_init(&mdev->data.work.s, 0);
3025 sema_init(&mdev->meta.work.s, 0);
3026 mutex_init(&mdev->state_mutex);
3027
3028 spin_lock_init(&mdev->data.work.q_lock);
3029 spin_lock_init(&mdev->meta.work.q_lock);
3030
3031 spin_lock_init(&mdev->al_lock);
3032 spin_lock_init(&mdev->req_lock);
3033 spin_lock_init(&mdev->peer_seq_lock);
3034 spin_lock_init(&mdev->epoch_lock);
3035
3036 INIT_LIST_HEAD(&mdev->active_ee);
3037 INIT_LIST_HEAD(&mdev->sync_ee);
3038 INIT_LIST_HEAD(&mdev->done_ee);
3039 INIT_LIST_HEAD(&mdev->read_ee);
3040 INIT_LIST_HEAD(&mdev->net_ee);
3041 INIT_LIST_HEAD(&mdev->resync_reads);
3042 INIT_LIST_HEAD(&mdev->data.work.q);
3043 INIT_LIST_HEAD(&mdev->meta.work.q);
3044 INIT_LIST_HEAD(&mdev->resync_work.list);
3045 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 3046 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 3047 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 3048 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 3049 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 3050
794abb75 3051 mdev->resync_work.cb = w_resync_timer;
b411b363 3052 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 3053 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
3054 mdev->md_sync_work.cb = w_md_sync;
3055 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 3056 mdev->start_resync_work.cb = w_start_resync;
b411b363
PR
3057 init_timer(&mdev->resync_timer);
3058 init_timer(&mdev->md_sync_timer);
370a43e7 3059 init_timer(&mdev->start_resync_timer);
7fde2be9 3060 init_timer(&mdev->request_timer);
b411b363
PR
3061 mdev->resync_timer.function = resync_timer_fn;
3062 mdev->resync_timer.data = (unsigned long) mdev;
3063 mdev->md_sync_timer.function = md_sync_timer_fn;
3064 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
3065 mdev->start_resync_timer.function = start_resync_timer_fn;
3066 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
3067 mdev->request_timer.function = request_timer_fn;
3068 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
3069
3070 init_waitqueue_head(&mdev->misc_wait);
3071 init_waitqueue_head(&mdev->state_wait);
84dfb9f5 3072 init_waitqueue_head(&mdev->net_cnt_wait);
b411b363
PR
3073 init_waitqueue_head(&mdev->ee_wait);
3074 init_waitqueue_head(&mdev->al_wait);
3075 init_waitqueue_head(&mdev->seq_wait);
3076
3077 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3078 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3079 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3080
3081 mdev->agreed_pro_version = PRO_VERSION_MAX;
2451fc3b 3082 mdev->write_ordering = WO_bdev_flush;
b411b363 3083 mdev->resync_wenr = LC_FREE;
99432fcc
PR
3084 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3085 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
3086}
3087
3088void drbd_mdev_cleanup(struct drbd_conf *mdev)
3089{
1d7734a0 3090 int i;
b411b363
PR
3091 if (mdev->receiver.t_state != None)
3092 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3093 mdev->receiver.t_state);
3094
3095 /* no need to lock it, I'm the only thread alive */
3096 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3097 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3098 mdev->al_writ_cnt =
3099 mdev->bm_writ_cnt =
3100 mdev->read_cnt =
3101 mdev->recv_cnt =
3102 mdev->send_cnt =
3103 mdev->writ_cnt =
3104 mdev->p_size =
3105 mdev->rs_start =
3106 mdev->rs_total =
1d7734a0
LE
3107 mdev->rs_failed = 0;
3108 mdev->rs_last_events = 0;
0f0601f4 3109 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
3110 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3111 mdev->rs_mark_left[i] = 0;
3112 mdev->rs_mark_time[i] = 0;
3113 }
b411b363
PR
3114 D_ASSERT(mdev->net_conf == NULL);
3115
3116 drbd_set_my_capacity(mdev, 0);
3117 if (mdev->bitmap) {
3118 /* maybe never allocated. */
02d9a94b 3119 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
3120 drbd_bm_cleanup(mdev);
3121 }
3122
3123 drbd_free_resources(mdev);
0778286a 3124 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363
PR
3125
3126 /*
3127 * currently we drbd_init_ee only on module load, so
3128 * we may do drbd_release_ee only on module unload!
3129 */
3130 D_ASSERT(list_empty(&mdev->active_ee));
3131 D_ASSERT(list_empty(&mdev->sync_ee));
3132 D_ASSERT(list_empty(&mdev->done_ee));
3133 D_ASSERT(list_empty(&mdev->read_ee));
3134 D_ASSERT(list_empty(&mdev->net_ee));
3135 D_ASSERT(list_empty(&mdev->resync_reads));
3136 D_ASSERT(list_empty(&mdev->data.work.q));
3137 D_ASSERT(list_empty(&mdev->meta.work.q));
3138 D_ASSERT(list_empty(&mdev->resync_work.list));
3139 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 3140 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
3141
3142 drbd_set_defaults(mdev);
b411b363
PR
3143}
3144
3145
3146static void drbd_destroy_mempools(void)
3147{
3148 struct page *page;
3149
3150 while (drbd_pp_pool) {
3151 page = drbd_pp_pool;
3152 drbd_pp_pool = (struct page *)page_private(page);
3153 __free_page(page);
3154 drbd_pp_vacant--;
3155 }
3156
3157 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3158
3159 if (drbd_ee_mempool)
3160 mempool_destroy(drbd_ee_mempool);
3161 if (drbd_request_mempool)
3162 mempool_destroy(drbd_request_mempool);
3163 if (drbd_ee_cache)
3164 kmem_cache_destroy(drbd_ee_cache);
3165 if (drbd_request_cache)
3166 kmem_cache_destroy(drbd_request_cache);
3167 if (drbd_bm_ext_cache)
3168 kmem_cache_destroy(drbd_bm_ext_cache);
3169 if (drbd_al_ext_cache)
3170 kmem_cache_destroy(drbd_al_ext_cache);
3171
3172 drbd_ee_mempool = NULL;
3173 drbd_request_mempool = NULL;
3174 drbd_ee_cache = NULL;
3175 drbd_request_cache = NULL;
3176 drbd_bm_ext_cache = NULL;
3177 drbd_al_ext_cache = NULL;
3178
3179 return;
3180}
3181
3182static int drbd_create_mempools(void)
3183{
3184 struct page *page;
1816a2b4 3185 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
3186 int i;
3187
3188 /* prepare our caches and mempools */
3189 drbd_request_mempool = NULL;
3190 drbd_ee_cache = NULL;
3191 drbd_request_cache = NULL;
3192 drbd_bm_ext_cache = NULL;
3193 drbd_al_ext_cache = NULL;
3194 drbd_pp_pool = NULL;
3195
3196 /* caches */
3197 drbd_request_cache = kmem_cache_create(
3198 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3199 if (drbd_request_cache == NULL)
3200 goto Enomem;
3201
3202 drbd_ee_cache = kmem_cache_create(
3203 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3204 if (drbd_ee_cache == NULL)
3205 goto Enomem;
3206
3207 drbd_bm_ext_cache = kmem_cache_create(
3208 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3209 if (drbd_bm_ext_cache == NULL)
3210 goto Enomem;
3211
3212 drbd_al_ext_cache = kmem_cache_create(
3213 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3214 if (drbd_al_ext_cache == NULL)
3215 goto Enomem;
3216
3217 /* mempools */
3218 drbd_request_mempool = mempool_create(number,
3219 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3220 if (drbd_request_mempool == NULL)
3221 goto Enomem;
3222
3223 drbd_ee_mempool = mempool_create(number,
3224 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 3225 if (drbd_ee_mempool == NULL)
b411b363
PR
3226 goto Enomem;
3227
3228 /* drbd's page pool */
3229 spin_lock_init(&drbd_pp_lock);
3230
3231 for (i = 0; i < number; i++) {
3232 page = alloc_page(GFP_HIGHUSER);
3233 if (!page)
3234 goto Enomem;
3235 set_page_private(page, (unsigned long)drbd_pp_pool);
3236 drbd_pp_pool = page;
3237 }
3238 drbd_pp_vacant = number;
3239
3240 return 0;
3241
3242Enomem:
3243 drbd_destroy_mempools(); /* in case we allocated some */
3244 return -ENOMEM;
3245}
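
The page pool set up above is a plain free list: a fixed number of pages is allocated up front and linked together, with the link stored in the page itself via page_private(), so getting and putting a page on the IO path is a pointer swap rather than a trip to the allocator. A userspace sketch of the same idea, storing the link in the buffer's first bytes; the names and sizes here are illustrative.

#include <stdio.h>
#include <stdlib.h>

#define BUF_SZ 4096

static void *pool;      /* top of the free list */
static int vacant;

static void pool_put(void *buf)
{
        *(void **)buf = pool;   /* reuse the buffer itself as the "next" link */
        pool = buf;
        vacant++;
}

static void *pool_get(void)
{
        void *buf = pool;

        if (buf) {
                pool = *(void **)buf;
                vacant--;
        }
        return buf;
}

int main(void)
{
        void *p;

        for (int i = 0; i < 8; i++)     /* preallocate, like drbd_create_mempools() */
                pool_put(malloc(BUF_SZ));

        p = pool_get();
        printf("got one buffer, %d still vacant\n", vacant);
        pool_put(p);

        while ((p = pool_get()))        /* teardown, like drbd_destroy_mempools() */
                free(p);
        return 0;
}
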
3246
3247static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3248 void *unused)
3249{
3250 /* just so we have it. you never know what interesting things we
3251 * might want to do here some day...
3252 */
3253
3254 return NOTIFY_DONE;
3255}
3256
3257static struct notifier_block drbd_notifier = {
3258 .notifier_call = drbd_notify_sys,
3259};
3260
3261static void drbd_release_ee_lists(struct drbd_conf *mdev)
3262{
3263 int rr;
3264
3265 rr = drbd_release_ee(mdev, &mdev->active_ee);
3266 if (rr)
3267 dev_err(DEV, "%d EEs in active list found!\n", rr);
3268
3269 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3270 if (rr)
3271 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3272
3273 rr = drbd_release_ee(mdev, &mdev->read_ee);
3274 if (rr)
3275 dev_err(DEV, "%d EEs in read list found!\n", rr);
3276
3277 rr = drbd_release_ee(mdev, &mdev->done_ee);
3278 if (rr)
3279 dev_err(DEV, "%d EEs in done list found!\n", rr);
3280
3281 rr = drbd_release_ee(mdev, &mdev->net_ee);
3282 if (rr)
3283 dev_err(DEV, "%d EEs in net list found!\n", rr);
3284}
3285
3286/* caution. no locking.
3287 * currently only used from module cleanup code. */
3288static void drbd_delete_device(unsigned int minor)
3289{
3290 struct drbd_conf *mdev = minor_to_mdev(minor);
3291
3292 if (!mdev)
3293 return;
3294
3295 /* paranoia asserts */
3296 if (mdev->open_cnt != 0)
3297 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3298 __FILE__ , __LINE__);
3299
3300 ERR_IF (!list_empty(&mdev->data.work.q)) {
3301 struct list_head *lp;
3302 list_for_each(lp, &mdev->data.work.q) {
3303 dev_err(DEV, "lp = %p\n", lp);
3304 }
3305 };
3306 /* end paranoia asserts */
3307
3308 del_gendisk(mdev->vdisk);
3309
3310 /* cleanup stuff that may have been allocated during
3311 * device (re-)configuration or state changes */
3312
3313 if (mdev->this_bdev)
3314 bdput(mdev->this_bdev);
3315
3316 drbd_free_resources(mdev);
3317
3318 drbd_release_ee_lists(mdev);
3319
24c4830c 3320 /* should be freed on disconnect? */
b411b363
PR
3321 kfree(mdev->ee_hash);
3322 /*
3323 mdev->ee_hash_s = 0;
3324 mdev->ee_hash = NULL;
3325 */
3326
3327 lc_destroy(mdev->act_log);
3328 lc_destroy(mdev->resync);
3329
3330 kfree(mdev->p_uuid);
3331 /* mdev->p_uuid = NULL; */
3332
3333 kfree(mdev->int_dig_out);
3334 kfree(mdev->int_dig_in);
3335 kfree(mdev->int_dig_vv);
3336
3337 /* cleanup the rest that has been
3338 * allocated from drbd_new_device
3339 * and actually free the mdev itself */
3340 drbd_free_mdev(mdev);
3341}
3342
3343static void drbd_cleanup(void)
3344{
3345 unsigned int i;
3346
3347 unregister_reboot_notifier(&drbd_notifier);
3348
17a93f30
LE
3349 /* first remove proc,
 3350 * drbdsetup uses its presence to detect
 3351 * whether DRBD is loaded.
 3352 * If we got stuck in proc removal,
3353 * but have netlink already deregistered,
3354 * some drbdsetup commands may wait forever
3355 * for an answer.
3356 */
3357 if (drbd_proc)
3358 remove_proc_entry("drbd", NULL);
3359
b411b363
PR
3360 drbd_nl_cleanup();
3361
3362 if (minor_table) {
b411b363
PR
3363 i = minor_count;
3364 while (i--)
3365 drbd_delete_device(i);
3366 drbd_destroy_mempools();
3367 }
3368
3369 kfree(minor_table);
3370
3371 unregister_blkdev(DRBD_MAJOR, "drbd");
3372
3373 printk(KERN_INFO "drbd: module cleanup done.\n");
3374}
3375
3376/**
3377 * drbd_congested() - Callback for pdflush
3378 * @congested_data: User data
3379 * @bdi_bits: Bits pdflush is currently interested in
3380 *
3381 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3382 */
3383static int drbd_congested(void *congested_data, int bdi_bits)
3384{
3385 struct drbd_conf *mdev = congested_data;
3386 struct request_queue *q;
3387 char reason = '-';
3388 int r = 0;
3389
1b881ef7 3390 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
3391 /* DRBD has frozen IO */
3392 r = bdi_bits;
3393 reason = 'd';
3394 goto out;
3395 }
3396
3397 if (get_ldev(mdev)) {
3398 q = bdev_get_queue(mdev->ldev->backing_bdev);
3399 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3400 put_ldev(mdev);
3401 if (r)
3402 reason = 'b';
3403 }
3404
3405 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3406 r |= (1 << BDI_async_congested);
3407 reason = reason == 'b' ? 'a' : 'n';
3408 }
3409
3410out:
3411 mdev->congestion_reason = reason;
3412 return r;
3413}
3414
3415struct drbd_conf *drbd_new_device(unsigned int minor)
3416{
3417 struct drbd_conf *mdev;
3418 struct gendisk *disk;
3419 struct request_queue *q;
3420
3421 /* GFP_KERNEL, we are outside of all write-out paths */
3422 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3423 if (!mdev)
3424 return NULL;
3425 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3426 goto out_no_cpumask;
3427
3428 mdev->minor = minor;
3429
3430 drbd_init_set_defaults(mdev);
3431
3432 q = blk_alloc_queue(GFP_KERNEL);
3433 if (!q)
3434 goto out_no_q;
3435 mdev->rq_queue = q;
3436 q->queuedata = mdev;
b411b363
PR
3437
3438 disk = alloc_disk(1);
3439 if (!disk)
3440 goto out_no_disk;
3441 mdev->vdisk = disk;
3442
81e84650 3443 set_disk_ro(disk, true);
b411b363
PR
3444
3445 disk->queue = q;
3446 disk->major = DRBD_MAJOR;
3447 disk->first_minor = minor;
3448 disk->fops = &drbd_ops;
3449 sprintf(disk->disk_name, "drbd%d", minor);
3450 disk->private_data = mdev;
3451
3452 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3453 /* we have no partitions. we contain only ourselves. */
3454 mdev->this_bdev->bd_contains = mdev->this_bdev;
3455
3456 q->backing_dev_info.congested_fn = drbd_congested;
3457 q->backing_dev_info.congested_data = mdev;
3458
2f58dcfc 3459 blk_queue_make_request(q, drbd_make_request);
99432fcc
PR
3460 /* Setting max_hw_sectors to the deliberately odd value of 8 KiB here
3461 triggers a max_bio_size message upon first attach or connect. */
3462 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
3463 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3464 blk_queue_merge_bvec(q, drbd_merge_bvec);
7eaceacc 3465 q->queue_lock = &mdev->req_lock;
3466
3467 mdev->md_io_page = alloc_page(GFP_KERNEL);
3468 if (!mdev->md_io_page)
3469 goto out_no_io_page;
3470
3471 if (drbd_bm_init(mdev))
3472 goto out_no_bitmap;
3473 /* no need to lock access, we are still initializing this minor device. */
3474 if (!tl_init(mdev))
3475 goto out_no_tl;
3476
3477 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3478 if (!mdev->app_reads_hash)
3479 goto out_no_app_reads;
3480
3481 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3482 if (!mdev->current_epoch)
3483 goto out_no_epoch;
3484
3485 INIT_LIST_HEAD(&mdev->current_epoch->list);
3486 mdev->epochs = 1;
3487
3488 return mdev;
3489
3490/* out_whatever_else:
3491 kfree(mdev->current_epoch); */
3492out_no_epoch:
3493 kfree(mdev->app_reads_hash);
3494out_no_app_reads:
3495 tl_cleanup(mdev);
3496out_no_tl:
3497 drbd_bm_cleanup(mdev);
3498out_no_bitmap:
3499 __free_page(mdev->md_io_page);
3500out_no_io_page:
3501 put_disk(disk);
3502out_no_disk:
3503 blk_cleanup_queue(q);
3504out_no_q:
3505 free_cpumask_var(mdev->cpu_mask);
3506out_no_cpumask:
3507 kfree(mdev);
3508 return NULL;
3509}
3510
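/*
 * Usage sketch, for illustration only: a caller would typically pair
 * drbd_new_device() with add_disk() to make the minor visible, and rely
 * on drbd_delete_device()/drbd_free_mdev() for teardown.  The helper
 * below is hypothetical.
 */
static __maybe_unused struct drbd_conf *drbd_example_create_minor(unsigned int minor)
{
	struct drbd_conf *mdev = drbd_new_device(minor);

	if (mdev)
		add_disk(mdev->vdisk);	/* announce /dev/drbd<minor> */
	return mdev;
}
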
3511/* counterpart of drbd_new_device.
3512 * last part of drbd_delete_device. */
3513void drbd_free_mdev(struct drbd_conf *mdev)
3514{
3515 kfree(mdev->current_epoch);
3516 kfree(mdev->app_reads_hash);
3517 tl_cleanup(mdev);
3518 if (mdev->bitmap) /* should no longer be there. */
3519 drbd_bm_cleanup(mdev);
3520 __free_page(mdev->md_io_page);
3521 put_disk(mdev->vdisk);
3522 blk_cleanup_queue(mdev->rq_queue);
3523 free_cpumask_var(mdev->cpu_mask);
3719094e 3524 drbd_free_tl_hash(mdev);
3525 kfree(mdev);
3526}
3527
3528
3529int __init drbd_init(void)
3530{
3531 int err;
3532
3533 if (sizeof(struct p_handshake) != 80) {
3534 printk(KERN_ERR
3535 "drbd: never change the size or layout "
3536 "of the HandShake packet.\n");
3537 return -EINVAL;
3538 }
3539
2b8a90b5 3540 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
3541 printk(KERN_ERR
3542 "drbd: invalid minor_count (%d)\n", minor_count);
3543#ifdef MODULE
3544 return -EINVAL;
3545#else
3546 minor_count = 8;
3547#endif
3548 }
3549
3550 err = drbd_nl_init();
3551 if (err)
3552 return err;
3553
3554 err = register_blkdev(DRBD_MAJOR, "drbd");
3555 if (err) {
3556 printk(KERN_ERR
3557 "drbd: unable to register block device major %d\n",
3558 DRBD_MAJOR);
3559 return err;
3560 }
3561
3562 register_reboot_notifier(&drbd_notifier);
3563
3564 /*
3565 * allocate all necessary structs
3566 */
3567 err = -ENOMEM;
3568
3569 init_waitqueue_head(&drbd_pp_wait);
3570
3571 drbd_proc = NULL; /* play safe for drbd_cleanup */
3572 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3573 GFP_KERNEL);
3574 if (!minor_table)
3575 goto Enomem;
3576
3577 err = drbd_create_mempools();
3578 if (err)
3579 goto Enomem;
3580
8c484ee4 3581 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3582 if (!drbd_proc) {
3583 printk(KERN_ERR "drbd: unable to register proc file\n");
3584 goto Enomem;
3585 }
3586
3587 rwlock_init(&global_state_lock);
3588
3589 printk(KERN_INFO "drbd: initialized. "
3590 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3591 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3592 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3593 printk(KERN_INFO "drbd: registered as block device major %d\n",
3594 DRBD_MAJOR);
3595 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3596
3597 return 0; /* Success! */
3598
3599Enomem:
3600 drbd_cleanup();
3601 if (err == -ENOMEM)
3602 /* currently always the case */
3603 printk(KERN_ERR "drbd: ran out of memory\n");
3604 else
3605 printk(KERN_ERR "drbd: initialization failure\n");
3606 return err;
3607}
3608
3609void drbd_free_bc(struct drbd_backing_dev *ldev)
3610{
3611 if (ldev == NULL)
3612 return;
3613
3614 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3615 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3616
3617 kfree(ldev);
3618}
3619
3620void drbd_free_sock(struct drbd_conf *mdev)
3621{
3622 if (mdev->data.socket) {
4589d7f8 3623 mutex_lock(&mdev->data.mutex);
3624 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3625 sock_release(mdev->data.socket);
3626 mdev->data.socket = NULL;
4589d7f8 3627 mutex_unlock(&mdev->data.mutex);
3628 }
3629 if (mdev->meta.socket) {
4589d7f8 3630 mutex_lock(&mdev->meta.mutex);
3631 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3632 sock_release(mdev->meta.socket);
3633 mdev->meta.socket = NULL;
4589d7f8 3634 mutex_unlock(&mdev->meta.mutex);
3635 }
3636}
3637
3638
3639void drbd_free_resources(struct drbd_conf *mdev)
3640{
3641 crypto_free_hash(mdev->csums_tfm);
3642 mdev->csums_tfm = NULL;
3643 crypto_free_hash(mdev->verify_tfm);
3644 mdev->verify_tfm = NULL;
3645 crypto_free_hash(mdev->cram_hmac_tfm);
3646 mdev->cram_hmac_tfm = NULL;
3647 crypto_free_hash(mdev->integrity_w_tfm);
3648 mdev->integrity_w_tfm = NULL;
3649 crypto_free_hash(mdev->integrity_r_tfm);
3650 mdev->integrity_r_tfm = NULL;
3651
3652 drbd_free_sock(mdev);
3653
3654 __no_warn(local,
3655 drbd_free_bc(mdev->ldev);
3656 mdev->ldev = NULL;);
3657}
3658
3659/* meta data management */
3660
3661struct meta_data_on_disk {
3662 u64 la_size; /* last agreed size. */
3663 u64 uuid[UI_SIZE]; /* UUIDs. */
3664 u64 device_uuid;
3665 u64 reserved_u64_1;
3666 u32 flags; /* MDF */
3667 u32 magic;
3668 u32 md_size_sect;
3669 u32 al_offset; /* offset to this block */
3670 u32 al_nr_extents; /* important for restoring the AL */
3671 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3672 u32 bm_offset; /* offset to the bitmap, from here */
3673 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3674 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3675 u32 reserved_u32[3];
3676
3677} __packed;
3678
3679/**
3680 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3681 * @mdev: DRBD device.
3682 */
3683void drbd_md_sync(struct drbd_conf *mdev)
3684{
3685 struct meta_data_on_disk *buffer;
3686 sector_t sector;
3687 int i;
3688
3689 del_timer(&mdev->md_sync_timer);
3690 /* timer may be rearmed by drbd_md_mark_dirty() now. */
3691 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3692 return;
3693
3694 /* We use D_FAILED here, not D_ATTACHING, because we try to write
3695 * metadata even if we detach due to a disk failure! */
3696 if (!get_ldev_if_state(mdev, D_FAILED))
3697 return;
3698
3699 mutex_lock(&mdev->md_io_mutex);
3700 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3701 memset(buffer, 0, 512);
3702
3703 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3704 for (i = UI_CURRENT; i < UI_SIZE; i++)
3705 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3706 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3707 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3708
3709 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3710 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3711 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3712 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3713 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3714
3715 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 3716 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
3717
3718 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3719 sector = mdev->ldev->md.md_offset;
3720
3f3a9b84 3721 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3722 /* this was a try anyway ... */
3723 dev_err(DEV, "meta data update failed!\n");
81e84650 3724 drbd_chk_io_error(mdev, 1, true);
3725 }
3726
3727 /* Update mdev->ldev->md.la_size_sect,
3728 * since we just wrote it out to the meta data. */
3729 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3730
3731 mutex_unlock(&mdev->md_io_mutex);
3732 put_ldev(mdev);
3733}
3734
3735/**
3736 * drbd_md_read() - Reads in the meta data super block
3737 * @mdev: DRBD device.
3738 * @bdev: Device from which the meta data should be read in.
3739 *
116676ca 3740 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3741 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3742 */
3743int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3744{
3745 struct meta_data_on_disk *buffer;
3746 int i, rv = NO_ERROR;
3747
3748 if (!get_ldev_if_state(mdev, D_ATTACHING))
3749 return ERR_IO_MD_DISK;
3750
3751 mutex_lock(&mdev->md_io_mutex);
3752 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3753
3754 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 3755 /* NOTE: can't do normal error processing here as this is
3756 called BEFORE the disk is attached */
3757 dev_err(DEV, "Error while reading metadata.\n");
3758 rv = ERR_IO_MD_DISK;
3759 goto err;
3760 }
3761
e7fad8af 3762 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
3763 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3764 rv = ERR_MD_INVALID;
3765 goto err;
3766 }
3767 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3768 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3769 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3770 rv = ERR_MD_INVALID;
3771 goto err;
3772 }
3773 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3774 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3775 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3776 rv = ERR_MD_INVALID;
3777 goto err;
3778 }
3779 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3780 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3781 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3782 rv = ERR_MD_INVALID;
3783 goto err;
3784 }
3785
3786 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3787 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3788 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3789 rv = ERR_MD_INVALID;
3790 goto err;
3791 }
3792
3793 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3794 for (i = UI_CURRENT; i < UI_SIZE; i++)
3795 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3796 bdev->md.flags = be32_to_cpu(buffer->flags);
3797 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3798 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3799
3800 spin_lock_irq(&mdev->req_lock);
3801 if (mdev->state.conn < C_CONNECTED) {
3802 int peer;
3803 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3804 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3805 mdev->peer_max_bio_size = peer;
3806 }
3807 spin_unlock_irq(&mdev->req_lock);
3808
3809 if (mdev->sync_conf.al_extents < 7)
3810 mdev->sync_conf.al_extents = 127;
3811
3812 err:
3813 mutex_unlock(&mdev->md_io_mutex);
3814 put_ldev(mdev);
3815
3816 return rv;
3817}
3818
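/*
 * Usage sketch, for illustration only: reading the super block while
 * attaching a backing device.  The helper and its error handling are
 * hypothetical and abbreviated.
 */
static __maybe_unused int drbd_example_read_md(struct drbd_conf *mdev,
					       struct drbd_backing_dev *nbc)
{
	int rv = drbd_md_read(mdev, nbc);

	if (rv != NO_ERROR)
		dev_err(DEV, "reading meta data failed (ret=%d)\n", rv);
	return rv;
}
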
3819/**
3820 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3821 * @mdev: DRBD device.
3822 *
3823 * Call this function if you change anything that should be written to
3824 * the meta-data super block. This function sets MD_DIRTY, and starts a
3825 * timer that ensures drbd_md_sync() gets called within five seconds.
3826 */
ca0e6098 3827#ifdef DEBUG
3828void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3829{
3830 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3831 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3832 mdev->last_md_mark_dirty.line = line;
3833 mdev->last_md_mark_dirty.func = func;
3834 }
3835}
3836#else
3837void drbd_md_mark_dirty(struct drbd_conf *mdev)
3838{
ee15b038 3839 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 3840 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 3841}
ee15b038 3842#endif
3843
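/*
 * Usage sketch, for illustration only: the intended lifecycle of the
 * dirty flag.  Whoever changes persistent state marks the super block
 * dirty; it is written out either explicitly via drbd_md_sync() or, at
 * the latest, when the md_sync_timer fires.  The helper and the uuid
 * value below are made up.
 */
static __maybe_unused void drbd_example_touch_md(struct drbd_conf *mdev)
{
	if (!get_ldev(mdev))	/* need a stable reference on mdev->ldev */
		return;
	mdev->ldev->md.device_uuid = 0xabcdefULL;	/* example change only */
	drbd_md_mark_dirty(mdev);	/* sets MD_DIRTY, arms md_sync_timer */
	drbd_md_sync(mdev);		/* ... or write it out right away */
	put_ldev(mdev);
}
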
3844static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3845{
3846 int i;
3847
62b0da3a 3848 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 3849 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3850}
3851
3852void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3853{
3854 if (idx == UI_CURRENT) {
3855 if (mdev->state.role == R_PRIMARY)
3856 val |= 1;
3857 else
3858 val &= ~((u64)1);
3859
3860 drbd_set_ed_uuid(mdev, val);
3861 }
3862
3863 mdev->ldev->md.uuid[idx] = val;
3864 drbd_md_mark_dirty(mdev);
3865}
3866
3867
3868void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3869{
3870 if (mdev->ldev->md.uuid[idx]) {
3871 drbd_uuid_move_history(mdev);
3872 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3873 }
3874 _drbd_uuid_set(mdev, idx, val);
3875}
3876
3877/**
3878 * drbd_uuid_new_current() - Creates a new current UUID
3879 * @mdev: DRBD device.
3880 *
3881 * Creates a new current UUID, and rotates the old current UUID into
3882 * the bitmap slot. Causes an incremental resync upon next connect.
3883 */
3884void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3885{
3886 u64 val;
3887 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3888
3889 if (bm_uuid)
3890 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 3891
b411b363 3892 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3893
3894 get_random_bytes(&val, sizeof(u64));
3895 _drbd_uuid_set(mdev, UI_CURRENT, val);
62b0da3a 3896 drbd_print_uuids(mdev, "new current UUID");
3897 /* get it to stable storage _now_ */
3898 drbd_md_sync(mdev);
3899}
3900
3901void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3902{
3903 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3904 return;
3905
3906 if (val == 0) {
3907 drbd_uuid_move_history(mdev);
3908 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3909 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 3910 } else {
3911 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3912 if (bm_uuid)
3913 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 3914
62b0da3a 3915 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
3916 }
3917 drbd_md_mark_dirty(mdev);
3918}
3919
3920/**
3921 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3922 * @mdev: DRBD device.
3923 *
3924 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3925 */
3926int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3927{
3928 int rv = -EIO;
3929
3930 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3931 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3932 drbd_md_sync(mdev);
3933 drbd_bm_set_all(mdev);
3934
3935 rv = drbd_bm_write(mdev);
3936
3937 if (!rv) {
3938 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3939 drbd_md_sync(mdev);
3940 }
3941
3942 put_ldev(mdev);
3943 }
3944
3945 return rv;
3946}
3947
3948/**
3949 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3950 * @mdev: DRBD device.
3951 *
3952 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3953 */
3954int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3955{
3956 int rv = -EIO;
3957
0778286a 3958 drbd_resume_al(mdev);
3959 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3960 drbd_bm_clear_all(mdev);
3961 rv = drbd_bm_write(mdev);
3962 put_ldev(mdev);
3963 }
3964
3965 return rv;
3966}
3967
3968static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3969{
3970 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
02851e9f 3971 int rv = -EIO;
3972
3973 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3974
02851e9f 3975 if (get_ldev(mdev)) {
20ceb2b2 3976 drbd_bm_lock(mdev, work->why, work->flags);
3977 rv = work->io_fn(mdev);
3978 drbd_bm_unlock(mdev);
3979 put_ldev(mdev);
3980 }
3981
3982 clear_bit(BITMAP_IO, &mdev->flags);
127b3178 3983 smp_mb__after_clear_bit();
3984 wake_up(&mdev->misc_wait);
3985
3986 if (work->done)
3987 work->done(mdev, rv);
3988
3989 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3990 work->why = NULL;
20ceb2b2 3991 work->flags = 0;
3992
3993 return 1;
3994}
3995
3996void drbd_ldev_destroy(struct drbd_conf *mdev)
3997{
3998 lc_destroy(mdev->resync);
3999 mdev->resync = NULL;
4000 lc_destroy(mdev->act_log);
4001 mdev->act_log = NULL;
4002 __no_warn(local,
4003 drbd_free_bc(mdev->ldev);
4004 mdev->ldev = NULL;);
4005
4006 if (mdev->md_io_tmpp) {
4007 __free_page(mdev->md_io_tmpp);
4008 mdev->md_io_tmpp = NULL;
4009 }
4010 clear_bit(GO_DISKLESS, &mdev->flags);
4011}
4012
4013static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4014{
4015 D_ASSERT(mdev->state.disk == D_FAILED);
4016 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4017 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
4018 * the protected members anymore, though, so once put_ldev reaches zero
4019 * again, it will be safe to free them. */
e9e6f3ec 4020 drbd_force_state(mdev, NS(disk, D_DISKLESS));
4021 return 1;
4022}
4023
4024void drbd_go_diskless(struct drbd_conf *mdev)
4025{
4026 D_ASSERT(mdev->state.disk == D_FAILED);
4027 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
9d282875 4028 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
4029}
4030
4031/**
4032 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4033 * @mdev: DRBD device.
4034 * @io_fn: IO callback to be called when bitmap IO is possible
4035 * @done: callback to be called after the bitmap IO was performed
4036 * @why: Descriptive text of the reason for doing the IO
4037 *
4038 * While IO on the bitmap happens, application IO is frozen, which ensures
4039 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4040 * called from worker context. It MUST NOT be used while a previous such
4041 * work is still pending!
4042 */
4043void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4044 int (*io_fn)(struct drbd_conf *),
4045 void (*done)(struct drbd_conf *, int),
20ceb2b2 4046 char *why, enum bm_flag flags)
4047{
4048 D_ASSERT(current == mdev->worker.task);
4049
4050 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4051 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4052 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4053 if (mdev->bm_io_work.why)
4054 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4055 why, mdev->bm_io_work.why);
4056
4057 mdev->bm_io_work.io_fn = io_fn;
4058 mdev->bm_io_work.done = done;
4059 mdev->bm_io_work.why = why;
20ceb2b2 4060 mdev->bm_io_work.flags = flags;
b411b363 4061
22afd7ee 4062 spin_lock_irq(&mdev->req_lock);
4063 set_bit(BITMAP_IO, &mdev->flags);
4064 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 4065 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
b411b363 4066 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
b411b363 4067 }
22afd7ee 4068 spin_unlock_irq(&mdev->req_lock);
4069}
4070
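/*
 * Usage sketch, for illustration only: queueing a full-bitmap write
 * from worker context, using drbd_bmio_set_n_write() defined above.
 * The reason string is made up, no done callback is passed, and the
 * bm_flag value shown is just one plausible choice.
 */
static __maybe_unused void drbd_example_queue_bm_write(struct drbd_conf *mdev)
{
	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
			     "example: set all bits", BM_LOCKED_SET_ALLOWED);
}
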
4071/**
4072 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4073 * @mdev: DRBD device.
4074 * @io_fn: IO callback to be called when bitmap IO is possible
4075 * @why: Descriptive text of the reason for doing the IO
4076 *
4077 * Freezes application IO while the actual IO operation runs. This
4078 * function MUST NOT be called from worker context.
4079 */
4080int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4081 char *why, enum bm_flag flags)
4082{
4083 int rv;
4084
4085 D_ASSERT(current != mdev->worker.task);
4086
4087 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4088 drbd_suspend_io(mdev);
b411b363 4089
20ceb2b2 4090 drbd_bm_lock(mdev, why, flags);
4091 rv = io_fn(mdev);
4092 drbd_bm_unlock(mdev);
4093
4094 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4095 drbd_resume_io(mdev);
4096
4097 return rv;
4098}
4099
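/*
 * Usage sketch, for illustration only: synchronous bitmap IO from a
 * non-worker context, using drbd_bmio_clear_n_write() defined above.
 * Reason string and flag choice are illustrative.
 */
static __maybe_unused int drbd_example_clear_bm(struct drbd_conf *mdev)
{
	return drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
			      "example: clear all bits", BM_LOCKED_SET_ALLOWED);
}
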
4100void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4101{
4102 if ((mdev->ldev->md.flags & flag) != flag) {
4103 drbd_md_mark_dirty(mdev);
4104 mdev->ldev->md.flags |= flag;
4105 }
4106}
4107
4108void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4109{
4110 if ((mdev->ldev->md.flags & flag) != 0) {
4111 drbd_md_mark_dirty(mdev);
4112 mdev->ldev->md.flags &= ~flag;
4113 }
4114}
4115int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4116{
4117 return (bdev->md.flags & flag) != 0;
4118}
4119
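/*
 * Usage sketch, for illustration only: typical use of the MDF_* flag
 * helpers above.  MDF_FULL_SYNC also appears earlier in this file; the
 * helper below is hypothetical and assumes the caller holds a local
 * reference (get_ldev) on mdev->ldev.
 */
static __maybe_unused void drbd_example_note_full_sync(struct drbd_conf *mdev)
{
	if (!drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC))
		drbd_md_set_flag(mdev, MDF_FULL_SYNC);	/* marks md dirty */
}
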
4120static void md_sync_timer_fn(unsigned long data)
4121{
4122 struct drbd_conf *mdev = (struct drbd_conf *) data;
4123
4124 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4125}
4126
4127static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4128{
4129 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
4130#ifdef DEBUG
4131 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4132 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4133#endif
b411b363 4134 drbd_md_sync(mdev);
4135 return 1;
4136}
4137
4138#ifdef CONFIG_DRBD_FAULT_INJECTION
4139/* Fault insertion support including random number generator shamelessly
4140 * stolen from kernel/rcutorture.c */
4141struct fault_random_state {
4142 unsigned long state;
4143 unsigned long count;
4144};
4145
4146#define FAULT_RANDOM_MULT 39916801 /* prime */
4147#define FAULT_RANDOM_ADD 479001701 /* prime */
4148#define FAULT_RANDOM_REFRESH 10000
4149
4150/*
4151 * Crude but fast random-number generator. Uses a linear congruential
4152 * generator, with occasional help from get_random_bytes().
4153 */
4154static unsigned long
4155_drbd_fault_random(struct fault_random_state *rsp)
4156{
4157 long refresh;
4158
49829ea7 4159 if (!rsp->count--) {
4160 get_random_bytes(&refresh, sizeof(refresh));
4161 rsp->state += refresh;
4162 rsp->count = FAULT_RANDOM_REFRESH;
4163 }
4164 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4165 return swahw32(rsp->state);
4166}
4167
4168static char *
4169_drbd_fault_str(unsigned int type) {
4170 static char *_faults[] = {
4171 [DRBD_FAULT_MD_WR] = "Meta-data write",
4172 [DRBD_FAULT_MD_RD] = "Meta-data read",
4173 [DRBD_FAULT_RS_WR] = "Resync write",
4174 [DRBD_FAULT_RS_RD] = "Resync read",
4175 [DRBD_FAULT_DT_WR] = "Data write",
4176 [DRBD_FAULT_DT_RD] = "Data read",
4177 [DRBD_FAULT_DT_RA] = "Data read ahead",
4178 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
4179 [DRBD_FAULT_AL_EE] = "EE allocation",
4180 [DRBD_FAULT_RECEIVE] = "receive data corruption",
4181 };
4182
4183 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4184}
4185
4186unsigned int
4187_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4188{
4189 static struct fault_random_state rrs = {0, 0};
4190
4191 unsigned int ret = (
4192 (fault_devs == 0 ||
4193 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4194 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4195
4196 if (ret) {
4197 fault_count++;
4198
7383506c 4199 if (__ratelimit(&drbd_ratelimit_state))
4200 dev_warn(DEV, "***Simulating %s failure\n",
4201 _drbd_fault_str(type));
4202 }
4203
4204 return ret;
4205}
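
/*
 * Usage sketch, for illustration only: how an IO submission path might
 * consult the fault injector.  The wrapper below is hypothetical; the
 * real call sites live elsewhere in the driver.
 */
static __maybe_unused int drbd_example_should_fail(struct drbd_conf *mdev)
{
	/* only roll the dice at all if fault injection is enabled */
	return fault_rate && _drbd_insert_fault(mdev, DRBD_FAULT_MD_WR);
}
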
4206#endif
4207
4208const char *drbd_buildtag(void)
4209{
4210 /* DRBD built from external sources has a reference here to the
4211 git hash of the source code. */
4212
4213 static char buildtag[38] = "\0uilt-in";
4214
4215 if (buildtag[0] == 0) {
4216#ifdef CONFIG_MODULES
4217 if (THIS_MODULE != NULL)
4218 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4219 else
4220#endif
4221 buildtag[0] = 'b';
4222 }
4223
4224 return buildtag;
4225}
4226
4227module_init(drbd_init)
4228module_exit(drbd_cleanup)
4229
4230EXPORT_SYMBOL(drbd_conn_str);
4231EXPORT_SYMBOL(drbd_role_str);
4232EXPORT_SYMBOL(drbd_disk_str);
4233EXPORT_SYMBOL(drbd_set_st_err_str);