/*
   drbd_worker.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

*/

#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>

#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"

static int make_ov_request(struct drbd_device *, int);
static int make_resync_request(struct drbd_device *, int);

/* endio handlers:
 *   drbd_md_io_complete (defined here)
 *   drbd_request_endio (defined here)
 *   drbd_peer_request_endio (defined here)
 *   bm_async_io_complete (defined in drbd_bitmap.c)
 *
 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 *
 */


/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the resync after dependencies, we grab a write lock, because
   we need stable states on all devices for that.  */
rwlock_t global_state_lock;

/* used for synchronous meta data and bitmap IO
 * submitted by drbd_md_sync_page_io()
 */
void drbd_md_io_complete(struct bio *bio, int error)
{
	struct drbd_md_io *md_io;
	struct drbd_device *device;

	md_io = (struct drbd_md_io *)bio->bi_private;
	device = container_of(md_io, struct drbd_device, md_io);

	md_io->error = error;

	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
	 * to timeout on the lower level device, and eventually detach from it.
	 * If this io completion runs after that timeout expired, this
	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
	 * During normal operation, this only puts that extra reference
	 * down to 1 again.
	 * Make sure we first drop the reference, and only then signal
	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
	 * next drbd_md_sync_page_io(), that we trigger the
	 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
	 */
	drbd_md_put_buffer(device);
	md_io->done = 1;
	wake_up(&device->misc_wait);
	bio_put(bio);
	if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
		put_ldev(device);
}

/* reads on behalf of the partner,
 * "submitted" by the receiver
 */
static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;

	spin_lock_irqsave(&device->resource->req_lock, flags);
	device->read_cnt += peer_req->i.size >> 9;
	list_del(&peer_req->w.list);
	if (list_empty(&device->read_ee))
		wake_up(&device->ee_wait);
	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
		__drbd_chk_io_error(device, DRBD_READ_ERROR);
	spin_unlock_irqrestore(&device->resource->req_lock, flags);

	drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
	put_ldev(device);
}

/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver, final stage.  */
void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	struct drbd_interval i;
	int do_wake;
	u64 block_id;
	int do_al_complete_io;

	/* after we moved peer_req to done_ee,
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
	i = peer_req->i;
	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
	block_id = peer_req->block_id;

	spin_lock_irqsave(&device->resource->req_lock, flags);
	device->writ_cnt += peer_req->i.size >> 9;
	list_move_tail(&peer_req->w.list, &device->done_ee);

	/*
	 * Do not remove from the write_requests tree here: we did not send the
	 * Ack yet and did not wake possibly waiting conflicting requests.
	 * Removed from the tree from "drbd_process_done_ee" within the
	 * appropriate dw.cb (e_end_block/e_end_resync_block) or from
	 * _drbd_clear_done_ee.
	 */

	do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);

	/* FIXME do we want to detach for failed REQ_DISCARD?
	 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
	if (peer_req->flags & EE_WAS_ERROR)
		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
	spin_unlock_irqrestore(&device->resource->req_lock, flags);

	if (block_id == ID_SYNCER)
		drbd_rs_complete_io(device, i.sector);

	if (do_wake)
		wake_up(&device->ee_wait);

	if (do_al_complete_io)
		drbd_al_complete_io(device, &i);

	wake_asender(peer_device->connection);
	put_ldev(device);
}

/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 */
void drbd_peer_request_endio(struct bio *bio, int error)
{
	struct drbd_peer_request *peer_req = bio->bi_private;
	struct drbd_device *device = peer_req->peer_device->device;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);
	int is_write = bio_data_dir(bio) == WRITE;
	int is_discard = !!(bio->bi_rw & REQ_DISCARD);

	if (error && __ratelimit(&drbd_ratelimit_state))
		drbd_warn(device, "%s: error=%d s=%llus\n",
				is_write ? (is_discard ? "discard" : "write")
					 : "read", error,
				(unsigned long long)peer_req->i.sector);
	if (!error && !uptodate) {
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_warn(device, "%s: setting error to -EIO s=%llus\n",
					is_write ? "write" : "read",
					(unsigned long long)peer_req->i.sector);
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	if (error)
		set_bit(__EE_WAS_ERROR, &peer_req->flags);

	bio_put(bio); /* no need for the bio anymore */
	if (atomic_dec_and_test(&peer_req->pending_bios)) {
		if (is_write)
			drbd_endio_write_sec_final(peer_req);
		else
			drbd_endio_read_sec_final(peer_req);
	}
}

/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 */
void drbd_request_endio(struct bio *bio, int error)
{
	unsigned long flags;
	struct drbd_request *req = bio->bi_private;
	struct drbd_device *device = req->device;
	struct bio_and_error m;
	enum drbd_req_event what;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	if (!error && !uptodate) {
		drbd_warn(device, "p %s: setting error to -EIO\n",
			 bio_data_dir(bio) == WRITE ? "write" : "read");
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}


	/* If this request was aborted locally before,
	 * but now was completed "successfully",
	 * chances are that this caused arbitrary data corruption.
	 *
	 * "aborting" requests, or force-detaching the disk, is intended for
	 * completely blocked/hung local backing devices which do no longer
	 * complete requests at all, not even do error completions.  In this
	 * situation, usually a hard-reset and failover is the only way out.
	 *
	 * By "aborting", basically faking a local error-completion,
	 * we allow for a more graceful switchover by cleanly migrating services.
	 * Still the affected node has to be rebooted "soon".
	 *
	 * By completing these requests, we allow the upper layers to re-use
	 * the associated data pages.
	 *
	 * If later the local backing device "recovers", and now DMAs some data
	 * from disk into the original request pages, in the best case it will
	 * just put random data into unused pages; but typically it will corrupt
	 * meanwhile completely unrelated data, causing all sorts of damage.
	 *
	 * Which means delayed successful completion,
	 * especially for READ requests,
	 * is a reason to panic().
	 *
	 * We assume that a delayed *error* completion is OK,
	 * though we still will complain noisily about it.
	 */
	if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");

		if (!error)
			panic("possible random memory corruption caused by delayed completion of aborted local request\n");
	}

	/* to avoid recursion in __req_mod */
	if (unlikely(error)) {
		if (bio->bi_rw & REQ_DISCARD)
			what = (error == -EOPNOTSUPP)
				? DISCARD_COMPLETED_NOTSUPP
				: DISCARD_COMPLETED_WITH_ERROR;
		else
			what = (bio_data_dir(bio) == WRITE)
			? WRITE_COMPLETED_WITH_ERROR
			: (bio_rw(bio) == READ)
			  ? READ_COMPLETED_WITH_ERROR
			  : READ_AHEAD_COMPLETED_WITH_ERROR;
	} else
		what = COMPLETED_OK;

	bio_put(req->private_bio);
	req->private_bio = ERR_PTR(error);

	/* not req_mod(), we need irqsave here! */
	spin_lock_irqsave(&device->resource->req_lock, flags);
	__req_mod(req, what, &m);
	spin_unlock_irqrestore(&device->resource->req_lock, flags);
	put_ldev(device);

	if (m.bio)
		complete_master_bio(device, &m);
}

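/* Compute a digest over all pages attached to a peer request.
 * All but the last page of the page chain are hashed in full;
 * the last page may be only partially used (i.size modulo PAGE_SIZE). */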
void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct page *page = peer_req->pages;
	struct page *tmp;
	unsigned len;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	while ((tmp = page_chain_next(page))) {
		/* all but the last page will be fully used */
		sg_set_page(&sg, page, PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		page = tmp;
	}
	/* and now the last, possibly only partially used page */
	len = peer_req->i.size & (PAGE_SIZE - 1);
	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
	crypto_hash_update(&desc, &sg, sg.length);
	crypto_hash_final(&desc, digest);
}

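/* Same as drbd_csum_ee(), but for a bio: hash each bio_vec segment in turn. */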
void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct bio_vec bvec;
	struct bvec_iter iter;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	bio_for_each_segment(bvec, bio, iter) {
		sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
		crypto_hash_update(&desc, &sg, sg.length);
	}
	crypto_hash_final(&desc, digest);
}

/* MAYBE merge common code with w_e_end_ov_req */
static int w_e_send_csum(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	int digest_size;
	void *digest;
	int err = 0;

	if (unlikely(cancel))
		goto out;

	if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
		goto out;

	digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
	digest = kmalloc(digest_size, GFP_NOIO);
	if (digest) {
		sector_t sector = peer_req->i.sector;
		unsigned int size = peer_req->i.size;
		drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
		/* Free peer_req and pages before send.
		 * In case we block on congestion, we could otherwise run into
		 * some distributed deadlock, if the other side blocks on
		 * congestion as well, because our receiver blocks in
		 * drbd_alloc_pages due to pp_in_use > max_buffers. */
		drbd_free_peer_req(device, peer_req);
		peer_req = NULL;
		inc_rs_pending(device);
		err = drbd_send_drequest_csum(peer_device, sector, size,
					      digest, digest_size,
					      P_CSUM_RS_REQUEST);
		kfree(digest);
	} else {
		drbd_err(device, "kmalloc() of digest failed.\n");
		err = -ENOMEM;
	}

out:
	if (peer_req)
		drbd_free_peer_req(device, peer_req);

	if (unlikely(err))
		drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
	return err;
}

#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

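/* For checksum-based resync: allocate a peer request for the given area,
 * read it from the local disk, and queue w_e_send_csum() to send the
 * checksum to the peer.  Returns 0 on success, -EAGAIN if the request
 * should be retried later, -EIO if the local disk is gone. */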
static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
{
	struct drbd_device *device = peer_device->device;
	struct drbd_peer_request *peer_req;

	if (!get_ldev(device))
		return -EIO;

	if (drbd_rs_should_slow_down(device, sector))
		goto defer;

	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
				       size, true /* has real payload */, GFP_TRY);
	if (!peer_req)
		goto defer;

	peer_req->w.cb = w_e_send_csum;
	spin_lock_irq(&device->resource->req_lock);
	list_add(&peer_req->w.list, &device->read_ee);
	spin_unlock_irq(&device->resource->req_lock);

	atomic_add(size >> 9, &device->rs_sect_ev);
	if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
		return 0;

	/* If it failed because of ENOMEM, retry should help.  If it failed
	 * because bio_add_page failed (probably broken lower level driver),
	 * retry may or may not help.
	 * If it does not, you may need to force disconnect. */
	spin_lock_irq(&device->resource->req_lock);
	list_del(&peer_req->w.list);
	spin_unlock_irq(&device->resource->req_lock);

	drbd_free_peer_req(device, peer_req);
defer:
	put_ldev(device);
	return -EAGAIN;
}

int w_resync_timer(struct drbd_work *w, int cancel)
{
	struct drbd_device *device =
		container_of(w, struct drbd_device, resync_work);

	switch (device->state.conn) {
	case C_VERIFY_S:
		make_ov_request(device, cancel);
		break;
	case C_SYNC_TARGET:
		make_resync_request(device, cancel);
		break;
	}

	return 0;
}

void resync_timer_fn(unsigned long data)
{
	struct drbd_device *device = (struct drbd_device *) data;

	if (list_empty(&device->resync_work.list))
		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
				&device->resync_work);
}

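/* Small helpers for the fifo_buffer that holds the resync controller's
 * "plan ahead" values (see drbd_rs_controller() below). */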
static void fifo_set(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] = value;
}

static int fifo_push(struct fifo_buffer *fb, int value)
{
	int ov;

	ov = fb->values[fb->head_index];
	fb->values[fb->head_index++] = value;

	if (fb->head_index >= fb->size)
		fb->head_index = 0;

	return ov;
}

static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] += value;
}

struct fifo_buffer *fifo_alloc(int fifo_size)
{
	struct fifo_buffer *fb;

	fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
	if (!fb)
		return NULL;

	fb->head_index = 0;
	fb->size = fifo_size;
	fb->total = 0;

	return fb;
}

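/* Dynamic resync speed controller: decide how many sectors to request
 * during the next SLEEP_TIME interval.  "sect_in" is the number of sectors
 * that came in since the last turn; the target is either c_fill_target
 * (sectors in flight) or derived from c_delay_target, the result is capped
 * by c_max_rate, and the correction is spread over "steps" intervals via
 * the plan fifo. */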
static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
{
	struct disk_conf *dc;
	unsigned int want;	/* The number of sectors we want in the proxy */
	int req_sect;		/* Number of sectors to request in this turn */
	int correction;		/* Number of sectors more we need in the proxy */
	int cps;		/* correction per invocation of drbd_rs_controller() */
	int steps;		/* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;
	struct fifo_buffer *plan;

	dc = rcu_dereference(device->ldev->disk_conf);
	plan = rcu_dereference(device->rs_plan_s);

	steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */

	if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
		want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
	} else { /* normal path */
		want = dc->c_fill_target ? dc->c_fill_target :
			sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
	}

	correction = want - device->rs_in_flight - plan->total;

	/* Plan ahead */
	cps = correction / steps;
	fifo_add_val(plan, cps);
	plan->total += cps * steps;

	/* What we do in this step */
	curr_corr = fifo_push(plan, 0);
	plan->total -= curr_corr;

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;

	max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, device->rs_in_flight, want, correction,
		 steps, cps, device->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}

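/* Translate the controller output (or the static resync_rate) into the
 * number of BM_BLOCK_SIZE requests to issue this turn, and never keep
 * more than max-buffers/2 requests in flight. */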
static int drbd_rs_number_requests(struct drbd_device *device)
{
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	int number, mxb;

	sect_in = atomic_xchg(&device->rs_sect_in, 0);
	device->rs_in_flight -= sect_in;

	rcu_read_lock();
	mxb = drbd_get_max_buffers(device) / 2;
	if (rcu_dereference(device->rs_plan_s)->size) {
		number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
		device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
		device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
		number = SLEEP_TIME * device->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
	}
	rcu_read_unlock();

	/* Don't have more than "max-buffers"/2 in-flight.
	 * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
	 * potentially causing a distributed deadlock on congestion during
	 * online-verify or (checksum-based) resync, if max-buffers,
	 * socket buffer sizes and resync rate settings are mis-configured. */
	if (mxb - device->rs_in_flight < number)
		number = mxb - device->rs_in_flight;

	return number;
}

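/* Scan the resync bitmap starting at bm_resync_fo and send up to "number"
 * resync requests (P_RS_DATA_REQUEST, or checksum requests via
 * read_for_csum() if a csums-alg was agreed on), merging adjacent dirty
 * bits into larger, aligned requests where possible.  Re-arms the resync
 * timer to continue later unless the bitmap is exhausted. */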
static int make_resync_request(struct drbd_device *device, int cancel)
{
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(device->this_bdev);
	int max_bio_size;
	int number, rollback_i, size;
	int align, queued, sndbuf;
	int i = 0;

	if (unlikely(cancel))
		return 0;

	if (device->rs_total == 0) {
		/* empty resync? */
		drbd_resync_finished(device);
		return 0;
	}

	if (!get_ldev(device)) {
		/* Since we only need to access device->rsync a
		   get_ldev_if_state(device,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		drbd_err(device, "Disk broke down during resync!\n");
		return 0;
	}

	max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
	number = drbd_rs_number_requests(device);
	if (number <= 0)
		goto requeue;

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests, when half of the send buffer is filled */
		mutex_lock(&first_peer_device(device)->connection->data.mutex);
		if (first_peer_device(device)->connection->data.socket) {
			queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued;
			sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf;
		} else {
			queued = 1;
			sndbuf = 0;
		}
		mutex_unlock(&first_peer_device(device)->connection->data.mutex);
		if (queued > sndbuf / 2)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit = drbd_bm_find_next(device, device->bm_resync_fo);

		if (bit == DRBD_END_OF_BITMAP) {
			device->bm_resync_fo = drbd_bm_bits(device);
			put_ldev(device);
			return 0;
		}

		sector = BM_BIT_TO_SECT(bit);

		if (drbd_rs_should_slow_down(device, sector) ||
		    drbd_try_rs_begin_io(device, sector)) {
			device->bm_resync_fo = bit;
			goto requeue;
		}
		device->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
			drbd_rs_complete_io(device, sector);
			goto next_sector;
		}

#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
		rollback_i = i;
		while (i < number) {
			if (size + BM_BLOCK_SIZE > max_bio_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(device, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			device->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;
		if (first_peer_device(device)->connection->agreed_pro_version >= 89 &&
		    first_peer_device(device)->connection->csums_tfm) {
			switch (read_for_csum(first_peer_device(device), sector, size)) {
			case -EIO: /* Disk failure */
				put_ldev(device);
				return -EIO;
			case -EAGAIN: /* allocation failed, or ldev busy */
				drbd_rs_complete_io(device, sector);
				device->bm_resync_fo = BM_SECT_TO_BIT(sector);
				i = rollback_i;
				goto requeue;
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
			}
		} else {
			int err;

			inc_rs_pending(device);
			err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST,
						 sector, size, ID_SYNCER);
			if (err) {
				drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(device);
				put_ldev(device);
				return err;
			}
		}
	}

	if (device->bm_resync_fo >= drbd_bm_bits(device)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		put_ldev(device);
		return 0;
	}

 requeue:
	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(device);
	return 0;
}

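/* Online-verify counterpart of make_resync_request(): send up to "number"
 * P_OV_REQUEST packets starting at ov_position, and re-arm the resync
 * timer unless the configured stop sector was reached. */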
static int make_ov_request(struct drbd_device *device, int cancel)
{
	int number, i, size;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(device->this_bdev);
	bool stop_sector_reached = false;

	if (unlikely(cancel))
		return 1;

	number = drbd_rs_number_requests(device);

	sector = device->ov_position;
	for (i = 0; i < number; i++) {
		if (sector >= capacity)
			return 1;

		/* We check for "finished" only in the reply path:
		 * w_e_end_ov_reply().
		 * We need to send at least one request out. */
		stop_sector_reached = i > 0
			&& verify_can_do_stop_sector(device)
			&& sector >= device->ov_stop_sector;
		if (stop_sector_reached)
			break;

		size = BM_BLOCK_SIZE;

		if (drbd_rs_should_slow_down(device, sector) ||
		    drbd_try_rs_begin_io(device, sector)) {
			device->ov_position = sector;
			goto requeue;
		}

		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		inc_rs_pending(device);
		if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
			dec_rs_pending(device);
			return 0;
		}
		sector += BM_SECT_PER_BIT;
	}
	device->ov_position = sector;

 requeue:
	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	if (i == 0 || !stop_sector_reached)
		mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
	return 1;
}

int w_ov_finished(struct drbd_work *w, int cancel)
{
	struct drbd_device_work *dw =
		container_of(w, struct drbd_device_work, w);
	struct drbd_device *device = dw->device;
	kfree(dw);
	ov_out_of_sync_print(device);
	drbd_resync_finished(device);

	return 0;
}

static int w_resync_finished(struct drbd_work *w, int cancel)
{
	struct drbd_device_work *dw =
		container_of(w, struct drbd_device_work, w);
	struct drbd_device *device = dw->device;
	kfree(dw);

	drbd_resync_finished(device);

	return 0;
}

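/* Block until the peer acknowledged a ping, or the connection was lost. */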
static void ping_peer(struct drbd_device *device)
{
	struct drbd_connection *connection = first_peer_device(device)->connection;

	clear_bit(GOT_PING_ACK, &connection->flags);
	request_ping(connection);
	wait_event(connection->ping_wait,
		   test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
}

int drbd_resync_finished(struct drbd_device *device)
{
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_device_work *dw;
	char *khelper_cmd = NULL;
	int verify_done = 0;

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, then the entries in the
	 * resync LRU would be wrong. */
	if (drbd_rs_del_all(device)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * is not finished by now). Retry in 100ms. */

		schedule_timeout_interruptible(HZ / 10);
		dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
		if (dw) {
			dw->w.cb = w_resync_finished;
			dw->device = device;
			drbd_queue_work(&first_peer_device(device)->connection->sender_work,
					&dw->w);
			return 1;
		}
		drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
	}

	dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
	if (dt <= 0)
		dt = 1;

	db = device->rs_total;
	/* adjust for verify start and stop sectors, respective reached position */
	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
		db -= device->ov_left;

	dbdt = Bit2KB(db/dt);
	device->rs_paused /= HZ;

	if (!get_ldev(device))
		goto out;

	ping_peer(device);

	spin_lock_irq(&device->resource->req_lock);
	os = drbd_read_state(device);

	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);

	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
	     verify_done ? "Online verify" : "Resync",
	     dt + device->rs_paused, device->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(device);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT(device, (n_oos - device->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

		if (first_peer_device(device)->connection->csums_tfm && device->rs_total) {
			const unsigned long s = device->rs_same_csum;
			const unsigned long t = device->rs_total;
			const int ratio =
				(t == 0)     ? 0 :
				(t < 100000) ? ((s*100)/t) : (s/(t/100));
			drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(device->rs_same_csum),
			     Bit2KB(device->rs_total - device->rs_same_csum),
			     Bit2KB(device->rs_total));
		}
	}

	if (device->rs_failed) {
		drbd_info(device, " %lu failed blocks\n", device->rs_failed);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (device->p_uuid) {
				int i;
				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
					_drbd_uuid_set(device, i, device->p_uuid[i]);
				drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
			} else {
				drbd_err(device, "device->p_uuid is NULL! BUG\n");
			}
		}

		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
			/* for verify runs, we don't update uuids here,
			 * so there would be nothing to report. */
			drbd_uuid_set_bm(device, 0UL);
			drbd_print_uuids(device, "updated UUIDs");
			if (device->p_uuid) {
				/* Now the two UUID sets are equal, update what we
				 * know of the peer. */
				int i;
				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
					device->p_uuid[i] = device->ldev->md.uuid[i];
			}
		}
	}

	_drbd_set_state(device, ns, CS_VERBOSE, NULL);
out_unlock:
	spin_unlock_irq(&device->resource->req_lock);
	put_ldev(device);
out:
	device->rs_total = 0;
	device->rs_failed = 0;
	device->rs_paused = 0;

	/* reset start sector, if we reached end of device */
	if (verify_done && device->ov_left == 0)
		device->ov_start_sector = 0;

	drbd_md_sync(device);

	if (khelper_cmd)
		drbd_khelper(device, khelper_cmd);

	return 1;
}

/* helper */
static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
	if (drbd_peer_req_has_active_page(peer_req)) {
		/* This might happen if sendpage() has not finished */
		int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
		atomic_add(i, &device->pp_in_use_by_net);
		atomic_sub(i, &device->pp_in_use);
		spin_lock_irq(&device->resource->req_lock);
		list_add_tail(&peer_req->w.list, &device->net_ee);
		spin_unlock_irq(&device->resource->req_lock);
		wake_up(&drbd_pp_wait);
	} else
		drbd_free_peer_req(device, peer_req);
}

/**
 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 * @device:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_data_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	int err;

	if (unlikely(cancel)) {
		drbd_free_peer_req(device, peer_req);
		dec_unacked(device);
		return 0;
	}

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_err(device, "Sending NegDReply. sector=%llus.\n",
			    (unsigned long long)peer_req->i.sector);

		err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
	}

	dec_unacked(device);

	move_to_net_ee_or_free(device, peer_req);

	if (unlikely(err))
		drbd_err(device, "drbd_send_block() failed\n");
	return err;
}

/**
 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	int err;

	if (unlikely(cancel)) {
		drbd_free_peer_req(device, peer_req);
		dec_unacked(device);
		return 0;
	}

	if (get_ldev_if_state(device, D_FAILED)) {
		drbd_rs_complete_io(device, peer_req->i.sector);
		put_ldev(device);
	}

	if (device->state.conn == C_AHEAD) {
		err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		if (likely(device->state.pdsk >= D_INCONSISTENT)) {
			inc_rs_pending(device);
			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
		} else {
			if (__ratelimit(&drbd_ratelimit_state))
				drbd_err(device, "Not sending RSDataReply, "
				    "partner DISKLESS!\n");
			err = 0;
		}
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
			    (unsigned long long)peer_req->i.sector);

		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);

		/* update resync data with failure */
		drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
	}

	dec_unacked(device);

	move_to_net_ee_or_free(device, peer_req);

	if (unlikely(err))
		drbd_err(device, "drbd_send_block() failed\n");
	return err;
}

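/* Worker callback for checksum-based resync: compare the peer's digest
 * with the digest of our local block.  If they match, acknowledge with
 * P_RS_IS_IN_SYNC; otherwise send the full block as P_RS_DATA_REPLY. */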
int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	struct digest_info *di;
	int digest_size;
	void *digest = NULL;
	int err, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_peer_req(device, peer_req);
		dec_unacked(device);
		return 0;
	}

	if (get_ldev(device)) {
		drbd_rs_complete_io(device, peer_req->i.sector);
		put_ldev(device);
	}

	di = peer_req->digest;

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		/* quick hack to try to avoid a race against reconfiguration.
		 * a real fix would be much more involved,
		 * introducing more locking mechanisms */
		if (peer_device->connection->csums_tfm) {
			digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
			D_ASSERT(device, digest_size == di->digest_size);
			digest = kmalloc(digest_size, GFP_NOIO);
		}
		if (digest) {
			drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}

		if (eq) {
			drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
			/* rs_same_csums unit is BM_BLOCK_SIZE */
			device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
			err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
		} else {
			inc_rs_pending(device);
			peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
			peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
			kfree(di);
			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
		}
	} else {
		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(device);
	move_to_net_ee_or_free(device, peer_req);

	if (unlikely(err))
		drbd_err(device, "drbd_send_block/ack() failed\n");
	return err;
}

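/* Online verify: compute the digest of the local block and send it to the
 * peer in a P_OV_REPLY packet. */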
int w_e_end_ov_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
	int digest_size;
	void *digest;
	int err = 0;

	if (unlikely(cancel))
		goto out;

	digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
	digest = kmalloc(digest_size, GFP_NOIO);
	if (!digest) {
		err = 1;	/* terminate the connection in case the allocation failed */
		goto out;
	}

	if (likely(!(peer_req->flags & EE_WAS_ERROR)))
		drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
	else
		memset(digest, 0, digest_size);

	/* Free e and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
	drbd_free_peer_req(device, peer_req);
	peer_req = NULL;
	inc_rs_pending(device);
	err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
	if (err)
		dec_rs_pending(device);
	kfree(digest);

out:
	if (peer_req)
		drbd_free_peer_req(device, peer_req);
	dec_unacked(device);
	return err;
}

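/* Record an out-of-sync area found by online verify; adjacent areas are
 * merged so that ov_out_of_sync_print() can report one contiguous range. */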
void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
{
	if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
		device->ov_last_oos_size += size>>9;
	} else {
		device->ov_last_oos_start = sector;
		device->ov_last_oos_size = size>>9;
	}
	drbd_set_out_of_sync(device, sector, size);
}

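/* Online verify: compare the peer's digest for this block with the locally
 * computed one, report the result via P_OV_RESULT, and finish the verify
 * run once the last block (or the configured stop sector) was processed. */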
int w_e_end_ov_reply(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	struct digest_info *di;
	void *digest;
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
	int digest_size;
	int err, eq = 0;
	bool stop_sector_reached = false;

	if (unlikely(cancel)) {
		drbd_free_peer_req(device, peer_req);
		dec_unacked(device);
		return 0;
	}

	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
	 * the resync lru has been cleaned up already */
	if (get_ldev(device)) {
		drbd_rs_complete_io(device, peer_req->i.sector);
		put_ldev(device);
	}

	di = peer_req->digest;

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);

			D_ASSERT(device, digest_size == di->digest_size);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}
	}

	/* Free peer_req and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
	drbd_free_peer_req(device, peer_req);
	if (!eq)
		drbd_ov_out_of_sync_found(device, sector, size);
	else
		ov_out_of_sync_print(device);

	err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
			       eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);

	dec_unacked(device);

	--device->ov_left;

	/* let's advance progress step marks only for every other megabyte */
	if ((device->ov_left & 0x200) == 0x200)
		drbd_advance_rs_marks(device, device->ov_left);

	stop_sector_reached = verify_can_do_stop_sector(device) &&
		(sector + (size>>9)) >= device->ov_stop_sector;

	if (device->ov_left == 0 || stop_sector_reached) {
		ov_out_of_sync_print(device);
		drbd_resync_finished(device);
	}

	return err;
}

/* FIXME
 * We need to track the number of pending barrier acks,
 * and to be able to wait for them.
 * See also comment in drbd_adm_attach before drbd_suspend_io.
 */
static int drbd_send_barrier(struct drbd_connection *connection)
{
	struct p_barrier *p;
	struct drbd_socket *sock;

	sock = &connection->data;
	p = conn_prepare_command(connection, sock);
	if (!p)
		return -EIO;
	p->barrier = connection->send.current_epoch_nr;
	p->pad = 0;
	connection->send.current_epoch_writes = 0;

	return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
}

int w_send_write_hint(struct drbd_work *w, int cancel)
{
	struct drbd_device *device =
		container_of(w, struct drbd_device, unplug_work);
	struct drbd_socket *sock;

	if (cancel)
		return 0;
	sock = &first_peer_device(device)->connection->data;
	if (!drbd_prepare_command(first_peer_device(device), sock))
		return -EIO;
	return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0);
}

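/* Epoch/barrier bookkeeping for the sender: a P_BARRIER is sent whenever
 * the epoch number changes and the previous epoch actually carried writes. */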
static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
{
	if (!connection->send.seen_any_write_yet) {
		connection->send.seen_any_write_yet = true;
		connection->send.current_epoch_nr = epoch;
		connection->send.current_epoch_writes = 0;
	}
}

static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
{
	/* re-init if first write on this connection */
	if (!connection->send.seen_any_write_yet)
		return;
	if (connection->send.current_epoch_nr != epoch) {
		if (connection->send.current_epoch_writes)
			drbd_send_barrier(connection);
		connection->send.current_epoch_nr = epoch;
	}
}

int w_send_out_of_sync(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_device *device = req->device;
	struct drbd_connection *connection = first_peer_device(device)->connection;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}

	/* this time, no connection->send.current_epoch_writes++;
	 * If it was sent, it was the closing barrier for the last
	 * replicated epoch, before we went into AHEAD mode.
	 * No more barriers will be sent, until we leave AHEAD mode again. */
	maybe_send_barrier(connection, req->epoch);

	err = drbd_send_out_of_sync(first_peer_device(device), req);
	req_mod(req, OOS_HANDED_TO_NETWORK);

	return err;
}

/**
 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_send_dblock(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_device *device = req->device;
	struct drbd_connection *connection = first_peer_device(device)->connection;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}

	re_init_if_first_write(connection, req->epoch);
	maybe_send_barrier(connection, req->epoch);
	connection->send.current_epoch_writes++;

	err = drbd_send_dblock(first_peer_device(device), req);
	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);

	return err;
}

/**
 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_send_read_req(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_device *device = req->device;
	struct drbd_connection *connection = first_peer_device(device)->connection;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}

	/* Even read requests may close a write epoch,
	 * if there was any yet. */
	maybe_send_barrier(connection, req->epoch);

	err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size,
				 (unsigned long)req);

	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);

	return err;
}

int w_restart_disk_io(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_device *device = req->device;

	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
		drbd_al_begin_io(device, &req->i, false);

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = device->ldev->backing_bdev;
	generic_make_request(req->private_bio);

	return 0;
}

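/* Check the resync-after dependency chain: this device may resync only if
 * no device it (transitively) depends on is currently resyncing or has its
 * resync suspended. */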
b30ab791 1445static int _drbd_may_sync_now(struct drbd_device *device)
b411b363 1446{
b30ab791 1447 struct drbd_device *odev = device;
95f8efd0 1448 int resync_after;
b411b363
PR
1449
1450 while (1) {
a3f8f7dc 1451 if (!odev->ldev || odev->state.disk == D_DISKLESS)
438c8374 1452 return 1;
daeda1cc 1453 rcu_read_lock();
95f8efd0 1454 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
daeda1cc 1455 rcu_read_unlock();
95f8efd0 1456 if (resync_after == -1)
b411b363 1457 return 1;
b30ab791 1458 odev = minor_to_device(resync_after);
a3f8f7dc 1459 if (!odev)
841ce241 1460 return 1;
b411b363
PR
1461 if ((odev->state.conn >= C_SYNC_SOURCE &&
1462 odev->state.conn <= C_PAUSED_SYNC_T) ||
1463 odev->state.aftr_isp || odev->state.peer_isp ||
1464 odev->state.user_isp)
1465 return 0;
1466 }
1467}
1468
1469/**
1470 * _drbd_pause_after() - Pause resync on all devices that may not resync now
b30ab791 1471 * @device: DRBD device.
b411b363
PR
1472 *
1473 * Called from process context only (admin command and after_state_ch).
1474 */
b30ab791 1475static int _drbd_pause_after(struct drbd_device *device)
b411b363 1476{
54761697 1477 struct drbd_device *odev;
b411b363
PR
1478 int i, rv = 0;
1479
695d08fa 1480 rcu_read_lock();
05a10ec7 1481 idr_for_each_entry(&drbd_devices, odev, i) {
b411b363
PR
1482 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1483 continue;
1484 if (!_drbd_may_sync_now(odev))
1485 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
1486 != SS_NOTHING_TO_DO);
1487 }
695d08fa 1488 rcu_read_unlock();
b411b363
PR
1489
1490 return rv;
1491}
1492
1493/**
1494 * _drbd_resume_next() - Resume resync on all devices that may resync now
b30ab791 1495 * @device: DRBD device.
b411b363
PR
1496 *
1497 * Called from process context only (admin command and worker).
1498 */
b30ab791 1499static int _drbd_resume_next(struct drbd_device *device)
b411b363 1500{
54761697 1501 struct drbd_device *odev;
b411b363
PR
1502 int i, rv = 0;
1503
695d08fa 1504 rcu_read_lock();
05a10ec7 1505 idr_for_each_entry(&drbd_devices, odev, i) {
b411b363
PR
1506 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1507 continue;
1508 if (odev->state.aftr_isp) {
1509 if (_drbd_may_sync_now(odev))
1510 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
1511 CS_HARD, NULL)
1512 != SS_NOTHING_TO_DO);
1513 }
1514 }
695d08fa 1515 rcu_read_unlock();
b411b363
PR
1516 return rv;
1517}
1518
b30ab791 1519void resume_next_sg(struct drbd_device *device)
b411b363
PR
1520{
1521 write_lock_irq(&global_state_lock);
b30ab791 1522 _drbd_resume_next(device);
b411b363
PR
1523 write_unlock_irq(&global_state_lock);
1524}
1525
b30ab791 1526void suspend_other_sg(struct drbd_device *device)
b411b363
PR
1527{
1528 write_lock_irq(&global_state_lock);
b30ab791 1529 _drbd_pause_after(device);
b411b363
PR
1530 write_unlock_irq(&global_state_lock);
1531}
1532
dc97b708 1533/* caller must hold global_state_lock */
b30ab791 1534enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
b411b363 1535{
54761697 1536 struct drbd_device *odev;
95f8efd0 1537 int resync_after;
b411b363
PR
1538
1539 if (o_minor == -1)
1540 return NO_ERROR;
a3f8f7dc 1541 if (o_minor < -1 || o_minor > MINORMASK)
95f8efd0 1542 return ERR_RESYNC_AFTER;
b411b363
PR
1543
1544 /* check for loops */
b30ab791 1545 odev = minor_to_device(o_minor);
b411b363 1546 while (1) {
b30ab791 1547 if (odev == device)
95f8efd0 1548 return ERR_RESYNC_AFTER_CYCLE;
b411b363 1549
a3f8f7dc
LE
1550 /* You are free to depend on diskless, non-existing,
1551 * or not yet/no longer existing minors.
1552 * We only reject dependency loops.
1553 * We cannot follow the dependency chain beyond a detached or
1554 * missing minor.
1555 */
1556 if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
1557 return NO_ERROR;
1558
daeda1cc 1559 rcu_read_lock();
95f8efd0 1560 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
daeda1cc 1561 rcu_read_unlock();
b411b363 1562 /* dependency chain ends here, no cycles. */
95f8efd0 1563 if (resync_after == -1)
b411b363
PR
1564 return NO_ERROR;
1565
1566 /* follow the dependency chain */
b30ab791 1567 odev = minor_to_device(resync_after);
b411b363
PR
1568 }
1569}
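/*
 * Illustration only, not part of drbd_worker.c: a minimal, self-contained
 * userspace sketch of the loop check performed by drbd_resync_after_valid()
 * above.  The table below is hypothetical; it simply maps each minor to the
 * minor it wants to resync after (-1 meaning "no dependency"), the way the
 * per-device resync_after setting does.
 */
#include <stdio.h>

#define N_MINORS 8

/* hypothetical dependency table: minor i resyncs after toy_resync_after[i] */
static int toy_resync_after[N_MINORS] = { -1, 0, 1, -1, 3, 4, -1, -1 };

/* Return 1 if letting 'minor' depend on 'o_minor' would close a cycle. */
static int would_create_cycle(int minor, int o_minor)
{
	int cur = o_minor;

	while (cur != -1) {
		if (cur == minor)
			return 1;		/* the chain walked back to us */
		cur = toy_resync_after[cur];	/* follow the dependency chain */
	}
	return 0;				/* chain ended, no cycle */
}

int main(void)
{
	/* 2 -> 1 -> 0 already exists, so making 0 depend on 2 is a cycle. */
	printf("2 after 5: %s\n", would_create_cycle(2, 5) ? "cycle" : "ok");
	printf("0 after 2: %s\n", would_create_cycle(0, 2) ? "cycle" : "ok");
	return 0;
}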
1570
dc97b708 1571/* caller must hold global_state_lock */
b30ab791 1572void drbd_resync_after_changed(struct drbd_device *device)
b411b363
PR
1573{
1574 int changes;
b411b363 1575
dc97b708 1576 do {
b30ab791
AG
1577 changes = _drbd_pause_after(device);
1578 changes |= _drbd_resume_next(device);
dc97b708 1579 } while (changes);
b411b363
PR
1580}
1581
b30ab791 1582void drbd_rs_controller_reset(struct drbd_device *device)
9bd28d3c 1583{
813472ce
PR
1584 struct fifo_buffer *plan;
1585
b30ab791
AG
1586 atomic_set(&device->rs_sect_in, 0);
1587 atomic_set(&device->rs_sect_ev, 0);
1588 device->rs_in_flight = 0;
813472ce
PR
1589
1590 /* Updating the RCU protected object in place is necessary since
1591 this function gets called from atomic context.
1592 It is valid since all other updates also lead to a completely
1593 empty fifo. */
1594 rcu_read_lock();
b30ab791 1595 plan = rcu_dereference(device->rs_plan_s);
813472ce
PR
1596 plan->total = 0;
1597 fifo_set(plan, 0);
1598 rcu_read_unlock();
9bd28d3c
LE
1599}
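/*
 * Illustration only, not part of drbd_worker.c: a hypothetical userspace
 * model of the in-place reset done under rcu_read_lock() above.  The real
 * struct fifo_buffer and fifo_set() live elsewhere in DRBD; 'toy_fifo' and
 * 'toy_fifo_set' are stand-ins for this sketch.  The in-place write avoids
 * allocating a replacement object in atomic context, and per the comment
 * above it is tolerable because every other update also leaves the plan
 * completely empty.
 */
#include <stdio.h>

struct toy_fifo {
	unsigned int size;
	int total;		/* running total kept alongside the slots */
	int values[16];
};

/* Set every slot to 'value', analogous to clearing the resync plan. */
static void toy_fifo_set(struct toy_fifo *fb, int value)
{
	unsigned int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] = value;
}

int main(void)
{
	struct toy_fifo plan = { .size = 16, .total = 123 };

	plan.total = 0;		/* in-place reset, no reallocation */
	toy_fifo_set(&plan, 0);
	printf("total=%d first=%d\n", plan.total, plan.values[0]);
	return 0;
}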
1600
1f04af33
PR
1601void start_resync_timer_fn(unsigned long data)
1602{
b30ab791 1603 struct drbd_device *device = (struct drbd_device *) data;
1f04af33 1604
84b8c06b
AG
1605 drbd_queue_work(&first_peer_device(device)->connection->sender_work,
1606 &device->start_resync_work);
1f04af33
PR
1607}
1608
99920dc5 1609int w_start_resync(struct drbd_work *w, int cancel)
1f04af33 1610{
84b8c06b
AG
1611 struct drbd_device *device =
1612 container_of(w, struct drbd_device, start_resync_work);
00d56944 1613
b30ab791 1614 if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
d0180171 1615 drbd_warn(device, "w_start_resync later...\n");
b30ab791
AG
1616 device->start_resync_timer.expires = jiffies + HZ/10;
1617 add_timer(&device->start_resync_timer);
99920dc5 1618 return 0;
1f04af33
PR
1619 }
1620
b30ab791
AG
1621 drbd_start_resync(device, C_SYNC_SOURCE);
1622 clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
99920dc5 1623 return 0;
1f04af33
PR
1624}
1625
b411b363
PR
1626/**
1627 * drbd_start_resync() - Start the resync process
b30ab791 1628 * @device: DRBD device.
b411b363
PR
1629 * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
1630 *
1631 * This function might bring you directly into one of the
1632 * C_PAUSED_SYNC_* states.
1633 */
b30ab791 1634void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
b411b363
PR
1635{
1636 union drbd_state ns;
1637 int r;
1638
b30ab791 1639 if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
d0180171 1640 drbd_err(device, "Resync already running!\n");
b411b363
PR
1641 return;
1642 }
1643
b30ab791 1644 if (!test_bit(B_RS_H_DONE, &device->flags)) {
e64a3294
PR
1645 if (side == C_SYNC_TARGET) {
1646 /* Since application IO was locked out during C_WF_BITMAP_T and
1647 C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1648 we check whether we are allowed to make the data inconsistent. */
b30ab791 1649 r = drbd_khelper(device, "before-resync-target");
e64a3294
PR
1650 r = (r >> 8) & 0xff;
1651 if (r > 0) {
d0180171 1652 drbd_info(device, "before-resync-target handler returned %d, "
09b9e797 1653 "dropping connection.\n", r);
a6b32bc3 1654 conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD);
09b9e797
PR
1655 return;
1656 }
e64a3294 1657 } else /* C_SYNC_SOURCE */ {
b30ab791 1658 r = drbd_khelper(device, "before-resync-source");
e64a3294
PR
1659 r = (r >> 8) & 0xff;
1660 if (r > 0) {
1661 if (r == 3) {
d0180171 1662 drbd_info(device, "before-resync-source handler returned %d, "
e64a3294
PR
1663 "ignoring. Old userland tools?", r);
1664 } else {
d0180171 1665 drbd_info(device, "before-resync-source handler returned %d, "
e64a3294 1666 "dropping connection.\n", r);
a6b32bc3
AG
1667 conn_request_state(first_peer_device(device)->connection,
1668 NS(conn, C_DISCONNECTING), CS_HARD);
e64a3294
PR
1669 return;
1670 }
1671 }
09b9e797 1672 }
b411b363
PR
1673 }
1674
a6b32bc3 1675 if (current == first_peer_device(device)->connection->worker.task) {
dad20554 1676 /* The worker should not sleep waiting for state_mutex,
e64a3294 1677 that can take long */
b30ab791
AG
1678 if (!mutex_trylock(device->state_mutex)) {
1679 set_bit(B_RS_H_DONE, &device->flags);
1680 device->start_resync_timer.expires = jiffies + HZ/5;
1681 add_timer(&device->start_resync_timer);
e64a3294
PR
1682 return;
1683 }
1684 } else {
b30ab791 1685 mutex_lock(device->state_mutex);
e64a3294 1686 }
b30ab791 1687 clear_bit(B_RS_H_DONE, &device->flags);
b411b363 1688
074f4afe
LE
1689 /* req_lock: serialize with drbd_send_and_submit() and others
1690 * global_state_lock: for stable sync-after dependencies */
1691 spin_lock_irq(&device->resource->req_lock);
1692 write_lock(&global_state_lock);
a700471b 1693 /* Did some connection breakage or IO error race with us? */
b30ab791
AG
1694 if (device->state.conn < C_CONNECTED
1695 || !get_ldev_if_state(device, D_NEGOTIATING)) {
074f4afe
LE
1696 write_unlock(&global_state_lock);
1697 spin_unlock_irq(&device->resource->req_lock);
b30ab791 1698 mutex_unlock(device->state_mutex);
b411b363
PR
1699 return;
1700 }
1701
b30ab791 1702 ns = drbd_read_state(device);
b411b363 1703
b30ab791 1704 ns.aftr_isp = !_drbd_may_sync_now(device);
b411b363
PR
1705
1706 ns.conn = side;
1707
1708 if (side == C_SYNC_TARGET)
1709 ns.disk = D_INCONSISTENT;
1710 else /* side == C_SYNC_SOURCE */
1711 ns.pdsk = D_INCONSISTENT;
1712
b30ab791
AG
1713 r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
1714 ns = drbd_read_state(device);
b411b363
PR
1715
1716 if (ns.conn < C_CONNECTED)
1717 r = SS_UNKNOWN_ERROR;
1718
1719 if (r == SS_SUCCESS) {
b30ab791 1720 unsigned long tw = drbd_bm_total_weight(device);
1d7734a0
LE
1721 unsigned long now = jiffies;
1722 int i;
1723
b30ab791
AG
1724 device->rs_failed = 0;
1725 device->rs_paused = 0;
1726 device->rs_same_csum = 0;
1727 device->rs_last_events = 0;
1728 device->rs_last_sect_ev = 0;
1729 device->rs_total = tw;
1730 device->rs_start = now;
1d7734a0 1731 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
b30ab791
AG
1732 device->rs_mark_left[i] = tw;
1733 device->rs_mark_time[i] = now;
1d7734a0 1734 }
b30ab791 1735 _drbd_pause_after(device);
b411b363 1736 }
074f4afe
LE
1737 write_unlock(&global_state_lock);
1738 spin_unlock_irq(&device->resource->req_lock);
5a22db89 1739
b411b363 1740 if (r == SS_SUCCESS) {
328e0f12
PR
1741 /* reset rs_last_bcast when a resync or verify is started,
1742 * to deal with potential jiffies wrap. */
b30ab791 1743 device->rs_last_bcast = jiffies - HZ;
328e0f12 1744
d0180171 1745 drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
b411b363 1746 drbd_conn_str(ns.conn),
b30ab791
AG
1747 (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
1748 (unsigned long) device->rs_total);
6c922ed5 1749 if (side == C_SYNC_TARGET)
b30ab791 1750 device->bm_resync_fo = 0;
6c922ed5
LE
1751
1752 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
1753 * with w_send_oos, or the sync target will get confused as to
1754 * how many bits to resync. We cannot always do that, because for an
1755 * empty resync and protocol < 95, we need to do it here, as we call
1756 * drbd_resync_finished from here in that case.
1757 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
1758 * and from after_state_ch otherwise. */
a6b32bc3
AG
1759 if (side == C_SYNC_SOURCE &&
1760 first_peer_device(device)->connection->agreed_pro_version < 96)
69a22773 1761 drbd_gen_and_send_sync_uuid(first_peer_device(device));
b411b363 1762
a6b32bc3
AG
1763 if (first_peer_device(device)->connection->agreed_pro_version < 95 &&
1764 device->rs_total == 0) {
af85e8e8
LE
1765 /* This still has a race (about when exactly the peers
1766 * detect connection loss) that can lead to a full sync
1767 * on next handshake. In 8.3.9 we fixed this with explicit
1768 * resync-finished notifications, but the fix
1769 * introduces a protocol change. Sleeping for some
1770 * time longer than the ping interval + timeout on the
1771 * SyncSource, to give the SyncTarget the chance to
1772 * detect connection loss, then waiting for a ping
1773 * response (implicit in drbd_resync_finished) reduces
1774 * the race considerably, but does not solve it. */
44ed167d
PR
1775 if (side == C_SYNC_SOURCE) {
1776 struct net_conf *nc;
1777 int timeo;
1778
1779 rcu_read_lock();
a6b32bc3 1780 nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
44ed167d
PR
1781 timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
1782 rcu_read_unlock();
1783 schedule_timeout_interruptible(timeo);
1784 }
b30ab791 1785 drbd_resync_finished(device);
b411b363
PR
1786 }
1787
b30ab791
AG
1788 drbd_rs_controller_reset(device);
1789 /* ns.conn may already be != device->state.conn,
b411b363
PR
1790 * we may have been paused in between, or become paused until
1791 * the timer triggers.
1792 * No matter, that is handled in resync_timer_fn() */
1793 if (ns.conn == C_SYNC_TARGET)
b30ab791 1794 mod_timer(&device->resync_timer, jiffies);
b411b363 1795
b30ab791 1796 drbd_md_sync(device);
b411b363 1797 }
b30ab791
AG
1798 put_ldev(device);
1799 mutex_unlock(device->state_mutex);
b411b363
PR
1800}
1801
a186e478 1802static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
8c0785a5
LE
1803{
1804 spin_lock_irq(&queue->q_lock);
1805 list_splice_init(&queue->q, work_list);
1806 spin_unlock_irq(&queue->q_lock);
1807 return !list_empty(work_list);
1808}
1809
a186e478 1810static bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list)
8c0785a5
LE
1811{
1812 spin_lock_irq(&queue->q_lock);
1813 if (!list_empty(&queue->q))
1814 list_move(queue->q.next, work_list);
1815 spin_unlock_irq(&queue->q_lock);
1816 return !list_empty(work_list);
1817}
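/*
 * Illustration only, not part of drbd_worker.c: a hypothetical userspace
 * model of the two helpers above -- detaching the whole queue at once
 * (batch) versus moving just the first item -- with a pthread mutex standing
 * in for the q_lock spinlock and a plain singly linked list standing in for
 * struct list_head.
 */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct toy_work {
	struct toy_work *next;
	int id;
};

struct toy_queue {
	pthread_mutex_t lock;
	struct toy_work *head;
};

/* Detach the entire queue in one critical section (cf. dequeue_work_batch). */
static struct toy_work *toy_dequeue_batch(struct toy_queue *q)
{
	struct toy_work *all;

	pthread_mutex_lock(&q->lock);
	all = q->head;
	q->head = NULL;
	pthread_mutex_unlock(&q->lock);
	return all;
}

/* Detach only the first item, leaving the rest queued (cf. dequeue_work_item). */
static struct toy_work *toy_dequeue_one(struct toy_queue *q)
{
	struct toy_work *w;

	pthread_mutex_lock(&q->lock);
	w = q->head;
	if (w) {
		q->head = w->next;
		w->next = NULL;
	}
	pthread_mutex_unlock(&q->lock);
	return w;
}

int main(void)
{
	static struct toy_queue q = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct toy_work items[3] = { { &items[1], 1 }, { &items[2], 2 }, { NULL, 3 } };
	struct toy_work *w;

	q.head = &items[0];
	w = toy_dequeue_one(&q);
	printf("single item: %d, next queued: %d\n",
	       w ? w->id : -1, q.head ? q.head->id : -1);
	toy_dequeue_batch(&q);
	printf("queue empty after batch: %s\n", q.head ? "no" : "yes");
	return 0;
}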
1818
bde89a9e 1819static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
b6dd1a89
LE
1820{
1821 DEFINE_WAIT(wait);
1822 struct net_conf *nc;
1823 int uncork, cork;
1824
1825 dequeue_work_item(&connection->sender_work, work_list);
1826 if (!list_empty(work_list))
1827 return;
1828
1829 /* Still nothing to do?
1830 * Maybe we still need to close the current epoch,
1831 * even if no new requests are queued yet.
1832 *
1833 * Also, poke TCP, just in case.
1834 * Then wait for new work (or signal). */
1835 rcu_read_lock();
1836 nc = rcu_dereference(connection->net_conf);
1837 uncork = nc ? nc->tcp_cork : 0;
1838 rcu_read_unlock();
1839 if (uncork) {
1840 mutex_lock(&connection->data.mutex);
1841 if (connection->data.socket)
1842 drbd_tcp_uncork(connection->data.socket);
1843 mutex_unlock(&connection->data.mutex);
1844 }
1845
1846 for (;;) {
1847 int send_barrier;
1848 prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
0500813f 1849 spin_lock_irq(&connection->resource->req_lock);
b6dd1a89 1850 spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
bc317a9e
LE
1851 /* dequeue single item only,
1852 * we still use drbd_queue_work_front() in some places */
1853 if (!list_empty(&connection->sender_work.q))
1854 list_move(connection->sender_work.q.next, work_list);
b6dd1a89
LE
1855 spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
1856 if (!list_empty(work_list) || signal_pending(current)) {
0500813f 1857 spin_unlock_irq(&connection->resource->req_lock);
b6dd1a89
LE
1858 break;
1859 }
f9c78128
LE
1860
1861 /* We found nothing new to do, no to-be-communicated request,
1862 * no other work item. We may still need to close the last
1863 * epoch. The next incoming request's epoch will be the connection's
1864 * current transfer log epoch number. If that is different
1865 * from the epoch of the last request we communicated, it is
1866 * safe to send the epoch separating barrier now.
1867 */
1868 send_barrier =
1869 atomic_read(&connection->current_tle_nr) !=
1870 connection->send.current_epoch_nr;
0500813f 1871 spin_unlock_irq(&connection->resource->req_lock);
f9c78128
LE
1872
1873 if (send_barrier)
1874 maybe_send_barrier(connection,
1875 connection->send.current_epoch_nr + 1);
b6dd1a89
LE
1876 schedule();
1877 /* we may be woken up for things other than new work, too,
1878 * e.g. if the current epoch got closed.
1879 * In which case we send the barrier above. */
1880 }
1881 finish_wait(&connection->sender_work.q_wait, &wait);
1882
1883 /* someone may have changed the config while we have been waiting above. */
1884 rcu_read_lock();
1885 nc = rcu_dereference(connection->net_conf);
1886 cork = nc ? nc->tcp_cork : 0;
1887 rcu_read_unlock();
1888 mutex_lock(&connection->data.mutex);
1889 if (connection->data.socket) {
1890 if (cork)
1891 drbd_tcp_cork(connection->data.socket);
1892 else if (!uncork)
1893 drbd_tcp_uncork(connection->data.socket);
1894 }
1895 mutex_unlock(&connection->data.mutex);
1896}
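/*
 * Illustration only, not part of drbd_worker.c: a hypothetical userspace
 * sketch of the cork/uncork pattern around the wait above.  drbd_tcp_cork()
 * and drbd_tcp_uncork() essentially set and clear the Linux-specific
 * TCP_CORK socket option: uncork before sleeping so anything already queued
 * goes out, cork again afterwards so the next batch of work items is sent in
 * as few segments as possible.  'fd' is assumed to be a connected TCP socket
 * obtained elsewhere.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void toy_tcp_cork(int fd, int on)
{
	/* TCP_CORK: hold back partial segments until the option is cleared. */
	(void)setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
}

/*
 * usage sketch:
 *	toy_tcp_cork(fd, 0);	// uncork: flush whatever is pending
 *	...sleep until new work arrives...
 *	toy_tcp_cork(fd, 1);	// cork: batch the writes for the next items
 */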
1897
b411b363
PR
1898int drbd_worker(struct drbd_thread *thi)
1899{
bde89a9e 1900 struct drbd_connection *connection = thi->connection;
6db7e50a 1901 struct drbd_work *w = NULL;
c06ece6b 1902 struct drbd_peer_device *peer_device;
b411b363 1903 LIST_HEAD(work_list);
8c0785a5 1904 int vnr;
b411b363 1905
e77a0a5c 1906 while (get_t_state(thi) == RUNNING) {
80822284 1907 drbd_thread_current_set_cpu(thi);
b411b363 1908
8c0785a5
LE
1909 /* as long as we use drbd_queue_work_front(),
1910 * we may only dequeue single work items here, not batches. */
1911 if (list_empty(&work_list))
bde89a9e 1912 wait_for_work(connection, &work_list);
b411b363 1913
8c0785a5 1914 if (signal_pending(current)) {
b411b363 1915 flush_signals(current);
19393e10 1916 if (get_t_state(thi) == RUNNING) {
1ec861eb 1917 drbd_warn(connection, "Worker got an unexpected signal\n");
b411b363 1918 continue;
19393e10 1919 }
b411b363
PR
1920 break;
1921 }
1922
e77a0a5c 1923 if (get_t_state(thi) != RUNNING)
b411b363 1924 break;
b411b363 1925
8c0785a5 1926 while (!list_empty(&work_list)) {
6db7e50a
AG
1927 w = list_first_entry(&work_list, struct drbd_work, list);
1928 list_del_init(&w->list);
1929 if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
8c0785a5 1930 continue;
bde89a9e
AG
1931 if (connection->cstate >= C_WF_REPORT_PARAMS)
1932 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
b411b363
PR
1933 }
1934 }
b411b363 1935
8c0785a5 1936 do {
b411b363 1937 while (!list_empty(&work_list)) {
6db7e50a
AG
1938 w = list_first_entry(&work_list, struct drbd_work, list);
1939 list_del_init(&w->list);
1940 w->cb(w, 1);
b411b363 1941 }
bde89a9e 1942 dequeue_work_batch(&connection->sender_work, &work_list);
8c0785a5 1943 } while (!list_empty(&work_list));
b411b363 1944
c141ebda 1945 rcu_read_lock();
c06ece6b
AG
1946 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1947 struct drbd_device *device = peer_device->device;
0b0ba1ef 1948 D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
b30ab791 1949 kref_get(&device->kref);
c141ebda 1950 rcu_read_unlock();
b30ab791 1951 drbd_device_cleanup(device);
05a10ec7 1952 kref_put(&device->kref, drbd_destroy_device);
c141ebda 1953 rcu_read_lock();
0e29d163 1954 }
c141ebda 1955 rcu_read_unlock();
b411b363
PR
1956
1957 return 0;
1958}