git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - drivers/block/drbd/drbd_worker.c
drbd: struct after_conn_state_chg_work: Use drbd_work instead of drbd_device_work
1/*
2 drbd_worker.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24*/
25
26#include <linux/module.h>
27#include <linux/drbd.h>
28#include <linux/sched.h>
29#include <linux/wait.h>
30#include <linux/mm.h>
31#include <linux/memcontrol.h>
32#include <linux/mm_inline.h>
33#include <linux/slab.h>
34#include <linux/random.h>
35#include <linux/string.h>
36#include <linux/scatterlist.h>
37
38#include "drbd_int.h"
39#include "drbd_protocol.h"
40#include "drbd_req.h"
41
42static int w_make_ov_request(struct drbd_work *, int);
43
44
45/* endio handlers:
46 * drbd_md_io_complete (defined here)
47 * drbd_request_endio (defined here)
48 * drbd_peer_request_endio (defined here)
49 * bm_async_io_complete (defined in drbd_bitmap.c)
50 *
51 * For all these callbacks, note the following:
52 * The callbacks will be called in irq context by the IDE drivers,
53 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
54 * Try to get the locking right :)
55 *
56 */
57
58
59/* About the global_state_lock
60 Each state transition on a device holds a read lock. In case we have
61 to evaluate the resync after dependencies, we grab a write lock, because
62 we need stable states on all devices for that. */
63rwlock_t global_state_lock;
64
65/* used for synchronous meta data and bitmap IO
66 * submitted by drbd_md_sync_page_io()
67 */
68void drbd_md_io_complete(struct bio *bio, int error)
69{
70 struct drbd_md_io *md_io;
71 struct drbd_device *device;
72
73 md_io = (struct drbd_md_io *)bio->bi_private;
74 device = container_of(md_io, struct drbd_device, md_io);
75
76 md_io->error = error;
77
78 /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
79 * to timeout on the lower level device, and eventually detach from it.
80 * If this io completion runs after that timeout expired, this
81 * drbd_md_put_buffer() may allow us to finally try and re-attach.
82 * During normal operation, this only puts that extra reference
83 * down to 1 again.
84 * Make sure we first drop the reference, and only then signal
85 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
86 * next drbd_md_sync_page_io(), that we trigger the
87 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
88 */
89 drbd_md_put_buffer(device);
90 md_io->done = 1;
91 wake_up(&device->misc_wait);
92 bio_put(bio);
93 if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
94 put_ldev(device);
95}
96
97/* reads on behalf of the partner,
98 * "submitted" by the receiver
99 */
100static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
101{
102 unsigned long flags = 0;
103 struct drbd_device *device = peer_req->dw.device;
104
105 spin_lock_irqsave(&device->resource->req_lock, flags);
106 device->read_cnt += peer_req->i.size >> 9;
107 list_del(&peer_req->dw.w.list);
108 if (list_empty(&device->read_ee))
109 wake_up(&device->ee_wait);
110 if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
111 __drbd_chk_io_error(device, DRBD_READ_ERROR);
112 spin_unlock_irqrestore(&device->resource->req_lock, flags);
113
114 drbd_queue_work(&first_peer_device(device)->connection->sender_work,
115 &peer_req->dw.w);
116 put_ldev(device);
117}
118
119/* writes on behalf of the partner, or resync writes,
120 * "submitted" by the receiver, final stage. */
121static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
122{
123 unsigned long flags = 0;
124 struct drbd_device *device = peer_req->dw.device;
125 struct drbd_interval i;
126 int do_wake;
127 u64 block_id;
128 int do_al_complete_io;
129
130 /* after we moved peer_req to done_ee,
131 * we may no longer access it,
132 * it may be freed/reused already!
133 * (as soon as we release the req_lock) */
134 i = peer_req->i;
135 do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
136 block_id = peer_req->block_id;
137
138 spin_lock_irqsave(&device->resource->req_lock, flags);
139 device->writ_cnt += peer_req->i.size >> 9;
140 list_move_tail(&peer_req->dw.w.list, &device->done_ee);
141
142 /*
143 * Do not remove from the write_requests tree here: we did not send the
144 * Ack yet and did not wake possibly waiting conflicting requests.
145 * Removal from the tree happens in "drbd_process_done_ee", within the
146 * appropriate dw.cb (e_end_block/e_end_resync_block), or in
147 * _drbd_clear_done_ee.
148 */
149
150 do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
151
152 if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
153 __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
154 spin_unlock_irqrestore(&device->resource->req_lock, flags);
155
156 if (block_id == ID_SYNCER)
157 drbd_rs_complete_io(device, i.sector);
158
159 if (do_wake)
160 wake_up(&device->ee_wait);
161
162 if (do_al_complete_io)
163 drbd_al_complete_io(device, &i);
164
165 wake_asender(first_peer_device(device)->connection);
166 put_ldev(device);
167}
b411b363 168
169/* writes on behalf of the partner, or resync writes,
170 * "submitted" by the receiver.
171 */
fcefa62e 172void drbd_peer_request_endio(struct bio *bio, int error)
45bb912b 173{
db830c46 174 struct drbd_peer_request *peer_req = bio->bi_private;
84b8c06b 175 struct drbd_device *device = peer_req->dw.device;
176 int uptodate = bio_flagged(bio, BIO_UPTODATE);
177 int is_write = bio_data_dir(bio) == WRITE;
178
07194272 179 if (error && __ratelimit(&drbd_ratelimit_state))
d0180171 180 drbd_warn(device, "%s: error=%d s=%llus\n",
45bb912b 181 is_write ? "write" : "read", error,
db830c46 182 (unsigned long long)peer_req->i.sector);
45bb912b 183 if (!error && !uptodate) {
07194272 184 if (__ratelimit(&drbd_ratelimit_state))
d0180171 185 drbd_warn(device, "%s: setting error to -EIO s=%llus\n",
07194272 186 is_write ? "write" : "read",
db830c46 187 (unsigned long long)peer_req->i.sector);
188 /* strange behavior of some lower level drivers...
189 * fail the request by clearing the uptodate flag,
190 * but do not return any error?! */
191 error = -EIO;
192 }
193
194 if (error)
db830c46 195 set_bit(__EE_WAS_ERROR, &peer_req->flags);
196
197 bio_put(bio); /* no need for the bio anymore */
db830c46 198 if (atomic_dec_and_test(&peer_req->pending_bios)) {
45bb912b 199 if (is_write)
db830c46 200 drbd_endio_write_sec_final(peer_req);
45bb912b 201 else
db830c46 202 drbd_endio_read_sec_final(peer_req);
45bb912b 203 }
204}
205
206/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
207 */
fcefa62e 208void drbd_request_endio(struct bio *bio, int error)
b411b363 209{
a115413d 210 unsigned long flags;
b411b363 211 struct drbd_request *req = bio->bi_private;
84b8c06b 212 struct drbd_device *device = req->device;
a115413d 213 struct bio_and_error m;
214 enum drbd_req_event what;
215 int uptodate = bio_flagged(bio, BIO_UPTODATE);
216
b411b363 217 if (!error && !uptodate) {
d0180171 218 drbd_warn(device, "p %s: setting error to -EIO\n",
219 bio_data_dir(bio) == WRITE ? "write" : "read");
220 /* strange behavior of some lower level drivers...
221 * fail the request by clearing the uptodate flag,
222 * but do not return any error?! */
223 error = -EIO;
224 }
225
226
227 /* If this request was aborted locally before,
228 * but now was completed "successfully",
229 * chances are that this caused arbitrary data corruption.
230 *
231 * "aborting" requests, or force-detaching the disk, is intended for
232 * completely blocked/hung local backing devices which do no longer
233 * complete requests at all, not even do error completions. In this
234 * situation, usually a hard-reset and failover is the only way out.
235 *
236 * By "aborting", basically faking a local error-completion,
237 * we allow for a more graceful switchover by cleanly migrating services.
238 * Still the affected node has to be rebooted "soon".
239 *
240 * By completing these requests, we allow the upper layers to re-use
241 * the associated data pages.
242 *
243 * If later the local backing device "recovers", and now DMAs some data
244 * from disk into the original request pages, in the best case it will
245 * just put random data into unused pages; but typically it will corrupt
246 * meanwhile completely unrelated data, causing all sorts of damage.
247 *
248 * Which means delayed successful completion,
249 * especially for READ requests,
250 * is a reason to panic().
251 *
252 * We assume that a delayed *error* completion is OK,
253 * though we still will complain noisily about it.
254 */
255 if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
256 if (__ratelimit(&drbd_ratelimit_state))
d0180171 257 drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
258
259 if (!error)
260 panic("possible random memory corruption caused by delayed completion of aborted local request\n");
261 }
262
263 /* to avoid recursion in __req_mod */
264 if (unlikely(error)) {
265 what = (bio_data_dir(bio) == WRITE)
8554df1c 266 ? WRITE_COMPLETED_WITH_ERROR
5c3c7e64 267 : (bio_rw(bio) == READ)
268 ? READ_COMPLETED_WITH_ERROR
269 : READ_AHEAD_COMPLETED_WITH_ERROR;
b411b363 270 } else
8554df1c 271 what = COMPLETED_OK;
272
273 bio_put(req->private_bio);
274 req->private_bio = ERR_PTR(error);
275
a115413d 276 /* not req_mod(), we need irqsave here! */
0500813f 277 spin_lock_irqsave(&device->resource->req_lock, flags);
a115413d 278 __req_mod(req, what, &m);
0500813f 279 spin_unlock_irqrestore(&device->resource->req_lock, flags);
b30ab791 280 put_ldev(device);
281
282 if (m.bio)
b30ab791 283 complete_master_bio(device, &m);
284}
285
79a3c8d3 286void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest)
287{
288 struct hash_desc desc;
289 struct scatterlist sg;
db830c46 290 struct page *page = peer_req->pages;
291 struct page *tmp;
292 unsigned len;
293
294 desc.tfm = tfm;
295 desc.flags = 0;
296
297 sg_init_table(&sg, 1);
298 crypto_hash_init(&desc);
299
300 while ((tmp = page_chain_next(page))) {
301 /* all but the last page will be fully used */
302 sg_set_page(&sg, page, PAGE_SIZE, 0);
303 crypto_hash_update(&desc, &sg, sg.length);
304 page = tmp;
305 }
306 /* and now the last, possibly only partially used page */
db830c46 307 len = peer_req->i.size & (PAGE_SIZE - 1);
308 sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
309 crypto_hash_update(&desc, &sg, sg.length);
310 crypto_hash_final(&desc, digest);
311}
312
79a3c8d3 313void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest)
314{
315 struct hash_desc desc;
316 struct scatterlist sg;
317 struct bio_vec bvec;
318 struct bvec_iter iter;
319
320 desc.tfm = tfm;
321 desc.flags = 0;
322
323 sg_init_table(&sg, 1);
324 crypto_hash_init(&desc);
325
326 bio_for_each_segment(bvec, bio, iter) {
327 sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
328 crypto_hash_update(&desc, &sg, sg.length);
329 }
330 crypto_hash_final(&desc, digest);
331}
332
9676c760 333/* MAYBE merge common code with w_e_end_ov_req */
99920dc5 334static int w_e_send_csum(struct drbd_work *w, int cancel)
b411b363 335{
336 struct drbd_device_work *dw = device_work(w);
337 struct drbd_peer_request *peer_req = container_of(dw, struct drbd_peer_request, dw);
338 struct drbd_device *device = dw->device;
339 int digest_size;
340 void *digest;
99920dc5 341 int err = 0;
b411b363 342
343 if (unlikely(cancel))
344 goto out;
b411b363 345
9676c760 346 if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
53ea4331 347 goto out;
b411b363 348
a6b32bc3 349 digest_size = crypto_hash_digestsize(first_peer_device(device)->connection->csums_tfm);
350 digest = kmalloc(digest_size, GFP_NOIO);
351 if (digest) {
352 sector_t sector = peer_req->i.sector;
353 unsigned int size = peer_req->i.size;
79a3c8d3 354 drbd_csum_ee(first_peer_device(device)->connection->csums_tfm, peer_req, digest);
9676c760 355 /* Free peer_req and pages before send.
356 * In case we block on congestion, we could otherwise run into
357 * some distributed deadlock, if the other side blocks on
358 * congestion as well, because our receiver blocks in
c37c8ecf 359 * drbd_alloc_pages due to pp_in_use > max_buffers. */
b30ab791 360 drbd_free_peer_req(device, peer_req);
db830c46 361 peer_req = NULL;
b30ab791 362 inc_rs_pending(device);
69a22773 363 err = drbd_send_drequest_csum(first_peer_device(device), sector, size,
364 digest, digest_size,
365 P_CSUM_RS_REQUEST);
366 kfree(digest);
367 } else {
d0180171 368 drbd_err(device, "kmalloc() of digest failed.\n");
99920dc5 369 err = -ENOMEM;
53ea4331 370 }
b411b363 371
53ea4331 372out:
db830c46 373 if (peer_req)
b30ab791 374 drbd_free_peer_req(device, peer_req);
b411b363 375
99920dc5 376 if (unlikely(err))
d0180171 377 drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
99920dc5 378 return err;
379}
380
381#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
382
69a22773 383static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
b411b363 384{
69a22773 385 struct drbd_device *device = peer_device->device;
db830c46 386 struct drbd_peer_request *peer_req;
b411b363 387
b30ab791 388 if (!get_ldev(device))
80a40e43 389 return -EIO;
b411b363 390
b30ab791 391 if (drbd_rs_should_slow_down(device, sector))
0f0601f4
LE
392 goto defer;
393
b411b363
PR
394 /* GFP_TRY, because if there is no memory available right now, this may
395 * be rescheduled for later. It is "only" background resync, after all. */
69a22773 396 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
0db55363 397 size, GFP_TRY);
db830c46 398 if (!peer_req)
80a40e43 399 goto defer;
b411b363 400
84b8c06b 401 peer_req->dw.w.cb = w_e_send_csum;
0500813f 402 spin_lock_irq(&device->resource->req_lock);
84b8c06b 403 list_add(&peer_req->dw.w.list, &device->read_ee);
0500813f 404 spin_unlock_irq(&device->resource->req_lock);
b411b363 405
b30ab791
AG
406 atomic_add(size >> 9, &device->rs_sect_ev);
407 if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
80a40e43 408 return 0;
b411b363 409
410 /* If it failed because of ENOMEM, retry should help. If it failed
411 * because bio_add_page failed (probably broken lower level driver),
412 * retry may or may not help.
413 * If it does not, you may need to force disconnect. */
0500813f 414 spin_lock_irq(&device->resource->req_lock);
84b8c06b 415 list_del(&peer_req->dw.w.list);
0500813f 416 spin_unlock_irq(&device->resource->req_lock);
22cc37a9 417
b30ab791 418 drbd_free_peer_req(device, peer_req);
80a40e43 419defer:
b30ab791 420 put_ldev(device);
80a40e43 421 return -EAGAIN;
b411b363
PR
422}
423
99920dc5 424int w_resync_timer(struct drbd_work *w, int cancel)
b411b363 425{
84b8c06b
AG
426 struct drbd_device *device =
427 container_of(w, struct drbd_device, resync_work);
428
b30ab791 429 switch (device->state.conn) {
63106d3c 430 case C_VERIFY_S:
00d56944 431 w_make_ov_request(w, cancel);
63106d3c
PR
432 break;
433 case C_SYNC_TARGET:
00d56944 434 w_make_resync_request(w, cancel);
63106d3c 435 break;
b411b363
PR
436 }
437
99920dc5 438 return 0;
794abb75
PR
439}
440
441void resync_timer_fn(unsigned long data)
442{
b30ab791 443 struct drbd_device *device = (struct drbd_device *) data;
794abb75 444
b30ab791 445 if (list_empty(&device->resync_work.list))
84b8c06b
AG
446 drbd_queue_work(&first_peer_device(device)->connection->sender_work,
447 &device->resync_work);
b411b363
PR
448}
449
778f271d
PR
450static void fifo_set(struct fifo_buffer *fb, int value)
451{
452 int i;
453
454 for (i = 0; i < fb->size; i++)
f10f2623 455 fb->values[i] = value;
778f271d
PR
456}
457
458static int fifo_push(struct fifo_buffer *fb, int value)
459{
460 int ov;
461
462 ov = fb->values[fb->head_index];
463 fb->values[fb->head_index++] = value;
464
465 if (fb->head_index >= fb->size)
466 fb->head_index = 0;
467
468 return ov;
469}
470
471static void fifo_add_val(struct fifo_buffer *fb, int value)
472{
473 int i;
474
475 for (i = 0; i < fb->size; i++)
476 fb->values[i] += value;
477}
478
9958c857
PR
479struct fifo_buffer *fifo_alloc(int fifo_size)
480{
481 struct fifo_buffer *fb;
482
8747d30a 483 fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
9958c857
PR
484 if (!fb)
485 return NULL;
486
487 fb->head_index = 0;
488 fb->size = fifo_size;
489 fb->total = 0;
490
491 return fb;
492}
493
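/* Note: the fifo_buffer helpers above implement the resync controller's
 * "plan": fifo_add_val() spreads a correction evenly over all slots, and
 * fifo_push() consumes one slot per controller invocation (see
 * drbd_rs_controller() below). */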
b30ab791 494static int drbd_rs_controller(struct drbd_device *device)
778f271d 495{
daeda1cc 496 struct disk_conf *dc;
778f271d
PR
497 unsigned int sect_in; /* Number of sectors that came in since the last turn */
498 unsigned int want; /* The number of sectors we want in the proxy */
499 int req_sect; /* Number of sectors to request in this turn */
500 int correction; /* Number of sectors more we need in the proxy*/
501 int cps; /* correction per invocation of drbd_rs_controller() */
502 int steps; /* Number of time steps to plan ahead */
503 int curr_corr;
504 int max_sect;
813472ce 505 struct fifo_buffer *plan;
778f271d 506
b30ab791
AG
507 sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */
508 device->rs_in_flight -= sect_in;
778f271d 509
b30ab791
AG
510 dc = rcu_dereference(device->ldev->disk_conf);
511 plan = rcu_dereference(device->rs_plan_s);
778f271d 512
813472ce 513 steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
778f271d 514
b30ab791 515 if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
daeda1cc 516 want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
778f271d 517 } else { /* normal path */
daeda1cc
PR
518 want = dc->c_fill_target ? dc->c_fill_target :
519 sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
778f271d
PR
520 }
521
b30ab791 522 correction = want - device->rs_in_flight - plan->total;
778f271d
PR
523
524 /* Plan ahead */
525 cps = correction / steps;
813472ce
PR
526 fifo_add_val(plan, cps);
527 plan->total += cps * steps;
778f271d
PR
528
529 /* What we do in this step */
813472ce
PR
530 curr_corr = fifo_push(plan, 0);
531 plan->total -= curr_corr;
778f271d
PR
532
533 req_sect = sect_in + curr_corr;
534 if (req_sect < 0)
535 req_sect = 0;
536
daeda1cc 537 max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
778f271d
PR
538 if (req_sect > max_sect)
539 req_sect = max_sect;
540
541 /*
d0180171 542 drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
b30ab791
AG
543 sect_in, device->rs_in_flight, want, correction,
544 steps, cps, device->rs_planed, curr_corr, req_sect);
778f271d
PR
545 */
546
547 return req_sect;
548}
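/* Illustrative example (made-up numbers): with steps = 10, a target of
 * want = 2000 sectors and rs_in_flight + plan->total = 1500, the correction
 * is 500 sectors, so cps = 50 is added to every slot of the plan. Assuming
 * the popped slot previously held 0, curr_corr = 50 and this turn requests
 * sect_in + 50 sectors, capped at max_sect. */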
549
b30ab791 550static int drbd_rs_number_requests(struct drbd_device *device)
e65f440d
LE
551{
552 int number;
813472ce
PR
553
554 rcu_read_lock();
b30ab791
AG
555 if (rcu_dereference(device->rs_plan_s)->size) {
556 number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9);
557 device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
e65f440d 558 } else {
b30ab791
AG
559 device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
560 number = SLEEP_TIME * device->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
e65f440d 561 }
813472ce 562 rcu_read_unlock();
e65f440d 563
e65f440d
LE
564 /* ignore the amount of pending requests, the resync controller should
565 * throttle down to incoming reply rate soon enough anyways. */
566 return number;
567}
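/* Unit note: drbd_rs_controller() works in 512-byte sectors, while requests
 * are issued in BM_BLOCK_SIZE chunks, hence the >> (BM_BLOCK_SHIFT - 9)
 * conversion above; the resulting c_sync_rate is in KiB per second. */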
568
99920dc5 569int w_make_resync_request(struct drbd_work *w, int cancel)
b411b363 570{
84b8c06b
AG
571 struct drbd_device_work *dw = device_work(w);
572 struct drbd_device *device = dw->device;
b411b363
PR
573 unsigned long bit;
574 sector_t sector;
b30ab791 575 const sector_t capacity = drbd_get_capacity(device->this_bdev);
1816a2b4 576 int max_bio_size;
e65f440d 577 int number, rollback_i, size;
b411b363 578 int align, queued, sndbuf;
0f0601f4 579 int i = 0;
b411b363
PR
580
581 if (unlikely(cancel))
99920dc5 582 return 0;
b411b363 583
b30ab791 584 if (device->rs_total == 0) {
af85e8e8 585 /* empty resync? */
b30ab791 586 drbd_resync_finished(device);
99920dc5 587 return 0;
af85e8e8
LE
588 }
589
b30ab791
AG
590 if (!get_ldev(device)) {
591 /* Since we only need to access device->rsync a
592 get_ldev_if_state(device,D_FAILED) would be sufficient, but
b411b363
PR
593 to continue resync with a broken disk makes no sense at
594 all */
d0180171 595 drbd_err(device, "Disk broke down during resync!\n");
99920dc5 596 return 0;
b411b363
PR
597 }
598
b30ab791
AG
599 max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
600 number = drbd_rs_number_requests(device);
e65f440d 601 if (number == 0)
0f0601f4 602 goto requeue;
b411b363 603
b411b363
PR
604 for (i = 0; i < number; i++) {
605 /* Stop generating RS requests, when half of the send buffer is filled */
a6b32bc3
AG
606 mutex_lock(&first_peer_device(device)->connection->data.mutex);
607 if (first_peer_device(device)->connection->data.socket) {
608 queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued;
609 sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf;
b411b363
PR
610 } else {
611 queued = 1;
612 sndbuf = 0;
613 }
a6b32bc3 614 mutex_unlock(&first_peer_device(device)->connection->data.mutex);
b411b363
PR
615 if (queued > sndbuf / 2)
616 goto requeue;
617
618next_sector:
619 size = BM_BLOCK_SIZE;
b30ab791 620 bit = drbd_bm_find_next(device, device->bm_resync_fo);
b411b363 621
4b0715f0 622 if (bit == DRBD_END_OF_BITMAP) {
b30ab791
AG
623 device->bm_resync_fo = drbd_bm_bits(device);
624 put_ldev(device);
99920dc5 625 return 0;
b411b363
PR
626 }
627
628 sector = BM_BIT_TO_SECT(bit);
629
b30ab791
AG
630 if (drbd_rs_should_slow_down(device, sector) ||
631 drbd_try_rs_begin_io(device, sector)) {
632 device->bm_resync_fo = bit;
b411b363
PR
633 goto requeue;
634 }
b30ab791 635 device->bm_resync_fo = bit + 1;
b411b363 636
b30ab791
AG
637 if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
638 drbd_rs_complete_io(device, sector);
b411b363
PR
639 goto next_sector;
640 }
641
1816a2b4 642#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
b411b363
PR
643 /* try to find some adjacent bits.
644 * we stop if we have already the maximum req size.
645 *
646 * Additionally always align bigger requests, in order to
647 * be prepared for all stripe sizes of software RAIDs.
b411b363
PR
648 */
649 align = 1;
d207450c 650 rollback_i = i;
b411b363 651 for (;;) {
1816a2b4 652 if (size + BM_BLOCK_SIZE > max_bio_size)
b411b363
PR
653 break;
654
655 /* Be always aligned */
656 if (sector & ((1<<(align+3))-1))
657 break;
658
659 /* do not cross extent boundaries */
660 if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
661 break;
662 /* now, is it actually dirty, after all?
663 * caution, drbd_bm_test_bit is tri-state for some
664 * obscure reason; ( b == 0 ) would get the out-of-band
665 * only accidentally right because of the "oddly sized"
666 * adjustment below */
b30ab791 667 if (drbd_bm_test_bit(device, bit+1) != 1)
b411b363
PR
668 break;
669 bit++;
670 size += BM_BLOCK_SIZE;
671 if ((BM_BLOCK_SIZE << align) <= size)
672 align++;
673 i++;
674 }
675 /* if we merged some,
676 * reset the offset to start the next drbd_bm_find_next from */
677 if (size > BM_BLOCK_SIZE)
b30ab791 678 device->bm_resync_fo = bit + 1;
b411b363
PR
679#endif
680
681 /* adjust very last sectors, in case we are oddly sized */
682 if (sector + (size>>9) > capacity)
683 size = (capacity-sector)<<9;
a6b32bc3
AG
684 if (first_peer_device(device)->connection->agreed_pro_version >= 89 &&
685 first_peer_device(device)->connection->csums_tfm) {
69a22773 686 switch (read_for_csum(first_peer_device(device), sector, size)) {
80a40e43 687 case -EIO: /* Disk failure */
b30ab791 688 put_ldev(device);
99920dc5 689 return -EIO;
80a40e43 690 case -EAGAIN: /* allocation failed, or ldev busy */
b30ab791
AG
691 drbd_rs_complete_io(device, sector);
692 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
d207450c 693 i = rollback_i;
b411b363 694 goto requeue;
80a40e43
LE
695 case 0:
696 /* everything ok */
697 break;
698 default:
699 BUG();
b411b363
PR
700 }
701 } else {
99920dc5
AG
702 int err;
703
b30ab791 704 inc_rs_pending(device);
69a22773 705 err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST,
99920dc5
AG
706 sector, size, ID_SYNCER);
707 if (err) {
d0180171 708 drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
b30ab791
AG
709 dec_rs_pending(device);
710 put_ldev(device);
99920dc5 711 return err;
b411b363
PR
712 }
713 }
714 }
715
b30ab791 716 if (device->bm_resync_fo >= drbd_bm_bits(device)) {
b411b363
PR
717 /* last syncer _request_ was sent,
718 * but the P_RS_DATA_REPLY not yet received. sync will end (and
719 * next sync group will resume), as soon as we receive the last
720 * resync data block, and the last bit is cleared.
721 * until then resync "work" is "inactive" ...
722 */
b30ab791 723 put_ldev(device);
99920dc5 724 return 0;
b411b363
PR
725 }
726
727 requeue:
b30ab791
AG
728 device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
729 mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
730 put_ldev(device);
99920dc5 731 return 0;
b411b363
PR
732}
733
00d56944 734static int w_make_ov_request(struct drbd_work *w, int cancel)
b411b363 735{
84b8c06b 736 struct drbd_device *device = device_work(w)->device;
b411b363
PR
737 int number, i, size;
738 sector_t sector;
b30ab791 739 const sector_t capacity = drbd_get_capacity(device->this_bdev);
58ffa580 740 bool stop_sector_reached = false;
b411b363
PR
741
742 if (unlikely(cancel))
743 return 1;
744
b30ab791 745 number = drbd_rs_number_requests(device);
b411b363 746
b30ab791 747 sector = device->ov_position;
b411b363 748 for (i = 0; i < number; i++) {
58ffa580 749 if (sector >= capacity)
b411b363 750 return 1;
58ffa580
LE
751
752 /* We check for "finished" only in the reply path:
753 * w_e_end_ov_reply().
754 * We need to send at least one request out. */
755 stop_sector_reached = i > 0
756 && verify_can_do_stop_sector(device)
757 && sector >= device->ov_stop_sector;
58ffa580
LE
758 if (stop_sector_reached)
759 break;
b411b363
PR
760
761 size = BM_BLOCK_SIZE;
762
b30ab791
AG
763 if (drbd_rs_should_slow_down(device, sector) ||
764 drbd_try_rs_begin_io(device, sector)) {
765 device->ov_position = sector;
b411b363
PR
766 goto requeue;
767 }
768
769 if (sector + (size>>9) > capacity)
770 size = (capacity-sector)<<9;
771
b30ab791 772 inc_rs_pending(device);
69a22773 773 if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
b30ab791 774 dec_rs_pending(device);
b411b363
PR
775 return 0;
776 }
777 sector += BM_SECT_PER_BIT;
778 }
b30ab791 779 device->ov_position = sector;
b411b363
PR
780
781 requeue:
b30ab791 782 device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
58ffa580 783 if (i == 0 || !stop_sector_reached)
b30ab791 784 mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
b411b363
PR
785 return 1;
786}
787
99920dc5 788int w_ov_finished(struct drbd_work *w, int cancel)
b411b363 789{
84b8c06b
AG
790 struct drbd_device_work *dw =
791 container_of(w, struct drbd_device_work, w);
792 struct drbd_device *device = dw->device;
793 kfree(dw);
b30ab791
AG
794 ov_out_of_sync_print(device);
795 drbd_resync_finished(device);
b411b363 796
99920dc5 797 return 0;
b411b363
PR
798}
799
99920dc5 800static int w_resync_finished(struct drbd_work *w, int cancel)
b411b363 801{
84b8c06b
AG
802 struct drbd_device_work *dw =
803 container_of(w, struct drbd_device_work, w);
804 struct drbd_device *device = dw->device;
805 kfree(dw);
b411b363 806
b30ab791 807 drbd_resync_finished(device);
b411b363 808
99920dc5 809 return 0;
b411b363
PR
810}
811
b30ab791 812static void ping_peer(struct drbd_device *device)
af85e8e8 813{
a6b32bc3 814 struct drbd_connection *connection = first_peer_device(device)->connection;
2a67d8b9 815
bde89a9e
AG
816 clear_bit(GOT_PING_ACK, &connection->flags);
817 request_ping(connection);
818 wait_event(connection->ping_wait,
819 test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
af85e8e8
LE
820}
821
b30ab791 822int drbd_resync_finished(struct drbd_device *device)
b411b363
PR
823{
824 unsigned long db, dt, dbdt;
825 unsigned long n_oos;
826 union drbd_state os, ns;
84b8c06b 827 struct drbd_device_work *dw;
b411b363 828 char *khelper_cmd = NULL;
26525618 829 int verify_done = 0;
b411b363
PR
830
831 /* Remove all elements from the resync LRU. Since future actions
832 * might set bits in the (main) bitmap, then the entries in the
833 * resync LRU would be wrong. */
b30ab791 834 if (drbd_rs_del_all(device)) {
b411b363
PR
835 /* In case this is not possible now, most probably because
836 * there are P_RS_DATA_REPLY Packets lingering on the worker's
837 * queue (or even the read operations for those packets
838 * is not finished by now). Retry in 100ms. */
839
20ee6390 840 schedule_timeout_interruptible(HZ / 10);
84b8c06b
AG
841 dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
842 if (dw) {
843 dw->w.cb = w_resync_finished;
844 dw->device = device;
845 drbd_queue_work(&first_peer_device(device)->connection->sender_work,
846 &dw->w);
b411b363
PR
847 return 1;
848 }
84b8c06b 849 drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
b411b363
PR
850 }
851
b30ab791 852 dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
b411b363
PR
853 if (dt <= 0)
854 dt = 1;
84b8c06b 855
b30ab791 856 db = device->rs_total;
857 /* adjust for verify start and stop sectors, respectively the reached position */
858 if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
859 db -= device->ov_left;
58ffa580 860
b411b363 861 dbdt = Bit2KB(db/dt);
b30ab791 862 device->rs_paused /= HZ;
b411b363 863
b30ab791 864 if (!get_ldev(device))
b411b363
PR
865 goto out;
866
b30ab791 867 ping_peer(device);
af85e8e8 868
0500813f 869 spin_lock_irq(&device->resource->req_lock);
b30ab791 870 os = drbd_read_state(device);
b411b363 871
26525618
LE
872 verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
873
b411b363
PR
874 /* This protects us against multiple calls (that can happen in the presence
875 of application IO), and against connectivity loss just before we arrive here. */
876 if (os.conn <= C_CONNECTED)
877 goto out_unlock;
878
879 ns = os;
880 ns.conn = C_CONNECTED;
881
d0180171 882 drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
58ffa580 883 verify_done ? "Online verify" : "Resync",
b30ab791 884 dt + device->rs_paused, device->rs_paused, dbdt);
b411b363 885
b30ab791 886 n_oos = drbd_bm_total_weight(device);
b411b363
PR
887
888 if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
889 if (n_oos) {
d0180171 890 drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
891 n_oos, Bit2KB(1));
892 khelper_cmd = "out-of-sync";
893 }
894 } else {
0b0ba1ef 895 D_ASSERT(device, (n_oos - device->rs_failed) == 0);
b411b363
PR
896
897 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
898 khelper_cmd = "after-resync-target";
899
a6b32bc3 900 if (first_peer_device(device)->connection->csums_tfm && device->rs_total) {
b30ab791
AG
901 const unsigned long s = device->rs_same_csum;
902 const unsigned long t = device->rs_total;
b411b363
PR
903 const int ratio =
904 (t == 0) ? 0 :
905 (t < 100000) ? ((s*100)/t) : (s/(t/100));
d0180171 906 drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
907 "transferred %luK total %luK\n",
908 ratio,
b30ab791
AG
909 Bit2KB(device->rs_same_csum),
910 Bit2KB(device->rs_total - device->rs_same_csum),
911 Bit2KB(device->rs_total));
b411b363
PR
912 }
913 }
914
b30ab791 915 if (device->rs_failed) {
d0180171 916 drbd_info(device, " %lu failed blocks\n", device->rs_failed);
b411b363
PR
917
918 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
919 ns.disk = D_INCONSISTENT;
920 ns.pdsk = D_UP_TO_DATE;
921 } else {
922 ns.disk = D_UP_TO_DATE;
923 ns.pdsk = D_INCONSISTENT;
924 }
925 } else {
926 ns.disk = D_UP_TO_DATE;
927 ns.pdsk = D_UP_TO_DATE;
928
929 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
b30ab791 930 if (device->p_uuid) {
b411b363
PR
931 int i;
932 for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
b30ab791
AG
933 _drbd_uuid_set(device, i, device->p_uuid[i]);
934 drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
935 _drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
b411b363 936 } else {
d0180171 937 drbd_err(device, "device->p_uuid is NULL! BUG\n");
b411b363
PR
938 }
939 }
940
62b0da3a
LE
941 if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
942 /* for verify runs, we don't update uuids here,
943 * so there would be nothing to report. */
b30ab791
AG
944 drbd_uuid_set_bm(device, 0UL);
945 drbd_print_uuids(device, "updated UUIDs");
946 if (device->p_uuid) {
62b0da3a
LE
947 /* Now the two UUID sets are equal, update what we
948 * know of the peer. */
949 int i;
950 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
b30ab791 951 device->p_uuid[i] = device->ldev->md.uuid[i];
62b0da3a 952 }
b411b363
PR
953 }
954 }
955
b30ab791 956 _drbd_set_state(device, ns, CS_VERBOSE, NULL);
b411b363 957out_unlock:
0500813f 958 spin_unlock_irq(&device->resource->req_lock);
b30ab791 959 put_ldev(device);
b411b363 960out:
b30ab791
AG
961 device->rs_total = 0;
962 device->rs_failed = 0;
963 device->rs_paused = 0;
58ffa580
LE
964
965 /* reset start sector, if we reached end of device */
b30ab791
AG
966 if (verify_done && device->ov_left == 0)
967 device->ov_start_sector = 0;
b411b363 968
b30ab791 969 drbd_md_sync(device);
13d42685 970
b411b363 971 if (khelper_cmd)
b30ab791 972 drbd_khelper(device, khelper_cmd);
b411b363
PR
973
974 return 1;
975}
976
977/* helper */
b30ab791 978static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
b411b363 979{
045417f7 980 if (drbd_peer_req_has_active_page(peer_req)) {
b411b363 981 /* This might happen if sendpage() has not finished */
db830c46 982 int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
b30ab791
AG
983 atomic_add(i, &device->pp_in_use_by_net);
984 atomic_sub(i, &device->pp_in_use);
0500813f 985 spin_lock_irq(&device->resource->req_lock);
84b8c06b 986 list_add_tail(&peer_req->dw.w.list, &device->net_ee);
0500813f 987 spin_unlock_irq(&device->resource->req_lock);
435f0740 988 wake_up(&drbd_pp_wait);
b411b363 989 } else
b30ab791 990 drbd_free_peer_req(device, peer_req);
b411b363
PR
991}
992
993/**
994 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
b30ab791 995 * @device: DRBD device.
b411b363
PR
996 * @w: work object.
997 * @cancel: The connection will be closed anyways
998 */
99920dc5 999int w_e_end_data_req(struct drbd_work *w, int cancel)
b411b363 1000{
84b8c06b
AG
1001 struct drbd_device_work *dw = device_work(w);
1002 struct drbd_peer_request *peer_req = container_of(dw, struct drbd_peer_request, dw);
1003 struct drbd_device *device = dw->device;
99920dc5 1004 int err;
b411b363
PR
1005
1006 if (unlikely(cancel)) {
b30ab791
AG
1007 drbd_free_peer_req(device, peer_req);
1008 dec_unacked(device);
99920dc5 1009 return 0;
b411b363
PR
1010 }
1011
db830c46 1012 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
69a22773 1013 err = drbd_send_block(first_peer_device(device), P_DATA_REPLY, peer_req);
b411b363
PR
1014 } else {
1015 if (__ratelimit(&drbd_ratelimit_state))
d0180171 1016 drbd_err(device, "Sending NegDReply. sector=%llus.\n",
db830c46 1017 (unsigned long long)peer_req->i.sector);
b411b363 1018
69a22773 1019 err = drbd_send_ack(first_peer_device(device), P_NEG_DREPLY, peer_req);
b411b363
PR
1020 }
1021
b30ab791 1022 dec_unacked(device);
b411b363 1023
b30ab791 1024 move_to_net_ee_or_free(device, peer_req);
b411b363 1025
99920dc5 1026 if (unlikely(err))
d0180171 1027 drbd_err(device, "drbd_send_block() failed\n");
99920dc5 1028 return err;
b411b363
PR
1029}
1030
1031/**
a209b4ae 1032 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
b411b363
PR
1033 * @w: work object.
1034 * @cancel: The connection will be closed anyways
1035 */
99920dc5 1036int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
b411b363 1037{
84b8c06b
AG
1038 struct drbd_device_work *dw = device_work(w);
1039 struct drbd_peer_request *peer_req = container_of(dw, struct drbd_peer_request, dw);
1040 struct drbd_device *device = dw->device;
99920dc5 1041 int err;
b411b363
PR
1042
1043 if (unlikely(cancel)) {
b30ab791
AG
1044 drbd_free_peer_req(device, peer_req);
1045 dec_unacked(device);
99920dc5 1046 return 0;
b411b363
PR
1047 }
1048
b30ab791
AG
1049 if (get_ldev_if_state(device, D_FAILED)) {
1050 drbd_rs_complete_io(device, peer_req->i.sector);
1051 put_ldev(device);
b411b363
PR
1052 }
1053
b30ab791 1054 if (device->state.conn == C_AHEAD) {
69a22773 1055 err = drbd_send_ack(first_peer_device(device), P_RS_CANCEL, peer_req);
db830c46 1056 } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
b30ab791
AG
1057 if (likely(device->state.pdsk >= D_INCONSISTENT)) {
1058 inc_rs_pending(device);
69a22773 1059 err = drbd_send_block(first_peer_device(device), P_RS_DATA_REPLY, peer_req);
b411b363
PR
1060 } else {
1061 if (__ratelimit(&drbd_ratelimit_state))
d0180171 1062 drbd_err(device, "Not sending RSDataReply, "
b411b363 1063 "partner DISKLESS!\n");
99920dc5 1064 err = 0;
b411b363
PR
1065 }
1066 } else {
1067 if (__ratelimit(&drbd_ratelimit_state))
d0180171 1068 drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
db830c46 1069 (unsigned long long)peer_req->i.sector);
b411b363 1070
69a22773 1071 err = drbd_send_ack(first_peer_device(device), P_NEG_RS_DREPLY, peer_req);
b411b363
PR
1072
1073 /* update resync data with failure */
b30ab791 1074 drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
b411b363
PR
1075 }
1076
b30ab791 1077 dec_unacked(device);
b411b363 1078
b30ab791 1079 move_to_net_ee_or_free(device, peer_req);
b411b363 1080
99920dc5 1081 if (unlikely(err))
d0180171 1082 drbd_err(device, "drbd_send_block() failed\n");
99920dc5 1083 return err;
b411b363
PR
1084}
1085
99920dc5 1086int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
b411b363 1087{
84b8c06b
AG
1088 struct drbd_device_work *dw = device_work(w);
1089 struct drbd_peer_request *peer_req = container_of(dw, struct drbd_peer_request, dw);
1090 struct drbd_device *device = dw->device;
b411b363
PR
1091 struct digest_info *di;
1092 int digest_size;
1093 void *digest = NULL;
99920dc5 1094 int err, eq = 0;
b411b363
PR
1095
1096 if (unlikely(cancel)) {
b30ab791
AG
1097 drbd_free_peer_req(device, peer_req);
1098 dec_unacked(device);
99920dc5 1099 return 0;
b411b363
PR
1100 }
1101
b30ab791
AG
1102 if (get_ldev(device)) {
1103 drbd_rs_complete_io(device, peer_req->i.sector);
1104 put_ldev(device);
1d53f09e 1105 }
b411b363 1106
db830c46 1107 di = peer_req->digest;
b411b363 1108
db830c46 1109 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
b411b363
PR
1110 /* quick hack to try to avoid a race against reconfiguration.
1111 * a real fix would be much more involved,
1112 * introducing more locking mechanisms */
a6b32bc3
AG
1113 if (first_peer_device(device)->connection->csums_tfm) {
1114 digest_size = crypto_hash_digestsize(first_peer_device(device)->connection->csums_tfm);
0b0ba1ef 1115 D_ASSERT(device, digest_size == di->digest_size);
b411b363
PR
1116 digest = kmalloc(digest_size, GFP_NOIO);
1117 }
1118 if (digest) {
79a3c8d3 1119 drbd_csum_ee(first_peer_device(device)->connection->csums_tfm, peer_req, digest);
b411b363
PR
1120 eq = !memcmp(digest, di->digest, digest_size);
1121 kfree(digest);
1122 }
1123
1124 if (eq) {
b30ab791 1125 drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
676396d5 1126 /* rs_same_csums unit is BM_BLOCK_SIZE */
b30ab791 1127 device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
69a22773 1128 err = drbd_send_ack(first_peer_device(device), P_RS_IS_IN_SYNC, peer_req);
b411b363 1129 } else {
b30ab791 1130 inc_rs_pending(device);
db830c46
AG
1131 peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
1132 peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
204bba99 1133 kfree(di);
69a22773 1134 err = drbd_send_block(first_peer_device(device), P_RS_DATA_REPLY, peer_req);
b411b363
PR
1135 }
1136 } else {
69a22773 1137 err = drbd_send_ack(first_peer_device(device), P_NEG_RS_DREPLY, peer_req);
b411b363 1138 if (__ratelimit(&drbd_ratelimit_state))
d0180171 1139 drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
b411b363
PR
1140 }
1141
b30ab791
AG
1142 dec_unacked(device);
1143 move_to_net_ee_or_free(device, peer_req);
b411b363 1144
99920dc5 1145 if (unlikely(err))
d0180171 1146 drbd_err(device, "drbd_send_block/ack() failed\n");
99920dc5 1147 return err;
b411b363
PR
1148}
1149
99920dc5 1150int w_e_end_ov_req(struct drbd_work *w, int cancel)
b411b363 1151{
84b8c06b
AG
1152 struct drbd_device_work *dw = device_work(w);
1153 struct drbd_peer_request *peer_req = container_of(dw, struct drbd_peer_request, dw);
1154 struct drbd_device *device = dw->device;
db830c46
AG
1155 sector_t sector = peer_req->i.sector;
1156 unsigned int size = peer_req->i.size;
b411b363
PR
1157 int digest_size;
1158 void *digest;
99920dc5 1159 int err = 0;
b411b363
PR
1160
1161 if (unlikely(cancel))
1162 goto out;
1163
a6b32bc3 1164 digest_size = crypto_hash_digestsize(first_peer_device(device)->connection->verify_tfm);
b411b363 1165 digest = kmalloc(digest_size, GFP_NOIO);
8f21420e 1166 if (!digest) {
99920dc5 1167 err = 1; /* terminate the connection in case the allocation failed */
8f21420e 1168 goto out;
b411b363
PR
1169 }
1170
db830c46 1171 if (likely(!(peer_req->flags & EE_WAS_ERROR)))
79a3c8d3 1172 drbd_csum_ee(first_peer_device(device)->connection->verify_tfm, peer_req, digest);
8f21420e
PR
1173 else
1174 memset(digest, 0, digest_size);
1175
1176 /* Free peer_req and pages before send.
1177 * In case we block on congestion, we could otherwise run into
1178 * some distributed deadlock, if the other side blocks on
1179 * congestion as well, because our receiver blocks in
c37c8ecf 1180 * drbd_alloc_pages due to pp_in_use > max_buffers. */
b30ab791 1181 drbd_free_peer_req(device, peer_req);
db830c46 1182 peer_req = NULL;
b30ab791 1183 inc_rs_pending(device);
69a22773 1184 err = drbd_send_drequest_csum(first_peer_device(device), sector, size, digest, digest_size, P_OV_REPLY);
99920dc5 1185 if (err)
b30ab791 1186 dec_rs_pending(device);
8f21420e
PR
1187 kfree(digest);
1188
b411b363 1189out:
db830c46 1190 if (peer_req)
b30ab791
AG
1191 drbd_free_peer_req(device, peer_req);
1192 dec_unacked(device);
99920dc5 1193 return err;
b411b363
PR
1194}
1195
b30ab791 1196void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
b411b363 1197{
b30ab791
AG
1198 if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
1199 device->ov_last_oos_size += size>>9;
b411b363 1200 } else {
b30ab791
AG
1201 device->ov_last_oos_start = sector;
1202 device->ov_last_oos_size = size>>9;
b411b363 1203 }
b30ab791 1204 drbd_set_out_of_sync(device, sector, size);
b411b363
PR
1205}
1206
99920dc5 1207int w_e_end_ov_reply(struct drbd_work *w, int cancel)
b411b363 1208{
84b8c06b
AG
1209 struct drbd_device_work *dw = device_work(w);
1210 struct drbd_peer_request *peer_req = container_of(dw, struct drbd_peer_request, dw);
1211 struct drbd_device *device = dw->device;
b411b363 1212 struct digest_info *di;
b411b363 1213 void *digest;
db830c46
AG
1214 sector_t sector = peer_req->i.sector;
1215 unsigned int size = peer_req->i.size;
53ea4331 1216 int digest_size;
99920dc5 1217 int err, eq = 0;
58ffa580 1218 bool stop_sector_reached = false;
b411b363
PR
1219
1220 if (unlikely(cancel)) {
b30ab791
AG
1221 drbd_free_peer_req(device, peer_req);
1222 dec_unacked(device);
99920dc5 1223 return 0;
b411b363
PR
1224 }
1225
1226 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1227 * the resync lru has been cleaned up already */
b30ab791
AG
1228 if (get_ldev(device)) {
1229 drbd_rs_complete_io(device, peer_req->i.sector);
1230 put_ldev(device);
1d53f09e 1231 }
b411b363 1232
db830c46 1233 di = peer_req->digest;
b411b363 1234
db830c46 1235 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
a6b32bc3 1236 digest_size = crypto_hash_digestsize(first_peer_device(device)->connection->verify_tfm);
b411b363
PR
1237 digest = kmalloc(digest_size, GFP_NOIO);
1238 if (digest) {
79a3c8d3 1239 drbd_csum_ee(first_peer_device(device)->connection->verify_tfm, peer_req, digest);
b411b363 1240
0b0ba1ef 1241 D_ASSERT(device, digest_size == di->digest_size);
b411b363
PR
1242 eq = !memcmp(digest, di->digest, digest_size);
1243 kfree(digest);
1244 }
b411b363
PR
1245 }
1246
9676c760
LE
1247 /* Free peer_req and pages before send.
1248 * In case we block on congestion, we could otherwise run into
1249 * some distributed deadlock, if the other side blocks on
1250 * congestion as well, because our receiver blocks in
c37c8ecf 1251 * drbd_alloc_pages due to pp_in_use > max_buffers. */
b30ab791 1252 drbd_free_peer_req(device, peer_req);
b411b363 1253 if (!eq)
b30ab791 1254 drbd_ov_out_of_sync_found(device, sector, size);
b411b363 1255 else
b30ab791 1256 ov_out_of_sync_print(device);
b411b363 1257
69a22773 1258 err = drbd_send_ack_ex(first_peer_device(device), P_OV_RESULT, sector, size,
fa79abd8 1259 eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
b411b363 1260
b30ab791 1261 dec_unacked(device);
b411b363 1262
b30ab791 1263 --device->ov_left;
ea5442af
LE
1264
1265 /* let's advance progress step marks only for every other megabyte */
b30ab791
AG
1266 if ((device->ov_left & 0x200) == 0x200)
1267 drbd_advance_rs_marks(device, device->ov_left);
ea5442af 1268
b30ab791
AG
1269 stop_sector_reached = verify_can_do_stop_sector(device) &&
1270 (sector + (size>>9)) >= device->ov_stop_sector;
58ffa580 1271
b30ab791
AG
1272 if (device->ov_left == 0 || stop_sector_reached) {
1273 ov_out_of_sync_print(device);
1274 drbd_resync_finished(device);
b411b363
PR
1275 }
1276
99920dc5 1277 return err;
b411b363
PR
1278}
1279
b6dd1a89
LE
1280/* FIXME
1281 * We need to track the number of pending barrier acks,
1282 * and to be able to wait for them.
1283 * See also comment in drbd_adm_attach before drbd_suspend_io.
1284 */
bde89a9e 1285static int drbd_send_barrier(struct drbd_connection *connection)
b411b363 1286{
9f5bdc33 1287 struct p_barrier *p;
b6dd1a89 1288 struct drbd_socket *sock;
b411b363 1289
bde89a9e
AG
1290 sock = &connection->data;
1291 p = conn_prepare_command(connection, sock);
9f5bdc33
AG
1292 if (!p)
1293 return -EIO;
bde89a9e 1294 p->barrier = connection->send.current_epoch_nr;
b6dd1a89 1295 p->pad = 0;
bde89a9e 1296 connection->send.current_epoch_writes = 0;
b6dd1a89 1297
bde89a9e 1298 return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
b411b363
PR
1299}
1300
99920dc5 1301int w_send_write_hint(struct drbd_work *w, int cancel)
b411b363 1302{
84b8c06b
AG
1303 struct drbd_device *device =
1304 container_of(w, struct drbd_device, unplug_work);
9f5bdc33
AG
1305 struct drbd_socket *sock;
1306
b411b363 1307 if (cancel)
99920dc5 1308 return 0;
a6b32bc3 1309 sock = &first_peer_device(device)->connection->data;
69a22773 1310 if (!drbd_prepare_command(first_peer_device(device), sock))
9f5bdc33 1311 return -EIO;
69a22773 1312 return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0);
b411b363
PR
1313}
1314
bde89a9e 1315static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
4eb9b3cb 1316{
bde89a9e
AG
1317 if (!connection->send.seen_any_write_yet) {
1318 connection->send.seen_any_write_yet = true;
1319 connection->send.current_epoch_nr = epoch;
1320 connection->send.current_epoch_writes = 0;
4eb9b3cb
LE
1321 }
1322}
1323
bde89a9e 1324static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
4eb9b3cb
LE
1325{
1326 /* re-init if first write on this connection */
bde89a9e 1327 if (!connection->send.seen_any_write_yet)
4eb9b3cb 1328 return;
bde89a9e
AG
1329 if (connection->send.current_epoch_nr != epoch) {
1330 if (connection->send.current_epoch_writes)
1331 drbd_send_barrier(connection);
1332 connection->send.current_epoch_nr = epoch;
4eb9b3cb
LE
1333 }
1334}
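/* Example: when w_send_dblock() below sees the first request of a new epoch,
 * maybe_send_barrier() first closes the previous epoch with P_BARRIER
 * (provided that epoch had at least one write), so the peer sees the epoch
 * boundary before any P_DATA of the new epoch. */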
1335
8f7bed77 1336int w_send_out_of_sync(struct drbd_work *w, int cancel)
73a01a18
PR
1337{
1338 struct drbd_request *req = container_of(w, struct drbd_request, w);
84b8c06b 1339 struct drbd_device *device = req->device;
a6b32bc3 1340 struct drbd_connection *connection = first_peer_device(device)->connection;
99920dc5 1341 int err;
73a01a18
PR
1342
1343 if (unlikely(cancel)) {
8554df1c 1344 req_mod(req, SEND_CANCELED);
99920dc5 1345 return 0;
73a01a18
PR
1346 }
1347
bde89a9e 1348 /* this time, no connection->send.current_epoch_writes++;
b6dd1a89
LE
1349 * If it was sent, it was the closing barrier for the last
1350 * replicated epoch, before we went into AHEAD mode.
1351 * No more barriers will be sent, until we leave AHEAD mode again. */
bde89a9e 1352 maybe_send_barrier(connection, req->epoch);
b6dd1a89 1353
69a22773 1354 err = drbd_send_out_of_sync(first_peer_device(device), req);
8554df1c 1355 req_mod(req, OOS_HANDED_TO_NETWORK);
73a01a18 1356
99920dc5 1357 return err;
73a01a18
PR
1358}
1359
b411b363
PR
1360/**
1361 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
b411b363
PR
1362 * @w: work object.
1363 * @cancel: The connection will be closed anyways
1364 */
99920dc5 1365int w_send_dblock(struct drbd_work *w, int cancel)
b411b363
PR
1366{
1367 struct drbd_request *req = container_of(w, struct drbd_request, w);
84b8c06b 1368 struct drbd_device *device = req->device;
a6b32bc3 1369 struct drbd_connection *connection = first_peer_device(device)->connection;
99920dc5 1370 int err;
b411b363
PR
1371
1372 if (unlikely(cancel)) {
8554df1c 1373 req_mod(req, SEND_CANCELED);
99920dc5 1374 return 0;
b411b363
PR
1375 }
1376
bde89a9e
AG
1377 re_init_if_first_write(connection, req->epoch);
1378 maybe_send_barrier(connection, req->epoch);
1379 connection->send.current_epoch_writes++;
b6dd1a89 1380
69a22773 1381 err = drbd_send_dblock(first_peer_device(device), req);
99920dc5 1382 req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
b411b363 1383
99920dc5 1384 return err;
b411b363
PR
1385}
1386
1387/**
1388 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
b411b363
PR
1389 * @w: work object.
1390 * @cancel: The connection will be closed anyways
1391 */
99920dc5 1392int w_send_read_req(struct drbd_work *w, int cancel)
b411b363
PR
1393{
1394 struct drbd_request *req = container_of(w, struct drbd_request, w);
84b8c06b 1395 struct drbd_device *device = req->device;
a6b32bc3 1396 struct drbd_connection *connection = first_peer_device(device)->connection;
99920dc5 1397 int err;
b411b363
PR
1398
1399 if (unlikely(cancel)) {
8554df1c 1400 req_mod(req, SEND_CANCELED);
99920dc5 1401 return 0;
b411b363
PR
1402 }
1403
b6dd1a89
LE
1404 /* Even read requests may close a write epoch,
1405 * if there was any yet. */
bde89a9e 1406 maybe_send_barrier(connection, req->epoch);
b6dd1a89 1407
69a22773 1408 err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size,
6c1005e7 1409 (unsigned long)req);
b411b363 1410
99920dc5 1411 req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
b411b363 1412
99920dc5 1413 return err;
b411b363
PR
1414}
1415
99920dc5 1416int w_restart_disk_io(struct drbd_work *w, int cancel)
265be2d0
PR
1417{
1418 struct drbd_request *req = container_of(w, struct drbd_request, w);
84b8c06b 1419 struct drbd_device *device = req->device;
265be2d0 1420
0778286a 1421 if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
b30ab791 1422 drbd_al_begin_io(device, &req->i, false);
265be2d0
PR
1423
1424 drbd_req_make_private_bio(req, req->master_bio);
b30ab791 1425 req->private_bio->bi_bdev = device->ldev->backing_bdev;
265be2d0
PR
1426 generic_make_request(req->private_bio);
1427
99920dc5 1428 return 0;
265be2d0
PR
1429}
1430
b30ab791 1431static int _drbd_may_sync_now(struct drbd_device *device)
b411b363 1432{
b30ab791 1433 struct drbd_device *odev = device;
95f8efd0 1434 int resync_after;
b411b363
PR
1435
1436 while (1) {
a3f8f7dc 1437 if (!odev->ldev || odev->state.disk == D_DISKLESS)
438c8374 1438 return 1;
daeda1cc 1439 rcu_read_lock();
95f8efd0 1440 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
daeda1cc 1441 rcu_read_unlock();
95f8efd0 1442 if (resync_after == -1)
b411b363 1443 return 1;
b30ab791 1444 odev = minor_to_device(resync_after);
a3f8f7dc 1445 if (!odev)
841ce241 1446 return 1;
b411b363
PR
1447 if ((odev->state.conn >= C_SYNC_SOURCE &&
1448 odev->state.conn <= C_PAUSED_SYNC_T) ||
1449 odev->state.aftr_isp || odev->state.peer_isp ||
1450 odev->state.user_isp)
1451 return 0;
1452 }
1453}
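/* Example: if a device is configured with resync-after pointing at minor 0
 * and minor 0 is currently SyncTarget, _drbd_may_sync_now() returns 0 for it,
 * so _drbd_pause_after() keeps its aftr_isp flag set until minor 0 finishes. */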
1454
1455/**
1456 * _drbd_pause_after() - Pause resync on all devices that may not resync now
b30ab791 1457 * @device: DRBD device.
b411b363
PR
1458 *
1459 * Called from process context only (admin command and after_state_ch).
1460 */
b30ab791 1461static int _drbd_pause_after(struct drbd_device *device)
b411b363 1462{
54761697 1463 struct drbd_device *odev;
b411b363
PR
1464 int i, rv = 0;
1465
695d08fa 1466 rcu_read_lock();
05a10ec7 1467 idr_for_each_entry(&drbd_devices, odev, i) {
b411b363
PR
1468 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1469 continue;
1470 if (!_drbd_may_sync_now(odev))
1471 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
1472 != SS_NOTHING_TO_DO);
1473 }
695d08fa 1474 rcu_read_unlock();
b411b363
PR
1475
1476 return rv;
1477}
1478
1479/**
1480 * _drbd_resume_next() - Resume resync on all devices that may resync now
b30ab791 1481 * @device: DRBD device.
b411b363
PR
1482 *
1483 * Called from process context only (admin command and worker).
1484 */
b30ab791 1485static int _drbd_resume_next(struct drbd_device *device)
b411b363 1486{
54761697 1487 struct drbd_device *odev;
b411b363
PR
1488 int i, rv = 0;
1489
695d08fa 1490 rcu_read_lock();
05a10ec7 1491 idr_for_each_entry(&drbd_devices, odev, i) {
b411b363
PR
1492 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1493 continue;
1494 if (odev->state.aftr_isp) {
1495 if (_drbd_may_sync_now(odev))
1496 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
1497 CS_HARD, NULL)
1498 != SS_NOTHING_TO_DO) ;
1499 }
1500 }
695d08fa 1501 rcu_read_unlock();
b411b363
PR
1502 return rv;
1503}
1504
b30ab791 1505void resume_next_sg(struct drbd_device *device)
b411b363
PR
1506{
1507 write_lock_irq(&global_state_lock);
b30ab791 1508 _drbd_resume_next(device);
b411b363
PR
1509 write_unlock_irq(&global_state_lock);
1510}
1511
b30ab791 1512void suspend_other_sg(struct drbd_device *device)
b411b363
PR
1513{
1514 write_lock_irq(&global_state_lock);
b30ab791 1515 _drbd_pause_after(device);
b411b363
PR
1516 write_unlock_irq(&global_state_lock);
1517}
1518
dc97b708 1519/* caller must hold global_state_lock */
b30ab791 1520enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
b411b363 1521{
54761697 1522 struct drbd_device *odev;
95f8efd0 1523 int resync_after;
b411b363
PR
1524
1525 if (o_minor == -1)
1526 return NO_ERROR;
a3f8f7dc 1527 if (o_minor < -1 || o_minor > MINORMASK)
95f8efd0 1528 return ERR_RESYNC_AFTER;
b411b363
PR
1529
1530 /* check for loops */
b30ab791 1531 odev = minor_to_device(o_minor);
b411b363 1532 while (1) {
b30ab791 1533 if (odev == device)
95f8efd0 1534 return ERR_RESYNC_AFTER_CYCLE;
b411b363 1535
a3f8f7dc
LE
1536 /* You are free to depend on diskless, non-existing,
1537 * or not yet/no longer existing minors.
1538 * We only reject dependency loops.
1539 * We cannot follow the dependency chain beyond a detached or
1540 * missing minor.
1541 */
1542 if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
1543 return NO_ERROR;
1544
daeda1cc 1545 rcu_read_lock();
95f8efd0 1546 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
daeda1cc 1547 rcu_read_unlock();
b411b363 1548 /* dependency chain ends here, no cycles. */
95f8efd0 1549 if (resync_after == -1)
b411b363
PR
1550 return NO_ERROR;
1551
1552 /* follow the dependency chain */
b30ab791 1553 odev = minor_to_device(resync_after);
b411b363
PR
1554 }
1555}
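
/* For illustration of the loop check above: a stand-alone user-space model of
 * following the single resync-after pointer per minor until it reaches the
 * device being configured (a cycle), a missing minor, or the end of the chain.
 * All names below are made up for the example; this is not DRBD code. */
#include <stdbool.h>
#include <stdio.h>

#define N_MINORS 8

/* minor each device resyncs after, or -1 for "no dependency";
 * the existing table is assumed to be cycle-free, as DRBD guarantees */
static int resync_after[N_MINORS] = { -1, 0, 1, -1, 3, -1, -1, -1 };

/* Would setting "minor resyncs after o_minor" create a dependency cycle? */
static bool creates_cycle(int minor, int o_minor)
{
	int odev = o_minor;

	while (odev >= 0 && odev < N_MINORS) {
		if (odev == minor)
			return true;		/* we reached ourselves: cycle */
		odev = resync_after[odev];	/* follow the chain */
	}
	return false;				/* chain ended: no cycle */
}

int main(void)
{
	printf("2 after 4: %s\n", creates_cycle(2, 4) ? "cycle" : "ok"); /* 4 -> 3 -> end */
	printf("0 after 2: %s\n", creates_cycle(0, 2) ? "cycle" : "ok"); /* 2 -> 1 -> 0 */
	return 0;
}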
1556
dc97b708 1557/* caller must hold global_state_lock */
b30ab791 1558void drbd_resync_after_changed(struct drbd_device *device)
b411b363
PR
1559{
1560 int changes;
b411b363 1561
dc97b708 1562 do {
b30ab791
AG
1563 changes = _drbd_pause_after(device);
1564 changes |= _drbd_resume_next(device);
dc97b708 1565 } while (changes);
b411b363
PR
1566}
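
/* drbd_resync_after_changed() iterates the pause/resume pair to a fixed point:
 * pausing one device can change what its dependents are allowed to do, so the
 * evaluation is repeated until neither call reports a change.  A stand-alone
 * model of that fixed-point loop, collapsing the pair into one re-evaluation
 * per device (illustrative names, not DRBD code): */
#include <stdbool.h>
#include <stdio.h>

#define N_DEV 3

static bool paused[N_DEV];
/* device i may resync only after device after[i] has finished (or -1) */
static int after[N_DEV] = { -1, 0, 1 };
static bool sync_done[N_DEV];

static bool may_sync_now(int i)
{
	return after[i] == -1 || sync_done[after[i]];
}

static void resync_after_changed(void)
{
	bool changes;

	do {
		changes = false;
		for (int i = 0; i < N_DEV; i++) {
			bool should_pause = !may_sync_now(i);

			if (paused[i] != should_pause) {
				paused[i] = should_pause;
				changes = true;
			}
		}
	} while (changes);
}

int main(void)
{
	resync_after_changed();
	for (int i = 0; i < N_DEV; i++)
		printf("device %d: %s\n", i, paused[i] ? "paused" : "may sync");
	return 0;
}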
1567
b30ab791 1568void drbd_rs_controller_reset(struct drbd_device *device)
9bd28d3c 1569{
813472ce
PR
1570 struct fifo_buffer *plan;
1571
b30ab791
AG
1572 atomic_set(&device->rs_sect_in, 0);
1573 atomic_set(&device->rs_sect_ev, 0);
1574 device->rs_in_flight = 0;
813472ce
PR
1575
1576 /* Updating the RCU protected object in place is necessary since
1577 this function gets called from atomic context.
 1578 It is valid since all other updates also lead to a completely
 1579 empty fifo. */
1580 rcu_read_lock();
b30ab791 1581 plan = rcu_dereference(device->rs_plan_s);
813472ce
PR
1582 plan->total = 0;
1583 fifo_set(plan, 0);
1584 rcu_read_unlock();
9bd28d3c
LE
1585}
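
/* For contrast with the in-place update above: the conventional RCU update
 * replaces the object by copy-and-publish, which needs an allocation and a
 * grace period and is therefore avoided here in atomic context.  A rough
 * kernel-style sketch of that conventional pattern for a writer that may
 * sleep, assuming drbd's fifo_alloc() helper and the standard RCU primitives;
 * error handling and serialization of writers are omitted, and this is not
 * compile-tested against this tree. */
static int resize_rs_plan(struct drbd_device *device, unsigned int new_size)
{
	struct fifo_buffer *new_plan, *old_plan;

	new_plan = fifo_alloc(new_size);	/* may sleep */
	if (!new_plan)
		return -ENOMEM;

	old_plan = rcu_dereference_protected(device->rs_plan_s, 1);
	rcu_assign_pointer(device->rs_plan_s, new_plan);

	synchronize_rcu();			/* wait for readers of old_plan */
	kfree(old_plan);
	return 0;
}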
1586
1f04af33
PR
1587void start_resync_timer_fn(unsigned long data)
1588{
b30ab791 1589 struct drbd_device *device = (struct drbd_device *) data;
1f04af33 1590
84b8c06b
AG
1591 drbd_queue_work(&first_peer_device(device)->connection->sender_work,
1592 &device->start_resync_work);
1f04af33
PR
1593}
1594
99920dc5 1595int w_start_resync(struct drbd_work *w, int cancel)
1f04af33 1596{
84b8c06b
AG
1597 struct drbd_device *device =
1598 container_of(w, struct drbd_device, start_resync_work);
00d56944 1599
b30ab791 1600 if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
d0180171 1601 drbd_warn(device, "w_start_resync later...\n");
b30ab791
AG
1602 device->start_resync_timer.expires = jiffies + HZ/10;
1603 add_timer(&device->start_resync_timer);
99920dc5 1604 return 0;
1f04af33
PR
1605 }
1606
b30ab791
AG
1607 drbd_start_resync(device, C_SYNC_SOURCE);
1608 clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
99920dc5 1609 return 0;
1f04af33
PR
1610}
1611
b411b363
PR
1612/**
1613 * drbd_start_resync() - Start the resync process
b30ab791 1614 * @device: DRBD device.
b411b363
PR
1615 * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
1616 *
1617 * This function might bring you directly into one of the
1618 * C_PAUSED_SYNC_* states.
1619 */
b30ab791 1620void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
b411b363
PR
1621{
1622 union drbd_state ns;
1623 int r;
1624
b30ab791 1625 if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
d0180171 1626 drbd_err(device, "Resync already running!\n");
b411b363
PR
1627 return;
1628 }
1629
b30ab791 1630 if (!test_bit(B_RS_H_DONE, &device->flags)) {
e64a3294
PR
1631 if (side == C_SYNC_TARGET) {
1632 /* Since application IO was locked out during C_WF_BITMAP_T and
1633 C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
 1634 we check whether we may make the data inconsistent. */
b30ab791 1635 r = drbd_khelper(device, "before-resync-target");
e64a3294
PR
1636 r = (r >> 8) & 0xff;
1637 if (r > 0) {
d0180171 1638 drbd_info(device, "before-resync-target handler returned %d, "
09b9e797 1639 "dropping connection.\n", r);
a6b32bc3 1640 conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD);
09b9e797
PR
1641 return;
1642 }
e64a3294 1643 } else /* C_SYNC_SOURCE */ {
b30ab791 1644 r = drbd_khelper(device, "before-resync-source");
e64a3294
PR
1645 r = (r >> 8) & 0xff;
1646 if (r > 0) {
1647 if (r == 3) {
d0180171 1648 drbd_info(device, "before-resync-source handler returned %d, "
e64a3294
PR
1649 "ignoring. Old userland tools?", r);
1650 } else {
d0180171 1651 drbd_info(device, "before-resync-source handler returned %d, "
e64a3294 1652 "dropping connection.\n", r);
a6b32bc3
AG
1653 conn_request_state(first_peer_device(device)->connection,
1654 NS(conn, C_DISCONNECTING), CS_HARD);
e64a3294
PR
1655 return;
1656 }
1657 }
09b9e797 1658 }
b411b363
PR
1659 }
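	/* Note on the handler calls above: drbd_khelper() returns the helper's
	 * wait()-style status, so "(r >> 8) & 0xff" extracts the handler's exit
	 * code (roughly WEXITSTATUS for a normally exited helper).  Any nonzero
	 * exit aborts the resync by dropping the connection, except that exit
	 * code 3 from before-resync-source is tolerated for old userland tools. */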
1660
a6b32bc3 1661 if (current == first_peer_device(device)->connection->worker.task) {
dad20554 1662 /* The worker should not sleep waiting for state_mutex,
e64a3294 1663 which can take a long time */
b30ab791
AG
1664 if (!mutex_trylock(device->state_mutex)) {
1665 set_bit(B_RS_H_DONE, &device->flags);
1666 device->start_resync_timer.expires = jiffies + HZ/5;
1667 add_timer(&device->start_resync_timer);
e64a3294
PR
1668 return;
1669 }
1670 } else {
b30ab791 1671 mutex_lock(device->state_mutex);
e64a3294 1672 }
b30ab791 1673 clear_bit(B_RS_H_DONE, &device->flags);
b411b363 1674
0cfac5dd 1675 write_lock_irq(&global_state_lock);
a700471b 1676 /* Did some connection breakage or IO error race with us? */
b30ab791
AG
1677 if (device->state.conn < C_CONNECTED
1678 || !get_ldev_if_state(device, D_NEGOTIATING)) {
0cfac5dd 1679 write_unlock_irq(&global_state_lock);
b30ab791 1680 mutex_unlock(device->state_mutex);
b411b363
PR
1681 return;
1682 }
1683
b30ab791 1684 ns = drbd_read_state(device);
b411b363 1685
b30ab791 1686 ns.aftr_isp = !_drbd_may_sync_now(device);
b411b363
PR
1687
1688 ns.conn = side;
1689
1690 if (side == C_SYNC_TARGET)
1691 ns.disk = D_INCONSISTENT;
1692 else /* side == C_SYNC_SOURCE */
1693 ns.pdsk = D_INCONSISTENT;
1694
b30ab791
AG
1695 r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
1696 ns = drbd_read_state(device);
b411b363
PR
1697
1698 if (ns.conn < C_CONNECTED)
1699 r = SS_UNKNOWN_ERROR;
1700
1701 if (r == SS_SUCCESS) {
b30ab791 1702 unsigned long tw = drbd_bm_total_weight(device);
1d7734a0
LE
1703 unsigned long now = jiffies;
1704 int i;
1705
b30ab791
AG
1706 device->rs_failed = 0;
1707 device->rs_paused = 0;
1708 device->rs_same_csum = 0;
1709 device->rs_last_events = 0;
1710 device->rs_last_sect_ev = 0;
1711 device->rs_total = tw;
1712 device->rs_start = now;
1d7734a0 1713 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
b30ab791
AG
1714 device->rs_mark_left[i] = tw;
1715 device->rs_mark_time[i] = now;
1d7734a0 1716 }
b30ab791 1717 _drbd_pause_after(device);
b411b363
PR
1718 }
1719 write_unlock_irq(&global_state_lock);
5a22db89 1720
b411b363 1721 if (r == SS_SUCCESS) {
328e0f12
PR
1722 /* reset rs_last_bcast when a resync or verify is started,
1723 * to deal with potential jiffies wrap. */
b30ab791 1724 device->rs_last_bcast = jiffies - HZ;
328e0f12 1725
d0180171 1726 drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
b411b363 1727 drbd_conn_str(ns.conn),
b30ab791
AG
1728 (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
1729 (unsigned long) device->rs_total);
6c922ed5 1730 if (side == C_SYNC_TARGET)
b30ab791 1731 device->bm_resync_fo = 0;
6c922ed5
LE
1732
1733 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
1734 * with w_send_oos, or the sync target will get confused as to
 1735 * how many bits to resync. We cannot always do that, because for an
1736 * empty resync and protocol < 95, we need to do it here, as we call
1737 * drbd_resync_finished from here in that case.
1738 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
1739 * and from after_state_ch otherwise. */
a6b32bc3
AG
1740 if (side == C_SYNC_SOURCE &&
1741 first_peer_device(device)->connection->agreed_pro_version < 96)
69a22773 1742 drbd_gen_and_send_sync_uuid(first_peer_device(device));
b411b363 1743
a6b32bc3
AG
1744 if (first_peer_device(device)->connection->agreed_pro_version < 95 &&
1745 device->rs_total == 0) {
af85e8e8
LE
1746 /* This still has a race (about when exactly the peers
1747 * detect connection loss) that can lead to a full sync
1748 * on next handshake. In 8.3.9 we fixed this with explicit
1749 * resync-finished notifications, but the fix
1750 * introduces a protocol change. Sleeping for some
1751 * time longer than the ping interval + timeout on the
1752 * SyncSource, to give the SyncTarget the chance to
1753 * detect connection loss, then waiting for a ping
1754 * response (implicit in drbd_resync_finished) reduces
1755 * the race considerably, but does not solve it. */
44ed167d
PR
1756 if (side == C_SYNC_SOURCE) {
1757 struct net_conf *nc;
1758 int timeo;
1759
1760 rcu_read_lock();
a6b32bc3 1761 nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
44ed167d
PR
1762 timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
1763 rcu_read_unlock();
1764 schedule_timeout_interruptible(timeo);
1765 }
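			/* Worked example for the sleep above (assuming the usual
			 * configuration units: ping-int in seconds, ping-timeout
			 * in tenths of a second): with ping-int = 10 and
			 * ping-timeout = 5, timeo = 10*HZ + 5*HZ/9, about 10.6
			 * seconds, slightly more than ping-int plus ping-timeout,
			 * so the SyncTarget gets a chance to notice the
			 * connection loss first. */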
b30ab791 1766 drbd_resync_finished(device);
b411b363
PR
1767 }
1768
b30ab791
AG
1769 drbd_rs_controller_reset(device);
1770 /* ns.conn may already be != device->state.conn,
b411b363
PR
1771 * we may have been paused in between, or become paused until
1772 * the timer triggers.
1773 * No matter, that is handled in resync_timer_fn() */
1774 if (ns.conn == C_SYNC_TARGET)
b30ab791 1775 mod_timer(&device->resync_timer, jiffies);
b411b363 1776
b30ab791 1777 drbd_md_sync(device);
b411b363 1778 }
b30ab791
AG
1779 put_ldev(device);
1780 mutex_unlock(device->state_mutex);
b411b363
PR
1781}
1782
b6dd1a89
LE
1783/* If the resource already closed the current epoch, but we did not
1784 * (because we have not yet seen new requests), we should send the
1785 * corresponding barrier now. Must be checked within the same spinlock
1786 * that is used to check for new requests. */
bde89a9e 1787static bool need_to_send_barrier(struct drbd_connection *connection)
b6dd1a89
LE
1788{
1789 if (!connection->send.seen_any_write_yet)
1790 return false;
1791
1792 /* Skip barriers that do not contain any writes.
1793 * This may happen during AHEAD mode. */
1794 if (!connection->send.current_epoch_writes)
1795 return false;
1796
1797 /* ->req_lock is held when requests are queued on
1798 * connection->sender_work, and put into ->transfer_log.
1799 * It is also held when ->current_tle_nr is increased.
1800 * So either there are already new requests queued,
 1801 * and the corresponding barriers will be sent there.
1802 * Or nothing new is queued yet, so the difference will be 1.
1803 */
1804 if (atomic_read(&connection->current_tle_nr) !=
1805 connection->send.current_epoch_nr + 1)
1806 return false;
1807
1808 return true;
1809}
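
/* Toy trace of the epoch bookkeeping checked above: a barrier is due exactly
 * when the resource has moved on by one epoch (current_tle_nr ==
 * send.current_epoch_nr + 1) while nothing new sits in the queue to carry it.
 * Stand-alone sketch with made-up names, not DRBD code: */
#include <stdbool.h>
#include <stdio.h>

struct conn_model {
	bool seen_any_write;	/* send.seen_any_write_yet */
	int  epoch_writes;	/* writes in the sender's current epoch */
	int  sender_epoch;	/* send.current_epoch_nr */
	int  resource_epoch;	/* current_tle_nr */
};

static bool barrier_due(const struct conn_model *c)
{
	if (!c->seen_any_write)
		return false;		/* nothing sent yet at all */
	if (!c->epoch_writes)
		return false;		/* empty epoch, e.g. AHEAD mode */
	/* only if the resource closed exactly the epoch we are still in,
	 * and no new request has been queued to carry the barrier */
	return c->resource_epoch == c->sender_epoch + 1;
}

int main(void)
{
	struct conn_model c = { true, 4, 7, 8 };

	printf("%d\n", barrier_due(&c));	/* 1: epoch 7 closed, queue idle */
	c.resource_epoch = 7;
	printf("%d\n", barrier_due(&c));	/* 0: epoch still open */
	return 0;
}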
1810
a186e478 1811static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
8c0785a5
LE
1812{
1813 spin_lock_irq(&queue->q_lock);
1814 list_splice_init(&queue->q, work_list);
1815 spin_unlock_irq(&queue->q_lock);
1816 return !list_empty(work_list);
1817}
1818
a186e478 1819static bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list)
8c0785a5
LE
1820{
1821 spin_lock_irq(&queue->q_lock);
1822 if (!list_empty(&queue->q))
1823 list_move(queue->q.next, work_list);
1824 spin_unlock_irq(&queue->q_lock);
1825 return !list_empty(work_list);
1826}
1827
bde89a9e 1828static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
b6dd1a89
LE
1829{
1830 DEFINE_WAIT(wait);
1831 struct net_conf *nc;
1832 int uncork, cork;
1833
1834 dequeue_work_item(&connection->sender_work, work_list);
1835 if (!list_empty(work_list))
1836 return;
1837
1838 /* Still nothing to do?
1839 * Maybe we still need to close the current epoch,
1840 * even if no new requests are queued yet.
1841 *
1842 * Also, poke TCP, just in case.
1843 * Then wait for new work (or signal). */
1844 rcu_read_lock();
1845 nc = rcu_dereference(connection->net_conf);
1846 uncork = nc ? nc->tcp_cork : 0;
1847 rcu_read_unlock();
1848 if (uncork) {
1849 mutex_lock(&connection->data.mutex);
1850 if (connection->data.socket)
1851 drbd_tcp_uncork(connection->data.socket);
1852 mutex_unlock(&connection->data.mutex);
1853 }
1854
1855 for (;;) {
1856 int send_barrier;
1857 prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
0500813f 1858 spin_lock_irq(&connection->resource->req_lock);
b6dd1a89 1859 spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
bc317a9e
LE
1860 /* dequeue single item only,
1861 * we still use drbd_queue_work_front() in some places */
1862 if (!list_empty(&connection->sender_work.q))
1863 list_move(connection->sender_work.q.next, work_list);
b6dd1a89
LE
1864 spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
1865 if (!list_empty(work_list) || signal_pending(current)) {
0500813f 1866 spin_unlock_irq(&connection->resource->req_lock);
b6dd1a89
LE
1867 break;
1868 }
1869 send_barrier = need_to_send_barrier(connection);
0500813f 1870 spin_unlock_irq(&connection->resource->req_lock);
b6dd1a89
LE
1871 if (send_barrier) {
1872 drbd_send_barrier(connection);
1873 connection->send.current_epoch_nr++;
1874 }
1875 schedule();
 1876 /* We may be woken up for things other than new work, too,
 1877 * e.g. if the current epoch got closed,
 1878 * in which case we send the barrier above. */
1879 }
1880 finish_wait(&connection->sender_work.q_wait, &wait);
1881
1882 /* someone may have changed the config while we have been waiting above. */
1883 rcu_read_lock();
1884 nc = rcu_dereference(connection->net_conf);
1885 cork = nc ? nc->tcp_cork : 0;
1886 rcu_read_unlock();
1887 mutex_lock(&connection->data.mutex);
1888 if (connection->data.socket) {
1889 if (cork)
1890 drbd_tcp_cork(connection->data.socket);
1891 else if (!uncork)
1892 drbd_tcp_uncork(connection->data.socket);
1893 }
1894 mutex_unlock(&connection->data.mutex);
1895}
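
/* drbd_tcp_cork()/drbd_tcp_uncork() presumably toggle the TCP_CORK socket
 * option, so the code above uncorks while idle (letting partial frames flush)
 * and corks again once there is a batch of work to send.  A user-space sketch
 * of the same cork-around-a-batch pattern (not DRBD code): */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static void set_cork(int fd, int on)
{
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
}

static void send_batch(int fd, const char *const msgs[], int n)
{
	set_cork(fd, 1);		/* hold back partial frames */
	for (int i = 0; i < n; i++)
		if (write(fd, msgs[i], strlen(msgs[i])) < 0)
			break;
	set_cork(fd, 0);		/* flush the coalesced data */
}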
1896
b411b363
PR
1897int drbd_worker(struct drbd_thread *thi)
1898{
bde89a9e 1899 struct drbd_connection *connection = thi->connection;
84b8c06b 1900 struct drbd_device_work *dw = NULL;
c06ece6b 1901 struct drbd_peer_device *peer_device;
b411b363 1902 LIST_HEAD(work_list);
8c0785a5 1903 int vnr;
b411b363 1904
e77a0a5c 1905 while (get_t_state(thi) == RUNNING) {
80822284 1906 drbd_thread_current_set_cpu(thi);
b411b363 1907
8c0785a5
LE
1908 /* as long as we use drbd_queue_work_front(),
1909 * we may only dequeue single work items here, not batches. */
1910 if (list_empty(&work_list))
bde89a9e 1911 wait_for_work(connection, &work_list);
b411b363 1912
8c0785a5 1913 if (signal_pending(current)) {
b411b363 1914 flush_signals(current);
19393e10 1915 if (get_t_state(thi) == RUNNING) {
1ec861eb 1916 drbd_warn(connection, "Worker got an unexpected signal\n");
b411b363 1917 continue;
19393e10 1918 }
b411b363
PR
1919 break;
1920 }
1921
e77a0a5c 1922 if (get_t_state(thi) != RUNNING)
b411b363 1923 break;
b411b363 1924
8c0785a5 1925 while (!list_empty(&work_list)) {
84b8c06b
AG
1926 dw = list_first_entry(&work_list, struct drbd_device_work, w.list);
1927 list_del_init(&dw->w.list);
1928 if (dw->w.cb(&dw->w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
8c0785a5 1929 continue;
bde89a9e
AG
1930 if (connection->cstate >= C_WF_REPORT_PARAMS)
1931 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
b411b363
PR
1932 }
1933 }
b411b363 1934
8c0785a5 1935 do {
b411b363 1936 while (!list_empty(&work_list)) {
84b8c06b
AG
1937 dw = list_first_entry(&work_list, struct drbd_device_work, w.list);
1938 list_del_init(&dw->w.list);
1939 dw->w.cb(&dw->w, 1);
b411b363 1940 }
bde89a9e 1941 dequeue_work_batch(&connection->sender_work, &work_list);
8c0785a5 1942 } while (!list_empty(&work_list));
b411b363 1943
c141ebda 1944 rcu_read_lock();
c06ece6b
AG
1945 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1946 struct drbd_device *device = peer_device->device;
0b0ba1ef 1947 D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
b30ab791 1948 kref_get(&device->kref);
c141ebda 1949 rcu_read_unlock();
b30ab791 1950 drbd_device_cleanup(device);
05a10ec7 1951 kref_put(&device->kref, drbd_destroy_device);
c141ebda 1952 rcu_read_lock();
0e29d163 1953 }
c141ebda 1954 rcu_read_unlock();
b411b363
PR
1955
1956 return 0;
1957}
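
/* The shutdown path above drains the remaining work with the cancel flag set,
 * so every callback still runs once and can release its resources.  A
 * stand-alone sketch of that drain-with-cancel pattern, with made-up work
 * types (not DRBD's list or work structs): */
#include <stdio.h>
#include <stdlib.h>

struct work {
	struct work *next;
	int (*cb)(struct work *w, int cancel);
};

static int send_something(struct work *w, int cancel)
{
	if (cancel)
		printf("cancelled, freeing work item\n");
	else
		printf("doing real work\n");
	free(w);
	return 0;
}

static void drain_on_shutdown(struct work **head)
{
	while (*head) {
		struct work *w = *head;

		*head = w->next;
		w->cb(w, 1);		/* cancel: clean up only */
	}
}

int main(void)
{
	struct work *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct work *w = malloc(sizeof(*w));

		if (!w)
			break;
		w->cb = send_something;
		w->next = head;
		head = w;
	}
	drain_on_shutdown(&head);
	return 0;
}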