/*
   drbd_req.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>

#include <linux/slab.h>
#include <linux/drbd.h>
#include "drbd_int.h"
#include "drbd_req.h"

static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);

/* Update disk stats at start of I/O request */
static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	int cpu;
	cpu = part_stat_lock();
	part_round_stats(cpu, &mdev->vdisk->part0);
	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
	(void) cpu; /* The macro invocations above want the cpu argument, I do not like
		       the compiler warning about cpu only assigned but never used... */
	part_inc_in_flight(&mdev->vdisk->part0, rw);
	part_stat_unlock();
}

/* Update disk stats when completing request upwards */
static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
{
	int rw = bio_data_dir(req->master_bio);
	unsigned long duration = jiffies - req->start_time;
	int cpu;
	cpu = part_stat_lock();
	part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
	part_round_stats(cpu, &mdev->vdisk->part0);
	part_dec_in_flight(&mdev->vdisk->part0, rw);
	part_stat_unlock();
}

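/* Allocate a request object from the mempool for the given master bio,
 * set up its private bio, and initialize interval and list bookkeeping. */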
static struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
					 struct bio *bio_src)
{
	struct drbd_request *req;

	req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
	if (!req)
		return NULL;

	drbd_req_make_private_bio(req, bio_src);
	req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
	req->w.mdev = mdev;
	req->master_bio = bio_src;
	req->epoch = 0;

	drbd_clear_interval(&req->i);
	req->i.sector = bio_src->bi_sector;
	req->i.size = bio_src->bi_size;
	req->i.local = true;
	req->i.waiting = false;

	INIT_LIST_HEAD(&req->tl_requests);
	INIT_LIST_HEAD(&req->w.list);

	return req;
}

static void drbd_req_free(struct drbd_request *req)
{
	mempool_free(req, drbd_request_mempool);
}

/* rw is bio_data_dir(), only READ or WRITE */
static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
{
	const unsigned long s = req->rq_state;

	/* remove it from the transfer log.
	 * well, only if it had been there in the first
	 * place... if it had not (local only or conflicting
	 * and never sent), it should still be "empty" as
	 * initialized in drbd_req_new(), so we can list_del() it
	 * here unconditionally */
	list_del_init(&req->tl_requests);

	/* if it was a write, we may have to set the corresponding
	 * bit(s) out-of-sync first. If it had a local part, we need to
	 * release the reference to the activity log. */
	if (rw == WRITE) {
		/* Set out-of-sync unless both OK flags are set
		 * (local only or remote failed).
		 * Other places where we set out-of-sync:
		 * READ with local io-error */
		if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
			drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);

		if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
			drbd_set_in_sync(mdev, req->i.sector, req->i.size);

		/* one might be tempted to move the drbd_al_complete_io
		 * to the local io completion callback drbd_request_endio.
		 * but, if this was a mirror write, we may only
		 * drbd_al_complete_io after this is RQ_NET_DONE,
		 * otherwise the extent could be dropped from the al
		 * before it has actually been written on the peer.
		 * if we crash before our peer knows about the request,
		 * but after the extent has been dropped from the al,
		 * we would forget to resync the corresponding extent.
		 */
		if (s & RQ_LOCAL_MASK) {
			if (get_ldev_if_state(mdev, D_FAILED)) {
				if (s & RQ_IN_ACT_LOG)
					drbd_al_complete_io(mdev, &req->i);
				put_ldev(mdev);
			} else if (__ratelimit(&drbd_ratelimit_state)) {
				dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu, %u), "
					 "but my Disk seems to have failed :(\n",
					 (unsigned long long) req->i.sector, req->i.size);
			}
		}
	}

	if (s & RQ_POSTPONED)
		drbd_restart_write(req);
	else
		drbd_req_free(req);
}

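/* Queue the barrier work item for the current (newest) epoch. Called with the
 * req_lock held; CREATE_BARRIER ensures this happens only once per epoch
 * (see the comment in the function body). */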
static void queue_barrier(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;
	struct drbd_tconn *tconn = mdev->tconn;

	/* We are within the req_lock. Once we queued the barrier for sending,
	 * we set the CREATE_BARRIER bit. It is cleared as soon as a new
	 * barrier/epoch object is added. This is the only place this bit is
	 * set. It indicates that the barrier for this epoch is already queued,
	 * and no new epoch has been created yet. */
	if (test_bit(CREATE_BARRIER, &tconn->flags))
		return;

	b = tconn->newest_tle;
	b->w.cb = w_send_barrier;
	b->w.mdev = mdev;
	/* inc_ap_pending done here, so we won't
	 * get imbalanced on connection loss.
	 * dec_ap_pending will be done in got_BarrierAck
	 * or (on connection loss) in tl_clear. */
	inc_ap_pending(mdev);
	drbd_queue_work(&tconn->data.work, &b->w);
	set_bit(CREATE_BARRIER, &tconn->flags);
}

static void _about_to_complete_local_write(struct drbd_conf *mdev,
	struct drbd_request *req)
{
	const unsigned long s = req->rq_state;

	/* Before we can signal completion to the upper layers,
	 * we may need to close the current epoch.
	 * We can skip this, if this request has not even been sent, because we
	 * did not have a fully established connection yet/anymore, during
	 * bitmap exchange, or while we are C_AHEAD due to congestion policy.
	 */
	if (mdev->state.conn >= C_CONNECTED &&
	    (s & RQ_NET_SENT) != 0 &&
	    req->epoch == mdev->tconn->newest_tle->br_number)
		queue_barrier(mdev);
}

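/* Complete the master bio towards the upper layers and drop the
 * application-bio reference that was taken when the bio entered DRBD. */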
void complete_master_bio(struct drbd_conf *mdev,
		struct bio_and_error *m)
{
	bio_endio(m->bio, m->error);
	dec_ap_bio(mdev);
}

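/* Take the request out of the read/write interval tree used for conflict
 * detection, and wake up anyone waiting on it. */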
static void drbd_remove_request_interval(struct rb_root *root,
					 struct drbd_request *req)
{
	struct drbd_conf *mdev = req->w.mdev;
	struct drbd_interval *i = &req->i;

	drbd_remove_interval(root, i);

	/* Wake up any processes waiting for this request to complete. */
	if (i->waiting)
		wake_up(&mdev->misc_wait);
}

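/* Once local I/O for this request is no longer pending (or was aborted),
 * wake the waiters so conflicting peer requests can be retried. */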
static void maybe_wakeup_conflicting_requests(struct drbd_request *req)
{
	const unsigned long s = req->rq_state;
	if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
		return;
	if (req->i.waiting)
		/* Retry all conflicting peer requests. */
		wake_up(&req->w.mdev->misc_wait);
}

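/* Retire the request via _req_is_done() once the master bio is gone (or the
 * request is RQ_POSTPONED), local I/O is no longer pending, and the network
 * part is either absent or done. */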
static
void req_may_be_done(struct drbd_request *req)
{
	const unsigned long s = req->rq_state;
	struct drbd_conf *mdev = req->w.mdev;
	int rw = req->rq_state & RQ_WRITE ? WRITE : READ;

	/* req->master_bio still present means: Not yet completed.
	 *
	 * Unless this is RQ_POSTPONED, which will cause _req_is_done() to
	 * queue it on the retry workqueue instead of destroying it.
	 */
	if (req->master_bio && !(s & RQ_POSTPONED))
		return;

	/* Local still pending, even though master_bio is already completed?
	 * may happen for RQ_LOCAL_ABORTED requests. */
	if (s & RQ_LOCAL_PENDING)
		return;

	if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
		/* this is disconnected (local only) operation,
		 * or protocol A, B, or C P_BARRIER_ACK,
		 * or killed from the transfer log due to connection loss. */
		_req_is_done(mdev, req, rw);
	}
	/* else: network part and not DONE yet. that is
	 * protocol A, B, or C, barrier ack still pending... */
}

/* Helper for __req_mod().
 * Set m->bio to the master bio, if it is fit to be completed,
 * or leave it alone (it is initialized to NULL in __req_mod),
 * if it has already been completed, or cannot be completed yet.
 * If m->bio is set, the error status to be returned is placed in m->error.
 */
static
void req_may_be_completed(struct drbd_request *req, struct bio_and_error *m)
{
	const unsigned long s = req->rq_state;
	struct drbd_conf *mdev = req->w.mdev;
	int rw = req->rq_state & RQ_WRITE ? WRITE : READ;

	/* we must not complete the master bio, while it is
	 *	still being processed by _drbd_send_zc_bio (drbd_send_dblock)
	 *	not yet acknowledged by the peer
	 *	not yet completed by the local io subsystem
	 * these flags may get cleared in any order by
	 *	the worker,
	 *	the receiver,
	 *	the bio_endio completion callbacks.
	 */
	if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
		return;
	if (s & RQ_NET_QUEUED)
		return;
	if (s & RQ_NET_PENDING)
		return;

	if (req->master_bio) {
		/* this is DATA_RECEIVED (remote read)
		 * or protocol C P_WRITE_ACK
		 * or protocol B P_RECV_ACK
		 * or protocol A "HANDED_OVER_TO_NETWORK" (SendAck)
		 * or canceled or failed,
		 * or killed from the transfer log due to connection loss.
		 */

		/*
		 * figure out whether to report success or failure.
		 *
		 * report success when at least one of the operations succeeded.
		 * or, to put the other way,
		 * only report failure, when both operations failed.
		 *
		 * what to do about the failures is handled elsewhere.
		 * what we need to do here is just: complete the master_bio.
		 *
		 * local completion error, if any, has been stored as ERR_PTR
		 * in private_bio within drbd_request_endio.
		 */
		int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
		int error = PTR_ERR(req->private_bio);

		/* remove the request from the conflict detection
		 * respective block_id verification hash */
		if (!drbd_interval_empty(&req->i)) {
			struct rb_root *root;

			if (rw == WRITE)
				root = &mdev->write_requests;
			else
				root = &mdev->read_requests;
			drbd_remove_request_interval(root, req);
		} else if (!(s & RQ_POSTPONED))
			D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);

		/* for writes we need to do some extra housekeeping */
		if (rw == WRITE)
			_about_to_complete_local_write(mdev, req);

		/* Update disk stats */
		_drbd_end_io_acct(mdev, req);

		if (!(s & RQ_POSTPONED)) {
			m->error = ok ? 0 : (error ?: -EIO);
			m->bio = req->master_bio;
			req->master_bio = NULL;
		} else {
			/* Assert that this will be _req_is_done()
			 * with this very invocation. */
			/* FIXME:
			 * what about (RQ_LOCAL_PENDING | RQ_LOCAL_ABORTED)?
			 */
			D_ASSERT(!(s & RQ_LOCAL_PENDING));
			D_ASSERT(s & RQ_NET_DONE);
		}
	}
	req_may_be_done(req);
}

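/* Like req_may_be_completed(), but only while I/O is not suspended. */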
static void req_may_be_completed_not_susp(struct drbd_request *req, struct bio_and_error *m)
{
	struct drbd_conf *mdev = req->w.mdev;

	if (!drbd_suspended(mdev))
		req_may_be_completed(req, m);
}

/* obviously this could be coded as many single functions
 * instead of one huge switch,
 * or by putting the code directly in the respective locations
 * (as it has been before).
 *
 * but having it this way
 * enforces that it is all in this one place, where it is easier to audit,
 * it makes it obvious that whatever "event" "happens" to a request should
 * happen "atomically" within the req_lock,
 * and it enforces that we have to think in a very structured manner
 * about the "events" that may happen to a request during its life time ...
 */
int __req_mod(struct drbd_request *req, enum drbd_req_event what,
		struct bio_and_error *m)
{
	struct drbd_conf *mdev = req->w.mdev;
	struct net_conf *nc;
	int p, rv = 0;

	if (m)
		m->bio = NULL;

	switch (what) {
	default:
		dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
		break;

	/* does not happen...
	 * initialization done in drbd_req_new
	case CREATED:
		break;
		*/

	case TO_BE_SENT: /* via network */
		/* reached via __drbd_make_request
		 * and from w_read_retry_remote */
		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
		req->rq_state |= RQ_NET_PENDING;
		rcu_read_lock();
		nc = rcu_dereference(mdev->tconn->net_conf);
		p = nc->wire_protocol;
		rcu_read_unlock();
		req->rq_state |=
			p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
			p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
		inc_ap_pending(mdev);
		break;

	case TO_BE_SUBMITTED: /* locally */
		/* reached via __drbd_make_request */
		D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
		req->rq_state |= RQ_LOCAL_PENDING;
		break;

	case COMPLETED_OK:
		if (req->rq_state & RQ_WRITE)
			mdev->writ_cnt += req->i.size >> 9;
		else
			mdev->read_cnt += req->i.size >> 9;

		req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
		req->rq_state &= ~RQ_LOCAL_PENDING;

		maybe_wakeup_conflicting_requests(req);
		req_may_be_completed_not_susp(req, m);
		break;

	case ABORT_DISK_IO:
		req->rq_state |= RQ_LOCAL_ABORTED;
		if (req->rq_state & RQ_WRITE)
			req_may_be_completed_not_susp(req, m);
		else
			goto goto_queue_for_net_read;
		break;

	case WRITE_COMPLETED_WITH_ERROR:
		req->rq_state |= RQ_LOCAL_COMPLETED;
		req->rq_state &= ~RQ_LOCAL_PENDING;

		__drbd_chk_io_error(mdev, false);
		maybe_wakeup_conflicting_requests(req);
		req_may_be_completed_not_susp(req, m);
		break;

	case READ_AHEAD_COMPLETED_WITH_ERROR:
		/* it is legal to fail READA */
		req->rq_state |= RQ_LOCAL_COMPLETED;
		req->rq_state &= ~RQ_LOCAL_PENDING;
		req_may_be_completed_not_susp(req, m);
		break;

	case READ_COMPLETED_WITH_ERROR:
		drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);

		req->rq_state |= RQ_LOCAL_COMPLETED;
		req->rq_state &= ~RQ_LOCAL_PENDING;

		D_ASSERT(!(req->rq_state & RQ_NET_MASK));

		__drbd_chk_io_error(mdev, false);

	goto_queue_for_net_read:

		/* no point in retrying if there is no good remote data,
		 * or we have no connection. */
		if (mdev->state.pdsk != D_UP_TO_DATE) {
			req_may_be_completed_not_susp(req, m);
			break;
		}

		/* _req_mod(req,TO_BE_SENT); oops, recursion... */
		req->rq_state |= RQ_NET_PENDING;
		inc_ap_pending(mdev);
		/* fall through: _req_mod(req,QUEUE_FOR_NET_READ); */

	case QUEUE_FOR_NET_READ:
		/* READ or READA, and
		 * no local disk,
		 * or target area marked as invalid,
		 * or just got an io-error. */
		/* from __drbd_make_request
		 * or from bio_endio during read io-error recovery */

		/* So we can verify the handle in the answer packet.
		 * Corresponding drbd_remove_request_interval is in
		 * req_may_be_completed() */
		D_ASSERT(drbd_interval_empty(&req->i));
		drbd_insert_interval(&mdev->read_requests, &req->i);

		set_bit(UNPLUG_REMOTE, &mdev->flags);

		D_ASSERT(req->rq_state & RQ_NET_PENDING);
		req->rq_state |= RQ_NET_QUEUED;
		req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
			? w_read_retry_remote
			: w_send_read_req;
		drbd_queue_work(&mdev->tconn->data.work, &req->w);
		break;

	case QUEUE_FOR_NET_WRITE:
		/* assert something? */
		/* from __drbd_make_request only */

		/* Corresponding drbd_remove_request_interval is in
		 * req_may_be_completed() */
		D_ASSERT(drbd_interval_empty(&req->i));
		drbd_insert_interval(&mdev->write_requests, &req->i);

		/* NOTE
		 * In case the req ended up on the transfer log before being
		 * queued on the worker, it could lead to this request being
		 * missed during cleanup after connection loss.
		 * So we have to do both operations here,
		 * within the same lock that protects the transfer log.
		 *
		 * _req_add_to_epoch(req); this has to be after the
		 * _maybe_start_new_epoch(req); which happened in
		 * __drbd_make_request, because we now may set the bit
		 * again ourselves to close the current epoch.
		 *
		 * Add req to the (now) current epoch (barrier). */

		/* otherwise we may lose an unplug, which may cause some remote
		 * io-scheduler timeout to expire, increasing maximum latency,
		 * hurting performance. */
		set_bit(UNPLUG_REMOTE, &mdev->flags);

		/* see __drbd_make_request,
		 * just after it grabs the req_lock */
		D_ASSERT(test_bit(CREATE_BARRIER, &mdev->tconn->flags) == 0);

		req->epoch = mdev->tconn->newest_tle->br_number;

		/* increment size of current epoch */
		mdev->tconn->newest_tle->n_writes++;

		/* queue work item to send data */
		D_ASSERT(req->rq_state & RQ_NET_PENDING);
		req->rq_state |= RQ_NET_QUEUED;
		req->w.cb = w_send_dblock;
		drbd_queue_work(&mdev->tconn->data.work, &req->w);

		/* close the epoch, in case it outgrew the limit */
		rcu_read_lock();
		nc = rcu_dereference(mdev->tconn->net_conf);
		p = nc->max_epoch_size;
		rcu_read_unlock();
		if (mdev->tconn->newest_tle->n_writes >= p)
			queue_barrier(mdev);

		break;

	case QUEUE_FOR_SEND_OOS:
		req->rq_state |= RQ_NET_QUEUED;
		req->w.cb = w_send_out_of_sync;
		drbd_queue_work(&mdev->tconn->data.work, &req->w);
		break;

	case READ_RETRY_REMOTE_CANCELED:
	case SEND_CANCELED:
	case SEND_FAILED:
		/* real cleanup will be done from tl_clear.  just update flags
		 * so it is no longer marked as on the worker queue */
		req->rq_state &= ~RQ_NET_QUEUED;
		/* if we did it right, tl_clear should be scheduled only after
		 * this, so this should not be necessary! */
		req_may_be_completed_not_susp(req, m);
		break;

	case HANDED_OVER_TO_NETWORK:
		/* assert something? */
		if (bio_data_dir(req->master_bio) == WRITE)
			atomic_add(req->i.size >> 9, &mdev->ap_in_flight);

		if (bio_data_dir(req->master_bio) == WRITE &&
		    !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
			/* this is what is dangerous about protocol A:
			 * pretend it was successfully written on the peer. */
			if (req->rq_state & RQ_NET_PENDING) {
				dec_ap_pending(mdev);
				req->rq_state &= ~RQ_NET_PENDING;
				req->rq_state |= RQ_NET_OK;
			} /* else: neg-ack was faster... */
			/* it is still not yet RQ_NET_DONE until the
			 * corresponding epoch barrier got acked as well,
			 * so we know what to dirty on connection loss */
		}
		req->rq_state &= ~RQ_NET_QUEUED;
		req->rq_state |= RQ_NET_SENT;
		req_may_be_completed_not_susp(req, m);
		break;

	case OOS_HANDED_TO_NETWORK:
		/* Was not set PENDING, no longer QUEUED, so is now DONE
		 * as far as this connection is concerned. */
		req->rq_state &= ~RQ_NET_QUEUED;
		req->rq_state |= RQ_NET_DONE;
		req_may_be_completed_not_susp(req, m);
		break;

	case CONNECTION_LOST_WHILE_PENDING:
		/* transfer log cleanup after connection loss */
		/* assert something? */
		if (req->rq_state & RQ_NET_PENDING)
			dec_ap_pending(mdev);

		p = !(req->rq_state & RQ_WRITE) && req->rq_state & RQ_NET_PENDING;

		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
		req->rq_state |= RQ_NET_DONE;
		if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
			atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);

		/* if it is still queued, we may not complete it here.
		 * it will be canceled soon. */
		if (!(req->rq_state & RQ_NET_QUEUED)) {
			if (p)
				goto goto_read_retry_local;
			req_may_be_completed(req, m); /* Allowed while state.susp */
		}
		break;

	case DISCARD_WRITE:
		/* for discarded conflicting writes of multiple primaries,
		 * there is no need to keep anything in the tl, potential
		 * node crashes are covered by the activity log. */
		req->rq_state |= RQ_NET_DONE;
		/* fall through */
	case WRITE_ACKED_BY_PEER_AND_SIS:
	case WRITE_ACKED_BY_PEER:
		if (what == WRITE_ACKED_BY_PEER_AND_SIS)
			req->rq_state |= RQ_NET_SIS;
		D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
		/* protocol C; successfully written on peer.
		 * Nothing more to do here.
		 * We want to keep the tl in place for all protocols, to cater
		 * for volatile write-back caches on lower level devices. */

		goto ack_common;
	case RECV_ACKED_BY_PEER:
		D_ASSERT(req->rq_state & RQ_EXP_RECEIVE_ACK);
		/* protocol B; pretends to be successfully written on peer.
		 * see also notes above in HANDED_OVER_TO_NETWORK about
		 * protocol != C */
	ack_common:
		req->rq_state |= RQ_NET_OK;
		D_ASSERT(req->rq_state & RQ_NET_PENDING);
		dec_ap_pending(mdev);
		atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);
		req->rq_state &= ~RQ_NET_PENDING;
		maybe_wakeup_conflicting_requests(req);
		req_may_be_completed_not_susp(req, m);
		break;

	case POSTPONE_WRITE:
		D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
		/* If this node has already detected the write conflict, the
		 * worker will be waiting on misc_wait.  Wake it up once this
		 * request has completed locally.
		 */
		D_ASSERT(req->rq_state & RQ_NET_PENDING);
		req->rq_state |= RQ_POSTPONED;
		maybe_wakeup_conflicting_requests(req);
		req_may_be_completed_not_susp(req, m);
		break;

	case NEG_ACKED:
		/* assert something? */
		if (req->rq_state & RQ_NET_PENDING) {
			dec_ap_pending(mdev);
			if (req->rq_state & RQ_WRITE)
				atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);
		}
		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);

		req->rq_state |= RQ_NET_DONE;

		if (!(req->rq_state & RQ_WRITE))
			goto goto_read_retry_local;

		maybe_wakeup_conflicting_requests(req);
		req_may_be_completed_not_susp(req, m);
		/* else: done by HANDED_OVER_TO_NETWORK */
		break;

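	/* A read that failed remotely (or was never sent) may still be served
	 * locally: if the local disk covers the area in sync, re-submit it to
	 * the backing device via w_restart_disk_io. */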
	goto_read_retry_local:
		if (!drbd_may_do_local_read(mdev, req->i.sector, req->i.size)) {
			req_may_be_completed_not_susp(req, m);
			break;
		}
		D_ASSERT(!(req->rq_state & RQ_LOCAL_PENDING));
		req->rq_state |= RQ_LOCAL_PENDING;

		get_ldev(mdev);
		req->w.cb = w_restart_disk_io;
		drbd_queue_work(&mdev->tconn->data.work, &req->w);
		break;

	case FAIL_FROZEN_DISK_IO:
		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
			break;

		req_may_be_completed(req, m); /* Allowed while state.susp */
		break;

	case RESTART_FROZEN_DISK_IO:
		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
			break;

		req->rq_state &= ~RQ_LOCAL_COMPLETED;

		rv = MR_READ;
		if (bio_data_dir(req->master_bio) == WRITE)
			rv = MR_WRITE;

		get_ldev(mdev);
		req->w.cb = w_restart_disk_io;
		drbd_queue_work(&mdev->tconn->data.work, &req->w);
		break;

	case RESEND:
		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
		   before the connection loss (B&C only); only P_BARRIER_ACK was missing.
		   Throw them out of the TL here by pretending we got a BARRIER_ACK.
		   During connection handshake, we ensure that the peer was not rebooted. */
		if (!(req->rq_state & RQ_NET_OK)) {
			if (req->w.cb) {
				drbd_queue_work(&mdev->tconn->data.work, &req->w);
				rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
			}
			break;
		}
		/* else, fall through to BARRIER_ACKED */

	case BARRIER_ACKED:
		if (!(req->rq_state & RQ_WRITE))
			break;

		if (req->rq_state & RQ_NET_PENDING) {
			/* barrier came in before all requests were acked.
			 * this is bad, because if the connection is lost now,
			 * we won't be able to clean them up... */
			dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n");
			list_move(&req->tl_requests, &mdev->tconn->out_of_sequence_requests);
		}
		if ((req->rq_state & RQ_NET_MASK) != 0) {
			req->rq_state |= RQ_NET_DONE;
			if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)))
				atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);
		}
		req_may_be_done(req); /* Allowed while state.susp */
		break;

	case DATA_RECEIVED:
		D_ASSERT(req->rq_state & RQ_NET_PENDING);
		dec_ap_pending(mdev);
		req->rq_state &= ~RQ_NET_PENDING;
		req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
		req_may_be_completed_not_susp(req, m);
		break;
	};

	return rv;
}

/* we may do a local read if:
 * - we are consistent (of course),
 * - or we are generally inconsistent,
 *   BUT we are still/already IN SYNC for this area.
 *   since size may be bigger than BM_BLOCK_SIZE,
 *   we may need to check several bits.
 */
static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
{
	unsigned long sbnr, ebnr;
	sector_t esector, nr_sectors;

	if (mdev->state.disk == D_UP_TO_DATE)
		return true;
	if (mdev->state.disk != D_INCONSISTENT)
		return false;
	esector = sector + (size >> 9) - 1;
	nr_sectors = drbd_get_capacity(mdev->this_bdev);
	D_ASSERT(sector < nr_sectors);
	D_ASSERT(esector < nr_sectors);

	sbnr = BM_SECT_TO_BIT(sector);
	ebnr = BM_SECT_TO_BIT(esector);

	return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0;
}

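/* Decide whether a read should be sent to the peer instead of the local
 * backing device, according to the configured read-balancing policy. */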
static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector)
{
	enum drbd_read_balancing rbm;
	struct backing_dev_info *bdi;
	int stripe_shift;

	if (mdev->state.pdsk < D_UP_TO_DATE)
		return false;

	rcu_read_lock();
	rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
	rcu_read_unlock();

	switch (rbm) {
	case RB_CONGESTED_REMOTE:
		bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
		return bdi_read_congested(bdi);
	case RB_LEAST_PENDING:
		return atomic_read(&mdev->local_cnt) >
			atomic_read(&mdev->ap_pending_cnt) + atomic_read(&mdev->rs_pending_cnt);
	case RB_32K_STRIPING:  /* stripe_shift = 15 */
	case RB_64K_STRIPING:
	case RB_128K_STRIPING:
	case RB_256K_STRIPING:
	case RB_512K_STRIPING:
	case RB_1M_STRIPING:   /* stripe_shift = 20 */
		stripe_shift = (rbm - RB_32K_STRIPING + 15);
		return (sector >> (stripe_shift - 9)) & 1;
	case RB_ROUND_ROBIN:
		return test_and_change_bit(READ_BALANCE_RR, &mdev->flags);
	case RB_PREFER_REMOTE:
		return true;
	case RB_PREFER_LOCAL:
	default:
		return false;
	}
}

/*
 * complete_conflicting_writes  -  wait for any conflicting write requests
 *
 * The write_requests tree contains all active write requests which we
 * currently know about.  Wait for any requests to complete which conflict with
 * the new one.
 */
static int complete_conflicting_writes(struct drbd_conf *mdev,
				       sector_t sector, int size)
{
	for (;;) {
		struct drbd_interval *i;
		int err;

		i = drbd_find_overlap(&mdev->write_requests, sector, size);
		if (!i)
			return 0;
		err = drbd_wait_misc(mdev, i);
		if (err)
			return err;
	}
}

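/* Main request submission path: decide whether to serve the bio locally,
 * remotely, or only send out-of-sync information; register the request in
 * the transfer log under the req_lock (opening a new epoch if necessary);
 * queue the network work items; and finally submit the private bio to the
 * local backing device. */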
int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
{
	const int rw = bio_rw(bio);
	const int size = bio->bi_size;
	const sector_t sector = bio->bi_sector;
	struct drbd_tl_epoch *b = NULL;
	struct drbd_request *req;
	struct net_conf *nc;
	int local, remote, send_oos = 0;
	int err;
	int ret = 0;
	union drbd_dev_state s;

	/* allocate outside of all locks; */
	req = drbd_req_new(mdev, bio);
	if (!req) {
		dec_ap_bio(mdev);
		/* only pass the error to the upper layers.
		 * if user cannot handle io errors, that's not our business. */
		dev_err(DEV, "could not kmalloc() req\n");
		bio_endio(bio, -ENOMEM);
		return 0;
	}
	req->start_time = start_time;

	local = get_ldev(mdev);
	if (!local) {
		bio_put(req->private_bio); /* or we get a bio leak */
		req->private_bio = NULL;
	}
	if (rw == WRITE) {
		remote = 1;
	} else {
		/* READ || READA */
		if (local) {
			if (!drbd_may_do_local_read(mdev, sector, size) ||
			    remote_due_to_read_balancing(mdev, sector)) {
				/* we could kick the syncer to
				 * sync this extent asap, wait for
				 * it, then continue locally.
				 * Or just issue the request remotely.
				 */
				local = 0;
				bio_put(req->private_bio);
				req->private_bio = NULL;
				put_ldev(mdev);
			}
		}
		remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
	}

	/* If we have a disk, but a READA request is mapped to remote,
	 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
	 * Just fail that READA request right here.
	 *
	 * THINK: maybe fail all READA when not local?
	 *        or make this configurable...
	 *        if network is slow, READA won't do any good.
	 */
	if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
		err = -EWOULDBLOCK;
		goto fail_and_free_req;
	}

	/* For WRITES going to the local disk, grab a reference on the target
	 * extent.  This waits for any resync activity in the corresponding
	 * resync extent to finish, and, if necessary, pulls in the target
	 * extent into the activity log, which involves further disk io because
	 * of transactional on-disk meta data updates. */
	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
		req->rq_state |= RQ_IN_ACT_LOG;
		drbd_al_begin_io(mdev, &req->i);
	}

	s = mdev->state;
	remote = remote && drbd_should_do_remote(s);
	send_oos = rw == WRITE && drbd_should_send_out_of_sync(s);
	D_ASSERT(!(remote && send_oos));

	if (!(local || remote) && !drbd_suspended(mdev)) {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
		err = -EIO;
		goto fail_free_complete;
	}

	/* For WRITE request, we have to make sure that we have an
	 * unused_spare_tle, in case we need to start a new epoch.
	 * I try to be smart and avoid always pre-allocating "just in case",
	 * but there is a race between testing the bit and pointer outside the
	 * spinlock, and grabbing the spinlock.
	 * if we lost that race, we retry. */
	if (rw == WRITE && (remote || send_oos) &&
	    mdev->tconn->unused_spare_tle == NULL &&
	    test_bit(CREATE_BARRIER, &mdev->tconn->flags)) {
allocate_barrier:
		b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
		if (!b) {
			dev_err(DEV, "Failed to alloc barrier.\n");
			err = -ENOMEM;
			goto fail_free_complete;
		}
	}

	/* GOOD, everything prepared, grab the spin_lock */
	spin_lock_irq(&mdev->tconn->req_lock);

	if (rw == WRITE) {
		err = complete_conflicting_writes(mdev, sector, size);
		if (err) {
			if (err != -ERESTARTSYS)
				_conn_request_state(mdev->tconn,
						    NS(conn, C_TIMEOUT),
						    CS_HARD);
			spin_unlock_irq(&mdev->tconn->req_lock);
			err = -EIO;
			goto fail_free_complete;
		}
	}

	if (drbd_suspended(mdev)) {
		/* If we got suspended, use the retry mechanism in
		   drbd_make_request() to restart processing of this
		   bio. In the next call to drbd_make_request
		   we sleep in inc_ap_bio() */
		ret = 1;
		spin_unlock_irq(&mdev->tconn->req_lock);
		goto fail_free_complete;
	}

	if (remote || send_oos) {
		remote = drbd_should_do_remote(mdev->state);
		send_oos = rw == WRITE && drbd_should_send_out_of_sync(mdev->state);
		D_ASSERT(!(remote && send_oos));

		if (!(remote || send_oos))
			dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
		if (!(local || remote)) {
			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
			spin_unlock_irq(&mdev->tconn->req_lock);
			err = -EIO;
			goto fail_free_complete;
		}
	}

	if (b && mdev->tconn->unused_spare_tle == NULL) {
		mdev->tconn->unused_spare_tle = b;
		b = NULL;
	}
	if (rw == WRITE && (remote || send_oos) &&
	    mdev->tconn->unused_spare_tle == NULL &&
	    test_bit(CREATE_BARRIER, &mdev->tconn->flags)) {
		/* someone closed the current epoch
		 * while we were grabbing the spinlock */
		spin_unlock_irq(&mdev->tconn->req_lock);
		goto allocate_barrier;
	}


	/* Update disk stats */
	_drbd_start_io_acct(mdev, req, bio);

	/* _maybe_start_new_epoch(mdev);
	 * If we need to generate a write barrier packet, we have to add the
	 * new epoch (barrier) object, and queue the barrier packet for sending,
	 * and queue the req's data after it _within the same lock_, otherwise
	 * we have race conditions where the reorder domains could be mixed up.
	 *
	 * Even read requests may start a new epoch and queue the corresponding
	 * barrier packet. To get the write ordering right, we only have to
	 * make sure that, if this is a write request and it triggered a
	 * barrier packet, this request is queued within the same spinlock. */
	if ((remote || send_oos) && mdev->tconn->unused_spare_tle &&
	    test_and_clear_bit(CREATE_BARRIER, &mdev->tconn->flags)) {
		_tl_add_barrier(mdev->tconn, mdev->tconn->unused_spare_tle);
		mdev->tconn->unused_spare_tle = NULL;
	} else {
		D_ASSERT(!(remote && rw == WRITE &&
			   test_bit(CREATE_BARRIER, &mdev->tconn->flags)));
	}

	/* NOTE
	 * Actually, 'local' may be wrong here already, since we may have failed
	 * to write to the meta data, and may become wrong anytime because of
	 * local io-error for some other request, which would lead to us
	 * "detaching" the local disk.
	 *
	 * 'remote' may become wrong any time because the network could fail.
	 *
	 * This is a harmless race condition, though, since it is handled
	 * correctly at the appropriate places; so it just defers the failure
	 * of the respective operation.
	 */

	/* mark them early for readability.
	 * this just sets some state flags. */
	if (remote)
		_req_mod(req, TO_BE_SENT);
	if (local)
		_req_mod(req, TO_BE_SUBMITTED);

	list_add_tail(&req->tl_requests, &mdev->tconn->newest_tle->requests);

	/* NOTE remote first: to get the concurrent write detection right,
	 * we must register the request before start of local IO. */
	if (remote) {
		/* either WRITE and C_CONNECTED,
		 * or READ, and no local disk,
		 * or READ, but not in sync.
		 */
		_req_mod(req, (rw == WRITE)
				? QUEUE_FOR_NET_WRITE
				: QUEUE_FOR_NET_READ);
	}
	if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
		_req_mod(req, QUEUE_FOR_SEND_OOS);

	rcu_read_lock();
	nc = rcu_dereference(mdev->tconn->net_conf);
	if (remote &&
	    nc->on_congestion != OC_BLOCK && mdev->tconn->agreed_pro_version >= 96) {
		int congested = 0;

		if (nc->cong_fill &&
		    atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) {
			dev_info(DEV, "Congestion-fill threshold reached\n");
			congested = 1;
		}

		if (mdev->act_log->used >= nc->cong_extents) {
			dev_info(DEV, "Congestion-extents threshold reached\n");
			congested = 1;
		}

		if (congested) {
			queue_barrier(mdev); /* last barrier, after mirrored writes */

			if (nc->on_congestion == OC_PULL_AHEAD)
				_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
			else  /* nc->on_congestion == OC_DISCONNECT */
				_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
		}
	}
	rcu_read_unlock();

	spin_unlock_irq(&mdev->tconn->req_lock);
	kfree(b); /* if someone else has beaten us to it... */

	if (local) {
		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;

		/* State may have changed since we grabbed our reference on the
		 * mdev->ldev member. Double check, and short-circuit to endio.
		 * In case the last activity log transaction failed to get on
		 * stable storage, and this is a WRITE, we may not even submit
		 * this bio. */
		if (get_ldev(mdev)) {
			if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
						    : rw == READ ? DRBD_FAULT_DT_RD
						    : DRBD_FAULT_DT_RA))
				bio_endio(req->private_bio, -EIO);
			else
				generic_make_request(req->private_bio);
			put_ldev(mdev);
		} else
			bio_endio(req->private_bio, -EIO);
	}

	return 0;

fail_free_complete:
	if (req->rq_state & RQ_IN_ACT_LOG)
		drbd_al_complete_io(mdev, &req->i);
fail_and_free_req:
	if (local) {
		bio_put(req->private_bio);
		req->private_bio = NULL;
		put_ldev(mdev);
	}
	if (!ret)
		bio_endio(bio, err);

	drbd_req_free(req);
	dec_ap_bio(mdev);
	kfree(b);

	return ret;
}

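/* make_request entry point from the block layer. __drbd_make_request()
 * returns nonzero if processing must be retried (I/O currently suspended),
 * so loop, taking a fresh ap_bio reference for each attempt. */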
int drbd_make_request(struct request_queue *q, struct bio *bio)
{
	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
	unsigned long start_time;

	start_time = jiffies;

	/*
	 * what we "blindly" assume:
	 */
	D_ASSERT(bio->bi_size > 0);
	D_ASSERT(IS_ALIGNED(bio->bi_size, 512));

	do {
		inc_ap_bio(mdev);
	} while (__drbd_make_request(mdev, bio, start_time));

	return 0;
}

/* This is called by bio_add_page().
 *
 * q->max_hw_sectors and other global limits are already enforced there.
 *
 * We need to call down to our lower level device,
 * in case it has special restrictions.
 *
 * We also may need to enforce configured max-bio-bvecs limits.
 *
 * As long as the BIO is empty we have to allow at least one bvec,
 * regardless of size and offset, so no need to ask lower levels.
 */
int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
{
	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
	unsigned int bio_size = bvm->bi_size;
	int limit = DRBD_MAX_BIO_SIZE;
	int backing_limit;

	if (bio_size && get_ldev(mdev)) {
		struct request_queue * const b =
			mdev->ldev->backing_bdev->bd_disk->queue;
		if (b->merge_bvec_fn) {
			backing_limit = b->merge_bvec_fn(b, bvm, bvec);
			limit = min(limit, backing_limit);
		}
		put_ldev(mdev);
	}
	return limit;
}

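/* Per-device request timeout timer: look at the oldest request in the
 * transfer log and escalate if the peer exceeded ko-count * timeout
 * (go to C_TIMEOUT) or the local disk exceeded its disk-timeout. */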
void request_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;
	struct drbd_tconn *tconn = mdev->tconn;
	struct drbd_request *req; /* oldest request */
	struct list_head *le;
	struct net_conf *nc;
	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */

	rcu_read_lock();
	nc = rcu_dereference(tconn->net_conf);
	ent = nc ? nc->timeout * HZ/10 * nc->ko_count : 0;

	if (get_ldev(mdev)) {
		dt = rcu_dereference(mdev->ldev->disk_conf)->disk_timeout * HZ / 10;
		put_ldev(mdev);
	}
	rcu_read_unlock();

	et = min_not_zero(dt, ent);

	if (!et || (mdev->state.conn < C_WF_REPORT_PARAMS && mdev->state.disk <= D_FAILED))
		return; /* Recurring timer stopped */

	spin_lock_irq(&tconn->req_lock);
	le = &tconn->oldest_tle->requests;
	if (list_empty(le)) {
		spin_unlock_irq(&tconn->req_lock);
		mod_timer(&mdev->request_timer, jiffies + et);
		return;
	}

	le = le->prev;
	req = list_entry(le, struct drbd_request, tl_requests);
	if (ent && req->rq_state & RQ_NET_PENDING) {
		if (time_is_before_eq_jiffies(req->start_time + ent)) {
			dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
			_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
		}
	}
	if (dt && req->rq_state & RQ_LOCAL_PENDING && req->w.mdev == mdev) {
		if (time_is_before_eq_jiffies(req->start_time + dt)) {
			dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
			__drbd_chk_io_error(mdev, 1);
		}
	}
	nt = (time_is_before_eq_jiffies(req->start_time + et) ? jiffies : req->start_time) + et;
	spin_unlock_irq(&tconn->req_lock);
	mod_timer(&mdev->request_timer, nt);
}