struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;
+mempool_t *drbd_md_io_page_pool;
+struct bio_set *drbd_md_io_bio_set;
/* I do not use a standard mempool, because:
1) I want to hand out the pre-allocated objects first.
.release = drbd_release,
};
-#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
+static void bio_destructor_drbd(struct bio *bio)
+{
+ bio_free(bio, drbd_md_io_bio_set);
+}
+
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+ struct bio *bio;
+
+ if (!drbd_md_io_bio_set)
+ return bio_alloc(gfp_mask, 1);
+
+ bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ if (!bio)
+ return NULL;
+ bio->bi_destructor = bio_destructor_drbd;
+ return bio;
+}
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
new->n_writes = 0;
newest_before = mdev->newest_tle;
- /* never send a barrier number == 0, because that is special-cased
- * when using TCQ for our write ordering code */
- new->br_number = (newest_before->br_number+1) ?: 1;
+ new->br_number = newest_before->br_number+1;
if (mdev->newest_tle != new) {
mdev->newest_tle->next = new;
mdev->newest_tle = new;
* @what: The action/event to perform with all request objects
*
* @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
- * restart_frozen_disk_io, abort_disk_io.
+ * restart_frozen_disk_io.
*/
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
}
tmp = b->next;
- if (what == abort_disk_io) {
- /* Only walk the TL, leave barrier objects in place */
- b = tmp;
- continue;
- }
-
if (n_writes) {
if (what == resend) {
b->n_writes = n_writes;
/* Actions operating on the disk state, also want to work on
requests that got barrier acked. */
switch (what) {
- case abort_disk_io:
case fail_frozen_disk_io:
case restart_frozen_disk_io:
list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
spin_unlock_irq(&mdev->req_lock);
}
+/**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
+ * @mdev: DRBD device.
+ */
+void tl_abort_disk_io(struct drbd_conf *mdev)
+{
+ struct drbd_tl_epoch *b;
+ struct list_head *le, *tle;
+ struct drbd_request *req;
+
+ spin_lock_irq(&mdev->req_lock);
+ b = mdev->oldest_tle;
+ while (b) {
+ list_for_each_safe(le, tle, &b->requests) {
+ req = list_entry(le, struct drbd_request, tl_requests);
+ if (!(req->rq_state & RQ_LOCAL_PENDING))
+ continue;
+ _req_mod(req, abort_disk_io);
+ }
+ b = b->next;
+ }
+
+ list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+ req = list_entry(le, struct drbd_request, tl_requests);
+ if (!(req->rq_state & RQ_LOCAL_PENDING))
+ continue;
+ _req_mod(req, abort_disk_io);
+ }
+
+ spin_unlock_irq(&mdev->req_lock);
+}
+
/**
* cl_wide_st_chg() - true if the state change is a cluster wide one
* @mdev: DRBD device.
if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
rv = SS_IN_TRANSIENT_STATE;
+ /* While establishing a connection only allow cstate to change.
+ Delay/refuse role changes, detach attach etc... */
+ if (test_bit(STATE_SENT, &mdev->flags) &&
+ !(os.conn == C_WF_REPORT_PARAMS ||
+ (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
+ rv = SS_IN_TRANSIENT_STATE;
+
if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
rv = SS_NEED_CONNECTION;
/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
* If you try to go into some Sync* state, that shall fail (elsewhere). */
if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
- ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
+ ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
ns.conn = os.conn;
/* we cannot fail (again) if we already detached */
if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
ns.disk = D_DISKLESS;
- /* if we are only D_ATTACHING yet,
- * we can (and should) go directly to D_DISKLESS. */
- if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
- ns.disk = D_DISKLESS;
-
/* After C_DISCONNECTING only C_STANDALONE may follow */
if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
ns.conn = os.conn;
drbd_thread_stop_nowait(&mdev->receiver);
/* Upon network failure, we need to restart the receiver. */
- if (os.conn > C_TEAR_DOWN &&
+ if (os.conn > C_WF_CONNECTION &&
ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
drbd_thread_restart_nowait(&mdev->receiver);
if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
drbd_resume_al(mdev);
+ /* remember last connect and attach times so request_timer_fn() won't
+ * kill newly established sessions while we are still trying to thaw
+ * previously frozen IO */
+ if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
+ mdev->last_reconnect_jif = jiffies;
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ ns.disk > D_NEGOTIATING)
+ mdev->last_reattach_jif = jiffies;
+
ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
if (ascw) {
ascw->os = os;
if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
what = resend;
- if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ ns.disk > D_NEGOTIATING)
what = restart_frozen_disk_io;
if (what != nothing)
/* Do not change the order of the if above and the two below... */
if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
+ /* we probably will start a resync soon.
+ * make sure those things are properly reset. */
+ mdev->rs_total = 0;
+ mdev->rs_failed = 0;
+ atomic_set(&mdev->rs_pending_cnt, 0);
+ drbd_rs_cancel_all(mdev);
+
drbd_send_uuids(mdev);
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
}
/* No point in queuing send_bitmap if we don't have a connection
* anymore, so check also the _current_ state, not only the new state
os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
drbd_send_sizes(mdev, 0, 0); /* to start sync... */
drbd_send_uuids(mdev);
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
}
/* We want to pause/continue resync, tell peer. */
if (ns.conn >= C_CONNECTED &&
((os.aftr_isp != ns.aftr_isp) ||
(os.user_isp != ns.user_isp)))
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
/* In case one of the isp bits got set, suspend other devices. */
if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
/* Make sure the peer gets informed about eventual state
changes (ISP bits) while we were in WFReportParams. */
if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
/* We are in the progress to start a full sync... */
if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
/* first half of local IO error, failure to attach,
* or administrative detach */
if (os.disk != D_FAILED && ns.disk == D_FAILED) {
- enum drbd_io_error_p eh;
- int was_io_error;
+ enum drbd_io_error_p eh = EP_PASS_ON;
+ int was_io_error = 0;
/* corresponding get_ldev was in __drbd_set_state, to serialize
- * our cleanup here with the transition to D_DISKLESS,
- * so it is safe to dreference ldev here. */
- eh = mdev->ldev->dc.on_io_error;
- was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
-
- /* Immediately allow completion of all application IO, that waits
- for completion from the local disk. */
- tl_restart(mdev, abort_disk_io);
-
- /* current state still has to be D_FAILED,
- * there is only one way out: to D_DISKLESS,
- * and that may only happen after our put_ldev below. */
- if (mdev->state.disk != D_FAILED)
- dev_err(DEV,
- "ASSERT FAILED: disk is %s during detach\n",
- drbd_disk_str(mdev->state.disk));
-
- if (drbd_send_state(mdev))
- dev_info(DEV, "Notified peer that I am detaching my disk\n");
-
- drbd_rs_cancel_all(mdev);
-
- /* In case we want to get something to stable storage still,
- * this may be the last chance.
- * Following put_ldev may transition to D_DISKLESS. */
- drbd_md_sync(mdev);
+ * our cleanup here with the transition to D_DISKLESS.
+ * But is is still not save to dreference ldev here, since
+ * we might come from an failed Attach before ldev was set. */
+ if (mdev->ldev) {
+ eh = mdev->ldev->dc.on_io_error;
+ was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
+
+ if (was_io_error && eh == EP_CALL_HELPER)
+ drbd_khelper(mdev, "local-io-error");
+
+ /* Immediately allow completion of all application IO,
+ * that waits for completion from the local disk,
+ * if this was a force-detach due to disk_timeout
+ * or administrator request (drbdsetup detach --force).
+ * Do NOT abort otherwise.
+ * Aborting local requests may cause serious problems,
+ * if requests are completed to upper layers already,
+ * and then later the already submitted local bio completes.
+ * This can cause DMA into former bio pages that meanwhile
+ * have been re-used for other things.
+ * So aborting local requests may cause crashes,
+ * or even worse, silent data corruption.
+ */
+ if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
+ tl_abort_disk_io(mdev);
+
+ /* current state still has to be D_FAILED,
+ * there is only one way out: to D_DISKLESS,
+ * and that may only happen after our put_ldev below. */
+ if (mdev->state.disk != D_FAILED)
+ dev_err(DEV,
+ "ASSERT FAILED: disk is %s during detach\n",
+ drbd_disk_str(mdev->state.disk));
+
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev, ns);
+
+ drbd_rs_cancel_all(mdev);
+
+ /* In case we want to get something to stable storage still,
+ * this may be the last chance.
+ * Following put_ldev may transition to D_DISKLESS. */
+ drbd_md_sync(mdev);
+ }
put_ldev(mdev);
-
- if (was_io_error && eh == EP_CALL_HELPER)
- drbd_khelper(mdev, "local-io-error");
}
/* second half of local IO error, failure to attach,
"ASSERT FAILED: disk is %s while going diskless\n",
drbd_disk_str(mdev->state.disk));
- mdev->rs_total = 0;
- mdev->rs_failed = 0;
- atomic_set(&mdev->rs_pending_cnt, 0);
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev, ns);
- if (drbd_send_state(mdev))
- dev_info(DEV, "Notified peer that I'm now diskless.\n");
/* corresponding get_ldev in __drbd_set_state
* this may finally trigger drbd_ldev_destroy. */
put_ldev(mdev);
}
/* Notify peer that I had a local IO error, and did not detached.. */
- if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
- drbd_send_state(mdev);
+ if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev, ns);
/* Disks got bigger while they were detached */
if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
/* sync target done with resync. Explicitly notify peer, even though
* it should (at least for non-empty resyncs) already know itself. */
if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
+
+ /* Wake up role changes, that were delayed because of connection establishing */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
+ clear_bit(STATE_SENT, &mdev->flags);
+ wake_up(&mdev->state_wait);
+ }
/* This triggers bitmap writeout of potentially still unwritten pages
* if the resync finished cleanly, or aborted because of peer disk
* No harm done if some bits change during this phase.
*/
if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
- drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
- "write from resync_finished", BM_LOCKED_SET_ALLOWED);
+ drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
+ "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
put_ldev(mdev);
}
D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
- uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
+ uuid = mdev->ldev->md.uuid[UI_BITMAP];
+ if (uuid && uuid != UUID_JUST_CREATED)
+ uuid = uuid + UUID_NEW_BM_OFFSET;
+ else
+ get_random_bytes(&uuid, sizeof(u64));
drbd_uuid_set(mdev, UI_BITMAP, uuid);
drbd_print_uuids(mdev, "updated sync UUID");
drbd_md_sync(mdev);
}
/**
- * drbd_send_state() - Sends the drbd state to the peer
+ * drbd_send_current_state() - Sends the drbd state to the peer
* @mdev: DRBD device.
*/
-int drbd_send_state(struct drbd_conf *mdev)
+int drbd_send_current_state(struct drbd_conf *mdev)
{
struct socket *sock;
struct p_state p;
return ok;
}
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @mdev: DRBD device.
+ * @state: the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ */
+int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
+{
+ struct socket *sock;
+ struct p_state p;
+ int ok = 0;
+
+ mutex_lock(&mdev->data.mutex);
+
+ p.state = cpu_to_be32(state.i);
+ sock = mdev->data.socket;
+
+ if (likely(sock != NULL)) {
+ ok = _drbd_send_cmd(mdev, sock, P_STATE,
+ (struct p_header80 *)&p, sizeof(p), 0);
+ }
+
+ mutex_unlock(&mdev->data.mutex);
+
+ return ok;
+}
+
int drbd_send_state_req(struct drbd_conf *mdev,
union drbd_state mask, union drbd_state val)
{
struct bio_vec *bvec;
int i;
/* hint all but last page with MSG_MORE */
- __bio_for_each_segment(bvec, bio, i, 0) {
+ bio_for_each_segment(bvec, bio, i) {
if (!_drbd_no_send_page(mdev, bvec->bv_page,
bvec->bv_offset, bvec->bv_len,
i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
struct bio_vec *bvec;
int i;
/* hint all but last page with MSG_MORE */
- __bio_for_each_segment(bvec, bio, i, 0) {
+ bio_for_each_segment(bvec, bio, i) {
if (!_drbd_send_page(mdev, bvec->bv_page,
bvec->bv_offset, bvec->bv_len,
i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
p.sector = cpu_to_be64(req->sector);
p.block_id = (unsigned long)req;
- p.seq_num = cpu_to_be32(req->seq_num =
- atomic_add_return(1, &mdev->packet_seq));
+ p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
/* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
+ if (drbd_md_io_bio_set)
+ bioset_free(drbd_md_io_bio_set);
+ if (drbd_md_io_page_pool)
+ mempool_destroy(drbd_md_io_page_pool);
if (drbd_ee_mempool)
mempool_destroy(drbd_ee_mempool);
if (drbd_request_mempool)
if (drbd_al_ext_cache)
kmem_cache_destroy(drbd_al_ext_cache);
+ drbd_md_io_bio_set = NULL;
+ drbd_md_io_page_pool = NULL;
drbd_ee_mempool = NULL;
drbd_request_mempool = NULL;
drbd_ee_cache = NULL;
drbd_bm_ext_cache = NULL;
drbd_al_ext_cache = NULL;
drbd_pp_pool = NULL;
+ drbd_md_io_page_pool = NULL;
+ drbd_md_io_bio_set = NULL;
/* caches */
drbd_request_cache = kmem_cache_create(
goto Enomem;
/* mempools */
+#ifdef COMPAT_HAVE_BIOSET_CREATE
+ drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+ if (drbd_md_io_bio_set == NULL)
+ goto Enomem;
+#endif
+
+ drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
+ if (drbd_md_io_page_pool == NULL)
+ goto Enomem;
+
drbd_request_mempool = mempool_create(number,
mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
if (drbd_request_mempool == NULL)
goto out;
}
+ if (test_bit(CALLBACK_PENDING, &mdev->flags)) {
+ r |= (1 << BDI_async_congested);
+ /* Without good local data, we would need to read from remote,
+ * and that would need the worker thread as well, which is
+ * currently blocked waiting for that usermode helper to
+ * finish.
+ */
+ if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
+ r |= (1 << BDI_sync_congested);
+ else
+ put_ldev(mdev);
+ r &= bdi_bits;
+ reason = 'c';
+ goto out;
+ }
+
if (get_ldev(mdev)) {
q = bdev_get_queue(mdev->ldev->backing_bdev);
r = bdi_congested(&q->backing_dev_info, bdi_bits);
if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
/* this was a try anyways ... */
dev_err(DEV, "meta data update failed!\n");
- drbd_chk_io_error(mdev, 1, true);
+ drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
}
/* Update mdev->ldev->md.la_size_sect,
static char buildtag[38] = "\0uilt-in";
if (buildtag[0] == 0) {
-#ifdef CONFIG_MODULES
- if (THIS_MODULE != NULL)
- sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
- else
+#ifdef MODULE
+ sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+#else
+ buildtag[0] = 'b';
#endif
- buildtag[0] = 'b';
}
return buildtag;