git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - drivers/block/nbd.c
nbd: separate out the config information
1da177e4
LT
1/*
2 * Network block device - make block devices work over TCP
3 *
4 * Note that you can not swap over this thing, yet. Seems to work but
5 * deadlocks sometimes - you can not swap over TCP in general.
6 *
a2531293 7 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
1da177e4
LT
8 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
9 *
dbf492d6 10 * This file is released under GPLv2 or later.
1da177e4 11 *
dbf492d6 12 * (part of code stolen from loop.c)
1da177e4
LT
13 */
14
15#include <linux/major.h>
16
17#include <linux/blkdev.h>
18#include <linux/module.h>
19#include <linux/init.h>
20#include <linux/sched.h>
21#include <linux/fs.h>
22#include <linux/bio.h>
23#include <linux/stat.h>
24#include <linux/errno.h>
25#include <linux/file.h>
26#include <linux/ioctl.h>
2a48fc0a 27#include <linux/mutex.h>
4b2f0260
HX
28#include <linux/compiler.h>
29#include <linux/err.h>
30#include <linux/kernel.h>
5a0e3ad6 31#include <linux/slab.h>
1da177e4 32#include <net/sock.h>
91cf45f0 33#include <linux/net.h>
48cf6061 34#include <linux/kthread.h>
b9c495bb 35#include <linux/types.h>
30d53d9c 36#include <linux/debugfs.h>
fd8383fd 37#include <linux/blk-mq.h>
1da177e4 38
7c0f6ba6 39#include <linux/uaccess.h>
1da177e4
LT
40#include <asm/types.h>
41
42#include <linux/nbd.h>
43
b0d9111a
JB
44static DEFINE_IDR(nbd_index_idr);
45static DEFINE_MUTEX(nbd_index_mutex);
46
9561a7ad
JB
47struct nbd_sock {
48 struct socket *sock;
49 struct mutex tx_lock;
9dd5d3ab
JB
50 struct request *pending;
51 int sent;
f3733247
JB
52 bool dead;
53 int fallback_index;
9561a7ad
JB
54};
55
5ea8d108
JB
56struct recv_thread_args {
57 struct work_struct work;
58 struct nbd_device *nbd;
59 int index;
60};
61
9b4a6ba9
JB
62#define NBD_TIMEDOUT 0
63#define NBD_DISCONNECT_REQUESTED 1
9561a7ad 64#define NBD_DISCONNECTED 2
5ea8d108 65#define NBD_HAS_PID_FILE 3
9b4a6ba9 66
5ea8d108 67struct nbd_config {
22d109c1 68 u32 flags;
9b4a6ba9 69 unsigned long runtime_flags;
13e71d69 70
5ea8d108 71 struct nbd_sock **socks;
9561a7ad 72 int num_connections;
5ea8d108 73
9561a7ad
JB
74 atomic_t recv_threads;
75 wait_queue_head_t recv_wq;
ef77b515 76 loff_t blksize;
b9c495bb 77 loff_t bytesize;
30d53d9c
MP
78#if IS_ENABLED(CONFIG_DEBUG_FS)
79 struct dentry *dbg_dir;
80#endif
13e71d69
MP
81};
82
5ea8d108
JB
83struct nbd_device {
84 struct blk_mq_tag_set tag_set;
85
86 refcount_t config_refs;
87 struct nbd_config *config;
88 struct mutex config_lock;
89 struct gendisk *disk;
90
91 struct task_struct *task_recv;
92 struct task_struct *task_setup;
93};
94
fd8383fd
JB
95struct nbd_cmd {
96 struct nbd_device *nbd;
f3733247 97 int index;
9561a7ad 98 struct completion send_complete;
fd8383fd
JB
99};
100
30d53d9c
MP
101#if IS_ENABLED(CONFIG_DEBUG_FS)
102static struct dentry *nbd_dbg_dir;
103#endif
104
105#define nbd_name(nbd) ((nbd)->disk->disk_name)
106
f4507164 107#define NBD_MAGIC 0x68797548
1da177e4 108
9c7a4169 109static unsigned int nbds_max = 16;
d71a6d73 110static int max_part;
124d6db0 111static struct workqueue_struct *recv_workqueue;
b0d9111a 112static int part_shift;
1da177e4 113
9442b739
JB
114static int nbd_dev_dbg_init(struct nbd_device *nbd);
115static void nbd_dev_dbg_close(struct nbd_device *nbd);
5ea8d108 116static void nbd_config_put(struct nbd_device *nbd);
9442b739 117
d18509f5 118static inline struct device *nbd_to_dev(struct nbd_device *nbd)
1da177e4 119{
d18509f5 120 return disk_to_dev(nbd->disk);
1da177e4
LT
121}
122
37091fdd
MP
123static bool nbd_is_connected(struct nbd_device *nbd)
124{
125 return !!nbd->task_recv;
126}
127
1da177e4
LT
128static const char *nbdcmd_to_ascii(int cmd)
129{
130 switch (cmd) {
131 case NBD_CMD_READ: return "read";
132 case NBD_CMD_WRITE: return "write";
133 case NBD_CMD_DISC: return "disconnect";
75f187ab 134 case NBD_CMD_FLUSH: return "flush";
a336d298 135 case NBD_CMD_TRIM: return "trim/discard";
1da177e4
LT
136 }
137 return "invalid";
138}
1da177e4 139
5ea8d108
JB
140static ssize_t pid_show(struct device *dev,
141 struct device_attribute *attr, char *buf)
142{
143 struct gendisk *disk = dev_to_disk(dev);
144 struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
145
146 return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
147}
148
149static struct device_attribute pid_attr = {
150 .attr = { .name = "pid", .mode = S_IRUGO},
151 .show = pid_show,
152};
153
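/*
 * Mark a socket dead: shut it down if nobody has done so yet, forget any
 * partially-sent request state, and leave it flagged so the submit path
 * picks a fallback connection instead.
 */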
f3733247
JB
154static void nbd_mark_nsock_dead(struct nbd_sock *nsock)
155{
156 if (!nsock->dead)
157 kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
158 nsock->dead = true;
159 nsock->pending = NULL;
160 nsock->sent = 0;
161}
162
37091fdd
MP
163static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
164{
5ea8d108
JB
165 if (nbd->config->bytesize) {
166 if (bdev->bd_openers <= 1)
167 bd_set_size(bdev, 0);
168 set_capacity(nbd->disk, 0);
169 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
170 }
37091fdd
MP
171
172 return 0;
173}
174
175static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
176{
5ea8d108
JB
177 struct nbd_config *config = nbd->config;
178 blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
179 blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
180 bd_set_size(bdev, config->bytesize);
181 set_capacity(nbd->disk, config->bytesize >> 9);
37091fdd
MP
182 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
183}
184
e544541b 185static void nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
ef77b515 186 loff_t blocksize, loff_t nr_blocks)
37091fdd 187{
5ea8d108
JB
188 struct nbd_config *config = nbd->config;
189 config->blksize = blocksize;
190 config->bytesize = blocksize * nr_blocks;
e544541b
JB
191 if (nbd_is_connected(nbd))
192 nbd_size_update(nbd, bdev);
37091fdd
MP
193}
194
fd8383fd 195static void nbd_end_request(struct nbd_cmd *cmd)
1da177e4 196{
fd8383fd
JB
197 struct nbd_device *nbd = cmd->nbd;
198 struct request *req = blk_mq_rq_from_pdu(cmd);
097c94a4 199 int error = req->errors ? -EIO : 0;
1da177e4 200
fd8383fd 201 dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
d18509f5 202 error ? "failed" : "done");
1da177e4 203
fd8383fd 204 blk_mq_complete_request(req, error);
1da177e4
LT
205}
206
e018e757
MP
207/*
208 * Forcibly shutdown the socket causing all listeners to error
209 */
36e47bee 210static void sock_shutdown(struct nbd_device *nbd)
7fdfd406 211{
5ea8d108 212 struct nbd_config *config = nbd->config;
9561a7ad 213 int i;
23272a67 214
5ea8d108 215 if (config->num_connections == 0)
9561a7ad 216 return;
5ea8d108 217 if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
260bbce4 218 return;
23272a67 219
5ea8d108
JB
220 for (i = 0; i < config->num_connections; i++) {
221 struct nbd_sock *nsock = config->socks[i];
9561a7ad
JB
222 mutex_lock(&nsock->tx_lock);
223 kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
5ea8d108 224 nbd_mark_nsock_dead(nsock);
9561a7ad
JB
225 mutex_unlock(&nsock->tx_lock);
226 }
227 dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
7fdfd406
PC
228}
229
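/*
 * blk-mq timeout handler: with more than one connection the timed-out
 * request is requeued so the submit path can retry it on another socket;
 * with a single connection the sockets are shut down and the request is
 * failed with -EIO.
 */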
0eadf37a
JB
230static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
231 bool reserved)
7fdfd406 232{
0eadf37a
JB
233 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
234 struct nbd_device *nbd = cmd->nbd;
5ea8d108
JB
235 struct nbd_config *config;
236
237 if (!refcount_inc_not_zero(&nbd->config_refs)) {
238 req->errors = -EIO;
239 return BLK_EH_HANDLED;
240 }
241
242 config = nbd->config;
dcc909d9 243
5ea8d108 244 if (config->num_connections > 1) {
f3733247
JB
245 dev_err_ratelimited(nbd_to_dev(nbd),
246 "Connection timed out, retrying\n");
f3733247
JB
247 /*
248 * Hooray we have more connections, requeue this IO, the submit
249 * path will put it on a real connection.
250 */
5ea8d108
JB
251 if (config->socks && config->num_connections > 1) {
252 if (cmd->index < config->num_connections) {
f3733247 253 struct nbd_sock *nsock =
5ea8d108 254 config->socks[cmd->index];
f3733247
JB
255 mutex_lock(&nsock->tx_lock);
256 nbd_mark_nsock_dead(nsock);
257 mutex_unlock(&nsock->tx_lock);
258 }
f3733247 259 blk_mq_requeue_request(req, true);
5ea8d108 260 nbd_config_put(nbd);
f3733247
JB
261 return BLK_EH_NOT_HANDLED;
262 }
f3733247
JB
263 } else {
264 dev_err_ratelimited(nbd_to_dev(nbd),
265 "Connection timed out\n");
266 }
5ea8d108 267 set_bit(NBD_TIMEDOUT, &config->runtime_flags);
c103b4da 268 req->errors = -EIO;
9561a7ad 269 sock_shutdown(nbd);
5ea8d108
JB
270 nbd_config_put(nbd);
271
0eadf37a 272 return BLK_EH_HANDLED;
7fdfd406
PC
273}
274
1da177e4
LT
275/*
276 * Send or receive packet.
277 */
c9f2b6ae 278static int sock_xmit(struct nbd_device *nbd, int index, int send,
9dd5d3ab 279 struct iov_iter *iter, int msg_flags, int *sent)
1da177e4 280{
5ea8d108
JB
281 struct nbd_config *config = nbd->config;
282 struct socket *sock = config->socks[index]->sock;
1da177e4
LT
283 int result;
284 struct msghdr msg;
7f338fe4 285 unsigned long pflags = current->flags;
1da177e4 286
ffc41cf8 287 if (unlikely(!sock)) {
a897b666 288 dev_err_ratelimited(disk_to_dev(nbd->disk),
7f1b90f9
WC
289 "Attempted %s on closed socket in sock_xmit\n",
290 (send ? "send" : "recv"));
ffc41cf8
MS
291 return -EINVAL;
292 }
293
c9f2b6ae 294 msg.msg_iter = *iter;
c1696cab 295
7f338fe4 296 current->flags |= PF_MEMALLOC;
1da177e4 297 do {
7f338fe4 298 sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
1da177e4
LT
299 msg.msg_name = NULL;
300 msg.msg_namelen = 0;
301 msg.msg_control = NULL;
302 msg.msg_controllen = 0;
1da177e4
LT
303 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
304
7e2893a1 305 if (send)
c1696cab 306 result = sock_sendmsg(sock, &msg);
7e2893a1 307 else
c1696cab 308 result = sock_recvmsg(sock, &msg, msg.msg_flags);
1da177e4 309
1da177e4
LT
310 if (result <= 0) {
311 if (result == 0)
312 result = -EPIPE; /* short read */
313 break;
314 }
9dd5d3ab
JB
315 if (sent)
316 *sent += result;
c1696cab 317 } while (msg_data_left(&msg));
1da177e4 318
7f338fe4 319 tsk_restore_flags(current, pflags, PF_MEMALLOC);
1da177e4
LT
320
321 return result;
322}
323
7fdfd406 324/* always call with the tx_lock held */
9561a7ad 325static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
1da177e4 326{
fd8383fd 327 struct request *req = blk_mq_rq_from_pdu(cmd);
5ea8d108
JB
328 struct nbd_config *config = nbd->config;
329 struct nbd_sock *nsock = config->socks[index];
d61b7f97 330 int result;
c9f2b6ae
AV
331 struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
332 struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
333 struct iov_iter from;
1011c1b9 334 unsigned long size = blk_rq_bytes(req);
429a787b 335 struct bio *bio;
9dc6c806 336 u32 type;
9561a7ad 337 u32 tag = blk_mq_unique_tag(req);
9dd5d3ab 338 int sent = nsock->sent, skip = 0;
9dc6c806 339
c9f2b6ae
AV
340 iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
341
aebf526b
CH
342 switch (req_op(req)) {
343 case REQ_OP_DISCARD:
9dc6c806 344 type = NBD_CMD_TRIM;
aebf526b
CH
345 break;
346 case REQ_OP_FLUSH:
9dc6c806 347 type = NBD_CMD_FLUSH;
aebf526b
CH
348 break;
349 case REQ_OP_WRITE:
9dc6c806 350 type = NBD_CMD_WRITE;
aebf526b
CH
351 break;
352 case REQ_OP_READ:
9dc6c806 353 type = NBD_CMD_READ;
aebf526b
CH
354 break;
355 default:
356 return -EIO;
357 }
1da177e4 358
09fc54cc 359 if (rq_data_dir(req) == WRITE &&
5ea8d108 360 (config->flags & NBD_FLAG_READ_ONLY)) {
09fc54cc
CH
361 dev_err_ratelimited(disk_to_dev(nbd->disk),
362 "Write on read-only\n");
363 return -EIO;
364 }
365
9dd5d3ab
JB
366 /* We did a partial send previously, and we at least sent the whole
367 * request struct, so just go and send the rest of the pages in the
368 * request.
369 */
370 if (sent) {
371 if (sent >= sizeof(request)) {
372 skip = sent - sizeof(request);
373 goto send_pages;
374 }
375 iov_iter_advance(&from, sent);
376 }
f3733247 377 cmd->index = index;
9dc6c806 378 request.type = htonl(type);
9561a7ad 379 if (type != NBD_CMD_FLUSH) {
75f187ab
AB
380 request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
381 request.len = htonl(size);
382 }
9561a7ad 383 memcpy(request.handle, &tag, sizeof(tag));
1da177e4 384
d18509f5 385 dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
fd8383fd 386 cmd, nbdcmd_to_ascii(type),
d18509f5 387 (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
c9f2b6ae 388 result = sock_xmit(nbd, index, 1, &from,
9dd5d3ab 389 (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
1da177e4 390 if (result <= 0) {
9dd5d3ab
JB
391 if (result == -ERESTARTSYS) {
392 /* If we haven't sent anything we can just return BUSY,
393 * however if we have sent something we need to make
394 * sure we only allow this req to be sent until we are
395 * completely done.
396 */
397 if (sent) {
398 nsock->pending = req;
399 nsock->sent = sent;
400 }
401 return BLK_MQ_RQ_QUEUE_BUSY;
402 }
a897b666 403 dev_err_ratelimited(disk_to_dev(nbd->disk),
7f1b90f9 404 "Send control failed (result %d)\n", result);
f3733247 405 return -EAGAIN;
1da177e4 406 }
9dd5d3ab 407send_pages:
429a787b 408 if (type != NBD_CMD_WRITE)
9dd5d3ab 409 goto out;
429a787b 410
429a787b
JA
411 bio = req->bio;
412 while (bio) {
413 struct bio *next = bio->bi_next;
414 struct bvec_iter iter;
7988613b 415 struct bio_vec bvec;
429a787b
JA
416
417 bio_for_each_segment(bvec, bio, iter) {
418 bool is_last = !next && bio_iter_last(bvec, iter);
d61b7f97 419 int flags = is_last ? 0 : MSG_MORE;
429a787b 420
d18509f5 421 dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
fd8383fd 422 cmd, bvec.bv_len);
c9f2b6ae
AV
423 iov_iter_bvec(&from, ITER_BVEC | WRITE,
424 &bvec, 1, bvec.bv_len);
9dd5d3ab
JB
425 if (skip) {
426 if (skip >= iov_iter_count(&from)) {
427 skip -= iov_iter_count(&from);
428 continue;
429 }
430 iov_iter_advance(&from, skip);
431 skip = 0;
432 }
433 result = sock_xmit(nbd, index, 1, &from, flags, &sent);
6c92e699 434 if (result <= 0) {
9dd5d3ab
JB
435 if (result == -ERESTARTSYS) {
436 /* We've already sent the header, we
437 * have no choice but to set pending and
438 * return BUSY.
439 */
440 nsock->pending = req;
441 nsock->sent = sent;
442 return BLK_MQ_RQ_QUEUE_BUSY;
443 }
f4507164 444 dev_err(disk_to_dev(nbd->disk),
7f1b90f9
WC
445 "Send data failed (result %d)\n",
446 result);
f3733247 447 return -EAGAIN;
6c92e699 448 }
429a787b
JA
449 /*
450 * The completion might already have come in,
451 * so break for the last one instead of letting
452 * the iterator do it. This prevents use-after-free
453 * of the bio.
454 */
455 if (is_last)
456 break;
1da177e4 457 }
429a787b 458 bio = next;
1da177e4 459 }
9dd5d3ab
JB
460out:
461 nsock->pending = NULL;
462 nsock->sent = 0;
1da177e4 463 return 0;
1da177e4
LT
464}
465
5ea8d108 466static int nbd_disconnected(struct nbd_config *config)
f3733247 467{
5ea8d108
JB
468 return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
469 test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
f3733247
JB
470}
471
1da177e4 472/* NULL returned = something went wrong, inform userspace */
9561a7ad 473static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
1da177e4 474{
5ea8d108 475 struct nbd_config *config = nbd->config;
1da177e4
LT
476 int result;
477 struct nbd_reply reply;
fd8383fd
JB
478 struct nbd_cmd *cmd;
479 struct request *req = NULL;
480 u16 hwq;
9561a7ad 481 u32 tag;
c9f2b6ae
AV
482 struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
483 struct iov_iter to;
1da177e4
LT
484
485 reply.magic = 0;
c9f2b6ae 486 iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
9dd5d3ab 487 result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
1da177e4 488 if (result <= 0) {
5ea8d108 489 if (!nbd_disconnected(config))
9561a7ad
JB
490 dev_err(disk_to_dev(nbd->disk),
491 "Receive control failed (result %d)\n", result);
19391830 492 return ERR_PTR(result);
1da177e4 493 }
e4b57e08
MF
494
495 if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
f4507164 496 dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
e4b57e08 497 (unsigned long)ntohl(reply.magic));
19391830 498 return ERR_PTR(-EPROTO);
e4b57e08
MF
499 }
500
9561a7ad 501 memcpy(&tag, reply.handle, sizeof(u32));
4b2f0260 502
fd8383fd
JB
503 hwq = blk_mq_unique_tag_to_hwq(tag);
504 if (hwq < nbd->tag_set.nr_hw_queues)
505 req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
506 blk_mq_unique_tag_to_tag(tag));
507 if (!req || !blk_mq_request_started(req)) {
508 dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
509 tag, req);
510 return ERR_PTR(-ENOENT);
1da177e4 511 }
fd8383fd 512 cmd = blk_mq_rq_to_pdu(req);
1da177e4 513 if (ntohl(reply.error)) {
f4507164 514 dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
7f1b90f9 515 ntohl(reply.error));
c103b4da 516 req->errors = -EIO;
fd8383fd 517 return cmd;
1da177e4
LT
518 }
519
fd8383fd 520 dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd);
9dc6c806 521 if (rq_data_dir(req) != WRITE) {
5705f702 522 struct req_iterator iter;
7988613b 523 struct bio_vec bvec;
5705f702
N
524
525 rq_for_each_segment(bvec, req, iter) {
c9f2b6ae
AV
526 iov_iter_bvec(&to, ITER_BVEC | READ,
527 &bvec, 1, bvec.bv_len);
9dd5d3ab 528 result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
6c92e699 529 if (result <= 0) {
f4507164 530 dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
7f1b90f9 531 result);
f3733247
JB
532 /*
533 * If we've disconnected or we only have 1
534 * connection then we need to make sure we
535 * complete this request, otherwise error out
536 * and let the timeout stuff handle resubmitting
537 * this request onto another connection.
538 */
5ea8d108
JB
539 if (nbd_disconnected(config) ||
540 config->num_connections <= 1) {
f3733247
JB
541 req->errors = -EIO;
542 return cmd;
543 }
544 return ERR_PTR(-EIO);
6c92e699 545 }
d18509f5 546 dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
fd8383fd 547 cmd, bvec.bv_len);
1da177e4 548 }
9561a7ad
JB
549 } else {
550 /* See the comment in nbd_queue_rq. */
551 wait_for_completion(&cmd->send_complete);
1da177e4 552 }
fd8383fd 553 return cmd;
1da177e4
LT
554}
555
9561a7ad 556static void recv_work(struct work_struct *work)
1da177e4 557{
9561a7ad
JB
558 struct recv_thread_args *args = container_of(work,
559 struct recv_thread_args,
560 work);
561 struct nbd_device *nbd = args->nbd;
5ea8d108 562 struct nbd_config *config = nbd->config;
fd8383fd 563 struct nbd_cmd *cmd;
9561a7ad 564 int ret = 0;
1da177e4 565
19391830 566 while (1) {
9561a7ad 567 cmd = nbd_read_stat(nbd, args->index);
fd8383fd 568 if (IS_ERR(cmd)) {
5ea8d108 569 struct nbd_sock *nsock = config->socks[args->index];
f3733247
JB
570
571 mutex_lock(&nsock->tx_lock);
572 nbd_mark_nsock_dead(nsock);
573 mutex_unlock(&nsock->tx_lock);
fd8383fd 574 ret = PTR_ERR(cmd);
19391830
MP
575 break;
576 }
577
fd8383fd 578 nbd_end_request(cmd);
19391830 579 }
5ea8d108
JB
580 atomic_dec(&config->recv_threads);
581 wake_up(&config->recv_wq);
582 nbd_config_put(nbd);
583 kfree(args);
1da177e4
LT
584}
585
fd8383fd 586static void nbd_clear_req(struct request *req, void *data, bool reserved)
1da177e4 587{
fd8383fd 588 struct nbd_cmd *cmd;
1da177e4 589
fd8383fd
JB
590 if (!blk_mq_request_started(req))
591 return;
592 cmd = blk_mq_rq_to_pdu(req);
c103b4da 593 req->errors = -EIO;
fd8383fd
JB
594 nbd_end_request(cmd);
595}
596
597static void nbd_clear_que(struct nbd_device *nbd)
598{
fd8383fd 599 blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
e78273c8 600 dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
1da177e4
LT
601}
602
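/*
 * Pick another live connection to retry on: reuse the cached
 * fallback_index if that socket is still alive, otherwise scan for the
 * first connection that has not been marked dead. Returns -1 if none
 * is usable.
 */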
f3733247
JB
603static int find_fallback(struct nbd_device *nbd, int index)
604{
5ea8d108 605 struct nbd_config *config = nbd->config;
f3733247 606 int new_index = -1;
5ea8d108 607 struct nbd_sock *nsock = config->socks[index];
f3733247
JB
608 int fallback = nsock->fallback_index;
609
5ea8d108 610 if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
f3733247
JB
611 return new_index;
612
5ea8d108 613 if (config->num_connections <= 1) {
f3733247
JB
614 dev_err_ratelimited(disk_to_dev(nbd->disk),
615 "Attempted send on invalid socket\n");
616 return new_index;
617 }
618
5ea8d108
JB
619 if (fallback >= 0 && fallback < config->num_connections &&
620 !config->socks[fallback]->dead)
f3733247
JB
621 return fallback;
622
623 if (nsock->fallback_index < 0 ||
5ea8d108
JB
624 nsock->fallback_index >= config->num_connections ||
625 config->socks[nsock->fallback_index]->dead) {
f3733247 626 int i;
5ea8d108 627 for (i = 0; i < config->num_connections; i++) {
f3733247
JB
628 if (i == index)
629 continue;
5ea8d108 630 if (!config->socks[i]->dead) {
f3733247
JB
631 new_index = i;
632 break;
633 }
634 }
635 nsock->fallback_index = new_index;
636 if (new_index < 0) {
637 dev_err_ratelimited(disk_to_dev(nbd->disk),
638 "Dead connection, failed to find a fallback\n");
639 return new_index;
640 }
641 }
642 new_index = nsock->fallback_index;
643 return new_index;
644}
7fdfd406 645
9dd5d3ab 646static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
48cf6061 647{
fd8383fd
JB
648 struct request *req = blk_mq_rq_from_pdu(cmd);
649 struct nbd_device *nbd = cmd->nbd;
5ea8d108 650 struct nbd_config *config;
9561a7ad 651 struct nbd_sock *nsock;
9dd5d3ab 652 int ret;
fd8383fd 653
5ea8d108
JB
654 if (!refcount_inc_not_zero(&nbd->config_refs)) {
655 dev_err_ratelimited(disk_to_dev(nbd->disk),
656 "Socks array is empty\n");
657 return -EINVAL;
658 }
659 config = nbd->config;
660
661 if (index >= config->num_connections) {
a897b666
JB
662 dev_err_ratelimited(disk_to_dev(nbd->disk),
663 "Attempted send on invalid socket\n");
5ea8d108 664 nbd_config_put(nbd);
9dd5d3ab 665 return -EINVAL;
9561a7ad 666 }
48cf6061 667 req->errors = 0;
f3733247 668again:
5ea8d108 669 nsock = config->socks[index];
9561a7ad 670 mutex_lock(&nsock->tx_lock);
f3733247
JB
671 if (nsock->dead) {
672 index = find_fallback(nbd, index);
5ea8d108
JB
673 if (index < 0) {
674 ret = -EIO;
675 goto out;
676 }
9561a7ad 677 mutex_unlock(&nsock->tx_lock);
f3733247 678 goto again;
48cf6061
LV
679 }
680
9dd5d3ab
JB
681 /* Handle the case that we have a pending request that was partially
682 * transmitted that _has_ to be serviced first. We need to call requeue
683 * here so that it gets put _after_ the request that is already on the
684 * dispatch list.
685 */
686 if (unlikely(nsock->pending && nsock->pending != req)) {
687 blk_mq_requeue_request(req, true);
688 ret = 0;
689 goto out;
48cf6061 690 }
f3733247
JB
691 /*
692 * Some failures are related to the link going down, so anything that
693 * returns EAGAIN can be retried on a different socket.
694 */
9dd5d3ab 695 ret = nbd_send_cmd(nbd, cmd, index);
f3733247
JB
696 if (ret == -EAGAIN) {
697 dev_err_ratelimited(disk_to_dev(nbd->disk),
698 "Request send failed trying another connection\n");
699 nbd_mark_nsock_dead(nsock);
700 mutex_unlock(&nsock->tx_lock);
701 goto again;
702 }
9dd5d3ab 703out:
9561a7ad 704 mutex_unlock(&nsock->tx_lock);
5ea8d108 705 nbd_config_put(nbd);
9dd5d3ab 706 return ret;
48cf6061
LV
707}
708
fd8383fd
JB
709static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
710 const struct blk_mq_queue_data *bd)
1da177e4 711{
fd8383fd 712 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
9dd5d3ab 713 int ret;
1da177e4 714
9561a7ad
JB
715 /*
716 * Since we look at the bio's to send the request over the network we
717 * need to make sure the completion work doesn't mark this request done
718 * before we are done doing our send. This keeps us from dereferencing
719 * freed data if we have particularly fast completions (ie we get the
720 * completion before we exit sock_xmit on the last bvec) or in the case
721 * that the server is misbehaving (or there was an error) before we're
722 * done sending everything over the wire.
723 */
724 init_completion(&cmd->send_complete);
fd8383fd 725 blk_mq_start_request(bd->rq);
9dd5d3ab
JB
726
727 /* We can be called directly from the user space process, which means we
728 * could possibly have signals pending so our sendmsg will fail. In
729 * this case we need to return that we are busy, otherwise error out as
730 * appropriate.
731 */
732 ret = nbd_handle_cmd(cmd, hctx->queue_num);
733 if (ret < 0)
734 ret = BLK_MQ_RQ_QUEUE_ERROR;
735 if (!ret)
736 ret = BLK_MQ_RQ_QUEUE_OK;
9561a7ad
JB
737 complete(&cmd->send_complete);
738
9dd5d3ab 739 return ret;
1da177e4
LT
740}
741
9442b739
JB
742static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
743 unsigned long arg)
23272a67 744{
5ea8d108 745 struct nbd_config *config = nbd->config;
9442b739 746 struct socket *sock;
9561a7ad
JB
747 struct nbd_sock **socks;
748 struct nbd_sock *nsock;
9442b739
JB
749 int err;
750
751 sock = sockfd_lookup(arg, &err);
752 if (!sock)
753 return err;
23272a67 754
9561a7ad
JB
755 if (!nbd->task_setup)
756 nbd->task_setup = current;
757 if (nbd->task_setup != current) {
758 dev_err(disk_to_dev(nbd->disk),
759 "Device being setup by another task");
9b1355d5 760 sockfd_put(sock);
9561a7ad 761 return -EINVAL;
23272a67
MP
762 }
763
5ea8d108 764 socks = krealloc(config->socks, (config->num_connections + 1) *
9561a7ad 765 sizeof(struct nbd_sock *), GFP_KERNEL);
9b1355d5
JB
766 if (!socks) {
767 sockfd_put(sock);
9561a7ad 768 return -ENOMEM;
9b1355d5 769 }
9561a7ad 770 nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
9b1355d5
JB
771 if (!nsock) {
772 sockfd_put(sock);
9561a7ad 773 return -ENOMEM;
9b1355d5 774 }
9561a7ad 775
5ea8d108 776 config->socks = socks;
23272a67 777
f3733247
JB
778 nsock->fallback_index = -1;
779 nsock->dead = false;
9561a7ad
JB
780 mutex_init(&nsock->tx_lock);
781 nsock->sock = sock;
9dd5d3ab
JB
782 nsock->pending = NULL;
783 nsock->sent = 0;
5ea8d108 784 socks[config->num_connections++] = nsock;
23272a67 785
9442b739
JB
786 if (max_part)
787 bdev->bd_invalidated = 1;
9561a7ad 788 return 0;
23272a67
MP
789}
790
0e4f0f6f
MP
791/* Reset all properties of an NBD device */
792static void nbd_reset(struct nbd_device *nbd)
793{
5ea8d108 794 nbd->config = NULL;
0eadf37a 795 nbd->tag_set.timeout = 0;
0e4f0f6f 796 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
0e4f0f6f
MP
797}
798
799static void nbd_bdev_reset(struct block_device *bdev)
800{
abbbdf12
RMB
801 if (bdev->bd_openers > 1)
802 return;
0e4f0f6f
MP
803 set_device_ro(bdev, false);
804 bdev->bd_inode->i_size = 0;
805 if (max_part > 0) {
806 blkdev_reread_part(bdev);
807 bdev->bd_invalidated = 1;
808 }
809}
810
d02cf531
MP
811static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
812{
5ea8d108
JB
813 struct nbd_config *config = nbd->config;
814 if (config->flags & NBD_FLAG_READ_ONLY)
d02cf531 815 set_device_ro(bdev, true);
5ea8d108 816 if (config->flags & NBD_FLAG_SEND_TRIM)
d02cf531 817 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
5ea8d108 818 if (config->flags & NBD_FLAG_SEND_FLUSH)
aafb1eec 819 blk_queue_write_cache(nbd->disk->queue, true, false);
d02cf531 820 else
aafb1eec 821 blk_queue_write_cache(nbd->disk->queue, false, false);
d02cf531
MP
822}
823
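/* Send an NBD_CMD_DISC request on every configured connection. */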
9561a7ad
JB
824static void send_disconnects(struct nbd_device *nbd)
825{
5ea8d108 826 struct nbd_config *config = nbd->config;
c9f2b6ae
AV
827 struct nbd_request request = {
828 .magic = htonl(NBD_REQUEST_MAGIC),
829 .type = htonl(NBD_CMD_DISC),
830 };
831 struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
832 struct iov_iter from;
9561a7ad
JB
833 int i, ret;
834
5ea8d108 835 for (i = 0; i < config->num_connections; i++) {
c9f2b6ae 836 iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
9dd5d3ab 837 ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
9561a7ad
JB
838 if (ret <= 0)
839 dev_err(disk_to_dev(nbd->disk),
840 "Send disconnect failed %d\n", ret);
841 }
842}
843
9442b739
JB
844static int nbd_disconnect(struct nbd_device *nbd, struct block_device *bdev)
845{
5ea8d108 846 struct nbd_config *config = nbd->config;
30d53d9c 847
5ea8d108 848 dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
9442b739
JB
849 mutex_unlock(&nbd->config_lock);
850 fsync_bdev(bdev);
851 mutex_lock(&nbd->config_lock);
852
9442b739 853 if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
5ea8d108 854 &config->runtime_flags))
9442b739
JB
855 send_disconnects(nbd);
856 return 0;
857}
858
859static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev)
1a2ad211 860{
9442b739
JB
861 sock_shutdown(nbd);
862 nbd_clear_que(nbd);
abbbdf12
RMB
863
864 __invalidate_device(bdev, true);
9442b739 865 nbd_bdev_reset(bdev);
5ea8d108
JB
866 nbd->task_setup = NULL;
867 return 0;
868}
869
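/*
 * Drop a reference on the config; the last put tears down the sockets,
 * debugfs entries, pid attribute and device size under config_lock and
 * releases the module reference taken in nbd_alloc_config().
 */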
870static void nbd_config_put(struct nbd_device *nbd)
871{
872 if (refcount_dec_and_mutex_lock(&nbd->config_refs,
873 &nbd->config_lock)) {
874 struct block_device *bdev;
875 struct nbd_config *config = nbd->config;
9442b739 876
5ea8d108
JB
877 bdev = bdget_disk(nbd->disk, 0);
878 if (!bdev) {
879 mutex_unlock(&nbd->config_lock);
880 return;
6a8a2154 881 }
9561a7ad 882
5ea8d108
JB
883 nbd_dev_dbg_close(nbd);
884 nbd_size_clear(nbd, bdev);
885 if (test_and_clear_bit(NBD_HAS_PID_FILE,
886 &config->runtime_flags))
887 device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
888 nbd->task_recv = NULL;
889 nbd_clear_sock(nbd, bdev);
890 if (config->num_connections) {
891 int i;
892 for (i = 0; i < config->num_connections; i++) {
893 sockfd_put(config->socks[i]->sock);
894 kfree(config->socks[i]);
895 }
896 kfree(config->socks);
897 }
898 nbd_reset(nbd);
899 mutex_unlock(&nbd->config_lock);
900 bdput(bdev);
901 module_put(THIS_MODULE);
902 }
9442b739
JB
903}
904
905static int nbd_start_device(struct nbd_device *nbd, struct block_device *bdev)
906{
5ea8d108
JB
907 struct nbd_config *config = nbd->config;
908 int num_connections = config->num_connections;
9442b739 909 int error = 0, i;
1a2ad211 910
9442b739
JB
911 if (nbd->task_recv)
912 return -EBUSY;
5ea8d108 913 if (!config->socks)
9442b739 914 return -EINVAL;
5ea8d108 915
9442b739 916 if (num_connections > 1 &&
5ea8d108 917 !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
9442b739 918 dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
5ea8d108 919 return -EINVAL;
9442b739 920 }
23272a67 921
5ea8d108 922 blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
9442b739
JB
923 nbd->task_recv = current;
924 mutex_unlock(&nbd->config_lock);
23272a67 925
9442b739 926 nbd_parse_flags(nbd, bdev);
23272a67 927
9442b739
JB
928 error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
929 if (error) {
930 dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
5ea8d108 931 return error;
1a2ad211 932 }
5ea8d108 933 set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
1a2ad211 934
9442b739 935 nbd_size_update(nbd, bdev);
37091fdd 936
9442b739
JB
937 nbd_dev_dbg_init(nbd);
938 for (i = 0; i < num_connections; i++) {
5ea8d108
JB
939 struct recv_thread_args *args;
940
941 args = kzalloc(sizeof(*args), GFP_KERNEL);
942 if (!args) {
943 sock_shutdown(nbd);
944 return -ENOMEM;
945 }
946 sk_set_memalloc(config->socks[i]->sock->sk);
947 atomic_inc(&config->recv_threads);
948 refcount_inc(&nbd->config_refs);
949 INIT_WORK(&args->work, recv_work);
950 args->nbd = nbd;
951 args->index = i;
952 queue_work(recv_workqueue, &args->work);
37091fdd 953 }
5ea8d108
JB
954 error = wait_event_interruptible(config->recv_wq,
955 atomic_read(&config->recv_threads) == 0);
956 if (error)
957 sock_shutdown(nbd);
9442b739 958 mutex_lock(&nbd->config_lock);
9442b739
JB
959
960 /* user requested, ignore socket errors */
5ea8d108 961 if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
9442b739 962 error = 0;
5ea8d108 963 if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
9442b739 964 error = -ETIMEDOUT;
9442b739
JB
965 return error;
966}
967
968/* Must be called with config_lock held */
969static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
970 unsigned int cmd, unsigned long arg)
971{
5ea8d108
JB
972 struct nbd_config *config = nbd->config;
973
9442b739
JB
974 switch (cmd) {
975 case NBD_DISCONNECT:
976 return nbd_disconnect(nbd, bdev);
977 case NBD_CLEAR_SOCK:
978 return nbd_clear_sock(nbd, bdev);
979 case NBD_SET_SOCK:
980 return nbd_add_socket(nbd, bdev, arg);
981 case NBD_SET_BLKSIZE:
e544541b 982 nbd_size_set(nbd, bdev, arg,
5ea8d108 983 div_s64(config->bytesize, arg));
e544541b 984 return 0;
1da177e4 985 case NBD_SET_SIZE:
5ea8d108
JB
986 nbd_size_set(nbd, bdev, config->blksize,
987 div_s64(arg, config->blksize));
e544541b 988 return 0;
37091fdd 989 case NBD_SET_SIZE_BLOCKS:
5ea8d108 990 nbd_size_set(nbd, bdev, config->blksize, arg);
e544541b 991 return 0;
7fdfd406 992 case NBD_SET_TIMEOUT:
f8586855
JB
993 if (arg) {
994 nbd->tag_set.timeout = arg * HZ;
995 blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
996 }
7fdfd406 997 return 0;
1a2ad211 998
2f012508 999 case NBD_SET_FLAGS:
5ea8d108 1000 config->flags = arg;
2f012508 1001 return 0;
9442b739
JB
1002 case NBD_DO_IT:
1003 return nbd_start_device(nbd, bdev);
1da177e4 1004 case NBD_CLEAR_QUE:
4b2f0260
HX
1005 /*
1006 * This is for compatibility only. The queue is always cleared
1007 * by NBD_DO_IT or NBD_CLEAR_SOCK.
1008 */
1da177e4
LT
1009 return 0;
1010 case NBD_PRINT_DEBUG:
fd8383fd
JB
1011 /*
1012 * For compatibility only, we no longer keep a list of
1013 * outstanding requests.
1014 */
1da177e4
LT
1015 return 0;
1016 }
1a2ad211
PM
1017 return -ENOTTY;
1018}
1019
1020static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
1021 unsigned int cmd, unsigned long arg)
1022{
f4507164 1023 struct nbd_device *nbd = bdev->bd_disk->private_data;
1a2ad211
PM
1024 int error;
1025
1026 if (!capable(CAP_SYS_ADMIN))
1027 return -EPERM;
1028
9561a7ad 1029 mutex_lock(&nbd->config_lock);
f4507164 1030 error = __nbd_ioctl(bdev, nbd, cmd, arg);
9561a7ad 1031 mutex_unlock(&nbd->config_lock);
1a2ad211 1032 return error;
1da177e4
LT
1033}
1034
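/*
 * Allocate a zeroed nbd_config with the default 1024-byte block size and
 * take a module reference that nbd_config_put() drops on the final put.
 */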
5ea8d108
JB
1035static struct nbd_config *nbd_alloc_config(void)
1036{
1037 struct nbd_config *config;
1038
1039 config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
1040 if (!config)
1041 return NULL;
1042 atomic_set(&config->recv_threads, 0);
1043 init_waitqueue_head(&config->recv_wq);
1044 config->blksize = 1024;
1045 try_module_get(THIS_MODULE);
1046 return config;
1047}
1048
1049static int nbd_open(struct block_device *bdev, fmode_t mode)
1050{
1051 struct nbd_device *nbd;
1052 int ret = 0;
1053
1054 mutex_lock(&nbd_index_mutex);
1055 nbd = bdev->bd_disk->private_data;
1056 if (!nbd) {
1057 ret = -ENXIO;
1058 goto out;
1059 }
1060 if (!refcount_inc_not_zero(&nbd->config_refs)) {
1061 struct nbd_config *config;
1062
1063 mutex_lock(&nbd->config_lock);
1064 if (refcount_inc_not_zero(&nbd->config_refs)) {
1065 mutex_unlock(&nbd->config_lock);
1066 goto out;
1067 }
1068 config = nbd->config = nbd_alloc_config();
1069 if (!config) {
1070 ret = -ENOMEM;
1071 mutex_unlock(&nbd->config_lock);
1072 goto out;
1073 }
1074 refcount_set(&nbd->config_refs, 1);
1075 mutex_unlock(&nbd->config_lock);
1076 }
1077out:
1078 mutex_unlock(&nbd_index_mutex);
1079 return ret;
1080}
1081
1082static void nbd_release(struct gendisk *disk, fmode_t mode)
1083{
1084 struct nbd_device *nbd = disk->private_data;
1085 nbd_config_put(nbd);
1086}
1087
83d5cde4 1088static const struct block_device_operations nbd_fops =
1da177e4
LT
1089{
1090 .owner = THIS_MODULE,
5ea8d108
JB
1091 .open = nbd_open,
1092 .release = nbd_release,
8a6cfeb6 1093 .ioctl = nbd_ioctl,
263a3df1 1094 .compat_ioctl = nbd_ioctl,
1da177e4
LT
1095};
1096
30d53d9c
MP
1097#if IS_ENABLED(CONFIG_DEBUG_FS)
1098
1099static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
1100{
1101 struct nbd_device *nbd = s->private;
1102
1103 if (nbd->task_recv)
1104 seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
30d53d9c
MP
1105
1106 return 0;
1107}
1108
1109static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
1110{
1111 return single_open(file, nbd_dbg_tasks_show, inode->i_private);
1112}
1113
1114static const struct file_operations nbd_dbg_tasks_ops = {
1115 .open = nbd_dbg_tasks_open,
1116 .read = seq_read,
1117 .llseek = seq_lseek,
1118 .release = single_release,
1119};
1120
1121static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
1122{
1123 struct nbd_device *nbd = s->private;
5ea8d108 1124 u32 flags = nbd->config->flags;
30d53d9c
MP
1125
1126 seq_printf(s, "Hex: 0x%08x\n\n", flags);
1127
1128 seq_puts(s, "Known flags:\n");
1129
1130 if (flags & NBD_FLAG_HAS_FLAGS)
1131 seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
1132 if (flags & NBD_FLAG_READ_ONLY)
1133 seq_puts(s, "NBD_FLAG_READ_ONLY\n");
1134 if (flags & NBD_FLAG_SEND_FLUSH)
1135 seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
1136 if (flags & NBD_FLAG_SEND_TRIM)
1137 seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
1138
1139 return 0;
1140}
1141
1142static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
1143{
1144 return single_open(file, nbd_dbg_flags_show, inode->i_private);
1145}
1146
1147static const struct file_operations nbd_dbg_flags_ops = {
1148 .open = nbd_dbg_flags_open,
1149 .read = seq_read,
1150 .llseek = seq_lseek,
1151 .release = single_release,
1152};
1153
1154static int nbd_dev_dbg_init(struct nbd_device *nbd)
1155{
1156 struct dentry *dir;
5ea8d108 1157 struct nbd_config *config = nbd->config;
27ea43fe
MP
1158
1159 if (!nbd_dbg_dir)
1160 return -EIO;
30d53d9c
MP
1161
1162 dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
27ea43fe
MP
1163 if (!dir) {
1164 dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
1165 nbd_name(nbd));
1166 return -EIO;
30d53d9c 1167 }
5ea8d108 1168 config->dbg_dir = dir;
30d53d9c 1169
27ea43fe 1170 debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
5ea8d108 1171 debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
0eadf37a 1172 debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
5ea8d108 1173 debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
d366a0ff 1174 debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
30d53d9c
MP
1175
1176 return 0;
1177}
1178
1179static void nbd_dev_dbg_close(struct nbd_device *nbd)
1180{
5ea8d108 1181 debugfs_remove_recursive(nbd->config->dbg_dir);
30d53d9c
MP
1182}
1183
1184static int nbd_dbg_init(void)
1185{
1186 struct dentry *dbg_dir;
1187
1188 dbg_dir = debugfs_create_dir("nbd", NULL);
27ea43fe
MP
1189 if (!dbg_dir)
1190 return -EIO;
30d53d9c
MP
1191
1192 nbd_dbg_dir = dbg_dir;
1193
1194 return 0;
1195}
1196
1197static void nbd_dbg_close(void)
1198{
1199 debugfs_remove_recursive(nbd_dbg_dir);
1200}
1201
1202#else /* IS_ENABLED(CONFIG_DEBUG_FS) */
1203
1204static int nbd_dev_dbg_init(struct nbd_device *nbd)
1205{
1206 return 0;
1207}
1208
1209static void nbd_dev_dbg_close(struct nbd_device *nbd)
1210{
1211}
1212
1213static int nbd_dbg_init(void)
1214{
1215 return 0;
1216}
1217
1218static void nbd_dbg_close(void)
1219{
1220}
1221
1222#endif
1223
fd8383fd
JB
1224static int nbd_init_request(void *data, struct request *rq,
1225 unsigned int hctx_idx, unsigned int request_idx,
1226 unsigned int numa_node)
1227{
1228 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
fd8383fd 1229 cmd->nbd = data;
fd8383fd
JB
1230 return 0;
1231}
1232
f363b089 1233static const struct blk_mq_ops nbd_mq_ops = {
fd8383fd 1234 .queue_rq = nbd_queue_rq,
fd8383fd 1235 .init_request = nbd_init_request,
0eadf37a 1236 .timeout = nbd_xmit_timeout,
fd8383fd
JB
1237};
1238
b0d9111a
JB
1239static void nbd_dev_remove(struct nbd_device *nbd)
1240{
1241 struct gendisk *disk = nbd->disk;
b0d9111a
JB
1242 if (disk) {
1243 del_gendisk(disk);
1244 blk_cleanup_queue(disk->queue);
1245 blk_mq_free_tag_set(&nbd->tag_set);
1246 put_disk(disk);
1247 }
1248 kfree(nbd);
1249}
1250
1251static int nbd_dev_add(int index)
1252{
1253 struct nbd_device *nbd;
1254 struct gendisk *disk;
1255 struct request_queue *q;
1256 int err = -ENOMEM;
1257
1258 nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1259 if (!nbd)
1260 goto out;
1261
1262 disk = alloc_disk(1 << part_shift);
1263 if (!disk)
1264 goto out_free_nbd;
1265
1266 if (index >= 0) {
1267 err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1268 GFP_KERNEL);
1269 if (err == -ENOSPC)
1270 err = -EEXIST;
1271 } else {
1272 err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
1273 if (err >= 0)
1274 index = err;
1275 }
1276 if (err < 0)
1277 goto out_free_disk;
1278
1279 nbd->disk = disk;
1280 nbd->tag_set.ops = &nbd_mq_ops;
1281 nbd->tag_set.nr_hw_queues = 1;
1282 nbd->tag_set.queue_depth = 128;
1283 nbd->tag_set.numa_node = NUMA_NO_NODE;
1284 nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1285 nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1286 BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
1287 nbd->tag_set.driver_data = nbd;
1288
1289 err = blk_mq_alloc_tag_set(&nbd->tag_set);
1290 if (err)
1291 goto out_free_idr;
1292
1293 q = blk_mq_init_queue(&nbd->tag_set);
1294 if (IS_ERR(q)) {
1295 err = PTR_ERR(q);
1296 goto out_free_tags;
1297 }
1298 disk->queue = q;
1299
1300 /*
1301 * Tell the block layer that we are not a rotational device
1302 */
1303 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
1304 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1305 disk->queue->limits.discard_granularity = 512;
1306 blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
b0d9111a
JB
1307 blk_queue_max_hw_sectors(disk->queue, 65536);
1308 disk->queue->limits.max_sectors = 256;
1309
b0d9111a 1310 mutex_init(&nbd->config_lock);
5ea8d108 1311 refcount_set(&nbd->config_refs, 0);
b0d9111a
JB
1312 disk->major = NBD_MAJOR;
1313 disk->first_minor = index << part_shift;
1314 disk->fops = &nbd_fops;
1315 disk->private_data = nbd;
1316 sprintf(disk->disk_name, "nbd%d", index);
b0d9111a
JB
1317 nbd_reset(nbd);
1318 add_disk(disk);
1319 return index;
1320
1321out_free_tags:
1322 blk_mq_free_tag_set(&nbd->tag_set);
1323out_free_idr:
1324 idr_remove(&nbd_index_idr, index);
1325out_free_disk:
1326 put_disk(disk);
1327out_free_nbd:
1328 kfree(nbd);
1329out:
1330 return err;
1331}
1332
1da177e4
LT
1333/*
1334 * And here should be modules and kernel interface
1335 * (Just smiley confuses emacs :-)
1336 */
1337
1338static int __init nbd_init(void)
1339{
1da177e4
LT
1340 int i;
1341
5b7b18cc 1342 BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
1da177e4 1343
d71a6d73 1344 if (max_part < 0) {
7742ce4a 1345 printk(KERN_ERR "nbd: max_part must be >= 0\n");
d71a6d73
LV
1346 return -EINVAL;
1347 }
1348
1349 part_shift = 0;
5988ce23 1350 if (max_part > 0) {
d71a6d73
LV
1351 part_shift = fls(max_part);
1352
5988ce23
NK
1353 /*
1354 * Adjust max_part according to part_shift as it is exported
1355 * to user space so that user can know the max number of
1356 * partition kernel should be able to manage.
1357 *
1358 * Note that -1 is required because partition 0 is reserved
1359 * for the whole disk.
1360 */
1361 max_part = (1UL << part_shift) - 1;
1362 }
1363
3b271082
NK
1364 if ((1UL << part_shift) > DISK_MAX_PARTS)
1365 return -EINVAL;
1366
1367 if (nbds_max > 1UL << (MINORBITS - part_shift))
1368 return -EINVAL;
124d6db0
JB
1369 recv_workqueue = alloc_workqueue("knbd-recv",
1370 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
1371 if (!recv_workqueue)
1372 return -ENOMEM;
3b271082 1373
6330a2d0
JB
1374 if (register_blkdev(NBD_MAJOR, "nbd")) {
1375 destroy_workqueue(recv_workqueue);
b0d9111a 1376 return -EIO;
6330a2d0 1377 }
1da177e4 1378
30d53d9c
MP
1379 nbd_dbg_init();
1380
b0d9111a
JB
1381 mutex_lock(&nbd_index_mutex);
1382 for (i = 0; i < nbds_max; i++)
1383 nbd_dev_add(i);
1384 mutex_unlock(&nbd_index_mutex);
1385 return 0;
1386}
1da177e4 1387
b0d9111a
JB
1388static int nbd_exit_cb(int id, void *ptr, void *data)
1389{
1390 struct nbd_device *nbd = ptr;
1391 nbd_dev_remove(nbd);
1da177e4 1392 return 0;
1da177e4
LT
1393}
1394
1395static void __exit nbd_cleanup(void)
1396{
30d53d9c
MP
1397 nbd_dbg_close();
1398
b0d9111a
JB
1399 idr_for_each(&nbd_index_idr, &nbd_exit_cb, NULL);
1400 idr_destroy(&nbd_index_idr);
124d6db0 1401 destroy_workqueue(recv_workqueue);
1da177e4 1402 unregister_blkdev(NBD_MAJOR, "nbd");
1da177e4
LT
1403}
1404
1405module_init(nbd_init);
1406module_exit(nbd_cleanup);
1407
1408MODULE_DESCRIPTION("Network Block Device");
1409MODULE_LICENSE("GPL");
1410
40be0c28 1411module_param(nbds_max, int, 0444);
d71a6d73
LV
1412MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
1413module_param(max_part, int, 0444);
1414MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");