git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blame - drivers/block/nbd.c
nbd: Aovid double completion of a request
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 *
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * (part of code stolen from loop.c)
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <linux/mutex.h>
#include <linux/compiler.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/net.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/blk-mq.h>

#include <linux/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>
#include <linux/nbd-netlink.h>
#include <net/genetlink.h>

#define CREATE_TRACE_POINTS
#include <trace/events/nbd.h>

static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex);
static int nbd_total_devices = 0;

struct nbd_sock {
	struct socket *sock;
	struct mutex tx_lock;
	struct request *pending;
	int sent;
	bool dead;
	int fallback_index;
	int cookie;
};

struct recv_thread_args {
	struct work_struct work;
	struct nbd_device *nbd;
	int index;
};

struct link_dead_args {
	struct work_struct work;
	int index;
};

#define NBD_RT_TIMEDOUT			0
#define NBD_RT_DISCONNECT_REQUESTED	1
#define NBD_RT_DISCONNECTED		2
#define NBD_RT_HAS_PID_FILE		3
#define NBD_RT_HAS_CONFIG_REF		4
#define NBD_RT_BOUND			5
#define NBD_RT_DISCONNECT_ON_CLOSE	6

#define NBD_DESTROY_ON_DISCONNECT	0
#define NBD_DISCONNECT_REQUESTED	1

struct nbd_config {
	u32 flags;
	unsigned long runtime_flags;
	u64 dead_conn_timeout;

	struct nbd_sock **socks;
	int num_connections;
	atomic_t live_connections;
	wait_queue_head_t conn_wait;

	atomic_t recv_threads;
	wait_queue_head_t recv_wq;
	loff_t blksize;
	loff_t bytesize;
#if IS_ENABLED(CONFIG_DEBUG_FS)
	struct dentry *dbg_dir;
#endif
};

struct nbd_device {
	struct blk_mq_tag_set tag_set;

	int index;
	refcount_t config_refs;
	refcount_t refs;
	struct nbd_config *config;
	struct mutex config_lock;
	struct gendisk *disk;
	struct workqueue_struct *recv_workq;

	struct list_head list;
	struct task_struct *task_recv;
	struct task_struct *task_setup;

	struct completion *destroy_complete;
	unsigned long flags;
};

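/*
 * Two refcounts with different lifetimes: nbd->refs pins the device itself
 * (gendisk, idr slot), while nbd->config_refs pins the active configuration.
 * Dropping the last config ref tears the sockets down but may leave the
 * device around for reuse; see nbd_put() and nbd_config_put() below.
 */
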
#define NBD_CMD_REQUEUED	1

struct nbd_cmd {
	struct nbd_device *nbd;
	struct mutex lock;
	int index;
	int cookie;
	int retries;
	blk_status_t status;
	unsigned long flags;
	u32 cmd_cookie;
};

#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;
#endif

#define nbd_name(nbd) ((nbd)->disk->disk_name)

#define NBD_MAGIC 0x68797548

#define NBD_DEF_BLKSIZE 1024

static unsigned int nbds_max = 16;
static int max_part = 16;
static int part_shift;

static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);
static void nbd_config_put(struct nbd_device *nbd);
static void nbd_connect_reply(struct genl_info *info, int index);
static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
static void nbd_dead_link_work(struct work_struct *work);
static void nbd_disconnect_and_put(struct nbd_device *nbd);

static inline struct device *nbd_to_dev(struct nbd_device *nbd)
{
	return disk_to_dev(nbd->disk);
}

static void nbd_requeue_cmd(struct nbd_cmd *cmd)
{
	struct request *req = blk_mq_rq_from_pdu(cmd);

	if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
		blk_mq_requeue_request(req, true);
}

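/*
 * A command's 64-bit wire handle packs the per-command reissue cookie into
 * the upper 32 bits and the unique blk-mq tag into the lower 32, so a reply
 * can be matched back to the exact incarnation of a request:
 *
 *	handle = (cmd->cmd_cookie << NBD_COOKIE_BITS) | blk_mq_unique_tag(req)
 */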
#define NBD_COOKIE_BITS 32

static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
{
	struct request *req = blk_mq_rq_from_pdu(cmd);
	u32 tag = blk_mq_unique_tag(req);
	u64 cookie = cmd->cmd_cookie;

	return (cookie << NBD_COOKIE_BITS) | tag;
}

static u32 nbd_handle_to_tag(u64 handle)
{
	return (u32)handle;
}

static u32 nbd_handle_to_cookie(u64 handle)
{
	return (u32)(handle >> NBD_COOKIE_BITS);
}

static const char *nbdcmd_to_ascii(int cmd)
{
	switch (cmd) {
	case NBD_CMD_READ: return "read";
	case NBD_CMD_WRITE: return "write";
	case NBD_CMD_DISC: return "disconnect";
	case NBD_CMD_FLUSH: return "flush";
	case NBD_CMD_TRIM: return "trim/discard";
	}
	return "invalid";
}

static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;

	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
}

static const struct device_attribute pid_attr = {
	.attr = { .name = "pid", .mode = 0444},
	.show = pid_show,
};

static void nbd_dev_remove(struct nbd_device *nbd)
{
	struct gendisk *disk = nbd->disk;
	struct request_queue *q;

	if (disk) {
		q = disk->queue;
		del_gendisk(disk);
		blk_cleanup_queue(q);
		blk_mq_free_tag_set(&nbd->tag_set);
		disk->private_data = NULL;
		put_disk(disk);
	}

	/*
	 * Do this last, just before the nbd is freed, to make sure that the
	 * disk and its kobject are completely removed before a duplicate of
	 * the same device can be created.
	 */
	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete)
		complete(nbd->destroy_complete);

	kfree(nbd);
}

static void nbd_put(struct nbd_device *nbd)
{
	if (refcount_dec_and_mutex_lock(&nbd->refs,
					&nbd_index_mutex)) {
		idr_remove(&nbd_index_idr, nbd->index);
		nbd_dev_remove(nbd);
		mutex_unlock(&nbd_index_mutex);
	}
}

static int nbd_disconnected(struct nbd_config *config)
{
	return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
		test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
}

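/* Must be called with the nsock's tx_lock held. */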
static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
				int notify)
{
	if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
		struct link_dead_args *args;
		args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
		if (args) {
			INIT_WORK(&args->work, nbd_dead_link_work);
			args->index = nbd->index;
			queue_work(system_wq, &args->work);
		}
	}
	if (!nsock->dead) {
		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
			if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
					       &nbd->config->runtime_flags)) {
				set_bit(NBD_RT_DISCONNECTED,
					&nbd->config->runtime_flags);
				dev_info(nbd_to_dev(nbd),
					 "Disconnected due to user request.\n");
			}
		}
	}
	nsock->dead = true;
	nsock->pending = NULL;
	nsock->sent = 0;
}

static void nbd_size_clear(struct nbd_device *nbd)
{
	if (nbd->config->bytesize) {
		set_capacity(nbd->disk, 0);
		kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
	}
}

static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
			loff_t blksize)
{
	if (!blksize)
		blksize = NBD_DEF_BLKSIZE;
	if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize))
		return -EINVAL;

	nbd->config->bytesize = bytesize;
	nbd->config->blksize = blksize;

	if (!nbd->task_recv)
		return 0;

	if (nbd->config->flags & NBD_FLAG_SEND_TRIM) {
		nbd->disk->queue->limits.discard_granularity = blksize;
		nbd->disk->queue->limits.discard_alignment = blksize;
		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
	}
	blk_queue_logical_block_size(nbd->disk->queue, blksize);
	blk_queue_physical_block_size(nbd->disk->queue, blksize);

	if (max_part)
		set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
	if (!set_capacity_and_notify(nbd->disk, bytesize >> 9))
		kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
	return 0;
}

static void nbd_complete_rq(struct request *req)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);

	dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
		cmd->status ? "failed" : "done");

	blk_mq_end_request(req, cmd->status);
}

/*
 * Forcibly shutdown the socket causing all listeners to error
 */
static void sock_shutdown(struct nbd_device *nbd)
{
	struct nbd_config *config = nbd->config;
	int i;

	if (config->num_connections == 0)
		return;
	if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
		return;

	for (i = 0; i < config->num_connections; i++) {
		struct nbd_sock *nsock = config->socks[i];
		mutex_lock(&nsock->tx_lock);
		nbd_mark_nsock_dead(nbd, nsock, 0);
		mutex_unlock(&nsock->tx_lock);
	}
	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
}

static u32 req_to_nbd_cmd_type(struct request *req)
{
	switch (req_op(req)) {
	case REQ_OP_DISCARD:
		return NBD_CMD_TRIM;
	case REQ_OP_FLUSH:
		return NBD_CMD_FLUSH;
	case REQ_OP_WRITE:
		return NBD_CMD_WRITE;
	case REQ_OP_READ:
		return NBD_CMD_READ;
	default:
		return U32_MAX;
	}
}

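/*
 * blk-mq timeout handler. With more than one connection (or an explicit
 * timeout on a single one) the command is retried on another socket; with
 * timeout=0 the timer is simply reset; otherwise the device is errored out
 * and all sockets are shut down.
 */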
static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
						 bool reserved)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
	struct nbd_device *nbd = cmd->nbd;
	struct nbd_config *config;

	if (!mutex_trylock(&cmd->lock))
		return BLK_EH_RESET_TIMER;

	if (!refcount_inc_not_zero(&nbd->config_refs)) {
		cmd->status = BLK_STS_TIMEOUT;
		mutex_unlock(&cmd->lock);
		goto done;
	}
	config = nbd->config;

	if (config->num_connections > 1 ||
	    (config->num_connections == 1 && nbd->tag_set.timeout)) {
		dev_err_ratelimited(nbd_to_dev(nbd),
				    "Connection timed out, retrying (%d/%d alive)\n",
				    atomic_read(&config->live_connections),
				    config->num_connections);
		/*
		 * Hooray we have more connections, requeue this IO, the submit
		 * path will put it on a real connection. Or if only one
		 * connection is configured, the submit path will wait until
		 * a new connection is reconfigured or until the dead timeout.
		 */
		if (config->socks) {
			if (cmd->index < config->num_connections) {
				struct nbd_sock *nsock =
					config->socks[cmd->index];
				mutex_lock(&nsock->tx_lock);
				/* We can have multiple outstanding requests, so
				 * we don't want to mark the nsock dead if we've
				 * already reconnected with a new socket, so
				 * only mark it dead if it's the same socket we
				 * were sent out on.
				 */
				if (cmd->cookie == nsock->cookie)
					nbd_mark_nsock_dead(nbd, nsock, 1);
				mutex_unlock(&nsock->tx_lock);
			}
			mutex_unlock(&cmd->lock);
			nbd_requeue_cmd(cmd);
			nbd_config_put(nbd);
			return BLK_EH_DONE;
		}
	}

	if (!nbd->tag_set.timeout) {
		/*
		 * Userspace sets timeout=0 to disable socket disconnection,
		 * so just warn and reset the timer.
		 */
		struct nbd_sock *nsock = config->socks[cmd->index];
		cmd->retries++;
		dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
			req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
			(unsigned long long)blk_rq_pos(req) << 9,
			blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);

		mutex_lock(&nsock->tx_lock);
		if (cmd->cookie != nsock->cookie) {
			nbd_requeue_cmd(cmd);
			mutex_unlock(&nsock->tx_lock);
			mutex_unlock(&cmd->lock);
			nbd_config_put(nbd);
			return BLK_EH_DONE;
		}
		mutex_unlock(&nsock->tx_lock);
		mutex_unlock(&cmd->lock);
		nbd_config_put(nbd);
		return BLK_EH_RESET_TIMER;
	}

	dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
	set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
	cmd->status = BLK_STS_IOERR;
	mutex_unlock(&cmd->lock);
	sock_shutdown(nbd);
	nbd_config_put(nbd);
done:
	blk_mq_complete_request(req);
	return BLK_EH_DONE;
}

/*
 * Send or receive packet.
 */
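/*
 * Runs in both the submit path and the recv workqueue. The socket is flagged
 * __GFP_MEMALLOC and the task enters memalloc_noreclaim so that I/O issued
 * on behalf of memory reclaim can still make forward progress instead of
 * recursing back into reclaim.
 */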
static int sock_xmit(struct nbd_device *nbd, int index, int send,
		     struct iov_iter *iter, int msg_flags, int *sent)
{
	struct nbd_config *config = nbd->config;
	struct socket *sock = config->socks[index]->sock;
	int result;
	struct msghdr msg;
	unsigned int noreclaim_flag;

	if (unlikely(!sock)) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Attempted %s on closed socket in sock_xmit\n",
				    (send ? "send" : "recv"));
		return -EINVAL;
	}

	msg.msg_iter = *iter;

	noreclaim_flag = memalloc_noreclaim_save();
	do {
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

		if (send)
			result = sock_sendmsg(sock, &msg);
		else
			result = sock_recvmsg(sock, &msg, msg.msg_flags);

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		if (sent)
			*sent += result;
	} while (msg_data_left(&msg));

	memalloc_noreclaim_restore(noreclaim_flag);

	return result;
}

/*
 * Different settings for sk->sk_sndtimeo can result in different return values
 * if there is a signal pending when we enter sendmsg, because reasons?
 */
static inline int was_interrupted(int result)
{
	return result == -ERESTARTSYS || result == -EINTR;
}

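/*
 * Wire format per command: a fixed struct nbd_request header (magic, type,
 * handle, offset, length) followed, for NBD_CMD_WRITE, by the bio payload.
 * A partially transmitted request is parked in nsock->pending with
 * nsock->sent bytes already on the wire and resumed on the next pass
 * through the submit path.
 */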
/* always call with the tx_lock held */
static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
{
	struct request *req = blk_mq_rq_from_pdu(cmd);
	struct nbd_config *config = nbd->config;
	struct nbd_sock *nsock = config->socks[index];
	int result;
	struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
	struct iov_iter from;
	unsigned long size = blk_rq_bytes(req);
	struct bio *bio;
	u64 handle;
	u32 type;
	u32 nbd_cmd_flags = 0;
	int sent = nsock->sent, skip = 0;

	iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));

	type = req_to_nbd_cmd_type(req);
	if (type == U32_MAX)
		return -EIO;

	if (rq_data_dir(req) == WRITE &&
	    (config->flags & NBD_FLAG_READ_ONLY)) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Write on read-only\n");
		return -EIO;
	}

	if (req->cmd_flags & REQ_FUA)
		nbd_cmd_flags |= NBD_CMD_FLAG_FUA;

	/* We did a partial send previously, and we at least sent the whole
	 * request struct, so just go and send the rest of the pages in the
	 * request.
	 */
	if (sent) {
		if (sent >= sizeof(request)) {
			skip = sent - sizeof(request);

			/* initialize handle for tracing purposes */
			handle = nbd_cmd_handle(cmd);

			goto send_pages;
		}
		iov_iter_advance(&from, sent);
	} else {
		cmd->cmd_cookie++;
	}
	cmd->index = index;
	cmd->cookie = nsock->cookie;
	cmd->retries = 0;
	request.type = htonl(type | nbd_cmd_flags);
	if (type != NBD_CMD_FLUSH) {
		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
		request.len = htonl(size);
	}
	handle = nbd_cmd_handle(cmd);
	memcpy(request.handle, &handle, sizeof(handle));

	trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));

	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
		req, nbdcmd_to_ascii(type),
		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
	result = sock_xmit(nbd, index, 1, &from,
			(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
	trace_nbd_header_sent(req, handle);
	if (result <= 0) {
		if (was_interrupted(result)) {
			/* If we haven't sent anything we can just return BUSY,
			 * however if we have sent something we need to make
			 * sure we only allow this req to be sent until we are
			 * completely done.
			 */
			if (sent) {
				nsock->pending = req;
				nsock->sent = sent;
			}
			set_bit(NBD_CMD_REQUEUED, &cmd->flags);
			return BLK_STS_RESOURCE;
		}
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Send control failed (result %d)\n", result);
		return -EAGAIN;
	}
send_pages:
	if (type != NBD_CMD_WRITE)
		goto out;

	bio = req->bio;
	while (bio) {
		struct bio *next = bio->bi_next;
		struct bvec_iter iter;
		struct bio_vec bvec;

		bio_for_each_segment(bvec, bio, iter) {
			bool is_last = !next && bio_iter_last(bvec, iter);
			int flags = is_last ? 0 : MSG_MORE;

			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
				req, bvec.bv_len);
			iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len);
			if (skip) {
				if (skip >= iov_iter_count(&from)) {
					skip -= iov_iter_count(&from);
					continue;
				}
				iov_iter_advance(&from, skip);
				skip = 0;
			}
			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
			if (result <= 0) {
				if (was_interrupted(result)) {
					/* We've already sent the header, we
					 * have no choice but to set pending and
					 * return BUSY.
					 */
					nsock->pending = req;
					nsock->sent = sent;
					set_bit(NBD_CMD_REQUEUED, &cmd->flags);
					return BLK_STS_RESOURCE;
				}
				dev_err(disk_to_dev(nbd->disk),
					"Send data failed (result %d)\n",
					result);
				return -EAGAIN;
			}
			/*
			 * The completion might already have come in,
			 * so break for the last one instead of letting
			 * the iterator do it. This prevents use-after-free
			 * of the bio.
			 */
			if (is_last)
				break;
		}
		bio = next;
	}
out:
	trace_nbd_payload_sent(req, handle);
	nsock->pending = NULL;
	nsock->sent = 0;
	return 0;
}

/* An ERR_PTR returned = something went wrong, inform userspace */
static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
{
	struct nbd_config *config = nbd->config;
	int result;
	struct nbd_reply reply;
	struct nbd_cmd *cmd;
	struct request *req = NULL;
	u64 handle;
	u16 hwq;
	u32 tag;
	struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
	struct iov_iter to;
	int ret = 0;

	reply.magic = 0;
	iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply));
	result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
	if (result <= 0) {
		if (!nbd_disconnected(config))
			dev_err(disk_to_dev(nbd->disk),
				"Receive control failed (result %d)\n", result);
		return ERR_PTR(result);
	}

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
				(unsigned long)ntohl(reply.magic));
		return ERR_PTR(-EPROTO);
	}

	memcpy(&handle, reply.handle, sizeof(handle));
	tag = nbd_handle_to_tag(handle);
	hwq = blk_mq_unique_tag_to_hwq(tag);
	if (hwq < nbd->tag_set.nr_hw_queues)
		req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
				       blk_mq_unique_tag_to_tag(tag));
	if (!req || !blk_mq_request_started(req)) {
		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
			tag, req);
		return ERR_PTR(-ENOENT);
	}
	trace_nbd_header_received(req, handle);
	cmd = blk_mq_rq_to_pdu(req);

	mutex_lock(&cmd->lock);
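	/*
	 * With cmd->lock held, reject any reply that can no longer complete
	 * this request: a stale cookie (reply to a previous incarnation of
	 * the tag), a command the timeout handler already failed, or one
	 * that raced with a requeue. Completing any of these would complete
	 * the request twice.
	 */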
	if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
		dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
			req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
		ret = -ENOENT;
		goto out;
	}
	if (cmd->status != BLK_STS_OK) {
		dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n",
			req);
		ret = -ENOENT;
		goto out;
	}
	if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
		dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
			req);
		ret = -ENOENT;
		goto out;
	}
	if (ntohl(reply.error)) {
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
			ntohl(reply.error));
		cmd->status = BLK_STS_IOERR;
		goto out;
	}

	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
	if (rq_data_dir(req) != WRITE) {
		struct req_iterator iter;
		struct bio_vec bvec;

		rq_for_each_segment(bvec, req, iter) {
			iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len);
			result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
			if (result <= 0) {
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
					result);
				/*
				 * If we've disconnected, we need to make sure we
				 * complete this request, otherwise error out
				 * and let the timeout stuff handle resubmitting
				 * this request onto another connection.
				 */
				if (nbd_disconnected(config)) {
					cmd->status = BLK_STS_IOERR;
					goto out;
				}
				ret = -EIO;
				goto out;
			}
			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
				req, bvec.bv_len);
		}
	}
out:
	trace_nbd_payload_received(req, handle);
	mutex_unlock(&cmd->lock);
	return ret ? ERR_PTR(ret) : cmd;
}

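/*
 * One recv_work instance runs per connection on nbd->recv_workq, looping
 * over replies until the socket fails; the socket is then marked dead and
 * the last exiting thread wakes waiters on recv_wq.
 */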
static void recv_work(struct work_struct *work)
{
	struct recv_thread_args *args = container_of(work,
						     struct recv_thread_args,
						     work);
	struct nbd_device *nbd = args->nbd;
	struct nbd_config *config = nbd->config;
	struct nbd_cmd *cmd;
	struct request *rq;

	while (1) {
		cmd = nbd_read_stat(nbd, args->index);
		if (IS_ERR(cmd)) {
			struct nbd_sock *nsock = config->socks[args->index];

			mutex_lock(&nsock->tx_lock);
			nbd_mark_nsock_dead(nbd, nsock, 1);
			mutex_unlock(&nsock->tx_lock);
			break;
		}

		rq = blk_mq_rq_from_pdu(cmd);
		if (likely(!blk_should_fake_timeout(rq->q)))
			blk_mq_complete_request(rq);
	}
	nbd_config_put(nbd);
	atomic_dec(&config->recv_threads);
	wake_up(&config->recv_wq);
	kfree(args);
}

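/*
 * Error out every still-outstanding command when tearing the queue down.
 * A request the server already answered is skipped here: completing it
 * again would be a double completion (hence the blk_mq_request_completed()
 * check below).
 */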
static bool nbd_clear_req(struct request *req, void *data, bool reserved)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);

	/* don't abort one completed request */
	if (blk_mq_request_completed(req))
		return true;

	mutex_lock(&cmd->lock);
	cmd->status = BLK_STS_IOERR;
	mutex_unlock(&cmd->lock);

	blk_mq_complete_request(req);
	return true;
}

static void nbd_clear_que(struct nbd_device *nbd)
{
	blk_mq_quiesce_queue(nbd->disk->queue);
	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
	blk_mq_unquiesce_queue(nbd->disk->queue);
	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}

static int find_fallback(struct nbd_device *nbd, int index)
{
	struct nbd_config *config = nbd->config;
	int new_index = -1;
	struct nbd_sock *nsock = config->socks[index];
	int fallback = nsock->fallback_index;

	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
		return new_index;

	if (config->num_connections <= 1) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Dead connection, failed to find a fallback\n");
		return new_index;
	}

	if (fallback >= 0 && fallback < config->num_connections &&
	    !config->socks[fallback]->dead)
		return fallback;

	if (nsock->fallback_index < 0 ||
	    nsock->fallback_index >= config->num_connections ||
	    config->socks[nsock->fallback_index]->dead) {
		int i;
		for (i = 0; i < config->num_connections; i++) {
			if (i == index)
				continue;
			if (!config->socks[i]->dead) {
				new_index = i;
				break;
			}
		}
		nsock->fallback_index = new_index;
		if (new_index < 0) {
			dev_err_ratelimited(disk_to_dev(nbd->disk),
					    "Dead connection, failed to find a fallback\n");
			return new_index;
		}
	}
	new_index = nsock->fallback_index;
	return new_index;
}

static int wait_for_reconnect(struct nbd_device *nbd)
{
	struct nbd_config *config = nbd->config;
	if (!config->dead_conn_timeout)
		return 0;
	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
		return 0;
	return wait_event_timeout(config->conn_wait,
				  atomic_read(&config->live_connections) > 0,
				  config->dead_conn_timeout) > 0;
}

static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
{
	struct request *req = blk_mq_rq_from_pdu(cmd);
	struct nbd_device *nbd = cmd->nbd;
	struct nbd_config *config;
	struct nbd_sock *nsock;
	int ret;

	if (!refcount_inc_not_zero(&nbd->config_refs)) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Socks array is empty\n");
		blk_mq_start_request(req);
		return -EINVAL;
	}
	config = nbd->config;

	if (index >= config->num_connections) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Attempted send on invalid socket\n");
		nbd_config_put(nbd);
		blk_mq_start_request(req);
		return -EINVAL;
	}
	cmd->status = BLK_STS_OK;
again:
	nsock = config->socks[index];
	mutex_lock(&nsock->tx_lock);
	if (nsock->dead) {
		int old_index = index;
		index = find_fallback(nbd, index);
		mutex_unlock(&nsock->tx_lock);
		if (index < 0) {
			if (wait_for_reconnect(nbd)) {
				index = old_index;
				goto again;
			}
			/* All the sockets should already be down at this point,
			 * we just want to make sure that DISCONNECTED is set so
			 * any requests that come in that were queued waiting
			 * for the reconnect timer don't trigger the timer again
			 * and instead just error out.
			 */
			sock_shutdown(nbd);
			nbd_config_put(nbd);
			blk_mq_start_request(req);
			return -EIO;
		}
		goto again;
	}

	/* Handle the case that we have a pending request that was partially
	 * transmitted that _has_ to be serviced first. We need to call requeue
	 * here so that it gets put _after_ the request that is already on the
	 * dispatch list.
	 */
	blk_mq_start_request(req);
	if (unlikely(nsock->pending && nsock->pending != req)) {
		nbd_requeue_cmd(cmd);
		ret = 0;
		goto out;
	}
	/*
	 * Some failures are related to the link going down, so anything that
	 * returns EAGAIN can be retried on a different socket.
	 */
	ret = nbd_send_cmd(nbd, cmd, index);
	if (ret == -EAGAIN) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Request send failed, requeueing\n");
		nbd_mark_nsock_dead(nbd, nsock, 1);
		nbd_requeue_cmd(cmd);
		ret = 0;
	}
out:
	mutex_unlock(&nsock->tx_lock);
	nbd_config_put(nbd);
	return ret;
}

static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
	int ret;

	/*
	 * Since we look at the bio's to send the request over the network we
	 * need to make sure the completion work doesn't mark this request done
	 * before we are done doing our send. This keeps us from dereferencing
	 * freed data if we have particularly fast completions (ie we get the
	 * completion before we exit sock_xmit on the last bvec) or in the case
	 * that the server is misbehaving (or there was an error) before we're
	 * done sending everything over the wire.
	 */
	mutex_lock(&cmd->lock);
	clear_bit(NBD_CMD_REQUEUED, &cmd->flags);

	/* We can be called directly from the user space process, which means we
	 * could possibly have signals pending so our sendmsg will fail. In
	 * this case we need to return that we are busy, otherwise error out as
	 * appropriate.
	 */
	ret = nbd_handle_cmd(cmd, hctx->queue_num);
	if (ret < 0)
		ret = BLK_STS_IOERR;
	else if (!ret)
		ret = BLK_STS_OK;
	mutex_unlock(&cmd->lock);

	return ret;
}

static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
				     int *err)
{
	struct socket *sock;

	*err = 0;
	sock = sockfd_lookup(fd, err);
	if (!sock)
		return NULL;

	if (sock->ops->shutdown == sock_no_shutdown) {
		dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
		*err = -EINVAL;
		sockfd_put(sock);
		return NULL;
	}

	return sock;
}

static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
			  bool netlink)
{
	struct nbd_config *config = nbd->config;
	struct socket *sock;
	struct nbd_sock **socks;
	struct nbd_sock *nsock;
	int err;

	sock = nbd_get_socket(nbd, arg, &err);
	if (!sock)
		return err;

	/*
	 * We need to make sure we don't get any errant requests while we're
	 * reallocating the ->socks array.
	 */
	blk_mq_freeze_queue(nbd->disk->queue);

	if (!netlink && !nbd->task_setup &&
	    !test_bit(NBD_RT_BOUND, &config->runtime_flags))
		nbd->task_setup = current;

	if (!netlink &&
	    (nbd->task_setup != current ||
	     test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
		dev_err(disk_to_dev(nbd->disk),
			"Device being setup by another task");
		err = -EBUSY;
		goto put_socket;
	}

	nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
	if (!nsock) {
		err = -ENOMEM;
		goto put_socket;
	}

	socks = krealloc(config->socks, (config->num_connections + 1) *
			 sizeof(struct nbd_sock *), GFP_KERNEL);
	if (!socks) {
		kfree(nsock);
		err = -ENOMEM;
		goto put_socket;
	}

	config->socks = socks;

	nsock->fallback_index = -1;
	nsock->dead = false;
	mutex_init(&nsock->tx_lock);
	nsock->sock = sock;
	nsock->pending = NULL;
	nsock->sent = 0;
	nsock->cookie = 0;
	socks[config->num_connections++] = nsock;
	atomic_inc(&config->live_connections);
	blk_mq_unfreeze_queue(nbd->disk->queue);

	return 0;

put_socket:
	blk_mq_unfreeze_queue(nbd->disk->queue);
	sockfd_put(sock);
	return err;
}

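/*
 * Reconnect path: swap a dead nsock's socket for a fresh one under tx_lock
 * and start a new recv_work for it. Bumping nsock->cookie makes the timeout
 * handler treat requests issued on the old socket as stale rather than
 * killing the new connection.
 */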
static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
{
	struct nbd_config *config = nbd->config;
	struct socket *sock, *old;
	struct recv_thread_args *args;
	int i;
	int err;

	sock = nbd_get_socket(nbd, arg, &err);
	if (!sock)
		return err;

	args = kzalloc(sizeof(*args), GFP_KERNEL);
	if (!args) {
		sockfd_put(sock);
		return -ENOMEM;
	}

	for (i = 0; i < config->num_connections; i++) {
		struct nbd_sock *nsock = config->socks[i];

		if (!nsock->dead)
			continue;

		mutex_lock(&nsock->tx_lock);
		if (!nsock->dead) {
			mutex_unlock(&nsock->tx_lock);
			continue;
		}
		sk_set_memalloc(sock->sk);
		if (nbd->tag_set.timeout)
			sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
		atomic_inc(&config->recv_threads);
		refcount_inc(&nbd->config_refs);
		old = nsock->sock;
		nsock->fallback_index = -1;
		nsock->sock = sock;
		nsock->dead = false;
		INIT_WORK(&args->work, recv_work);
		args->index = i;
		args->nbd = nbd;
		nsock->cookie++;
		mutex_unlock(&nsock->tx_lock);
		sockfd_put(old);

		clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);

		/* We take the tx_mutex in an error path in the recv_work, so we
		 * need to queue_work outside of the tx_mutex.
		 */
		queue_work(nbd->recv_workq, &args->work);

		atomic_inc(&config->live_connections);
		wake_up(&config->conn_wait);
		return 0;
	}
	sockfd_put(sock);
	kfree(args);
	return -ENOSPC;
}

static void nbd_bdev_reset(struct block_device *bdev)
{
	if (bdev->bd_openers > 1)
		return;
	set_capacity(bdev->bd_disk, 0);
}

static void nbd_parse_flags(struct nbd_device *nbd)
{
	struct nbd_config *config = nbd->config;
	if (config->flags & NBD_FLAG_READ_ONLY)
		set_disk_ro(nbd->disk, true);
	else
		set_disk_ro(nbd->disk, false);
	if (config->flags & NBD_FLAG_SEND_TRIM)
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
	if (config->flags & NBD_FLAG_SEND_FLUSH) {
		if (config->flags & NBD_FLAG_SEND_FUA)
			blk_queue_write_cache(nbd->disk->queue, true, true);
		else
			blk_queue_write_cache(nbd->disk->queue, true, false);
	}
	else
		blk_queue_write_cache(nbd->disk->queue, false, false);
}

static void send_disconnects(struct nbd_device *nbd)
{
	struct nbd_config *config = nbd->config;
	struct nbd_request request = {
		.magic = htonl(NBD_REQUEST_MAGIC),
		.type = htonl(NBD_CMD_DISC),
	};
	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
	struct iov_iter from;
	int i, ret;

	for (i = 0; i < config->num_connections; i++) {
		struct nbd_sock *nsock = config->socks[i];

		iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
		mutex_lock(&nsock->tx_lock);
		ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
		if (ret <= 0)
			dev_err(disk_to_dev(nbd->disk),
				"Send disconnect failed %d\n", ret);
		mutex_unlock(&nsock->tx_lock);
	}
}

static int nbd_disconnect(struct nbd_device *nbd)
{
	struct nbd_config *config = nbd->config;

	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
	set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
	set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
	send_disconnects(nbd);
	return 0;
}

static void nbd_clear_sock(struct nbd_device *nbd)
{
	sock_shutdown(nbd);
	nbd_clear_que(nbd);
	nbd->task_setup = NULL;
}

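/*
 * Drop a reference on nbd->config. The final put, taken under config_lock,
 * tears down the pid file, debugfs dir, sockets and recv workqueue, then
 * releases the device and module references.
 */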
static void nbd_config_put(struct nbd_device *nbd)
{
	if (refcount_dec_and_mutex_lock(&nbd->config_refs,
					&nbd->config_lock)) {
		struct nbd_config *config = nbd->config;
		nbd_dev_dbg_close(nbd);
		nbd_size_clear(nbd);
		if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
				       &config->runtime_flags))
			device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
		nbd->task_recv = NULL;
		nbd_clear_sock(nbd);
		if (config->num_connections) {
			int i;
			for (i = 0; i < config->num_connections; i++) {
				sockfd_put(config->socks[i]->sock);
				kfree(config->socks[i]);
			}
			kfree(config->socks);
		}
		kfree(nbd->config);
		nbd->config = NULL;

		if (nbd->recv_workq)
			destroy_workqueue(nbd->recv_workq);
		nbd->recv_workq = NULL;

		nbd->tag_set.timeout = 0;
		nbd->disk->queue->limits.discard_granularity = 0;
		nbd->disk->queue->limits.discard_alignment = 0;
		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);

		mutex_unlock(&nbd->config_lock);
		nbd_put(nbd);
		module_put(THIS_MODULE);
	}
}

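/*
 * Bring the device up: one recv_work per configured connection, on a
 * dedicated WQ_MEM_RECLAIM workqueue so replies can still be received
 * under memory pressure.
 */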
static int nbd_start_device(struct nbd_device *nbd)
{
	struct nbd_config *config = nbd->config;
	int num_connections = config->num_connections;
	int error = 0, i;

	if (nbd->task_recv)
		return -EBUSY;
	if (!config->socks)
		return -EINVAL;
	if (num_connections > 1 &&
	    !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
		dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
		return -EINVAL;
	}

	nbd->recv_workq = alloc_workqueue("knbd%d-recv",
					  WQ_MEM_RECLAIM | WQ_HIGHPRI |
					  WQ_UNBOUND, 0, nbd->index);
	if (!nbd->recv_workq) {
		dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
		return -ENOMEM;
	}

	blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
	nbd->task_recv = current;

	nbd_parse_flags(nbd);

	error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
	if (error) {
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
		return error;
	}
	set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);

	nbd_dev_dbg_init(nbd);
	for (i = 0; i < num_connections; i++) {
		struct recv_thread_args *args;

		args = kzalloc(sizeof(*args), GFP_KERNEL);
		if (!args) {
			sock_shutdown(nbd);
			/*
			 * If num_connections is m (m > 2) and the kzallocs
			 * for connections 1..n (1 < n < m) succeeded but the
			 * one for connection n + 1 failed, we still have n
			 * recv threads running. Flush the workqueue here so
			 * those threads can't drop the last config_refs and
			 * try to destroy the workqueue from inside the
			 * workqueue.
			 */
			if (i)
				flush_workqueue(nbd->recv_workq);
			return -ENOMEM;
		}
		sk_set_memalloc(config->socks[i]->sock->sk);
		if (nbd->tag_set.timeout)
			config->socks[i]->sock->sk->sk_sndtimeo =
				nbd->tag_set.timeout;
		atomic_inc(&config->recv_threads);
		refcount_inc(&nbd->config_refs);
		INIT_WORK(&args->work, recv_work);
		args->nbd = nbd;
		args->index = i;
		queue_work(nbd->recv_workq, &args->work);
	}
	return nbd_set_size(nbd, config->bytesize, config->blksize);
}

static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
{
	struct nbd_config *config = nbd->config;
	int ret;

	ret = nbd_start_device(nbd);
	if (ret)
		return ret;

	if (max_part)
		set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
	mutex_unlock(&nbd->config_lock);
	ret = wait_event_interruptible(config->recv_wq,
				       atomic_read(&config->recv_threads) == 0);
	if (ret)
		sock_shutdown(nbd);
	flush_workqueue(nbd->recv_workq);

	mutex_lock(&nbd->config_lock);
	nbd_bdev_reset(bdev);
	/* user requested, ignore socket errors */
	if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
		ret = 0;
	if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags))
		ret = -ETIMEDOUT;
	return ret;
}

static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
				 struct block_device *bdev)
{
	sock_shutdown(nbd);
	__invalidate_device(bdev, true);
	nbd_bdev_reset(bdev);
	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
			       &nbd->config->runtime_flags))
		nbd_config_put(nbd);
}

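/*
 * timeout == 0 means "never tear the socket down", but the block layer
 * still needs a non-zero rq timeout so nbd_xmit_timeout() keeps firing and
 * resetting the timer; a 30 second default is used for that case.
 */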
static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
{
	nbd->tag_set.timeout = timeout * HZ;
	if (timeout)
		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
	else
		blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ);
}

/* Must be called with config_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
		       unsigned int cmd, unsigned long arg)
{
	struct nbd_config *config = nbd->config;

	switch (cmd) {
	case NBD_DISCONNECT:
		return nbd_disconnect(nbd);
	case NBD_CLEAR_SOCK:
		nbd_clear_sock_ioctl(nbd, bdev);
		return 0;
	case NBD_SET_SOCK:
		return nbd_add_socket(nbd, arg, false);
	case NBD_SET_BLKSIZE:
		return nbd_set_size(nbd, config->bytesize, arg);
	case NBD_SET_SIZE:
		return nbd_set_size(nbd, arg, config->blksize);
	case NBD_SET_SIZE_BLOCKS:
		return nbd_set_size(nbd, arg * config->blksize,
				    config->blksize);
	case NBD_SET_TIMEOUT:
		nbd_set_cmd_timeout(nbd, arg);
		return 0;

	case NBD_SET_FLAGS:
		config->flags = arg;
		return 0;
	case NBD_DO_IT:
		return nbd_start_device_ioctl(nbd, bdev);
	case NBD_CLEAR_QUE:
		/*
		 * This is for compatibility only. The queue is always cleared
		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
		 */
		return 0;
	case NBD_PRINT_DEBUG:
		/*
		 * For compatibility only, we no longer keep a list of
		 * outstanding requests.
		 */
		return 0;
	}
	return -ENOTTY;
}

static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *nbd = bdev->bd_disk->private_data;
	struct nbd_config *config = nbd->config;
	int error = -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* The block layer will pass back some non-nbd ioctls in case we have
	 * special handling for them, but we don't so just return an error.
	 */
	if (_IOC_TYPE(cmd) != 0xab)
		return -EINVAL;

	mutex_lock(&nbd->config_lock);

	/* Don't allow ioctl operations on a nbd device that was created with
	 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
	 */
	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
	    (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
		error = __nbd_ioctl(bdev, nbd, cmd, arg);
	else
		dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
	mutex_unlock(&nbd->config_lock);
	return error;
}

static struct nbd_config *nbd_alloc_config(void)
{
	struct nbd_config *config;

	config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
	if (!config)
		return NULL;
	atomic_set(&config->recv_threads, 0);
	init_waitqueue_head(&config->recv_wq);
	init_waitqueue_head(&config->conn_wait);
	config->blksize = NBD_DEF_BLKSIZE;
	atomic_set(&config->live_connections, 0);
	try_module_get(THIS_MODULE);
	return config;
}

static int nbd_open(struct block_device *bdev, fmode_t mode)
{
	struct nbd_device *nbd;
	int ret = 0;

	mutex_lock(&nbd_index_mutex);
	nbd = bdev->bd_disk->private_data;
	if (!nbd) {
		ret = -ENXIO;
		goto out;
	}
	if (!refcount_inc_not_zero(&nbd->refs)) {
		ret = -ENXIO;
		goto out;
	}
	if (!refcount_inc_not_zero(&nbd->config_refs)) {
		struct nbd_config *config;

		mutex_lock(&nbd->config_lock);
		if (refcount_inc_not_zero(&nbd->config_refs)) {
			mutex_unlock(&nbd->config_lock);
			goto out;
		}
		config = nbd->config = nbd_alloc_config();
		if (!config) {
			ret = -ENOMEM;
			mutex_unlock(&nbd->config_lock);
			goto out;
		}
		refcount_set(&nbd->config_refs, 1);
		refcount_inc(&nbd->refs);
		mutex_unlock(&nbd->config_lock);
		if (max_part)
			set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
	} else if (nbd_disconnected(nbd->config)) {
		if (max_part)
			set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
	}
out:
	mutex_unlock(&nbd_index_mutex);
	return ret;
}

static void nbd_release(struct gendisk *disk, fmode_t mode)
{
	struct nbd_device *nbd = disk->private_data;

	if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
			disk->part0->bd_openers == 0)
		nbd_disconnect_and_put(nbd);

	nbd_config_put(nbd);
	nbd_put(nbd);
}

static const struct block_device_operations nbd_fops =
{
	.owner =	THIS_MODULE,
	.open =		nbd_open,
	.release =	nbd_release,
	.ioctl =	nbd_ioctl,
	.compat_ioctl =	nbd_ioctl,
};

#if IS_ENABLED(CONFIG_DEBUG_FS)

static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;

	if (nbd->task_recv)
		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));

	return 0;
}

static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
}

static const struct file_operations nbd_dbg_tasks_ops = {
	.open = nbd_dbg_tasks_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;
	u32 flags = nbd->config->flags;

	seq_printf(s, "Hex: 0x%08x\n\n", flags);

	seq_puts(s, "Known flags:\n");

	if (flags & NBD_FLAG_HAS_FLAGS)
		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
	if (flags & NBD_FLAG_READ_ONLY)
		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
	if (flags & NBD_FLAG_SEND_FLUSH)
		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
	if (flags & NBD_FLAG_SEND_FUA)
		seq_puts(s, "NBD_FLAG_SEND_FUA\n");
	if (flags & NBD_FLAG_SEND_TRIM)
		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");

	return 0;
}

static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_flags_show, inode->i_private);
}

static const struct file_operations nbd_dbg_flags_ops = {
	.open = nbd_dbg_flags_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	struct dentry *dir;
	struct nbd_config *config = nbd->config;

	if (!nbd_dbg_dir)
		return -EIO;

	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
	if (!dir) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
			nbd_name(nbd));
		return -EIO;
	}
	config->dbg_dir = dir;

	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
	debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
	debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);

	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
	debugfs_remove_recursive(nbd->config->dbg_dir);
}

static int nbd_dbg_init(void)
{
	struct dentry *dbg_dir;

	dbg_dir = debugfs_create_dir("nbd", NULL);
	if (!dbg_dir)
		return -EIO;

	nbd_dbg_dir = dbg_dir;

	return 0;
}

static void nbd_dbg_close(void)
{
	debugfs_remove_recursive(nbd_dbg_dir);
}

#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}

static int nbd_dbg_init(void)
{
	return 0;
}

static void nbd_dbg_close(void)
{
}

#endif

static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
			    unsigned int hctx_idx, unsigned int numa_node)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
	cmd->nbd = set->driver_data;
	cmd->flags = 0;
	mutex_init(&cmd->lock);
	return 0;
}

static const struct blk_mq_ops nbd_mq_ops = {
	.queue_rq	= nbd_queue_rq,
	.complete	= nbd_complete_rq,
	.init_request	= nbd_init_request,
	.timeout	= nbd_xmit_timeout,
};

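/*
 * Allocate and register one nbd%d gendisk together with its blk-mq tag set.
 * index < 0 lets the idr pick any free slot; asking for a specific index
 * that is already taken returns -EEXIST.
 */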
b0d9111a
JB
1669static int nbd_dev_add(int index)
1670{
1671 struct nbd_device *nbd;
1672 struct gendisk *disk;
1673 struct request_queue *q;
1674 int err = -ENOMEM;
1675
1676 nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1677 if (!nbd)
1678 goto out;
1679
1680 disk = alloc_disk(1 << part_shift);
1681 if (!disk)
1682 goto out_free_nbd;
1683
1684 if (index >= 0) {
1685 err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1686 GFP_KERNEL);
1687 if (err == -ENOSPC)
1688 err = -EEXIST;
1689 } else {
1690 err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
1691 if (err >= 0)
1692 index = err;
1693 }
1694 if (err < 0)
1695 goto out_free_disk;
1696
e46c7287 1697 nbd->index = index;
b0d9111a
JB
1698 nbd->disk = disk;
1699 nbd->tag_set.ops = &nbd_mq_ops;
1700 nbd->tag_set.nr_hw_queues = 1;
1701 nbd->tag_set.queue_depth = 128;
1702 nbd->tag_set.numa_node = NUMA_NO_NODE;
1703 nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1704 nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
56d18f62 1705 BLK_MQ_F_BLOCKING;
b0d9111a 1706 nbd->tag_set.driver_data = nbd;
8454d685 1707 nbd->destroy_complete = NULL;
b0d9111a
JB
1708
1709 err = blk_mq_alloc_tag_set(&nbd->tag_set);
1710 if (err)
1711 goto out_free_idr;
1712
1713 q = blk_mq_init_queue(&nbd->tag_set);
1714 if (IS_ERR(q)) {
1715 err = PTR_ERR(q);
1716 goto out_free_tags;
1717 }
1718 disk->queue = q;
1719
1720 /*
1721 * Tell the block layer that we are not a rotational device
1722 */
8b904b5b
BVA
1723 blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
1724 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
6df133a1 1725 disk->queue->limits.discard_granularity = 0;
07ce213f 1726 disk->queue->limits.discard_alignment = 0;
6df133a1 1727 blk_queue_max_discard_sectors(disk->queue, 0);
ebb16d0d 1728 blk_queue_max_segment_size(disk->queue, UINT_MAX);
1cc1f17a 1729 blk_queue_max_segments(disk->queue, USHRT_MAX);
b0d9111a
JB
1730 blk_queue_max_hw_sectors(disk->queue, 65536);
1731 disk->queue->limits.max_sectors = 256;
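	/*
	 * 65536 sectors caps a single command at 32 MiB, while the
	 * default max_sectors keeps normal I/O splits at 256 sectors
	 * (128 KiB).
	 */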
1732
b0d9111a 1733 mutex_init(&nbd->config_lock);
5ea8d108 1734 refcount_set(&nbd->config_refs, 0);
c6a4759e
JB
1735 refcount_set(&nbd->refs, 1);
1736 INIT_LIST_HEAD(&nbd->list);
b0d9111a
JB
1737 disk->major = NBD_MAJOR;
1738 disk->first_minor = index << part_shift;
1739 disk->fops = &nbd_fops;
1740 disk->private_data = nbd;
1741 sprintf(disk->disk_name, "nbd%d", index);
b0d9111a 1742 add_disk(disk);
47d902b9 1743 nbd_total_devices++;
b0d9111a
JB
1744 return index;
1745
1746out_free_tags:
1747 blk_mq_free_tag_set(&nbd->tag_set);
1748out_free_idr:
1749 idr_remove(&nbd_index_idr, index);
1750out_free_disk:
1751 put_disk(disk);
1752out_free_nbd:
1753 kfree(nbd);
1754out:
1755 return err;
1756}
1757
e46c7287
JB
1758static int find_free_cb(int id, void *ptr, void *data)
1759{
1760 struct nbd_device *nbd = ptr;
1761 struct nbd_device **found = data;
1762
1763 if (!refcount_read(&nbd->config_refs)) {
1764 *found = nbd;
1765 return 1;
1766 }
1767 return 0;
1768}
1769
1770/* Netlink interface. */
a86c4120 1771static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
e46c7287
JB
1772 [NBD_ATTR_INDEX] = { .type = NLA_U32 },
1773 [NBD_ATTR_SIZE_BYTES] = { .type = NLA_U64 },
1774 [NBD_ATTR_BLOCK_SIZE_BYTES] = { .type = NLA_U64 },
1775 [NBD_ATTR_TIMEOUT] = { .type = NLA_U64 },
1776 [NBD_ATTR_SERVER_FLAGS] = { .type = NLA_U64 },
1777 [NBD_ATTR_CLIENT_FLAGS] = { .type = NLA_U64 },
1778 [NBD_ATTR_SOCKETS] = { .type = NLA_NESTED},
560bc4b3 1779 [NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 },
47d902b9 1780 [NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED},
e46c7287
JB
1781};
1782
a86c4120 1783static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
e46c7287
JB
1784 [NBD_SOCK_FD] = { .type = NLA_U32 },
1785};
1786
47d902b9
JB
1787/* We don't use this right now since we don't parse the incoming list, but we
1788 * still want it here so userspace knows what to expect.
1789 */
a86c4120 1790static const struct nla_policy __attribute__((unused))
47d902b9
JB
1791nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
1792 [NBD_DEVICE_INDEX] = { .type = NLA_U32 },
1793 [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 },
1794};
1795
4ddeaae8
MC
1796static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
1797{
1798 struct nbd_config *config = nbd->config;
1799 u64 bsize = config->blksize;
1800 u64 bytes = config->bytesize;
1801
1802 if (info->attrs[NBD_ATTR_SIZE_BYTES])
1803 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
1804
dcbddf54 1805 if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES])
4ddeaae8 1806 bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
4ddeaae8
MC
1807
1808 if (bytes != config->bytesize || bsize != config->blksize)
dcbddf54 1809 return nbd_set_size(nbd, bytes, bsize);
4ddeaae8
MC
1810 return 0;
1811}
1812
e46c7287
JB
1813static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
1814{
8454d685 1815 DECLARE_COMPLETION_ONSTACK(destroy_complete);
e46c7287
JB
1816 struct nbd_device *nbd = NULL;
1817 struct nbd_config *config;
1818 int index = -1;
1819 int ret;
a2c97909 1820 bool put_dev = false;
e46c7287
JB
1821
1822 if (!netlink_capable(skb, CAP_SYS_ADMIN))
1823 return -EPERM;
1824
1825 if (info->attrs[NBD_ATTR_INDEX])
1826 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1827 if (!info->attrs[NBD_ATTR_SOCKETS]) {
1828 printk(KERN_ERR "nbd: must specify at least one socket\n");
1829 return -EINVAL;
1830 }
1831 if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
1832 printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
1833 return -EINVAL;
1834 }
1835again:
1836 mutex_lock(&nbd_index_mutex);
1837 if (index == -1) {
1838 ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
1839 if (ret == 0) {
1840 int new_index;
1841 new_index = nbd_dev_add(-1);
1842 if (new_index < 0) {
1843 mutex_unlock(&nbd_index_mutex);
1844 printk(KERN_ERR "nbd: failed to add new device\n");
0979962f 1845 return new_index;
e46c7287
JB
1846 }
1847 nbd = idr_find(&nbd_index_idr, new_index);
1848 }
1849 } else {
1850 nbd = idr_find(&nbd_index_idr, index);
e6a76272
JB
1851 if (!nbd) {
1852 ret = nbd_dev_add(index);
1853 if (ret < 0) {
1854 mutex_unlock(&nbd_index_mutex);
1855 printk(KERN_ERR "nbd: failed to add new device\n");
1856 return ret;
1857 }
1858 nbd = idr_find(&nbd_index_idr, index);
1859 }
e46c7287 1860 }
e46c7287
JB
1861 if (!nbd) {
1862 printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1863 index);
c6a4759e
JB
1864 mutex_unlock(&nbd_index_mutex);
1865 return -EINVAL;
1866 }
8454d685
XL
1867
1868 if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
1869 test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
1870 nbd->destroy_complete = &destroy_complete;
1871 mutex_unlock(&nbd_index_mutex);
1872
1873 /* Wait until the nbd device is totally destroyed */
1874 wait_for_completion(&destroy_complete);
1875 goto again;
1876 }
1877
c6a4759e
JB
1878 if (!refcount_inc_not_zero(&nbd->refs)) {
1879 mutex_unlock(&nbd_index_mutex);
1880 if (index == -1)
1881 goto again;
1882 printk(KERN_ERR "nbd: device at index %d is going down\n",
1883 index);
e46c7287
JB
1884 return -EINVAL;
1885 }
c6a4759e 1886 mutex_unlock(&nbd_index_mutex);
e46c7287
JB
1887
1888 mutex_lock(&nbd->config_lock);
1889 if (refcount_read(&nbd->config_refs)) {
1890 mutex_unlock(&nbd->config_lock);
c6a4759e 1891 nbd_put(nbd);
e46c7287
JB
1892 if (index == -1)
1893 goto again;
1894 printk(KERN_ERR "nbd: nbd%d already in use\n", index);
1895 return -EBUSY;
1896 }
1897 if (WARN_ON(nbd->config)) {
1898 mutex_unlock(&nbd->config_lock);
c6a4759e 1899 nbd_put(nbd);
e46c7287
JB
1900 return -EINVAL;
1901 }
1902 config = nbd->config = nbd_alloc_config();
1903 if (!nbd->config) {
1904 mutex_unlock(&nbd->config_lock);
c6a4759e 1905 nbd_put(nbd);
e46c7287
JB
1906 printk(KERN_ERR "nbd: couldn't allocate config\n");
1907 return -ENOMEM;
1908 }
1909 refcount_set(&nbd->config_refs, 1);
ec76a7b9 1910 set_bit(NBD_RT_BOUND, &config->runtime_flags);
e46c7287 1911
4ddeaae8
MC
1912 ret = nbd_genl_size_set(info, nbd);
1913 if (ret)
1914 goto out;
1915
55313e92
MC
1916 if (info->attrs[NBD_ATTR_TIMEOUT])
1917 nbd_set_cmd_timeout(nbd,
1918 nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
560bc4b3
JB
1919 if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1920 config->dead_conn_timeout =
1921 nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1922 config->dead_conn_timeout *= HZ;
1923 }
e46c7287
JB
1924 if (info->attrs[NBD_ATTR_SERVER_FLAGS])
1925 config->flags =
1926 nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
a2c97909
JB
1927 if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1928 u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1929 if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
50317b9a
JB
1930 /*
1931 * We have 1 ref to keep the device around, and then 1
1932 * ref for our current operation here, which will be
1933 * inherited by the config. If we already have
1934 * DESTROY_ON_DISCONNECT set then we know we don't have
1935 * that extra ref already held so we don't need the
1936 * put_dev.
1937 */
1938 if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
1939 &nbd->flags))
1940 put_dev = true;
8454d685 1941 } else {
50317b9a
JB
1942 if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
1943 &nbd->flags))
1944 refcount_inc(&nbd->refs);
a2c97909 1945 }
08ba91ee 1946 if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
ec76a7b9 1947 set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
08ba91ee
DRK
1948 &config->runtime_flags);
1949 }
a2c97909
JB
1950 }
1951
e46c7287
JB
1952 if (info->attrs[NBD_ATTR_SOCKETS]) {
1953 struct nlattr *attr;
1954 int rem, fd;
1955
1956 nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1957 rem) {
1958 struct nlattr *socks[NBD_SOCK_MAX+1];
1959
1960 if (nla_type(attr) != NBD_SOCK_ITEM) {
1961 printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1962 ret = -EINVAL;
1963 goto out;
1964 }
8cb08174
JB
1965 ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
1966 attr,
1967 nbd_sock_policy,
1968 info->extack);
e46c7287
JB
1969 if (ret != 0) {
1970 printk(KERN_ERR "nbd: error processing sock list\n");
1971 ret = -EINVAL;
1972 goto out;
1973 }
1974 if (!socks[NBD_SOCK_FD])
1975 continue;
1976 fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1977 ret = nbd_add_socket(nbd, fd, true);
1978 if (ret)
1979 goto out;
1980 }
1981 }
1982 ret = nbd_start_device(nbd);
1983out:
1984 mutex_unlock(&nbd->config_lock);
1985 if (!ret) {
ec76a7b9 1986 set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
e46c7287
JB
1987 refcount_inc(&nbd->config_refs);
1988 nbd_connect_reply(info, nbd->index);
1989 }
1990 nbd_config_put(nbd);
a2c97909
JB
1991 if (put_dev)
1992 nbd_put(nbd);
e46c7287
JB
1993 return ret;
1994}
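
/*
 * For illustration only, not part of the driver: a minimal userspace
 * sketch of the NBD_CMD_CONNECT message the function above parses,
 * using libnl-3/libnl-genl-3 (an assumption; nbd-client carries its
 * own netlink code).  "sock_fd" stands for a TCP socket that has
 * already completed the NBD handshake with a server; error handling
 * is elided.
 *
 *	#include <stdint.h>
 *	#include <netlink/netlink.h>
 *	#include <netlink/genl/genl.h>
 *	#include <netlink/genl/ctrl.h>
 *	#include <linux/nbd-netlink.h>
 *
 *	static int nbd_connect_sketch(int sock_fd, uint64_t size_bytes)
 *	{
 *		struct nl_sock *sk = nl_socket_alloc();
 *		struct nl_msg *msg = nlmsg_alloc();
 *		struct nlattr *socks, *item;
 *		int family, ret;
 *
 *		genl_connect(sk);
 *		family = genl_ctrl_resolve(sk, NBD_GENL_FAMILY_NAME);
 *
 *		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
 *			    NBD_CMD_CONNECT, 0);
 *		nla_put_u32(msg, NBD_ATTR_INDEX, 0);	// request /dev/nbd0
 *		nla_put_u64(msg, NBD_ATTR_SIZE_BYTES, size_bytes);
 *		nla_put_u64(msg, NBD_ATTR_BLOCK_SIZE_BYTES, 4096);
 *
 *		socks = nla_nest_start(msg, NBD_ATTR_SOCKETS);
 *		item = nla_nest_start(msg, NBD_SOCK_ITEM);
 *		nla_put_u32(msg, NBD_SOCK_FD, sock_fd);
 *		nla_nest_end(msg, item);
 *		nla_nest_end(msg, socks);
 *
 *		ret = nl_send_sync(sk, msg);	// sends and frees msg
 *		nl_socket_free(sk);
 *		return ret;
 *	}
 */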
1995
08ba91ee
DRK
1996static void nbd_disconnect_and_put(struct nbd_device *nbd)
1997{
1998 mutex_lock(&nbd->config_lock);
1999 nbd_disconnect(nbd);
6bc5cc7c 2000 sock_shutdown(nbd);
e9e006f5
MC
2001 /*
2002 * Make sure recv thread has finished, so it does not drop the last
2003 * config ref and try to destroy the workqueue from inside the work
6bc5cc7c
XY
2004 * queue. And this also ensures that we can safely call nbd_clear_que()
2005 * to cancel the in-flight I/Os.
e9e006f5 2006 */
669a82e8
SK
2007 if (nbd->recv_workq)
2008 flush_workqueue(nbd->recv_workq);
6bc5cc7c
XY
2009 nbd_clear_que(nbd);
2010 nbd->task_setup = NULL;
2011 mutex_unlock(&nbd->config_lock);
2012
ec76a7b9 2013 if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
08ba91ee
DRK
2014 &nbd->config->runtime_flags))
2015 nbd_config_put(nbd);
2016}
2017
e46c7287
JB
2018static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
2019{
2020 struct nbd_device *nbd;
2021 int index;
2022
2023 if (!netlink_capable(skb, CAP_SYS_ADMIN))
2024 return -EPERM;
2025
2026 if (!info->attrs[NBD_ATTR_INDEX]) {
2027 printk(KERN_ERR "nbd: must specify an index to disconnect\n");
2028 return -EINVAL;
2029 }
2030 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2031 mutex_lock(&nbd_index_mutex);
2032 nbd = idr_find(&nbd_index_idr, index);
e46c7287 2033 if (!nbd) {
c6a4759e 2034 mutex_unlock(&nbd_index_mutex);
e46c7287
JB
2035 printk(KERN_ERR "nbd: couldn't find device at index %d\n",
2036 index);
2037 return -EINVAL;
2038 }
c6a4759e
JB
2039 if (!refcount_inc_not_zero(&nbd->refs)) {
2040 mutex_unlock(&nbd_index_mutex);
2041 printk(KERN_ERR "nbd: device at index %d is going down\n",
2042 index);
2043 return -EINVAL;
2044 }
2045 mutex_unlock(&nbd_index_mutex);
2046 if (!refcount_inc_not_zero(&nbd->config_refs)) {
2047 nbd_put(nbd);
e46c7287 2048 return 0;
c6a4759e 2049 }
08ba91ee 2050 nbd_disconnect_and_put(nbd);
e46c7287 2051 nbd_config_put(nbd);
c6a4759e 2052 nbd_put(nbd);
e46c7287
JB
2053 return 0;
2054}
2055
b7aa3d39
JB
2056static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
2057{
2058 struct nbd_device *nbd = NULL;
2059 struct nbd_config *config;
2060 int index;
08ba91ee 2061 int ret = 0;
a2c97909 2062 bool put_dev = false;
b7aa3d39
JB
2063
2064 if (!netlink_capable(skb, CAP_SYS_ADMIN))
2065 return -EPERM;
2066
2067 if (!info->attrs[NBD_ATTR_INDEX]) {
2068 printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
2069 return -EINVAL;
2070 }
2071 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2072 mutex_lock(&nbd_index_mutex);
2073 nbd = idr_find(&nbd_index_idr, index);
b7aa3d39 2074 if (!nbd) {
c6a4759e 2075 mutex_unlock(&nbd_index_mutex);
b7aa3d39
JB
2076 printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
2077 index);
2078 return -EINVAL;
2079 }
c6a4759e
JB
2080 if (!refcount_inc_not_zero(&nbd->refs)) {
2081 mutex_unlock(&nbd_index_mutex);
2082 printk(KERN_ERR "nbd: device at index %d is going down\n",
2083 index);
2084 return -EINVAL;
2085 }
2086 mutex_unlock(&nbd_index_mutex);
b7aa3d39
JB
2087
2088 if (!refcount_inc_not_zero(&nbd->config_refs)) {
2089 dev_err(nbd_to_dev(nbd),
2090 "not configured, cannot reconfigure\n");
c6a4759e 2091 nbd_put(nbd);
b7aa3d39
JB
2092 return -EINVAL;
2093 }
2094
2095 mutex_lock(&nbd->config_lock);
2096 config = nbd->config;
ec76a7b9 2097 if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
b7aa3d39
JB
2098 !nbd->task_recv) {
2099 dev_err(nbd_to_dev(nbd),
2100 "not configured, cannot reconfigure\n");
08ba91ee 2101 ret = -EINVAL;
b7aa3d39
JB
2102 goto out;
2103 }
2104
4ddeaae8
MC
2105 ret = nbd_genl_size_set(info, nbd);
2106 if (ret)
2107 goto out;
2108
55313e92
MC
2109 if (info->attrs[NBD_ATTR_TIMEOUT])
2110 nbd_set_cmd_timeout(nbd,
2111 nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
560bc4b3
JB
2112 if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
2113 config->dead_conn_timeout =
2114 nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
2115 config->dead_conn_timeout *= HZ;
2116 }
a2c97909
JB
2117 if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
2118 u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
2119 if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
50317b9a
JB
2120 if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
2121 &nbd->flags))
a2c97909
JB
2122 put_dev = true;
2123 } else {
50317b9a
JB
2124 if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
2125 &nbd->flags))
a2c97909
JB
2126 refcount_inc(&nbd->refs);
2127 }
08ba91ee
DRK
2128
2129 if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
ec76a7b9 2130 set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
08ba91ee
DRK
2131 &config->runtime_flags);
2132 } else {
ec76a7b9 2133 clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
08ba91ee
DRK
2134 &config->runtime_flags);
2135 }
a2c97909 2136 }
b7aa3d39
JB
2137
2138 if (info->attrs[NBD_ATTR_SOCKETS]) {
2139 struct nlattr *attr;
2140 int rem, fd;
2141
2142 nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
2143 rem) {
2144 struct nlattr *socks[NBD_SOCK_MAX+1];
2145
2146 if (nla_type(attr) != NBD_SOCK_ITEM) {
2147 printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
2148 ret = -EINVAL;
2149 goto out;
2150 }
8cb08174
JB
2151 ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
2152 attr,
2153 nbd_sock_policy,
2154 info->extack);
b7aa3d39
JB
2155 if (ret != 0) {
2156 printk(KERN_ERR "nbd: error processing sock list\n");
2157 ret = -EINVAL;
2158 goto out;
2159 }
2160 if (!socks[NBD_SOCK_FD])
2161 continue;
2162 fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
2163 ret = nbd_reconnect_socket(nbd, fd);
2164 if (ret) {
2165 if (ret == -ENOSPC)
2166 ret = 0;
2167 goto out;
2168 }
2169 dev_info(nbd_to_dev(nbd), "reconnected socket\n");
2170 }
2171 }
2172out:
2173 mutex_unlock(&nbd->config_lock);
2174 nbd_config_put(nbd);
c6a4759e 2175 nbd_put(nbd);
a2c97909
JB
2176 if (put_dev)
2177 nbd_put(nbd);
b7aa3d39
JB
2178 return ret;
2179}
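
/*
 * Note on the -ENOSPC handling above: nbd_reconnect_socket() returns
 * -ENOSPC when no existing connection is dead, i.e. there is no slot
 * for the replacement socket.  Reconfigure treats that as success so a
 * client may offer fresh sockets without first checking which links
 * have failed.
 */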
2180
66a9b928 2181static const struct genl_small_ops nbd_connect_genl_ops[] = {
e46c7287
JB
2182 {
2183 .cmd = NBD_CMD_CONNECT,
ef6243ac 2184 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
e46c7287
JB
2185 .doit = nbd_genl_connect,
2186 },
2187 {
2188 .cmd = NBD_CMD_DISCONNECT,
ef6243ac 2189 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
e46c7287
JB
2190 .doit = nbd_genl_disconnect,
2191 },
b7aa3d39
JB
2192 {
2193 .cmd = NBD_CMD_RECONFIGURE,
ef6243ac 2194 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
b7aa3d39
JB
2195 .doit = nbd_genl_reconfigure,
2196 },
47d902b9
JB
2197 {
2198 .cmd = NBD_CMD_STATUS,
ef6243ac 2199 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
47d902b9
JB
2200 .doit = nbd_genl_status,
2201 },
e46c7287
JB
2202};
2203
799f9a38
JB
2204static const struct genl_multicast_group nbd_mcast_grps[] = {
2205 { .name = NBD_GENL_MCAST_GROUP_NAME, },
2206};
2207
e46c7287
JB
2208static struct genl_family nbd_genl_family __ro_after_init = {
2209 .hdrsize = 0,
2210 .name = NBD_GENL_FAMILY_NAME,
2211 .version = NBD_GENL_VERSION,
2212 .module = THIS_MODULE,
66a9b928
JK
2213 .small_ops = nbd_connect_genl_ops,
2214 .n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops),
e46c7287 2215 .maxattr = NBD_ATTR_MAX,
3b0f31f2 2216 .policy = nbd_attr_policy,
799f9a38
JB
2217 .mcgrps = nbd_mcast_grps,
2218 .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps),
e46c7287
JB
2219};
2220
47d902b9
JB
2221static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
2222{
2223 struct nlattr *dev_opt;
2224 u8 connected = 0;
2225 int ret;
2226
2227 /* This is a little racy, but for status it's ok. The
2228 * reason we don't take a ref here is that we can't
2229 * take a ref in the index == -1 case, as we would need
2230 * to do the put under the nbd_index_mutex, which could
2231 * deadlock if we are configured to remove ourselves
2232 * once we're disconnected.
2233 */
2234 if (refcount_read(&nbd->config_refs))
2235 connected = 1;
ae0be8de 2236 dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM);
47d902b9
JB
2237 if (!dev_opt)
2238 return -EMSGSIZE;
2239 ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
2240 if (ret)
2241 return -EMSGSIZE;
2242 ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
2243 connected);
2244 if (ret)
2245 return -EMSGSIZE;
2246 nla_nest_end(reply, dev_opt);
2247 return 0;
2248}
2249
2250static int status_cb(int id, void *ptr, void *data)
2251{
2252 struct nbd_device *nbd = ptr;
2253 return populate_nbd_status(nbd, (struct sk_buff *)data);
2254}
2255
2256static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
2257{
2258 struct nlattr *dev_list;
2259 struct sk_buff *reply;
2260 void *reply_head;
2261 size_t msg_size;
2262 int index = -1;
2263 int ret = -ENOMEM;
2264
2265 if (info->attrs[NBD_ATTR_INDEX])
2266 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2267
2268 mutex_lock(&nbd_index_mutex);
2269
2270 msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
2271 nla_attr_size(sizeof(u8)));
2272 msg_size *= (index == -1) ? nbd_total_devices : 1;
2273
2274 reply = genlmsg_new(msg_size, GFP_KERNEL);
2275 if (!reply)
2276 goto out;
2277 reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
2278 NBD_CMD_STATUS);
2279 if (!reply_head) {
2280 nlmsg_free(reply);
2281 goto out;
2282 }
2283
ae0be8de 2284 dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
8608ba09
NE
2285 if (!dev_list) {
2286 nlmsg_free(reply);
2287 ret = -EMSGSIZE;
2288 goto out;
2289 }
2290
47d902b9
JB
2291 if (index == -1) {
2292 ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
2293 if (ret) {
2294 nlmsg_free(reply);
2295 goto out;
2296 }
2297 } else {
2298 struct nbd_device *nbd;
2299 nbd = idr_find(&nbd_index_idr, index);
2300 if (nbd) {
2301 ret = populate_nbd_status(nbd, reply);
2302 if (ret) {
2303 nlmsg_free(reply);
2304 goto out;
2305 }
2306 }
2307 }
2308 nla_nest_end(reply, dev_list);
2309 genlmsg_end(reply, reply_head);
cd46eb89 2310 ret = genlmsg_reply(reply, info);
47d902b9
JB
2311out:
2312 mutex_unlock(&nbd_index_mutex);
2313 return ret;
2314}
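
/*
 * Note for userspace (not in the original source): omitting
 * NBD_ATTR_INDEX from an NBD_CMD_STATUS request makes the idr walk
 * above report every device, so a single message is enough to list
 * the connected state of all nbd devices.
 */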
2315
e46c7287
JB
2316static void nbd_connect_reply(struct genl_info *info, int index)
2317{
2318 struct sk_buff *skb;
2319 void *msg_head;
2320 int ret;
2321
2322 skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2323 if (!skb)
2324 return;
2325 msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
2326 NBD_CMD_CONNECT);
2327 if (!msg_head) {
2328 nlmsg_free(skb);
2329 return;
2330 }
2331 ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2332 if (ret) {
2333 nlmsg_free(skb);
2334 return;
2335 }
2336 genlmsg_end(skb, msg_head);
2337 genlmsg_reply(skb, info);
2338}
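
/*
 * The reply above is how a client that passed no NBD_ATTR_INDEX
 * (index == -1) learns which device it was given: the allocated index
 * comes back in NBD_ATTR_INDEX, i.e. /dev/nbd<index>.
 */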
1da177e4 2339
799f9a38
JB
2340static void nbd_mcast_index(int index)
2341{
2342 struct sk_buff *skb;
2343 void *msg_head;
2344 int ret;
2345
2346 skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2347 if (!skb)
2348 return;
2349 msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
2350 NBD_CMD_LINK_DEAD);
2351 if (!msg_head) {
2352 nlmsg_free(skb);
2353 return;
2354 }
2355 ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2356 if (ret) {
2357 nlmsg_free(skb);
2358 return;
2359 }
2360 genlmsg_end(skb, msg_head);
2361 genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
2362}
2363
2364static void nbd_dead_link_work(struct work_struct *work)
2365{
2366 struct link_dead_args *args = container_of(work, struct link_dead_args,
2367 work);
2368 nbd_mcast_index(args->index);
2369 kfree(args);
2370}
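
/*
 * Illustrative only: a userspace monitor that wants these
 * NBD_CMD_LINK_DEAD events joins the family's multicast group, roughly
 * (libnl-3, error handling and the receive callback elided):
 *
 *	int grp = genl_ctrl_resolve_grp(sk, NBD_GENL_FAMILY_NAME,
 *					NBD_GENL_MCAST_GROUP_NAME);
 *	nl_socket_add_membership(sk, grp);
 *	nl_recvmsgs_default(sk);	// dispatches NBD_ATTR_INDEX payloads
 */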
2371
1da177e4
LT
2372static int __init nbd_init(void)
2373{
1da177e4
LT
2374 int i;
2375
5b7b18cc 2376 BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
1da177e4 2377
d71a6d73 2378 if (max_part < 0) {
7742ce4a 2379 printk(KERN_ERR "nbd: max_part must be >= 0\n");
d71a6d73
LV
2380 return -EINVAL;
2381 }
2382
2383 part_shift = 0;
5988ce23 2384 if (max_part > 0) {
d71a6d73
LV
2385 part_shift = fls(max_part);
2386
5988ce23
NK
2387 /*
2388 * Adjust max_part according to part_shift as it is exported
2389 * to user space so that users can know the max number of
2390 * partitions the kernel is able to manage.
2391 *
2392 * Note that -1 is required because partition 0 is reserved
2393 * for the whole disk.
2394 */
2395 max_part = (1UL << part_shift) - 1;
2396 }
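	/*
	 * Worked example (illustrative): with the default max_part = 16,
	 * part_shift = fls(16) = 5 and max_part becomes (1 << 5) - 1 = 31,
	 * i.e. 32 minors per device: the whole disk plus up to 31
	 * partitions, with nbd<i> starting at minor i << 5.
	 */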
2397
3b271082
NK
2398 if ((1UL << part_shift) > DISK_MAX_PARTS)
2399 return -EINVAL;
2400
2401 if (nbds_max > 1UL << (MINORBITS - part_shift))
2402 return -EINVAL;
2403
e9e006f5 2404 if (register_blkdev(NBD_MAJOR, "nbd"))
b0d9111a 2405 return -EIO;
1da177e4 2406
e46c7287
JB
2407 if (genl_register_family(&nbd_genl_family)) {
2408 unregister_blkdev(NBD_MAJOR, "nbd");
e46c7287
JB
2409 return -EINVAL;
2410 }
30d53d9c
MP
2411 nbd_dbg_init();
2412
b0d9111a
JB
2413 mutex_lock(&nbd_index_mutex);
2414 for (i = 0; i < nbds_max; i++)
2415 nbd_dev_add(i);
2416 mutex_unlock(&nbd_index_mutex);
2417 return 0;
2418}
1da177e4 2419
b0d9111a
JB
2420static int nbd_exit_cb(int id, void *ptr, void *data)
2421{
c6a4759e 2422 struct list_head *list = (struct list_head *)data;
b0d9111a 2423 struct nbd_device *nbd = ptr;
c6a4759e 2424
c6a4759e 2425 list_add_tail(&nbd->list, list);
1da177e4 2426 return 0;
1da177e4
LT
2427}
2428
2429static void __exit nbd_cleanup(void)
2430{
c6a4759e
JB
2431 struct nbd_device *nbd;
2432 LIST_HEAD(del_list);
2433
30d53d9c
MP
2434 nbd_dbg_close();
2435
c6a4759e
JB
2436 mutex_lock(&nbd_index_mutex);
2437 idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
2438 mutex_unlock(&nbd_index_mutex);
2439
60ae36ad
JB
2440 while (!list_empty(&del_list)) {
2441 nbd = list_first_entry(&del_list, struct nbd_device, list);
2442 list_del_init(&nbd->list);
2443 if (refcount_read(&nbd->refs) != 1)
c6a4759e
JB
2444 printk(KERN_ERR "nbd: possibly leaking a device\n");
2445 nbd_put(nbd);
c6a4759e
JB
2446 }
2447
b0d9111a 2448 idr_destroy(&nbd_index_idr);
e46c7287 2449 genl_unregister_family(&nbd_genl_family);
1da177e4 2450 unregister_blkdev(NBD_MAJOR, "nbd");
1da177e4
LT
2451}
2452
2453module_init(nbd_init);
2454module_exit(nbd_cleanup);
2455
2456MODULE_DESCRIPTION("Network Block Device");
2457MODULE_LICENSE("GPL");
2458
40be0c28 2459module_param(nbds_max, int, 0444);
d71a6d73
LV
2460MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
2461module_param(max_part, int, 0444);
7a8362a0 2462MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
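
/*
 * Illustrative usage (hostname and export name are placeholders, and
 * exact nbd-client flags depend on the userspace tooling version):
 *
 *	modprobe nbd nbds_max=4 max_part=8
 *	nbd-client -N export server.example.com /dev/nbd0
 */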