1 /*
2 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License version
6 * 2 as published by the Free Software Foundation.
7 *
8 * You should have received a copy of the GNU General Public License
9 * along with this program. If not, see <http://www.gnu.org/licenses/>.
10 *
11 * Contributions after 2012-01-13 are licensed under the terms of the
12 * GNU GPL, version 2 or (at your option) any later version.
13 */
14
15 #include "qemu/osdep.h"
16 #include "qapi/error.h"
17 #include "qapi/qmp/qdict.h"
18 #include "qapi/qmp/qint.h"
19 #include "qemu/uri.h"
20 #include "qemu/error-report.h"
21 #include "qemu/sockets.h"
22 #include "block/block_int.h"
23 #include "sysemu/block-backend.h"
24 #include "qemu/bitops.h"
25 #include "qemu/cutils.h"
26
27 #define SD_PROTO_VER 0x01
28
29 #define SD_DEFAULT_ADDR "localhost"
30 #define SD_DEFAULT_PORT 7000
31
32 #define SD_OP_CREATE_AND_WRITE_OBJ 0x01
33 #define SD_OP_READ_OBJ 0x02
34 #define SD_OP_WRITE_OBJ 0x03
35 /* 0x04 is used internally by Sheepdog */
36
37 #define SD_OP_NEW_VDI 0x11
38 #define SD_OP_LOCK_VDI 0x12
39 #define SD_OP_RELEASE_VDI 0x13
40 #define SD_OP_GET_VDI_INFO 0x14
41 #define SD_OP_READ_VDIS 0x15
42 #define SD_OP_FLUSH_VDI 0x16
43 #define SD_OP_DEL_VDI 0x17
44 #define SD_OP_GET_CLUSTER_DEFAULT 0x18
45
46 #define SD_FLAG_CMD_WRITE 0x01
47 #define SD_FLAG_CMD_COW 0x02
48 #define SD_FLAG_CMD_CACHE 0x04 /* Writeback mode for cache */
49 #define SD_FLAG_CMD_DIRECT 0x08 /* Don't use cache */
50
51 #define SD_RES_SUCCESS 0x00 /* Success */
52 #define SD_RES_UNKNOWN 0x01 /* Unknown error */
53 #define SD_RES_NO_OBJ 0x02 /* No object found */
54 #define SD_RES_EIO 0x03 /* I/O error */
55 #define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
56 #define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
57 #define SD_RES_SYSTEM_ERROR 0x06 /* System error */
58 #define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */
59 #define SD_RES_NO_VDI 0x08 /* No vdi found */
60 #define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */
61 #define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */
62 #define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */
63 #define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
64 #define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */
65 #define SD_RES_NO_TAG 0x0E /* Requested tag is not found */
66 #define SD_RES_STARTUP 0x0F /* Sheepdog is starting up */
67 #define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */
68 #define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */
69 #define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */
70 #define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */
71 #define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */
72 #define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */
73 #define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */
74 #define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes to join */
75 #define SD_RES_JOIN_FAILED 0x18 /* Target node failed to join sheepdog */
76 #define SD_RES_HALT 0x19 /* Sheepdog has stopped serving IO requests */
77 #define SD_RES_READONLY 0x1A /* Object is read-only */
78
79 /*
80 * Object ID rules
81 *
82 * 0 - 19 (20 bits): data object space
83 * 20 - 31 (12 bits): reserved data object space
84 * 32 - 55 (24 bits): vdi object space
85 * 56 - 59 ( 4 bits): reserved vdi object space
86 * 60 - 63 ( 4 bits): object type identifier space
87 */
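/*
 * Worked example (illustrative only, using the helpers defined further
 * below): for vid = 0x12 and data object index idx = 7,
 *
 *   vid_to_data_oid(0x12, 7)    == 0x0000001200000007
 *   vid_to_vdi_oid(0x12)        == 0x8000001200000000 (VDI_BIT set)
 *   vid_to_vmstate_oid(0x12, 7) == 0x4000001200000007 (VMSTATE_BIT set)
 *
 * i.e. bits 60-63 select the object type, bits 32-55 carry the vdi id,
 * and the low bits carry the index within the vdi.
 */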
88
89 #define VDI_SPACE_SHIFT 32
90 #define VDI_BIT (UINT64_C(1) << 63)
91 #define VMSTATE_BIT (UINT64_C(1) << 62)
92 #define MAX_DATA_OBJS (UINT64_C(1) << 20)
93 #define MAX_CHILDREN 1024
94 #define SD_MAX_VDI_LEN 256
95 #define SD_MAX_VDI_TAG_LEN 256
96 #define SD_NR_VDIS (1U << 24)
97 #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
98 #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
99 #define SD_DEFAULT_BLOCK_SIZE_SHIFT 22
100 /*
101 * For erasure coding, we use at most SD_EC_MAX_STRIP data strips and
102 * (SD_EC_MAX_STRIP - 1) parity strips.
103 *
104 * SD_MAX_COPIES is the sum of the number of data strips and parity strips.
105 */
106 #define SD_EC_MAX_STRIP 16
107 #define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1)
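/*
 * With SD_EC_MAX_STRIP == 16 this works out to 31, i.e. at most 16 data
 * strips plus 15 parity strips; parse_redundancy() further below also
 * uses it as the cap on the full-replication copy count.
 */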
108
109 #define SD_INODE_SIZE (sizeof(SheepdogInode))
110 #define CURRENT_VDI_ID 0
111
112 #define LOCK_TYPE_NORMAL 0
113 #define LOCK_TYPE_SHARED 1 /* for iSCSI multipath */
114
115 typedef struct SheepdogReq {
116 uint8_t proto_ver;
117 uint8_t opcode;
118 uint16_t flags;
119 uint32_t epoch;
120 uint32_t id;
121 uint32_t data_length;
122 uint32_t opcode_specific[8];
123 } SheepdogReq;
124
125 typedef struct SheepdogRsp {
126 uint8_t proto_ver;
127 uint8_t opcode;
128 uint16_t flags;
129 uint32_t epoch;
130 uint32_t id;
131 uint32_t data_length;
132 uint32_t result;
133 uint32_t opcode_specific[7];
134 } SheepdogRsp;
135
136 typedef struct SheepdogObjReq {
137 uint8_t proto_ver;
138 uint8_t opcode;
139 uint16_t flags;
140 uint32_t epoch;
141 uint32_t id;
142 uint32_t data_length;
143 uint64_t oid;
144 uint64_t cow_oid;
145 uint8_t copies;
146 uint8_t copy_policy;
147 uint8_t reserved[6];
148 uint64_t offset;
149 } SheepdogObjReq;
150
151 typedef struct SheepdogObjRsp {
152 uint8_t proto_ver;
153 uint8_t opcode;
154 uint16_t flags;
155 uint32_t epoch;
156 uint32_t id;
157 uint32_t data_length;
158 uint32_t result;
159 uint8_t copies;
160 uint8_t copy_policy;
161 uint8_t reserved[2];
162 uint32_t pad[6];
163 } SheepdogObjRsp;
164
165 typedef struct SheepdogVdiReq {
166 uint8_t proto_ver;
167 uint8_t opcode;
168 uint16_t flags;
169 uint32_t epoch;
170 uint32_t id;
171 uint32_t data_length;
172 uint64_t vdi_size;
173 uint32_t base_vdi_id;
174 uint8_t copies;
175 uint8_t copy_policy;
176 uint8_t store_policy;
177 uint8_t block_size_shift;
178 uint32_t snapid;
179 uint32_t type;
180 uint32_t pad[2];
181 } SheepdogVdiReq;
182
183 typedef struct SheepdogVdiRsp {
184 uint8_t proto_ver;
185 uint8_t opcode;
186 uint16_t flags;
187 uint32_t epoch;
188 uint32_t id;
189 uint32_t data_length;
190 uint32_t result;
191 uint32_t rsvd;
192 uint32_t vdi_id;
193 uint32_t pad[5];
194 } SheepdogVdiRsp;
195
196 typedef struct SheepdogClusterRsp {
197 uint8_t proto_ver;
198 uint8_t opcode;
199 uint16_t flags;
200 uint32_t epoch;
201 uint32_t id;
202 uint32_t data_length;
203 uint32_t result;
204 uint8_t nr_copies;
205 uint8_t copy_policy;
206 uint8_t block_size_shift;
207 uint8_t __pad1;
208 uint32_t __pad2[6];
209 } SheepdogClusterRsp;
210
211 typedef struct SheepdogInode {
212 char name[SD_MAX_VDI_LEN];
213 char tag[SD_MAX_VDI_TAG_LEN];
214 uint64_t ctime;
215 uint64_t snap_ctime;
216 uint64_t vm_clock_nsec;
217 uint64_t vdi_size;
218 uint64_t vm_state_size;
219 uint16_t copy_policy;
220 uint8_t nr_copies;
221 uint8_t block_size_shift;
222 uint32_t snap_id;
223 uint32_t vdi_id;
224 uint32_t parent_vdi_id;
225 uint32_t child_vdi_id[MAX_CHILDREN];
226 uint32_t data_vdi_id[MAX_DATA_OBJS];
227 } SheepdogInode;
228
229 #define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id)
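/*
 * Rough size sketch (assuming 4-byte array entries): data_vdi_id holds
 * MAX_DATA_OBJS == 2^20 entries, i.e. 4 MiB, so SD_INODE_SIZE is
 * SD_INODE_HEADER_SIZE plus 4 MiB, while callers such as reload_inode()
 * fetch only the header part.
 */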
230
231 /*
232 * 64 bit FNV-1a non-zero initial basis
233 */
234 #define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
235
236 /*
237 * 64 bit Fowler/Noll/Vo FNV-1a hash code
238 */
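/*
 * Note: the shift-and-add sequence in fnv_64a_buf() below is equivalent
 * to multiplying hval by the 64-bit FNV prime 0x100000001b3, since
 * 1 + 2^1 + 2^4 + 2^5 + 2^7 + 2^8 + 2^40 == 0x100000001b3.
 */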
239 static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
240 {
241 unsigned char *bp = buf;
242 unsigned char *be = bp + len;
243 while (bp < be) {
244 hval ^= (uint64_t) *bp++;
245 hval += (hval << 1) + (hval << 4) + (hval << 5) +
246 (hval << 7) + (hval << 8) + (hval << 40);
247 }
248 return hval;
249 }
250
251 static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
252 {
253 return inode->vdi_id == inode->data_vdi_id[idx];
254 }
255
256 static inline bool is_data_obj(uint64_t oid)
257 {
258 return !(VDI_BIT & oid);
259 }
260
261 static inline uint64_t data_oid_to_idx(uint64_t oid)
262 {
263 return oid & (MAX_DATA_OBJS - 1);
264 }
265
266 static inline uint32_t oid_to_vid(uint64_t oid)
267 {
268 return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
269 }
270
271 static inline uint64_t vid_to_vdi_oid(uint32_t vid)
272 {
273 return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
274 }
275
276 static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
277 {
278 return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
279 }
280
281 static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
282 {
283 return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
284 }
285
286 static inline bool is_snapshot(struct SheepdogInode *inode)
287 {
288 return !!inode->snap_ctime;
289 }
290
291 static inline size_t count_data_objs(const struct SheepdogInode *inode)
292 {
293 return DIV_ROUND_UP(inode->vdi_size,
294 (1UL << inode->block_size_shift));
295 }
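/*
 * Example (illustrative): with the default block_size_shift of 22
 * (4 MiB objects), a 4 GiB vdi_size gives
 * count_data_objs() == DIV_ROUND_UP(4 GiB, 4 MiB) == 1024.
 */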
296
297 #undef DPRINTF
298 #ifdef DEBUG_SDOG
299 #define DEBUG_SDOG_PRINT 1
300 #else
301 #define DEBUG_SDOG_PRINT 0
302 #endif
303 #define DPRINTF(fmt, args...) \
304 do { \
305 if (DEBUG_SDOG_PRINT) { \
306 fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
307 } \
308 } while (0)
309
310 typedef struct SheepdogAIOCB SheepdogAIOCB;
311 typedef struct BDRVSheepdogState BDRVSheepdogState;
312
313 typedef struct AIOReq {
314 SheepdogAIOCB *aiocb;
315 unsigned int iov_offset;
316
317 uint64_t oid;
318 uint64_t base_oid;
319 uint64_t offset;
320 unsigned int data_len;
321 uint8_t flags;
322 uint32_t id;
323 bool create;
324
325 QLIST_ENTRY(AIOReq) aio_siblings;
326 } AIOReq;
327
328 enum AIOCBState {
329 AIOCB_WRITE_UDATA,
330 AIOCB_READ_UDATA,
331 AIOCB_FLUSH_CACHE,
332 AIOCB_DISCARD_OBJ,
333 };
334
335 #define AIOCBOverlapping(x, y) \
336 (!(x->max_affect_data_idx < y->min_affect_data_idx \
337 || y->max_affect_data_idx < x->min_affect_data_idx))
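/*
 * i.e. two AIOCBs overlap iff their [min_affect_data_idx,
 * max_affect_data_idx] ranges intersect: for example [2,5] and [5,9]
 * overlap, while [2,5] and [6,9] do not.
 */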
338
339 struct SheepdogAIOCB {
340 BDRVSheepdogState *s;
341
342 QEMUIOVector *qiov;
343
344 int64_t sector_num;
345 int nb_sectors;
346
347 int ret;
348 enum AIOCBState aiocb_type;
349
350 Coroutine *coroutine;
351 int nr_pending;
352
353 uint32_t min_affect_data_idx;
354 uint32_t max_affect_data_idx;
355
356 /*
357 * The difference between affect_data_idx and dirty_data_idx:
358 * affect_data_idx represents range of index of all request types.
359 * dirty_data_idx represents range of index updated by COW requests.
360 * dirty_data_idx is used for updating an inode object.
361 */
362 uint32_t min_dirty_data_idx;
363 uint32_t max_dirty_data_idx;
364
365 QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
366 };
367
368 struct BDRVSheepdogState {
369 BlockDriverState *bs;
370 AioContext *aio_context;
371
372 SheepdogInode inode;
373
374 char name[SD_MAX_VDI_LEN];
375 bool is_snapshot;
376 uint32_t cache_flags;
377 bool discard_supported;
378
379 SocketAddress *addr;
380 int fd;
381
382 CoMutex lock;
383 Coroutine *co_send;
384 Coroutine *co_recv;
385
386 uint32_t aioreq_seq_num;
387
388 /* Every aio request must be linked to either of these queues. */
389 QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
390 QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
391
392 CoQueue overlapping_queue;
393 QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
394 };
395
396 typedef struct BDRVSheepdogReopenState {
397 int fd;
398 int cache_flags;
399 } BDRVSheepdogReopenState;
400
401 static const char *sd_strerror(int err)
402 {
403 int i;
404
405 static const struct {
406 int err;
407 const char *desc;
408 } errors[] = {
409 {SD_RES_SUCCESS, "Success"},
410 {SD_RES_UNKNOWN, "Unknown error"},
411 {SD_RES_NO_OBJ, "No object found"},
412 {SD_RES_EIO, "I/O error"},
413 {SD_RES_VDI_EXIST, "VDI exists already"},
414 {SD_RES_INVALID_PARMS, "Invalid parameters"},
415 {SD_RES_SYSTEM_ERROR, "System error"},
416 {SD_RES_VDI_LOCKED, "VDI is already locked"},
417 {SD_RES_NO_VDI, "No vdi found"},
418 {SD_RES_NO_BASE_VDI, "No base VDI found"},
419 {SD_RES_VDI_READ, "Failed to read the requested VDI"},
420 {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
421 {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
422 {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
423 {SD_RES_NO_TAG, "Failed to find the requested tag"},
424 {SD_RES_STARTUP, "The system is still booting"},
425 {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
426 {SD_RES_SHUTDOWN, "The system is shutting down"},
427 {SD_RES_NO_MEM, "Out of memory on the server"},
428 {SD_RES_FULL_VDI, "We already have the maximum vdis"},
429 {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
430 {SD_RES_NO_SPACE, "Server has no space for new objects"},
431 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
432 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes to join"},
433 {SD_RES_JOIN_FAILED, "Target node failed to join sheepdog"},
434 {SD_RES_HALT, "Sheepdog has stopped serving IO requests"},
435 {SD_RES_READONLY, "Object is read-only"},
436 };
437
438 for (i = 0; i < ARRAY_SIZE(errors); ++i) {
439 if (errors[i].err == err) {
440 return errors[i].desc;
441 }
442 }
443
444 return "Invalid error code";
445 }
446
447 /*
448 * Sheepdog I/O handling:
449 *
450 * 1. In sd_co_rw_vector, we send the I/O requests to the server and
451 * link the requests to the inflight_list in the
452 * BDRVSheepdogState. The function yields while waiting for
453 * the responses to arrive.
454 *
455 * 2. We receive the response in aio_read_response, the fd handler to
456 * the sheepdog connection. We switch back to sd_co_readv/sd_co_writev
457 * after all the requests belonging to the AIOCB are finished. If
458 * needed, sd_co_writev will send additional requests for the vdi object.
459 */
460
461 static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
462 uint64_t oid, unsigned int data_len,
463 uint64_t offset, uint8_t flags, bool create,
464 uint64_t base_oid, unsigned int iov_offset)
465 {
466 AIOReq *aio_req;
467
468 aio_req = g_malloc(sizeof(*aio_req));
469 aio_req->aiocb = acb;
470 aio_req->iov_offset = iov_offset;
471 aio_req->oid = oid;
472 aio_req->base_oid = base_oid;
473 aio_req->offset = offset;
474 aio_req->data_len = data_len;
475 aio_req->flags = flags;
476 aio_req->id = s->aioreq_seq_num++;
477 aio_req->create = create;
478
479 acb->nr_pending++;
480 return aio_req;
481 }
482
483 static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
484 {
485 SheepdogAIOCB *cb;
486
487 retry:
488 QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
489 if (AIOCBOverlapping(acb, cb)) {
490 qemu_co_queue_wait(&s->overlapping_queue, NULL);
491 goto retry;
492 }
493 }
494 }
495
496 static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
497 QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
498 int type)
499 {
500 uint32_t object_size;
501
502 object_size = (UINT32_C(1) << s->inode.block_size_shift);
503
504 acb->s = s;
505
506 acb->qiov = qiov;
507
508 acb->sector_num = sector_num;
509 acb->nb_sectors = nb_sectors;
510
511 acb->coroutine = qemu_coroutine_self();
512 acb->ret = 0;
513 acb->nr_pending = 0;
514
515 acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
516 acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
517 acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;
518
519 acb->min_dirty_data_idx = UINT32_MAX;
520 acb->max_dirty_data_idx = 0;
521 acb->aiocb_type = type;
522
523 if (type == AIOCB_FLUSH_CACHE) {
524 return;
525 }
526
527 wait_for_overlapping_aiocb(s, acb);
528 QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
529 }
530
531 static SocketAddress *sd_socket_address(const char *path,
532 const char *host, const char *port)
533 {
534 SocketAddress *addr = g_new0(SocketAddress, 1);
535
536 if (path) {
537 addr->type = SOCKET_ADDRESS_KIND_UNIX;
538 addr->u.q_unix.data = g_new0(UnixSocketAddress, 1);
539 addr->u.q_unix.data->path = g_strdup(path);
540 } else {
541 addr->type = SOCKET_ADDRESS_KIND_INET;
542 addr->u.inet.data = g_new0(InetSocketAddress, 1);
543 addr->u.inet.data->host = g_strdup(host ?: SD_DEFAULT_ADDR);
544 addr->u.inet.data->port = g_strdup(port ?: stringify(SD_DEFAULT_PORT));
545 }
546
547 return addr;
548 }
549
550 /* Return -EIO in case of error, file descriptor on success */
551 static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
552 {
553 int fd;
554
555 fd = socket_connect(s->addr, errp, NULL, NULL);
556
557 if (s->addr->type == SOCKET_ADDRESS_KIND_INET && fd >= 0) {
558 int ret = socket_set_nodelay(fd);
559 if (ret < 0) {
560 error_report("%s", strerror(errno));
561 }
562 }
563
564 if (fd >= 0) {
565 qemu_set_nonblock(fd);
566 } else {
567 fd = -EIO;
568 }
569
570 return fd;
571 }
572
573 /* Return 0 on success and -errno in case of error */
574 static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
575 unsigned int *wlen)
576 {
577 int ret;
578
579 ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
580 if (ret != sizeof(*hdr)) {
581 error_report("failed to send a req, %s", strerror(errno));
582 return -errno;
583 }
584
585 ret = qemu_co_send(sockfd, data, *wlen);
586 if (ret != *wlen) {
587 error_report("failed to send a req, %s", strerror(errno));
588 return -errno;
589 }
590
591 return ret;
592 }
593
594 typedef struct SheepdogReqCo {
595 int sockfd;
596 BlockDriverState *bs;
597 AioContext *aio_context;
598 SheepdogReq *hdr;
599 void *data;
600 unsigned int *wlen;
601 unsigned int *rlen;
602 int ret;
603 bool finished;
604 Coroutine *co;
605 } SheepdogReqCo;
606
607 static void restart_co_req(void *opaque)
608 {
609 SheepdogReqCo *srco = opaque;
610
611 aio_co_wake(srco->co);
612 }
613
614 static coroutine_fn void do_co_req(void *opaque)
615 {
616 int ret;
617 SheepdogReqCo *srco = opaque;
618 int sockfd = srco->sockfd;
619 SheepdogReq *hdr = srco->hdr;
620 void *data = srco->data;
621 unsigned int *wlen = srco->wlen;
622 unsigned int *rlen = srco->rlen;
623
624 srco->co = qemu_coroutine_self();
625 aio_set_fd_handler(srco->aio_context, sockfd, false,
626 NULL, restart_co_req, NULL, srco);
627
628 ret = send_co_req(sockfd, hdr, data, wlen);
629 if (ret < 0) {
630 goto out;
631 }
632
633 aio_set_fd_handler(srco->aio_context, sockfd, false,
634 restart_co_req, NULL, NULL, srco);
635
636 ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
637 if (ret != sizeof(*hdr)) {
638 error_report("failed to get a rsp, %s", strerror(errno));
639 ret = -errno;
640 goto out;
641 }
642
643 if (*rlen > hdr->data_length) {
644 *rlen = hdr->data_length;
645 }
646
647 if (*rlen) {
648 ret = qemu_co_recv(sockfd, data, *rlen);
649 if (ret != *rlen) {
650 error_report("failed to get the data, %s", strerror(errno));
651 ret = -errno;
652 goto out;
653 }
654 }
655 ret = 0;
656 out:
657 /* there is at most one request for this sockfd, so it is safe to
658 * set each handler to NULL. */
659 aio_set_fd_handler(srco->aio_context, sockfd, false,
660 NULL, NULL, NULL, NULL);
661
662 srco->co = NULL;
663 srco->ret = ret;
664 srco->finished = true;
665 if (srco->bs) {
666 bdrv_wakeup(srco->bs);
667 }
668 }
669
670 /*
671 * Send the request to the sheep in a synchronous manner.
672 *
673 * Return 0 on success, -errno in case of error.
674 */
675 static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
676 void *data, unsigned int *wlen, unsigned int *rlen)
677 {
678 Coroutine *co;
679 SheepdogReqCo srco = {
680 .sockfd = sockfd,
681 .aio_context = bs ? bdrv_get_aio_context(bs) : qemu_get_aio_context(),
682 .bs = bs,
683 .hdr = hdr,
684 .data = data,
685 .wlen = wlen,
686 .rlen = rlen,
687 .ret = 0,
688 .finished = false,
689 };
690
691 if (qemu_in_coroutine()) {
692 do_co_req(&srco);
693 } else {
694 co = qemu_coroutine_create(do_co_req, &srco);
695 if (bs) {
696 qemu_coroutine_enter(co);
697 BDRV_POLL_WHILE(bs, !srco.finished);
698 } else {
699 qemu_coroutine_enter(co);
700 while (!srco.finished) {
701 aio_poll(qemu_get_aio_context(), true);
702 }
703 }
704 }
705
706 return srco.ret;
707 }
708
709 static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
710 struct iovec *iov, int niov,
711 enum AIOCBState aiocb_type);
712 static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
713 static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
714 static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
715 static void co_write_request(void *opaque);
716
717 static coroutine_fn void reconnect_to_sdog(void *opaque)
718 {
719 BDRVSheepdogState *s = opaque;
720 AIOReq *aio_req, *next;
721
722 aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
723 NULL, NULL, NULL);
724 close(s->fd);
725 s->fd = -1;
726
727 /* Wait for outstanding write requests to be completed. */
728 while (s->co_send != NULL) {
729 co_write_request(opaque);
730 }
731
732 /* Try to reconnect to the sheepdog server every second. */
733 while (s->fd < 0) {
734 Error *local_err = NULL;
735 s->fd = get_sheep_fd(s, &local_err);
736 if (s->fd < 0) {
737 DPRINTF("Wait for connection to be established\n");
738 error_report_err(local_err);
739 co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME,
740 1000000000ULL);
741 }
742 }
743
744 /*
745 * Now we have to resend all the requests in the inflight queue. However,
746 * resend_aioreq() can yield and newly created requests can be added to the
747 * inflight queue before the coroutine is resumed. To avoid mixing them, we
748 * have to move all the inflight requests to the failed queue before
749 * resend_aioreq() is called.
750 */
751 QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) {
752 QLIST_REMOVE(aio_req, aio_siblings);
753 QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings);
754 }
755
756 /* Resend all the failed aio requests. */
757 while (!QLIST_EMPTY(&s->failed_aio_head)) {
758 aio_req = QLIST_FIRST(&s->failed_aio_head);
759 QLIST_REMOVE(aio_req, aio_siblings);
760 resend_aioreq(s, aio_req);
761 }
762 }
763
764 /*
765 * Receive responses of the I/O requests.
766 *
767 * This function is registered as a fd handler, and called from the
768 * main loop when s->fd is ready for reading responses.
769 */
770 static void coroutine_fn aio_read_response(void *opaque)
771 {
772 SheepdogObjRsp rsp;
773 BDRVSheepdogState *s = opaque;
774 int fd = s->fd;
775 int ret;
776 AIOReq *aio_req = NULL;
777 SheepdogAIOCB *acb;
778 uint64_t idx;
779
780 /* read a header */
781 ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
782 if (ret != sizeof(rsp)) {
783 error_report("failed to get the header, %s", strerror(errno));
784 goto err;
785 }
786
787 /* find the right aio_req from the inflight aio list */
788 QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
789 if (aio_req->id == rsp.id) {
790 break;
791 }
792 }
793 if (!aio_req) {
794 error_report("cannot find aio_req %x", rsp.id);
795 goto err;
796 }
797
798 acb = aio_req->aiocb;
799
800 switch (acb->aiocb_type) {
801 case AIOCB_WRITE_UDATA:
802 if (!is_data_obj(aio_req->oid)) {
803 break;
804 }
805 idx = data_oid_to_idx(aio_req->oid);
806
807 if (aio_req->create) {
808 /*
809 * If the object is a newly created one, we need to update
810 * the vdi object (metadata object). min_dirty_data_idx
811 * and max_dirty_data_idx are changed to include updated
812 * index between them.
813 */
814 if (rsp.result == SD_RES_SUCCESS) {
815 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
816 acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx);
817 acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx);
818 }
819 }
820 break;
821 case AIOCB_READ_UDATA:
822 ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
823 aio_req->iov_offset, rsp.data_length);
824 if (ret != rsp.data_length) {
825 error_report("failed to get the data, %s", strerror(errno));
826 goto err;
827 }
828 break;
829 case AIOCB_FLUSH_CACHE:
830 if (rsp.result == SD_RES_INVALID_PARMS) {
831 DPRINTF("disable cache since the server doesn't support it\n");
832 s->cache_flags = SD_FLAG_CMD_DIRECT;
833 rsp.result = SD_RES_SUCCESS;
834 }
835 break;
836 case AIOCB_DISCARD_OBJ:
837 switch (rsp.result) {
838 case SD_RES_INVALID_PARMS:
839 error_report("server doesn't support discard command");
840 rsp.result = SD_RES_SUCCESS;
841 s->discard_supported = false;
842 break;
843 default:
844 break;
845 }
846 }
847
848 /* No more data for this aio_req (reload_inode below uses its own file
849 * descriptor handler which doesn't use co_recv).
850 */
851 s->co_recv = NULL;
852
853 QLIST_REMOVE(aio_req, aio_siblings);
854 switch (rsp.result) {
855 case SD_RES_SUCCESS:
856 break;
857 case SD_RES_READONLY:
858 if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) {
859 ret = reload_inode(s, 0, "");
860 if (ret < 0) {
861 goto err;
862 }
863 }
864 if (is_data_obj(aio_req->oid)) {
865 aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
866 data_oid_to_idx(aio_req->oid));
867 } else {
868 aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
869 }
870 resend_aioreq(s, aio_req);
871 return;
872 default:
873 acb->ret = -EIO;
874 error_report("%s", sd_strerror(rsp.result));
875 break;
876 }
877
878 g_free(aio_req);
879
880 if (!--acb->nr_pending) {
881 /*
882 * We've finished all requests which belong to the AIOCB, so
883 * we can switch back to sd_co_readv/writev now.
884 */
885 aio_co_wake(acb->coroutine);
886 }
887
888 return;
889
890 err:
891 reconnect_to_sdog(opaque);
892 }
893
894 static void co_read_response(void *opaque)
895 {
896 BDRVSheepdogState *s = opaque;
897
898 if (!s->co_recv) {
899 s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
900 }
901
902 aio_co_wake(s->co_recv);
903 }
904
905 static void co_write_request(void *opaque)
906 {
907 BDRVSheepdogState *s = opaque;
908
909 aio_co_wake(s->co_send);
910 }
911
912 /*
913 * Return a socket descriptor to read/write objects.
914 *
915 * We cannot use this descriptor for other operations because
916 * the block driver may be waiting for a response from the server.
917 */
918 static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
919 {
920 int fd;
921
922 fd = connect_to_sdog(s, errp);
923 if (fd < 0) {
924 return fd;
925 }
926
927 aio_set_fd_handler(s->aio_context, fd, false,
928 co_read_response, NULL, NULL, s);
929 return fd;
930 }
931
932 /*
933 * Parse numeric snapshot ID in @str
934 * If @str can't be parsed as number, return false.
935 * Else, if the number is zero or too large, set *@snapid to zero and
936 * return true.
937 * Else, set *@snapid to the number and return true.
938 */
939 static bool sd_parse_snapid(const char *str, uint32_t *snapid)
940 {
941 unsigned long ul;
942 int ret;
943
944 ret = qemu_strtoul(str, NULL, 10, &ul);
945 if (ret == -ERANGE) {
946 ul = ret = 0;
947 }
948 if (ret) {
949 return false;
950 }
951 if (ul > UINT32_MAX) {
952 ul = 0;
953 }
954
955 *snapid = ul;
956 return true;
957 }
958
959 static bool sd_parse_snapid_or_tag(const char *str,
960 uint32_t *snapid, char tag[])
961 {
962 if (!sd_parse_snapid(str, snapid)) {
963 *snapid = 0;
964 if (g_strlcpy(tag, str, SD_MAX_VDI_TAG_LEN) >= SD_MAX_VDI_TAG_LEN) {
965 return false;
966 }
967 } else if (!*snapid) {
968 return false;
969 } else {
970 tag[0] = 0;
971 }
972 return true;
973 }
974
975 typedef struct {
976 const char *path; /* non-null iff transport is unix */
977 const char *host; /* valid when transport is tcp */
978 int port; /* valid when transport is tcp */
979 char vdi[SD_MAX_VDI_LEN];
980 char tag[SD_MAX_VDI_TAG_LEN];
981 uint32_t snap_id;
982 /* Remainder is only for sd_config_done() */
983 URI *uri;
984 QueryParams *qp;
985 } SheepdogConfig;
986
987 static void sd_config_done(SheepdogConfig *cfg)
988 {
989 if (cfg->qp) {
990 query_params_free(cfg->qp);
991 }
992 uri_free(cfg->uri);
993 }
994
995 static void sd_parse_uri(SheepdogConfig *cfg, const char *filename,
996 Error **errp)
997 {
998 Error *err = NULL;
999 QueryParams *qp = NULL;
1000 bool is_unix;
1001 URI *uri;
1002
1003 memset(cfg, 0, sizeof(*cfg));
1004
1005 cfg->uri = uri = uri_parse(filename);
1006 if (!uri) {
1007 error_setg(&err, "invalid URI");
1008 goto out;
1009 }
1010
1011 /* transport */
1012 if (!strcmp(uri->scheme, "sheepdog")) {
1013 is_unix = false;
1014 } else if (!strcmp(uri->scheme, "sheepdog+tcp")) {
1015 is_unix = false;
1016 } else if (!strcmp(uri->scheme, "sheepdog+unix")) {
1017 is_unix = true;
1018 } else {
1019 error_setg(&err, "URI scheme must be 'sheepdog', 'sheepdog+tcp',"
1020 " or 'sheepdog+unix'");
1021 goto out;
1022 }
1023
1024 if (uri->path == NULL || !strcmp(uri->path, "/")) {
1025 error_setg(&err, "missing file path in URI");
1026 goto out;
1027 }
1028 if (g_strlcpy(cfg->vdi, uri->path + 1, SD_MAX_VDI_LEN)
1029 >= SD_MAX_VDI_LEN) {
1030 error_setg(&err, "VDI name is too long");
1031 goto out;
1032 }
1033
1034 cfg->qp = qp = query_params_parse(uri->query);
1035
1036 if (is_unix) {
1037 /* sheepdog+unix:///vdiname?socket=path */
1038 if (uri->server || uri->port) {
1039 error_setg(&err, "URI scheme %s doesn't accept a server address",
1040 uri->scheme);
1041 goto out;
1042 }
1043 if (!qp->n) {
1044 error_setg(&err,
1045 "URI scheme %s requires query parameter 'socket'",
1046 uri->scheme);
1047 goto out;
1048 }
1049 if (qp->n != 1 || strcmp(qp->p[0].name, "socket")) {
1050 error_setg(&err, "unexpected query parameters");
1051 goto out;
1052 }
1053 cfg->path = qp->p[0].value;
1054 } else {
1055 /* sheepdog[+tcp]://[host:port]/vdiname */
1056 if (qp->n) {
1057 error_setg(&err, "unexpected query parameters");
1058 goto out;
1059 }
1060 cfg->host = uri->server;
1061 cfg->port = uri->port;
1062 }
1063
1064 /* snapshot tag */
1065 if (uri->fragment) {
1066 if (!sd_parse_snapid_or_tag(uri->fragment,
1067 &cfg->snap_id, cfg->tag)) {
1068 error_setg(&err, "'%s' is not a valid snapshot ID",
1069 uri->fragment);
1070 goto out;
1071 }
1072 } else {
1073 cfg->snap_id = CURRENT_VDI_ID; /* search current vdi */
1074 }
1075
1076 out:
1077 if (err) {
1078 error_propagate(errp, err);
1079 sd_config_done(cfg);
1080 }
1081 }
1082
1083 /*
1084 * Parse a filename (old syntax)
1085 *
1086 * filename must be one of the following formats:
1087 * 1. [vdiname]
1088 * 2. [vdiname]:[snapid]
1089 * 3. [vdiname]:[tag]
1090 * 4. [hostname]:[port]:[vdiname]
1091 * 5. [hostname]:[port]:[vdiname]:[snapid]
1092 * 6. [hostname]:[port]:[vdiname]:[tag]
1093 *
1094 * You can boot from the snapshot images by specifying `snapid' or
1095 * `tag'.
1096 *
1097 * You can run VMs outside the Sheepdog cluster by specifying
1098 * `hostname' and `port' (experimental).
1099 */
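/*
 * For example (a sketch of the rewrite done below, with made-up values),
 * the old-style filename
 *
 *   sheepdog:192.0.2.1:7000:myvdi:mytag
 *
 * is turned into the URI
 *
 *   sheepdog://192.0.2.1:7000/myvdi#mytag
 *
 * and then handed to sd_parse_uri().
 */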
1100 static void parse_vdiname(SheepdogConfig *cfg, const char *filename,
1101 Error **errp)
1102 {
1103 Error *err = NULL;
1104 char *p, *q, *uri;
1105 const char *host_spec, *vdi_spec;
1106 int nr_sep;
1107
1108 strstart(filename, "sheepdog:", &filename);
1109 p = q = g_strdup(filename);
1110
1111 /* count the number of separators */
1112 nr_sep = 0;
1113 while (*p) {
1114 if (*p == ':') {
1115 nr_sep++;
1116 }
1117 p++;
1118 }
1119 p = q;
1120
1121 /* use the first two tokens as host_spec. */
1122 if (nr_sep >= 2) {
1123 host_spec = p;
1124 p = strchr(p, ':');
1125 p++;
1126 p = strchr(p, ':');
1127 *p++ = '\0';
1128 } else {
1129 host_spec = "";
1130 }
1131
1132 vdi_spec = p;
1133
1134 p = strchr(vdi_spec, ':');
1135 if (p) {
1136 *p++ = '#';
1137 }
1138
1139 uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);
1140
1141 /*
1142 * FIXME We need to escape URI meta-characters, e.g. "x?y=z"
1143 * produces "sheepdog://x?y=z". Because of that ...
1144 */
1145 sd_parse_uri(cfg, uri, &err);
1146 if (err) {
1147 /*
1148 * ... this can fail, but the error message is misleading.
1149 * Replace it by the traditional useless one until the
1150 * escaping is fixed.
1151 */
1152 error_free(err);
1153 error_setg(errp, "Can't parse filename");
1154 }
1155
1156 g_free(q);
1157 g_free(uri);
1158 }
1159
1160 static void sd_parse_filename(const char *filename, QDict *options,
1161 Error **errp)
1162 {
1163 Error *err = NULL;
1164 SheepdogConfig cfg;
1165 char buf[32];
1166
1167 if (strstr(filename, "://")) {
1168 sd_parse_uri(&cfg, filename, &err);
1169 } else {
1170 parse_vdiname(&cfg, filename, &err);
1171 }
1172 if (err) {
1173 error_propagate(errp, err);
1174 return;
1175 }
1176
1177 if (cfg.host) {
1178 qdict_set_default_str(options, "host", cfg.host);
1179 }
1180 if (cfg.port) {
1181 snprintf(buf, sizeof(buf), "%d", cfg.port);
1182 qdict_set_default_str(options, "port", buf);
1183 }
1184 if (cfg.path) {
1185 qdict_set_default_str(options, "path", cfg.path);
1186 }
1187 qdict_set_default_str(options, "vdi", cfg.vdi);
1188 qdict_set_default_str(options, "tag", cfg.tag);
1189 if (cfg.snap_id) {
1190 snprintf(buf, sizeof(buf), "%d", cfg.snap_id);
1191 qdict_set_default_str(options, "snap-id", buf);
1192 }
1193
1194 sd_config_done(&cfg);
1195 }
1196
1197 static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
1198 uint32_t snapid, const char *tag, uint32_t *vid,
1199 bool lock, Error **errp)
1200 {
1201 int ret, fd;
1202 SheepdogVdiReq hdr;
1203 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1204 unsigned int wlen, rlen = 0;
1205 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
1206
1207 fd = connect_to_sdog(s, errp);
1208 if (fd < 0) {
1209 return fd;
1210 }
1211
1212 /* This pair of strncpy calls ensures that the buffer is zero-filled,
1213 * which is desirable since we'll soon be sending those bytes, and
1214 * we don't want the send_req to read uninitialized data.
1215 */
1216 strncpy(buf, filename, SD_MAX_VDI_LEN);
1217 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
1218
1219 memset(&hdr, 0, sizeof(hdr));
1220 if (lock) {
1221 hdr.opcode = SD_OP_LOCK_VDI;
1222 hdr.type = LOCK_TYPE_NORMAL;
1223 } else {
1224 hdr.opcode = SD_OP_GET_VDI_INFO;
1225 }
1226 wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
1227 hdr.proto_ver = SD_PROTO_VER;
1228 hdr.data_length = wlen;
1229 hdr.snapid = snapid;
1230 hdr.flags = SD_FLAG_CMD_WRITE;
1231
1232 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1233 if (ret) {
1234 error_setg_errno(errp, -ret, "cannot get vdi info");
1235 goto out;
1236 }
1237
1238 if (rsp->result != SD_RES_SUCCESS) {
1239 error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s",
1240 sd_strerror(rsp->result), filename, snapid, tag);
1241 if (rsp->result == SD_RES_NO_VDI) {
1242 ret = -ENOENT;
1243 } else if (rsp->result == SD_RES_VDI_LOCKED) {
1244 ret = -EBUSY;
1245 } else {
1246 ret = -EIO;
1247 }
1248 goto out;
1249 }
1250 *vid = rsp->vdi_id;
1251
1252 ret = 0;
1253 out:
1254 closesocket(fd);
1255 return ret;
1256 }
1257
1258 static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
1259 struct iovec *iov, int niov,
1260 enum AIOCBState aiocb_type)
1261 {
1262 int nr_copies = s->inode.nr_copies;
1263 SheepdogObjReq hdr;
1264 unsigned int wlen = 0;
1265 int ret;
1266 uint64_t oid = aio_req->oid;
1267 unsigned int datalen = aio_req->data_len;
1268 uint64_t offset = aio_req->offset;
1269 uint8_t flags = aio_req->flags;
1270 uint64_t old_oid = aio_req->base_oid;
1271 bool create = aio_req->create;
1272
1273 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
1274
1275 if (!nr_copies) {
1276 error_report("bug");
1277 }
1278
1279 memset(&hdr, 0, sizeof(hdr));
1280
1281 switch (aiocb_type) {
1282 case AIOCB_FLUSH_CACHE:
1283 hdr.opcode = SD_OP_FLUSH_VDI;
1284 break;
1285 case AIOCB_READ_UDATA:
1286 hdr.opcode = SD_OP_READ_OBJ;
1287 hdr.flags = flags;
1288 break;
1289 case AIOCB_WRITE_UDATA:
1290 if (create) {
1291 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1292 } else {
1293 hdr.opcode = SD_OP_WRITE_OBJ;
1294 }
1295 wlen = datalen;
1296 hdr.flags = SD_FLAG_CMD_WRITE | flags;
1297 break;
1298 case AIOCB_DISCARD_OBJ:
1299 hdr.opcode = SD_OP_WRITE_OBJ;
1300 hdr.flags = SD_FLAG_CMD_WRITE | flags;
1301 s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0;
1302 offset = offsetof(SheepdogInode,
1303 data_vdi_id[data_oid_to_idx(oid)]);
1304 oid = vid_to_vdi_oid(s->inode.vdi_id);
1305 wlen = datalen = sizeof(uint32_t);
1306 break;
1307 }
1308
1309 if (s->cache_flags) {
1310 hdr.flags |= s->cache_flags;
1311 }
1312
1313 hdr.oid = oid;
1314 hdr.cow_oid = old_oid;
1315 hdr.copies = s->inode.nr_copies;
1316
1317 hdr.data_length = datalen;
1318 hdr.offset = offset;
1319
1320 hdr.id = aio_req->id;
1321
1322 qemu_co_mutex_lock(&s->lock);
1323 s->co_send = qemu_coroutine_self();
1324 aio_set_fd_handler(s->aio_context, s->fd, false,
1325 co_read_response, co_write_request, NULL, s);
1326 socket_set_cork(s->fd, 1);
1327
1328 /* send a header */
1329 ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
1330 if (ret != sizeof(hdr)) {
1331 error_report("failed to send a req, %s", strerror(errno));
1332 goto out;
1333 }
1334
1335 if (wlen) {
1336 ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen);
1337 if (ret != wlen) {
1338 error_report("failed to send a data, %s", strerror(errno));
1339 }
1340 }
1341 out:
1342 socket_set_cork(s->fd, 0);
1343 aio_set_fd_handler(s->aio_context, s->fd, false,
1344 co_read_response, NULL, NULL, s);
1345 s->co_send = NULL;
1346 qemu_co_mutex_unlock(&s->lock);
1347 }
1348
1349 static int read_write_object(int fd, BlockDriverState *bs, char *buf,
1350 uint64_t oid, uint8_t copies,
1351 unsigned int datalen, uint64_t offset,
1352 bool write, bool create, uint32_t cache_flags)
1353 {
1354 SheepdogObjReq hdr;
1355 SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1356 unsigned int wlen, rlen;
1357 int ret;
1358
1359 memset(&hdr, 0, sizeof(hdr));
1360
1361 if (write) {
1362 wlen = datalen;
1363 rlen = 0;
1364 hdr.flags = SD_FLAG_CMD_WRITE;
1365 if (create) {
1366 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1367 } else {
1368 hdr.opcode = SD_OP_WRITE_OBJ;
1369 }
1370 } else {
1371 wlen = 0;
1372 rlen = datalen;
1373 hdr.opcode = SD_OP_READ_OBJ;
1374 }
1375
1376 hdr.flags |= cache_flags;
1377
1378 hdr.oid = oid;
1379 hdr.data_length = datalen;
1380 hdr.offset = offset;
1381 hdr.copies = copies;
1382
1383 ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1384 if (ret) {
1385 error_report("failed to send a request to the sheep");
1386 return ret;
1387 }
1388
1389 switch (rsp->result) {
1390 case SD_RES_SUCCESS:
1391 return 0;
1392 default:
1393 error_report("%s", sd_strerror(rsp->result));
1394 return -EIO;
1395 }
1396 }
1397
1398 static int read_object(int fd, BlockDriverState *bs, char *buf,
1399 uint64_t oid, uint8_t copies,
1400 unsigned int datalen, uint64_t offset,
1401 uint32_t cache_flags)
1402 {
1403 return read_write_object(fd, bs, buf, oid, copies,
1404 datalen, offset, false,
1405 false, cache_flags);
1406 }
1407
1408 static int write_object(int fd, BlockDriverState *bs, char *buf,
1409 uint64_t oid, uint8_t copies,
1410 unsigned int datalen, uint64_t offset, bool create,
1411 uint32_t cache_flags)
1412 {
1413 return read_write_object(fd, bs, buf, oid, copies,
1414 datalen, offset, true,
1415 create, cache_flags);
1416 }
1417
1418 /* update inode with the latest state */
1419 static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
1420 {
1421 Error *local_err = NULL;
1422 SheepdogInode *inode;
1423 int ret = 0, fd;
1424 uint32_t vid = 0;
1425
1426 fd = connect_to_sdog(s, &local_err);
1427 if (fd < 0) {
1428 error_report_err(local_err);
1429 return -EIO;
1430 }
1431
1432 inode = g_malloc(SD_INODE_HEADER_SIZE);
1433
1434 ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err);
1435 if (ret) {
1436 error_report_err(local_err);
1437 goto out;
1438 }
1439
1440 ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid),
1441 s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
1442 s->cache_flags);
1443 if (ret < 0) {
1444 goto out;
1445 }
1446
1447 if (inode->vdi_id != s->inode.vdi_id) {
1448 memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE);
1449 }
1450
1451 out:
1452 g_free(inode);
1453 closesocket(fd);
1454
1455 return ret;
1456 }
1457
1458 static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
1459 {
1460 SheepdogAIOCB *acb = aio_req->aiocb;
1461
1462 aio_req->create = false;
1463
1464 /* check whether this request becomes a CoW one */
1465 if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) {
1466 int idx = data_oid_to_idx(aio_req->oid);
1467
1468 if (is_data_obj_writable(&s->inode, idx)) {
1469 goto out;
1470 }
1471
1472 if (s->inode.data_vdi_id[idx]) {
1473 aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
1474 aio_req->flags |= SD_FLAG_CMD_COW;
1475 }
1476 aio_req->create = true;
1477 }
1478 out:
1479 if (is_data_obj(aio_req->oid)) {
1480 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1481 acb->aiocb_type);
1482 } else {
1483 struct iovec iov;
1484 iov.iov_base = &s->inode;
1485 iov.iov_len = sizeof(s->inode);
1486 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
1487 }
1488 }
1489
1490 static void sd_detach_aio_context(BlockDriverState *bs)
1491 {
1492 BDRVSheepdogState *s = bs->opaque;
1493
1494 aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
1495 NULL, NULL, NULL);
1496 }
1497
1498 static void sd_attach_aio_context(BlockDriverState *bs,
1499 AioContext *new_context)
1500 {
1501 BDRVSheepdogState *s = bs->opaque;
1502
1503 s->aio_context = new_context;
1504 aio_set_fd_handler(new_context, s->fd, false,
1505 co_read_response, NULL, NULL, s);
1506 }
1507
1508 static QemuOptsList runtime_opts = {
1509 .name = "sheepdog",
1510 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
1511 .desc = {
1512 {
1513 .name = "host",
1514 .type = QEMU_OPT_STRING,
1515 },
1516 {
1517 .name = "port",
1518 .type = QEMU_OPT_STRING,
1519 },
1520 {
1521 .name = "path",
1522 .type = QEMU_OPT_STRING,
1523 },
1524 {
1525 .name = "vdi",
1526 .type = QEMU_OPT_STRING,
1527 },
1528 {
1529 .name = "snap-id",
1530 .type = QEMU_OPT_NUMBER,
1531 },
1532 {
1533 .name = "tag",
1534 .type = QEMU_OPT_STRING,
1535 },
1536 { /* end of list */ }
1537 },
1538 };
1539
1540 static int sd_open(BlockDriverState *bs, QDict *options, int flags,
1541 Error **errp)
1542 {
1543 int ret, fd;
1544 uint32_t vid = 0;
1545 BDRVSheepdogState *s = bs->opaque;
1546 const char *host, *port, *path, *vdi, *snap_id_str, *tag;
1547 uint64_t snap_id;
1548 char *buf = NULL;
1549 QemuOpts *opts;
1550 Error *local_err = NULL;
1551
1552 s->bs = bs;
1553 s->aio_context = bdrv_get_aio_context(bs);
1554
1555 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
1556 qemu_opts_absorb_qdict(opts, options, &local_err);
1557 if (local_err) {
1558 error_propagate(errp, local_err);
1559 ret = -EINVAL;
1560 goto err_no_fd;
1561 }
1562
1563 host = qemu_opt_get(opts, "host");
1564 port = qemu_opt_get(opts, "port");
1565 path = qemu_opt_get(opts, "path");
1566 vdi = qemu_opt_get(opts, "vdi");
1567 snap_id_str = qemu_opt_get(opts, "snap-id");
1568 snap_id = qemu_opt_get_number(opts, "snap-id", CURRENT_VDI_ID);
1569 tag = qemu_opt_get(opts, "tag");
1570
1571 if ((host || port) && path) {
1572 error_setg(errp, "can't use 'path' together with 'host' or 'port'");
1573 ret = -EINVAL;
1574 goto err_no_fd;
1575 }
1576
1577 if (!vdi) {
1578 error_setg(errp, "parameter 'vdi' is missing");
1579 ret = -EINVAL;
1580 goto err_no_fd;
1581 }
1582 if (strlen(vdi) >= SD_MAX_VDI_LEN) {
1583 error_setg(errp, "value of parameter 'vdi' is too long");
1584 ret = -EINVAL;
1585 goto err_no_fd;
1586 }
1587
1588 if (snap_id > UINT32_MAX) {
1589 snap_id = 0;
1590 }
1591 if (snap_id_str && !snap_id) {
1592 error_setg(errp, "'snap-id=%s' is not a valid snapshot ID",
1593 snap_id_str);
1594 ret = -EINVAL;
1595 goto err_no_fd;
1596 }
1597
1598 if (!tag) {
1599 tag = "";
1600 }
1601 if (tag && strlen(tag) >= SD_MAX_VDI_TAG_LEN) {
1602 error_setg(errp, "value of parameter 'tag' is too long");
1603 ret = -EINVAL;
1604 goto err_no_fd;
1605 }
1606
1607 s->addr = sd_socket_address(path, host, port);
1608
1609 QLIST_INIT(&s->inflight_aio_head);
1610 QLIST_INIT(&s->failed_aio_head);
1611 QLIST_INIT(&s->inflight_aiocb_head);
1612
1613 s->fd = get_sheep_fd(s, errp);
1614 if (s->fd < 0) {
1615 ret = s->fd;
1616 goto err_no_fd;
1617 }
1618
1619 ret = find_vdi_name(s, vdi, (uint32_t)snap_id, tag, &vid, true, errp);
1620 if (ret) {
1621 goto err;
1622 }
1623
1624 /*
1625 * QEMU block layer emulates writethrough cache as 'writeback + flush', so
1626 * we always set SD_FLAG_CMD_CACHE (writeback cache) by default.
1627 */
1628 s->cache_flags = SD_FLAG_CMD_CACHE;
1629 if (flags & BDRV_O_NOCACHE) {
1630 s->cache_flags = SD_FLAG_CMD_DIRECT;
1631 }
1632 s->discard_supported = true;
1633
1634 if (snap_id || tag[0]) {
1635 DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid);
1636 s->is_snapshot = true;
1637 }
1638
1639 fd = connect_to_sdog(s, errp);
1640 if (fd < 0) {
1641 ret = fd;
1642 goto err;
1643 }
1644
1645 buf = g_malloc(SD_INODE_SIZE);
1646 ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
1647 0, SD_INODE_SIZE, 0, s->cache_flags);
1648
1649 closesocket(fd);
1650
1651 if (ret) {
1652 error_setg(errp, "Can't read snapshot inode");
1653 goto err;
1654 }
1655
1656 memcpy(&s->inode, buf, sizeof(s->inode));
1657
1658 bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
1659 pstrcpy(s->name, sizeof(s->name), vdi);
1660 qemu_co_mutex_init(&s->lock);
1661 qemu_co_queue_init(&s->overlapping_queue);
1662 qemu_opts_del(opts);
1663 g_free(buf);
1664 return 0;
1665
1666 err:
1667 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
1668 false, NULL, NULL, NULL, NULL);
1669 closesocket(s->fd);
1670 err_no_fd:
1671 qemu_opts_del(opts);
1672 g_free(buf);
1673 return ret;
1674 }
1675
1676 static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue,
1677 Error **errp)
1678 {
1679 BDRVSheepdogState *s = state->bs->opaque;
1680 BDRVSheepdogReopenState *re_s;
1681 int ret = 0;
1682
1683 re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1);
1684
1685 re_s->cache_flags = SD_FLAG_CMD_CACHE;
1686 if (state->flags & BDRV_O_NOCACHE) {
1687 re_s->cache_flags = SD_FLAG_CMD_DIRECT;
1688 }
1689
1690 re_s->fd = get_sheep_fd(s, errp);
1691 if (re_s->fd < 0) {
1692 ret = re_s->fd;
1693 return ret;
1694 }
1695
1696 return ret;
1697 }
1698
1699 static void sd_reopen_commit(BDRVReopenState *state)
1700 {
1701 BDRVSheepdogReopenState *re_s = state->opaque;
1702 BDRVSheepdogState *s = state->bs->opaque;
1703
1704 if (s->fd) {
1705 aio_set_fd_handler(s->aio_context, s->fd, false,
1706 NULL, NULL, NULL, NULL);
1707 closesocket(s->fd);
1708 }
1709
1710 s->fd = re_s->fd;
1711 s->cache_flags = re_s->cache_flags;
1712
1713 g_free(state->opaque);
1714 state->opaque = NULL;
1715
1716 return;
1717 }
1718
1719 static void sd_reopen_abort(BDRVReopenState *state)
1720 {
1721 BDRVSheepdogReopenState *re_s = state->opaque;
1722 BDRVSheepdogState *s = state->bs->opaque;
1723
1724 if (re_s == NULL) {
1725 return;
1726 }
1727
1728 if (re_s->fd) {
1729 aio_set_fd_handler(s->aio_context, re_s->fd, false,
1730 NULL, NULL, NULL, NULL);
1731 closesocket(re_s->fd);
1732 }
1733
1734 g_free(state->opaque);
1735 state->opaque = NULL;
1736
1737 return;
1738 }
1739
1740 static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
1741 Error **errp)
1742 {
1743 SheepdogVdiReq hdr;
1744 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1745 int fd, ret;
1746 unsigned int wlen, rlen = 0;
1747 char buf[SD_MAX_VDI_LEN];
1748
1749 fd = connect_to_sdog(s, errp);
1750 if (fd < 0) {
1751 return fd;
1752 }
1753
1754 /* FIXME: would it be better to fail (e.g., return -EIO) when filename
1755 * does not fit in buf? For now, just truncate and avoid buffer overrun.
1756 */
1757 memset(buf, 0, sizeof(buf));
1758 pstrcpy(buf, sizeof(buf), s->name);
1759
1760 memset(&hdr, 0, sizeof(hdr));
1761 hdr.opcode = SD_OP_NEW_VDI;
1762 hdr.base_vdi_id = s->inode.vdi_id;
1763
1764 wlen = SD_MAX_VDI_LEN;
1765
1766 hdr.flags = SD_FLAG_CMD_WRITE;
1767 hdr.snapid = snapshot;
1768
1769 hdr.data_length = wlen;
1770 hdr.vdi_size = s->inode.vdi_size;
1771 hdr.copy_policy = s->inode.copy_policy;
1772 hdr.copies = s->inode.nr_copies;
1773 hdr.block_size_shift = s->inode.block_size_shift;
1774
1775 ret = do_req(fd, NULL, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1776
1777 closesocket(fd);
1778
1779 if (ret) {
1780 error_setg_errno(errp, -ret, "create failed");
1781 return ret;
1782 }
1783
1784 if (rsp->result != SD_RES_SUCCESS) {
1785 error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name);
1786 return -EIO;
1787 }
1788
1789 if (vdi_id) {
1790 *vdi_id = rsp->vdi_id;
1791 }
1792
1793 return 0;
1794 }
1795
1796 static int sd_prealloc(const char *filename, Error **errp)
1797 {
1798 BlockBackend *blk = NULL;
1799 BDRVSheepdogState *base = NULL;
1800 unsigned long buf_size;
1801 uint32_t idx, max_idx;
1802 uint32_t object_size;
1803 int64_t vdi_size;
1804 void *buf = NULL;
1805 int ret;
1806
1807 blk = blk_new_open(filename, NULL, NULL,
1808 BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1809 if (blk == NULL) {
1810 ret = -EIO;
1811 goto out_with_err_set;
1812 }
1813
1814 blk_set_allow_write_beyond_eof(blk, true);
1815
1816 vdi_size = blk_getlength(blk);
1817 if (vdi_size < 0) {
1818 ret = vdi_size;
1819 goto out;
1820 }
1821
1822 base = blk_bs(blk)->opaque;
1823 object_size = (UINT32_C(1) << base->inode.block_size_shift);
1824 buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
1825 buf = g_malloc0(buf_size);
1826
1827 max_idx = DIV_ROUND_UP(vdi_size, buf_size);
1828
1829 for (idx = 0; idx < max_idx; idx++) {
1830 /*
1831 * The created image can be a cloned image, so we need to read
1832 * data from the source image.
1833 */
1834 ret = blk_pread(blk, idx * buf_size, buf, buf_size);
1835 if (ret < 0) {
1836 goto out;
1837 }
1838 ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0);
1839 if (ret < 0) {
1840 goto out;
1841 }
1842 }
1843
1844 ret = 0;
1845 out:
1846 if (ret < 0) {
1847 error_setg_errno(errp, -ret, "Can't pre-allocate");
1848 }
1849 out_with_err_set:
1850 if (blk) {
1851 blk_unref(blk);
1852 }
1853 g_free(buf);
1854
1855 return ret;
1856 }
1857
1858 /*
1859 * Sheepdog supports two kinds of redundancy: full replication and erasure
1860 * coding.
1861 *
1862 * # create a fully replicated vdi with x copies
1863 * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
1864 *
1865 * # create an erasure coded vdi with x data strips and y parity strips
1866 * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
1867 */
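/*
 * Worked example (illustrative, matching parse_redundancy() below):
 *
 *   -o redundancy=3    ->  copy_policy = 0,    nr_copies = 3
 *   -o redundancy=4:2  ->  copy_policy = 0x22, nr_copies = 6
 *                          (upper nibble: 4 data strips / 2,
 *                           lower nibble: 2 parity strips)
 */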
1868 static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
1869 {
1870 struct SheepdogInode *inode = &s->inode;
1871 const char *n1, *n2;
1872 long copy, parity;
1873 char p[10];
1874
1875 pstrcpy(p, sizeof(p), opt);
1876 n1 = strtok(p, ":");
1877 n2 = strtok(NULL, ":");
1878
1879 if (!n1) {
1880 return -EINVAL;
1881 }
1882
1883 copy = strtol(n1, NULL, 10);
1884 /* FIXME fix error checking by switching to qemu_strtol() */
1885 if (copy > SD_MAX_COPIES || copy < 1) {
1886 return -EINVAL;
1887 }
1888 if (!n2) {
1889 inode->copy_policy = 0;
1890 inode->nr_copies = copy;
1891 return 0;
1892 }
1893
1894 if (copy != 2 && copy != 4 && copy != 8 && copy != 16) {
1895 return -EINVAL;
1896 }
1897
1898 parity = strtol(n2, NULL, 10);
1899 /* FIXME fix error checking by switching to qemu_strtol() */
1900 if (parity >= SD_EC_MAX_STRIP || parity < 1) {
1901 return -EINVAL;
1902 }
1903
1904 /*
1905 * The lower 4 bits hold the parity strip count and the upper 4 bits the
1906 * data strip count divided by two, because 4 bits cannot represent 16.
1907 */
1908 inode->copy_policy = ((copy / 2) << 4) + parity;
1909 inode->nr_copies = copy + parity;
1910
1911 return 0;
1912 }
1913
1914 static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
1915 {
1916 struct SheepdogInode *inode = &s->inode;
1917 uint64_t object_size;
1918 int obj_order;
1919
1920 object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0);
1921 if (object_size) {
1922 if ((object_size - 1) & object_size) { /* not a power of 2? */
1923 return -EINVAL;
1924 }
1925 obj_order = ctz32(object_size);
1926 if (obj_order < 20 || obj_order > 31) {
1927 return -EINVAL;
1928 }
1929 inode->block_size_shift = (uint8_t)obj_order;
1930 }
1931
1932 return 0;
1933 }
1934
1935 static int sd_create(const char *filename, QemuOpts *opts,
1936 Error **errp)
1937 {
1938 Error *err = NULL;
1939 int ret = 0;
1940 uint32_t vid = 0;
1941 char *backing_file = NULL;
1942 char *buf = NULL;
1943 BDRVSheepdogState *s;
1944 SheepdogConfig cfg;
1945 uint64_t max_vdi_size;
1946 bool prealloc = false;
1947
1948 s = g_new0(BDRVSheepdogState, 1);
1949
1950 if (strstr(filename, "://")) {
1951 sd_parse_uri(&cfg, filename, &err);
1952 } else {
1953 parse_vdiname(&cfg, filename, &err);
1954 }
1955 if (err) {
1956 error_propagate(errp, err);
1957 goto out;
1958 }
1959
1960 buf = cfg.port ? g_strdup_printf("%d", cfg.port) : NULL;
1961 s->addr = sd_socket_address(cfg.path, cfg.host, buf);
1962 g_free(buf);
1963 strcpy(s->name, cfg.vdi);
1964 sd_config_done(&cfg);
1965
1966 s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
1967 BDRV_SECTOR_SIZE);
1968 backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
1969 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
1970 if (!buf || !strcmp(buf, "off")) {
1971 prealloc = false;
1972 } else if (!strcmp(buf, "full")) {
1973 prealloc = true;
1974 } else {
1975 error_setg(errp, "Invalid preallocation mode: '%s'", buf);
1976 ret = -EINVAL;
1977 goto out;
1978 }
1979
1980 g_free(buf);
1981 buf = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY);
1982 if (buf) {
1983 ret = parse_redundancy(s, buf);
1984 if (ret < 0) {
1985 error_setg(errp, "Invalid redundancy mode: '%s'", buf);
1986 goto out;
1987 }
1988 }
1989 ret = parse_block_size_shift(s, opts);
1990 if (ret < 0) {
1991 error_setg(errp, "Invalid object_size."
1992 " obect_size needs to be power of 2"
1993 " and be limited from 2^20 to 2^31");
1994 goto out;
1995 }
1996
1997 if (backing_file) {
1998 BlockBackend *blk;
1999 BDRVSheepdogState *base;
2000 BlockDriver *drv;
2001
2002 /* Currently, only a Sheepdog backing image is supported. */
2003 drv = bdrv_find_protocol(backing_file, true, NULL);
2004 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
2005 error_setg(errp, "backing_file must be a sheepdog image");
2006 ret = -EINVAL;
2007 goto out;
2008 }
2009
2010 blk = blk_new_open(backing_file, NULL, NULL,
2011 BDRV_O_PROTOCOL, errp);
2012 if (blk == NULL) {
2013 ret = -EIO;
2014 goto out;
2015 }
2016
2017 base = blk_bs(blk)->opaque;
2018
2019 if (!is_snapshot(&base->inode)) {
2020 error_setg(errp, "cannot clone from a non snapshot vdi");
2021 blk_unref(blk);
2022 ret = -EINVAL;
2023 goto out;
2024 }
2025 s->inode.vdi_id = base->inode.vdi_id;
2026 blk_unref(blk);
2027 }
2028
2029 s->aio_context = qemu_get_aio_context();
2030
2031 /* if block_size_shift is not specified, get cluster default value */
2032 if (s->inode.block_size_shift == 0) {
2033 SheepdogVdiReq hdr;
2034 SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
2035 int fd;
2036 unsigned int wlen = 0, rlen = 0;
2037
2038 fd = connect_to_sdog(s, errp);
2039 if (fd < 0) {
2040 ret = fd;
2041 goto out;
2042 }
2043
2044 memset(&hdr, 0, sizeof(hdr));
2045 hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
2046 hdr.proto_ver = SD_PROTO_VER;
2047
2048 ret = do_req(fd, NULL, (SheepdogReq *)&hdr,
2049 NULL, &wlen, &rlen);
2050 closesocket(fd);
2051 if (ret) {
2052 error_setg_errno(errp, -ret, "failed to get cluster default");
2053 goto out;
2054 }
2055 if (rsp->result == SD_RES_SUCCESS) {
2056 s->inode.block_size_shift = rsp->block_size_shift;
2057 } else {
2058 s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT;
2059 }
2060 }
2061
2062 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
2063
2064 if (s->inode.vdi_size > max_vdi_size) {
2065 error_setg(errp, "An image is too large."
2066 " The maximum image size is %"PRIu64 "GB",
2067 max_vdi_size / 1024 / 1024 / 1024);
2068 ret = -EINVAL;
2069 goto out;
2070 }
2071
2072 ret = do_sd_create(s, &vid, 0, errp);
2073 if (ret) {
2074 goto out;
2075 }
2076
2077 if (prealloc) {
2078 ret = sd_prealloc(filename, errp);
2079 }
2080 out:
2081 g_free(backing_file);
2082 g_free(buf);
2083 g_free(s);
2084 return ret;
2085 }
2086
2087 static void sd_close(BlockDriverState *bs)
2088 {
2089 Error *local_err = NULL;
2090 BDRVSheepdogState *s = bs->opaque;
2091 SheepdogVdiReq hdr;
2092 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2093 unsigned int wlen, rlen = 0;
2094 int fd, ret;
2095
2096 DPRINTF("%s\n", s->name);
2097
2098 fd = connect_to_sdog(s, &local_err);
2099 if (fd < 0) {
2100 error_report_err(local_err);
2101 return;
2102 }
2103
2104 memset(&hdr, 0, sizeof(hdr));
2105
2106 hdr.opcode = SD_OP_RELEASE_VDI;
2107 hdr.type = LOCK_TYPE_NORMAL;
2108 hdr.base_vdi_id = s->inode.vdi_id;
2109 wlen = strlen(s->name) + 1;
2110 hdr.data_length = wlen;
2111 hdr.flags = SD_FLAG_CMD_WRITE;
2112
2113 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
2114 s->name, &wlen, &rlen);
2115
2116 closesocket(fd);
2117
2118 if (!ret && rsp->result != SD_RES_SUCCESS &&
2119 rsp->result != SD_RES_VDI_NOT_LOCKED) {
2120 error_report("%s, %s", sd_strerror(rsp->result), s->name);
2121 }
2122
2123 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
2124 false, NULL, NULL, NULL, NULL);
2125 closesocket(s->fd);
2126 qapi_free_SocketAddress(s->addr);
2127 }
2128
2129 static int64_t sd_getlength(BlockDriverState *bs)
2130 {
2131 BDRVSheepdogState *s = bs->opaque;
2132
2133 return s->inode.vdi_size;
2134 }
2135
2136 static int sd_truncate(BlockDriverState *bs, int64_t offset)
2137 {
2138 Error *local_err = NULL;
2139 BDRVSheepdogState *s = bs->opaque;
2140 int ret, fd;
2141 unsigned int datalen;
2142 uint64_t max_vdi_size;
2143
2144 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
2145 if (offset < s->inode.vdi_size) {
2146 error_report("shrinking is not supported");
2147 return -EINVAL;
2148 } else if (offset > max_vdi_size) {
2149 error_report("requested image size is too large");
2150 return -EINVAL;
2151 }
2152
2153 fd = connect_to_sdog(s, &local_err);
2154 if (fd < 0) {
2155 error_report_err(local_err);
2156 return fd;
2157 }
2158
2159 /* we don't need to update entire object */
2160 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
2161 s->inode.vdi_size = offset;
2162 ret = write_object(fd, s->bs, (char *)&s->inode,
2163 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
2164 datalen, 0, false, s->cache_flags);
2165 close(fd);
2166
2167 if (ret < 0) {
2168 error_report("failed to update the inode");
2169 }
2170
2171 return ret;
2172 }
2173
2174 /*
2175 * This function is called after writing data objects. If we need to
2176 * update metadata, this sends a write request to the vdi object.
2177 */
2178 static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
2179 {
2180 BDRVSheepdogState *s = acb->s;
2181 struct iovec iov;
2182 AIOReq *aio_req;
2183 uint32_t offset, data_len, mn, mx;
2184
2185 mn = acb->min_dirty_data_idx;
2186 mx = acb->max_dirty_data_idx;
2187 if (mn <= mx) {
2188 /* we need to update the vdi object. */
2189 ++acb->nr_pending;
2190 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
2191 mn * sizeof(s->inode.data_vdi_id[0]);
2192 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
2193
2194 acb->min_dirty_data_idx = UINT32_MAX;
2195 acb->max_dirty_data_idx = 0;
2196
2197 iov.iov_base = &s->inode;
2198 iov.iov_len = sizeof(s->inode);
2199 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
2200 data_len, offset, 0, false, 0, offset);
2201 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
2202 if (--acb->nr_pending) {
2203 qemu_coroutine_yield();
2204 }
2205 }
2206 }
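/*
 * Worked example for sd_write_done (illustrative only): just the dirty
 * slice of the inode's data_vdi_id[] table is written back.  If a
 * request dirtied data objects 3..5, then with 4-byte table entries
 *
 *     offset   = sizeof(inode) - sizeof(data_vdi_id) + 3 * 4
 *     data_len = (5 - 3 + 1) * 4 = 12 bytes
 *
 * so the write to the vdi object covers only those three entries
 * instead of the whole inode object.
 */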
2207
2208 /* Delete current working VDI on the snapshot chain */
2209 static bool sd_delete(BDRVSheepdogState *s)
2210 {
2211 Error *local_err = NULL;
2212 unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
2213 SheepdogVdiReq hdr = {
2214 .opcode = SD_OP_DEL_VDI,
2215 .base_vdi_id = s->inode.vdi_id,
2216 .data_length = wlen,
2217 .flags = SD_FLAG_CMD_WRITE,
2218 };
2219 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2220 int fd, ret;
2221
2222 fd = connect_to_sdog(s, &local_err);
2223 if (fd < 0) {
2224 error_report_err(local_err);
2225 return false;
2226 }
2227
2228 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
2229 s->name, &wlen, &rlen);
2230 closesocket(fd);
2231 if (ret) {
2232 return false;
2233 }
2234 switch (rsp->result) {
2235 case SD_RES_NO_VDI:
2236 error_report("%s was already deleted", s->name);
2237 /* fall through */
2238 case SD_RES_SUCCESS:
2239 break;
2240 default:
2241 error_report("%s, %s", sd_strerror(rsp->result), s->name);
2242 return false;
2243 }
2244
2245 return true;
2246 }
2247
2248 /*
2249 * Create a writable VDI from a snapshot
2250 */
2251 static int sd_create_branch(BDRVSheepdogState *s)
2252 {
2253 Error *local_err = NULL;
2254 int ret, fd;
2255 uint32_t vid;
2256 char *buf;
2257 bool deleted;
2258
2259 DPRINTF("%" PRIx32 " is a snapshot.\n", s->inode.vdi_id);
2260
2261 buf = g_malloc(SD_INODE_SIZE);
2262
2263 /*
2264 * Even if deletion fails, we just create an extra snapshot based on
2265 * the working VDI that was supposed to be deleted, so there is no
2266 * need to bail out here.
2267 */
2268 deleted = sd_delete(s);
2269 ret = do_sd_create(s, &vid, !deleted, &local_err);
2270 if (ret) {
2271 error_report_err(local_err);
2272 goto out;
2273 }
2274
2275 DPRINTF("%" PRIx32 " is created.\n", vid);
2276
2277 fd = connect_to_sdog(s, &local_err);
2278 if (fd < 0) {
2279 error_report_err(local_err);
2280 ret = fd;
2281 goto out;
2282 }
2283
2284 ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
2285 s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags);
2286
2287 closesocket(fd);
2288
2289 if (ret < 0) {
2290 goto out;
2291 }
2292
2293 memcpy(&s->inode, buf, sizeof(s->inode));
2294
2295 s->is_snapshot = false;
2296 ret = 0;
2297 DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
2298
2299 out:
2300 g_free(buf);
2301
2302 return ret;
2303 }
2304
2305 /*
2306 * Send I/O requests to the server.
2307 *
2308 * This function sends requests to the server, links the requests to
2309 * the inflight_list in BDRVSheepdogState, and returns without
2310 * waiting for the responses. The responses are received in the
2311 * `aio_read_response' function, which is called from the main loop as
2312 * an fd handler.
2313 *
2314 * The function returns nothing; the completion status is stored in
2315 * acb->ret (0 on success, a negative errno on error).
2316 */
2317 static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
2318 {
2319 int ret = 0;
2320 unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
2321 unsigned long idx;
2322 uint32_t object_size;
2323 uint64_t oid;
2324 uint64_t offset;
2325 BDRVSheepdogState *s = acb->s;
2326 SheepdogInode *inode = &s->inode;
2327 AIOReq *aio_req;
2328
2329 if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
2330 /*
2331 * If we have opened a snapshot VDI, Sheepdog creates a new
2332 * writable VDI on the first write operation.
2333 */
2334 ret = sd_create_branch(s);
2335 if (ret) {
2336 acb->ret = -EIO;
2337 return;
2338 }
2339 }
2340
2341 object_size = (UINT32_C(1) << inode->block_size_shift);
2342 idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
2343 offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;
2344
2345 /*
2346 * Make sure we don't free the aiocb before we are done with all requests.
2347 * This additional reference is dropped at the end of this function.
2348 */
2349 acb->nr_pending++;
2350
2351 while (done != total) {
2352 uint8_t flags = 0;
2353 uint64_t old_oid = 0;
2354 bool create = false;
2355
2356 oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
2357
2358 len = MIN(total - done, object_size - offset);
2359
2360 switch (acb->aiocb_type) {
2361 case AIOCB_READ_UDATA:
2362 if (!inode->data_vdi_id[idx]) {
2363 qemu_iovec_memset(acb->qiov, done, 0, len);
2364 goto done;
2365 }
2366 break;
2367 case AIOCB_WRITE_UDATA:
2368 if (!inode->data_vdi_id[idx]) {
2369 create = true;
2370 } else if (!is_data_obj_writable(inode, idx)) {
2371 /* Copy-On-Write */
2372 create = true;
2373 old_oid = oid;
2374 flags = SD_FLAG_CMD_COW;
2375 }
2376 break;
2377 case AIOCB_DISCARD_OBJ:
2378 /*
2379 * We discard the object only when the whole object is both
2380 * 1) allocated and 2) trimmed. Otherwise, simply skip it.
2381 */
2382 if (len != object_size || inode->data_vdi_id[idx] == 0) {
2383 goto done;
2384 }
2385 break;
2386 default:
2387 break;
2388 }
2389
2390 if (create) {
2391 DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
2392 inode->vdi_id, oid,
2393 vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
2394 oid = vid_to_data_oid(inode->vdi_id, idx);
2395 DPRINTF("new oid %" PRIx64 "\n", oid);
2396 }
2397
2398 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
2399 old_oid,
2400 acb->aiocb_type == AIOCB_DISCARD_OBJ ?
2401 0 : done);
2402 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
2403 acb->aiocb_type);
2404 done:
2405 offset = 0;
2406 idx++;
2407 done += len;
2408 }
2409 if (--acb->nr_pending) {
2410 qemu_coroutine_yield();
2411 }
2412 }
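/*
 * Sketch of the request splitting above (illustrative example only):
 * a guest request is cut at object boundaries, one AIOReq per object.
 * With 4 MiB objects (block_size_shift = 22), a 6 MiB write starting
 * at byte offset 3 MiB is issued as
 *
 *     idx = 0, offset = 3 MiB, len = 1 MiB   (tail of object 0)
 *     idx = 1, offset = 0,     len = 4 MiB   (all of object 1)
 *     idx = 2, offset = 0,     len = 1 MiB   (head of object 2)
 *
 * Unallocated objects are created (or copied from the snapshot object
 * with SD_FLAG_CMD_COW) on writes, and read back as zeroes on reads.
 */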
2413
2414 static void sd_aio_complete(SheepdogAIOCB *acb)
2415 {
2416 if (acb->aiocb_type == AIOCB_FLUSH_CACHE) {
2417 return;
2418 }
2419
2420 QLIST_REMOVE(acb, aiocb_siblings);
2421 qemu_co_queue_restart_all(&acb->s->overlapping_queue);
2422 }
2423
2424 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
2425 int nb_sectors, QEMUIOVector *qiov)
2426 {
2427 SheepdogAIOCB acb;
2428 int ret;
2429 int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
2430 BDRVSheepdogState *s = bs->opaque;
2431
2432 if (offset > s->inode.vdi_size) {
2433 ret = sd_truncate(bs, offset);
2434 if (ret < 0) {
2435 return ret;
2436 }
2437 }
2438
2439 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
2440 sd_co_rw_vector(&acb);
2441 sd_write_done(&acb);
2442 sd_aio_complete(&acb);
2443
2444 return acb.ret;
2445 }
2446
2447 static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
2448 int nb_sectors, QEMUIOVector *qiov)
2449 {
2450 SheepdogAIOCB acb;
2451 BDRVSheepdogState *s = bs->opaque;
2452
2453 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
2454 sd_co_rw_vector(&acb);
2455 sd_aio_complete(&acb);
2456
2457 return acb.ret;
2458 }
2459
2460 static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
2461 {
2462 BDRVSheepdogState *s = bs->opaque;
2463 SheepdogAIOCB acb;
2464 AIOReq *aio_req;
2465
2466 if (s->cache_flags != SD_FLAG_CMD_CACHE) {
2467 return 0;
2468 }
2469
2470 sd_aio_setup(&acb, s, NULL, 0, 0, AIOCB_FLUSH_CACHE);
2471
2472 acb.nr_pending++;
2473 aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
2474 0, 0, 0, false, 0, 0);
2475 add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
2476
2477 if (--acb.nr_pending) {
2478 qemu_coroutine_yield();
2479 }
2480
2481 sd_aio_complete(&acb);
2482 return acb.ret;
2483 }
2484
2485 static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
2486 {
2487 Error *local_err = NULL;
2488 BDRVSheepdogState *s = bs->opaque;
2489 int ret, fd;
2490 uint32_t new_vid;
2491 SheepdogInode *inode;
2492 unsigned int datalen;
2493
2494 DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
2495 "is_snapshot %d\n", sn_info->name, sn_info->id_str,
2496 s->name, sn_info->vm_state_size, s->is_snapshot);
2497
2498 if (s->is_snapshot) {
2499 error_report("You can't create a snapshot of a snapshot VDI, "
2500 "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);
2501
2502 return -EINVAL;
2503 }
2504
2505 DPRINTF("%s %s\n", sn_info->name, sn_info->id_str);
2506
2507 s->inode.vm_state_size = sn_info->vm_state_size;
2508 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
2509 /* It appears that inode.tag does not require a NUL terminator,
2510 * which means this use of strncpy is ok.
2511 */
2512 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
2513 /* we don't need to update entire object */
2514 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
2515 inode = g_malloc(datalen);
2516
2517 /* refresh inode. */
2518 fd = connect_to_sdog(s, &local_err);
2519 if (fd < 0) {
2520 error_report_err(local_err);
2521 ret = fd;
2522 goto cleanup;
2523 }
2524
2525 ret = write_object(fd, s->bs, (char *)&s->inode,
2526 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
2527 datalen, 0, false, s->cache_flags);
2528 if (ret < 0) {
2529 error_report("failed to write snapshot's inode.");
2530 goto cleanup;
2531 }
2532
2533 ret = do_sd_create(s, &new_vid, 1, &local_err);
2534 if (ret < 0) {
2535 error_reportf_err(local_err,
2536 "failed to create inode for snapshot: ");
2537 goto cleanup;
2538 }
2539
2540 ret = read_object(fd, s->bs, (char *)inode,
2541 vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0,
2542 s->cache_flags);
2543
2544 if (ret < 0) {
2545 error_report("failed to read new inode info: %s", strerror(errno));
2546 goto cleanup;
2547 }
2548
2549 memcpy(&s->inode, inode, datalen);
2550 DPRINTF("s->inode: name %s snap_id %x oid %x\n",
2551 s->inode.name, s->inode.snap_id, s->inode.vdi_id);
2552
2553 cleanup:
2554 g_free(inode);
2555 closesocket(fd);
2556 return ret;
2557 }
2558
2559 /*
2560 * We implement the rollback (loadvm) operation to the specified snapshot by
2561 * 1) switching to the snapshot,
2562 * 2) relying on sd_create_branch to delete the working VDI, and
2563 * 3) creating a new working VDI based on the specified snapshot.
2564 */
2565 static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
2566 {
2567 BDRVSheepdogState *s = bs->opaque;
2568 BDRVSheepdogState *old_s;
2569 char tag[SD_MAX_VDI_TAG_LEN];
2570 uint32_t snapid = 0;
2571 int ret;
2572
2573 if (!sd_parse_snapid_or_tag(snapshot_id, &snapid, tag)) {
2574 return -EINVAL;
2575 }
2576
2577 old_s = g_new(BDRVSheepdogState, 1);
2578
2579 memcpy(old_s, s, sizeof(BDRVSheepdogState));
2580
2581 ret = reload_inode(s, snapid, tag);
2582 if (ret) {
2583 goto out;
2584 }
2585
2586 ret = sd_create_branch(s);
2587 if (ret) {
2588 goto out;
2589 }
2590
2591 g_free(old_s);
2592
2593 return 0;
2594 out:
2595 /* recover bdrv_sd_state */
2596 memcpy(s, old_s, sizeof(BDRVSheepdogState));
2597 g_free(old_s);
2598
2599 error_report("failed to open; recovering the old bdrv_sd_state");
2600
2601 return ret;
2602 }
2603
2604 #define NR_BATCHED_DISCARD 128
2605
2606 static int remove_objects(BDRVSheepdogState *s, Error **errp)
2607 {
2608 int fd, i = 0, nr_objs = 0;
2609 int ret;
2610 SheepdogInode *inode = &s->inode;
2611
2612 fd = connect_to_sdog(s, errp);
2613 if (fd < 0) {
2614 return fd;
2615 }
2616
2617 nr_objs = count_data_objs(inode);
2618 while (i < nr_objs) {
2619 int start_idx, nr_filled_idx;
2620
2621 while (i < nr_objs && !inode->data_vdi_id[i]) {
2622 i++;
2623 }
2624 start_idx = i;
2625
2626 nr_filled_idx = 0;
2627 while (i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD) {
2628 if (inode->data_vdi_id[i]) {
2629 inode->data_vdi_id[i] = 0;
2630 nr_filled_idx++;
2631 }
2632
2633 i++;
2634 }
2635
2636 ret = write_object(fd, s->bs,
2637 (char *)&inode->data_vdi_id[start_idx],
2638 vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
2639 (i - start_idx) * sizeof(uint32_t),
2640 offsetof(struct SheepdogInode,
2641 data_vdi_id[start_idx]),
2642 false, s->cache_flags);
2643 if (ret < 0) {
2644 error_setg(errp, "Failed to discard snapshot inode");
2645 goto out;
2646 }
2647 }
2648
2649 ret = 0;
2650 out:
2651 closesocket(fd);
2652 return ret;
2653 }
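/*
 * Illustration of the batching above (a sketch, not additional logic):
 * the allocation table is scanned in chunks containing at most
 * NR_BATCHED_DISCARD (128) allocated entries.  Each chunk is zeroed in
 * memory and flushed with one partial write_object() call covering
 * data_vdi_id[start_idx .. i-1], so a snapshot with thousands of
 * objects is discarded in a few requests instead of one per object.
 */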
2654
2655 static int sd_snapshot_delete(BlockDriverState *bs,
2656 const char *snapshot_id,
2657 const char *name,
2658 Error **errp)
2659 {
2660 /*
2661 * FIXME should delete the snapshot matching both @snapshot_id and
2662 * @name, but @name not used here
2663 */
2664 unsigned long snap_id = 0;
2665 char snap_tag[SD_MAX_VDI_TAG_LEN];
2666 int fd, ret;
2667 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
2668 BDRVSheepdogState *s = bs->opaque;
2669 unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0;
2670 uint32_t vid;
2671 SheepdogVdiReq hdr = {
2672 .opcode = SD_OP_DEL_VDI,
2673 .data_length = wlen,
2674 .flags = SD_FLAG_CMD_WRITE,
2675 };
2676 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2677
2678 ret = remove_objects(s, errp);
2679 if (ret) {
2680 return ret;
2681 }
2682
2683 memset(buf, 0, sizeof(buf));
2684 memset(snap_tag, 0, sizeof(snap_tag));
2685 pstrcpy(buf, SD_MAX_VDI_LEN, s->name);
2686 /* TODO Use sd_parse_snapid() once this mess is cleaned up */
2687 ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id);
2688 if (ret || snap_id > UINT32_MAX) {
2689 /*
2690 * FIXME Since qemu_strtoul() returns -EINVAL when
2691 * @snapshot_id is null, @snapshot_id is mandatory. Correct
2692 * would be to require at least one of @snapshot_id and @name.
2693 */
2694 error_setg(errp, "Invalid snapshot ID: %s",
2695 snapshot_id ? snapshot_id : "<null>");
2696 return -EINVAL;
2697 }
2698
2699 if (snap_id) {
2700 hdr.snapid = (uint32_t) snap_id;
2701 } else {
2702 /* FIXME I suspect we should use @name here */
2703 /* FIXME don't truncate silently */
2704 pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id);
2705 pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag);
2706 }
2707
2708 ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, errp);
2709 if (ret) {
2710 return ret;
2711 }
2712
2713 fd = connect_to_sdog(s, errp);
2714 if (fd < 0) {
2715 return fd;
2716 }
2717
2718 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
2719 buf, &wlen, &rlen);
2720 closesocket(fd);
2721 if (ret) {
2722 error_setg_errno(errp, -ret, "Couldn't send request to server");
2723 return ret;
2724 }
2725
2726 switch (rsp->result) {
2727 case SD_RES_NO_VDI:
2728 error_setg(errp, "Can't find the snapshot");
2729 return -ENOENT;
2730 case SD_RES_SUCCESS:
2731 break;
2732 default:
2733 error_setg(errp, "%s", sd_strerror(rsp->result));
2734 return -EIO;
2735 }
2736
2737 return 0;
2738 }
2739
2740 static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
2741 {
2742 Error *local_err = NULL;
2743 BDRVSheepdogState *s = bs->opaque;
2744 SheepdogReq req;
2745 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
2746 QEMUSnapshotInfo *sn_tab = NULL;
2747 unsigned wlen, rlen;
2748 int found = 0;
2749 static SheepdogInode inode;
2750 unsigned long *vdi_inuse;
2751 unsigned int start_nr;
2752 uint64_t hval;
2753 uint32_t vid;
2754
2755 vdi_inuse = g_malloc(max);
2756
2757 fd = connect_to_sdog(s, &local_err);
2758 if (fd < 0) {
2759 error_report_err(local_err);
2760 ret = fd;
2761 goto out;
2762 }
2763
2764 rlen = max;
2765 wlen = 0;
2766
2767 memset(&req, 0, sizeof(req));
2768
2769 req.opcode = SD_OP_READ_VDIS;
2770 req.data_length = max;
2771
2772 ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen);
2773
2774 closesocket(fd);
2775 if (ret) {
2776 goto out;
2777 }
2778
2779 sn_tab = g_new0(QEMUSnapshotInfo, nr);
2780
2781 /* calculate a vdi id with hash function */
2782 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
2783 start_nr = hval & (SD_NR_VDIS - 1);
2784
2785 fd = connect_to_sdog(s, &local_err);
2786 if (fd < 0) {
2787 error_report_err(local_err);
2788 ret = fd;
2789 goto out;
2790 }
2791
2792 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
2793 if (!test_bit(vid, vdi_inuse)) {
2794 break;
2795 }
2796
2797 /* we don't need to read entire object */
2798 ret = read_object(fd, s->bs, (char *)&inode,
2799 vid_to_vdi_oid(vid),
2800 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
2801 s->cache_flags);
2802
2803 if (ret) {
2804 continue;
2805 }
2806
2807 if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
2808 sn_tab[found].date_sec = inode.snap_ctime >> 32;
2809 sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
2810 sn_tab[found].vm_state_size = inode.vm_state_size;
2811 sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
2812
2813 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
2814 "%" PRIu32, inode.snap_id);
2815 pstrcpy(sn_tab[found].name,
2816 MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
2817 inode.tag);
2818 found++;
2819 }
2820 }
2821
2822 closesocket(fd);
2823 out:
2824 *psn_tab = sn_tab;
2825
2826 g_free(vdi_inuse);
2827
2828 if (ret < 0) {
2829 return ret;
2830 }
2831
2832 return found;
2833 }
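/*
 * Note on the scan above (a sketch of the idea, not extra code):
 * Sheepdog assigns VDI ids by hashing the image name with FNV-1a and
 * probing linearly for a free slot, so VDIs sharing a name (i.e. the
 * image and its snapshots) tend to occupy consecutive slots.  The loop
 * therefore starts at hval & (SD_NR_VDIS - 1), stops at the first bit
 * that is clear in vdi_inuse, and reads only the inode header of each
 * candidate to check the name and snapshot state.
 */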
2834
2835 static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
2836 int64_t pos, int size, int load)
2837 {
2838 Error *local_err = NULL;
2839 bool create;
2840 int fd, ret = 0, remaining = size;
2841 unsigned int data_len;
2842 uint64_t vmstate_oid;
2843 uint64_t offset;
2844 uint32_t vdi_index;
2845 uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
2846 uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);
2847
2848 fd = connect_to_sdog(s, &local_err);
2849 if (fd < 0) {
2850 error_report_err(local_err);
2851 return fd;
2852 }
2853
2854 while (remaining) {
2855 vdi_index = pos / object_size;
2856 offset = pos % object_size;
2857
2858 data_len = MIN(remaining, object_size - offset);
2859
2860 vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
2861
2862 create = (offset == 0);
2863 if (load) {
2864 ret = read_object(fd, s->bs, (char *)data, vmstate_oid,
2865 s->inode.nr_copies, data_len, offset,
2866 s->cache_flags);
2867 } else {
2868 ret = write_object(fd, s->bs, (char *)data, vmstate_oid,
2869 s->inode.nr_copies, data_len, offset, create,
2870 s->cache_flags);
2871 }
2872
2873 if (ret < 0) {
2874 error_report("failed to %s vmstate: %s", load ? "load" : "save", strerror(errno));
2875 goto cleanup;
2876 }
2877
2878 pos += data_len;
2879 data += data_len;
2880 remaining -= data_len;
2881 }
2882 ret = size;
2883 cleanup:
2884 closesocket(fd);
2885 return ret;
2886 }
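/*
 * Sketch of the vmstate chunking above (illustrative numbers): the VM
 * state is split across vmstate objects of the same object_size as
 * data objects.  For a 5 MiB write at pos = 7 MiB with 4 MiB objects
 *
 *     vdi_index = 1, offset = 3 MiB, data_len = 1 MiB
 *     vdi_index = 2, offset = 0,     data_len = 4 MiB
 *
 * and a vmstate object is only created when its chunk starts at
 * offset 0.
 */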
2887
2888 static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
2889 int64_t pos)
2890 {
2891 BDRVSheepdogState *s = bs->opaque;
2892 void *buf;
2893 int ret;
2894
2895 buf = qemu_blockalign(bs, qiov->size);
2896 qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
2897 ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0);
2898 qemu_vfree(buf);
2899
2900 return ret;
2901 }
2902
2903 static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
2904 int64_t pos)
2905 {
2906 BDRVSheepdogState *s = bs->opaque;
2907 void *buf;
2908 int ret;
2909
2910 buf = qemu_blockalign(bs, qiov->size);
2911 ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1);
2912 qemu_iovec_from_buf(qiov, 0, buf, qiov->size);
2913 qemu_vfree(buf);
2914
2915 return ret;
2916 }
2917
2918
2919 static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
2920 int count)
2921 {
2922 SheepdogAIOCB acb;
2923 BDRVSheepdogState *s = bs->opaque;
2924 QEMUIOVector discard_iov;
2925 struct iovec iov;
2926 uint32_t zero = 0;
2927
2928 if (!s->discard_supported) {
2929 return 0;
2930 }
2931
2932 memset(&discard_iov, 0, sizeof(discard_iov));
2933 memset(&iov, 0, sizeof(iov));
2934 iov.iov_base = &zero;
2935 iov.iov_len = sizeof(zero);
2936 discard_iov.iov = &iov;
2937 discard_iov.niov = 1;
2938 if (!QEMU_IS_ALIGNED(offset | count, BDRV_SECTOR_SIZE)) {
2939 return -ENOTSUP;
2940 }
2941 sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
2942 count >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
2943 sd_co_rw_vector(&acb);
2944 sd_aio_complete(&acb);
2945
2946 return acb.ret;
2947 }
2948
2949 static coroutine_fn int64_t
2950 sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2951 int *pnum, BlockDriverState **file)
2952 {
2953 BDRVSheepdogState *s = bs->opaque;
2954 SheepdogInode *inode = &s->inode;
2955 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
2956 uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
2957 unsigned long start = offset / object_size,
2958 end = DIV_ROUND_UP((sector_num + nb_sectors) *
2959 BDRV_SECTOR_SIZE, object_size);
2960 unsigned long idx;
2961 int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
2962
2963 for (idx = start; idx < end; idx++) {
2964 if (inode->data_vdi_id[idx] == 0) {
2965 break;
2966 }
2967 }
2968 if (idx == start) {
2969 /* Get the longest length of unallocated sectors */
2970 ret = 0;
2971 for (idx = start + 1; idx < end; idx++) {
2972 if (inode->data_vdi_id[idx] != 0) {
2973 break;
2974 }
2975 }
2976 }
2977
2978 *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
2979 if (*pnum > nb_sectors) {
2980 *pnum = nb_sectors;
2981 }
2982 if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
2983 *file = bs;
2984 }
2985 return ret;
2986 }
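/*
 * Note on the scan above (illustrative summary): the allocation map is
 * walked object by object starting at the object that contains
 * sector_num.  If that first object is allocated, the run of allocated
 * objects is reported as BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 * otherwise the run of unallocated objects is reported as a hole
 * (ret = 0).  *pnum is the run length in sectors, clamped to
 * nb_sectors.
 */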
2987
2988 static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
2989 {
2990 BDRVSheepdogState *s = bs->opaque;
2991 SheepdogInode *inode = &s->inode;
2992 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
2993 unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size);
2994 uint64_t size = 0;
2995
2996 for (i = 0; i < last; i++) {
2997 if (inode->data_vdi_id[i] == 0) {
2998 continue;
2999 }
3000 size += object_size;
3001 }
3002 return size;
3003 }
3004
3005 static QemuOptsList sd_create_opts = {
3006 .name = "sheepdog-create-opts",
3007 .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head),
3008 .desc = {
3009 {
3010 .name = BLOCK_OPT_SIZE,
3011 .type = QEMU_OPT_SIZE,
3012 .help = "Virtual disk size"
3013 },
3014 {
3015 .name = BLOCK_OPT_BACKING_FILE,
3016 .type = QEMU_OPT_STRING,
3017 .help = "File name of a base image"
3018 },
3019 {
3020 .name = BLOCK_OPT_PREALLOC,
3021 .type = QEMU_OPT_STRING,
3022 .help = "Preallocation mode (allowed values: off, full)"
3023 },
3024 {
3025 .name = BLOCK_OPT_REDUNDANCY,
3026 .type = QEMU_OPT_STRING,
3027 .help = "Redundancy of the image"
3028 },
3029 {
3030 .name = BLOCK_OPT_OBJECT_SIZE,
3031 .type = QEMU_OPT_SIZE,
3032 .help = "Object size of the image"
3033 },
3034 { /* end of list */ }
3035 }
3036 };
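/*
 * Example of how these options are typically used (a sketch; the
 * option spellings and the URI form of the filename are assumptions
 * based on the usual BLOCK_OPT_* names and sd_parse_filename):
 *
 *     qemu-img create -o preallocation=full,redundancy=3,object_size=4M \
 *         sheepdog://localhost:7000/myvdi 10G
 *
 * "redundancy" is parsed by parse_redundancy() and "object_size" must
 * satisfy the power-of-two range checked in parse_block_size_shift().
 */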
3037
3038 static BlockDriver bdrv_sheepdog = {
3039 .format_name = "sheepdog",
3040 .protocol_name = "sheepdog",
3041 .instance_size = sizeof(BDRVSheepdogState),
3042 .bdrv_parse_filename = sd_parse_filename,
3043 .bdrv_file_open = sd_open,
3044 .bdrv_reopen_prepare = sd_reopen_prepare,
3045 .bdrv_reopen_commit = sd_reopen_commit,
3046 .bdrv_reopen_abort = sd_reopen_abort,
3047 .bdrv_close = sd_close,
3048 .bdrv_create = sd_create,
3049 .bdrv_has_zero_init = bdrv_has_zero_init_1,
3050 .bdrv_getlength = sd_getlength,
3051 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
3052 .bdrv_truncate = sd_truncate,
3053
3054 .bdrv_co_readv = sd_co_readv,
3055 .bdrv_co_writev = sd_co_writev,
3056 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
3057 .bdrv_co_pdiscard = sd_co_pdiscard,
3058 .bdrv_co_get_block_status = sd_co_get_block_status,
3059
3060 .bdrv_snapshot_create = sd_snapshot_create,
3061 .bdrv_snapshot_goto = sd_snapshot_goto,
3062 .bdrv_snapshot_delete = sd_snapshot_delete,
3063 .bdrv_snapshot_list = sd_snapshot_list,
3064
3065 .bdrv_save_vmstate = sd_save_vmstate,
3066 .bdrv_load_vmstate = sd_load_vmstate,
3067
3068 .bdrv_detach_aio_context = sd_detach_aio_context,
3069 .bdrv_attach_aio_context = sd_attach_aio_context,
3070
3071 .create_opts = &sd_create_opts,
3072 };
3073
3074 static BlockDriver bdrv_sheepdog_tcp = {
3075 .format_name = "sheepdog",
3076 .protocol_name = "sheepdog+tcp",
3077 .instance_size = sizeof(BDRVSheepdogState),
3078 .bdrv_parse_filename = sd_parse_filename,
3079 .bdrv_file_open = sd_open,
3080 .bdrv_reopen_prepare = sd_reopen_prepare,
3081 .bdrv_reopen_commit = sd_reopen_commit,
3082 .bdrv_reopen_abort = sd_reopen_abort,
3083 .bdrv_close = sd_close,
3084 .bdrv_create = sd_create,
3085 .bdrv_has_zero_init = bdrv_has_zero_init_1,
3086 .bdrv_getlength = sd_getlength,
3087 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
3088 .bdrv_truncate = sd_truncate,
3089
3090 .bdrv_co_readv = sd_co_readv,
3091 .bdrv_co_writev = sd_co_writev,
3092 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
3093 .bdrv_co_pdiscard = sd_co_pdiscard,
3094 .bdrv_co_get_block_status = sd_co_get_block_status,
3095
3096 .bdrv_snapshot_create = sd_snapshot_create,
3097 .bdrv_snapshot_goto = sd_snapshot_goto,
3098 .bdrv_snapshot_delete = sd_snapshot_delete,
3099 .bdrv_snapshot_list = sd_snapshot_list,
3100
3101 .bdrv_save_vmstate = sd_save_vmstate,
3102 .bdrv_load_vmstate = sd_load_vmstate,
3103
3104 .bdrv_detach_aio_context = sd_detach_aio_context,
3105 .bdrv_attach_aio_context = sd_attach_aio_context,
3106
3107 .create_opts = &sd_create_opts,
3108 };
3109
3110 static BlockDriver bdrv_sheepdog_unix = {
3111 .format_name = "sheepdog",
3112 .protocol_name = "sheepdog+unix",
3113 .instance_size = sizeof(BDRVSheepdogState),
3114 .bdrv_parse_filename = sd_parse_filename,
3115 .bdrv_file_open = sd_open,
3116 .bdrv_reopen_prepare = sd_reopen_prepare,
3117 .bdrv_reopen_commit = sd_reopen_commit,
3118 .bdrv_reopen_abort = sd_reopen_abort,
3119 .bdrv_close = sd_close,
3120 .bdrv_create = sd_create,
3121 .bdrv_has_zero_init = bdrv_has_zero_init_1,
3122 .bdrv_getlength = sd_getlength,
3123 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
3124 .bdrv_truncate = sd_truncate,
3125
3126 .bdrv_co_readv = sd_co_readv,
3127 .bdrv_co_writev = sd_co_writev,
3128 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
3129 .bdrv_co_pdiscard = sd_co_pdiscard,
3130 .bdrv_co_get_block_status = sd_co_get_block_status,
3131
3132 .bdrv_snapshot_create = sd_snapshot_create,
3133 .bdrv_snapshot_goto = sd_snapshot_goto,
3134 .bdrv_snapshot_delete = sd_snapshot_delete,
3135 .bdrv_snapshot_list = sd_snapshot_list,
3136
3137 .bdrv_save_vmstate = sd_save_vmstate,
3138 .bdrv_load_vmstate = sd_load_vmstate,
3139
3140 .bdrv_detach_aio_context = sd_detach_aio_context,
3141 .bdrv_attach_aio_context = sd_attach_aio_context,
3142
3143 .create_opts = &sd_create_opts,
3144 };
3145
3146 static void bdrv_sheepdog_init(void)
3147 {
3148 bdrv_register(&bdrv_sheepdog);
3149 bdrv_register(&bdrv_sheepdog_tcp);
3150 bdrv_register(&bdrv_sheepdog_unix);
3151 }
3152 block_init(bdrv_sheepdog_init);