]> git.proxmox.com Git - qemu.git/blame_incremental - block/sheepdog.c
sheepdog: accept URIs
[qemu.git] / block / sheepdog.c
... / ...
CommitLineData
1/*
2 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License version
6 * 2 as published by the Free Software Foundation.
7 *
8 * You should have received a copy of the GNU General Public License
9 * along with this program. If not, see <http://www.gnu.org/licenses/>.
10 *
11 * Contributions after 2012-01-13 are licensed under the terms of the
12 * GNU GPL, version 2 or (at your option) any later version.
13 */
14
15#include "qemu-common.h"
16#include "qemu/uri.h"
17#include "qemu/error-report.h"
18#include "qemu/sockets.h"
19#include "block/block_int.h"
20#include "qemu/bitops.h"
21
/* Protocol version placed in the proto_ver field of request headers. */
#define SD_PROTO_VER 0x01

/* Server address and port used when the image specification names none. */
#define SD_DEFAULT_ADDR "localhost"
#define SD_DEFAULT_PORT "7000"

/* Opcodes operating on data/vdi objects. */
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ              0x02
#define SD_OP_WRITE_OBJ             0x03

/* Opcodes operating on vdis. */
#define SD_OP_NEW_VDI               0x11
#define SD_OP_LOCK_VDI              0x12
#define SD_OP_RELEASE_VDI           0x13
#define SD_OP_GET_VDI_INFO          0x14
#define SD_OP_READ_VDIS             0x15
#define SD_OP_FLUSH_VDI             0x16

/* Bits for the flags field of a request header. */
#define SD_FLAG_CMD_WRITE    0x01
#define SD_FLAG_CMD_COW      0x02
#define SD_FLAG_CMD_CACHE    0x04 /* Writeback mode for cache */
#define SD_FLAG_CMD_DIRECT   0x08 /* Don't use cache */
42
/* Values of the result field in a response header. */
#define SD_RES_SUCCESS          0x00 /* Success */
#define SD_RES_UNKNOWN          0x01 /* Unknown error */
#define SD_RES_NO_OBJ           0x02 /* No object found */
#define SD_RES_EIO              0x03 /* I/O error */
#define SD_RES_VDI_EXIST        0x04 /* Vdi exists already */
#define SD_RES_INVALID_PARMS    0x05 /* Invalid parameters */
#define SD_RES_SYSTEM_ERROR     0x06 /* System error */
#define SD_RES_VDI_LOCKED       0x07 /* Vdi is locked */
#define SD_RES_NO_VDI           0x08 /* No vdi found */
#define SD_RES_NO_BASE_VDI      0x09 /* No base vdi found */
#define SD_RES_VDI_READ         0x0A /* Cannot read requested vdi */
#define SD_RES_VDI_WRITE        0x0B /* Cannot write requested vdi */
#define SD_RES_BASE_VDI_READ    0x0C /* Cannot read base vdi */
#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base vdi */
#define SD_RES_NO_TAG           0x0E /* Requested tag is not found */
#define SD_RES_STARTUP          0x0F /* Sheepdog is starting up */
#define SD_RES_VDI_NOT_LOCKED   0x10 /* Vdi is not locked */
#define SD_RES_SHUTDOWN         0x11 /* Sheepdog is shutting down */
#define SD_RES_NO_MEM           0x12 /* Cannot allocate memory */
#define SD_RES_FULL_VDI         0x13 /* The maximum number of vdis is reached */
#define SD_RES_VER_MISMATCH     0x14 /* Protocol version mismatch */
#define SD_RES_NO_SPACE         0x15 /* Server has no room for new objects */
#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN    0x17 /* Waiting for other nodes to join */
#define SD_RES_JOIN_FAILED      0x18 /* Target node failed to join sheepdog */
68
/*
 * Object ID rules
 *
 *  0 - 19 (20 bits): data object space
 * 20 - 31 (12 bits): reserved data object space
 * 32 - 55 (24 bits): vdi object space
 * 56 - 59 ( 4 bits): reserved vdi object space
 * 60 - 63 ( 4 bits): object type identifier space
 */

#define VDI_SPACE_SHIFT     32
#define VDI_BIT             (UINT64_C(1) << 63)
#define VMSTATE_BIT         (UINT64_C(1) << 62)
#define MAX_DATA_OBJS       (UINT64_C(1) << 20)
#define MAX_CHILDREN        1024
#define SD_MAX_VDI_LEN      256
#define SD_MAX_VDI_TAG_LEN  256
#define SD_NR_VDIS          (1U << 24)
#define SD_DATA_OBJ_SIZE    (UINT64_C(1) << 22)
/* Largest vdi: every data object slot filled with a full-size object. */
#define SD_MAX_VDI_SIZE     (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
#define SECTOR_SIZE         512

#define SD_INODE_SIZE       (sizeof(SheepdogInode))
/* snapid value meaning "the current (non-snapshot) vdi". */
#define CURRENT_VDI_ID      0
93
/*
 * Generic request header.
 *
 * The typed request structs in this file share the first four words of
 * this layout and are cast to SheepdogReq when handed to do_req().
 */
typedef struct SheepdogReq {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;        /* size of the payload following the header */
    uint32_t    opcode_specific[8]; /* interpreted per opcode */
} SheepdogReq;
103
/*
 * Generic response header; same size as SheepdogReq, with the first
 * opcode-specific word holding the result code.
 */
typedef struct SheepdogRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint32_t    result;             /* one of the SD_RES_* codes */
    uint32_t    opcode_specific[7]; /* interpreted per opcode */
} SheepdogRsp;
114
/* Request header for object read/write/flush operations. */
typedef struct SheepdogObjReq {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint64_t    oid;        /* object being accessed */
    uint64_t    cow_oid;    /* filled from the request's base_oid */
    uint32_t    copies;
    uint32_t    rsvd;
    uint64_t    offset;     /* byte offset inside the object */
} SheepdogObjReq;
128
/* Response header for object operations. */
typedef struct SheepdogObjRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;             /* matched against AIOReq.id on receipt */
    uint32_t    data_length;
    uint32_t    result;         /* one of the SD_RES_* codes */
    uint32_t    copies;
    uint32_t    pad[6];
} SheepdogObjRsp;
140
/* Request header for vdi operations (create, lock, lookup). */
typedef struct SheepdogVdiReq {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint64_t    vdi_size;
    uint32_t    vdi_id;     /* set to the base vdi id when creating */
    uint32_t    copies;
    uint32_t    snapid;
    uint32_t    pad[3];
} SheepdogVdiReq;
154
/* Response header for vdi operations. */
typedef struct SheepdogVdiRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint32_t    result;     /* one of the SD_RES_* codes */
    uint32_t    rsvd;
    uint32_t    vdi_id;     /* id of the vdi found or created */
    uint32_t    pad[5];
} SheepdogVdiRsp;
167
168typedef struct SheepdogInode {
169 char name[SD_MAX_VDI_LEN];
170 char tag[SD_MAX_VDI_TAG_LEN];
171 uint64_t ctime;
172 uint64_t snap_ctime;
173 uint64_t vm_clock_nsec;
174 uint64_t vdi_size;
175 uint64_t vm_state_size;
176 uint16_t copy_policy;
177 uint8_t nr_copies;
178 uint8_t block_size_shift;
179 uint32_t snap_id;
180 uint32_t vdi_id;
181 uint32_t parent_vdi_id;
182 uint32_t child_vdi_id[MAX_CHILDREN];
183 uint32_t data_vdi_id[MAX_DATA_OBJS];
184} SheepdogInode;
185
/*
 * 64 bit FNV-1a non-zero initial basis
 */
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)

/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Folds the `len' bytes at `buf' into the running hash `hval' and
 * returns the updated hash.  Pass FNV1A_64_INIT as `hval' to start a
 * new hash, or a previous return value to continue one.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t pos;

    for (pos = 0; pos < len; pos++) {
        /* xor the low 8 bits of the hash with the current octet */
        hval ^= (uint64_t)bytes[pos];
        /* multiply by the 64 bit FNV prime (2^40 + 0x1b3), mod 2^64 */
        hval *= UINT64_C(0x100000001b3);
    }
    return hval;
}
205
206static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
207{
208 return inode->vdi_id == inode->data_vdi_id[idx];
209}
210
211static inline bool is_data_obj(uint64_t oid)
212{
213 return !(VDI_BIT & oid);
214}
215
216static inline uint64_t data_oid_to_idx(uint64_t oid)
217{
218 return oid & (MAX_DATA_OBJS - 1);
219}
220
221static inline uint64_t vid_to_vdi_oid(uint32_t vid)
222{
223 return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
224}
225
226static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
227{
228 return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
229}
230
231static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
232{
233 return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
234}
235
236static inline bool is_snapshot(struct SheepdogInode *inode)
237{
238 return !!inode->snap_ctime;
239}
240
/*
 * Debug tracing: with DEBUG_SDOG defined, dprintf() prints the calling
 * function and line followed by the formatted message to stdout;
 * otherwise it expands to nothing.
 */
#undef dprintf
#ifdef DEBUG_SDOG
#define dprintf(fmt, ...)                                               \
    do {                                                                \
        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__,              \
                ##__VA_ARGS__);                                         \
    } while (0)
#else
#define dprintf(fmt, ...)
#endif
250
251typedef struct SheepdogAIOCB SheepdogAIOCB;
252
253typedef struct AIOReq {
254 SheepdogAIOCB *aiocb;
255 unsigned int iov_offset;
256
257 uint64_t oid;
258 uint64_t base_oid;
259 uint64_t offset;
260 unsigned int data_len;
261 uint8_t flags;
262 uint32_t id;
263
264 QLIST_ENTRY(AIOReq) aio_siblings;
265} AIOReq;
266
/* Kind of operation a SheepdogAIOCB carries out. */
enum AIOCBState {
    AIOCB_WRITE_UDATA,      /* write user data */
    AIOCB_READ_UDATA,       /* read user data */
    AIOCB_FLUSH_CACHE,      /* flush the server-side cache */
};
272
273struct SheepdogAIOCB {
274 BlockDriverAIOCB common;
275
276 QEMUIOVector *qiov;
277
278 int64_t sector_num;
279 int nb_sectors;
280
281 int ret;
282 enum AIOCBState aiocb_type;
283
284 Coroutine *coroutine;
285 void (*aio_done_func)(SheepdogAIOCB *);
286
287 bool canceled;
288 int nr_pending;
289};
290
291typedef struct BDRVSheepdogState {
292 SheepdogInode inode;
293
294 uint32_t min_dirty_data_idx;
295 uint32_t max_dirty_data_idx;
296
297 char name[SD_MAX_VDI_LEN];
298 bool is_snapshot;
299 uint32_t cache_flags;
300
301 char *addr;
302 char *port;
303 int fd;
304
305 CoMutex lock;
306 Coroutine *co_send;
307 Coroutine *co_recv;
308
309 uint32_t aioreq_seq_num;
310 QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
311 QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
312} BDRVSheepdogState;
313
314static const char * sd_strerror(int err)
315{
316 int i;
317
318 static const struct {
319 int err;
320 const char *desc;
321 } errors[] = {
322 {SD_RES_SUCCESS, "Success"},
323 {SD_RES_UNKNOWN, "Unknown error"},
324 {SD_RES_NO_OBJ, "No object found"},
325 {SD_RES_EIO, "I/O error"},
326 {SD_RES_VDI_EXIST, "VDI exists already"},
327 {SD_RES_INVALID_PARMS, "Invalid parameters"},
328 {SD_RES_SYSTEM_ERROR, "System error"},
329 {SD_RES_VDI_LOCKED, "VDI is already locked"},
330 {SD_RES_NO_VDI, "No vdi found"},
331 {SD_RES_NO_BASE_VDI, "No base VDI found"},
332 {SD_RES_VDI_READ, "Failed read the requested VDI"},
333 {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
334 {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
335 {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
336 {SD_RES_NO_TAG, "Failed to find the requested tag"},
337 {SD_RES_STARTUP, "The system is still booting"},
338 {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
339 {SD_RES_SHUTDOWN, "The system is shutting down"},
340 {SD_RES_NO_MEM, "Out of memory on the server"},
341 {SD_RES_FULL_VDI, "We already have the maximum vdis"},
342 {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
343 {SD_RES_NO_SPACE, "Server has no space for new objects"},
344 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
345 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
346 {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
347 };
348
349 for (i = 0; i < ARRAY_SIZE(errors); ++i) {
350 if (errors[i].err == err) {
351 return errors[i].desc;
352 }
353 }
354
355 return "Invalid error code";
356}
357
/*
 * Sheepdog I/O handling:
 *
 * 1. In sd_co_rw_vector, we send the I/O requests to the server and
 *    link the requests to the inflight_aio_head list in the
 *    BDRVSheepdogState.  The function exits without waiting for
 *    the responses.
 *
 * 2. We receive the responses in aio_read_response, the fd handler for
 *    the sheepdog connection.  If a metadata update is needed, we send
 *    the write request to the vdi object in sd_write_done, the write
 *    completion function.  We switch back to sd_co_readv/writev after
 *    all the requests belonging to the AIOCB are finished.
 */
372
373static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
374 uint64_t oid, unsigned int data_len,
375 uint64_t offset, uint8_t flags,
376 uint64_t base_oid, unsigned int iov_offset)
377{
378 AIOReq *aio_req;
379
380 aio_req = g_malloc(sizeof(*aio_req));
381 aio_req->aiocb = acb;
382 aio_req->iov_offset = iov_offset;
383 aio_req->oid = oid;
384 aio_req->base_oid = base_oid;
385 aio_req->offset = offset;
386 aio_req->data_len = data_len;
387 aio_req->flags = flags;
388 aio_req->id = s->aioreq_seq_num++;
389
390 acb->nr_pending++;
391 return aio_req;
392}
393
394static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
395{
396 SheepdogAIOCB *acb = aio_req->aiocb;
397
398 QLIST_REMOVE(aio_req, aio_siblings);
399 g_free(aio_req);
400
401 acb->nr_pending--;
402}
403
404static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
405{
406 if (!acb->canceled) {
407 qemu_coroutine_enter(acb->coroutine, NULL);
408 }
409 qemu_aio_release(acb);
410}
411
412static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
413{
414 SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
415
416 /*
417 * Sheepdog cannot cancel the requests which are already sent to
418 * the servers, so we just complete the request with -EIO here.
419 */
420 acb->ret = -EIO;
421 qemu_coroutine_enter(acb->coroutine, NULL);
422 acb->canceled = true;
423}
424
425static const AIOCBInfo sd_aiocb_info = {
426 .aiocb_size = sizeof(SheepdogAIOCB),
427 .cancel = sd_aio_cancel,
428};
429
430static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
431 int64_t sector_num, int nb_sectors)
432{
433 SheepdogAIOCB *acb;
434
435 acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
436
437 acb->qiov = qiov;
438
439 acb->sector_num = sector_num;
440 acb->nb_sectors = nb_sectors;
441
442 acb->aio_done_func = NULL;
443 acb->canceled = false;
444 acb->coroutine = qemu_coroutine_self();
445 acb->ret = 0;
446 acb->nr_pending = 0;
447 return acb;
448}
449
/*
 * Open a TCP connection to the sheep daemon at addr:port.
 *
 * A NULL `addr' selects SD_DEFAULT_ADDR and SD_DEFAULT_PORT; a NULL
 * `port' alone selects SD_DEFAULT_PORT.  Every address returned by the
 * resolver is tried in turn until one connects.
 *
 * Returns the connected socket descriptor, or a negative errno value
 * (never 0) on failure.
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret;
    int sock = -1;
    int err = 0;
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    } else if (!port) {
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /*
         * getaddrinfo() reports failures through its return value, not
         * errno; errno is only meaningful for EAI_SYSTEM.
         */
        err = EINVAL;
#ifdef EAI_SYSTEM
        if (ret == EAI_SYSTEM && errno) {
            err = errno;
        }
#endif
        error_report("unable to get address info %s, %s",
                     addr, gai_strerror(ret));
        return -err;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            err = errno;
            continue;
        }

        do {
            ret = connect(fd, res->ai_addr, res->ai_addrlen);
        } while (ret < 0 && errno == EINTR);

        if (ret < 0) {
            /* save errno before closing, then try the next address */
            err = errno;
            closesocket(fd);
            continue;
        }

        dprintf("connected to %s:%s\n", addr, port);
        sock = fd;
        break;
    }

    if (sock < 0) {
        /* make sure a failure can never be mistaken for descriptor 0 */
        sock = -(err ? err : EIO);
        error_report("failed connect to %s:%s", addr, port);
    }

    freeaddrinfo(res0);
    return sock;
}
502
503static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
504 unsigned int *wlen)
505{
506 int ret;
507
508 ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
509 if (ret < sizeof(*hdr)) {
510 error_report("failed to send a req, %s", strerror(errno));
511 return ret;
512 }
513
514 ret = qemu_co_send(sockfd, data, *wlen);
515 if (ret < *wlen) {
516 error_report("failed to send a req, %s", strerror(errno));
517 }
518
519 return ret;
520}
521
522static void restart_co_req(void *opaque)
523{
524 Coroutine *co = opaque;
525
526 qemu_coroutine_enter(co, NULL);
527}
528
529typedef struct SheepdogReqCo {
530 int sockfd;
531 SheepdogReq *hdr;
532 void *data;
533 unsigned int *wlen;
534 unsigned int *rlen;
535 int ret;
536 bool finished;
537} SheepdogReqCo;
538
539static coroutine_fn void do_co_req(void *opaque)
540{
541 int ret;
542 Coroutine *co;
543 SheepdogReqCo *srco = opaque;
544 int sockfd = srco->sockfd;
545 SheepdogReq *hdr = srco->hdr;
546 void *data = srco->data;
547 unsigned int *wlen = srco->wlen;
548 unsigned int *rlen = srco->rlen;
549
550 co = qemu_coroutine_self();
551 qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co);
552
553 socket_set_block(sockfd);
554 ret = send_co_req(sockfd, hdr, data, wlen);
555 if (ret < 0) {
556 goto out;
557 }
558
559 qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co);
560
561 ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
562 if (ret < sizeof(*hdr)) {
563 error_report("failed to get a rsp, %s", strerror(errno));
564 ret = -errno;
565 goto out;
566 }
567
568 if (*rlen > hdr->data_length) {
569 *rlen = hdr->data_length;
570 }
571
572 if (*rlen) {
573 ret = qemu_co_recv(sockfd, data, *rlen);
574 if (ret < *rlen) {
575 error_report("failed to get the data, %s", strerror(errno));
576 ret = -errno;
577 goto out;
578 }
579 }
580 ret = 0;
581out:
582 qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL);
583 socket_set_nonblock(sockfd);
584
585 srco->ret = ret;
586 srco->finished = true;
587}
588
589static int do_req(int sockfd, SheepdogReq *hdr, void *data,
590 unsigned int *wlen, unsigned int *rlen)
591{
592 Coroutine *co;
593 SheepdogReqCo srco = {
594 .sockfd = sockfd,
595 .hdr = hdr,
596 .data = data,
597 .wlen = wlen,
598 .rlen = rlen,
599 .ret = 0,
600 .finished = false,
601 };
602
603 if (qemu_in_coroutine()) {
604 do_co_req(&srco);
605 } else {
606 co = qemu_coroutine_create(do_co_req);
607 qemu_coroutine_enter(co, &srco);
608 while (!srco.finished) {
609 qemu_aio_wait();
610 }
611 }
612
613 return srco.ret;
614}
615
616static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
617 struct iovec *iov, int niov, bool create,
618 enum AIOCBState aiocb_type);
619
620
621static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
622{
623 AIOReq *aio_req;
624
625 QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) {
626 if (aio_req->oid == oid) {
627 return aio_req;
628 }
629 }
630
631 return NULL;
632}
633
634/*
635 * This function searchs pending requests to the object `oid', and
636 * sends them.
637 */
638static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
639{
640 AIOReq *aio_req;
641 SheepdogAIOCB *acb;
642 int ret;
643
644 while ((aio_req = find_pending_req(s, oid)) != NULL) {
645 acb = aio_req->aiocb;
646 /* move aio_req from pending list to inflight one */
647 QLIST_REMOVE(aio_req, aio_siblings);
648 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
649 ret = add_aio_request(s, aio_req, acb->qiov->iov,
650 acb->qiov->niov, false, acb->aiocb_type);
651 if (ret < 0) {
652 error_report("add_aio_request is failed");
653 free_aio_req(s, aio_req);
654 if (!acb->nr_pending) {
655 sd_finish_aiocb(acb);
656 }
657 }
658 }
659}
660
661/*
662 * Receive responses of the I/O requests.
663 *
664 * This function is registered as a fd handler, and called from the
665 * main loop when s->fd is ready for reading responses.
666 */
667static void coroutine_fn aio_read_response(void *opaque)
668{
669 SheepdogObjRsp rsp;
670 BDRVSheepdogState *s = opaque;
671 int fd = s->fd;
672 int ret;
673 AIOReq *aio_req = NULL;
674 SheepdogAIOCB *acb;
675 unsigned long idx;
676
677 if (QLIST_EMPTY(&s->inflight_aio_head)) {
678 goto out;
679 }
680
681 /* read a header */
682 ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
683 if (ret < 0) {
684 error_report("failed to get the header, %s", strerror(errno));
685 goto out;
686 }
687
688 /* find the right aio_req from the inflight aio list */
689 QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
690 if (aio_req->id == rsp.id) {
691 break;
692 }
693 }
694 if (!aio_req) {
695 error_report("cannot find aio_req %x", rsp.id);
696 goto out;
697 }
698
699 acb = aio_req->aiocb;
700
701 switch (acb->aiocb_type) {
702 case AIOCB_WRITE_UDATA:
703 /* this coroutine context is no longer suitable for co_recv
704 * because we may send data to update vdi objects */
705 s->co_recv = NULL;
706 if (!is_data_obj(aio_req->oid)) {
707 break;
708 }
709 idx = data_oid_to_idx(aio_req->oid);
710
711 if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
712 /*
713 * If the object is newly created one, we need to update
714 * the vdi object (metadata object). min_dirty_data_idx
715 * and max_dirty_data_idx are changed to include updated
716 * index between them.
717 */
718 if (rsp.result == SD_RES_SUCCESS) {
719 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
720 s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
721 s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
722 }
723 /*
724 * Some requests may be blocked because simultaneous
725 * create requests are not allowed, so we search the
726 * pending requests here.
727 */
728 send_pending_req(s, aio_req->oid);
729 }
730 break;
731 case AIOCB_READ_UDATA:
732 ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
733 aio_req->iov_offset, rsp.data_length);
734 if (ret < 0) {
735 error_report("failed to get the data, %s", strerror(errno));
736 goto out;
737 }
738 break;
739 case AIOCB_FLUSH_CACHE:
740 if (rsp.result == SD_RES_INVALID_PARMS) {
741 dprintf("disable cache since the server doesn't support it\n");
742 s->cache_flags = SD_FLAG_CMD_DIRECT;
743 rsp.result = SD_RES_SUCCESS;
744 }
745 break;
746 }
747
748 if (rsp.result != SD_RES_SUCCESS) {
749 acb->ret = -EIO;
750 error_report("%s", sd_strerror(rsp.result));
751 }
752
753 free_aio_req(s, aio_req);
754 if (!acb->nr_pending) {
755 /*
756 * We've finished all requests which belong to the AIOCB, so
757 * we can switch back to sd_co_readv/writev now.
758 */
759 acb->aio_done_func(acb);
760 }
761out:
762 s->co_recv = NULL;
763}
764
765static void co_read_response(void *opaque)
766{
767 BDRVSheepdogState *s = opaque;
768
769 if (!s->co_recv) {
770 s->co_recv = qemu_coroutine_create(aio_read_response);
771 }
772
773 qemu_coroutine_enter(s->co_recv, opaque);
774}
775
776static void co_write_request(void *opaque)
777{
778 BDRVSheepdogState *s = opaque;
779
780 qemu_coroutine_enter(s->co_send, NULL);
781}
782
783static int aio_flush_request(void *opaque)
784{
785 BDRVSheepdogState *s = opaque;
786
787 return !QLIST_EMPTY(&s->inflight_aio_head) ||
788 !QLIST_EMPTY(&s->pending_aio_head);
789}
790
791/*
792 * Return a socket discriptor to read/write objects.
793 *
794 * We cannot use this discriptor for other operations because
795 * the block driver may be on waiting response from the server.
796 */
797static int get_sheep_fd(BDRVSheepdogState *s)
798{
799 int ret, fd;
800
801 fd = connect_to_sdog(s->addr, s->port);
802 if (fd < 0) {
803 error_report("%s", strerror(errno));
804 return fd;
805 }
806
807 socket_set_nonblock(fd);
808
809 ret = socket_set_nodelay(fd);
810 if (ret) {
811 error_report("%s", strerror(errno));
812 closesocket(fd);
813 return -errno;
814 }
815
816 qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s);
817 return fd;
818}
819
820static int sd_parse_uri(BDRVSheepdogState *s, const char *filename,
821 char *vdi, uint32_t *snapid, char *tag)
822{
823 URI *uri;
824 QueryParams *qp = NULL;
825 int ret = 0;
826
827 uri = uri_parse(filename);
828 if (!uri) {
829 return -EINVAL;
830 }
831
832 if (uri->path == NULL || !strcmp(uri->path, "/")) {
833 ret = -EINVAL;
834 goto out;
835 }
836 pstrcpy(vdi, SD_MAX_VDI_LEN, uri->path + 1);
837
838 /* sheepdog[+tcp]://[host:port]/vdiname */
839 s->addr = g_strdup(uri->server ?: SD_DEFAULT_ADDR);
840 if (uri->port) {
841 s->port = g_strdup_printf("%d", uri->port);
842 } else {
843 s->port = g_strdup(SD_DEFAULT_PORT);
844 }
845
846 /* snapshot tag */
847 if (uri->fragment) {
848 *snapid = strtoul(uri->fragment, NULL, 10);
849 if (*snapid == 0) {
850 pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri->fragment);
851 }
852 } else {
853 *snapid = CURRENT_VDI_ID; /* search current vdi */
854 }
855
856out:
857 if (qp) {
858 query_params_free(qp);
859 }
860 uri_free(uri);
861 return ret;
862}
863
864/*
865 * Parse a filename (old syntax)
866 *
867 * filename must be one of the following formats:
868 * 1. [vdiname]
869 * 2. [vdiname]:[snapid]
870 * 3. [vdiname]:[tag]
871 * 4. [hostname]:[port]:[vdiname]
872 * 5. [hostname]:[port]:[vdiname]:[snapid]
873 * 6. [hostname]:[port]:[vdiname]:[tag]
874 *
875 * You can boot from the snapshot images by specifying `snapid` or
876 * `tag'.
877 *
878 * You can run VMs outside the Sheepdog cluster by specifying
879 * `hostname' and `port' (experimental).
880 */
881static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
882 char *vdi, uint32_t *snapid, char *tag)
883{
884 char *p, *q, *uri;
885 const char *host_spec, *vdi_spec;
886 int nr_sep, ret;
887
888 strstart(filename, "sheepdog:", (const char **)&filename);
889 p = q = g_strdup(filename);
890
891 /* count the number of separators */
892 nr_sep = 0;
893 while (*p) {
894 if (*p == ':') {
895 nr_sep++;
896 }
897 p++;
898 }
899 p = q;
900
901 /* use the first two tokens as host_spec. */
902 if (nr_sep >= 2) {
903 host_spec = p;
904 p = strchr(p, ':');
905 p++;
906 p = strchr(p, ':');
907 *p++ = '\0';
908 } else {
909 host_spec = "";
910 }
911
912 vdi_spec = p;
913
914 p = strchr(vdi_spec, ':');
915 if (p) {
916 *p++ = '#';
917 }
918
919 uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);
920
921 ret = sd_parse_uri(s, uri, vdi, snapid, tag);
922
923 g_free(q);
924 g_free(uri);
925
926 return ret;
927}
928
929static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
930 char *tag, uint32_t *vid, int for_snapshot)
931{
932 int ret, fd;
933 SheepdogVdiReq hdr;
934 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
935 unsigned int wlen, rlen = 0;
936 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
937
938 fd = connect_to_sdog(s->addr, s->port);
939 if (fd < 0) {
940 return fd;
941 }
942
943 /* This pair of strncpy calls ensures that the buffer is zero-filled,
944 * which is desirable since we'll soon be sending those bytes, and
945 * don't want the send_req to read uninitialized data.
946 */
947 strncpy(buf, filename, SD_MAX_VDI_LEN);
948 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
949
950 memset(&hdr, 0, sizeof(hdr));
951 if (for_snapshot) {
952 hdr.opcode = SD_OP_GET_VDI_INFO;
953 } else {
954 hdr.opcode = SD_OP_LOCK_VDI;
955 }
956 wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
957 hdr.proto_ver = SD_PROTO_VER;
958 hdr.data_length = wlen;
959 hdr.snapid = snapid;
960 hdr.flags = SD_FLAG_CMD_WRITE;
961
962 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
963 if (ret) {
964 goto out;
965 }
966
967 if (rsp->result != SD_RES_SUCCESS) {
968 error_report("cannot get vdi info, %s, %s %d %s",
969 sd_strerror(rsp->result), filename, snapid, tag);
970 if (rsp->result == SD_RES_NO_VDI) {
971 ret = -ENOENT;
972 } else {
973 ret = -EIO;
974 }
975 goto out;
976 }
977 *vid = rsp->vdi_id;
978
979 ret = 0;
980out:
981 closesocket(fd);
982 return ret;
983}
984
985static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
986 struct iovec *iov, int niov, bool create,
987 enum AIOCBState aiocb_type)
988{
989 int nr_copies = s->inode.nr_copies;
990 SheepdogObjReq hdr;
991 unsigned int wlen = 0;
992 int ret;
993 uint64_t oid = aio_req->oid;
994 unsigned int datalen = aio_req->data_len;
995 uint64_t offset = aio_req->offset;
996 uint8_t flags = aio_req->flags;
997 uint64_t old_oid = aio_req->base_oid;
998
999 if (!nr_copies) {
1000 error_report("bug");
1001 }
1002
1003 memset(&hdr, 0, sizeof(hdr));
1004
1005 switch (aiocb_type) {
1006 case AIOCB_FLUSH_CACHE:
1007 hdr.opcode = SD_OP_FLUSH_VDI;
1008 break;
1009 case AIOCB_READ_UDATA:
1010 hdr.opcode = SD_OP_READ_OBJ;
1011 hdr.flags = flags;
1012 break;
1013 case AIOCB_WRITE_UDATA:
1014 if (create) {
1015 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1016 } else {
1017 hdr.opcode = SD_OP_WRITE_OBJ;
1018 }
1019 wlen = datalen;
1020 hdr.flags = SD_FLAG_CMD_WRITE | flags;
1021 break;
1022 }
1023
1024 if (s->cache_flags) {
1025 hdr.flags |= s->cache_flags;
1026 }
1027
1028 hdr.oid = oid;
1029 hdr.cow_oid = old_oid;
1030 hdr.copies = s->inode.nr_copies;
1031
1032 hdr.data_length = datalen;
1033 hdr.offset = offset;
1034
1035 hdr.id = aio_req->id;
1036
1037 qemu_co_mutex_lock(&s->lock);
1038 s->co_send = qemu_coroutine_self();
1039 qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request,
1040 aio_flush_request, s);
1041 socket_set_cork(s->fd, 1);
1042
1043 /* send a header */
1044 ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
1045 if (ret < 0) {
1046 qemu_co_mutex_unlock(&s->lock);
1047 error_report("failed to send a req, %s", strerror(errno));
1048 return -errno;
1049 }
1050
1051 if (wlen) {
1052 ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen);
1053 if (ret < 0) {
1054 qemu_co_mutex_unlock(&s->lock);
1055 error_report("failed to send a data, %s", strerror(errno));
1056 return -errno;
1057 }
1058 }
1059
1060 socket_set_cork(s->fd, 0);
1061 qemu_aio_set_fd_handler(s->fd, co_read_response, NULL,
1062 aio_flush_request, s);
1063 qemu_co_mutex_unlock(&s->lock);
1064
1065 return 0;
1066}
1067
1068static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
1069 unsigned int datalen, uint64_t offset,
1070 bool write, bool create, uint32_t cache_flags)
1071{
1072 SheepdogObjReq hdr;
1073 SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1074 unsigned int wlen, rlen;
1075 int ret;
1076
1077 memset(&hdr, 0, sizeof(hdr));
1078
1079 if (write) {
1080 wlen = datalen;
1081 rlen = 0;
1082 hdr.flags = SD_FLAG_CMD_WRITE;
1083 if (create) {
1084 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1085 } else {
1086 hdr.opcode = SD_OP_WRITE_OBJ;
1087 }
1088 } else {
1089 wlen = 0;
1090 rlen = datalen;
1091 hdr.opcode = SD_OP_READ_OBJ;
1092 }
1093
1094 hdr.flags |= cache_flags;
1095
1096 hdr.oid = oid;
1097 hdr.data_length = datalen;
1098 hdr.offset = offset;
1099 hdr.copies = copies;
1100
1101 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1102 if (ret) {
1103 error_report("failed to send a request to the sheep");
1104 return ret;
1105 }
1106
1107 switch (rsp->result) {
1108 case SD_RES_SUCCESS:
1109 return 0;
1110 default:
1111 error_report("%s", sd_strerror(rsp->result));
1112 return -EIO;
1113 }
1114}
1115
/*
 * Read `datalen' bytes at `offset' of object `oid' into `buf'.
 * Returns what read_write_object() returns.
 */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset,
                       uint32_t cache_flags)
{
    const bool is_write = false;
    const bool is_create = false;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, is_create, cache_flags);
}
1123
/*
 * Write `datalen' bytes from `buf' at `offset' of object `oid',
 * creating the object when `create' is set.
 * Returns what read_write_object() returns.
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, bool create,
                        uint32_t cache_flags)
{
    const bool is_write = true;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create, cache_flags);
}
1131
1132static int sd_open(BlockDriverState *bs, const char *filename, int flags)
1133{
1134 int ret, fd;
1135 uint32_t vid = 0;
1136 BDRVSheepdogState *s = bs->opaque;
1137 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1138 uint32_t snapid;
1139 char *buf = NULL;
1140
1141 QLIST_INIT(&s->inflight_aio_head);
1142 QLIST_INIT(&s->pending_aio_head);
1143 s->fd = -1;
1144
1145 memset(vdi, 0, sizeof(vdi));
1146 memset(tag, 0, sizeof(tag));
1147
1148 if (strstr(filename, "://")) {
1149 ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
1150 } else {
1151 ret = parse_vdiname(s, filename, vdi, &snapid, tag);
1152 }
1153 if (ret < 0) {
1154 goto out;
1155 }
1156 s->fd = get_sheep_fd(s);
1157 if (s->fd < 0) {
1158 ret = s->fd;
1159 goto out;
1160 }
1161
1162 ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
1163 if (ret) {
1164 goto out;
1165 }
1166
1167 /*
1168 * QEMU block layer emulates writethrough cache as 'writeback + flush', so
1169 * we always set SD_FLAG_CMD_CACHE (writeback cache) as default.
1170 */
1171 s->cache_flags = SD_FLAG_CMD_CACHE;
1172 if (flags & BDRV_O_NOCACHE) {
1173 s->cache_flags = SD_FLAG_CMD_DIRECT;
1174 }
1175
1176 if (snapid || tag[0] != '\0') {
1177 dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
1178 s->is_snapshot = true;
1179 }
1180
1181 fd = connect_to_sdog(s->addr, s->port);
1182 if (fd < 0) {
1183 error_report("failed to connect");
1184 ret = fd;
1185 goto out;
1186 }
1187
1188 buf = g_malloc(SD_INODE_SIZE);
1189 ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0,
1190 s->cache_flags);
1191
1192 closesocket(fd);
1193
1194 if (ret) {
1195 goto out;
1196 }
1197
1198 memcpy(&s->inode, buf, sizeof(s->inode));
1199 s->min_dirty_data_idx = UINT32_MAX;
1200 s->max_dirty_data_idx = 0;
1201
1202 bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
1203 pstrcpy(s->name, sizeof(s->name), vdi);
1204 qemu_co_mutex_init(&s->lock);
1205 g_free(buf);
1206 return 0;
1207out:
1208 qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL);
1209 if (s->fd >= 0) {
1210 closesocket(s->fd);
1211 }
1212 g_free(buf);
1213 return ret;
1214}
1215
1216static int do_sd_create(char *filename, int64_t vdi_size,
1217 uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1218 const char *addr, const char *port)
1219{
1220 SheepdogVdiReq hdr;
1221 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1222 int fd, ret;
1223 unsigned int wlen, rlen = 0;
1224 char buf[SD_MAX_VDI_LEN];
1225
1226 fd = connect_to_sdog(addr, port);
1227 if (fd < 0) {
1228 return fd;
1229 }
1230
1231 /* FIXME: would it be better to fail (e.g., return -EIO) when filename
1232 * does not fit in buf? For now, just truncate and avoid buffer overrun.
1233 */
1234 memset(buf, 0, sizeof(buf));
1235 pstrcpy(buf, sizeof(buf), filename);
1236
1237 memset(&hdr, 0, sizeof(hdr));
1238 hdr.opcode = SD_OP_NEW_VDI;
1239 hdr.vdi_id = base_vid;
1240
1241 wlen = SD_MAX_VDI_LEN;
1242
1243 hdr.flags = SD_FLAG_CMD_WRITE;
1244 hdr.snapid = snapshot;
1245
1246 hdr.data_length = wlen;
1247 hdr.vdi_size = vdi_size;
1248
1249 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1250
1251 closesocket(fd);
1252
1253 if (ret) {
1254 return ret;
1255 }
1256
1257 if (rsp->result != SD_RES_SUCCESS) {
1258 error_report("%s, %s", sd_strerror(rsp->result), filename);
1259 return -EIO;
1260 }
1261
1262 if (vdi_id) {
1263 *vdi_id = rsp->vdi_id;
1264 }
1265
1266 return 0;
1267}
1268
1269static int sd_prealloc(const char *filename)
1270{
1271 BlockDriverState *bs = NULL;
1272 uint32_t idx, max_idx;
1273 int64_t vdi_size;
1274 void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
1275 int ret;
1276
1277 ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
1278 if (ret < 0) {
1279 goto out;
1280 }
1281
1282 vdi_size = bdrv_getlength(bs);
1283 if (vdi_size < 0) {
1284 ret = vdi_size;
1285 goto out;
1286 }
1287 max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
1288
1289 for (idx = 0; idx < max_idx; idx++) {
1290 /*
1291 * The created image can be a cloned image, so we need to read
1292 * a data from the source image.
1293 */
1294 ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
1295 if (ret < 0) {
1296 goto out;
1297 }
1298 ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
1299 if (ret < 0) {
1300 goto out;
1301 }
1302 }
1303out:
1304 if (bs) {
1305 bdrv_delete(bs);
1306 }
1307 g_free(buf);
1308
1309 return ret;
1310}
1311
1312static int sd_create(const char *filename, QEMUOptionParameter *options)
1313{
1314 int ret = 0;
1315 uint32_t vid = 0, base_vid = 0;
1316 int64_t vdi_size = 0;
1317 char *backing_file = NULL;
1318 BDRVSheepdogState *s;
1319 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1320 uint32_t snapid;
1321 bool prealloc = false;
1322
1323 s = g_malloc0(sizeof(BDRVSheepdogState));
1324
1325 memset(vdi, 0, sizeof(vdi));
1326 memset(tag, 0, sizeof(tag));
1327 if (strstr(filename, "://")) {
1328 ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
1329 } else {
1330 ret = parse_vdiname(s, filename, vdi, &snapid, tag);
1331 }
1332 if (ret < 0) {
1333 goto out;
1334 }
1335
1336 while (options && options->name) {
1337 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1338 vdi_size = options->value.n;
1339 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1340 backing_file = options->value.s;
1341 } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1342 if (!options->value.s || !strcmp(options->value.s, "off")) {
1343 prealloc = false;
1344 } else if (!strcmp(options->value.s, "full")) {
1345 prealloc = true;
1346 } else {
1347 error_report("Invalid preallocation mode: '%s'",
1348 options->value.s);
1349 ret = -EINVAL;
1350 goto out;
1351 }
1352 }
1353 options++;
1354 }
1355
1356 if (vdi_size > SD_MAX_VDI_SIZE) {
1357 error_report("too big image size");
1358 ret = -EINVAL;
1359 goto out;
1360 }
1361
1362 if (backing_file) {
1363 BlockDriverState *bs;
1364 BDRVSheepdogState *s;
1365 BlockDriver *drv;
1366
1367 /* Currently, only Sheepdog backing image is supported. */
1368 drv = bdrv_find_protocol(backing_file);
1369 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
1370 error_report("backing_file must be a sheepdog image");
1371 ret = -EINVAL;
1372 goto out;
1373 }
1374
1375 ret = bdrv_file_open(&bs, backing_file, 0);
1376 if (ret < 0) {
1377 goto out;
1378 }
1379
1380 s = bs->opaque;
1381
1382 if (!is_snapshot(&s->inode)) {
1383 error_report("cannot clone from a non snapshot vdi");
1384 bdrv_delete(bs);
1385 ret = -EINVAL;
1386 goto out;
1387 }
1388
1389 base_vid = s->inode.vdi_id;
1390 bdrv_delete(bs);
1391 }
1392
1393 ret = do_sd_create(vdi, vdi_size, base_vid, &vid, 0, s->addr, s->port);
1394 if (!prealloc || ret) {
1395 goto out;
1396 }
1397
1398 ret = sd_prealloc(filename);
1399out:
1400 g_free(s);
1401 return ret;
1402}
1403
1404static void sd_close(BlockDriverState *bs)
1405{
1406 BDRVSheepdogState *s = bs->opaque;
1407 SheepdogVdiReq hdr;
1408 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1409 unsigned int wlen, rlen = 0;
1410 int fd, ret;
1411
1412 dprintf("%s\n", s->name);
1413
1414 fd = connect_to_sdog(s->addr, s->port);
1415 if (fd < 0) {
1416 return;
1417 }
1418
1419 memset(&hdr, 0, sizeof(hdr));
1420
1421 hdr.opcode = SD_OP_RELEASE_VDI;
1422 hdr.vdi_id = s->inode.vdi_id;
1423 wlen = strlen(s->name) + 1;
1424 hdr.data_length = wlen;
1425 hdr.flags = SD_FLAG_CMD_WRITE;
1426
1427 ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
1428
1429 closesocket(fd);
1430
1431 if (!ret && rsp->result != SD_RES_SUCCESS &&
1432 rsp->result != SD_RES_VDI_NOT_LOCKED) {
1433 error_report("%s, %s", sd_strerror(rsp->result), s->name);
1434 }
1435
1436 qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL);
1437 closesocket(s->fd);
1438 g_free(s->addr);
1439 g_free(s->port);
1440}
1441
1442static int64_t sd_getlength(BlockDriverState *bs)
1443{
1444 BDRVSheepdogState *s = bs->opaque;
1445
1446 return s->inode.vdi_size;
1447}
1448
1449static int sd_truncate(BlockDriverState *bs, int64_t offset)
1450{
1451 BDRVSheepdogState *s = bs->opaque;
1452 int ret, fd;
1453 unsigned int datalen;
1454
1455 if (offset < s->inode.vdi_size) {
1456 error_report("shrinking is not supported");
1457 return -EINVAL;
1458 } else if (offset > SD_MAX_VDI_SIZE) {
1459 error_report("too big image size");
1460 return -EINVAL;
1461 }
1462
1463 fd = connect_to_sdog(s->addr, s->port);
1464 if (fd < 0) {
1465 return fd;
1466 }
1467
1468 /* we don't need to update entire object */
1469 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1470 s->inode.vdi_size = offset;
1471 ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1472 s->inode.nr_copies, datalen, 0, false, s->cache_flags);
1473 close(fd);
1474
1475 if (ret < 0) {
1476 error_report("failed to update an inode.");
1477 }
1478
1479 return ret;
1480}
1481
1482/*
1483 * This function is called after writing data objects. If we need to
1484 * update metadata, this sends a write request to the vdi object.
1485 * Otherwise, this switches back to sd_co_readv/writev.
1486 */
1487static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
1488{
1489 int ret;
1490 BDRVSheepdogState *s = acb->common.bs->opaque;
1491 struct iovec iov;
1492 AIOReq *aio_req;
1493 uint32_t offset, data_len, mn, mx;
1494
1495 mn = s->min_dirty_data_idx;
1496 mx = s->max_dirty_data_idx;
1497 if (mn <= mx) {
1498 /* we need to update the vdi object. */
1499 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1500 mn * sizeof(s->inode.data_vdi_id[0]);
1501 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1502
1503 s->min_dirty_data_idx = UINT32_MAX;
1504 s->max_dirty_data_idx = 0;
1505
1506 iov.iov_base = &s->inode;
1507 iov.iov_len = sizeof(s->inode);
1508 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1509 data_len, offset, 0, 0, offset);
1510 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
1511 ret = add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA);
1512 if (ret) {
1513 free_aio_req(s, aio_req);
1514 acb->ret = -EIO;
1515 goto out;
1516 }
1517
1518 acb->aio_done_func = sd_finish_aiocb;
1519 acb->aiocb_type = AIOCB_WRITE_UDATA;
1520 return;
1521 }
1522out:
1523 sd_finish_aiocb(acb);
1524}
1525
1526/*
1527 * Create a writable VDI from a snapshot
1528 */
1529static int sd_create_branch(BDRVSheepdogState *s)
1530{
1531 int ret, fd;
1532 uint32_t vid;
1533 char *buf;
1534
1535 dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
1536
1537 buf = g_malloc(SD_INODE_SIZE);
1538
1539 ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
1540 s->addr, s->port);
1541 if (ret) {
1542 goto out;
1543 }
1544
1545 dprintf("%" PRIx32 " is created.\n", vid);
1546
1547 fd = connect_to_sdog(s->addr, s->port);
1548 if (fd < 0) {
1549 error_report("failed to connect");
1550 ret = fd;
1551 goto out;
1552 }
1553
1554 ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1555 SD_INODE_SIZE, 0, s->cache_flags);
1556
1557 closesocket(fd);
1558
1559 if (ret < 0) {
1560 goto out;
1561 }
1562
1563 memcpy(&s->inode, buf, sizeof(s->inode));
1564
1565 s->is_snapshot = false;
1566 ret = 0;
1567 dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
1568
1569out:
1570 g_free(buf);
1571
1572 return ret;
1573}
1574
1575/*
1576 * Send I/O requests to the server.
1577 *
1578 * This function sends requests to the server, links the requests to
1579 * the inflight_list in BDRVSheepdogState, and exits without
1580 * waiting the response. The responses are received in the
1581 * `aio_read_response' function which is called from the main loop as
1582 * a fd handler.
1583 *
1584 * Returns 1 when we need to wait a response, 0 when there is no sent
1585 * request and -errno in error cases.
1586 */
1587static int coroutine_fn sd_co_rw_vector(void *p)
1588{
1589 SheepdogAIOCB *acb = p;
1590 int ret = 0;
1591 unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
1592 unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
1593 uint64_t oid;
1594 uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
1595 BDRVSheepdogState *s = acb->common.bs->opaque;
1596 SheepdogInode *inode = &s->inode;
1597 AIOReq *aio_req;
1598
1599 if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
1600 /*
1601 * In the case we open the snapshot VDI, Sheepdog creates the
1602 * writable VDI when we do a write operation first.
1603 */
1604 ret = sd_create_branch(s);
1605 if (ret) {
1606 acb->ret = -EIO;
1607 goto out;
1608 }
1609 }
1610
1611 /*
1612 * Make sure we don't free the aiocb before we are done with all requests.
1613 * This additional reference is dropped at the end of this function.
1614 */
1615 acb->nr_pending++;
1616
1617 while (done != total) {
1618 uint8_t flags = 0;
1619 uint64_t old_oid = 0;
1620 bool create = false;
1621
1622 oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
1623
1624 len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
1625
1626 switch (acb->aiocb_type) {
1627 case AIOCB_READ_UDATA:
1628 if (!inode->data_vdi_id[idx]) {
1629 qemu_iovec_memset(acb->qiov, done, 0, len);
1630 goto done;
1631 }
1632 break;
1633 case AIOCB_WRITE_UDATA:
1634 if (!inode->data_vdi_id[idx]) {
1635 create = true;
1636 } else if (!is_data_obj_writable(inode, idx)) {
1637 /* Copy-On-Write */
1638 create = true;
1639 old_oid = oid;
1640 flags = SD_FLAG_CMD_COW;
1641 }
1642 break;
1643 default:
1644 break;
1645 }
1646
1647 if (create) {
1648 dprintf("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
1649 inode->vdi_id, oid,
1650 vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
1651 oid = vid_to_data_oid(inode->vdi_id, idx);
1652 dprintf("new oid %" PRIx64 "\n", oid);
1653 }
1654
1655 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
1656
1657 if (create) {
1658 AIOReq *areq;
1659 QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
1660 if (areq->oid == oid) {
1661 /*
1662 * Sheepdog cannot handle simultaneous create
1663 * requests to the same object. So we cannot send
1664 * the request until the previous request
1665 * finishes.
1666 */
1667 aio_req->flags = 0;
1668 aio_req->base_oid = 0;
1669 QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req,
1670 aio_siblings);
1671 goto done;
1672 }
1673 }
1674 }
1675
1676 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
1677 ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1678 create, acb->aiocb_type);
1679 if (ret < 0) {
1680 error_report("add_aio_request is failed");
1681 free_aio_req(s, aio_req);
1682 acb->ret = -EIO;
1683 goto out;
1684 }
1685 done:
1686 offset = 0;
1687 idx++;
1688 done += len;
1689 }
1690out:
1691 if (!--acb->nr_pending) {
1692 return acb->ret;
1693 }
1694 return 1;
1695}
1696
1697static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
1698 int nb_sectors, QEMUIOVector *qiov)
1699{
1700 SheepdogAIOCB *acb;
1701 int ret;
1702
1703 if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
1704 ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE);
1705 if (ret < 0) {
1706 return ret;
1707 }
1708 bs->total_sectors = sector_num + nb_sectors;
1709 }
1710
1711 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
1712 acb->aio_done_func = sd_write_done;
1713 acb->aiocb_type = AIOCB_WRITE_UDATA;
1714
1715 ret = sd_co_rw_vector(acb);
1716 if (ret <= 0) {
1717 qemu_aio_release(acb);
1718 return ret;
1719 }
1720
1721 qemu_coroutine_yield();
1722
1723 return acb->ret;
1724}
1725
1726static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
1727 int nb_sectors, QEMUIOVector *qiov)
1728{
1729 SheepdogAIOCB *acb;
1730 int ret;
1731
1732 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
1733 acb->aiocb_type = AIOCB_READ_UDATA;
1734 acb->aio_done_func = sd_finish_aiocb;
1735
1736 ret = sd_co_rw_vector(acb);
1737 if (ret <= 0) {
1738 qemu_aio_release(acb);
1739 return ret;
1740 }
1741
1742 qemu_coroutine_yield();
1743
1744 return acb->ret;
1745}
1746
1747static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
1748{
1749 BDRVSheepdogState *s = bs->opaque;
1750 SheepdogAIOCB *acb;
1751 AIOReq *aio_req;
1752 int ret;
1753
1754 if (s->cache_flags != SD_FLAG_CMD_CACHE) {
1755 return 0;
1756 }
1757
1758 acb = sd_aio_setup(bs, NULL, 0, 0);
1759 acb->aiocb_type = AIOCB_FLUSH_CACHE;
1760 acb->aio_done_func = sd_finish_aiocb;
1761
1762 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1763 0, 0, 0, 0, 0);
1764 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
1765 ret = add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type);
1766 if (ret < 0) {
1767 error_report("add_aio_request is failed");
1768 free_aio_req(s, aio_req);
1769 qemu_aio_release(acb);
1770 return ret;
1771 }
1772
1773 qemu_coroutine_yield();
1774 return acb->ret;
1775}
1776
1777static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
1778{
1779 BDRVSheepdogState *s = bs->opaque;
1780 int ret, fd;
1781 uint32_t new_vid;
1782 SheepdogInode *inode;
1783 unsigned int datalen;
1784
1785 dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
1786 "is_snapshot %d\n", sn_info->name, sn_info->id_str,
1787 s->name, sn_info->vm_state_size, s->is_snapshot);
1788
1789 if (s->is_snapshot) {
1790 error_report("You can't create a snapshot of a snapshot VDI, "
1791 "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);
1792
1793 return -EINVAL;
1794 }
1795
1796 dprintf("%s %s\n", sn_info->name, sn_info->id_str);
1797
1798 s->inode.vm_state_size = sn_info->vm_state_size;
1799 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
1800 /* It appears that inode.tag does not require a NUL terminator,
1801 * which means this use of strncpy is ok.
1802 */
1803 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
1804 /* we don't need to update entire object */
1805 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1806
1807 /* refresh inode. */
1808 fd = connect_to_sdog(s->addr, s->port);
1809 if (fd < 0) {
1810 ret = fd;
1811 goto cleanup;
1812 }
1813
1814 ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1815 s->inode.nr_copies, datalen, 0, false, s->cache_flags);
1816 if (ret < 0) {
1817 error_report("failed to write snapshot's inode.");
1818 goto cleanup;
1819 }
1820
1821 ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
1822 s->addr, s->port);
1823 if (ret < 0) {
1824 error_report("failed to create inode for snapshot. %s",
1825 strerror(errno));
1826 goto cleanup;
1827 }
1828
1829 inode = (SheepdogInode *)g_malloc(datalen);
1830
1831 ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
1832 s->inode.nr_copies, datalen, 0, s->cache_flags);
1833
1834 if (ret < 0) {
1835 error_report("failed to read new inode info. %s", strerror(errno));
1836 goto cleanup;
1837 }
1838
1839 memcpy(&s->inode, inode, datalen);
1840 dprintf("s->inode: name %s snap_id %x oid %x\n",
1841 s->inode.name, s->inode.snap_id, s->inode.vdi_id);
1842
1843cleanup:
1844 closesocket(fd);
1845 return ret;
1846}
1847
1848static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
1849{
1850 BDRVSheepdogState *s = bs->opaque;
1851 BDRVSheepdogState *old_s;
1852 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1853 char *buf = NULL;
1854 uint32_t vid;
1855 uint32_t snapid = 0;
1856 int ret = 0, fd;
1857
1858 old_s = g_malloc(sizeof(BDRVSheepdogState));
1859
1860 memcpy(old_s, s, sizeof(BDRVSheepdogState));
1861
1862 pstrcpy(vdi, sizeof(vdi), s->name);
1863
1864 snapid = strtoul(snapshot_id, NULL, 10);
1865 if (snapid) {
1866 tag[0] = 0;
1867 } else {
1868 pstrcpy(tag, sizeof(tag), s->name);
1869 }
1870
1871 ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
1872 if (ret) {
1873 error_report("Failed to find_vdi_name");
1874 goto out;
1875 }
1876
1877 fd = connect_to_sdog(s->addr, s->port);
1878 if (fd < 0) {
1879 error_report("failed to connect");
1880 ret = fd;
1881 goto out;
1882 }
1883
1884 buf = g_malloc(SD_INODE_SIZE);
1885 ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1886 SD_INODE_SIZE, 0, s->cache_flags);
1887
1888 closesocket(fd);
1889
1890 if (ret) {
1891 goto out;
1892 }
1893
1894 memcpy(&s->inode, buf, sizeof(s->inode));
1895
1896 if (!s->inode.vm_state_size) {
1897 error_report("Invalid snapshot");
1898 ret = -ENOENT;
1899 goto out;
1900 }
1901
1902 s->is_snapshot = true;
1903
1904 g_free(buf);
1905 g_free(old_s);
1906
1907 return 0;
1908out:
1909 /* recover bdrv_sd_state */
1910 memcpy(s, old_s, sizeof(BDRVSheepdogState));
1911 g_free(buf);
1912 g_free(old_s);
1913
1914 error_report("failed to open. recover old bdrv_sd_state.");
1915
1916 return ret;
1917}
1918
1919static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1920{
1921 /* FIXME: Delete specified snapshot id. */
1922 return 0;
1923}
1924
1925static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
1926{
1927 BDRVSheepdogState *s = bs->opaque;
1928 SheepdogReq req;
1929 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
1930 QEMUSnapshotInfo *sn_tab = NULL;
1931 unsigned wlen, rlen;
1932 int found = 0;
1933 static SheepdogInode inode;
1934 unsigned long *vdi_inuse;
1935 unsigned int start_nr;
1936 uint64_t hval;
1937 uint32_t vid;
1938
1939 vdi_inuse = g_malloc(max);
1940
1941 fd = connect_to_sdog(s->addr, s->port);
1942 if (fd < 0) {
1943 ret = fd;
1944 goto out;
1945 }
1946
1947 rlen = max;
1948 wlen = 0;
1949
1950 memset(&req, 0, sizeof(req));
1951
1952 req.opcode = SD_OP_READ_VDIS;
1953 req.data_length = max;
1954
1955 ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen);
1956
1957 closesocket(fd);
1958 if (ret) {
1959 goto out;
1960 }
1961
1962 sn_tab = g_malloc0(nr * sizeof(*sn_tab));
1963
1964 /* calculate a vdi id with hash function */
1965 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
1966 start_nr = hval & (SD_NR_VDIS - 1);
1967
1968 fd = connect_to_sdog(s->addr, s->port);
1969 if (fd < 0) {
1970 error_report("failed to connect");
1971 ret = fd;
1972 goto out;
1973 }
1974
1975 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
1976 if (!test_bit(vid, vdi_inuse)) {
1977 break;
1978 }
1979
1980 /* we don't need to read entire object */
1981 ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
1982 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
1983 s->cache_flags);
1984
1985 if (ret) {
1986 continue;
1987 }
1988
1989 if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
1990 sn_tab[found].date_sec = inode.snap_ctime >> 32;
1991 sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
1992 sn_tab[found].vm_state_size = inode.vm_state_size;
1993 sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
1994
1995 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
1996 inode.snap_id);
1997 pstrcpy(sn_tab[found].name,
1998 MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
1999 inode.tag);
2000 found++;
2001 }
2002 }
2003
2004 closesocket(fd);
2005out:
2006 *psn_tab = sn_tab;
2007
2008 g_free(vdi_inuse);
2009
2010 if (ret < 0) {
2011 return ret;
2012 }
2013
2014 return found;
2015}
2016
2017static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
2018 int64_t pos, int size, int load)
2019{
2020 bool create;
2021 int fd, ret = 0, remaining = size;
2022 unsigned int data_len;
2023 uint64_t vmstate_oid;
2024 uint32_t vdi_index;
2025 uint64_t offset;
2026
2027 fd = connect_to_sdog(s->addr, s->port);
2028 if (fd < 0) {
2029 return fd;
2030 }
2031
2032 while (remaining) {
2033 vdi_index = pos / SD_DATA_OBJ_SIZE;
2034 offset = pos % SD_DATA_OBJ_SIZE;
2035
2036 data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
2037
2038 vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
2039
2040 create = (offset == 0);
2041 if (load) {
2042 ret = read_object(fd, (char *)data, vmstate_oid,
2043 s->inode.nr_copies, data_len, offset,
2044 s->cache_flags);
2045 } else {
2046 ret = write_object(fd, (char *)data, vmstate_oid,
2047 s->inode.nr_copies, data_len, offset, create,
2048 s->cache_flags);
2049 }
2050
2051 if (ret < 0) {
2052 error_report("failed to save vmstate %s", strerror(errno));
2053 goto cleanup;
2054 }
2055
2056 pos += data_len;
2057 data += data_len;
2058 remaining -= data_len;
2059 }
2060 ret = size;
2061cleanup:
2062 closesocket(fd);
2063 return ret;
2064}
2065
2066static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
2067 int64_t pos, int size)
2068{
2069 BDRVSheepdogState *s = bs->opaque;
2070
2071 return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
2072}
2073
2074static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
2075 int64_t pos, int size)
2076{
2077 BDRVSheepdogState *s = bs->opaque;
2078
2079 return do_load_save_vmstate(s, data, pos, size, 1);
2080}
2081
2082
2083static QEMUOptionParameter sd_create_options[] = {
2084 {
2085 .name = BLOCK_OPT_SIZE,
2086 .type = OPT_SIZE,
2087 .help = "Virtual disk size"
2088 },
2089 {
2090 .name = BLOCK_OPT_BACKING_FILE,
2091 .type = OPT_STRING,
2092 .help = "File name of a base image"
2093 },
2094 {
2095 .name = BLOCK_OPT_PREALLOC,
2096 .type = OPT_STRING,
2097 .help = "Preallocation mode (allowed values: off, full)"
2098 },
2099 { NULL }
2100};
2101
2102static BlockDriver bdrv_sheepdog = {
2103 .format_name = "sheepdog",
2104 .protocol_name = "sheepdog",
2105 .instance_size = sizeof(BDRVSheepdogState),
2106 .bdrv_file_open = sd_open,
2107 .bdrv_close = sd_close,
2108 .bdrv_create = sd_create,
2109 .bdrv_getlength = sd_getlength,
2110 .bdrv_truncate = sd_truncate,
2111
2112 .bdrv_co_readv = sd_co_readv,
2113 .bdrv_co_writev = sd_co_writev,
2114 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
2115
2116 .bdrv_snapshot_create = sd_snapshot_create,
2117 .bdrv_snapshot_goto = sd_snapshot_goto,
2118 .bdrv_snapshot_delete = sd_snapshot_delete,
2119 .bdrv_snapshot_list = sd_snapshot_list,
2120
2121 .bdrv_save_vmstate = sd_save_vmstate,
2122 .bdrv_load_vmstate = sd_load_vmstate,
2123
2124 .create_options = sd_create_options,
2125};
2126
2127static BlockDriver bdrv_sheepdog_tcp = {
2128 .format_name = "sheepdog",
2129 .protocol_name = "sheepdog+tcp",
2130 .instance_size = sizeof(BDRVSheepdogState),
2131 .bdrv_file_open = sd_open,
2132 .bdrv_close = sd_close,
2133 .bdrv_create = sd_create,
2134 .bdrv_getlength = sd_getlength,
2135 .bdrv_truncate = sd_truncate,
2136
2137 .bdrv_co_readv = sd_co_readv,
2138 .bdrv_co_writev = sd_co_writev,
2139 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
2140
2141 .bdrv_snapshot_create = sd_snapshot_create,
2142 .bdrv_snapshot_goto = sd_snapshot_goto,
2143 .bdrv_snapshot_delete = sd_snapshot_delete,
2144 .bdrv_snapshot_list = sd_snapshot_list,
2145
2146 .bdrv_save_vmstate = sd_save_vmstate,
2147 .bdrv_load_vmstate = sd_load_vmstate,
2148
2149 .create_options = sd_create_options,
2150};
2151
2152static void bdrv_sheepdog_init(void)
2153{
2154 bdrv_register(&bdrv_sheepdog);
2155 bdrv_register(&bdrv_sheepdog_tcp);
2156}
2157block_init(bdrv_sheepdog_init);