]> git.proxmox.com Git - mirror_qemu.git/blame - block/sheepdog.c
block: add sheepdog driver for distributed storage support
[mirror_qemu.git] / block / sheepdog.c
CommitLineData
33b1db1c
MK
1/*
2 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License version
6 * 2 as published by the Free Software Foundation.
7 *
8 * You should have received a copy of the GNU General Public License
9 * along with this program. If not, see <http://www.gnu.org/licenses/>.
10 */
11#ifdef _WIN32
12#include <windows.h>
13#include <winsock2.h>
14#include <ws2tcpip.h>
15#else
16#include <netdb.h>
17#include <netinet/tcp.h>
18
19#define closesocket(s) close(s)
20#endif
21
22#include "qemu-common.h"
23#include "qemu-error.h"
24#include "qemu_socket.h"
25#include "block_int.h"
26
/* Protocol version stored in the proto_ver field of a request header. */
#define SD_PROTO_VER 0x01

/* Server address and port used when the filename names no host. */
#define SD_DEFAULT_ADDR "localhost"
#define SD_DEFAULT_PORT "7000"

/* Opcodes of requests that operate on objects. */
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ       0x02
#define SD_OP_WRITE_OBJ      0x03

/* Opcodes of requests that operate on vdis. */
#define SD_OP_NEW_VDI        0x11
#define SD_OP_LOCK_VDI       0x12
#define SD_OP_RELEASE_VDI    0x13
#define SD_OP_GET_VDI_INFO   0x14
#define SD_OP_READ_VDIS      0x15

/* Request flags.  SD_FLAG_CMD_WRITE is set on requests that send data. */
#define SD_FLAG_CMD_WRITE    0x01
#define SD_FLAG_CMD_COW      0x02 /* presumably copy-on-write; not used in the code visible here */
44
45#define SD_RES_SUCCESS 0x00 /* Success */
46#define SD_RES_UNKNOWN 0x01 /* Unknown error */
47#define SD_RES_NO_OBJ 0x02 /* No object found */
48#define SD_RES_EIO 0x03 /* I/O error */
49#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
50#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
51#define SD_RES_SYSTEM_ERROR 0x06 /* System error */
52#define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */
53#define SD_RES_NO_VDI 0x08 /* No vdi found */
54#define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */
55#define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */
56#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */
57#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
58#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */
59#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */
60#define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */
61#define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */
62#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */
63#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */
64#define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */
65#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */
66#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */
67#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */
68#define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */
69#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
70
71/*
72 * Object ID rules
73 *
74 * 0 - 19 (20 bits): data object space
75 * 20 - 31 (12 bits): reserved data object space
76 * 32 - 55 (24 bits): vdi object space
77 * 56 - 59 ( 4 bits): reserved vdi object space
78 * 60 - 63 ( 4 bits): object type identifier space
79 */
80
/* Bit position of the vdi id inside a 64 bit object id. */
#define VDI_SPACE_SHIFT   32
/* Set in the object id of a vdi (inode) object. */
#define VDI_BIT (UINT64_C(1) << 63)
/* Set in the object id of a VM state object. */
#define VMSTATE_BIT (UINT64_C(1) << 62)
/* Number of data object slots per vdi (20 bit index). */
#define MAX_DATA_OBJS (UINT64_C(1) << 20)
/* Length of SheepdogInode.child_vdi_id. */
#define MAX_CHILDREN 1024
/* Size of the fixed-width vdi name and tag fields. */
#define SD_MAX_VDI_LEN 256
#define SD_MAX_VDI_TAG_LEN 256
#define SD_NR_VDIS   (1U << 24)
/* Size in bytes of one data object (4 MiB). */
#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
/* Largest vdi: every data object slot in use (4 TiB). */
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
#define SECTOR_SIZE 512

#define SD_INODE_SIZE (sizeof(SheepdogInode))
/* snapid value that selects the current vdi rather than a snapshot. */
#define CURRENT_VDI_ID 0
95
/*
 * Generic request header.  The typed request structures are cast to
 * this type when they are handed to do_req().
 */
typedef struct SheepdogReq {
    uint8_t proto_ver;              /* SD_PROTO_VER */
    uint8_t opcode;                 /* one of SD_OP_* */
    uint16_t flags;                 /* SD_FLAG_CMD_* bits */
    uint32_t epoch;
    uint32_t id;                    /* request id */
    uint32_t data_length;           /* length of the data that goes with the header */
    uint32_t opcode_specific[8];    /* layout depends on the opcode */
} SheepdogReq;
105
/*
 * Generic response header.  do_req() reads the response into the same
 * buffer that held the request header.
 */
typedef struct SheepdogRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;           /* length of the data that follows the header */
    uint32_t result;                /* one of SD_RES_* */
    uint32_t opcode_specific[7];    /* layout depends on the opcode */
} SheepdogRsp;
116
/* Request header for the object opcodes (create-and-write, read, write). */
typedef struct SheepdogObjReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t oid;           /* object the request operates on */
    uint64_t cow_oid;       /* filled from AIOReq.base_oid */
    uint32_t copies;        /* filled from the inode's nr_copies */
    uint32_t rsvd;
    uint64_t offset;        /* offset inside the object */
} SheepdogObjReq;
130
/* Response header for the object opcodes. */
typedef struct SheepdogObjRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;            /* matched against AIOReq.id in aio_read_response() */
    uint32_t data_length;   /* length of the data that follows (read responses) */
    uint32_t result;        /* one of SD_RES_* */
    uint32_t copies;
    uint32_t pad[6];
} SheepdogObjRsp;
142
/* Request header for the vdi opcodes (new, lock, get info, ...). */
typedef struct SheepdogVdiReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t vdi_size;      /* size of the vdi to create */
    uint32_t base_vdi_id;   /* vdi id of the base image, set by do_sd_create() */
    uint32_t copies;
    uint32_t snapid;        /* snapshot id; CURRENT_VDI_ID for the current vdi */
    uint32_t pad[3];
} SheepdogVdiReq;
156
/* Response header for the vdi opcodes. */
typedef struct SheepdogVdiRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;        /* one of SD_RES_* */
    uint32_t rsvd;
    uint32_t vdi_id;        /* id of the vdi that was found or created */
    uint32_t pad[5];
} SheepdogVdiRsp;
169
/*
 * Contents of a vdi object (the metadata of one image).  sd_open()
 * reads it from the object vid_to_vdi_oid(vid) into the driver state.
 */
typedef struct SheepdogInode {
    char name[SD_MAX_VDI_LEN];
    char tag[SD_MAX_VDI_TAG_LEN];
    uint64_t ctime;
    uint64_t snap_ctime;        /* non-zero means the vdi is a snapshot */
    uint64_t vm_clock_nsec;
    uint64_t vdi_size;          /* image size in bytes */
    uint64_t vm_state_size;
    uint16_t copy_policy;
    uint8_t nr_copies;          /* sent as `copies' in object requests */
    uint8_t block_size_shift;
    uint32_t snap_id;
    uint32_t vdi_id;
    uint32_t parent_vdi_id;
    uint32_t child_vdi_id[MAX_CHILDREN];
    /* per data object index: id of the vdi that owns that object */
    uint32_t data_vdi_id[MAX_DATA_OBJS];
} SheepdogInode;
187
188/*
189 * 64 bit FNV-1a non-zero initial basis
190 */
191#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
192
/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Folds the `len' bytes at `buf' into the running hash `hval' and
 * returns the new hash value.  Pass FNV1A_64_INIT as `hval' to start a
 * fresh hash, or a previous return value to continue hashing a stream.
 * The buffer is only read, so it is taken as a pointer to const.
 */
static inline uint64_t fnv_64a_buf(const void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bp = buf;
    const unsigned char *be = bp + len;

    while (bp < be) {
        /* xor the low byte of the hash with the current octet */
        hval ^= (uint64_t) *bp++;
        /* multiply by the 64 bit FNV prime 0x100000001b3 using shifts */
        hval += (hval << 1) + (hval << 4) + (hval << 5) +
                (hval << 7) + (hval << 8) + (hval << 40);
    }
    return hval;
}
207
208static inline int is_data_obj_writeable(SheepdogInode *inode, unsigned int idx)
209{
210 return inode->vdi_id == inode->data_vdi_id[idx];
211}
212
213static inline int is_data_obj(uint64_t oid)
214{
215 return !(VDI_BIT & oid);
216}
217
218static inline uint64_t data_oid_to_idx(uint64_t oid)
219{
220 return oid & (MAX_DATA_OBJS - 1);
221}
222
223static inline uint64_t vid_to_vdi_oid(uint32_t vid)
224{
225 return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
226}
227
228static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
229{
230 return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
231}
232
233static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
234{
235 return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
236}
237
238static inline int is_snapshot(struct SheepdogInode *inode)
239{
240 return !!inode->snap_ctime;
241}
242
/*
 * Debug trace macro.  When DEBUG_SDOG is defined it prints the calling
 * function and line number followed by the formatted message to stdout;
 * otherwise it expands to nothing.  Any earlier definition of dprintf
 * is dropped first.
 */
#undef dprintf
#ifdef DEBUG_SDOG
#define dprintf(fmt, args...)                                       \
    do {                                                            \
        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
    } while (0)
#else
#define dprintf(fmt, args...)
#endif
252
253typedef struct SheepdogAIOCB SheepdogAIOCB;
254
/*
 * One request sent to the server on behalf of a SheepdogAIOCB.  It is
 * linked both into the driver-wide outstanding list and into the list
 * of its owning AIOCB (see alloc_aio_req()).
 */
typedef struct AIOReq {
    SheepdogAIOCB *aiocb;       /* AIOCB this request belongs to */
    unsigned int iov_offset;    /* offset of this request's data in the AIOCB's qiov */

    uint64_t oid;               /* object the request operates on */
    uint64_t base_oid;          /* sent as cow_oid in the request header */
    uint64_t offset;            /* offset inside the object */
    unsigned int data_len;      /* number of bytes to transfer */
    uint8_t flags;              /* extra SD_FLAG_CMD_* bits for the header */
    uint32_t id;                /* sequence number used to match the response */

    QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
    QLIST_ENTRY(AIOReq) aioreq_siblings;
} AIOReq;
269
/* Kind of I/O an AIOCB performs; selects the opcode in add_aio_request(). */
enum AIOCBState {
    AIOCB_WRITE_UDATA,  /* write request */
    AIOCB_READ_UDATA,   /* read request */
};
274
/*
 * Driver state of one asynchronous read or write.  The I/O is split
 * into AIOReqs, and the completion callback runs once all of them are
 * finished.
 */
struct SheepdogAIOCB {
    BlockDriverAIOCB common;    /* must stay first: sd_aio_cancel() casts to it */

    QEMUIOVector *qiov;         /* caller's data buffers */

    int64_t sector_num;
    int nb_sectors;

    int ret;                    /* result passed to the callback: 0 or -EIO */
    enum AIOCBState aiocb_type;

    QEMUBH *bh;                 /* bottom half created by sd_schedule_bh() */
    void (*aio_done_func)(SheepdogAIOCB *);     /* called when the last AIOReq is freed */

    int canceled;               /* set by sd_aio_cancel(); suppresses the callback */

    QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;    /* requests of this AIOCB */
};
293
/* Per-image state of the Sheepdog block driver. */
typedef struct BDRVSheepdogState {
    SheepdogInode inode;            /* copy of the vdi object read in sd_open() */

    /* range of data_vdi_id indexes changed since sd_open() */
    uint32_t min_dirty_data_idx;
    uint32_t max_dirty_data_idx;

    char name[SD_MAX_VDI_LEN];      /* vdi name parsed from the filename */
    int is_snapshot;                /* set when a snapshot id or tag was given */

    char *addr;                     /* server host, or NULL for the default */
    char *port;                     /* server port; points into the addr allocation */
    int fd;                         /* socket used for asynchronous object I/O */

    uint32_t aioreq_seq_num;        /* source of AIOReq ids */
    QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
} BDRVSheepdogState;
310
311static const char * sd_strerror(int err)
312{
313 int i;
314
315 static const struct {
316 int err;
317 const char *desc;
318 } errors[] = {
319 {SD_RES_SUCCESS, "Success"},
320 {SD_RES_UNKNOWN, "Unknown error"},
321 {SD_RES_NO_OBJ, "No object found"},
322 {SD_RES_EIO, "I/O error"},
323 {SD_RES_VDI_EXIST, "VDI exists already"},
324 {SD_RES_INVALID_PARMS, "Invalid parameters"},
325 {SD_RES_SYSTEM_ERROR, "System error"},
326 {SD_RES_VDI_LOCKED, "VDI is already locked"},
327 {SD_RES_NO_VDI, "No vdi found"},
328 {SD_RES_NO_BASE_VDI, "No base VDI found"},
329 {SD_RES_VDI_READ, "Failed read the requested VDI"},
330 {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
331 {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
332 {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
333 {SD_RES_NO_TAG, "Failed to find the requested tag"},
334 {SD_RES_STARTUP, "The system is still booting"},
335 {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
336 {SD_RES_SHUTDOWN, "The system is shutting down"},
337 {SD_RES_NO_MEM, "Out of memory on the server"},
338 {SD_RES_FULL_VDI, "We already have the maximum vdis"},
339 {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
340 {SD_RES_NO_SPACE, "Server has no space for new objects"},
341 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
342 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
343 {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
344 };
345
346 for (i = 0; i < ARRAY_SIZE(errors); ++i) {
347 if (errors[i].err == err) {
348 return errors[i].desc;
349 }
350 }
351
352 return "Invalid error code";
353}
354
355/*
356 * Sheepdog I/O handling:
357 *
358 * 1. In the sd_aio_readv/writev, read/write requests are added to the
359 * QEMU Bottom Halves.
360 *
361 * 2. In sd_readv_writev_bh_cb, the callback of the BHs, we send the I/O
362 * requests to the server and link the requests to the
363 * outstanding_aio_head list in the BDRVSheepdogState. We exit the
364 * function without waiting for the response.
365 *
366 * 3. We receive the response in aio_read_response, the fd handler to
367 * the sheepdog connection. If metadata update is needed, we send
368 * the write request to the vdi object in sd_write_done, the write
369 * completion function. The AIOCB callback is not called until all
370 * the requests belonging to the AIOCB are finished.
371 */
372
373static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
374 uint64_t oid, unsigned int data_len,
375 uint64_t offset, uint8_t flags,
376 uint64_t base_oid, unsigned int iov_offset)
377{
378 AIOReq *aio_req;
379
380 aio_req = qemu_malloc(sizeof(*aio_req));
381 aio_req->aiocb = acb;
382 aio_req->iov_offset = iov_offset;
383 aio_req->oid = oid;
384 aio_req->base_oid = base_oid;
385 aio_req->offset = offset;
386 aio_req->data_len = data_len;
387 aio_req->flags = flags;
388 aio_req->id = s->aioreq_seq_num++;
389
390 QLIST_INSERT_HEAD(&s->outstanding_aio_head, aio_req,
391 outstanding_aio_siblings);
392 QLIST_INSERT_HEAD(&acb->aioreq_head, aio_req, aioreq_siblings);
393
394 return aio_req;
395}
396
397static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
398{
399 SheepdogAIOCB *acb = aio_req->aiocb;
400 QLIST_REMOVE(aio_req, outstanding_aio_siblings);
401 QLIST_REMOVE(aio_req, aioreq_siblings);
402 qemu_free(aio_req);
403
404 return !QLIST_EMPTY(&acb->aioreq_head);
405}
406
407static void sd_finish_aiocb(SheepdogAIOCB *acb)
408{
409 if (!acb->canceled) {
410 acb->common.cb(acb->common.opaque, acb->ret);
411 }
412 qemu_aio_release(acb);
413}
414
415static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
416{
417 SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
418
419 /*
420 * Sheepdog cannot cancel the requests which are already sent to
421 * the servers, so we just complete the request with -EIO here.
422 */
423 acb->common.cb(acb->common.opaque, -EIO);
424 acb->canceled = 1;
425}
426
/* Pool from which sd_aio_setup() obtains SheepdogAIOCBs. */
static AIOPool sd_aio_pool = {
    .aiocb_size = sizeof(SheepdogAIOCB),
    .cancel = sd_aio_cancel,
};
431
432static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
433 int64_t sector_num, int nb_sectors,
434 BlockDriverCompletionFunc *cb, void *opaque)
435{
436 SheepdogAIOCB *acb;
437
438 acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
439
440 acb->qiov = qiov;
441
442 acb->sector_num = sector_num;
443 acb->nb_sectors = nb_sectors;
444
445 acb->aio_done_func = NULL;
446 acb->canceled = 0;
447 acb->bh = NULL;
448 acb->ret = 0;
449 QLIST_INIT(&acb->aioreq_head);
450 return acb;
451}
452
453static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
454{
455 if (acb->bh) {
456 error_report("bug: %d %d\n", acb->aiocb_type, acb->aiocb_type);
457 return -EIO;
458 }
459
460 acb->bh = qemu_bh_new(cb, acb);
461 if (!acb->bh) {
462 error_report("oom: %d %d\n", acb->aiocb_type, acb->aiocb_type);
463 return -EIO;
464 }
465
466 qemu_bh_schedule(acb->bh);
467
468 return 0;
469}
470
471#ifdef _WIN32
472
/*
 * Minimal stand-in for the POSIX struct msghdr on Windows; it carries
 * only the two members used by the emulated sendmsg()/recvmsg().
 */
struct msghdr {
    struct iovec *msg_iov;      /* array of buffers */
    size_t msg_iovlen;          /* number of entries in msg_iov */
};
477
478static ssize_t sendmsg(int s, const struct msghdr *msg, int flags)
479{
480 size_t size = 0;
481 char *buf, *p;
482 int i, ret;
483
484 /* count the msg size */
485 for (i = 0; i < msg->msg_iovlen; i++) {
486 size += msg->msg_iov[i].iov_len;
487 }
488 buf = qemu_malloc(size);
489
490 p = buf;
491 for (i = 0; i < msg->msg_iovlen; i++) {
492 memcpy(p, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
493 p += msg->msg_iov[i].iov_len;
494 }
495
496 ret = send(s, buf, size, flags);
497
498 qemu_free(buf);
499 return ret;
500}
501
502static ssize_t recvmsg(int s, struct msghdr *msg, int flags)
503{
504 size_t size = 0;
505 char *buf, *p;
506 int i, ret;
507
508 /* count the msg size */
509 for (i = 0; i < msg->msg_iovlen; i++) {
510 size += msg->msg_iov[i].iov_len;
511 }
512 buf = qemu_malloc(size);
513
514 ret = recv(s, buf, size, flags);
515 if (ret < 0) {
516 goto out;
517 }
518
519 p = buf;
520 for (i = 0; i < msg->msg_iovlen; i++) {
521 memcpy(msg->msg_iov[i].iov_base, p, msg->msg_iov[i].iov_len);
522 p += msg->msg_iov[i].iov_len;
523 }
524out:
525 qemu_free(buf);
526 return ret;
527}
528
529#endif
530
/*
 * Send/recv data with iovec buffers
 *
 * This function sends or receives data directly from/to the iovec
 * buffers.  The first `offset' bytes of the iovec are skipped and the
 * next `len' bytes are used.
 *
 * For example,
 *
 *   do_send_recv(sockfd, iov, len, offset, 1);
 *
 * is equivalent to
 *
 *   char *buf = malloc(len);
 *   iov_to_buf(iov, iovcnt, buf, offset, len);
 *   send(sockfd, buf, len, 0);
 *   free(buf);
 *
 * The iovec entries at both ends of the window are modified while the
 * call is in progress and restored before returning.  The return value
 * is that of sendmsg()/recvmsg().
 */
static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
                        int write)
{
    struct msghdr msg;
    int ret, diff;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;

    /* from here on `len' is the end position of the window */
    len += offset;

    /* advance `iov' to the entry that contains the end of the window */
    while (iov->iov_len < len) {
        len -= iov->iov_len;

        iov++;
        msg.msg_iovlen++;
    }

    /* temporarily cut the last entry at the end of the window */
    diff = iov->iov_len - len;
    iov->iov_len -= diff;

    /* drop the leading entries that lie entirely before `offset' */
    while (msg.msg_iov->iov_len <= offset) {
        offset -= msg.msg_iov->iov_len;

        msg.msg_iov++;
        msg.msg_iovlen--;
    }

    /* temporarily make the first entry start at the window start */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base + offset;
    msg.msg_iov->iov_len -= offset;

    if (write) {
        ret = sendmsg(sockfd, &msg, 0);
    } else {
        ret = recvmsg(sockfd, &msg, 0);
    }

    /* undo the adjustments made to the first and the last entry */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base - offset;
    msg.msg_iov->iov_len += offset;

    iov->iov_len += diff;
    return ret;
}
593
/*
 * Open a TCP connection to the Sheepdog server at `addr':`port'.
 * When `addr' is NULL the default address and port are used.
 *
 * Every address returned by getaddrinfo() is tried in turn; a socket
 * whose connect() fails is closed before the next address is tried.
 *
 * Returns the connected socket, or -1 on failure.
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret, saved_errno;
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /* getaddrinfo() reports failures through its return value */
        error_report("unable to get address info %s, %s\n",
                     addr, gai_strerror(ret));
        return -1;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            continue;
        }

        /* retry the connect when it is interrupted by a signal */
        do {
            ret = connect(fd, res->ai_addr, res->ai_addrlen);
        } while (ret < 0 && errno == EINTR);

        if (ret < 0) {
            /* do not leak the socket; keep errno for the caller's message */
            saved_errno = errno;
            closesocket(fd);
            errno = saved_errno;
            continue;
        }

        dprintf("connected to %s:%s\n", addr, port);
        goto success;
    }
    fd = -1;
    error_report("failed connect to %s:%s\n", addr, port);
success:
    freeaddrinfo(res0);
    return fd;
}
645
/*
 * Transfer exactly `len' bytes between the socket and the iovec,
 * starting `iov_offset' bytes into the iovec.  `write' selects sending
 * (non-zero) or receiving (zero).
 *
 * Short transfers are continued, and EINTR/EAGAIN are retried.  A
 * transfer of 0 bytes while data is still expected means the peer has
 * closed the connection; it is reported as an error rather than
 * retried forever.  A request for 0 bytes succeeds without touching
 * the socket or the iovec.
 *
 * Returns 0 on success and 1 on failure.
 */
static int do_readv_writev(int sockfd, struct iovec *iov, int len,
                           int iov_offset, int write)
{
    int ret;

    while (len > 0) {
        ret = do_send_recv(sockfd, iov, len, iov_offset, write);
        if (ret < 0) {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("failed to recv a rsp, %s\n", strerror(errno));
            return 1;
        }
        if (ret == 0) {
            /* give the callers' strerror(errno) something meaningful */
            errno = EIO;
            error_report("connection closed by the server\n");
            return 1;
        }

        iov_offset += ret;
        len -= ret;
    }

    return 0;
}
668
/* Receive `len' bytes into the iovec at `iov_offset'; 0 on success. */
static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 0;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
673
/* Send `len' bytes from the iovec at `iov_offset'; 0 on success. */
static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 1;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
678
/*
 * Transfer `len' bytes between the socket and the flat buffer `buf' by
 * wrapping the buffer in a single-entry iovec.  Returns 0 on success.
 */
static int do_read_write(int sockfd, void *buf, int len, int write)
{
    struct iovec one = {
        .iov_base = buf,
        .iov_len = len,
    };

    return do_readv_writev(sockfd, &one, len, 0, write);
}
688
/* Receive exactly `len' bytes into `buf'; 0 on success. */
static int do_read(int sockfd, void *buf, int len)
{
    const int is_write = 0;

    return do_read_write(sockfd, buf, len, is_write);
}
693
/* Send exactly `len' bytes from `buf'; 0 on success. */
static int do_write(int sockfd, void *buf, int len)
{
    const int is_write = 1;

    return do_read_write(sockfd, buf, len, is_write);
}
698
699static int send_req(int sockfd, SheepdogReq *hdr, void *data,
700 unsigned int *wlen)
701{
702 int ret;
703 struct iovec iov[2];
704
705 iov[0].iov_base = hdr;
706 iov[0].iov_len = sizeof(*hdr);
707
708 if (*wlen) {
709 iov[1].iov_base = data;
710 iov[1].iov_len = *wlen;
711 }
712
713 ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen, 0);
714 if (ret) {
715 error_report("failed to send a req, %s\n", strerror(errno));
716 ret = -1;
717 }
718
719 return ret;
720}
721
722static int do_req(int sockfd, SheepdogReq *hdr, void *data,
723 unsigned int *wlen, unsigned int *rlen)
724{
725 int ret;
726
727 ret = send_req(sockfd, hdr, data, wlen);
728 if (ret) {
729 ret = -1;
730 goto out;
731 }
732
733 ret = do_read(sockfd, hdr, sizeof(*hdr));
734 if (ret) {
735 error_report("failed to get a rsp, %s\n", strerror(errno));
736 ret = -1;
737 goto out;
738 }
739
740 if (*rlen > hdr->data_length) {
741 *rlen = hdr->data_length;
742 }
743
744 if (*rlen) {
745 ret = do_read(sockfd, data, *rlen);
746 if (ret) {
747 error_report("failed to get the data, %s\n", strerror(errno));
748 ret = -1;
749 goto out;
750 }
751 }
752 ret = 0;
753out:
754 return ret;
755}
756
/* Forward declaration: send_pending_req() below needs it before its definition. */
static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                           struct iovec *iov, int niov, int create,
                           enum AIOCBState aiocb_type);
760
761/*
762 * This function searchs pending requests to the object `oid', and
763 * sends them.
764 */
765static void send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id)
766{
767 AIOReq *aio_req, *next;
768 SheepdogAIOCB *acb;
769 int ret;
770
771 QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head,
772 outstanding_aio_siblings, next) {
773 if (id == aio_req->id) {
774 continue;
775 }
776 if (aio_req->oid != oid) {
777 continue;
778 }
779
780 acb = aio_req->aiocb;
781 ret = add_aio_request(s, aio_req, acb->qiov->iov,
782 acb->qiov->niov, 0, acb->aiocb_type);
783 if (ret < 0) {
784 error_report("add_aio_request is failed\n");
785 free_aio_req(s, aio_req);
786 if (QLIST_EMPTY(&acb->aioreq_head)) {
787 sd_finish_aiocb(acb);
788 }
789 }
790 }
791}
792
/*
 * Receive responses of the I/O requests.
 *
 * This function is registered as a fd handler, and called from the
 * main loop when s->fd is ready for reading responses.
 *
 * Each call reads one response header, looks up the outstanding
 * request with the same id, handles the response according to the type
 * of the owning AIOCB, and frees the request.  When that was the last
 * request of the AIOCB, the AIOCB's aio_done_func is called.
 */
static void aio_read_response(void *opaque)
{
    SheepdogObjRsp rsp;
    BDRVSheepdogState *s = opaque;
    int fd = s->fd;
    int ret;
    AIOReq *aio_req = NULL;
    SheepdogAIOCB *acb;
    int rest;
    unsigned long idx;

    if (QLIST_EMPTY(&s->outstanding_aio_head)) {
        return;
    }

    /* read a header */
    ret = do_read(fd, &rsp, sizeof(rsp));
    if (ret) {
        /*
         * NOTE(review): returning here leaves every outstanding request
         * in place, so their AIOCBs are not completed on this path.
         */
        error_report("failed to get the header, %s\n", strerror(errno));
        return;
    }

    /* find the right aio_req from the outstanding_aio list */
    QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) {
        if (aio_req->id == rsp.id) {
            break;
        }
    }
    if (!aio_req) {
        error_report("cannot find aio_req %x\n", rsp.id);
        return;
    }

    acb = aio_req->aiocb;

    switch (acb->aiocb_type) {
    case AIOCB_WRITE_UDATA:
        /* responses to writes of the vdi object need no bookkeeping */
        if (!is_data_obj(aio_req->oid)) {
            break;
        }
        idx = data_oid_to_idx(aio_req->oid);

        if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
            /*
             * If the object is newly created one, we need to update
             * the vdi object (metadata object). min_dirty_data_idx
             * and max_dirty_data_idx are changed to include updated
             * index between them.
             */
            s->inode.data_vdi_id[idx] = s->inode.vdi_id;
            s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
            s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);

            /*
             * Some requests may be blocked because simultaneous
             * create requests are not allowed, so we search the
             * pending requests here.
             */
            send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id);
        }
        break;
    case AIOCB_READ_UDATA:
        /*
         * The data follows the header; it goes straight into the
         * caller's buffers at this request's offset.
         * NOTE(review): rsp.data_length comes from the server and is
         * not checked against aio_req->data_len before it is used.
         */
        ret = do_readv(fd, acb->qiov->iov, rsp.data_length,
                       aio_req->iov_offset);
        if (ret) {
            error_report("failed to get the data, %s\n", strerror(errno));
            return;
        }
        break;
    }

    /* a failed request makes the whole AIOCB fail */
    if (rsp.result != SD_RES_SUCCESS) {
        acb->ret = -EIO;
        error_report("%s\n", sd_strerror(rsp.result));
    }

    rest = free_aio_req(s, aio_req);
    if (!rest) {
        /*
         * We've finished all requests which belong to the AIOCB, so
         * we can call the callback now.
         */
        acb->aio_done_func(acb);
    }
}
884
885static int aio_flush_request(void *opaque)
886{
887 BDRVSheepdogState *s = opaque;
888
889 return !QLIST_EMPTY(&s->outstanding_aio_head);
890}
891
#if !defined(_WIN32) && defined(TCP_CORK)

/*
 * Switch the TCP_CORK option of socket `fd' on (v != 0) or off
 * (v == 0).  Returns the result of setsockopt().
 *
 * The option level is IPPROTO_TCP, as in set_nodelay(); SOL_TCP is not
 * available on every platform.
 */
static int set_cork(int fd, int v)
{
    return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &v, sizeof(v));
}

#else

/* No TCP_CORK on this platform: corking is a no-op that always succeeds. */
static int set_cork(int fd, int v)
{
    return 0;
}

#endif
907
/*
 * Enable the TCP_NODELAY option on socket `fd'.
 * Returns the result of setsockopt().
 */
static int set_nodelay(int fd)
{
    int enable = 1;

    return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&enable,
                      sizeof(enable));
}
916
917/*
918 * Return a socket discriptor to read/write objects.
919 *
920 * We cannot use this discriptor for other operations because
921 * the block driver may be on waiting response from the server.
922 */
923static int get_sheep_fd(BDRVSheepdogState *s)
924{
925 int ret, fd;
926
927 fd = connect_to_sdog(s->addr, s->port);
928 if (fd < 0) {
929 error_report("%s\n", strerror(errno));
930 return -1;
931 }
932
933 socket_set_nonblock(fd);
934
935 ret = set_nodelay(fd);
936 if (ret) {
937 error_report("%s\n", strerror(errno));
938 closesocket(fd);
939 return -1;
940 }
941
942 qemu_aio_set_fd_handler(fd, aio_read_response, NULL, aio_flush_request,
943 NULL, s);
944 return fd;
945}
946
947/*
948 * Parse a filename
949 *
950 * filename must be one of the following formats:
951 * 1. [vdiname]
952 * 2. [vdiname]:[snapid]
953 * 3. [vdiname]:[tag]
954 * 4. [hostname]:[port]:[vdiname]
955 * 5. [hostname]:[port]:[vdiname]:[snapid]
956 * 6. [hostname]:[port]:[vdiname]:[tag]
957 *
958 * You can boot from the snapshot images by specifying `snapid` or
959 * `tag'.
960 *
961 * You can run VMs outside the Sheepdog cluster by specifying
962 * `hostname' and `port' (experimental).
963 */
964static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
965 char *vdi, uint32_t *snapid, char *tag)
966{
967 char *p, *q;
968 int nr_sep;
969
970 p = q = qemu_strdup(filename);
971
972 /* count the number of separators */
973 nr_sep = 0;
974 while (*p) {
975 if (*p == ':') {
976 nr_sep++;
977 }
978 p++;
979 }
980 p = q;
981
982 /* use the first two tokens as hostname and port number. */
983 if (nr_sep >= 2) {
984 s->addr = p;
985 p = strchr(p, ':');
986 *p++ = '\0';
987
988 s->port = p;
989 p = strchr(p, ':');
990 *p++ = '\0';
991 } else {
992 s->addr = NULL;
993 s->port = 0;
994 }
995
996 strncpy(vdi, p, SD_MAX_VDI_LEN);
997
998 p = strchr(vdi, ':');
999 if (p) {
1000 *p++ = '\0';
1001 *snapid = strtoul(p, NULL, 10);
1002 if (*snapid == 0) {
1003 strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
1004 }
1005 } else {
1006 *snapid = CURRENT_VDI_ID; /* search current vdi */
1007 }
1008
1009 if (s->addr == NULL) {
1010 qemu_free(q);
1011 }
1012
1013 return 0;
1014}
1015
1016static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
1017 char *tag, uint32_t *vid, int for_snapshot)
1018{
1019 int ret, fd;
1020 SheepdogVdiReq hdr;
1021 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1022 unsigned int wlen, rlen = 0;
1023 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
1024
1025 fd = connect_to_sdog(s->addr, s->port);
1026 if (fd < 0) {
1027 return -1;
1028 }
1029
1030 memset(buf, 0, sizeof(buf));
1031 strncpy(buf, filename, SD_MAX_VDI_LEN);
1032 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
1033
1034 memset(&hdr, 0, sizeof(hdr));
1035 if (for_snapshot) {
1036 hdr.opcode = SD_OP_GET_VDI_INFO;
1037 } else {
1038 hdr.opcode = SD_OP_LOCK_VDI;
1039 }
1040 wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
1041 hdr.proto_ver = SD_PROTO_VER;
1042 hdr.data_length = wlen;
1043 hdr.snapid = snapid;
1044 hdr.flags = SD_FLAG_CMD_WRITE;
1045
1046 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1047 if (ret) {
1048 ret = -1;
1049 goto out;
1050 }
1051
1052 if (rsp->result != SD_RES_SUCCESS) {
1053 error_report("cannot get vdi info, %s, %s %d %s\n",
1054 sd_strerror(rsp->result), filename, snapid, tag);
1055 ret = -1;
1056 goto out;
1057 }
1058 *vid = rsp->vdi_id;
1059
1060 ret = 0;
1061out:
1062 closesocket(fd);
1063 return ret;
1064}
1065
1066static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
1067 struct iovec *iov, int niov, int create,
1068 enum AIOCBState aiocb_type)
1069{
1070 int nr_copies = s->inode.nr_copies;
1071 SheepdogObjReq hdr;
1072 unsigned int wlen;
1073 int ret;
1074 uint64_t oid = aio_req->oid;
1075 unsigned int datalen = aio_req->data_len;
1076 uint64_t offset = aio_req->offset;
1077 uint8_t flags = aio_req->flags;
1078 uint64_t old_oid = aio_req->base_oid;
1079
1080 if (!nr_copies) {
1081 error_report("bug\n");
1082 }
1083
1084 memset(&hdr, 0, sizeof(hdr));
1085
1086 if (aiocb_type == AIOCB_READ_UDATA) {
1087 wlen = 0;
1088 hdr.opcode = SD_OP_READ_OBJ;
1089 hdr.flags = flags;
1090 } else if (create) {
1091 wlen = datalen;
1092 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1093 hdr.flags = SD_FLAG_CMD_WRITE | flags;
1094 } else {
1095 wlen = datalen;
1096 hdr.opcode = SD_OP_WRITE_OBJ;
1097 hdr.flags = SD_FLAG_CMD_WRITE | flags;
1098 }
1099
1100 hdr.oid = oid;
1101 hdr.cow_oid = old_oid;
1102 hdr.copies = s->inode.nr_copies;
1103
1104 hdr.data_length = datalen;
1105 hdr.offset = offset;
1106
1107 hdr.id = aio_req->id;
1108
1109 set_cork(s->fd, 1);
1110
1111 /* send a header */
1112 ret = do_write(s->fd, &hdr, sizeof(hdr));
1113 if (ret) {
1114 error_report("failed to send a req, %s\n", strerror(errno));
1115 return -EIO;
1116 }
1117
1118 if (wlen) {
1119 ret = do_writev(s->fd, iov, wlen, aio_req->iov_offset);
1120 if (ret) {
1121 error_report("failed to send a data, %s\n", strerror(errno));
1122 return -EIO;
1123 }
1124 }
1125
1126 set_cork(s->fd, 0);
1127
1128 return 0;
1129}
1130
1131static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
1132 unsigned int datalen, uint64_t offset,
1133 int write, int create)
1134{
1135 SheepdogObjReq hdr;
1136 SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1137 unsigned int wlen, rlen;
1138 int ret;
1139
1140 memset(&hdr, 0, sizeof(hdr));
1141
1142 if (write) {
1143 wlen = datalen;
1144 rlen = 0;
1145 hdr.flags = SD_FLAG_CMD_WRITE;
1146 if (create) {
1147 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1148 } else {
1149 hdr.opcode = SD_OP_WRITE_OBJ;
1150 }
1151 } else {
1152 wlen = 0;
1153 rlen = datalen;
1154 hdr.opcode = SD_OP_READ_OBJ;
1155 }
1156 hdr.oid = oid;
1157 hdr.data_length = datalen;
1158 hdr.offset = offset;
1159 hdr.copies = copies;
1160
1161 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1162 if (ret) {
1163 error_report("failed to send a request to the sheep\n");
1164 return -1;
1165 }
1166
1167 switch (rsp->result) {
1168 case SD_RES_SUCCESS:
1169 return 0;
1170 default:
1171 error_report("%s\n", sd_strerror(rsp->result));
1172 return -1;
1173 }
1174}
1175
/* Synchronously read `datalen' bytes at `offset' of object `oid' into `buf'. */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset)
{
    const int is_write = 0, is_create = 0;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, is_create);
}
1181
/*
 * Synchronously write `datalen' bytes from `buf' at `offset' of object
 * `oid'; the object is created first when `create' is non-zero.
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, int create)
{
    const int is_write = 1;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create);
}
1187
1188static int sd_open(BlockDriverState *bs, const char *filename, int flags)
1189{
1190 int ret, fd;
1191 uint32_t vid = 0;
1192 BDRVSheepdogState *s = bs->opaque;
1193 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1194 uint32_t snapid;
1195 char *buf = NULL;
1196
1197 strstart(filename, "sheepdog:", (const char **)&filename);
1198
1199 QLIST_INIT(&s->outstanding_aio_head);
1200 s->fd = -1;
1201
1202 memset(vdi, 0, sizeof(vdi));
1203 memset(tag, 0, sizeof(tag));
1204 if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
1205 goto out;
1206 }
1207 s->fd = get_sheep_fd(s);
1208 if (s->fd < 0) {
1209 goto out;
1210 }
1211
1212 ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
1213 if (ret) {
1214 goto out;
1215 }
1216
1217 if (snapid) {
1218 dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
1219 s->is_snapshot = 1;
1220 }
1221
1222 fd = connect_to_sdog(s->addr, s->port);
1223 if (fd < 0) {
1224 error_report("failed to connect\n");
1225 goto out;
1226 }
1227
1228 buf = qemu_malloc(SD_INODE_SIZE);
1229 ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);
1230
1231 closesocket(fd);
1232
1233 if (ret) {
1234 goto out;
1235 }
1236
1237 memcpy(&s->inode, buf, sizeof(s->inode));
1238 s->min_dirty_data_idx = UINT32_MAX;
1239 s->max_dirty_data_idx = 0;
1240
1241 bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
1242 strncpy(s->name, vdi, sizeof(s->name));
1243 qemu_free(buf);
1244 return 0;
1245out:
1246 qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1247 if (s->fd >= 0) {
1248 closesocket(s->fd);
1249 }
1250 qemu_free(buf);
1251 return -1;
1252}
1253
1254static int do_sd_create(char *filename, int64_t vdi_size,
1255 uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1256 const char *addr, const char *port)
1257{
1258 SheepdogVdiReq hdr;
1259 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1260 int fd, ret;
1261 unsigned int wlen, rlen = 0;
1262 char buf[SD_MAX_VDI_LEN];
1263
1264 fd = connect_to_sdog(addr, port);
1265 if (fd < 0) {
1266 return -EIO;
1267 }
1268
1269 memset(buf, 0, sizeof(buf));
1270 strncpy(buf, filename, SD_MAX_VDI_LEN);
1271
1272 memset(&hdr, 0, sizeof(hdr));
1273 hdr.opcode = SD_OP_NEW_VDI;
1274 hdr.base_vdi_id = base_vid;
1275
1276 wlen = SD_MAX_VDI_LEN;
1277
1278 hdr.flags = SD_FLAG_CMD_WRITE;
1279 hdr.snapid = snapshot;
1280
1281 hdr.data_length = wlen;
1282 hdr.vdi_size = vdi_size;
1283
1284 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1285
1286 closesocket(fd);
1287
1288 if (ret) {
1289 return -EIO;
1290 }
1291
1292 if (rsp->result != SD_RES_SUCCESS) {
1293 error_report("%s, %s\n", sd_strerror(rsp->result), filename);
1294 return -EIO;
1295 }
1296
1297 if (vdi_id) {
1298 *vdi_id = rsp->vdi_id;
1299 }
1300
1301 return 0;
1302}
1303
1304static int sd_create(const char *filename, QEMUOptionParameter *options)
1305{
1306 int ret;
1307 uint32_t vid = 0;
1308 int64_t vdi_size = 0;
1309 char *backing_file = NULL;
1310
1311 strstart(filename, "sheepdog:", (const char **)&filename);
1312
1313 while (options && options->name) {
1314 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1315 vdi_size = options->value.n;
1316 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1317 backing_file = options->value.s;
1318 }
1319 options++;
1320 }
1321
1322 if (vdi_size > SD_MAX_VDI_SIZE) {
1323 error_report("too big image size\n");
1324 return -EINVAL;
1325 }
1326
1327 if (backing_file) {
1328 BlockDriverState *bs;
1329 BDRVSheepdogState *s;
1330 BlockDriver *drv;
1331
1332 /* Currently, only Sheepdog backing image is supported. */
1333 drv = bdrv_find_protocol(backing_file);
1334 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
1335 error_report("backing_file must be a sheepdog image\n");
1336 return -EINVAL;
1337 }
1338
1339 ret = bdrv_file_open(&bs, backing_file, 0);
1340 if (ret < 0)
1341 return -EIO;
1342
1343 s = bs->opaque;
1344
1345 if (!is_snapshot(&s->inode)) {
1346 error_report("cannot clone from a non snapshot vdi\n");
1347 bdrv_delete(bs);
1348 return -EINVAL;
1349 }
1350
1351 vid = s->inode.vdi_id;
1352 bdrv_delete(bs);
1353 }
1354
1355 return do_sd_create((char *)filename, vdi_size, vid, NULL, 0, NULL, NULL);
1356}
1357
1358static void sd_close(BlockDriverState *bs)
1359{
1360 BDRVSheepdogState *s = bs->opaque;
1361 SheepdogVdiReq hdr;
1362 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1363 unsigned int wlen, rlen = 0;
1364 int fd, ret;
1365
1366 dprintf("%s\n", s->name);
1367
1368 fd = connect_to_sdog(s->addr, s->port);
1369 if (fd < 0) {
1370 return;
1371 }
1372
1373 memset(&hdr, 0, sizeof(hdr));
1374
1375 hdr.opcode = SD_OP_RELEASE_VDI;
1376 wlen = strlen(s->name) + 1;
1377 hdr.data_length = wlen;
1378 hdr.flags = SD_FLAG_CMD_WRITE;
1379
1380 ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
1381
1382 closesocket(fd);
1383
1384 if (!ret && rsp->result != SD_RES_SUCCESS &&
1385 rsp->result != SD_RES_VDI_NOT_LOCKED) {
1386 error_report("%s, %s\n", sd_strerror(rsp->result), s->name);
1387 }
1388
1389 qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1390 closesocket(s->fd);
1391 qemu_free(s->addr);
1392}
1393
1394static int64_t sd_getlength(BlockDriverState *bs)
1395{
1396 BDRVSheepdogState *s = bs->opaque;
1397
1398 return s->inode.vdi_size;
1399}
1400
1401static int sd_truncate(BlockDriverState *bs, int64_t offset)
1402{
1403 BDRVSheepdogState *s = bs->opaque;
1404 int ret, fd;
1405 unsigned int datalen;
1406
1407 if (offset < s->inode.vdi_size) {
1408 error_report("shrinking is not supported\n");
1409 return -EINVAL;
1410 } else if (offset > SD_MAX_VDI_SIZE) {
1411 error_report("too big image size\n");
1412 return -EINVAL;
1413 }
1414
1415 fd = connect_to_sdog(s->addr, s->port);
1416 if (fd < 0) {
1417 return -EIO;
1418 }
1419
1420 /* we don't need to update entire object */
1421 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1422 s->inode.vdi_size = offset;
1423 ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1424 s->inode.nr_copies, datalen, 0, 0);
1425 close(fd);
1426
1427 if (ret < 0) {
1428 error_report("failed to update an inode.\n");
1429 return -EIO;
1430 }
1431
1432 return 0;
1433}
1434
1435/*
1436 * This function is called after writing data objects. If we need to
1437 * update metadata, this sends a write request to the vdi object.
1438 * Otherwise, this calls the AIOCB callback.
1439 */
1440static void sd_write_done(SheepdogAIOCB *acb)
1441{
1442 int ret;
1443 BDRVSheepdogState *s = acb->common.bs->opaque;
1444 struct iovec iov;
1445 AIOReq *aio_req;
1446 uint32_t offset, data_len, mn, mx;
1447
1448 mn = s->min_dirty_data_idx;
1449 mx = s->max_dirty_data_idx;
1450 if (mn <= mx) {
1451 /* we need to update the vdi object. */
1452 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1453 mn * sizeof(s->inode.data_vdi_id[0]);
1454 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1455
1456 s->min_dirty_data_idx = UINT32_MAX;
1457 s->max_dirty_data_idx = 0;
1458
1459 iov.iov_base = &s->inode;
1460 iov.iov_len = sizeof(s->inode);
1461 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1462 data_len, offset, 0, 0, offset);
1463 ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
1464 if (ret) {
1465 free_aio_req(s, aio_req);
1466 acb->ret = -EIO;
1467 goto out;
1468 }
1469
1470 acb->aio_done_func = sd_finish_aiocb;
1471 acb->aiocb_type = AIOCB_WRITE_UDATA;
1472 return;
1473 }
1474out:
1475 sd_finish_aiocb(acb);
1476}
1477
1478/*
1479 * Create a writable VDI from a snapshot
1480 */
1481static int sd_create_branch(BDRVSheepdogState *s)
1482{
1483 int ret, fd;
1484 uint32_t vid;
1485 char *buf;
1486
1487 dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
1488
1489 buf = qemu_malloc(SD_INODE_SIZE);
1490
1491 ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
1492 s->addr, s->port);
1493 if (ret) {
1494 goto out;
1495 }
1496
1497 dprintf("%" PRIx32 " is created.\n", vid);
1498
1499 fd = connect_to_sdog(s->addr, s->port);
1500 if (fd < 0) {
1501 error_report("failed to connect\n");
1502 goto out;
1503 }
1504
1505 ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1506 SD_INODE_SIZE, 0);
1507
1508 closesocket(fd);
1509
1510 if (ret < 0) {
1511 goto out;
1512 }
1513
1514 memcpy(&s->inode, buf, sizeof(s->inode));
1515
1516 s->is_snapshot = 0;
1517 ret = 0;
1518 dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
1519
1520out:
1521 qemu_free(buf);
1522
1523 return ret;
1524}
1525
1526/*
1527 * Send I/O requests to the server.
1528 *
1529 * This function sends requests to the server, links the requests to
1530 * the outstanding_list in BDRVSheepdogState, and exits without
1531 * waiting the response. The responses are received in the
1532 * `aio_read_response' function which is called from the main loop as
1533 * a fd handler.
1534 */
1535static void sd_readv_writev_bh_cb(void *p)
1536{
1537 SheepdogAIOCB *acb = p;
1538 int ret = 0;
1539 unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
1540 unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
1541 uint64_t oid;
1542 uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
1543 BDRVSheepdogState *s = acb->common.bs->opaque;
1544 SheepdogInode *inode = &s->inode;
1545 AIOReq *aio_req;
1546
1547 qemu_bh_delete(acb->bh);
1548 acb->bh = NULL;
1549
1550 if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
1551 /*
1552 * In the case we open the snapshot VDI, Sheepdog creates the
1553 * writable VDI when we do a write operation first.
1554 */
1555 ret = sd_create_branch(s);
1556 if (ret) {
1557 acb->ret = -EIO;
1558 goto out;
1559 }
1560 }
1561
1562 while (done != total) {
1563 uint8_t flags = 0;
1564 uint64_t old_oid = 0;
1565 int create = 0;
1566
1567 oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
1568
1569 len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
1570
1571 if (!inode->data_vdi_id[idx]) {
1572 if (acb->aiocb_type == AIOCB_READ_UDATA) {
1573 goto done;
1574 }
1575
1576 create = 1;
1577 } else if (acb->aiocb_type == AIOCB_WRITE_UDATA
1578 && !is_data_obj_writeable(inode, idx)) {
1579 /* Copy-On-Write */
1580 create = 1;
1581 old_oid = oid;
1582 flags = SD_FLAG_CMD_COW;
1583 }
1584
1585 if (create) {
1586 dprintf("update ino (%" PRIu32") %" PRIu64 " %" PRIu64
1587 " %" PRIu64 "\n", inode->vdi_id, oid,
1588 vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
1589 oid = vid_to_data_oid(inode->vdi_id, idx);
1590 dprintf("new oid %lx\n", oid);
1591 }
1592
1593 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
1594
1595 if (create) {
1596 AIOReq *areq;
1597 QLIST_FOREACH(areq, &s->outstanding_aio_head,
1598 outstanding_aio_siblings) {
1599 if (areq == aio_req) {
1600 continue;
1601 }
1602 if (areq->oid == oid) {
1603 /*
1604 * Sheepdog cannot handle simultaneous create
1605 * requests to the same object. So we cannot send
1606 * the request until the previous request
1607 * finishes.
1608 */
1609 aio_req->flags = 0;
1610 aio_req->base_oid = 0;
1611 goto done;
1612 }
1613 }
1614 }
1615
1616 ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1617 create, acb->aiocb_type);
1618 if (ret < 0) {
1619 error_report("add_aio_request is failed\n");
1620 free_aio_req(s, aio_req);
1621 acb->ret = -EIO;
1622 goto out;
1623 }
1624 done:
1625 offset = 0;
1626 idx++;
1627 done += len;
1628 }
1629out:
1630 if (QLIST_EMPTY(&acb->aioreq_head)) {
1631 sd_finish_aiocb(acb);
1632 }
1633}
1634
1635static BlockDriverAIOCB *sd_aio_writev(BlockDriverState *bs, int64_t sector_num,
1636 QEMUIOVector *qiov, int nb_sectors,
1637 BlockDriverCompletionFunc *cb,
1638 void *opaque)
1639{
1640 SheepdogAIOCB *acb;
1641
1642 if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
1643 /* TODO: shouldn't block here */
1644 if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) {
1645 return NULL;
1646 }
1647 bs->total_sectors = sector_num + nb_sectors;
1648 }
1649
1650 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1651 acb->aio_done_func = sd_write_done;
1652 acb->aiocb_type = AIOCB_WRITE_UDATA;
1653
1654 sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1655 return &acb->common;
1656}
1657
1658static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num,
1659 QEMUIOVector *qiov, int nb_sectors,
1660 BlockDriverCompletionFunc *cb,
1661 void *opaque)
1662{
1663 SheepdogAIOCB *acb;
1664 int i;
1665
1666 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1667 acb->aiocb_type = AIOCB_READ_UDATA;
1668 acb->aio_done_func = sd_finish_aiocb;
1669
1670 /*
1671 * TODO: we can do better; we don't need to initialize
1672 * blindly.
1673 */
1674 for (i = 0; i < qiov->niov; i++) {
1675 memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len);
1676 }
1677
1678 sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1679 return &acb->common;
1680}
1681
1682static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
1683{
1684 BDRVSheepdogState *s = bs->opaque;
1685 int ret, fd;
1686 uint32_t new_vid;
1687 SheepdogInode *inode;
1688 unsigned int datalen;
1689
1690 dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d "
1691 "is_snapshot %d\n", sn_info->name, sn_info->id_str,
1692 s->name, sn_info->vm_state_size, s->is_snapshot);
1693
1694 if (s->is_snapshot) {
1695 error_report("You can't create a snapshot of a snapshot VDI, "
1696 "%s (%" PRIu32 ").\n", s->name, s->inode.vdi_id);
1697
1698 return -EINVAL;
1699 }
1700
1701 dprintf("%s %s\n", sn_info->name, sn_info->id_str);
1702
1703 s->inode.vm_state_size = sn_info->vm_state_size;
1704 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
1705 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
1706 /* we don't need to update entire object */
1707 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1708
1709 /* refresh inode. */
1710 fd = connect_to_sdog(s->addr, s->port);
1711 if (fd < 0) {
1712 ret = -EIO;
1713 goto cleanup;
1714 }
1715
1716 ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1717 s->inode.nr_copies, datalen, 0, 0);
1718 if (ret < 0) {
1719 error_report("failed to write snapshot's inode.\n");
1720 ret = -EIO;
1721 goto cleanup;
1722 }
1723
1724 ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
1725 s->addr, s->port);
1726 if (ret < 0) {
1727 error_report("failed to create inode for snapshot. %s\n",
1728 strerror(errno));
1729 ret = -EIO;
1730 goto cleanup;
1731 }
1732
1733 inode = (SheepdogInode *)qemu_malloc(datalen);
1734
1735 ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
1736 s->inode.nr_copies, datalen, 0);
1737
1738 if (ret < 0) {
1739 error_report("failed to read new inode info. %s\n", strerror(errno));
1740 ret = -EIO;
1741 goto cleanup;
1742 }
1743
1744 memcpy(&s->inode, inode, datalen);
1745 dprintf("s->inode: name %s snap_id %x oid %x\n",
1746 s->inode.name, s->inode.snap_id, s->inode.vdi_id);
1747
1748cleanup:
1749 closesocket(fd);
1750 return ret;
1751}
1752
1753static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
1754{
1755 BDRVSheepdogState *s = bs->opaque;
1756 BDRVSheepdogState *old_s;
1757 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1758 char *buf = NULL;
1759 uint32_t vid;
1760 uint32_t snapid = 0;
1761 int ret = -ENOENT, fd;
1762
1763 old_s = qemu_malloc(sizeof(BDRVSheepdogState));
1764
1765 memcpy(old_s, s, sizeof(BDRVSheepdogState));
1766
1767 memset(vdi, 0, sizeof(vdi));
1768 strncpy(vdi, s->name, sizeof(vdi));
1769
1770 memset(tag, 0, sizeof(tag));
1771 snapid = strtoul(snapshot_id, NULL, 10);
1772 if (!snapid) {
1773 strncpy(tag, s->name, sizeof(tag));
1774 }
1775
1776 ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
1777 if (ret) {
1778 error_report("Failed to find_vdi_name\n");
1779 ret = -ENOENT;
1780 goto out;
1781 }
1782
1783 fd = connect_to_sdog(s->addr, s->port);
1784 if (fd < 0) {
1785 error_report("failed to connect\n");
1786 goto out;
1787 }
1788
1789 buf = qemu_malloc(SD_INODE_SIZE);
1790 ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1791 SD_INODE_SIZE, 0);
1792
1793 closesocket(fd);
1794
1795 if (ret) {
1796 ret = -ENOENT;
1797 goto out;
1798 }
1799
1800 memcpy(&s->inode, buf, sizeof(s->inode));
1801
1802 if (!s->inode.vm_state_size) {
1803 error_report("Invalid snapshot\n");
1804 ret = -ENOENT;
1805 goto out;
1806 }
1807
1808 s->is_snapshot = 1;
1809
1810 qemu_free(buf);
1811 qemu_free(old_s);
1812
1813 return 0;
1814out:
1815 /* recover bdrv_sd_state */
1816 memcpy(s, old_s, sizeof(BDRVSheepdogState));
1817 qemu_free(buf);
1818 qemu_free(old_s);
1819
1820 error_report("failed to open. recover old bdrv_sd_state.\n");
1821
1822 return ret;
1823}
1824
1825static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1826{
1827 /* FIXME: Delete specified snapshot id. */
1828 return 0;
1829}
1830
/* Helpers for bitmaps stored as arrays of unsigned long. */
#define BITS_PER_BYTE 8
#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))

/* Integer division of n by d, rounded up. */
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
/* Number of longs needed to hold `nr' bits. */
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
#define DECLARE_BITMAP(name,bits) \
    unsigned long name[BITS_TO_LONGS(bits)]

/* Return 1 if bit `nr' is set in the bitmap at `addr', 0 otherwise. */
static inline int test_bit(unsigned int nr, const unsigned long *addr)
{
    const unsigned long word = addr[nr / BITS_PER_LONG];
    const unsigned long mask = 1UL << (nr % BITS_PER_LONG);

    return (word & mask) != 0;
}
1844
1845static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
1846{
1847 BDRVSheepdogState *s = bs->opaque;
1848 SheepdogReq req;
1849 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
1850 QEMUSnapshotInfo *sn_tab = NULL;
1851 unsigned wlen, rlen;
1852 int found = 0;
1853 static SheepdogInode inode;
1854 unsigned long *vdi_inuse;
1855 unsigned int start_nr;
1856 uint64_t hval;
1857 uint32_t vid;
1858
1859 vdi_inuse = qemu_malloc(max);
1860
1861 fd = connect_to_sdog(s->addr, s->port);
1862 if (fd < 0) {
1863 goto out;
1864 }
1865
1866 rlen = max;
1867 wlen = 0;
1868
1869 memset(&req, 0, sizeof(req));
1870
1871 req.opcode = SD_OP_READ_VDIS;
1872 req.data_length = max;
1873
1874 ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen);
1875
1876 closesocket(fd);
1877 if (ret) {
1878 goto out;
1879 }
1880
1881 sn_tab = qemu_mallocz(nr * sizeof(*sn_tab));
1882
1883 /* calculate a vdi id with hash function */
1884 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
1885 start_nr = hval & (SD_NR_VDIS - 1);
1886
1887 fd = connect_to_sdog(s->addr, s->port);
1888 if (fd < 0) {
1889 error_report("failed to connect\n");
1890 goto out;
1891 }
1892
1893 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
1894 if (!test_bit(vid, vdi_inuse)) {
1895 break;
1896 }
1897
1898 /* we don't need to read entire object */
1899 ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
1900 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0);
1901
1902 if (ret) {
1903 continue;
1904 }
1905
1906 if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
1907 sn_tab[found].date_sec = inode.snap_ctime >> 32;
1908 sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
1909 sn_tab[found].vm_state_size = inode.vm_state_size;
1910 sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
1911
1912 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
1913 inode.snap_id);
1914 strncpy(sn_tab[found].name, inode.tag,
1915 MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
1916 found++;
1917 }
1918 }
1919
1920 closesocket(fd);
1921out:
1922 *psn_tab = sn_tab;
1923
1924 qemu_free(vdi_inuse);
1925
1926 return found;
1927}
1928
1929static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
1930 int64_t pos, int size, int load)
1931{
1932 int fd, create;
1933 int ret = 0;
1934 unsigned int data_len;
1935 uint64_t vmstate_oid;
1936 uint32_t vdi_index;
1937 uint64_t offset;
1938
1939 fd = connect_to_sdog(s->addr, s->port);
1940 if (fd < 0) {
1941 ret = -EIO;
1942 goto cleanup;
1943 }
1944
1945 while (size) {
1946 vdi_index = pos / SD_DATA_OBJ_SIZE;
1947 offset = pos % SD_DATA_OBJ_SIZE;
1948
1949 data_len = MIN(size, SD_DATA_OBJ_SIZE);
1950
1951 vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
1952
1953 create = (offset == 0);
1954 if (load) {
1955 ret = read_object(fd, (char *)data, vmstate_oid,
1956 s->inode.nr_copies, data_len, offset);
1957 } else {
1958 ret = write_object(fd, (char *)data, vmstate_oid,
1959 s->inode.nr_copies, data_len, offset, create);
1960 }
1961
1962 if (ret < 0) {
1963 error_report("failed to save vmstate %s\n", strerror(errno));
1964 ret = -EIO;
1965 goto cleanup;
1966 }
1967
1968 pos += data_len;
1969 size -= data_len;
1970 ret += data_len;
1971 }
1972cleanup:
1973 closesocket(fd);
1974 return ret;
1975}
1976
1977static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
1978 int64_t pos, int size)
1979{
1980 BDRVSheepdogState *s = bs->opaque;
1981
1982 return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
1983}
1984
1985static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
1986 int64_t pos, int size)
1987{
1988 BDRVSheepdogState *s = bs->opaque;
1989
1990 return do_load_save_vmstate(s, data, pos, size, 1);
1991}
1992
1993
1994static QEMUOptionParameter sd_create_options[] = {
1995 {
1996 .name = BLOCK_OPT_SIZE,
1997 .type = OPT_SIZE,
1998 .help = "Virtual disk size"
1999 },
2000 {
2001 .name = BLOCK_OPT_BACKING_FILE,
2002 .type = OPT_STRING,
2003 .help = "File name of a base image"
2004 },
2005 { NULL }
2006};
2007
2008BlockDriver bdrv_sheepdog = {
2009 .format_name = "sheepdog",
2010 .protocol_name = "sheepdog",
2011 .instance_size = sizeof(BDRVSheepdogState),
2012 .bdrv_file_open = sd_open,
2013 .bdrv_close = sd_close,
2014 .bdrv_create = sd_create,
2015 .bdrv_getlength = sd_getlength,
2016 .bdrv_truncate = sd_truncate,
2017
2018 .bdrv_aio_readv = sd_aio_readv,
2019 .bdrv_aio_writev = sd_aio_writev,
2020
2021 .bdrv_snapshot_create = sd_snapshot_create,
2022 .bdrv_snapshot_goto = sd_snapshot_goto,
2023 .bdrv_snapshot_delete = sd_snapshot_delete,
2024 .bdrv_snapshot_list = sd_snapshot_list,
2025
2026 .bdrv_save_vmstate = sd_save_vmstate,
2027 .bdrv_load_vmstate = sd_load_vmstate,
2028
2029 .create_options = sd_create_options,
2030};
2031
2032static void bdrv_sheepdog_init(void)
2033{
2034 bdrv_register(&bdrv_sheepdog);
2035}
2036block_init(bdrv_sheepdog_init);