]> git.proxmox.com Git - qemu.git/blame - block/sheepdog.c
sheepdog: accept URIs
[qemu.git] / block / sheepdog.c
CommitLineData
33b1db1c
MK
1/*
2 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License version
6 * 2 as published by the Free Software Foundation.
7 *
8 * You should have received a copy of the GNU General Public License
9 * along with this program. If not, see <http://www.gnu.org/licenses/>.
6b620ca3
PB
10 *
11 * Contributions after 2012-01-13 are licensed under the terms of the
12 * GNU GPL, version 2 or (at your option) any later version.
33b1db1c 13 */
33b1db1c
MK
14
15#include "qemu-common.h"
5d6768e3 16#include "qemu/uri.h"
1de7afc9
PB
17#include "qemu/error-report.h"
18#include "qemu/sockets.h"
737e150e 19#include "block/block_int.h"
1de7afc9 20#include "qemu/bitops.h"
33b1db1c
MK
21
/* protocol version */
#define SD_PROTO_VER                0x01

/* server used when the image specification names none */
#define SD_DEFAULT_ADDR             "localhost"
#define SD_DEFAULT_PORT             "7000"

/* opcodes for object requests */
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ              0x02
#define SD_OP_WRITE_OBJ             0x03

/* opcodes for vdi requests */
#define SD_OP_NEW_VDI               0x11
#define SD_OP_LOCK_VDI              0x12
#define SD_OP_RELEASE_VDI           0x13
#define SD_OP_GET_VDI_INFO          0x14
#define SD_OP_READ_VDIS             0x15
#define SD_OP_FLUSH_VDI             0x16

/* request flags */
#define SD_FLAG_CMD_WRITE           0x01
#define SD_FLAG_CMD_COW             0x02
#define SD_FLAG_CMD_CACHE           0x04 /* Writeback mode for cache */
#define SD_FLAG_CMD_DIRECT          0x08 /* Don't use cache */

/* result codes carried in the `result' field of a response */
#define SD_RES_SUCCESS              0x00 /* Success */
#define SD_RES_UNKNOWN              0x01 /* Unknown error */
#define SD_RES_NO_OBJ               0x02 /* No object found */
#define SD_RES_EIO                  0x03 /* I/O error */
#define SD_RES_VDI_EXIST            0x04 /* Vdi exists already */
#define SD_RES_INVALID_PARMS        0x05 /* Invalid parameters */
#define SD_RES_SYSTEM_ERROR         0x06 /* System error */
#define SD_RES_VDI_LOCKED           0x07 /* Vdi is locked */
#define SD_RES_NO_VDI               0x08 /* No vdi found */
#define SD_RES_NO_BASE_VDI          0x09 /* No base vdi found */
#define SD_RES_VDI_READ             0x0A /* Cannot read requested vdi */
#define SD_RES_VDI_WRITE            0x0B /* Cannot write requested vdi */
#define SD_RES_BASE_VDI_READ        0x0C /* Cannot read base vdi */
#define SD_RES_BASE_VDI_WRITE       0x0D /* Cannot write base vdi */
#define SD_RES_NO_TAG               0x0E /* Requested tag is not found */
#define SD_RES_STARTUP              0x0F /* Sheepdog is on starting up */
#define SD_RES_VDI_NOT_LOCKED       0x10 /* Vdi is not locked */
#define SD_RES_SHUTDOWN             0x11 /* Sheepdog is shutting down */
#define SD_RES_NO_MEM               0x12 /* Cannot allocate memory */
#define SD_RES_FULL_VDI             0x13 /* we already have the maximum vdis */
#define SD_RES_VER_MISMATCH         0x14 /* Protocol version mismatch */
#define SD_RES_NO_SPACE             0x15 /* Server has no room for new objects */
#define SD_RES_WAIT_FOR_FORMAT      0x16 /* Waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN        0x17 /* Waiting for other nodes joining */
#define SD_RES_JOIN_FAILED          0x18 /* Target node had failed to join sheepdog */

/*
 * Object ID rules
 *
 *  0 - 19 (20 bits): data object space
 * 20 - 31 (12 bits): reserved data object space
 * 32 - 55 (24 bits): vdi object space
 * 56 - 59 ( 4 bits): reserved vdi object space
 * 60 - 63 ( 4 bits): object type identifier space
 */

#define VDI_SPACE_SHIFT     32
#define VDI_BIT             (UINT64_C(1) << 63)
#define VMSTATE_BIT         (UINT64_C(1) << 62)
#define MAX_DATA_OBJS       (UINT64_C(1) << 20)
#define MAX_CHILDREN        1024
#define SD_MAX_VDI_LEN      256
#define SD_MAX_VDI_TAG_LEN  256
#define SD_NR_VDIS          (1U << 24)
#define SD_DATA_OBJ_SIZE    (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE     (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
#define SECTOR_SIZE         512

#define SD_INODE_SIZE       (sizeof(SheepdogInode))
#define CURRENT_VDI_ID      0
93
/*
 * Generic request header: common fields followed by eight 32-bit words
 * whose meaning depends on the opcode.
 */
typedef struct SheepdogReq {
    uint8_t     proto_ver;
    uint8_t     opcode;             /* one of SD_OP_* */
    uint16_t    flags;              /* SD_FLAG_CMD_* bits */
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;        /* size of the payload after the header */
    uint32_t    opcode_specific[8];
} SheepdogReq;
103
/*
 * Generic response header: common fields, the result code, and seven
 * 32-bit words whose meaning depends on the opcode.
 */
typedef struct SheepdogRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;        /* size of the payload after the header */
    uint32_t    result;             /* one of SD_RES_* */
    uint32_t    opcode_specific[7];
} SheepdogRsp;
114
/* Request header used for object read/write/flush operations. */
typedef struct SheepdogObjReq {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;             /* echoed back in the response's id field */
    uint32_t    data_length;
    uint64_t    oid;            /* object the request operates on */
    uint64_t    cow_oid;        /* base object of the request */
    uint32_t    copies;
    uint32_t    rsvd;
    uint64_t    offset;         /* byte offset inside the object */
} SheepdogObjReq;
128
/* Response header for object operations. */
typedef struct SheepdogObjRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;             /* id of the request being answered */
    uint32_t    data_length;    /* size of the payload after the header */
    uint32_t    result;         /* one of SD_RES_* */
    uint32_t    copies;
    uint32_t    pad[6];
} SheepdogObjRsp;
140
/* Request header for vdi operations. */
typedef struct SheepdogVdiReq {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint64_t    vdi_size;
    uint32_t    vdi_id;
    uint32_t    copies;
    uint32_t    snapid;
    uint32_t    pad[3];
} SheepdogVdiReq;
154
/* Response header for vdi operations. */
typedef struct SheepdogVdiRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint32_t    result;         /* one of SD_RES_* */
    uint32_t    rsvd;
    uint32_t    vdi_id;         /* id of the vdi the request resolved to */
    uint32_t    pad[5];
} SheepdogVdiRsp;
167
168typedef struct SheepdogInode {
169 char name[SD_MAX_VDI_LEN];
170 char tag[SD_MAX_VDI_TAG_LEN];
171 uint64_t ctime;
172 uint64_t snap_ctime;
173 uint64_t vm_clock_nsec;
174 uint64_t vdi_size;
175 uint64_t vm_state_size;
176 uint16_t copy_policy;
177 uint8_t nr_copies;
178 uint8_t block_size_shift;
179 uint32_t snap_id;
180 uint32_t vdi_id;
181 uint32_t parent_vdi_id;
182 uint32_t child_vdi_id[MAX_CHILDREN];
183 uint32_t data_vdi_id[MAX_DATA_OBJS];
184} SheepdogInode;
185
186/*
187 * 64 bit FNV-1a non-zero initial basis
188 */
189#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
190
/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Folds the `len' bytes starting at `buf' into the running hash `hval'
 * and returns the updated hash.  The buffer is only read, so it is taken
 * as a pointer to const; `len' == 0 returns `hval' unchanged.
 */
static inline uint64_t fnv_64a_buf(const void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bp = buf;
    const unsigned char *be = bp + len;

    while (bp < be) {
        /* xor the bottom with the current octet */
        hval ^= (uint64_t) *bp++;
        /* multiply by the 64 bit FNV prime (0x100000001b3) using shifts */
        hval += (hval << 1) + (hval << 4) + (hval << 5) +
                (hval << 7) + (hval << 8) + (hval << 40);
    }
    return hval;
}
205
2f536801 206static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
33b1db1c
MK
207{
208 return inode->vdi_id == inode->data_vdi_id[idx];
209}
210
2f536801 211static inline bool is_data_obj(uint64_t oid)
33b1db1c
MK
212{
213 return !(VDI_BIT & oid);
214}
215
216static inline uint64_t data_oid_to_idx(uint64_t oid)
217{
218 return oid & (MAX_DATA_OBJS - 1);
219}
220
221static inline uint64_t vid_to_vdi_oid(uint32_t vid)
222{
223 return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
224}
225
226static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
227{
228 return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
229}
230
231static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
232{
233 return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
234}
235
2f536801 236static inline bool is_snapshot(struct SheepdogInode *inode)
33b1db1c
MK
237{
238 return !!inode->snap_ctime;
239}
240
/*
 * Debug trace helper: prints "<function> <line>: <message>" to stdout
 * when DEBUG_SDOG is defined and expands to nothing otherwise (the
 * arguments are then not evaluated).
 */
#undef dprintf
#ifndef DEBUG_SDOG
#define dprintf(fmt, ...)
#else
#define dprintf(fmt, ...)                                                   \
    do {                                                                    \
        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##__VA_ARGS__); \
    } while (0)
#endif
250
251typedef struct SheepdogAIOCB SheepdogAIOCB;
252
253typedef struct AIOReq {
254 SheepdogAIOCB *aiocb;
255 unsigned int iov_offset;
256
257 uint64_t oid;
258 uint64_t base_oid;
259 uint64_t offset;
260 unsigned int data_len;
261 uint8_t flags;
262 uint32_t id;
263
c292ee6a 264 QLIST_ENTRY(AIOReq) aio_siblings;
33b1db1c
MK
265} AIOReq;
266
/* Kind of operation a SheepdogAIOCB performs. */
enum AIOCBState {
    AIOCB_WRITE_UDATA = 0,
    AIOCB_READ_UDATA  = 1,
    AIOCB_FLUSH_CACHE = 2,
};
272
273struct SheepdogAIOCB {
274 BlockDriverAIOCB common;
275
276 QEMUIOVector *qiov;
277
278 int64_t sector_num;
279 int nb_sectors;
280
281 int ret;
282 enum AIOCBState aiocb_type;
283
2df46246 284 Coroutine *coroutine;
33b1db1c
MK
285 void (*aio_done_func)(SheepdogAIOCB *);
286
2f536801 287 bool canceled;
1d732d7d 288 int nr_pending;
33b1db1c
MK
289};
290
291typedef struct BDRVSheepdogState {
292 SheepdogInode inode;
293
294 uint32_t min_dirty_data_idx;
295 uint32_t max_dirty_data_idx;
296
297 char name[SD_MAX_VDI_LEN];
2f536801 298 bool is_snapshot;
0e7106d8 299 uint32_t cache_flags;
33b1db1c
MK
300
301 char *addr;
302 char *port;
303 int fd;
304
2df46246
MK
305 CoMutex lock;
306 Coroutine *co_send;
307 Coroutine *co_recv;
308
33b1db1c 309 uint32_t aioreq_seq_num;
c292ee6a
MK
310 QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
311 QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
33b1db1c
MK
312} BDRVSheepdogState;
313
314static const char * sd_strerror(int err)
315{
316 int i;
317
318 static const struct {
319 int err;
320 const char *desc;
321 } errors[] = {
322 {SD_RES_SUCCESS, "Success"},
323 {SD_RES_UNKNOWN, "Unknown error"},
324 {SD_RES_NO_OBJ, "No object found"},
325 {SD_RES_EIO, "I/O error"},
326 {SD_RES_VDI_EXIST, "VDI exists already"},
327 {SD_RES_INVALID_PARMS, "Invalid parameters"},
328 {SD_RES_SYSTEM_ERROR, "System error"},
329 {SD_RES_VDI_LOCKED, "VDI is already locked"},
330 {SD_RES_NO_VDI, "No vdi found"},
331 {SD_RES_NO_BASE_VDI, "No base VDI found"},
332 {SD_RES_VDI_READ, "Failed read the requested VDI"},
333 {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
334 {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
335 {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
336 {SD_RES_NO_TAG, "Failed to find the requested tag"},
337 {SD_RES_STARTUP, "The system is still booting"},
338 {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
339 {SD_RES_SHUTDOWN, "The system is shutting down"},
340 {SD_RES_NO_MEM, "Out of memory on the server"},
341 {SD_RES_FULL_VDI, "We already have the maximum vdis"},
342 {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
343 {SD_RES_NO_SPACE, "Server has no space for new objects"},
344 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
345 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
346 {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
347 };
348
349 for (i = 0; i < ARRAY_SIZE(errors); ++i) {
350 if (errors[i].err == err) {
351 return errors[i].desc;
352 }
353 }
354
355 return "Invalid error code";
356}
357
358/*
359 * Sheepdog I/O handling:
360 *
2df46246 361 * 1. In sd_co_rw_vector, we send the I/O requests to the server and
c292ee6a 362 * link the requests to the inflight_list in the
2df46246
MK
363 * BDRVSheepdogState. The function exits without waiting for
364 * receiving the response.
33b1db1c 365 *
2df46246 366 * 2. We receive the response in aio_read_response, the fd handler to
33b1db1c
MK
367 * the sheepdog connection. If metadata update is needed, we send
368 * the write request to the vdi object in sd_write_done, the write
2df46246
MK
369 * completion function. We switch back to sd_co_readv/writev after
370 * all the requests belonging to the AIOCB are finished.
33b1db1c
MK
371 */
372
373static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
374 uint64_t oid, unsigned int data_len,
375 uint64_t offset, uint8_t flags,
376 uint64_t base_oid, unsigned int iov_offset)
377{
378 AIOReq *aio_req;
379
7267c094 380 aio_req = g_malloc(sizeof(*aio_req));
33b1db1c
MK
381 aio_req->aiocb = acb;
382 aio_req->iov_offset = iov_offset;
383 aio_req->oid = oid;
384 aio_req->base_oid = base_oid;
385 aio_req->offset = offset;
386 aio_req->data_len = data_len;
387 aio_req->flags = flags;
388 aio_req->id = s->aioreq_seq_num++;
389
1d732d7d 390 acb->nr_pending++;
33b1db1c
MK
391 return aio_req;
392}
393
1d732d7d 394static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
33b1db1c
MK
395{
396 SheepdogAIOCB *acb = aio_req->aiocb;
1d732d7d 397
c292ee6a 398 QLIST_REMOVE(aio_req, aio_siblings);
7267c094 399 g_free(aio_req);
33b1db1c 400
1d732d7d 401 acb->nr_pending--;
33b1db1c
MK
402}
403
d8716b41 404static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
33b1db1c
MK
405{
406 if (!acb->canceled) {
2df46246 407 qemu_coroutine_enter(acb->coroutine, NULL);
33b1db1c
MK
408 }
409 qemu_aio_release(acb);
410}
411
412static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
413{
414 SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
415
416 /*
417 * Sheepdog cannot cancel the requests which are already sent to
418 * the servers, so we just complete the request with -EIO here.
419 */
2df46246
MK
420 acb->ret = -EIO;
421 qemu_coroutine_enter(acb->coroutine, NULL);
2f536801 422 acb->canceled = true;
33b1db1c
MK
423}
424
d7331bed 425static const AIOCBInfo sd_aiocb_info = {
33b1db1c
MK
426 .aiocb_size = sizeof(SheepdogAIOCB),
427 .cancel = sd_aio_cancel,
428};
429
430static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
f700f8e3 431 int64_t sector_num, int nb_sectors)
33b1db1c
MK
432{
433 SheepdogAIOCB *acb;
434
f700f8e3 435 acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
33b1db1c
MK
436
437 acb->qiov = qiov;
438
439 acb->sector_num = sector_num;
440 acb->nb_sectors = nb_sectors;
441
442 acb->aio_done_func = NULL;
2f536801 443 acb->canceled = false;
2df46246 444 acb->coroutine = qemu_coroutine_self();
33b1db1c 445 acb->ret = 0;
1d732d7d 446 acb->nr_pending = 0;
33b1db1c
MK
447 return acb;
448}
449
33b1db1c
MK
/*
 * Open a TCP connection to the sheepdog server at `addr':`port'.
 *
 * When `addr' is NULL both the default address and the default port are
 * used.  Every address returned by getaddrinfo() is tried in turn until
 * one connects.
 *
 * Returns the connected socket descriptor, or a negative errno value on
 * failure (never 0 for a failure).
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret;
    int err = 0;    /* errno of the most recent failed socket()/connect() */
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /*
         * getaddrinfo() reports failures through its return value, not
         * through errno, so the message comes from gai_strerror() and a
         * fixed error code is returned.
         */
        error_report("unable to get address info %s, %s",
                     addr, gai_strerror(ret));
        return -EINVAL;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            err = errno;
            continue;
        }

        /* retry the connect when it is interrupted by a signal */
        do {
            ret = connect(fd, res->ai_addr, res->ai_addrlen);
        } while (ret < 0 && errno == EINTR);

        if (ret == 0) {
            dprintf("connected to %s:%s\n", addr, port);
            goto out;
        }

        /* remember the reason before close() can overwrite errno */
        err = errno;
        close(fd);
        /* fall through to the next candidate address */
    }

    error_report("failed connect to %s:%s", addr, port);
    fd = err ? -err : -EIO;
out:
    freeaddrinfo(res0);
    return fd;
}
502
e0d93a89
MK
503static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
504 unsigned int *wlen)
47622c44
LY
505{
506 int ret;
507
508 ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
509 if (ret < sizeof(*hdr)) {
510 error_report("failed to send a req, %s", strerror(errno));
eb092180 511 return ret;
47622c44
LY
512 }
513
514 ret = qemu_co_send(sockfd, data, *wlen);
515 if (ret < *wlen) {
516 error_report("failed to send a req, %s", strerror(errno));
517 }
518
519 return ret;
520}
e0d93a89 521
2dfcca3b
MK
522static void restart_co_req(void *opaque)
523{
524 Coroutine *co = opaque;
525
526 qemu_coroutine_enter(co, NULL);
527}
528
cddd4ac7
MK
529typedef struct SheepdogReqCo {
530 int sockfd;
531 SheepdogReq *hdr;
532 void *data;
533 unsigned int *wlen;
534 unsigned int *rlen;
535 int ret;
536 bool finished;
537} SheepdogReqCo;
538
539static coroutine_fn void do_co_req(void *opaque)
47622c44
LY
540{
541 int ret;
2dfcca3b 542 Coroutine *co;
cddd4ac7
MK
543 SheepdogReqCo *srco = opaque;
544 int sockfd = srco->sockfd;
545 SheepdogReq *hdr = srco->hdr;
546 void *data = srco->data;
547 unsigned int *wlen = srco->wlen;
548 unsigned int *rlen = srco->rlen;
2dfcca3b
MK
549
550 co = qemu_coroutine_self();
551 qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co);
47622c44
LY
552
553 socket_set_block(sockfd);
554 ret = send_co_req(sockfd, hdr, data, wlen);
555 if (ret < 0) {
556 goto out;
557 }
558
2dfcca3b
MK
559 qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co);
560
47622c44
LY
561 ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
562 if (ret < sizeof(*hdr)) {
563 error_report("failed to get a rsp, %s", strerror(errno));
cb595887 564 ret = -errno;
47622c44
LY
565 goto out;
566 }
567
568 if (*rlen > hdr->data_length) {
569 *rlen = hdr->data_length;
570 }
571
572 if (*rlen) {
573 ret = qemu_co_recv(sockfd, data, *rlen);
574 if (ret < *rlen) {
575 error_report("failed to get the data, %s", strerror(errno));
cb595887 576 ret = -errno;
47622c44
LY
577 goto out;
578 }
579 }
580 ret = 0;
581out:
2dfcca3b 582 qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL);
47622c44 583 socket_set_nonblock(sockfd);
cddd4ac7
MK
584
585 srco->ret = ret;
586 srco->finished = true;
587}
588
589static int do_req(int sockfd, SheepdogReq *hdr, void *data,
590 unsigned int *wlen, unsigned int *rlen)
591{
592 Coroutine *co;
593 SheepdogReqCo srco = {
594 .sockfd = sockfd,
595 .hdr = hdr,
596 .data = data,
597 .wlen = wlen,
598 .rlen = rlen,
599 .ret = 0,
600 .finished = false,
601 };
602
603 if (qemu_in_coroutine()) {
604 do_co_req(&srco);
605 } else {
606 co = qemu_coroutine_create(do_co_req);
607 qemu_coroutine_enter(co, &srco);
608 while (!srco.finished) {
609 qemu_aio_wait();
610 }
611 }
612
613 return srco.ret;
47622c44
LY
614}
615
d8716b41 616static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
2f536801 617 struct iovec *iov, int niov, bool create,
33b1db1c
MK
618 enum AIOCBState aiocb_type);
619
7dc1cde0
MK
620
621static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
622{
623 AIOReq *aio_req;
624
625 QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) {
626 if (aio_req->oid == oid) {
627 return aio_req;
628 }
629 }
630
631 return NULL;
632}
633
33b1db1c
MK
634/*
635 * This function searchs pending requests to the object `oid', and
636 * sends them.
637 */
c292ee6a 638static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
33b1db1c 639{
7dc1cde0 640 AIOReq *aio_req;
33b1db1c
MK
641 SheepdogAIOCB *acb;
642 int ret;
643
7dc1cde0 644 while ((aio_req = find_pending_req(s, oid)) != NULL) {
33b1db1c 645 acb = aio_req->aiocb;
c292ee6a
MK
646 /* move aio_req from pending list to inflight one */
647 QLIST_REMOVE(aio_req, aio_siblings);
648 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
33b1db1c 649 ret = add_aio_request(s, aio_req, acb->qiov->iov,
2f536801 650 acb->qiov->niov, false, acb->aiocb_type);
33b1db1c 651 if (ret < 0) {
6daf194d 652 error_report("add_aio_request is failed");
33b1db1c 653 free_aio_req(s, aio_req);
1d732d7d 654 if (!acb->nr_pending) {
33b1db1c
MK
655 sd_finish_aiocb(acb);
656 }
657 }
658 }
659}
660
661/*
662 * Receive responses of the I/O requests.
663 *
664 * This function is registered as a fd handler, and called from the
665 * main loop when s->fd is ready for reading responses.
666 */
d8716b41 667static void coroutine_fn aio_read_response(void *opaque)
33b1db1c
MK
668{
669 SheepdogObjRsp rsp;
670 BDRVSheepdogState *s = opaque;
671 int fd = s->fd;
672 int ret;
673 AIOReq *aio_req = NULL;
674 SheepdogAIOCB *acb;
33b1db1c
MK
675 unsigned long idx;
676
c292ee6a 677 if (QLIST_EMPTY(&s->inflight_aio_head)) {
2df46246 678 goto out;
33b1db1c
MK
679 }
680
681 /* read a header */
8c5135f9
PB
682 ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
683 if (ret < 0) {
6daf194d 684 error_report("failed to get the header, %s", strerror(errno));
2df46246 685 goto out;
33b1db1c
MK
686 }
687
c292ee6a
MK
688 /* find the right aio_req from the inflight aio list */
689 QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
33b1db1c
MK
690 if (aio_req->id == rsp.id) {
691 break;
692 }
693 }
694 if (!aio_req) {
6daf194d 695 error_report("cannot find aio_req %x", rsp.id);
2df46246 696 goto out;
33b1db1c
MK
697 }
698
699 acb = aio_req->aiocb;
700
701 switch (acb->aiocb_type) {
702 case AIOCB_WRITE_UDATA:
6d1acda8
MK
703 /* this coroutine context is no longer suitable for co_recv
704 * because we may send data to update vdi objects */
705 s->co_recv = NULL;
33b1db1c
MK
706 if (!is_data_obj(aio_req->oid)) {
707 break;
708 }
709 idx = data_oid_to_idx(aio_req->oid);
710
711 if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
712 /*
713 * If the object is newly created one, we need to update
714 * the vdi object (metadata object). min_dirty_data_idx
715 * and max_dirty_data_idx are changed to include updated
716 * index between them.
717 */
bd751f22
LY
718 if (rsp.result == SD_RES_SUCCESS) {
719 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
720 s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
721 s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
722 }
33b1db1c
MK
723 /*
724 * Some requests may be blocked because simultaneous
725 * create requests are not allowed, so we search the
726 * pending requests here.
727 */
d6b1ef89 728 send_pending_req(s, aio_req->oid);
33b1db1c
MK
729 }
730 break;
731 case AIOCB_READ_UDATA:
2fc8ae1d
MT
732 ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
733 aio_req->iov_offset, rsp.data_length);
8c5135f9 734 if (ret < 0) {
6daf194d 735 error_report("failed to get the data, %s", strerror(errno));
2df46246 736 goto out;
33b1db1c
MK
737 }
738 break;
47783072
LY
739 case AIOCB_FLUSH_CACHE:
740 if (rsp.result == SD_RES_INVALID_PARMS) {
741 dprintf("disable cache since the server doesn't support it\n");
742 s->cache_flags = SD_FLAG_CMD_DIRECT;
743 rsp.result = SD_RES_SUCCESS;
744 }
745 break;
33b1db1c
MK
746 }
747
748 if (rsp.result != SD_RES_SUCCESS) {
749 acb->ret = -EIO;
6daf194d 750 error_report("%s", sd_strerror(rsp.result));
33b1db1c
MK
751 }
752
1d732d7d
MK
753 free_aio_req(s, aio_req);
754 if (!acb->nr_pending) {
33b1db1c
MK
755 /*
756 * We've finished all requests which belong to the AIOCB, so
2df46246 757 * we can switch back to sd_co_readv/writev now.
33b1db1c
MK
758 */
759 acb->aio_done_func(acb);
760 }
2df46246
MK
761out:
762 s->co_recv = NULL;
763}
764
765static void co_read_response(void *opaque)
766{
767 BDRVSheepdogState *s = opaque;
768
769 if (!s->co_recv) {
770 s->co_recv = qemu_coroutine_create(aio_read_response);
771 }
772
773 qemu_coroutine_enter(s->co_recv, opaque);
774}
775
776static void co_write_request(void *opaque)
777{
778 BDRVSheepdogState *s = opaque;
779
780 qemu_coroutine_enter(s->co_send, NULL);
33b1db1c
MK
781}
782
783static int aio_flush_request(void *opaque)
784{
785 BDRVSheepdogState *s = opaque;
786
c292ee6a
MK
787 return !QLIST_EMPTY(&s->inflight_aio_head) ||
788 !QLIST_EMPTY(&s->pending_aio_head);
33b1db1c
MK
789}
790
33b1db1c
MK
791/*
792 * Return a socket discriptor to read/write objects.
793 *
794 * We cannot use this discriptor for other operations because
795 * the block driver may be on waiting response from the server.
796 */
797static int get_sheep_fd(BDRVSheepdogState *s)
798{
799 int ret, fd;
800
801 fd = connect_to_sdog(s->addr, s->port);
802 if (fd < 0) {
6daf194d 803 error_report("%s", strerror(errno));
cb595887 804 return fd;
33b1db1c
MK
805 }
806
807 socket_set_nonblock(fd);
808
bf1c852a 809 ret = socket_set_nodelay(fd);
33b1db1c 810 if (ret) {
6daf194d 811 error_report("%s", strerror(errno));
33b1db1c 812 closesocket(fd);
cb595887 813 return -errno;
33b1db1c
MK
814 }
815
bafbd6a1 816 qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s);
33b1db1c
MK
817 return fd;
818}
819
5d6768e3
MK
820static int sd_parse_uri(BDRVSheepdogState *s, const char *filename,
821 char *vdi, uint32_t *snapid, char *tag)
822{
823 URI *uri;
824 QueryParams *qp = NULL;
825 int ret = 0;
826
827 uri = uri_parse(filename);
828 if (!uri) {
829 return -EINVAL;
830 }
831
832 if (uri->path == NULL || !strcmp(uri->path, "/")) {
833 ret = -EINVAL;
834 goto out;
835 }
836 pstrcpy(vdi, SD_MAX_VDI_LEN, uri->path + 1);
837
838 /* sheepdog[+tcp]://[host:port]/vdiname */
839 s->addr = g_strdup(uri->server ?: SD_DEFAULT_ADDR);
840 if (uri->port) {
841 s->port = g_strdup_printf("%d", uri->port);
842 } else {
843 s->port = g_strdup(SD_DEFAULT_PORT);
844 }
845
846 /* snapshot tag */
847 if (uri->fragment) {
848 *snapid = strtoul(uri->fragment, NULL, 10);
849 if (*snapid == 0) {
850 pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri->fragment);
851 }
852 } else {
853 *snapid = CURRENT_VDI_ID; /* search current vdi */
854 }
855
856out:
857 if (qp) {
858 query_params_free(qp);
859 }
860 uri_free(uri);
861 return ret;
862}
863
33b1db1c 864/*
5d6768e3 865 * Parse a filename (old syntax)
33b1db1c
MK
866 *
867 * filename must be one of the following formats:
868 * 1. [vdiname]
869 * 2. [vdiname]:[snapid]
870 * 3. [vdiname]:[tag]
871 * 4. [hostname]:[port]:[vdiname]
872 * 5. [hostname]:[port]:[vdiname]:[snapid]
873 * 6. [hostname]:[port]:[vdiname]:[tag]
874 *
875 * You can boot from the snapshot images by specifying `snapid` or
876 * `tag'.
877 *
878 * You can run VMs outside the Sheepdog cluster by specifying
879 * `hostname' and `port' (experimental).
880 */
881static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
882 char *vdi, uint32_t *snapid, char *tag)
883{
5d6768e3
MK
884 char *p, *q, *uri;
885 const char *host_spec, *vdi_spec;
886 int nr_sep, ret;
33b1db1c 887
5d6768e3 888 strstart(filename, "sheepdog:", (const char **)&filename);
7267c094 889 p = q = g_strdup(filename);
33b1db1c
MK
890
891 /* count the number of separators */
892 nr_sep = 0;
893 while (*p) {
894 if (*p == ':') {
895 nr_sep++;
896 }
897 p++;
898 }
899 p = q;
900
5d6768e3 901 /* use the first two tokens as host_spec. */
33b1db1c 902 if (nr_sep >= 2) {
5d6768e3 903 host_spec = p;
33b1db1c 904 p = strchr(p, ':');
5d6768e3 905 p++;
33b1db1c
MK
906 p = strchr(p, ':');
907 *p++ = '\0';
908 } else {
5d6768e3 909 host_spec = "";
33b1db1c
MK
910 }
911
5d6768e3 912 vdi_spec = p;
33b1db1c 913
5d6768e3 914 p = strchr(vdi_spec, ':');
33b1db1c 915 if (p) {
5d6768e3 916 *p++ = '#';
33b1db1c
MK
917 }
918
5d6768e3 919 uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);
33b1db1c 920
5d6768e3
MK
921 ret = sd_parse_uri(s, uri, vdi, snapid, tag);
922
923 g_free(q);
924 g_free(uri);
925
926 return ret;
33b1db1c
MK
927}
928
929static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
930 char *tag, uint32_t *vid, int for_snapshot)
931{
932 int ret, fd;
933 SheepdogVdiReq hdr;
934 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
935 unsigned int wlen, rlen = 0;
936 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
937
938 fd = connect_to_sdog(s->addr, s->port);
939 if (fd < 0) {
cb595887 940 return fd;
33b1db1c
MK
941 }
942
3178e275
JM
943 /* This pair of strncpy calls ensures that the buffer is zero-filled,
944 * which is desirable since we'll soon be sending those bytes, and
945 * don't want the send_req to read uninitialized data.
946 */
33b1db1c
MK
947 strncpy(buf, filename, SD_MAX_VDI_LEN);
948 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
949
950 memset(&hdr, 0, sizeof(hdr));
951 if (for_snapshot) {
952 hdr.opcode = SD_OP_GET_VDI_INFO;
953 } else {
954 hdr.opcode = SD_OP_LOCK_VDI;
955 }
956 wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
957 hdr.proto_ver = SD_PROTO_VER;
958 hdr.data_length = wlen;
959 hdr.snapid = snapid;
960 hdr.flags = SD_FLAG_CMD_WRITE;
961
962 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
963 if (ret) {
33b1db1c
MK
964 goto out;
965 }
966
967 if (rsp->result != SD_RES_SUCCESS) {
6daf194d 968 error_report("cannot get vdi info, %s, %s %d %s",
33b1db1c 969 sd_strerror(rsp->result), filename, snapid, tag);
cb595887
MK
970 if (rsp->result == SD_RES_NO_VDI) {
971 ret = -ENOENT;
972 } else {
973 ret = -EIO;
974 }
33b1db1c
MK
975 goto out;
976 }
977 *vid = rsp->vdi_id;
978
979 ret = 0;
980out:
981 closesocket(fd);
982 return ret;
983}
984
d8716b41 985static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
2f536801 986 struct iovec *iov, int niov, bool create,
33b1db1c
MK
987 enum AIOCBState aiocb_type)
988{
989 int nr_copies = s->inode.nr_copies;
990 SheepdogObjReq hdr;
47783072 991 unsigned int wlen = 0;
33b1db1c
MK
992 int ret;
993 uint64_t oid = aio_req->oid;
994 unsigned int datalen = aio_req->data_len;
995 uint64_t offset = aio_req->offset;
996 uint8_t flags = aio_req->flags;
997 uint64_t old_oid = aio_req->base_oid;
998
999 if (!nr_copies) {
6daf194d 1000 error_report("bug");
33b1db1c
MK
1001 }
1002
1003 memset(&hdr, 0, sizeof(hdr));
1004
47783072
LY
1005 switch (aiocb_type) {
1006 case AIOCB_FLUSH_CACHE:
1007 hdr.opcode = SD_OP_FLUSH_VDI;
1008 break;
1009 case AIOCB_READ_UDATA:
33b1db1c
MK
1010 hdr.opcode = SD_OP_READ_OBJ;
1011 hdr.flags = flags;
47783072
LY
1012 break;
1013 case AIOCB_WRITE_UDATA:
1014 if (create) {
1015 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1016 } else {
1017 hdr.opcode = SD_OP_WRITE_OBJ;
1018 }
33b1db1c 1019 wlen = datalen;
33b1db1c 1020 hdr.flags = SD_FLAG_CMD_WRITE | flags;
47783072 1021 break;
33b1db1c
MK
1022 }
1023
0e7106d8
LY
1024 if (s->cache_flags) {
1025 hdr.flags |= s->cache_flags;
47622c44
LY
1026 }
1027
33b1db1c
MK
1028 hdr.oid = oid;
1029 hdr.cow_oid = old_oid;
1030 hdr.copies = s->inode.nr_copies;
1031
1032 hdr.data_length = datalen;
1033 hdr.offset = offset;
1034
1035 hdr.id = aio_req->id;
1036
2df46246
MK
1037 qemu_co_mutex_lock(&s->lock);
1038 s->co_send = qemu_coroutine_self();
1039 qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request,
bafbd6a1 1040 aio_flush_request, s);
128aa589 1041 socket_set_cork(s->fd, 1);
33b1db1c
MK
1042
1043 /* send a header */
8c5135f9
PB
1044 ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
1045 if (ret < 0) {
c3fecea5 1046 qemu_co_mutex_unlock(&s->lock);
6daf194d 1047 error_report("failed to send a req, %s", strerror(errno));
cb595887 1048 return -errno;
33b1db1c
MK
1049 }
1050
1051 if (wlen) {
2fc8ae1d 1052 ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen);
8c5135f9 1053 if (ret < 0) {
c3fecea5 1054 qemu_co_mutex_unlock(&s->lock);
6daf194d 1055 error_report("failed to send a data, %s", strerror(errno));
cb595887 1056 return -errno;
33b1db1c
MK
1057 }
1058 }
1059
128aa589 1060 socket_set_cork(s->fd, 0);
2df46246 1061 qemu_aio_set_fd_handler(s->fd, co_read_response, NULL,
bafbd6a1 1062 aio_flush_request, s);
2df46246 1063 qemu_co_mutex_unlock(&s->lock);
33b1db1c
MK
1064
1065 return 0;
1066}
1067
1068static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
1069 unsigned int datalen, uint64_t offset,
0e7106d8 1070 bool write, bool create, uint32_t cache_flags)
33b1db1c
MK
1071{
1072 SheepdogObjReq hdr;
1073 SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1074 unsigned int wlen, rlen;
1075 int ret;
1076
1077 memset(&hdr, 0, sizeof(hdr));
1078
1079 if (write) {
1080 wlen = datalen;
1081 rlen = 0;
1082 hdr.flags = SD_FLAG_CMD_WRITE;
1083 if (create) {
1084 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1085 } else {
1086 hdr.opcode = SD_OP_WRITE_OBJ;
1087 }
1088 } else {
1089 wlen = 0;
1090 rlen = datalen;
1091 hdr.opcode = SD_OP_READ_OBJ;
1092 }
47622c44 1093
0e7106d8 1094 hdr.flags |= cache_flags;
47622c44 1095
33b1db1c
MK
1096 hdr.oid = oid;
1097 hdr.data_length = datalen;
1098 hdr.offset = offset;
1099 hdr.copies = copies;
1100
1101 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1102 if (ret) {
6daf194d 1103 error_report("failed to send a request to the sheep");
cb595887 1104 return ret;
33b1db1c
MK
1105 }
1106
1107 switch (rsp->result) {
1108 case SD_RES_SUCCESS:
1109 return 0;
1110 default:
6daf194d 1111 error_report("%s", sd_strerror(rsp->result));
cb595887 1112 return -EIO;
33b1db1c
MK
1113 }
1114}
1115
/*
 * Read `datalen' bytes at `offset' of the object `oid' into `buf'.
 * Returns the result of read_write_object().
 */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset,
                       uint32_t cache_flags)
{
    const bool is_write = false;
    const bool do_create = false;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, do_create, cache_flags);
}
1123
/*
 * Write `datalen' bytes from `buf' at `offset' of object `oid'.
 * When `create' is set the object is created by the same request.
 * Thin wrapper around read_write_object() with write set.
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, bool create,
                        uint32_t cache_flags)
{
    const bool is_write = true;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create, cache_flags);
}
1131
1132static int sd_open(BlockDriverState *bs, const char *filename, int flags)
1133{
1134 int ret, fd;
1135 uint32_t vid = 0;
1136 BDRVSheepdogState *s = bs->opaque;
1137 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1138 uint32_t snapid;
1139 char *buf = NULL;
1140
c292ee6a
MK
1141 QLIST_INIT(&s->inflight_aio_head);
1142 QLIST_INIT(&s->pending_aio_head);
33b1db1c
MK
1143 s->fd = -1;
1144
1145 memset(vdi, 0, sizeof(vdi));
1146 memset(tag, 0, sizeof(tag));
5d6768e3
MK
1147
1148 if (strstr(filename, "://")) {
1149 ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
1150 } else {
1151 ret = parse_vdiname(s, filename, vdi, &snapid, tag);
1152 }
1153 if (ret < 0) {
33b1db1c
MK
1154 goto out;
1155 }
1156 s->fd = get_sheep_fd(s);
1157 if (s->fd < 0) {
cb595887 1158 ret = s->fd;
33b1db1c
MK
1159 goto out;
1160 }
1161
1162 ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
1163 if (ret) {
1164 goto out;
1165 }
1166
0e7106d8
LY
1167 /*
1168 * QEMU block layer emulates writethrough cache as 'writeback + flush', so
1169 * we always set SD_FLAG_CMD_CACHE (writeback cache) as default.
1170 */
1171 s->cache_flags = SD_FLAG_CMD_CACHE;
1172 if (flags & BDRV_O_NOCACHE) {
1173 s->cache_flags = SD_FLAG_CMD_DIRECT;
1174 }
1175
622b6057 1176 if (snapid || tag[0] != '\0') {
33b1db1c 1177 dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
2f536801 1178 s->is_snapshot = true;
33b1db1c
MK
1179 }
1180
1181 fd = connect_to_sdog(s->addr, s->port);
1182 if (fd < 0) {
6daf194d 1183 error_report("failed to connect");
cb595887 1184 ret = fd;
33b1db1c
MK
1185 goto out;
1186 }
1187
7267c094 1188 buf = g_malloc(SD_INODE_SIZE);
47622c44 1189 ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0,
0e7106d8 1190 s->cache_flags);
33b1db1c
MK
1191
1192 closesocket(fd);
1193
1194 if (ret) {
1195 goto out;
1196 }
1197
1198 memcpy(&s->inode, buf, sizeof(s->inode));
1199 s->min_dirty_data_idx = UINT32_MAX;
1200 s->max_dirty_data_idx = 0;
1201
1202 bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
3178e275 1203 pstrcpy(s->name, sizeof(s->name), vdi);
2df46246 1204 qemu_co_mutex_init(&s->lock);
7267c094 1205 g_free(buf);
33b1db1c
MK
1206 return 0;
1207out:
bafbd6a1 1208 qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL);
33b1db1c
MK
1209 if (s->fd >= 0) {
1210 closesocket(s->fd);
1211 }
7267c094 1212 g_free(buf);
cb595887 1213 return ret;
33b1db1c
MK
1214}
1215
1216static int do_sd_create(char *filename, int64_t vdi_size,
1217 uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1218 const char *addr, const char *port)
1219{
1220 SheepdogVdiReq hdr;
1221 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1222 int fd, ret;
1223 unsigned int wlen, rlen = 0;
1224 char buf[SD_MAX_VDI_LEN];
1225
1226 fd = connect_to_sdog(addr, port);
1227 if (fd < 0) {
cb595887 1228 return fd;
33b1db1c
MK
1229 }
1230
3178e275
JM
1231 /* FIXME: would it be better to fail (e.g., return -EIO) when filename
1232 * does not fit in buf? For now, just truncate and avoid buffer overrun.
1233 */
33b1db1c 1234 memset(buf, 0, sizeof(buf));
3178e275 1235 pstrcpy(buf, sizeof(buf), filename);
33b1db1c
MK
1236
1237 memset(&hdr, 0, sizeof(hdr));
1238 hdr.opcode = SD_OP_NEW_VDI;
6f74c260 1239 hdr.vdi_id = base_vid;
33b1db1c
MK
1240
1241 wlen = SD_MAX_VDI_LEN;
1242
1243 hdr.flags = SD_FLAG_CMD_WRITE;
1244 hdr.snapid = snapshot;
1245
1246 hdr.data_length = wlen;
1247 hdr.vdi_size = vdi_size;
1248
1249 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1250
1251 closesocket(fd);
1252
1253 if (ret) {
cb595887 1254 return ret;
33b1db1c
MK
1255 }
1256
1257 if (rsp->result != SD_RES_SUCCESS) {
6daf194d 1258 error_report("%s, %s", sd_strerror(rsp->result), filename);
33b1db1c
MK
1259 return -EIO;
1260 }
1261
1262 if (vdi_id) {
1263 *vdi_id = rsp->vdi_id;
1264 }
1265
1266 return 0;
1267}
1268
a8e0fdd7
MK
1269static int sd_prealloc(const char *filename)
1270{
1271 BlockDriverState *bs = NULL;
1272 uint32_t idx, max_idx;
1273 int64_t vdi_size;
7267c094 1274 void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
a8e0fdd7
MK
1275 int ret;
1276
1277 ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
1278 if (ret < 0) {
1279 goto out;
1280 }
1281
1282 vdi_size = bdrv_getlength(bs);
1283 if (vdi_size < 0) {
1284 ret = vdi_size;
1285 goto out;
1286 }
1287 max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
1288
1289 for (idx = 0; idx < max_idx; idx++) {
1290 /*
1291 * The created image can be a cloned image, so we need to read
1292 * a data from the source image.
1293 */
1294 ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
1295 if (ret < 0) {
1296 goto out;
1297 }
1298 ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
1299 if (ret < 0) {
1300 goto out;
1301 }
1302 }
1303out:
1304 if (bs) {
1305 bdrv_delete(bs);
1306 }
7267c094 1307 g_free(buf);
a8e0fdd7
MK
1308
1309 return ret;
1310}
1311
33b1db1c
MK
1312static int sd_create(const char *filename, QEMUOptionParameter *options)
1313{
b6fc8245 1314 int ret = 0;
b4447363 1315 uint32_t vid = 0, base_vid = 0;
33b1db1c
MK
1316 int64_t vdi_size = 0;
1317 char *backing_file = NULL;
b6fc8245 1318 BDRVSheepdogState *s;
b4447363
MK
1319 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1320 uint32_t snapid;
2f536801 1321 bool prealloc = false;
33b1db1c 1322
b6fc8245
MK
1323 s = g_malloc0(sizeof(BDRVSheepdogState));
1324
b4447363
MK
1325 memset(vdi, 0, sizeof(vdi));
1326 memset(tag, 0, sizeof(tag));
5d6768e3
MK
1327 if (strstr(filename, "://")) {
1328 ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
1329 } else {
1330 ret = parse_vdiname(s, filename, vdi, &snapid, tag);
1331 }
1332 if (ret < 0) {
b6fc8245 1333 goto out;
b4447363
MK
1334 }
1335
33b1db1c
MK
1336 while (options && options->name) {
1337 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1338 vdi_size = options->value.n;
1339 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1340 backing_file = options->value.s;
a8e0fdd7
MK
1341 } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1342 if (!options->value.s || !strcmp(options->value.s, "off")) {
2f536801 1343 prealloc = false;
a8e0fdd7 1344 } else if (!strcmp(options->value.s, "full")) {
2f536801 1345 prealloc = true;
a8e0fdd7
MK
1346 } else {
1347 error_report("Invalid preallocation mode: '%s'",
1348 options->value.s);
b6fc8245
MK
1349 ret = -EINVAL;
1350 goto out;
a8e0fdd7 1351 }
33b1db1c
MK
1352 }
1353 options++;
1354 }
1355
1356 if (vdi_size > SD_MAX_VDI_SIZE) {
6daf194d 1357 error_report("too big image size");
b6fc8245
MK
1358 ret = -EINVAL;
1359 goto out;
33b1db1c
MK
1360 }
1361
1362 if (backing_file) {
1363 BlockDriverState *bs;
1364 BDRVSheepdogState *s;
1365 BlockDriver *drv;
1366
1367 /* Currently, only Sheepdog backing image is supported. */
1368 drv = bdrv_find_protocol(backing_file);
1369 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
6daf194d 1370 error_report("backing_file must be a sheepdog image");
b6fc8245
MK
1371 ret = -EINVAL;
1372 goto out;
33b1db1c
MK
1373 }
1374
1375 ret = bdrv_file_open(&bs, backing_file, 0);
cb595887 1376 if (ret < 0) {
b6fc8245 1377 goto out;
cb595887 1378 }
33b1db1c
MK
1379
1380 s = bs->opaque;
1381
1382 if (!is_snapshot(&s->inode)) {
6daf194d 1383 error_report("cannot clone from a non snapshot vdi");
33b1db1c 1384 bdrv_delete(bs);
b6fc8245
MK
1385 ret = -EINVAL;
1386 goto out;
33b1db1c
MK
1387 }
1388
b4447363 1389 base_vid = s->inode.vdi_id;
33b1db1c
MK
1390 bdrv_delete(bs);
1391 }
1392
b6fc8245 1393 ret = do_sd_create(vdi, vdi_size, base_vid, &vid, 0, s->addr, s->port);
a8e0fdd7 1394 if (!prealloc || ret) {
b6fc8245 1395 goto out;
a8e0fdd7
MK
1396 }
1397
b6fc8245
MK
1398 ret = sd_prealloc(filename);
1399out:
1400 g_free(s);
1401 return ret;
33b1db1c
MK
1402}
1403
1404static void sd_close(BlockDriverState *bs)
1405{
1406 BDRVSheepdogState *s = bs->opaque;
1407 SheepdogVdiReq hdr;
1408 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1409 unsigned int wlen, rlen = 0;
1410 int fd, ret;
1411
1412 dprintf("%s\n", s->name);
1413
1414 fd = connect_to_sdog(s->addr, s->port);
1415 if (fd < 0) {
1416 return;
1417 }
1418
1419 memset(&hdr, 0, sizeof(hdr));
1420
1421 hdr.opcode = SD_OP_RELEASE_VDI;
6f74c260 1422 hdr.vdi_id = s->inode.vdi_id;
33b1db1c
MK
1423 wlen = strlen(s->name) + 1;
1424 hdr.data_length = wlen;
1425 hdr.flags = SD_FLAG_CMD_WRITE;
1426
1427 ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
1428
1429 closesocket(fd);
1430
1431 if (!ret && rsp->result != SD_RES_SUCCESS &&
1432 rsp->result != SD_RES_VDI_NOT_LOCKED) {
6daf194d 1433 error_report("%s, %s", sd_strerror(rsp->result), s->name);
33b1db1c
MK
1434 }
1435
bafbd6a1 1436 qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL);
33b1db1c 1437 closesocket(s->fd);
7267c094 1438 g_free(s->addr);
5d6768e3 1439 g_free(s->port);
33b1db1c
MK
1440}
1441
1442static int64_t sd_getlength(BlockDriverState *bs)
1443{
1444 BDRVSheepdogState *s = bs->opaque;
1445
1446 return s->inode.vdi_size;
1447}
1448
1449static int sd_truncate(BlockDriverState *bs, int64_t offset)
1450{
1451 BDRVSheepdogState *s = bs->opaque;
1452 int ret, fd;
1453 unsigned int datalen;
1454
1455 if (offset < s->inode.vdi_size) {
6daf194d 1456 error_report("shrinking is not supported");
33b1db1c
MK
1457 return -EINVAL;
1458 } else if (offset > SD_MAX_VDI_SIZE) {
6daf194d 1459 error_report("too big image size");
33b1db1c
MK
1460 return -EINVAL;
1461 }
1462
1463 fd = connect_to_sdog(s->addr, s->port);
1464 if (fd < 0) {
cb595887 1465 return fd;
33b1db1c
MK
1466 }
1467
1468 /* we don't need to update entire object */
1469 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1470 s->inode.vdi_size = offset;
1471 ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
0e7106d8 1472 s->inode.nr_copies, datalen, 0, false, s->cache_flags);
33b1db1c
MK
1473 close(fd);
1474
1475 if (ret < 0) {
6daf194d 1476 error_report("failed to update an inode.");
33b1db1c
MK
1477 }
1478
cb595887 1479 return ret;
33b1db1c
MK
1480}
1481
1482/*
1483 * This function is called after writing data objects. If we need to
1484 * update metadata, this sends a write request to the vdi object.
2df46246 1485 * Otherwise, this switches back to sd_co_readv/writev.
33b1db1c 1486 */
d8716b41 1487static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
33b1db1c
MK
1488{
1489 int ret;
1490 BDRVSheepdogState *s = acb->common.bs->opaque;
1491 struct iovec iov;
1492 AIOReq *aio_req;
1493 uint32_t offset, data_len, mn, mx;
1494
1495 mn = s->min_dirty_data_idx;
1496 mx = s->max_dirty_data_idx;
1497 if (mn <= mx) {
1498 /* we need to update the vdi object. */
1499 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1500 mn * sizeof(s->inode.data_vdi_id[0]);
1501 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1502
1503 s->min_dirty_data_idx = UINT32_MAX;
1504 s->max_dirty_data_idx = 0;
1505
1506 iov.iov_base = &s->inode;
1507 iov.iov_len = sizeof(s->inode);
1508 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1509 data_len, offset, 0, 0, offset);
c292ee6a 1510 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
2f536801 1511 ret = add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA);
33b1db1c
MK
1512 if (ret) {
1513 free_aio_req(s, aio_req);
1514 acb->ret = -EIO;
1515 goto out;
1516 }
1517
1518 acb->aio_done_func = sd_finish_aiocb;
1519 acb->aiocb_type = AIOCB_WRITE_UDATA;
1520 return;
1521 }
1522out:
1523 sd_finish_aiocb(acb);
1524}
1525
1526/*
1527 * Create a writable VDI from a snapshot
1528 */
1529static int sd_create_branch(BDRVSheepdogState *s)
1530{
1531 int ret, fd;
1532 uint32_t vid;
1533 char *buf;
1534
1535 dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
1536
7267c094 1537 buf = g_malloc(SD_INODE_SIZE);
33b1db1c
MK
1538
1539 ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
1540 s->addr, s->port);
1541 if (ret) {
1542 goto out;
1543 }
1544
1545 dprintf("%" PRIx32 " is created.\n", vid);
1546
1547 fd = connect_to_sdog(s->addr, s->port);
1548 if (fd < 0) {
6daf194d 1549 error_report("failed to connect");
cb595887 1550 ret = fd;
33b1db1c
MK
1551 goto out;
1552 }
1553
1554 ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
0e7106d8 1555 SD_INODE_SIZE, 0, s->cache_flags);
33b1db1c
MK
1556
1557 closesocket(fd);
1558
1559 if (ret < 0) {
1560 goto out;
1561 }
1562
1563 memcpy(&s->inode, buf, sizeof(s->inode));
1564
2f536801 1565 s->is_snapshot = false;
33b1db1c
MK
1566 ret = 0;
1567 dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
1568
1569out:
7267c094 1570 g_free(buf);
33b1db1c
MK
1571
1572 return ret;
1573}
1574
1575/*
1576 * Send I/O requests to the server.
1577 *
1578 * This function sends requests to the server, links the requests to
c292ee6a 1579 * the inflight_list in BDRVSheepdogState, and exits without
33b1db1c
MK
1580 * waiting the response. The responses are received in the
1581 * `aio_read_response' function which is called from the main loop as
1582 * a fd handler.
2df46246
MK
1583 *
1584 * Returns 1 when we need to wait a response, 0 when there is no sent
1585 * request and -errno in error cases.
33b1db1c 1586 */
d8716b41 1587static int coroutine_fn sd_co_rw_vector(void *p)
33b1db1c
MK
1588{
1589 SheepdogAIOCB *acb = p;
1590 int ret = 0;
1591 unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
1592 unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
1593 uint64_t oid;
1594 uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
1595 BDRVSheepdogState *s = acb->common.bs->opaque;
1596 SheepdogInode *inode = &s->inode;
1597 AIOReq *aio_req;
1598
33b1db1c
MK
1599 if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
1600 /*
1601 * In the case we open the snapshot VDI, Sheepdog creates the
1602 * writable VDI when we do a write operation first.
1603 */
1604 ret = sd_create_branch(s);
1605 if (ret) {
1606 acb->ret = -EIO;
1607 goto out;
1608 }
1609 }
1610
1d732d7d
MK
1611 /*
1612 * Make sure we don't free the aiocb before we are done with all requests.
1613 * This additional reference is dropped at the end of this function.
1614 */
1615 acb->nr_pending++;
1616
33b1db1c
MK
1617 while (done != total) {
1618 uint8_t flags = 0;
1619 uint64_t old_oid = 0;
2f536801 1620 bool create = false;
33b1db1c
MK
1621
1622 oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
1623
1624 len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
1625
19db9b90
CH
1626 switch (acb->aiocb_type) {
1627 case AIOCB_READ_UDATA:
1628 if (!inode->data_vdi_id[idx]) {
1629 qemu_iovec_memset(acb->qiov, done, 0, len);
33b1db1c
MK
1630 goto done;
1631 }
19db9b90
CH
1632 break;
1633 case AIOCB_WRITE_UDATA:
1634 if (!inode->data_vdi_id[idx]) {
2f536801 1635 create = true;
19db9b90
CH
1636 } else if (!is_data_obj_writable(inode, idx)) {
1637 /* Copy-On-Write */
2f536801 1638 create = true;
19db9b90
CH
1639 old_oid = oid;
1640 flags = SD_FLAG_CMD_COW;
1641 }
1642 break;
1643 default:
1644 break;
33b1db1c
MK
1645 }
1646
1647 if (create) {
1b6ac998
MK
1648 dprintf("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
1649 inode->vdi_id, oid,
33b1db1c
MK
1650 vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
1651 oid = vid_to_data_oid(inode->vdi_id, idx);
1b6ac998 1652 dprintf("new oid %" PRIx64 "\n", oid);
33b1db1c
MK
1653 }
1654
1655 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
1656
1657 if (create) {
1658 AIOReq *areq;
c292ee6a 1659 QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
33b1db1c
MK
1660 if (areq->oid == oid) {
1661 /*
1662 * Sheepdog cannot handle simultaneous create
1663 * requests to the same object. So we cannot send
1664 * the request until the previous request
1665 * finishes.
1666 */
1667 aio_req->flags = 0;
1668 aio_req->base_oid = 0;
c292ee6a
MK
1669 QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req,
1670 aio_siblings);
33b1db1c
MK
1671 goto done;
1672 }
1673 }
1674 }
1675
c292ee6a 1676 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
33b1db1c
MK
1677 ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1678 create, acb->aiocb_type);
1679 if (ret < 0) {
6daf194d 1680 error_report("add_aio_request is failed");
33b1db1c
MK
1681 free_aio_req(s, aio_req);
1682 acb->ret = -EIO;
1683 goto out;
1684 }
1685 done:
1686 offset = 0;
1687 idx++;
1688 done += len;
1689 }
1690out:
1d732d7d 1691 if (!--acb->nr_pending) {
2df46246 1692 return acb->ret;
33b1db1c 1693 }
2df46246 1694 return 1;
33b1db1c
MK
1695}
1696
a968168c 1697static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
2df46246 1698 int nb_sectors, QEMUIOVector *qiov)
33b1db1c
MK
1699{
1700 SheepdogAIOCB *acb;
2df46246 1701 int ret;
33b1db1c
MK
1702
1703 if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
cb595887
MK
1704 ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE);
1705 if (ret < 0) {
1706 return ret;
33b1db1c
MK
1707 }
1708 bs->total_sectors = sector_num + nb_sectors;
1709 }
1710
f700f8e3 1711 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
33b1db1c
MK
1712 acb->aio_done_func = sd_write_done;
1713 acb->aiocb_type = AIOCB_WRITE_UDATA;
1714
2df46246
MK
1715 ret = sd_co_rw_vector(acb);
1716 if (ret <= 0) {
1717 qemu_aio_release(acb);
1718 return ret;
1719 }
1720
1721 qemu_coroutine_yield();
1722
1723 return acb->ret;
33b1db1c
MK
1724}
1725
a968168c 1726static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
2df46246 1727 int nb_sectors, QEMUIOVector *qiov)
33b1db1c
MK
1728{
1729 SheepdogAIOCB *acb;
19db9b90 1730 int ret;
33b1db1c 1731
f700f8e3 1732 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
33b1db1c
MK
1733 acb->aiocb_type = AIOCB_READ_UDATA;
1734 acb->aio_done_func = sd_finish_aiocb;
1735
2df46246
MK
1736 ret = sd_co_rw_vector(acb);
1737 if (ret <= 0) {
1738 qemu_aio_release(acb);
1739 return ret;
1740 }
1741
1742 qemu_coroutine_yield();
1743
1744 return acb->ret;
33b1db1c
MK
1745}
1746
47622c44
LY
1747static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
1748{
1749 BDRVSheepdogState *s = bs->opaque;
47783072
LY
1750 SheepdogAIOCB *acb;
1751 AIOReq *aio_req;
47622c44 1752 int ret;
47622c44 1753
0e7106d8 1754 if (s->cache_flags != SD_FLAG_CMD_CACHE) {
47622c44
LY
1755 return 0;
1756 }
1757
f700f8e3 1758 acb = sd_aio_setup(bs, NULL, 0, 0);
47783072
LY
1759 acb->aiocb_type = AIOCB_FLUSH_CACHE;
1760 acb->aio_done_func = sd_finish_aiocb;
47622c44 1761
47783072
LY
1762 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1763 0, 0, 0, 0, 0);
1764 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
1765 ret = add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type);
1766 if (ret < 0) {
1767 error_report("add_aio_request is failed");
1768 free_aio_req(s, aio_req);
1769 qemu_aio_release(acb);
47622c44
LY
1770 return ret;
1771 }
1772
47783072
LY
1773 qemu_coroutine_yield();
1774 return acb->ret;
47622c44
LY
1775}
1776
33b1db1c
MK
1777static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
1778{
1779 BDRVSheepdogState *s = bs->opaque;
1780 int ret, fd;
1781 uint32_t new_vid;
1782 SheepdogInode *inode;
1783 unsigned int datalen;
1784
1b6ac998 1785 dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
33b1db1c
MK
1786 "is_snapshot %d\n", sn_info->name, sn_info->id_str,
1787 s->name, sn_info->vm_state_size, s->is_snapshot);
1788
1789 if (s->is_snapshot) {
1790 error_report("You can't create a snapshot of a snapshot VDI, "
6daf194d 1791 "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);
33b1db1c
MK
1792
1793 return -EINVAL;
1794 }
1795
1796 dprintf("%s %s\n", sn_info->name, sn_info->id_str);
1797
1798 s->inode.vm_state_size = sn_info->vm_state_size;
1799 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
3178e275
JM
1800 /* It appears that inode.tag does not require a NUL terminator,
1801 * which means this use of strncpy is ok.
1802 */
33b1db1c
MK
1803 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
1804 /* we don't need to update entire object */
1805 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1806
1807 /* refresh inode. */
1808 fd = connect_to_sdog(s->addr, s->port);
1809 if (fd < 0) {
cb595887 1810 ret = fd;
33b1db1c
MK
1811 goto cleanup;
1812 }
1813
1814 ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
0e7106d8 1815 s->inode.nr_copies, datalen, 0, false, s->cache_flags);
33b1db1c 1816 if (ret < 0) {
6daf194d 1817 error_report("failed to write snapshot's inode.");
33b1db1c
MK
1818 goto cleanup;
1819 }
1820
1821 ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
1822 s->addr, s->port);
1823 if (ret < 0) {
6daf194d 1824 error_report("failed to create inode for snapshot. %s",
33b1db1c 1825 strerror(errno));
33b1db1c
MK
1826 goto cleanup;
1827 }
1828
7267c094 1829 inode = (SheepdogInode *)g_malloc(datalen);
33b1db1c
MK
1830
1831 ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
0e7106d8 1832 s->inode.nr_copies, datalen, 0, s->cache_flags);
33b1db1c
MK
1833
1834 if (ret < 0) {
6daf194d 1835 error_report("failed to read new inode info. %s", strerror(errno));
33b1db1c
MK
1836 goto cleanup;
1837 }
1838
1839 memcpy(&s->inode, inode, datalen);
1840 dprintf("s->inode: name %s snap_id %x oid %x\n",
1841 s->inode.name, s->inode.snap_id, s->inode.vdi_id);
1842
1843cleanup:
1844 closesocket(fd);
1845 return ret;
1846}
1847
1848static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
1849{
1850 BDRVSheepdogState *s = bs->opaque;
1851 BDRVSheepdogState *old_s;
1852 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1853 char *buf = NULL;
1854 uint32_t vid;
1855 uint32_t snapid = 0;
cb595887 1856 int ret = 0, fd;
33b1db1c 1857
7267c094 1858 old_s = g_malloc(sizeof(BDRVSheepdogState));
33b1db1c
MK
1859
1860 memcpy(old_s, s, sizeof(BDRVSheepdogState));
1861
3178e275 1862 pstrcpy(vdi, sizeof(vdi), s->name);
33b1db1c 1863
33b1db1c 1864 snapid = strtoul(snapshot_id, NULL, 10);
3178e275
JM
1865 if (snapid) {
1866 tag[0] = 0;
1867 } else {
1868 pstrcpy(tag, sizeof(tag), s->name);
33b1db1c
MK
1869 }
1870
1871 ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
1872 if (ret) {
6daf194d 1873 error_report("Failed to find_vdi_name");
33b1db1c
MK
1874 goto out;
1875 }
1876
1877 fd = connect_to_sdog(s->addr, s->port);
1878 if (fd < 0) {
6daf194d 1879 error_report("failed to connect");
cb595887 1880 ret = fd;
33b1db1c
MK
1881 goto out;
1882 }
1883
7267c094 1884 buf = g_malloc(SD_INODE_SIZE);
33b1db1c 1885 ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
0e7106d8 1886 SD_INODE_SIZE, 0, s->cache_flags);
33b1db1c
MK
1887
1888 closesocket(fd);
1889
1890 if (ret) {
33b1db1c
MK
1891 goto out;
1892 }
1893
1894 memcpy(&s->inode, buf, sizeof(s->inode));
1895
1896 if (!s->inode.vm_state_size) {
6daf194d 1897 error_report("Invalid snapshot");
33b1db1c
MK
1898 ret = -ENOENT;
1899 goto out;
1900 }
1901
2f536801 1902 s->is_snapshot = true;
33b1db1c 1903
7267c094
AL
1904 g_free(buf);
1905 g_free(old_s);
33b1db1c
MK
1906
1907 return 0;
1908out:
1909 /* recover bdrv_sd_state */
1910 memcpy(s, old_s, sizeof(BDRVSheepdogState));
7267c094
AL
1911 g_free(buf);
1912 g_free(old_s);
33b1db1c 1913
6daf194d 1914 error_report("failed to open. recover old bdrv_sd_state.");
33b1db1c
MK
1915
1916 return ret;
1917}
1918
1919static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1920{
1921 /* FIXME: Delete specified snapshot id. */
1922 return 0;
1923}
1924
33b1db1c
MK
1925static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
1926{
1927 BDRVSheepdogState *s = bs->opaque;
1928 SheepdogReq req;
1929 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
1930 QEMUSnapshotInfo *sn_tab = NULL;
1931 unsigned wlen, rlen;
1932 int found = 0;
1933 static SheepdogInode inode;
1934 unsigned long *vdi_inuse;
1935 unsigned int start_nr;
1936 uint64_t hval;
1937 uint32_t vid;
1938
7267c094 1939 vdi_inuse = g_malloc(max);
33b1db1c
MK
1940
1941 fd = connect_to_sdog(s->addr, s->port);
1942 if (fd < 0) {
cb595887 1943 ret = fd;
33b1db1c
MK
1944 goto out;
1945 }
1946
1947 rlen = max;
1948 wlen = 0;
1949
1950 memset(&req, 0, sizeof(req));
1951
1952 req.opcode = SD_OP_READ_VDIS;
1953 req.data_length = max;
1954
1955 ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen);
1956
1957 closesocket(fd);
1958 if (ret) {
1959 goto out;
1960 }
1961
7267c094 1962 sn_tab = g_malloc0(nr * sizeof(*sn_tab));
33b1db1c
MK
1963
1964 /* calculate a vdi id with hash function */
1965 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
1966 start_nr = hval & (SD_NR_VDIS - 1);
1967
1968 fd = connect_to_sdog(s->addr, s->port);
1969 if (fd < 0) {
6daf194d 1970 error_report("failed to connect");
cb595887 1971 ret = fd;
33b1db1c
MK
1972 goto out;
1973 }
1974
1975 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
1976 if (!test_bit(vid, vdi_inuse)) {
1977 break;
1978 }
1979
1980 /* we don't need to read entire object */
1981 ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
47622c44 1982 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
0e7106d8 1983 s->cache_flags);
33b1db1c
MK
1984
1985 if (ret) {
1986 continue;
1987 }
1988
1989 if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
1990 sn_tab[found].date_sec = inode.snap_ctime >> 32;
1991 sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
1992 sn_tab[found].vm_state_size = inode.vm_state_size;
1993 sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
1994
1995 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
1996 inode.snap_id);
3178e275
JM
1997 pstrcpy(sn_tab[found].name,
1998 MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
1999 inode.tag);
33b1db1c
MK
2000 found++;
2001 }
2002 }
2003
2004 closesocket(fd);
2005out:
2006 *psn_tab = sn_tab;
2007
7267c094 2008 g_free(vdi_inuse);
33b1db1c 2009
cb595887
MK
2010 if (ret < 0) {
2011 return ret;
2012 }
2013
33b1db1c
MK
2014 return found;
2015}
2016
2017static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
2018 int64_t pos, int size, int load)
2019{
2f536801
MK
2020 bool create;
2021 int fd, ret = 0, remaining = size;
33b1db1c
MK
2022 unsigned int data_len;
2023 uint64_t vmstate_oid;
2024 uint32_t vdi_index;
2025 uint64_t offset;
2026
2027 fd = connect_to_sdog(s->addr, s->port);
2028 if (fd < 0) {
cb595887 2029 return fd;
33b1db1c
MK
2030 }
2031
6f3c714e 2032 while (remaining) {
33b1db1c
MK
2033 vdi_index = pos / SD_DATA_OBJ_SIZE;
2034 offset = pos % SD_DATA_OBJ_SIZE;
2035
1f7a48de 2036 data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
33b1db1c
MK
2037
2038 vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
2039
2040 create = (offset == 0);
2041 if (load) {
2042 ret = read_object(fd, (char *)data, vmstate_oid,
47622c44 2043 s->inode.nr_copies, data_len, offset,
0e7106d8 2044 s->cache_flags);
33b1db1c
MK
2045 } else {
2046 ret = write_object(fd, (char *)data, vmstate_oid,
47622c44 2047 s->inode.nr_copies, data_len, offset, create,
0e7106d8 2048 s->cache_flags);
33b1db1c
MK
2049 }
2050
2051 if (ret < 0) {
6daf194d 2052 error_report("failed to save vmstate %s", strerror(errno));
33b1db1c
MK
2053 goto cleanup;
2054 }
2055
2056 pos += data_len;
1f7a48de 2057 data += data_len;
6f3c714e 2058 remaining -= data_len;
33b1db1c 2059 }
6f3c714e 2060 ret = size;
33b1db1c
MK
2061cleanup:
2062 closesocket(fd);
2063 return ret;
2064}
2065
2066static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
2067 int64_t pos, int size)
2068{
2069 BDRVSheepdogState *s = bs->opaque;
2070
2071 return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
2072}
2073
2074static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
2075 int64_t pos, int size)
2076{
2077 BDRVSheepdogState *s = bs->opaque;
2078
2079 return do_load_save_vmstate(s, data, pos, size, 1);
2080}
2081
2082
2083static QEMUOptionParameter sd_create_options[] = {
2084 {
2085 .name = BLOCK_OPT_SIZE,
2086 .type = OPT_SIZE,
2087 .help = "Virtual disk size"
2088 },
2089 {
2090 .name = BLOCK_OPT_BACKING_FILE,
2091 .type = OPT_STRING,
2092 .help = "File name of a base image"
2093 },
a8e0fdd7
MK
2094 {
2095 .name = BLOCK_OPT_PREALLOC,
2096 .type = OPT_STRING,
2097 .help = "Preallocation mode (allowed values: off, full)"
2098 },
33b1db1c
MK
2099 { NULL }
2100};
2101
5d6768e3 2102static BlockDriver bdrv_sheepdog = {
33b1db1c
MK
2103 .format_name = "sheepdog",
2104 .protocol_name = "sheepdog",
2105 .instance_size = sizeof(BDRVSheepdogState),
2106 .bdrv_file_open = sd_open,
2107 .bdrv_close = sd_close,
2108 .bdrv_create = sd_create,
2109 .bdrv_getlength = sd_getlength,
2110 .bdrv_truncate = sd_truncate,
2111
2df46246
MK
2112 .bdrv_co_readv = sd_co_readv,
2113 .bdrv_co_writev = sd_co_writev,
47622c44 2114 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
33b1db1c
MK
2115
2116 .bdrv_snapshot_create = sd_snapshot_create,
2117 .bdrv_snapshot_goto = sd_snapshot_goto,
2118 .bdrv_snapshot_delete = sd_snapshot_delete,
2119 .bdrv_snapshot_list = sd_snapshot_list,
2120
2121 .bdrv_save_vmstate = sd_save_vmstate,
2122 .bdrv_load_vmstate = sd_load_vmstate,
2123
2124 .create_options = sd_create_options,
2125};
2126
5d6768e3
MK
2127static BlockDriver bdrv_sheepdog_tcp = {
2128 .format_name = "sheepdog",
2129 .protocol_name = "sheepdog+tcp",
2130 .instance_size = sizeof(BDRVSheepdogState),
2131 .bdrv_file_open = sd_open,
2132 .bdrv_close = sd_close,
2133 .bdrv_create = sd_create,
2134 .bdrv_getlength = sd_getlength,
2135 .bdrv_truncate = sd_truncate,
2136
2137 .bdrv_co_readv = sd_co_readv,
2138 .bdrv_co_writev = sd_co_writev,
2139 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
2140
2141 .bdrv_snapshot_create = sd_snapshot_create,
2142 .bdrv_snapshot_goto = sd_snapshot_goto,
2143 .bdrv_snapshot_delete = sd_snapshot_delete,
2144 .bdrv_snapshot_list = sd_snapshot_list,
2145
2146 .bdrv_save_vmstate = sd_save_vmstate,
2147 .bdrv_load_vmstate = sd_load_vmstate,
2148
2149 .create_options = sd_create_options,
2150};
2151
33b1db1c
MK
2152static void bdrv_sheepdog_init(void)
2153{
2154 bdrv_register(&bdrv_sheepdog);
5d6768e3 2155 bdrv_register(&bdrv_sheepdog_tcp);
33b1db1c
MK
2156}
2157block_init(bdrv_sheepdog_init);