/*
 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "qemu/uri.h"
#include "qemu/error-report.h"
#include "qemu/sockets.h"
#include "block/block_int.h"
#include "qemu/bitops.h"

#define SD_PROTO_VER 0x01

#define SD_DEFAULT_ADDR "localhost"
#define SD_DEFAULT_PORT 7000

#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ       0x02
#define SD_OP_WRITE_OBJ      0x03
/* 0x04 is used internally by Sheepdog */
#define SD_OP_DISCARD_OBJ    0x05

#define SD_OP_NEW_VDI        0x11
#define SD_OP_LOCK_VDI       0x12
#define SD_OP_RELEASE_VDI    0x13
#define SD_OP_GET_VDI_INFO   0x14
#define SD_OP_READ_VDIS      0x15
#define SD_OP_FLUSH_VDI      0x16
#define SD_OP_DEL_VDI        0x17
#define SD_OP_GET_CLUSTER_DEFAULT   0x18

#define SD_FLAG_CMD_WRITE    0x01
#define SD_FLAG_CMD_COW      0x02
#define SD_FLAG_CMD_CACHE    0x04 /* Writeback mode for cache */
#define SD_FLAG_CMD_DIRECT   0x08 /* Don't use cache */

#define SD_RES_SUCCESS       0x00 /* Success */
#define SD_RES_UNKNOWN       0x01 /* Unknown error */
#define SD_RES_NO_OBJ        0x02 /* No object found */
#define SD_RES_EIO           0x03 /* I/O error */
#define SD_RES_VDI_EXIST     0x04 /* Vdi exists already */
#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
#define SD_RES_SYSTEM_ERROR  0x06 /* System error */
#define SD_RES_VDI_LOCKED    0x07 /* Vdi is locked */
#define SD_RES_NO_VDI        0x08 /* No vdi found */
#define SD_RES_NO_BASE_VDI   0x09 /* No base vdi found */
#define SD_RES_VDI_READ      0x0A /* Cannot read requested vdi */
#define SD_RES_VDI_WRITE     0x0B /* Cannot write requested vdi */
#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base vdi */
#define SD_RES_NO_TAG        0x0E /* Requested tag is not found */
#define SD_RES_STARTUP       0x0F /* Sheepdog is starting up */
#define SD_RES_VDI_NOT_LOCKED   0x10 /* Vdi is not locked */
#define SD_RES_SHUTDOWN      0x11 /* Sheepdog is shutting down */
#define SD_RES_NO_MEM        0x12 /* Cannot allocate memory */
#define SD_RES_FULL_VDI      0x13 /* We already have the maximum number of vdis */
#define SD_RES_VER_MISMATCH  0x14 /* Protocol version mismatch */
#define SD_RES_NO_SPACE      0x15 /* Server has no room for new objects */
#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN    0x17 /* Waiting for other nodes joining */
#define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
#define SD_RES_HALT          0x19 /* Sheepdog has stopped serving I/O requests */
#define SD_RES_READONLY      0x1A /* Object is read-only */

/*
 * Object ID rules
 *
 *  0 - 19 (20 bits): data object space
 * 20 - 31 (12 bits): reserved data object space
 * 32 - 55 (24 bits): vdi object space
 * 56 - 59 ( 4 bits): reserved vdi object space
 * 60 - 63 ( 4 bits): object type identifier space
 */

#define VDI_SPACE_SHIFT   32
#define VDI_BIT (UINT64_C(1) << 63)
#define VMSTATE_BIT (UINT64_C(1) << 62)
#define MAX_DATA_OBJS (UINT64_C(1) << 20)
#define MAX_CHILDREN 1024
#define SD_MAX_VDI_LEN 256
#define SD_MAX_VDI_TAG_LEN 256
#define SD_NR_VDIS   (1U << 24)
#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22
/*
 * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
 * (SD_EC_MAX_STRIP - 1) for parity strips
 *
 * SD_MAX_COPIES is the sum of the number of data strips and parity strips.
 */
#define SD_EC_MAX_STRIP 16
#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1)

#define SD_INODE_SIZE (sizeof(SheepdogInode))
#define CURRENT_VDI_ID 0

#define LOCK_TYPE_NORMAL 0
#define LOCK_TYPE_SHARED 1  /* for iSCSI multipath */

typedef struct SheepdogReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t opcode_specific[8];
} SheepdogReq;

typedef struct SheepdogRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;
    uint32_t opcode_specific[7];
} SheepdogRsp;

typedef struct SheepdogObjReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t oid;
    uint64_t cow_oid;
    uint8_t copies;
    uint8_t copy_policy;
    uint8_t reserved[6];
    uint64_t offset;
} SheepdogObjReq;

typedef struct SheepdogObjRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;
    uint8_t copies;
    uint8_t copy_policy;
    uint8_t reserved[2];
    uint32_t pad[6];
} SheepdogObjRsp;

typedef struct SheepdogVdiReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t vdi_size;
    uint32_t base_vdi_id;
    uint8_t copies;
    uint8_t copy_policy;
    uint8_t store_policy;
    uint8_t block_size_shift;
    uint32_t snapid;
    uint32_t type;
    uint32_t pad[2];
} SheepdogVdiReq;

typedef struct SheepdogVdiRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;
    uint32_t rsvd;
    uint32_t vdi_id;
    uint32_t pad[5];
} SheepdogVdiRsp;

typedef struct SheepdogClusterRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;
    uint8_t nr_copies;
    uint8_t copy_policy;
    uint8_t block_size_shift;
    uint8_t __pad1;
    uint32_t __pad2[6];
} SheepdogClusterRsp;

typedef struct SheepdogInode {
    char name[SD_MAX_VDI_LEN];
    char tag[SD_MAX_VDI_TAG_LEN];
    uint64_t ctime;
    uint64_t snap_ctime;
    uint64_t vm_clock_nsec;
    uint64_t vdi_size;
    uint64_t vm_state_size;
    uint16_t copy_policy;
    uint8_t nr_copies;
    uint8_t block_size_shift;
    uint32_t snap_id;
    uint32_t vdi_id;
    uint32_t parent_vdi_id;
    uint32_t child_vdi_id[MAX_CHILDREN];
    uint32_t data_vdi_id[MAX_DATA_OBJS];
} SheepdogInode;

#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id)

/*
 * 64 bit FNV-1a non-zero initial basis
 */
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)

/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    unsigned char *bp = buf;
    unsigned char *be = bp + len;
    while (bp < be) {
        hval ^= (uint64_t) *bp++;
        hval += (hval << 1) + (hval << 4) + (hval << 5) +
            (hval << 7) + (hval << 8) + (hval << 40);
    }
    return hval;
}
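
/*
 * Illustrative note: the shift-and-add chain above is an expanded form of
 * multiplying by the 64-bit FNV prime, i.e. it is equivalent to
 *
 *     hval *= UINT64_C(0x100000001b3);    (the prime is 2^40 + 0x1b3)
 *
 * written with shifts instead of an explicit 64-bit multiply.
 */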

static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
{
    return inode->vdi_id == inode->data_vdi_id[idx];
}

static inline bool is_data_obj(uint64_t oid)
{
    return !(VDI_BIT & oid);
}

static inline uint64_t data_oid_to_idx(uint64_t oid)
{
    return oid & (MAX_DATA_OBJS - 1);
}

static inline uint32_t oid_to_vid(uint64_t oid)
{
    return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
}

static inline uint64_t vid_to_vdi_oid(uint32_t vid)
{
    return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
}

static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
{
    return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
}

static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
{
    return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
}
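
/*
 * Worked example (illustrative, the vid/idx values are made up): with
 * vid = 0x00fd32fc and idx = 12,
 *
 *     vid_to_data_oid(vid, 12) -> 0x00fd32fc0000000c   (data object)
 *     vid_to_vdi_oid(vid)      -> 0x80fd32fc00000000   (vdi/inode object)
 *
 * data_oid_to_idx() and oid_to_vid() undo these encodings, and
 * is_data_obj() tests bit 63 (VDI_BIT) to tell the two kinds apart.
 */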

static inline bool is_snapshot(struct SheepdogInode *inode)
{
    return !!inode->snap_ctime;
}

#undef DPRINTF
#ifdef DEBUG_SDOG
#define DPRINTF(fmt, args...)                                       \
    do {                                                            \
        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
    } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

typedef struct SheepdogAIOCB SheepdogAIOCB;

typedef struct AIOReq {
    SheepdogAIOCB *aiocb;
    unsigned int iov_offset;

    uint64_t oid;
    uint64_t base_oid;
    uint64_t offset;
    unsigned int data_len;
    uint8_t flags;
    uint32_t id;
    bool create;

    QLIST_ENTRY(AIOReq) aio_siblings;
} AIOReq;

enum AIOCBState {
    AIOCB_WRITE_UDATA,
    AIOCB_READ_UDATA,
    AIOCB_FLUSH_CACHE,
    AIOCB_DISCARD_OBJ,
};

#define AIOCBOverlapping(x, y)                                 \
    (!(x->max_affect_data_idx < y->min_affect_data_idx        \
       || y->max_affect_data_idx < x->min_affect_data_idx))

struct SheepdogAIOCB {
    BlockAIOCB common;

    QEMUIOVector *qiov;

    int64_t sector_num;
    int nb_sectors;

    int ret;
    enum AIOCBState aiocb_type;

    Coroutine *coroutine;
    void (*aio_done_func)(SheepdogAIOCB *);

    bool cancelable;
    int nr_pending;

    uint32_t min_affect_data_idx;
    uint32_t max_affect_data_idx;

    /*
     * The difference between affect_data_idx and dirty_data_idx:
     * affect_data_idx represents the index range touched by all request
     * types, while dirty_data_idx represents the index range updated by
     * COW requests.  dirty_data_idx is used for updating an inode object.
     */
    uint32_t min_dirty_data_idx;
    uint32_t max_dirty_data_idx;

    QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
};

typedef struct BDRVSheepdogState {
    BlockDriverState *bs;
    AioContext *aio_context;

    SheepdogInode inode;

    char name[SD_MAX_VDI_LEN];
    bool is_snapshot;
    uint32_t cache_flags;
    bool discard_supported;

    char *host_spec;
    bool is_unix;
    int fd;

    CoMutex lock;
    Coroutine *co_send;
    Coroutine *co_recv;

    uint32_t aioreq_seq_num;

    /* Every aio request must be linked to either of these queues. */
    QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
    QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;

    CoQueue overlapping_queue;
    QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
} BDRVSheepdogState;

typedef struct BDRVSheepdogReopenState {
    int fd;
    int cache_flags;
} BDRVSheepdogReopenState;

static const char * sd_strerror(int err)
{
    int i;

    static const struct {
        int err;
        const char *desc;
    } errors[] = {
        {SD_RES_SUCCESS, "Success"},
        {SD_RES_UNKNOWN, "Unknown error"},
        {SD_RES_NO_OBJ, "No object found"},
        {SD_RES_EIO, "I/O error"},
        {SD_RES_VDI_EXIST, "VDI exists already"},
        {SD_RES_INVALID_PARMS, "Invalid parameters"},
        {SD_RES_SYSTEM_ERROR, "System error"},
        {SD_RES_VDI_LOCKED, "VDI is already locked"},
        {SD_RES_NO_VDI, "No vdi found"},
        {SD_RES_NO_BASE_VDI, "No base VDI found"},
        {SD_RES_VDI_READ, "Failed read the requested VDI"},
        {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
        {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
        {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
        {SD_RES_NO_TAG, "Failed to find the requested tag"},
        {SD_RES_STARTUP, "The system is still booting"},
        {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
        {SD_RES_SHUTDOWN, "The system is shutting down"},
        {SD_RES_NO_MEM, "Out of memory on the server"},
        {SD_RES_FULL_VDI, "We already have the maximum vdis"},
        {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
        {SD_RES_NO_SPACE, "Server has no space for new objects"},
        {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
        {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
        {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
        {SD_RES_HALT, "Sheepdog is stopped serving IO request"},
        {SD_RES_READONLY, "Object is read-only"},
    };

    for (i = 0; i < ARRAY_SIZE(errors); ++i) {
        if (errors[i].err == err) {
            return errors[i].desc;
        }
    }

    return "Invalid error code";
}

/*
 * Sheepdog I/O handling:
 *
 * 1. In sd_co_rw_vector, we send the I/O requests to the server and
 *    link the requests to the inflight_list in the
 *    BDRVSheepdogState.  The function exits without waiting for
 *    receiving the response.
 *
 * 2. We receive the response in aio_read_response, the fd handler to
 *    the sheepdog connection.  If a metadata update is needed, we send
 *    the write request to the vdi object in sd_write_done, the write
 *    completion function.  We switch back to sd_co_readv/writev after
 *    all the requests belonging to the AIOCB are finished.
 */

static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
                                    uint64_t oid, unsigned int data_len,
                                    uint64_t offset, uint8_t flags, bool create,
                                    uint64_t base_oid, unsigned int iov_offset)
{
    AIOReq *aio_req;

    aio_req = g_malloc(sizeof(*aio_req));
    aio_req->aiocb = acb;
    aio_req->iov_offset = iov_offset;
    aio_req->oid = oid;
    aio_req->base_oid = base_oid;
    aio_req->offset = offset;
    aio_req->data_len = data_len;
    aio_req->flags = flags;
    aio_req->id = s->aioreq_seq_num++;
    aio_req->create = create;

    acb->nr_pending++;
    return aio_req;
}

static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
{
    SheepdogAIOCB *acb = aio_req->aiocb;

    acb->cancelable = false;
    QLIST_REMOVE(aio_req, aio_siblings);
    g_free(aio_req);

    acb->nr_pending--;
}

static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
{
    qemu_coroutine_enter(acb->coroutine, NULL);
    qemu_aio_unref(acb);
}

/*
 * Check whether the specified acb can be canceled
 *
 * We can cancel aio only when every request belonging to the acb is:
 * - Not processed by the sheepdog server.
 * - Not linked to the inflight queue.
 */
static bool sd_acb_cancelable(const SheepdogAIOCB *acb)
{
    BDRVSheepdogState *s = acb->common.bs->opaque;
    AIOReq *aioreq;

    if (!acb->cancelable) {
        return false;
    }

    QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) {
        if (aioreq->aiocb == acb) {
            return false;
        }
    }

    return true;
}

static void sd_aio_cancel(BlockAIOCB *blockacb)
{
    SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
    BDRVSheepdogState *s = acb->common.bs->opaque;
    AIOReq *aioreq, *next;

    if (sd_acb_cancelable(acb)) {
        /* Remove outstanding requests from the failed queue.  */
        QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
                           next) {
            if (aioreq->aiocb == acb) {
                free_aio_req(s, aioreq);
            }
        }

        assert(acb->nr_pending == 0);
        if (acb->common.cb) {
            acb->common.cb(acb->common.opaque, -ECANCELED);
        }
        sd_finish_aiocb(acb);
    }
}

static const AIOCBInfo sd_aiocb_info = {
    .aiocb_size   = sizeof(SheepdogAIOCB),
    .cancel_async = sd_aio_cancel,
};

static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
                                   int64_t sector_num, int nb_sectors)
{
    SheepdogAIOCB *acb;
    uint32_t object_size;
    BDRVSheepdogState *s = bs->opaque;

    object_size = (UINT32_C(1) << s->inode.block_size_shift);

    acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);

    acb->qiov = qiov;

    acb->sector_num = sector_num;
    acb->nb_sectors = nb_sectors;

    acb->aio_done_func = NULL;
    acb->cancelable = true;
    acb->coroutine = qemu_coroutine_self();
    acb->ret = 0;
    acb->nr_pending = 0;

    acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
    acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
                              acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;

    acb->min_dirty_data_idx = UINT32_MAX;
    acb->max_dirty_data_idx = 0;

    return acb;
}
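
/*
 * Worked example (illustrative, the numbers are made up): with 4 MB objects
 * (block_size_shift == 22), a request starting at sector_num 10240
 * (byte offset 5 MB) with nb_sectors 8192 (4 MB of data) gets
 *
 *     min_affect_data_idx = 5 MB / 4 MB = 1
 *     max_affect_data_idx = 9 MB / 4 MB = 2
 *
 * i.e. it touches data objects 1 and 2; AIOCBOverlapping() reports two
 * AIOCBs as overlapping exactly when such index ranges intersect.
 */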

/* Return -EIO in case of error, file descriptor on success */
static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
{
    int fd;

    if (s->is_unix) {
        fd = unix_connect(s->host_spec, errp);
    } else {
        fd = inet_connect(s->host_spec, errp);

        if (fd >= 0) {
            int ret = socket_set_nodelay(fd);
            if (ret < 0) {
                error_report("%s", strerror(errno));
            }
        }
    }

    if (fd >= 0) {
        qemu_set_nonblock(fd);
    } else {
        fd = -EIO;
    }

    return fd;
}

/* Return 0 on success and -errno in case of error */
static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
                                    unsigned int *wlen)
{
    int ret;

    ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
    if (ret != sizeof(*hdr)) {
        error_report("failed to send a req, %s", strerror(errno));
        ret = -socket_error();
        return ret;
    }

    ret = qemu_co_send(sockfd, data, *wlen);
    if (ret != *wlen) {
        ret = -socket_error();
        error_report("failed to send a req, %s", strerror(errno));
    }

    return ret;
}

static void restart_co_req(void *opaque)
{
    Coroutine *co = opaque;

    qemu_coroutine_enter(co, NULL);
}

typedef struct SheepdogReqCo {
    int sockfd;
    AioContext *aio_context;
    SheepdogReq *hdr;
    void *data;
    unsigned int *wlen;
    unsigned int *rlen;
    int ret;
    bool finished;
} SheepdogReqCo;

static coroutine_fn void do_co_req(void *opaque)
{
    int ret;
    Coroutine *co;
    SheepdogReqCo *srco = opaque;
    int sockfd = srco->sockfd;
    SheepdogReq *hdr = srco->hdr;
    void *data = srco->data;
    unsigned int *wlen = srco->wlen;
    unsigned int *rlen = srco->rlen;

    co = qemu_coroutine_self();
    aio_set_fd_handler(srco->aio_context, sockfd, NULL, restart_co_req, co);

    ret = send_co_req(sockfd, hdr, data, wlen);
    if (ret < 0) {
        goto out;
    }

    aio_set_fd_handler(srco->aio_context, sockfd, restart_co_req, NULL, co);

    ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
    if (ret != sizeof(*hdr)) {
        error_report("failed to get a rsp, %s", strerror(errno));
        ret = -errno;
        goto out;
    }

    if (*rlen > hdr->data_length) {
        *rlen = hdr->data_length;
    }

    if (*rlen) {
        ret = qemu_co_recv(sockfd, data, *rlen);
        if (ret != *rlen) {
            error_report("failed to get the data, %s", strerror(errno));
            ret = -errno;
            goto out;
        }
    }
    ret = 0;
out:
    /* there is at most one request for this sockfd, so it is safe to
     * set each handler to NULL. */
    aio_set_fd_handler(srco->aio_context, sockfd, NULL, NULL, NULL);

    srco->ret = ret;
    srco->finished = true;
}

/*
 * Send the request to the sheep in a synchronous manner.
 *
 * Return 0 on success, -errno in case of error.
 */
static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr,
                  void *data, unsigned int *wlen, unsigned int *rlen)
{
    Coroutine *co;
    SheepdogReqCo srco = {
        .sockfd = sockfd,
        .aio_context = aio_context,
        .hdr = hdr,
        .data = data,
        .wlen = wlen,
        .rlen = rlen,
        .ret = 0,
        .finished = false,
    };

    if (qemu_in_coroutine()) {
        do_co_req(&srco);
    } else {
        co = qemu_coroutine_create(do_co_req);
        qemu_coroutine_enter(co, &srco);
        while (!srco.finished) {
            aio_poll(aio_context, true);
        }
    }

    return srco.ret;
}

static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                                         struct iovec *iov, int niov,
                                         enum AIOCBState aiocb_type);
static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
static void co_write_request(void *opaque);

static coroutine_fn void reconnect_to_sdog(void *opaque)
{
    BDRVSheepdogState *s = opaque;
    AIOReq *aio_req, *next;

    aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
    close(s->fd);
    s->fd = -1;

    /* Wait for outstanding write requests to be completed. */
    while (s->co_send != NULL) {
        co_write_request(opaque);
    }

    /* Try to reconnect to the sheepdog server every second. */
    while (s->fd < 0) {
        Error *local_err = NULL;
        s->fd = get_sheep_fd(s, &local_err);
        if (s->fd < 0) {
            DPRINTF("Wait for connection to be established\n");
            error_report_err(local_err);
            co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME,
                            1000000000ULL);
        }
    };

    /*
     * Now we have to resend all the requests in the inflight queue.  However,
     * resend_aioreq() can yield and newly created requests can be added to the
     * inflight queue before the coroutine is resumed.  To avoid mixing them, we
     * have to move all the inflight requests to the failed queue before
     * resend_aioreq() is called.
     */
    QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) {
        QLIST_REMOVE(aio_req, aio_siblings);
        QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings);
    }

    /* Resend all the failed aio requests. */
    while (!QLIST_EMPTY(&s->failed_aio_head)) {
        aio_req = QLIST_FIRST(&s->failed_aio_head);
        QLIST_REMOVE(aio_req, aio_siblings);
        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
        resend_aioreq(s, aio_req);
    }
}

/*
 * Receive responses of the I/O requests.
 *
 * This function is registered as a fd handler, and called from the
 * main loop when s->fd is ready for reading responses.
 */
static void coroutine_fn aio_read_response(void *opaque)
{
    SheepdogObjRsp rsp;
    BDRVSheepdogState *s = opaque;
    int fd = s->fd;
    int ret;
    AIOReq *aio_req = NULL;
    SheepdogAIOCB *acb;
    uint64_t idx;

    /* read a header */
    ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
    if (ret != sizeof(rsp)) {
        error_report("failed to get the header, %s", strerror(errno));
        goto err;
    }

    /* find the right aio_req from the inflight aio list */
    QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
        if (aio_req->id == rsp.id) {
            break;
        }
    }
    if (!aio_req) {
        error_report("cannot find aio_req %x", rsp.id);
        goto err;
    }

    acb = aio_req->aiocb;

    switch (acb->aiocb_type) {
    case AIOCB_WRITE_UDATA:
        /* this coroutine context is no longer suitable for co_recv
         * because we may send data to update vdi objects */
        s->co_recv = NULL;
        if (!is_data_obj(aio_req->oid)) {
            break;
        }
        idx = data_oid_to_idx(aio_req->oid);

        if (aio_req->create) {
            /*
             * If the object is a newly created one, we need to update
             * the vdi object (metadata object).  min_dirty_data_idx
             * and max_dirty_data_idx are changed to include the updated
             * index between them.
             */
            if (rsp.result == SD_RES_SUCCESS) {
                s->inode.data_vdi_id[idx] = s->inode.vdi_id;
                acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx);
                acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx);
            }
        }
        break;
    case AIOCB_READ_UDATA:
        ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
                            aio_req->iov_offset, rsp.data_length);
        if (ret != rsp.data_length) {
            error_report("failed to get the data, %s", strerror(errno));
            goto err;
        }
        break;
    case AIOCB_FLUSH_CACHE:
        if (rsp.result == SD_RES_INVALID_PARMS) {
            DPRINTF("disable cache since the server doesn't support it\n");
            s->cache_flags = SD_FLAG_CMD_DIRECT;
            rsp.result = SD_RES_SUCCESS;
        }
        break;
    case AIOCB_DISCARD_OBJ:
        switch (rsp.result) {
        case SD_RES_INVALID_PARMS:
            error_report("sheep(%s) doesn't support discard command",
                         s->host_spec);
            rsp.result = SD_RES_SUCCESS;
            s->discard_supported = false;
            break;
        case SD_RES_SUCCESS:
            idx = data_oid_to_idx(aio_req->oid);
            s->inode.data_vdi_id[idx] = 0;
            break;
        default:
            break;
        }
    }

    switch (rsp.result) {
    case SD_RES_SUCCESS:
        break;
    case SD_RES_READONLY:
        if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) {
            ret = reload_inode(s, 0, "");
            if (ret < 0) {
                goto err;
            }
        }
        if (is_data_obj(aio_req->oid)) {
            aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
                                           data_oid_to_idx(aio_req->oid));
        } else {
            aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
        }
        resend_aioreq(s, aio_req);
        goto out;
    default:
        acb->ret = -EIO;
        error_report("%s", sd_strerror(rsp.result));
        break;
    }

    free_aio_req(s, aio_req);
    if (!acb->nr_pending) {
        /*
         * We've finished all requests which belong to the AIOCB, so
         * we can switch back to sd_co_readv/writev now.
         */
        acb->aio_done_func(acb);
    }
out:
    s->co_recv = NULL;
    return;
err:
    s->co_recv = NULL;
    reconnect_to_sdog(opaque);
}

static void co_read_response(void *opaque)
{
    BDRVSheepdogState *s = opaque;

    if (!s->co_recv) {
        s->co_recv = qemu_coroutine_create(aio_read_response);
    }

    qemu_coroutine_enter(s->co_recv, opaque);
}

static void co_write_request(void *opaque)
{
    BDRVSheepdogState *s = opaque;

    qemu_coroutine_enter(s->co_send, NULL);
}

/*
 * Return a socket descriptor to read/write objects.
 *
 * We cannot use this descriptor for other operations because
 * the block driver may be waiting for a response from the server.
 */
static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
{
    int fd;

    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
        return fd;
    }

    aio_set_fd_handler(s->aio_context, fd, co_read_response, NULL, s);
    return fd;
}

static int sd_parse_uri(BDRVSheepdogState *s, const char *filename,
                        char *vdi, uint32_t *snapid, char *tag)
{
    URI *uri;
    QueryParams *qp = NULL;
    int ret = 0;

    uri = uri_parse(filename);
    if (!uri) {
        return -EINVAL;
    }

    /* transport */
    if (!strcmp(uri->scheme, "sheepdog")) {
        s->is_unix = false;
    } else if (!strcmp(uri->scheme, "sheepdog+tcp")) {
        s->is_unix = false;
    } else if (!strcmp(uri->scheme, "sheepdog+unix")) {
        s->is_unix = true;
    } else {
        ret = -EINVAL;
        goto out;
    }

    if (uri->path == NULL || !strcmp(uri->path, "/")) {
        ret = -EINVAL;
        goto out;
    }
    pstrcpy(vdi, SD_MAX_VDI_LEN, uri->path + 1);

    qp = query_params_parse(uri->query);
    if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) {
        ret = -EINVAL;
        goto out;
    }

    if (s->is_unix) {
        /* sheepdog+unix:///vdiname?socket=path */
        if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
            ret = -EINVAL;
            goto out;
        }
        s->host_spec = g_strdup(qp->p[0].value);
    } else {
        /* sheepdog[+tcp]://[host:port]/vdiname */
        s->host_spec = g_strdup_printf("%s:%d", uri->server ?: SD_DEFAULT_ADDR,
                                       uri->port ?: SD_DEFAULT_PORT);
    }

    /* snapshot tag */
    if (uri->fragment) {
        *snapid = strtoul(uri->fragment, NULL, 10);
        if (*snapid == 0) {
            pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri->fragment);
        }
    } else {
        *snapid = CURRENT_VDI_ID; /* search current vdi */
    }

out:
    if (qp) {
        query_params_free(qp);
    }
    uri_free(uri);
    return ret;
}

/*
 * Parse a filename (old syntax)
 *
 * filename must be one of the following formats:
 *   1. [vdiname]
 *   2. [vdiname]:[snapid]
 *   3. [vdiname]:[tag]
 *   4. [hostname]:[port]:[vdiname]
 *   5. [hostname]:[port]:[vdiname]:[snapid]
 *   6. [hostname]:[port]:[vdiname]:[tag]
 *
 * You can boot from the snapshot images by specifying `snapid' or
 * `tag'.
 *
 * You can run VMs outside the Sheepdog cluster by specifying
 * `hostname' and `port' (experimental).
 */
static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
                         char *vdi, uint32_t *snapid, char *tag)
{
    char *p, *q, *uri;
    const char *host_spec, *vdi_spec;
    int nr_sep, ret;

    strstart(filename, "sheepdog:", (const char **)&filename);
    p = q = g_strdup(filename);

    /* count the number of separators */
    nr_sep = 0;
    while (*p) {
        if (*p == ':') {
            nr_sep++;
        }
        p++;
    }
    p = q;

    /* use the first two tokens as host_spec. */
    if (nr_sep >= 2) {
        host_spec = p;
        p = strchr(p, ':');
        p++;
        p = strchr(p, ':');
        *p++ = '\0';
    } else {
        host_spec = "";
    }

    vdi_spec = p;

    p = strchr(vdi_spec, ':');
    if (p) {
        *p++ = '#';
    }

    uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);

    ret = sd_parse_uri(s, uri, vdi, snapid, tag);

    g_free(q);
    g_free(uri);

    return ret;
}
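
/*
 * Worked example (illustrative, host and vdi names are made up): the
 * old-style filename "sheepdog:192.0.2.1:7000:myvdi:snap1" is rewritten
 * above into
 *
 *     host_spec = "192.0.2.1:7000"
 *     vdi_spec  = "myvdi#snap1"    (the ':' before the tag becomes '#')
 *     uri       = "sheepdog://192.0.2.1:7000/myvdi#snap1"
 *
 * and then handed to sd_parse_uri(), which takes "snap1" from the URI
 * fragment as the snapshot tag.
 */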

static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
                         uint32_t snapid, const char *tag, uint32_t *vid,
                         bool lock, Error **errp)
{
    int ret, fd;
    SheepdogVdiReq hdr;
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    unsigned int wlen, rlen = 0;
    char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];

    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
        return fd;
    }

    /* This pair of strncpy calls ensures that the buffer is zero-filled,
     * which is desirable since we'll soon be sending those bytes, and
     * don't want the send_req to read uninitialized data.
     */
    strncpy(buf, filename, SD_MAX_VDI_LEN);
    strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);

    memset(&hdr, 0, sizeof(hdr));
    if (lock) {
        hdr.opcode = SD_OP_LOCK_VDI;
        hdr.type = LOCK_TYPE_NORMAL;
    } else {
        hdr.opcode = SD_OP_GET_VDI_INFO;
    }
    wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
    hdr.proto_ver = SD_PROTO_VER;
    hdr.data_length = wlen;
    hdr.snapid = snapid;
    hdr.flags = SD_FLAG_CMD_WRITE;

    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
    if (ret) {
        error_setg_errno(errp, -ret, "cannot get vdi info");
        goto out;
    }

    if (rsp->result != SD_RES_SUCCESS) {
        error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s",
                   sd_strerror(rsp->result), filename, snapid, tag);
        if (rsp->result == SD_RES_NO_VDI) {
            ret = -ENOENT;
        } else if (rsp->result == SD_RES_VDI_LOCKED) {
            ret = -EBUSY;
        } else {
            ret = -EIO;
        }
        goto out;
    }
    *vid = rsp->vdi_id;

    ret = 0;
out:
    closesocket(fd);
    return ret;
}

static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                                         struct iovec *iov, int niov,
                                         enum AIOCBState aiocb_type)
{
    int nr_copies = s->inode.nr_copies;
    SheepdogObjReq hdr;
    unsigned int wlen = 0;
    int ret;
    uint64_t oid = aio_req->oid;
    unsigned int datalen = aio_req->data_len;
    uint64_t offset = aio_req->offset;
    uint8_t flags = aio_req->flags;
    uint64_t old_oid = aio_req->base_oid;
    bool create = aio_req->create;

    if (!nr_copies) {
        error_report("bug");
    }

    memset(&hdr, 0, sizeof(hdr));

    switch (aiocb_type) {
    case AIOCB_FLUSH_CACHE:
        hdr.opcode = SD_OP_FLUSH_VDI;
        break;
    case AIOCB_READ_UDATA:
        hdr.opcode = SD_OP_READ_OBJ;
        hdr.flags = flags;
        break;
    case AIOCB_WRITE_UDATA:
        if (create) {
            hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
        } else {
            hdr.opcode = SD_OP_WRITE_OBJ;
        }
        wlen = datalen;
        hdr.flags = SD_FLAG_CMD_WRITE | flags;
        break;
    case AIOCB_DISCARD_OBJ:
        hdr.opcode = SD_OP_DISCARD_OBJ;
        break;
    }

    if (s->cache_flags) {
        hdr.flags |= s->cache_flags;
    }

    hdr.oid = oid;
    hdr.cow_oid = old_oid;
    hdr.copies = s->inode.nr_copies;

    hdr.data_length = datalen;
    hdr.offset = offset;

    hdr.id = aio_req->id;

    qemu_co_mutex_lock(&s->lock);
    s->co_send = qemu_coroutine_self();
    aio_set_fd_handler(s->aio_context, s->fd,
                       co_read_response, co_write_request, s);
    socket_set_cork(s->fd, 1);

    /* send a header */
    ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
    if (ret != sizeof(hdr)) {
        error_report("failed to send a req, %s", strerror(errno));
        goto out;
    }

    if (wlen) {
        ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen);
        if (ret != wlen) {
            error_report("failed to send a data, %s", strerror(errno));
        }
    }
out:
    socket_set_cork(s->fd, 0);
    aio_set_fd_handler(s->aio_context, s->fd, co_read_response, NULL, s);
    s->co_send = NULL;
    qemu_co_mutex_unlock(&s->lock);
}

static int read_write_object(int fd, AioContext *aio_context, char *buf,
                             uint64_t oid, uint8_t copies,
                             unsigned int datalen, uint64_t offset,
                             bool write, bool create, uint32_t cache_flags)
{
    SheepdogObjReq hdr;
    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
    unsigned int wlen, rlen;
    int ret;

    memset(&hdr, 0, sizeof(hdr));

    if (write) {
        wlen = datalen;
        rlen = 0;
        hdr.flags = SD_FLAG_CMD_WRITE;
        if (create) {
            hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
        } else {
            hdr.opcode = SD_OP_WRITE_OBJ;
        }
    } else {
        wlen = 0;
        rlen = datalen;
        hdr.opcode = SD_OP_READ_OBJ;
    }

    hdr.flags |= cache_flags;

    hdr.oid = oid;
    hdr.data_length = datalen;
    hdr.offset = offset;
    hdr.copies = copies;

    ret = do_req(fd, aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
    if (ret) {
        error_report("failed to send a request to the sheep");
        return ret;
    }

    switch (rsp->result) {
    case SD_RES_SUCCESS:
        return 0;
    default:
        error_report("%s", sd_strerror(rsp->result));
        return -EIO;
    }
}

static int read_object(int fd, AioContext *aio_context, char *buf,
                       uint64_t oid, uint8_t copies,
                       unsigned int datalen, uint64_t offset,
                       uint32_t cache_flags)
{
    return read_write_object(fd, aio_context, buf, oid, copies,
                             datalen, offset, false,
                             false, cache_flags);
}

static int write_object(int fd, AioContext *aio_context, char *buf,
                        uint64_t oid, uint8_t copies,
                        unsigned int datalen, uint64_t offset, bool create,
                        uint32_t cache_flags)
{
    return read_write_object(fd, aio_context, buf, oid, copies,
                             datalen, offset, true,
                             create, cache_flags);
}

/* update inode with the latest state */
static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
{
    Error *local_err = NULL;
    SheepdogInode *inode;
    int ret = 0, fd;
    uint32_t vid = 0;

    fd = connect_to_sdog(s, &local_err);
    if (fd < 0) {
        error_report_err(local_err);
        return -EIO;
    }

    inode = g_malloc(SD_INODE_HEADER_SIZE);

    ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err);
    if (ret) {
        error_report_err(local_err);
        goto out;
    }

    ret = read_object(fd, s->aio_context, (char *)inode, vid_to_vdi_oid(vid),
                      s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
                      s->cache_flags);
    if (ret < 0) {
        goto out;
    }

    if (inode->vdi_id != s->inode.vdi_id) {
        memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE);
    }

out:
    g_free(inode);
    closesocket(fd);

    return ret;
}

static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
{
    SheepdogAIOCB *acb = aio_req->aiocb;

    aio_req->create = false;

    /* check whether this request becomes a CoW one */
    if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) {
        int idx = data_oid_to_idx(aio_req->oid);

        if (is_data_obj_writable(&s->inode, idx)) {
            goto out;
        }

        if (s->inode.data_vdi_id[idx]) {
            aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
            aio_req->flags |= SD_FLAG_CMD_COW;
        }
        aio_req->create = true;
    }
out:
    if (is_data_obj(aio_req->oid)) {
        add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
                        acb->aiocb_type);
    } else {
        struct iovec iov;
        iov.iov_base = &s->inode;
        iov.iov_len = sizeof(s->inode);
        add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
    }
}

static void sd_detach_aio_context(BlockDriverState *bs)
{
    BDRVSheepdogState *s = bs->opaque;

    aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
}

static void sd_attach_aio_context(BlockDriverState *bs,
                                  AioContext *new_context)
{
    BDRVSheepdogState *s = bs->opaque;

    s->aio_context = new_context;
    aio_set_fd_handler(new_context, s->fd, co_read_response, NULL, s);
}

/* TODO Convert to fine grained options */
static QemuOptsList runtime_opts = {
    .name = "sheepdog",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "URL to the sheepdog image",
        },
        { /* end of list */ }
    },
};

static int sd_open(BlockDriverState *bs, QDict *options, int flags,
                   Error **errp)
{
    int ret, fd;
    uint32_t vid = 0;
    BDRVSheepdogState *s = bs->opaque;
    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
    uint32_t snapid;
    char *buf = NULL;
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;

    s->bs = bs;
    s->aio_context = bdrv_get_aio_context(bs);

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto out;
    }

    filename = qemu_opt_get(opts, "filename");

    QLIST_INIT(&s->inflight_aio_head);
    QLIST_INIT(&s->failed_aio_head);
    QLIST_INIT(&s->inflight_aiocb_head);
    s->fd = -1;

    memset(vdi, 0, sizeof(vdi));
    memset(tag, 0, sizeof(tag));

    if (strstr(filename, "://")) {
        ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
    } else {
        ret = parse_vdiname(s, filename, vdi, &snapid, tag);
    }
    if (ret < 0) {
        error_setg(errp, "Can't parse filename");
        goto out;
    }
    s->fd = get_sheep_fd(s, errp);
    if (s->fd < 0) {
        ret = s->fd;
        goto out;
    }

    ret = find_vdi_name(s, vdi, snapid, tag, &vid, true, errp);
    if (ret) {
        goto out;
    }

    /*
     * QEMU block layer emulates writethrough cache as 'writeback + flush', so
     * we always set SD_FLAG_CMD_CACHE (writeback cache) as default.
     */
    s->cache_flags = SD_FLAG_CMD_CACHE;
    if (flags & BDRV_O_NOCACHE) {
        s->cache_flags = SD_FLAG_CMD_DIRECT;
    }
    s->discard_supported = true;

    if (snapid || tag[0] != '\0') {
        DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid);
        s->is_snapshot = true;
    }

    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
        ret = fd;
        goto out;
    }

    buf = g_malloc(SD_INODE_SIZE);
    ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid),
                      0, SD_INODE_SIZE, 0, s->cache_flags);

    closesocket(fd);

    if (ret) {
        error_setg(errp, "Can't read snapshot inode");
        goto out;
    }

    memcpy(&s->inode, buf, sizeof(s->inode));

    bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
    pstrcpy(s->name, sizeof(s->name), vdi);
    qemu_co_mutex_init(&s->lock);
    qemu_co_queue_init(&s->overlapping_queue);
    qemu_opts_del(opts);
    g_free(buf);
    return 0;
out:
    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL);
    if (s->fd >= 0) {
        closesocket(s->fd);
    }
    qemu_opts_del(opts);
    g_free(buf);
    return ret;
}

static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue,
                             Error **errp)
{
    BDRVSheepdogState *s = state->bs->opaque;
    BDRVSheepdogReopenState *re_s;
    int ret = 0;

    re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1);

    re_s->cache_flags = SD_FLAG_CMD_CACHE;
    if (state->flags & BDRV_O_NOCACHE) {
        re_s->cache_flags = SD_FLAG_CMD_DIRECT;
    }

    re_s->fd = get_sheep_fd(s, errp);
    if (re_s->fd < 0) {
        ret = re_s->fd;
        return ret;
    }

    return ret;
}

static void sd_reopen_commit(BDRVReopenState *state)
{
    BDRVSheepdogReopenState *re_s = state->opaque;
    BDRVSheepdogState *s = state->bs->opaque;

    if (s->fd) {
        aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
        closesocket(s->fd);
    }

    s->fd = re_s->fd;
    s->cache_flags = re_s->cache_flags;

    g_free(state->opaque);
    state->opaque = NULL;

    return;
}

static void sd_reopen_abort(BDRVReopenState *state)
{
    BDRVSheepdogReopenState *re_s = state->opaque;
    BDRVSheepdogState *s = state->bs->opaque;

    if (re_s == NULL) {
        return;
    }

    if (re_s->fd) {
        aio_set_fd_handler(s->aio_context, re_s->fd, NULL, NULL, NULL);
        closesocket(re_s->fd);
    }

    g_free(state->opaque);
    state->opaque = NULL;

    return;
}

static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
                        Error **errp)
{
    SheepdogVdiReq hdr;
    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
    int fd, ret;
    unsigned int wlen, rlen = 0;
    char buf[SD_MAX_VDI_LEN];

    fd = connect_to_sdog(s, errp);
    if (fd < 0) {
        return fd;
    }

    /* FIXME: would it be better to fail (e.g., return -EIO) when filename
     * does not fit in buf?  For now, just truncate and avoid buffer overrun.
     */
    memset(buf, 0, sizeof(buf));
    pstrcpy(buf, sizeof(buf), s->name);

    memset(&hdr, 0, sizeof(hdr));
    hdr.opcode = SD_OP_NEW_VDI;
    hdr.base_vdi_id = s->inode.vdi_id;

    wlen = SD_MAX_VDI_LEN;

    hdr.flags = SD_FLAG_CMD_WRITE;
    hdr.snapid = snapshot;

    hdr.data_length = wlen;
    hdr.vdi_size = s->inode.vdi_size;
    hdr.copy_policy = s->inode.copy_policy;
    hdr.copies = s->inode.nr_copies;
    hdr.block_size_shift = s->inode.block_size_shift;

    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);

    closesocket(fd);

    if (ret) {
        error_setg_errno(errp, -ret, "create failed");
        return ret;
    }

    if (rsp->result != SD_RES_SUCCESS) {
        error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name);
        return -EIO;
    }

    if (vdi_id) {
        *vdi_id = rsp->vdi_id;
    }

    return 0;
}

static int sd_prealloc(const char *filename, Error **errp)
{
    BlockDriverState *bs = NULL;
    BDRVSheepdogState *base = NULL;
    unsigned long buf_size;
    uint32_t idx, max_idx;
    uint32_t object_size;
    int64_t vdi_size;
    void *buf = NULL;
    int ret;

    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
                    errp);
    if (ret < 0) {
        goto out_with_err_set;
    }

    vdi_size = bdrv_getlength(bs);
    if (vdi_size < 0) {
        ret = vdi_size;
        goto out;
    }

    base = bs->opaque;
    object_size = (UINT32_C(1) << base->inode.block_size_shift);
    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
    buf = g_malloc0(buf_size);

    max_idx = DIV_ROUND_UP(vdi_size, buf_size);

    for (idx = 0; idx < max_idx; idx++) {
        /*
         * The created image can be a cloned image, so we need to read
         * data from the source image.
         */
        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
        if (ret < 0) {
            goto out;
        }
        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
        if (ret < 0) {
            goto out;
        }
    }

out:
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Can't pre-allocate");
    }
out_with_err_set:
    if (bs) {
        bdrv_unref(bs);
    }
    g_free(buf);

    return ret;
}

/*
 * Sheepdog supports two kinds of redundancy, full replication and erasure
 * coding.
 *
 * # create a fully replicated vdi with x copies
 * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
 *
 * # create an erasure coded vdi with x data strips and y parity strips
 * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
 */
static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
{
    struct SheepdogInode *inode = &s->inode;
    const char *n1, *n2;
    long copy, parity;
    char p[10];

    pstrcpy(p, sizeof(p), opt);
    n1 = strtok(p, ":");
    n2 = strtok(NULL, ":");

    if (!n1) {
        return -EINVAL;
    }

    copy = strtol(n1, NULL, 10);
    if (copy > SD_MAX_COPIES || copy < 1) {
        return -EINVAL;
    }
    if (!n2) {
        inode->copy_policy = 0;
        inode->nr_copies = copy;
        return 0;
    }

    if (copy != 2 && copy != 4 && copy != 8 && copy != 16) {
        return -EINVAL;
    }

    parity = strtol(n2, NULL, 10);
    if (parity >= SD_EC_MAX_STRIP || parity < 1) {
        return -EINVAL;
    }

    /*
     * 4 bits for parity and 4 bits for data.
     * We have to compress the upper data bits because they can't represent 16.
     */
    inode->copy_policy = ((copy / 2) << 4) + parity;
    inode->nr_copies = copy + parity;

    return 0;
}
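
/*
 * Worked example (illustrative): "-o redundancy=4:2" gives copy = 4 and
 * parity = 2, so
 *
 *     copy_policy = ((4 / 2) << 4) + 2 = 0x22
 *     nr_copies   = 4 + 2 = 6
 *
 * whereas plain replication such as "-o redundancy=3" leaves copy_policy
 * at 0 and simply sets nr_copies = 3.
 */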

static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
{
    struct SheepdogInode *inode = &s->inode;
    uint64_t object_size;
    int obj_order;

    object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0);
    if (object_size) {
        if ((object_size - 1) & object_size) { /* not a power of 2? */
            return -EINVAL;
        }
        obj_order = ctz32(object_size);
        if (obj_order < 20 || obj_order > 31) {
            return -EINVAL;
        }
        inode->block_size_shift = (uint8_t)obj_order;
    }

    return 0;
}
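
/*
 * Worked example (illustrative): "-o object_size=16M" is a power of two,
 * ctz32(16 MiB) = 24 and 20 <= 24 <= 31, so block_size_shift becomes 24
 * (16 MiB objects).  With MAX_DATA_OBJS = 2^20 objects per vdi, such an
 * image may grow to 2^24 * 2^20 = 16 TiB, the limit later checked against
 * max_vdi_size in sd_create().
 */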
1751
b222237b 1752static int sd_create(const char *filename, QemuOpts *opts,
d5124c00 1753 Error **errp)
33b1db1c 1754{
b6fc8245 1755 int ret = 0;
c31d482f 1756 uint32_t vid = 0;
33b1db1c 1757 char *backing_file = NULL;
b222237b 1758 char *buf = NULL;
b6fc8245 1759 BDRVSheepdogState *s;
c31d482f 1760 char tag[SD_MAX_VDI_TAG_LEN];
b4447363 1761 uint32_t snapid;
876eb1b0 1762 uint64_t max_vdi_size;
2f536801 1763 bool prealloc = false;
33b1db1c 1764
5839e53b 1765 s = g_new0(BDRVSheepdogState, 1);
b6fc8245 1766
b4447363 1767 memset(tag, 0, sizeof(tag));
5d6768e3 1768 if (strstr(filename, "://")) {
c31d482f 1769 ret = sd_parse_uri(s, filename, s->name, &snapid, tag);
5d6768e3 1770 } else {
c31d482f 1771 ret = parse_vdiname(s, filename, s->name, &snapid, tag);
5d6768e3
MK
1772 }
1773 if (ret < 0) {
efde4b62 1774 error_setg(errp, "Can't parse filename");
b6fc8245 1775 goto out;
b4447363
MK
1776 }
1777
c2eb918e
HT
1778 s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
1779 BDRV_SECTOR_SIZE);
b222237b
CL
1780 backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
1781 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
1782 if (!buf || !strcmp(buf, "off")) {
1783 prealloc = false;
1784 } else if (!strcmp(buf, "full")) {
1785 prealloc = true;
1786 } else {
1787 error_setg(errp, "Invalid preallocation mode: '%s'", buf);
1788 ret = -EINVAL;
1789 goto out;
1790 }
1791
1792 g_free(buf);
1793 buf = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY);
1794 if (buf) {
1795 ret = parse_redundancy(s, buf);
1796 if (ret < 0) {
1797 error_setg(errp, "Invalid redundancy mode: '%s'", buf);
1798 goto out;
33b1db1c 1799 }
33b1db1c 1800 }
876eb1b0
TI
1801 ret = parse_block_size_shift(s, opts);
1802 if (ret < 0) {
1803 error_setg(errp, "Invalid object_size."
1804 " obect_size needs to be power of 2"
1805 " and be limited from 2^20 to 2^31");
b6fc8245 1806 goto out;
33b1db1c
MK
1807 }
1808
1809 if (backing_file) {
1810 BlockDriverState *bs;
9f23fce7 1811 BDRVSheepdogState *base;
33b1db1c
MK
1812 BlockDriver *drv;
1813
1814 /* Currently, only Sheepdog backing image is supported. */
b65a5e12 1815 drv = bdrv_find_protocol(backing_file, true, NULL);
33b1db1c 1816 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
e67c3993 1817 error_setg(errp, "backing_file must be a sheepdog image");
b6fc8245
MK
1818 ret = -EINVAL;
1819 goto out;
33b1db1c
MK
1820 }
1821
2e40134b 1822 bs = NULL;
6ebf9aa2 1823 ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, errp);
cb595887 1824 if (ret < 0) {
b6fc8245 1825 goto out;
cb595887 1826 }
33b1db1c 1827
9f23fce7 1828 base = bs->opaque;
33b1db1c 1829
9f23fce7 1830 if (!is_snapshot(&base->inode)) {
e67c3993 1831 error_setg(errp, "cannot clone from a non snapshot vdi");
4f6fd349 1832 bdrv_unref(bs);
b6fc8245
MK
1833 ret = -EINVAL;
1834 goto out;
33b1db1c 1835 }
9f23fce7 1836 s->inode.vdi_id = base->inode.vdi_id;
4f6fd349 1837 bdrv_unref(bs);
33b1db1c
MK
1838 }
1839
5d5da114 1840 s->aio_context = qemu_get_aio_context();
876eb1b0
TI
1841
1842 /* if block_size_shift is not specified, get cluster default value */
1843 if (s->inode.block_size_shift == 0) {
1844 SheepdogVdiReq hdr;
1845 SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
1846 Error *local_err = NULL;
1847 int fd;
1848 unsigned int wlen = 0, rlen = 0;
1849
1850 fd = connect_to_sdog(s, &local_err);
1851 if (fd < 0) {
1852 error_report("%s", error_get_pretty(local_err));
1853 error_free(local_err);
1854 ret = -EIO;
1855 goto out;
1856 }
1857
1858 memset(&hdr, 0, sizeof(hdr));
1859 hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
1860 hdr.proto_ver = SD_PROTO_VER;
1861
1862 ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
1863 NULL, &wlen, &rlen);
1864 closesocket(fd);
1865 if (ret) {
1866 error_setg_errno(errp, -ret, "failed to get cluster default");
1867 goto out;
1868 }
1869 if (rsp->result == SD_RES_SUCCESS) {
1870 s->inode.block_size_shift = rsp->block_size_shift;
1871 } else {
1872 s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT;
1873 }
1874 }
1875
1876 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
1877
1878 if (s->inode.vdi_size > max_vdi_size) {
 1879 error_setg(errp, "The image is too large."
1880 " The maximum image size is %"PRIu64 "GB",
1881 max_vdi_size / 1024 / 1024 / 1024);
1882 ret = -EINVAL;
1883 goto out;
1884 }
1885
e67c3993 1886 ret = do_sd_create(s, &vid, 0, errp);
7d2d3e74 1887 if (ret) {
b6fc8245 1888 goto out;
a8e0fdd7
MK
1889 }
1890
7d2d3e74 1891 if (prealloc) {
e67c3993 1892 ret = sd_prealloc(filename, errp);
318df29e 1893 }
b6fc8245 1894out:
b222237b
CL
1895 g_free(backing_file);
1896 g_free(buf);
b6fc8245
MK
1897 g_free(s);
1898 return ret;
33b1db1c
MK
1899}
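/*
 * Illustrative only (not taken from this file): with the create options
 * declared in sd_create_opts further below and the default host/port, an
 * image might be created along these lines; the VDI name "myvdi" and the
 * option values are placeholders.
 *
 *   qemu-img create sheepdog:///myvdi 16G
 *   qemu-img create -o preallocation=full,object_size=4M sheepdog:///myvdi 16G
 */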
1900
1901static void sd_close(BlockDriverState *bs)
1902{
dfb12bf8 1903 Error *local_err = NULL;
33b1db1c
MK
1904 BDRVSheepdogState *s = bs->opaque;
1905 SheepdogVdiReq hdr;
1906 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1907 unsigned int wlen, rlen = 0;
1908 int fd, ret;
1909
2440a2c3 1910 DPRINTF("%s\n", s->name);
33b1db1c 1911
dfb12bf8 1912 fd = connect_to_sdog(s, &local_err);
33b1db1c 1913 if (fd < 0) {
565f65d2 1914 error_report_err(local_err);
33b1db1c
MK
1915 return;
1916 }
1917
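    /* Ask the server to release this VDI (SD_OP_RELEASE_VDI) before closing. */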
1918 memset(&hdr, 0, sizeof(hdr));
1919
1920 hdr.opcode = SD_OP_RELEASE_VDI;
1dbfafed 1921 hdr.type = LOCK_TYPE_NORMAL;
9f23fce7 1922 hdr.base_vdi_id = s->inode.vdi_id;
33b1db1c
MK
1923 wlen = strlen(s->name) + 1;
1924 hdr.data_length = wlen;
1925 hdr.flags = SD_FLAG_CMD_WRITE;
1926
84390bed
SH
1927 ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
1928 s->name, &wlen, &rlen);
33b1db1c
MK
1929
1930 closesocket(fd);
1931
1932 if (!ret && rsp->result != SD_RES_SUCCESS &&
1933 rsp->result != SD_RES_VDI_NOT_LOCKED) {
6daf194d 1934 error_report("%s, %s", sd_strerror(rsp->result), s->name);
33b1db1c
MK
1935 }
1936
84390bed 1937 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL);
33b1db1c 1938 closesocket(s->fd);
25af257d 1939 g_free(s->host_spec);
33b1db1c
MK
1940}
1941
1942static int64_t sd_getlength(BlockDriverState *bs)
1943{
1944 BDRVSheepdogState *s = bs->opaque;
1945
1946 return s->inode.vdi_size;
1947}
1948
1949static int sd_truncate(BlockDriverState *bs, int64_t offset)
1950{
dfb12bf8 1951 Error *local_err = NULL;
33b1db1c
MK
1952 BDRVSheepdogState *s = bs->opaque;
1953 int ret, fd;
1954 unsigned int datalen;
876eb1b0 1955 uint64_t max_vdi_size;
33b1db1c 1956
876eb1b0 1957 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
33b1db1c 1958 if (offset < s->inode.vdi_size) {
6daf194d 1959 error_report("shrinking is not supported");
33b1db1c 1960 return -EINVAL;
876eb1b0 1961 } else if (offset > max_vdi_size) {
6daf194d 1962 error_report("too big image size");
33b1db1c
MK
1963 return -EINVAL;
1964 }
1965
dfb12bf8 1966 fd = connect_to_sdog(s, &local_err);
33b1db1c 1967 if (fd < 0) {
565f65d2 1968 error_report_err(local_err);
cb595887 1969 return fd;
33b1db1c
MK
1970 }
1971
1972 /* we don't need to update entire object */
1973 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1974 s->inode.vdi_size = offset;
84390bed
SH
1975 ret = write_object(fd, s->aio_context, (char *)&s->inode,
1976 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
1977 datalen, 0, false, s->cache_flags);
33b1db1c
MK
1978 close(fd);
1979
1980 if (ret < 0) {
6daf194d 1981 error_report("failed to update the inode");
33b1db1c
MK
1982 }
1983
cb595887 1984 return ret;
33b1db1c
MK
1985}
1986
1987/*
1988 * This function is called after writing data objects. If we need to
1989 * update metadata, this sends a write request to the vdi object.
2df46246 1990 * Otherwise, this switches back to sd_co_readv/writev.
33b1db1c 1991 */
d8716b41 1992static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
33b1db1c 1993{
33b1db1c
MK
1994 BDRVSheepdogState *s = acb->common.bs->opaque;
1995 struct iovec iov;
1996 AIOReq *aio_req;
1997 uint32_t offset, data_len, mn, mx;
1998
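    /*
     * min_dirty_data_idx/max_dirty_data_idx track which entries of
     * inode.data_vdi_id[] this request touched; when the range is non-empty,
     * only that slice of the inode object is written back below.
     */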
498f2140
HM
1999 mn = acb->min_dirty_data_idx;
2000 mx = acb->max_dirty_data_idx;
33b1db1c
MK
2001 if (mn <= mx) {
2002 /* we need to update the vdi object. */
2003 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
2004 mn * sizeof(s->inode.data_vdi_id[0]);
2005 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
2006
498f2140
HM
2007 acb->min_dirty_data_idx = UINT32_MAX;
2008 acb->max_dirty_data_idx = 0;
33b1db1c
MK
2009
2010 iov.iov_base = &s->inode;
2011 iov.iov_len = sizeof(s->inode);
2012 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
b544c1ab 2013 data_len, offset, 0, false, 0, offset);
c292ee6a 2014 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
b544c1ab 2015 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
33b1db1c
MK
2016
2017 acb->aio_done_func = sd_finish_aiocb;
2018 acb->aiocb_type = AIOCB_WRITE_UDATA;
2019 return;
2020 }
a37dcdf9 2021
33b1db1c
MK
2022 sd_finish_aiocb(acb);
2023}
2024
859e5553
LY
2025/* Delete current working VDI on the snapshot chain */
2026static bool sd_delete(BDRVSheepdogState *s)
2027{
dfb12bf8 2028 Error *local_err = NULL;
859e5553
LY
2029 unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
2030 SheepdogVdiReq hdr = {
2031 .opcode = SD_OP_DEL_VDI,
9f23fce7 2032 .base_vdi_id = s->inode.vdi_id,
859e5553
LY
2033 .data_length = wlen,
2034 .flags = SD_FLAG_CMD_WRITE,
2035 };
2036 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2037 int fd, ret;
2038
dfb12bf8 2039 fd = connect_to_sdog(s, &local_err);
859e5553 2040 if (fd < 0) {
565f65d2 2041 error_report_err(local_err);
859e5553
LY
2042 return false;
2043 }
2044
84390bed
SH
2045 ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
2046 s->name, &wlen, &rlen);
859e5553
LY
2047 closesocket(fd);
2048 if (ret) {
2049 return false;
2050 }
2051 switch (rsp->result) {
2052 case SD_RES_NO_VDI:
2053 error_report("%s was already deleted", s->name);
2054 /* fall through */
2055 case SD_RES_SUCCESS:
2056 break;
2057 default:
2058 error_report("%s, %s", sd_strerror(rsp->result), s->name);
2059 return false;
2060 }
2061
2062 return true;
2063}
2064
33b1db1c
MK
2065/*
2066 * Create a writable VDI from a snapshot
2067 */
2068static int sd_create_branch(BDRVSheepdogState *s)
2069{
dfb12bf8 2070 Error *local_err = NULL;
33b1db1c
MK
2071 int ret, fd;
2072 uint32_t vid;
2073 char *buf;
859e5553 2074 bool deleted;
33b1db1c 2075
2440a2c3 2076 DPRINTF("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
33b1db1c 2077
7267c094 2078 buf = g_malloc(SD_INODE_SIZE);
33b1db1c 2079
859e5553
LY
2080 /*
 2081 * Even if deletion fails, we will just create an extra snapshot based on
dc6fb73d 2082 * the working VDI which was supposed to be deleted, so there is no
859e5553
LY
 2083 * need to bail out here.
2084 */
2085 deleted = sd_delete(s);
7d2d3e74 2086 ret = do_sd_create(s, &vid, !deleted, &local_err);
33b1db1c 2087 if (ret) {
565f65d2 2088 error_report_err(local_err);
33b1db1c
MK
2089 goto out;
2090 }
2091
2440a2c3 2092 DPRINTF("%" PRIx32 " is created.\n", vid);
33b1db1c 2093
dfb12bf8 2094 fd = connect_to_sdog(s, &local_err);
33b1db1c 2095 if (fd < 0) {
565f65d2 2096 error_report_err(local_err);
cb595887 2097 ret = fd;
33b1db1c
MK
2098 goto out;
2099 }
2100
84390bed
SH
2101 ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid),
2102 s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags);
33b1db1c
MK
2103
2104 closesocket(fd);
2105
2106 if (ret < 0) {
2107 goto out;
2108 }
2109
2110 memcpy(&s->inode, buf, sizeof(s->inode));
2111
2f536801 2112 s->is_snapshot = false;
33b1db1c 2113 ret = 0;
2440a2c3 2114 DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
33b1db1c
MK
2115
2116out:
7267c094 2117 g_free(buf);
33b1db1c
MK
2118
2119 return ret;
2120}
2121
2122/*
2123 * Send I/O requests to the server.
2124 *
2125 * This function sends requests to the server, links the requests to
c292ee6a 2126 * the inflight_list in BDRVSheepdogState, and exits without
33b1db1c
MK
 2127 * waiting for the response. The responses are received in the
2128 * `aio_read_response' function which is called from the main loop as
2129 * a fd handler.
2df46246
MK
2130 *
 2131 * Returns 1 when we need to wait for a response, 0 when no request was
 2132 * sent, and -errno in error cases.
33b1db1c 2133 */
d8716b41 2134static int coroutine_fn sd_co_rw_vector(void *p)
33b1db1c
MK
2135{
2136 SheepdogAIOCB *acb = p;
2137 int ret = 0;
e8bfaa2f 2138 unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
876eb1b0
TI
2139 unsigned long idx;
2140 uint32_t object_size;
33b1db1c 2141 uint64_t oid;
876eb1b0 2142 uint64_t offset;
33b1db1c
MK
2143 BDRVSheepdogState *s = acb->common.bs->opaque;
2144 SheepdogInode *inode = &s->inode;
2145 AIOReq *aio_req;
2146
33b1db1c
MK
2147 if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
2148 /*
 2149 * When a snapshot VDI is opened, Sheepdog creates the writable
 2150 * VDI on the first write operation.
2151 */
2152 ret = sd_create_branch(s);
2153 if (ret) {
2154 acb->ret = -EIO;
2155 goto out;
2156 }
2157 }
2158
876eb1b0
TI
2159 object_size = (UINT32_C(1) << inode->block_size_shift);
2160 idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
2161 offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;
2162
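    /*
     * Split the request into per-object chunks: 'idx' indexes
     * inode->data_vdi_id[], 'offset' is the byte offset within the first
     * object, and each loop iteration below covers at most one object.
     */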
1d732d7d
MK
2163 /*
2164 * Make sure we don't free the aiocb before we are done with all requests.
2165 * This additional reference is dropped at the end of this function.
2166 */
2167 acb->nr_pending++;
2168
33b1db1c
MK
2169 while (done != total) {
2170 uint8_t flags = 0;
2171 uint64_t old_oid = 0;
2f536801 2172 bool create = false;
33b1db1c
MK
2173
2174 oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
2175
876eb1b0 2176 len = MIN(total - done, object_size - offset);
33b1db1c 2177
19db9b90
CH
2178 switch (acb->aiocb_type) {
2179 case AIOCB_READ_UDATA:
2180 if (!inode->data_vdi_id[idx]) {
2181 qemu_iovec_memset(acb->qiov, done, 0, len);
33b1db1c
MK
2182 goto done;
2183 }
19db9b90
CH
2184 break;
2185 case AIOCB_WRITE_UDATA:
2186 if (!inode->data_vdi_id[idx]) {
2f536801 2187 create = true;
19db9b90
CH
2188 } else if (!is_data_obj_writable(inode, idx)) {
2189 /* Copy-On-Write */
2f536801 2190 create = true;
19db9b90
CH
2191 old_oid = oid;
2192 flags = SD_FLAG_CMD_COW;
2193 }
2194 break;
cac8f4a6
LY
2195 case AIOCB_DISCARD_OBJ:
2196 /*
 2197 * We discard an object only when it is 1) allocated and 2) entirely
 2198 * covered by the discard request. Otherwise, simply skip it.
2199 */
876eb1b0 2200 if (len != object_size || inode->data_vdi_id[idx] == 0) {
cac8f4a6
LY
2201 goto done;
2202 }
2203 break;
19db9b90
CH
2204 default:
2205 break;
33b1db1c
MK
2206 }
2207
2208 if (create) {
2440a2c3 2209 DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
1b6ac998 2210 inode->vdi_id, oid,
33b1db1c
MK
2211 vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
2212 oid = vid_to_data_oid(inode->vdi_id, idx);
2440a2c3 2213 DPRINTF("new oid %" PRIx64 "\n", oid);
33b1db1c
MK
2214 }
2215
b544c1ab
HM
2216 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
2217 old_oid, done);
80308d33 2218 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
33b1db1c 2219
b544c1ab 2220 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
a37dcdf9 2221 acb->aiocb_type);
33b1db1c
MK
2222 done:
2223 offset = 0;
2224 idx++;
2225 done += len;
2226 }
2227out:
1d732d7d 2228 if (!--acb->nr_pending) {
2df46246 2229 return acb->ret;
33b1db1c 2230 }
2df46246 2231 return 1;
33b1db1c
MK
2232}
2233
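/*
 * Serialize overlapping requests: if this AIOCB overlaps another in-flight
 * one, the caller waits on s->overlapping_queue and retries; otherwise the
 * AIOCB is linked into inflight_aiocb_head.
 */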
498f2140 2234static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
6a55c82c
HM
2235{
2236 SheepdogAIOCB *cb;
2237
2238 QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
498f2140 2239 if (AIOCBOverlapping(aiocb, cb)) {
6a55c82c
HM
2240 return true;
2241 }
2242 }
2243
2244 QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings);
2245 return false;
2246}
2247
a968168c 2248static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
2df46246 2249 int nb_sectors, QEMUIOVector *qiov)
33b1db1c
MK
2250{
2251 SheepdogAIOCB *acb;
2df46246 2252 int ret;
e50d7607
LY
2253 int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
2254 BDRVSheepdogState *s = bs->opaque;
33b1db1c 2255
c0191e76 2256 if (offset > s->inode.vdi_size) {
e50d7607 2257 ret = sd_truncate(bs, offset);
cb595887
MK
2258 if (ret < 0) {
2259 return ret;
33b1db1c 2260 }
33b1db1c
MK
2261 }
2262
f700f8e3 2263 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
33b1db1c
MK
2264 acb->aio_done_func = sd_write_done;
2265 acb->aiocb_type = AIOCB_WRITE_UDATA;
2266
6a55c82c 2267retry:
498f2140
HM
2268 if (check_overlapping_aiocb(s, acb)) {
2269 qemu_co_queue_wait(&s->overlapping_queue);
6a55c82c
HM
2270 goto retry;
2271 }
2272
2df46246
MK
2273 ret = sd_co_rw_vector(acb);
2274 if (ret <= 0) {
6a55c82c 2275 QLIST_REMOVE(acb, aiocb_siblings);
498f2140 2276 qemu_co_queue_restart_all(&s->overlapping_queue);
8007429a 2277 qemu_aio_unref(acb);
2df46246
MK
2278 return ret;
2279 }
2280
2281 qemu_coroutine_yield();
2282
6a55c82c 2283 QLIST_REMOVE(acb, aiocb_siblings);
498f2140 2284 qemu_co_queue_restart_all(&s->overlapping_queue);
6a55c82c 2285
2df46246 2286 return acb->ret;
33b1db1c
MK
2287}
2288
a968168c 2289static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
2df46246 2290 int nb_sectors, QEMUIOVector *qiov)
33b1db1c
MK
2291{
2292 SheepdogAIOCB *acb;
19db9b90 2293 int ret;
6a55c82c 2294 BDRVSheepdogState *s = bs->opaque;
33b1db1c 2295
f700f8e3 2296 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
33b1db1c
MK
2297 acb->aiocb_type = AIOCB_READ_UDATA;
2298 acb->aio_done_func = sd_finish_aiocb;
2299
6a55c82c 2300retry:
498f2140
HM
2301 if (check_overlapping_aiocb(s, acb)) {
2302 qemu_co_queue_wait(&s->overlapping_queue);
6a55c82c
HM
2303 goto retry;
2304 }
2305
2df46246
MK
2306 ret = sd_co_rw_vector(acb);
2307 if (ret <= 0) {
6a55c82c 2308 QLIST_REMOVE(acb, aiocb_siblings);
498f2140 2309 qemu_co_queue_restart_all(&s->overlapping_queue);
8007429a 2310 qemu_aio_unref(acb);
2df46246
MK
2311 return ret;
2312 }
2313
2314 qemu_coroutine_yield();
2315
6a55c82c 2316 QLIST_REMOVE(acb, aiocb_siblings);
498f2140 2317 qemu_co_queue_restart_all(&s->overlapping_queue);
2df46246 2318 return acb->ret;
33b1db1c
MK
2319}
2320
47622c44
LY
2321static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
2322{
2323 BDRVSheepdogState *s = bs->opaque;
47783072
LY
2324 SheepdogAIOCB *acb;
2325 AIOReq *aio_req;
47622c44 2326
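    /* Only the writeback cache (SD_FLAG_CMD_CACHE) needs an explicit flush. */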
0e7106d8 2327 if (s->cache_flags != SD_FLAG_CMD_CACHE) {
47622c44
LY
2328 return 0;
2329 }
2330
f700f8e3 2331 acb = sd_aio_setup(bs, NULL, 0, 0);
47783072
LY
2332 acb->aiocb_type = AIOCB_FLUSH_CACHE;
2333 acb->aio_done_func = sd_finish_aiocb;
47622c44 2334
47783072 2335 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
b544c1ab 2336 0, 0, 0, false, 0, 0);
47783072 2337 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
b544c1ab 2338 add_aio_request(s, aio_req, NULL, 0, acb->aiocb_type);
47622c44 2339
47783072
LY
2340 qemu_coroutine_yield();
2341 return acb->ret;
47622c44
LY
2342}
2343
33b1db1c
MK
2344static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
2345{
dfb12bf8 2346 Error *local_err = NULL;
33b1db1c
MK
2347 BDRVSheepdogState *s = bs->opaque;
2348 int ret, fd;
2349 uint32_t new_vid;
2350 SheepdogInode *inode;
2351 unsigned int datalen;
2352
2440a2c3 2353 DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
33b1db1c
MK
2354 "is_snapshot %d\n", sn_info->name, sn_info->id_str,
2355 s->name, sn_info->vm_state_size, s->is_snapshot);
2356
2357 if (s->is_snapshot) {
2358 error_report("You can't create a snapshot of a snapshot VDI, "
6daf194d 2359 "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);
33b1db1c
MK
2360
2361 return -EINVAL;
2362 }
2363
2440a2c3 2364 DPRINTF("%s %s\n", sn_info->name, sn_info->id_str);
33b1db1c
MK
2365
2366 s->inode.vm_state_size = sn_info->vm_state_size;
2367 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
3178e275
JM
2368 /* It appears that inode.tag does not require a NUL terminator,
2369 * which means this use of strncpy is ok.
2370 */
33b1db1c
MK
2371 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
2372 /* we don't need to update entire object */
2373 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
2df5fee2 2374 inode = g_malloc(datalen);
33b1db1c
MK
2375
2376 /* refresh inode. */
dfb12bf8 2377 fd = connect_to_sdog(s, &local_err);
33b1db1c 2378 if (fd < 0) {
565f65d2 2379 error_report_err(local_err);
cb595887 2380 ret = fd;
33b1db1c
MK
2381 goto cleanup;
2382 }
2383
84390bed
SH
2384 ret = write_object(fd, s->aio_context, (char *)&s->inode,
2385 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
2386 datalen, 0, false, s->cache_flags);
33b1db1c 2387 if (ret < 0) {
6daf194d 2388 error_report("failed to write snapshot's inode.");
33b1db1c
MK
2389 goto cleanup;
2390 }
2391
7d2d3e74 2392 ret = do_sd_create(s, &new_vid, 1, &local_err);
33b1db1c 2393 if (ret < 0) {
27994d58
MA
2394 error_report("failed to create inode for snapshot: %s",
2395 error_get_pretty(local_err));
973a8529 2396 error_free(local_err);
33b1db1c
MK
2397 goto cleanup;
2398 }
2399
84390bed
SH
2400 ret = read_object(fd, s->aio_context, (char *)inode,
2401 vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0,
2402 s->cache_flags);
33b1db1c
MK
2403
2404 if (ret < 0) {
6daf194d 2405 error_report("failed to read new inode info: %s", strerror(errno));
33b1db1c
MK
2406 goto cleanup;
2407 }
2408
2409 memcpy(&s->inode, inode, datalen);
2440a2c3 2410 DPRINTF("s->inode: name %s snap_id %x oid %x\n",
33b1db1c
MK
2411 s->inode.name, s->inode.snap_id, s->inode.vdi_id);
2412
2413cleanup:
2df5fee2 2414 g_free(inode);
33b1db1c
MK
2415 closesocket(fd);
2416 return ret;
2417}
2418
859e5553
LY
2419/*
 2420 * We implement the rollback (loadvm) operation to the specified snapshot by
 2421 * 1) switching to the snapshot,
 2422 * 2) relying on sd_create_branch to delete the working VDI, and
dc6fb73d 2423 * 3) creating a new working VDI based on the specified snapshot
859e5553 2424 */
33b1db1c
MK
2425static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
2426{
2427 BDRVSheepdogState *s = bs->opaque;
2428 BDRVSheepdogState *old_s;
9ff53a0e 2429 char tag[SD_MAX_VDI_TAG_LEN];
33b1db1c 2430 uint32_t snapid = 0;
9ff53a0e 2431 int ret = 0;
33b1db1c 2432
5839e53b 2433 old_s = g_new(BDRVSheepdogState, 1);
33b1db1c
MK
2434
2435 memcpy(old_s, s, sizeof(BDRVSheepdogState));
2436
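    /* A numeric snapshot_id selects a snapshot id; otherwise it is a tag. */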
33b1db1c 2437 snapid = strtoul(snapshot_id, NULL, 10);
3178e275
JM
2438 if (snapid) {
2439 tag[0] = 0;
2440 } else {
b579ffb3 2441 pstrcpy(tag, sizeof(tag), snapshot_id);
33b1db1c
MK
2442 }
2443
9ff53a0e 2444 ret = reload_inode(s, snapid, tag);
33b1db1c 2445 if (ret) {
33b1db1c
MK
2446 goto out;
2447 }
2448
cede621f
LY
2449 ret = sd_create_branch(s);
2450 if (ret) {
33b1db1c
MK
2451 goto out;
2452 }
2453
7267c094 2454 g_free(old_s);
33b1db1c
MK
2455
2456 return 0;
2457out:
2458 /* recover bdrv_sd_state */
2459 memcpy(s, old_s, sizeof(BDRVSheepdogState));
7267c094 2460 g_free(old_s);
33b1db1c 2461
6daf194d 2462 error_report("failed to open; recovering old bdrv_sd_state");
33b1db1c
MK
2463
2464 return ret;
2465}
2466
a89d89d3
WX
2467static int sd_snapshot_delete(BlockDriverState *bs,
2468 const char *snapshot_id,
2469 const char *name,
2470 Error **errp)
33b1db1c
MK
2471{
2472 /* FIXME: Delete specified snapshot id. */
2473 return 0;
2474}
2475
33b1db1c
MK
2476static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
2477{
dfb12bf8 2478 Error *local_err = NULL;
33b1db1c
MK
2479 BDRVSheepdogState *s = bs->opaque;
2480 SheepdogReq req;
2481 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
2482 QEMUSnapshotInfo *sn_tab = NULL;
2483 unsigned wlen, rlen;
2484 int found = 0;
2485 static SheepdogInode inode;
2486 unsigned long *vdi_inuse;
2487 unsigned int start_nr;
2488 uint64_t hval;
2489 uint32_t vid;
2490
7267c094 2491 vdi_inuse = g_malloc(max);
33b1db1c 2492
dfb12bf8 2493 fd = connect_to_sdog(s, &local_err);
33b1db1c 2494 if (fd < 0) {
565f65d2 2495 error_report_err(local_err);
cb595887 2496 ret = fd;
33b1db1c
MK
2497 goto out;
2498 }
2499
2500 rlen = max;
2501 wlen = 0;
2502
2503 memset(&req, 0, sizeof(req));
2504
2505 req.opcode = SD_OP_READ_VDIS;
2506 req.data_length = max;
2507
84390bed
SH
2508 ret = do_req(fd, s->aio_context, (SheepdogReq *)&req,
2509 vdi_inuse, &wlen, &rlen);
33b1db1c
MK
2510
2511 closesocket(fd);
2512 if (ret) {
2513 goto out;
2514 }
2515
02c4f26b 2516 sn_tab = g_new0(QEMUSnapshotInfo, nr);
33b1db1c
MK
2517
2518 /* calculate a vdi id with hash function */
2519 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
2520 start_nr = hval & (SD_NR_VDIS - 1);
2521
dfb12bf8 2522 fd = connect_to_sdog(s, &local_err);
33b1db1c 2523 if (fd < 0) {
565f65d2 2524 error_report_err(local_err);
cb595887 2525 ret = fd;
33b1db1c
MK
2526 goto out;
2527 }
2528
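    /*
     * Walk the in-use bitmap starting at the slot derived from the VDI name
     * hash; the first unused slot ends the chain of candidate VDIs.
     */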
2529 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
2530 if (!test_bit(vid, vdi_inuse)) {
2531 break;
2532 }
2533
2534 /* we don't need to read entire object */
84390bed
SH
2535 ret = read_object(fd, s->aio_context, (char *)&inode,
2536 vid_to_vdi_oid(vid),
47622c44 2537 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
0e7106d8 2538 s->cache_flags);
33b1db1c
MK
2539
2540 if (ret) {
2541 continue;
2542 }
2543
2544 if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
2545 sn_tab[found].date_sec = inode.snap_ctime >> 32;
2546 sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
2547 sn_tab[found].vm_state_size = inode.vm_state_size;
2548 sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
2549
521b2b5d
HR
2550 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
2551 "%" PRIu32, inode.snap_id);
3178e275
JM
2552 pstrcpy(sn_tab[found].name,
2553 MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
2554 inode.tag);
33b1db1c
MK
2555 found++;
2556 }
2557 }
2558
2559 closesocket(fd);
2560out:
2561 *psn_tab = sn_tab;
2562
7267c094 2563 g_free(vdi_inuse);
33b1db1c 2564
cb595887
MK
2565 if (ret < 0) {
2566 return ret;
2567 }
2568
33b1db1c
MK
2569 return found;
2570}
2571
2572static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
2573 int64_t pos, int size, int load)
2574{
dfb12bf8 2575 Error *local_err = NULL;
2f536801
MK
2576 bool create;
2577 int fd, ret = 0, remaining = size;
33b1db1c
MK
2578 unsigned int data_len;
2579 uint64_t vmstate_oid;
33b1db1c 2580 uint64_t offset;
cede621f
LY
2581 uint32_t vdi_index;
2582 uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
876eb1b0 2583 uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);
33b1db1c 2584
dfb12bf8 2585 fd = connect_to_sdog(s, &local_err);
33b1db1c 2586 if (fd < 0) {
565f65d2 2587 error_report_err(local_err);
cb595887 2588 return fd;
33b1db1c
MK
2589 }
2590
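    /*
     * The vmstate is stored in dedicated vmstate objects: split
     * [pos, pos + size) into object_size chunks and read or write each chunk
     * at the object named by vid_to_vmstate_oid(vdi_id, vdi_index).
     */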
6f3c714e 2591 while (remaining) {
876eb1b0
TI
2592 vdi_index = pos / object_size;
2593 offset = pos % object_size;
33b1db1c 2594
876eb1b0 2595 data_len = MIN(remaining, object_size - offset);
33b1db1c 2596
cede621f 2597 vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
33b1db1c
MK
2598
2599 create = (offset == 0);
2600 if (load) {
84390bed 2601 ret = read_object(fd, s->aio_context, (char *)data, vmstate_oid,
47622c44 2602 s->inode.nr_copies, data_len, offset,
0e7106d8 2603 s->cache_flags);
33b1db1c 2604 } else {
84390bed 2605 ret = write_object(fd, s->aio_context, (char *)data, vmstate_oid,
47622c44 2606 s->inode.nr_copies, data_len, offset, create,
0e7106d8 2607 s->cache_flags);
33b1db1c
MK
2608 }
2609
2610 if (ret < 0) {
6daf194d 2611 error_report("failed to %s vmstate: %s", load ? "load" : "save", strerror(errno));
33b1db1c
MK
2612 goto cleanup;
2613 }
2614
2615 pos += data_len;
1f7a48de 2616 data += data_len;
6f3c714e 2617 remaining -= data_len;
33b1db1c 2618 }
6f3c714e 2619 ret = size;
33b1db1c
MK
2620cleanup:
2621 closesocket(fd);
2622 return ret;
2623}
2624
cf8074b3
KW
2625static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
2626 int64_t pos)
33b1db1c
MK
2627{
2628 BDRVSheepdogState *s = bs->opaque;
cf8074b3
KW
2629 void *buf;
2630 int ret;
33b1db1c 2631
cf8074b3
KW
2632 buf = qemu_blockalign(bs, qiov->size);
2633 qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
2634 ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0);
2635 qemu_vfree(buf);
2636
2637 return ret;
33b1db1c
MK
2638}
2639
2640static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
2641 int64_t pos, int size)
2642{
2643 BDRVSheepdogState *s = bs->opaque;
2644
2645 return do_load_save_vmstate(s, data, pos, size, 1);
2646}
2647
2648
cac8f4a6
LY
2649static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
2650 int nb_sectors)
2651{
2652 SheepdogAIOCB *acb;
2653 QEMUIOVector dummy;
2654 BDRVSheepdogState *s = bs->opaque;
2655 int ret;
2656
2657 if (!s->discard_supported) {
2658 return 0;
2659 }
2660
2661 acb = sd_aio_setup(bs, &dummy, sector_num, nb_sectors);
2662 acb->aiocb_type = AIOCB_DISCARD_OBJ;
2663 acb->aio_done_func = sd_finish_aiocb;
2664
6a55c82c 2665retry:
498f2140
HM
2666 if (check_overlapping_aiocb(s, acb)) {
2667 qemu_co_queue_wait(&s->overlapping_queue);
6a55c82c
HM
2668 goto retry;
2669 }
2670
cac8f4a6
LY
2671 ret = sd_co_rw_vector(acb);
2672 if (ret <= 0) {
6a55c82c 2673 QLIST_REMOVE(acb, aiocb_siblings);
498f2140 2674 qemu_co_queue_restart_all(&s->overlapping_queue);
8007429a 2675 qemu_aio_unref(acb);
cac8f4a6
LY
2676 return ret;
2677 }
2678
2679 qemu_coroutine_yield();
2680
6a55c82c 2681 QLIST_REMOVE(acb, aiocb_siblings);
498f2140 2682 qemu_co_queue_restart_all(&s->overlapping_queue);
6a55c82c 2683
cac8f4a6
LY
2684 return acb->ret;
2685}
2686
b6b8a333
PB
2687static coroutine_fn int64_t
2688sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2689 int *pnum)
8d71c631
LY
2690{
2691 BDRVSheepdogState *s = bs->opaque;
2692 SheepdogInode *inode = &s->inode;
876eb1b0 2693 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
9cd76737 2694 uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
876eb1b0 2695 unsigned long start = offset / object_size,
8d71c631 2696 end = DIV_ROUND_UP((sector_num + nb_sectors) *
876eb1b0 2697 BDRV_SECTOR_SIZE, object_size);
8d71c631 2698 unsigned long idx;
9cd76737 2699 int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
8d71c631
LY
2700
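    /*
     * Scan the covered object indexes: report the run of allocated objects
     * starting at 'start', or, if the first object is unallocated, the run
     * of holes (ret = 0) that follows it.
     */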
2701 for (idx = start; idx < end; idx++) {
2702 if (inode->data_vdi_id[idx] == 0) {
2703 break;
2704 }
2705 }
2706 if (idx == start) {
2707 /* Get the longest length of unallocated sectors */
2708 ret = 0;
2709 for (idx = start + 1; idx < end; idx++) {
2710 if (inode->data_vdi_id[idx] != 0) {
2711 break;
2712 }
2713 }
2714 }
2715
876eb1b0 2716 *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
8d71c631
LY
2717 if (*pnum > nb_sectors) {
2718 *pnum = nb_sectors;
2719 }
2720 return ret;
2721}
2722
85829722
LY
2723static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
2724{
2725 BDRVSheepdogState *s = bs->opaque;
2726 SheepdogInode *inode = &s->inode;
876eb1b0
TI
2727 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
2728 unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size);
85829722
LY
2729 uint64_t size = 0;
2730
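    /* Count object_size bytes for every allocated data object. */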
2731 for (i = 0; i < last; i++) {
2732 if (inode->data_vdi_id[i] == 0) {
2733 continue;
2734 }
876eb1b0 2735 size += object_size;
85829722
LY
2736 }
2737 return size;
2738}
2739
b222237b
CL
2740static QemuOptsList sd_create_opts = {
2741 .name = "sheepdog-create-opts",
2742 .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head),
2743 .desc = {
2744 {
2745 .name = BLOCK_OPT_SIZE,
2746 .type = QEMU_OPT_SIZE,
2747 .help = "Virtual disk size"
2748 },
2749 {
2750 .name = BLOCK_OPT_BACKING_FILE,
2751 .type = QEMU_OPT_STRING,
2752 .help = "File name of a base image"
2753 },
2754 {
2755 .name = BLOCK_OPT_PREALLOC,
2756 .type = QEMU_OPT_STRING,
2757 .help = "Preallocation mode (allowed values: off, full)"
2758 },
2759 {
2760 .name = BLOCK_OPT_REDUNDANCY,
2761 .type = QEMU_OPT_STRING,
2762 .help = "Redundancy of the image"
2763 },
876eb1b0
TI
2764 {
2765 .name = BLOCK_OPT_OBJECT_SIZE,
2766 .type = QEMU_OPT_SIZE,
2767 .help = "Object size of the image"
2768 },
b222237b
CL
2769 { /* end of list */ }
2770 }
33b1db1c
MK
2771};
2772
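/*
 * Three BlockDriver instances are registered below, one per protocol prefix
 * ("sheepdog", "sheepdog+tcp", "sheepdog+unix"); they share the same
 * callbacks and create options.
 */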
5d6768e3 2773static BlockDriver bdrv_sheepdog = {
33b1db1c
MK
2774 .format_name = "sheepdog",
2775 .protocol_name = "sheepdog",
2776 .instance_size = sizeof(BDRVSheepdogState),
030be321 2777 .bdrv_needs_filename = true,
33b1db1c 2778 .bdrv_file_open = sd_open,
4da65c80
LY
2779 .bdrv_reopen_prepare = sd_reopen_prepare,
2780 .bdrv_reopen_commit = sd_reopen_commit,
2781 .bdrv_reopen_abort = sd_reopen_abort,
33b1db1c 2782 .bdrv_close = sd_close,
c282e1fd 2783 .bdrv_create = sd_create,
e4f5c1bf 2784 .bdrv_has_zero_init = bdrv_has_zero_init_1,
33b1db1c 2785 .bdrv_getlength = sd_getlength,
85829722 2786 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
33b1db1c
MK
2787 .bdrv_truncate = sd_truncate,
2788
2df46246
MK
2789 .bdrv_co_readv = sd_co_readv,
2790 .bdrv_co_writev = sd_co_writev,
47622c44 2791 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
cac8f4a6 2792 .bdrv_co_discard = sd_co_discard,
b6b8a333 2793 .bdrv_co_get_block_status = sd_co_get_block_status,
33b1db1c
MK
2794
2795 .bdrv_snapshot_create = sd_snapshot_create,
2796 .bdrv_snapshot_goto = sd_snapshot_goto,
2797 .bdrv_snapshot_delete = sd_snapshot_delete,
2798 .bdrv_snapshot_list = sd_snapshot_list,
2799
2800 .bdrv_save_vmstate = sd_save_vmstate,
2801 .bdrv_load_vmstate = sd_load_vmstate,
2802
84390bed
SH
2803 .bdrv_detach_aio_context = sd_detach_aio_context,
2804 .bdrv_attach_aio_context = sd_attach_aio_context,
2805
b222237b 2806 .create_opts = &sd_create_opts,
33b1db1c
MK
2807};
2808
5d6768e3
MK
2809static BlockDriver bdrv_sheepdog_tcp = {
2810 .format_name = "sheepdog",
2811 .protocol_name = "sheepdog+tcp",
2812 .instance_size = sizeof(BDRVSheepdogState),
030be321 2813 .bdrv_needs_filename = true,
5d6768e3 2814 .bdrv_file_open = sd_open,
4da65c80
LY
2815 .bdrv_reopen_prepare = sd_reopen_prepare,
2816 .bdrv_reopen_commit = sd_reopen_commit,
2817 .bdrv_reopen_abort = sd_reopen_abort,
5d6768e3 2818 .bdrv_close = sd_close,
c282e1fd 2819 .bdrv_create = sd_create,
e4f5c1bf 2820 .bdrv_has_zero_init = bdrv_has_zero_init_1,
5d6768e3 2821 .bdrv_getlength = sd_getlength,
85829722 2822 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
5d6768e3
MK
2823 .bdrv_truncate = sd_truncate,
2824
2825 .bdrv_co_readv = sd_co_readv,
2826 .bdrv_co_writev = sd_co_writev,
2827 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
cac8f4a6 2828 .bdrv_co_discard = sd_co_discard,
b6b8a333 2829 .bdrv_co_get_block_status = sd_co_get_block_status,
5d6768e3
MK
2830
2831 .bdrv_snapshot_create = sd_snapshot_create,
2832 .bdrv_snapshot_goto = sd_snapshot_goto,
2833 .bdrv_snapshot_delete = sd_snapshot_delete,
2834 .bdrv_snapshot_list = sd_snapshot_list,
2835
2836 .bdrv_save_vmstate = sd_save_vmstate,
2837 .bdrv_load_vmstate = sd_load_vmstate,
2838
84390bed
SH
2839 .bdrv_detach_aio_context = sd_detach_aio_context,
2840 .bdrv_attach_aio_context = sd_attach_aio_context,
2841
b222237b 2842 .create_opts = &sd_create_opts,
5d6768e3
MK
2843};
2844
1b8bbb46
MK
2845static BlockDriver bdrv_sheepdog_unix = {
2846 .format_name = "sheepdog",
2847 .protocol_name = "sheepdog+unix",
2848 .instance_size = sizeof(BDRVSheepdogState),
030be321 2849 .bdrv_needs_filename = true,
1b8bbb46 2850 .bdrv_file_open = sd_open,
4da65c80
LY
2851 .bdrv_reopen_prepare = sd_reopen_prepare,
2852 .bdrv_reopen_commit = sd_reopen_commit,
2853 .bdrv_reopen_abort = sd_reopen_abort,
1b8bbb46 2854 .bdrv_close = sd_close,
c282e1fd 2855 .bdrv_create = sd_create,
3ac21627 2856 .bdrv_has_zero_init = bdrv_has_zero_init_1,
1b8bbb46 2857 .bdrv_getlength = sd_getlength,
85829722 2858 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
1b8bbb46
MK
2859 .bdrv_truncate = sd_truncate,
2860
2861 .bdrv_co_readv = sd_co_readv,
2862 .bdrv_co_writev = sd_co_writev,
2863 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
cac8f4a6 2864 .bdrv_co_discard = sd_co_discard,
b6b8a333 2865 .bdrv_co_get_block_status = sd_co_get_block_status,
1b8bbb46
MK
2866
2867 .bdrv_snapshot_create = sd_snapshot_create,
2868 .bdrv_snapshot_goto = sd_snapshot_goto,
2869 .bdrv_snapshot_delete = sd_snapshot_delete,
2870 .bdrv_snapshot_list = sd_snapshot_list,
2871
2872 .bdrv_save_vmstate = sd_save_vmstate,
2873 .bdrv_load_vmstate = sd_load_vmstate,
2874
84390bed
SH
2875 .bdrv_detach_aio_context = sd_detach_aio_context,
2876 .bdrv_attach_aio_context = sd_attach_aio_context,
2877
b222237b 2878 .create_opts = &sd_create_opts,
1b8bbb46
MK
2879};
2880
33b1db1c
MK
2881static void bdrv_sheepdog_init(void)
2882{
2883 bdrv_register(&bdrv_sheepdog);
5d6768e3 2884 bdrv_register(&bdrv_sheepdog_tcp);
1b8bbb46 2885 bdrv_register(&bdrv_sheepdog_unix);
33b1db1c
MK
2886}
2887block_init(bdrv_sheepdog_init);