]> git.proxmox.com Git - mirror_qemu.git/blame - block/sheepdog.c
sheepdog: Don't truncate long VDI name in _open(), _create()
[mirror_qemu.git] / block / sheepdog.c
CommitLineData
33b1db1c
MK
1/*
2 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License version
6 * 2 as published by the Free Software Foundation.
7 *
8 * You should have received a copy of the GNU General Public License
9 * along with this program. If not, see <http://www.gnu.org/licenses/>.
6b620ca3
PB
10 *
11 * Contributions after 2012-01-13 are licensed under the terms of the
12 * GNU GPL, version 2 or (at your option) any later version.
33b1db1c 13 */
33b1db1c 14
80c71a24 15#include "qemu/osdep.h"
da34e65c 16#include "qapi/error.h"
5d6768e3 17#include "qemu/uri.h"
1de7afc9
PB
18#include "qemu/error-report.h"
19#include "qemu/sockets.h"
737e150e 20#include "block/block_int.h"
fba98d45 21#include "sysemu/block-backend.h"
1de7afc9 22#include "qemu/bitops.h"
f348b6d1 23#include "qemu/cutils.h"
33b1db1c
MK
24
/* Protocol version placed in the proto_ver field of request headers. */
#define SD_PROTO_VER 0x01

/* Defaults used when a URI omits the server address or port. */
#define SD_DEFAULT_ADDR "localhost"
#define SD_DEFAULT_PORT 7000

/* Opcodes operating on objects. */
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ              0x02
#define SD_OP_WRITE_OBJ             0x03
/* 0x04 is used internally by Sheepdog */

/* Opcodes operating on VDIs and on the cluster. */
#define SD_OP_NEW_VDI               0x11
#define SD_OP_LOCK_VDI              0x12
#define SD_OP_RELEASE_VDI           0x13
#define SD_OP_GET_VDI_INFO          0x14
#define SD_OP_READ_VDIS             0x15
#define SD_OP_FLUSH_VDI             0x16
#define SD_OP_DEL_VDI               0x17
#define SD_OP_GET_CLUSTER_DEFAULT   0x18

/* Request flags. */
#define SD_FLAG_CMD_WRITE    0x01
#define SD_FLAG_CMD_COW      0x02
#define SD_FLAG_CMD_CACHE    0x04 /* Writeback mode for cache */
#define SD_FLAG_CMD_DIRECT   0x08 /* Don't use cache */

/* Result codes carried in the result field of responses. */
#define SD_RES_SUCCESS          0x00 /* Success */
#define SD_RES_UNKNOWN          0x01 /* Unknown error */
#define SD_RES_NO_OBJ           0x02 /* No object found */
#define SD_RES_EIO              0x03 /* I/O error */
#define SD_RES_VDI_EXIST        0x04 /* Vdi exists already */
#define SD_RES_INVALID_PARMS    0x05 /* Invalid parameters */
#define SD_RES_SYSTEM_ERROR     0x06 /* System error */
#define SD_RES_VDI_LOCKED       0x07 /* Vdi is locked */
#define SD_RES_NO_VDI           0x08 /* No vdi found */
#define SD_RES_NO_BASE_VDI      0x09 /* No base vdi found */
#define SD_RES_VDI_READ         0x0A /* Cannot read requested vdi */
#define SD_RES_VDI_WRITE        0x0B /* Cannot write requested vdi */
#define SD_RES_BASE_VDI_READ    0x0C /* Cannot read base vdi */
#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base vdi */
#define SD_RES_NO_TAG           0x0E /* Requested tag is not found */
#define SD_RES_STARTUP          0x0F /* Sheepdog is on starting up */
#define SD_RES_VDI_NOT_LOCKED   0x10 /* Vdi is not locked */
#define SD_RES_SHUTDOWN         0x11 /* Sheepdog is shutting down */
#define SD_RES_NO_MEM           0x12 /* Cannot allocate memory */
#define SD_RES_FULL_VDI         0x13 /* we already have the maximum vdis */
#define SD_RES_VER_MISMATCH     0x14 /* Protocol version mismatch */
#define SD_RES_NO_SPACE         0x15 /* Server has no room for new objects */
#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN    0x17 /* Waiting for other nodes joining */
#define SD_RES_JOIN_FAILED      0x18 /* Target node had failed to join sheepdog */
#define SD_RES_HALT             0x19 /* Sheepdog is stopped serving IO request */
#define SD_RES_READONLY         0x1A /* Object is read-only */

/*
 * Object ID rules
 *
 *  0 - 19 (20 bits): data object space
 * 20 - 31 (12 bits): reserved data object space
 * 32 - 55 (24 bits): vdi object space
 * 56 - 59 ( 4 bits): reserved vdi object space
 * 60 - 63 ( 4 bits): object type identifier space
 */

#define VDI_SPACE_SHIFT   32
#define VDI_BIT (UINT64_C(1) << 63)
#define VMSTATE_BIT (UINT64_C(1) << 62)
#define MAX_DATA_OBJS (UINT64_C(1) << 20)
#define MAX_CHILDREN 1024
#define SD_MAX_VDI_LEN 256
#define SD_MAX_VDI_TAG_LEN 256
#define SD_NR_VDIS   (1U << 24)
#define SD_DATA_OBJ_SIZE   (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22
/*
 * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
 * (SD_EC_MAX_STRIP - 1) for parity strips
 *
 * SD_MAX_COPIES is sum of number of data strips and parity strips.
 */
#define SD_EC_MAX_STRIP 16
#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1)

#define SD_INODE_SIZE (sizeof(SheepdogInode))
#define CURRENT_VDI_ID 0

#define LOCK_TYPE_NORMAL 0
#define LOCK_TYPE_SHARED 1       /* for iSCSI multipath */
112
33b1db1c
MK
/*
 * Generic request header.  do_co_req() sends sizeof(SheepdogReq) bytes of
 * this structure over the socket and receives the response header into the
 * same buffer, so the field order and sizes must not change.
 */
typedef struct SheepdogReq {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;        /* length of the payload after the header */
    uint32_t    opcode_specific[8]; /* interpreted per opcode */
} SheepdogReq;
122
/*
 * Generic response header.  It starts with the same fields as SheepdogReq
 * and adds a result code (one of the SD_RES_* values).
 */
typedef struct SheepdogRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint32_t    result;
    uint32_t    opcode_specific[7]; /* interpreted per opcode */
} SheepdogRsp;
133
/*
 * Request header for object operations.  Same leading fields as
 * SheepdogReq; the opcode-specific area holds the object IDs, the copy
 * settings and the offset within the object.
 */
typedef struct SheepdogObjReq {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint64_t    oid;
    uint64_t    cow_oid;
    uint8_t     copies;
    uint8_t     copy_policy;
    uint8_t     reserved[6];
    uint64_t    offset;
} SheepdogObjReq;
148
/*
 * Response header for object operations.  aio_read_response() reads one
 * of these per completed request and dispatches on its id and result.
 */
typedef struct SheepdogObjRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint32_t    result;
    uint8_t     copies;
    uint8_t     copy_policy;
    uint8_t     reserved[2];
    uint32_t    pad[6];
} SheepdogObjRsp;
162
/*
 * Request header for VDI operations.  Same leading fields as SheepdogReq;
 * the opcode-specific area describes the VDI being operated on.
 */
typedef struct SheepdogVdiReq {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint64_t    vdi_size;
    uint32_t    base_vdi_id;
    uint8_t     copies;
    uint8_t     copy_policy;
    uint8_t     store_policy;
    uint8_t     block_size_shift;
    uint32_t    snapid;
    uint32_t    type;
    uint32_t    pad[2];
} SheepdogVdiReq;
180
/*
 * Response header for VDI operations; carries the result code and the
 * VDI id.
 */
typedef struct SheepdogVdiRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint32_t    result;
    uint32_t    rsvd;
    uint32_t    vdi_id;
    uint32_t    pad[5];
} SheepdogVdiRsp;
193
876eb1b0
TI
/*
 * Response header carrying cluster-wide settings (number of copies, copy
 * policy and block size shift).  Presumably the reply to
 * SD_OP_GET_CLUSTER_DEFAULT -- TODO confirm against the caller.
 */
typedef struct SheepdogClusterRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint32_t    result;
    uint8_t     nr_copies;
    uint8_t     copy_policy;
    uint8_t     block_size_shift;
    uint8_t     __pad1;
    uint32_t    __pad2[6];
} SheepdogClusterRsp;
208
33b1db1c
MK
209typedef struct SheepdogInode {
210 char name[SD_MAX_VDI_LEN];
211 char tag[SD_MAX_VDI_TAG_LEN];
212 uint64_t ctime;
213 uint64_t snap_ctime;
214 uint64_t vm_clock_nsec;
215 uint64_t vdi_size;
216 uint64_t vm_state_size;
217 uint16_t copy_policy;
218 uint8_t nr_copies;
219 uint8_t block_size_shift;
220 uint32_t snap_id;
221 uint32_t vdi_id;
222 uint32_t parent_vdi_id;
223 uint32_t child_vdi_id[MAX_CHILDREN];
224 uint32_t data_vdi_id[MAX_DATA_OBJS];
225} SheepdogInode;
226
5d039bab
HM
227#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id)
228
33b1db1c
MK
/*
 * 64 bit FNV-1a non-zero initial basis
 */
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)

/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Folds @len bytes starting at @buf into the running hash @hval and
 * returns the updated hash.  Pass FNV1A_64_INIT as @hval for the first
 * buffer.  With @len == 0, @hval is returned unchanged.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *cur = buf;
    const unsigned char *end = cur + len;

    for (; cur < end; cur++) {
        hval ^= (uint64_t)*cur;
        /* multiply by the 64 bit FNV prime: 2^40 + 2^8 + 0xb3 */
        hval *= UINT64_C(0x100000001b3);
    }
    return hval;
}
248
2f536801 249static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
33b1db1c
MK
250{
251 return inode->vdi_id == inode->data_vdi_id[idx];
252}
253
2f536801 254static inline bool is_data_obj(uint64_t oid)
33b1db1c
MK
255{
256 return !(VDI_BIT & oid);
257}
258
259static inline uint64_t data_oid_to_idx(uint64_t oid)
260{
261 return oid & (MAX_DATA_OBJS - 1);
262}
263
72e0996c
MK
264static inline uint32_t oid_to_vid(uint64_t oid)
265{
266 return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
267}
268
33b1db1c
MK
269static inline uint64_t vid_to_vdi_oid(uint32_t vid)
270{
271 return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
272}
273
274static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
275{
276 return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
277}
278
279static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
280{
281 return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
282}
283
2f536801 284static inline bool is_snapshot(struct SheepdogInode *inode)
33b1db1c
MK
285{
286 return !!inode->snap_ctime;
287}
288
eab8eb8d
VT
289static inline size_t count_data_objs(const struct SheepdogInode *inode)
290{
291 return DIV_ROUND_UP(inode->vdi_size,
292 (1UL << inode->block_size_shift));
293}
294
/*
 * Debug tracing.  DPRINTF() prints to stderr, prefixed with the calling
 * function and line, only when DEBUG_SDOG is defined at build time.  The
 * call is always compiled, so format strings stay type-checked even when
 * tracing is disabled.
 */
#undef DPRINTF
#ifdef DEBUG_SDOG
#define DEBUG_SDOG_PRINT 1
#else
#define DEBUG_SDOG_PRINT 0
#endif
#define DPRINTF(fmt, args...)                                           \
    do {                                                                \
        if (DEBUG_SDOG_PRINT) {                                         \
            fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
        }                                                               \
    } while (0)
33b1db1c
MK
307
308typedef struct SheepdogAIOCB SheepdogAIOCB;
28ddd08c 309typedef struct BDRVSheepdogState BDRVSheepdogState;
33b1db1c
MK
310
311typedef struct AIOReq {
312 SheepdogAIOCB *aiocb;
313 unsigned int iov_offset;
314
315 uint64_t oid;
316 uint64_t base_oid;
317 uint64_t offset;
318 unsigned int data_len;
319 uint8_t flags;
320 uint32_t id;
b544c1ab 321 bool create;
33b1db1c 322
c292ee6a 323 QLIST_ENTRY(AIOReq) aio_siblings;
33b1db1c
MK
324} AIOReq;
325
/* Kind of operation a SheepdogAIOCB performs (stored in aiocb_type). */
enum AIOCBState {
    AIOCB_WRITE_UDATA,
    AIOCB_READ_UDATA,
    AIOCB_FLUSH_CACHE,
    AIOCB_DISCARD_OBJ,
};
332
/*
 * True if the affected data object index ranges of the two AIOCBs @x and
 * @y (pointers) intersect.  Both bounds are inclusive.  The arguments are
 * parenthesized so that arbitrary pointer expressions can be passed.
 */
#define AIOCBOverlapping(x, y)                                 \
    (!((x)->max_affect_data_idx < (y)->min_affect_data_idx     \
       || (y)->max_affect_data_idx < (x)->min_affect_data_idx))
336
33b1db1c 337struct SheepdogAIOCB {
28ddd08c 338 BDRVSheepdogState *s;
33b1db1c
MK
339
340 QEMUIOVector *qiov;
341
342 int64_t sector_num;
343 int nb_sectors;
344
345 int ret;
346 enum AIOCBState aiocb_type;
347
2df46246 348 Coroutine *coroutine;
1d732d7d 349 int nr_pending;
6a55c82c
HM
350
351 uint32_t min_affect_data_idx;
352 uint32_t max_affect_data_idx;
353
498f2140
HM
354 /*
355 * The difference between affect_data_idx and dirty_data_idx:
356 * affect_data_idx represents range of index of all request types.
357 * dirty_data_idx represents range of index updated by COW requests.
358 * dirty_data_idx is used for updating an inode object.
359 */
360 uint32_t min_dirty_data_idx;
361 uint32_t max_dirty_data_idx;
362
6a55c82c 363 QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
33b1db1c
MK
364};
365
28ddd08c 366struct BDRVSheepdogState {
011603ca 367 BlockDriverState *bs;
84390bed 368 AioContext *aio_context;
011603ca 369
33b1db1c
MK
370 SheepdogInode inode;
371
33b1db1c 372 char name[SD_MAX_VDI_LEN];
2f536801 373 bool is_snapshot;
0e7106d8 374 uint32_t cache_flags;
cac8f4a6 375 bool discard_supported;
33b1db1c 376
25af257d 377 char *host_spec;
1b8bbb46 378 bool is_unix;
33b1db1c
MK
379 int fd;
380
2df46246
MK
381 CoMutex lock;
382 Coroutine *co_send;
383 Coroutine *co_recv;
384
33b1db1c 385 uint32_t aioreq_seq_num;
011603ca
MK
386
387 /* Every aio request must be linked to either of these queues. */
c292ee6a 388 QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
011603ca 389 QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
6a55c82c 390
498f2140 391 CoQueue overlapping_queue;
6a55c82c 392 QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
28ddd08c 393};
33b1db1c 394
4da65c80
LY
/* Holds a socket descriptor and cache flags for a reopen operation. */
typedef struct BDRVSheepdogReopenState {
    int fd;
    int cache_flags;
} BDRVSheepdogReopenState;
399
33b1db1c
MK
400static const char * sd_strerror(int err)
401{
402 int i;
403
404 static const struct {
405 int err;
406 const char *desc;
407 } errors[] = {
408 {SD_RES_SUCCESS, "Success"},
409 {SD_RES_UNKNOWN, "Unknown error"},
410 {SD_RES_NO_OBJ, "No object found"},
411 {SD_RES_EIO, "I/O error"},
412 {SD_RES_VDI_EXIST, "VDI exists already"},
413 {SD_RES_INVALID_PARMS, "Invalid parameters"},
414 {SD_RES_SYSTEM_ERROR, "System error"},
415 {SD_RES_VDI_LOCKED, "VDI is already locked"},
416 {SD_RES_NO_VDI, "No vdi found"},
417 {SD_RES_NO_BASE_VDI, "No base VDI found"},
418 {SD_RES_VDI_READ, "Failed read the requested VDI"},
419 {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
420 {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
421 {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
422 {SD_RES_NO_TAG, "Failed to find the requested tag"},
423 {SD_RES_STARTUP, "The system is still booting"},
424 {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
425 {SD_RES_SHUTDOWN, "The system is shutting down"},
426 {SD_RES_NO_MEM, "Out of memory on the server"},
427 {SD_RES_FULL_VDI, "We already have the maximum vdis"},
428 {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
429 {SD_RES_NO_SPACE, "Server has no space for new objects"},
430 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
431 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
432 {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
fca23f0a 433 {SD_RES_HALT, "Sheepdog is stopped serving IO request"},
6a0b5490 434 {SD_RES_READONLY, "Object is read-only"},
33b1db1c
MK
435 };
436
437 for (i = 0; i < ARRAY_SIZE(errors); ++i) {
438 if (errors[i].err == err) {
439 return errors[i].desc;
440 }
441 }
442
443 return "Invalid error code";
444}
445
/*
 * Sheepdog I/O handling:
 *
 * 1. In sd_co_rw_vector, we send the I/O requests to the server and
 *    link the requests to the inflight_aio_head list in the
 *    BDRVSheepdogState.  The function yields while waiting for
 *    the response.
 *
 * 2. We receive the response in aio_read_response, the fd handler for
 *    the sheepdog connection.  We switch back to sd_co_readv/sd_co_writev
 *    after all the requests belonging to the AIOCB are finished.  If
 *    needed, sd_co_writev will send another request for the vdi object.
 */
459
460static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
461 uint64_t oid, unsigned int data_len,
b544c1ab 462 uint64_t offset, uint8_t flags, bool create,
33b1db1c
MK
463 uint64_t base_oid, unsigned int iov_offset)
464{
465 AIOReq *aio_req;
466
7267c094 467 aio_req = g_malloc(sizeof(*aio_req));
33b1db1c
MK
468 aio_req->aiocb = acb;
469 aio_req->iov_offset = iov_offset;
470 aio_req->oid = oid;
471 aio_req->base_oid = base_oid;
472 aio_req->offset = offset;
473 aio_req->data_len = data_len;
474 aio_req->flags = flags;
475 aio_req->id = s->aioreq_seq_num++;
b544c1ab 476 aio_req->create = create;
33b1db1c 477
1d732d7d 478 acb->nr_pending++;
33b1db1c
MK
479 return aio_req;
480}
481
acf6e5f0
PB
482static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
483{
484 SheepdogAIOCB *cb;
485
486retry:
487 QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
488 if (AIOCBOverlapping(acb, cb)) {
1ace7cea 489 qemu_co_queue_wait(&s->overlapping_queue, NULL);
acf6e5f0
PB
490 goto retry;
491 }
492 }
493}
494
28ddd08c
PB
495static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
496 QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
497 int type)
33b1db1c 498{
6a55c82c 499 uint32_t object_size;
6a55c82c
HM
500
501 object_size = (UINT32_C(1) << s->inode.block_size_shift);
33b1db1c 502
28ddd08c 503 acb->s = s;
33b1db1c
MK
504
505 acb->qiov = qiov;
506
507 acb->sector_num = sector_num;
508 acb->nb_sectors = nb_sectors;
509
2df46246 510 acb->coroutine = qemu_coroutine_self();
33b1db1c 511 acb->ret = 0;
1d732d7d 512 acb->nr_pending = 0;
6a55c82c
HM
513
514 acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
515 acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
516 acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;
517
498f2140
HM
518 acb->min_dirty_data_idx = UINT32_MAX;
519 acb->max_dirty_data_idx = 0;
28ddd08c 520 acb->aiocb_type = type;
acf6e5f0
PB
521
522 if (type == AIOCB_FLUSH_CACHE) {
523 return;
524 }
525
526 wait_for_overlapping_aiocb(s, acb);
527 QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
33b1db1c
MK
528}
529
833a7cc3 530/* Return -EIO in case of error, file descriptor on success */
dfb12bf8 531static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
33b1db1c 532{
25af257d 533 int fd;
33b1db1c 534
1b8bbb46 535 if (s->is_unix) {
dfb12bf8 536 fd = unix_connect(s->host_spec, errp);
1b8bbb46 537 } else {
dfb12bf8 538 fd = inet_connect(s->host_spec, errp);
1b8bbb46 539
dfb12bf8 540 if (fd >= 0) {
1b8bbb46
MK
541 int ret = socket_set_nodelay(fd);
542 if (ret < 0) {
543 error_report("%s", strerror(errno));
544 }
545 }
546 }
33b1db1c 547
dfb12bf8 548 if (fd >= 0) {
f9e8cacc 549 qemu_set_nonblock(fd);
833a7cc3
LY
550 } else {
551 fd = -EIO;
33b1db1c
MK
552 }
553
33b1db1c
MK
554 return fd;
555}
556
833a7cc3 557/* Return 0 on success and -errno in case of error */
e0d93a89
MK
558static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
559 unsigned int *wlen)
47622c44
LY
560{
561 int ret;
562
563 ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
80731d9d 564 if (ret != sizeof(*hdr)) {
47622c44 565 error_report("failed to send a req, %s", strerror(errno));
b16a44e1 566 return -errno;
47622c44
LY
567 }
568
569 ret = qemu_co_send(sockfd, data, *wlen);
80731d9d 570 if (ret != *wlen) {
47622c44 571 error_report("failed to send a req, %s", strerror(errno));
b16a44e1 572 return -errno;
47622c44
LY
573 }
574
575 return ret;
576}
e0d93a89 577
cddd4ac7
MK
578typedef struct SheepdogReqCo {
579 int sockfd;
f11672db 580 BlockDriverState *bs;
84390bed 581 AioContext *aio_context;
cddd4ac7
MK
582 SheepdogReq *hdr;
583 void *data;
584 unsigned int *wlen;
585 unsigned int *rlen;
586 int ret;
587 bool finished;
9d456654 588 Coroutine *co;
cddd4ac7
MK
589} SheepdogReqCo;
590
9d456654
PB
591static void restart_co_req(void *opaque)
592{
593 SheepdogReqCo *srco = opaque;
594
595 aio_co_wake(srco->co);
596}
597
cddd4ac7 598static coroutine_fn void do_co_req(void *opaque)
47622c44
LY
599{
600 int ret;
cddd4ac7
MK
601 SheepdogReqCo *srco = opaque;
602 int sockfd = srco->sockfd;
603 SheepdogReq *hdr = srco->hdr;
604 void *data = srco->data;
605 unsigned int *wlen = srco->wlen;
606 unsigned int *rlen = srco->rlen;
2dfcca3b 607
9d456654 608 srco->co = qemu_coroutine_self();
dca21ef2 609 aio_set_fd_handler(srco->aio_context, sockfd, false,
9d456654 610 NULL, restart_co_req, NULL, srco);
47622c44 611
47622c44
LY
612 ret = send_co_req(sockfd, hdr, data, wlen);
613 if (ret < 0) {
614 goto out;
615 }
616
dca21ef2 617 aio_set_fd_handler(srco->aio_context, sockfd, false,
9d456654 618 restart_co_req, NULL, NULL, srco);
2dfcca3b 619
47622c44 620 ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
80731d9d 621 if (ret != sizeof(*hdr)) {
47622c44 622 error_report("failed to get a rsp, %s", strerror(errno));
cb595887 623 ret = -errno;
47622c44
LY
624 goto out;
625 }
626
627 if (*rlen > hdr->data_length) {
628 *rlen = hdr->data_length;
629 }
630
631 if (*rlen) {
632 ret = qemu_co_recv(sockfd, data, *rlen);
80731d9d 633 if (ret != *rlen) {
47622c44 634 error_report("failed to get the data, %s", strerror(errno));
cb595887 635 ret = -errno;
47622c44
LY
636 goto out;
637 }
638 }
639 ret = 0;
640out:
ed9ba724
MK
641 /* there is at most one request for this sockfd, so it is safe to
642 * set each handler to NULL. */
dca21ef2 643 aio_set_fd_handler(srco->aio_context, sockfd, false,
f6a51c84 644 NULL, NULL, NULL, NULL);
cddd4ac7 645
9d456654 646 srco->co = NULL;
cddd4ac7
MK
647 srco->ret = ret;
648 srco->finished = true;
c9d1a561
PB
649 if (srco->bs) {
650 bdrv_wakeup(srco->bs);
651 }
cddd4ac7
MK
652}
653
833a7cc3
LY
654/*
655 * Send the request to the sheep in a synchronous manner.
656 *
657 * Return 0 on success, -errno in case of error.
658 */
f11672db 659static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
84390bed 660 void *data, unsigned int *wlen, unsigned int *rlen)
cddd4ac7
MK
661{
662 Coroutine *co;
663 SheepdogReqCo srco = {
664 .sockfd = sockfd,
f11672db
PB
665 .aio_context = bs ? bdrv_get_aio_context(bs) : qemu_get_aio_context(),
666 .bs = bs,
cddd4ac7
MK
667 .hdr = hdr,
668 .data = data,
669 .wlen = wlen,
670 .rlen = rlen,
671 .ret = 0,
672 .finished = false,
673 };
674
675 if (qemu_in_coroutine()) {
676 do_co_req(&srco);
677 } else {
0b8b8753 678 co = qemu_coroutine_create(do_co_req, &srco);
f11672db
PB
679 if (bs) {
680 qemu_coroutine_enter(co);
681 BDRV_POLL_WHILE(bs, !srco.finished);
682 } else {
683 qemu_coroutine_enter(co);
684 while (!srco.finished) {
685 aio_poll(qemu_get_aio_context(), true);
686 }
cddd4ac7
MK
687 }
688 }
689
690 return srco.ret;
47622c44
LY
691}
692
a37dcdf9 693static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
b544c1ab
HM
694 struct iovec *iov, int niov,
695 enum AIOCBState aiocb_type);
a37dcdf9 696static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
72e0996c 697static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
356b4ca2 698static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
011603ca 699static void co_write_request(void *opaque);
7dc1cde0 700
011603ca
MK
701static coroutine_fn void reconnect_to_sdog(void *opaque)
702{
703 BDRVSheepdogState *s = opaque;
704 AIOReq *aio_req, *next;
705
dca21ef2 706 aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
f6a51c84 707 NULL, NULL, NULL);
011603ca
MK
708 close(s->fd);
709 s->fd = -1;
710
711 /* Wait for outstanding write requests to be completed. */
712 while (s->co_send != NULL) {
713 co_write_request(opaque);
714 }
715
716 /* Try to reconnect the sheepdog server every one second. */
717 while (s->fd < 0) {
a780dea0 718 Error *local_err = NULL;
356b4ca2 719 s->fd = get_sheep_fd(s, &local_err);
011603ca
MK
720 if (s->fd < 0) {
721 DPRINTF("Wait for connection to be established\n");
565f65d2 722 error_report_err(local_err);
011603ca
MK
723 co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME,
724 1000000000ULL);
725 }
726 };
727
728 /*
729 * Now we have to resend all the request in the inflight queue. However,
730 * resend_aioreq() can yield and newly created requests can be added to the
731 * inflight queue before the coroutine is resumed. To avoid mixing them, we
732 * have to move all the inflight requests to the failed queue before
733 * resend_aioreq() is called.
734 */
735 QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) {
736 QLIST_REMOVE(aio_req, aio_siblings);
737 QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings);
738 }
739
740 /* Resend all the failed aio requests. */
741 while (!QLIST_EMPTY(&s->failed_aio_head)) {
742 aio_req = QLIST_FIRST(&s->failed_aio_head);
743 QLIST_REMOVE(aio_req, aio_siblings);
011603ca
MK
744 resend_aioreq(s, aio_req);
745 }
746}
747
33b1db1c
MK
748/*
749 * Receive responses of the I/O requests.
750 *
751 * This function is registered as a fd handler, and called from the
752 * main loop when s->fd is ready for reading responses.
753 */
d8716b41 754static void coroutine_fn aio_read_response(void *opaque)
33b1db1c
MK
755{
756 SheepdogObjRsp rsp;
757 BDRVSheepdogState *s = opaque;
758 int fd = s->fd;
759 int ret;
760 AIOReq *aio_req = NULL;
761 SheepdogAIOCB *acb;
cac8f4a6 762 uint64_t idx;
33b1db1c 763
33b1db1c 764 /* read a header */
8c5135f9 765 ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
80731d9d 766 if (ret != sizeof(rsp)) {
6daf194d 767 error_report("failed to get the header, %s", strerror(errno));
011603ca 768 goto err;
33b1db1c
MK
769 }
770
c292ee6a
MK
771 /* find the right aio_req from the inflight aio list */
772 QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
33b1db1c
MK
773 if (aio_req->id == rsp.id) {
774 break;
775 }
776 }
777 if (!aio_req) {
6daf194d 778 error_report("cannot find aio_req %x", rsp.id);
011603ca 779 goto err;
33b1db1c
MK
780 }
781
782 acb = aio_req->aiocb;
783
784 switch (acb->aiocb_type) {
785 case AIOCB_WRITE_UDATA:
786 if (!is_data_obj(aio_req->oid)) {
787 break;
788 }
789 idx = data_oid_to_idx(aio_req->oid);
790
b544c1ab 791 if (aio_req->create) {
33b1db1c
MK
792 /*
793 * If the object is newly created one, we need to update
794 * the vdi object (metadata object). min_dirty_data_idx
795 * and max_dirty_data_idx are changed to include updated
796 * index between them.
797 */
bd751f22
LY
798 if (rsp.result == SD_RES_SUCCESS) {
799 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
498f2140
HM
800 acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx);
801 acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx);
bd751f22 802 }
33b1db1c
MK
803 }
804 break;
805 case AIOCB_READ_UDATA:
2fc8ae1d
MT
806 ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
807 aio_req->iov_offset, rsp.data_length);
80731d9d 808 if (ret != rsp.data_length) {
6daf194d 809 error_report("failed to get the data, %s", strerror(errno));
011603ca 810 goto err;
33b1db1c
MK
811 }
812 break;
47783072
LY
813 case AIOCB_FLUSH_CACHE:
814 if (rsp.result == SD_RES_INVALID_PARMS) {
2440a2c3 815 DPRINTF("disable cache since the server doesn't support it\n");
47783072
LY
816 s->cache_flags = SD_FLAG_CMD_DIRECT;
817 rsp.result = SD_RES_SUCCESS;
818 }
819 break;
cac8f4a6
LY
820 case AIOCB_DISCARD_OBJ:
821 switch (rsp.result) {
822 case SD_RES_INVALID_PARMS:
823 error_report("sheep(%s) doesn't support discard command",
824 s->host_spec);
825 rsp.result = SD_RES_SUCCESS;
826 s->discard_supported = false;
827 break;
cac8f4a6
LY
828 default:
829 break;
830 }
33b1db1c
MK
831 }
832
e80ab33d
PB
833 /* No more data for this aio_req (reload_inode below uses its own file
834 * descriptor handler which doesn't use co_recv).
835 */
836 s->co_recv = NULL;
837
c4080e93 838 QLIST_REMOVE(aio_req, aio_siblings);
13c31de2
MK
839 switch (rsp.result) {
840 case SD_RES_SUCCESS:
841 break;
842 case SD_RES_READONLY:
72e0996c
MK
843 if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) {
844 ret = reload_inode(s, 0, "");
845 if (ret < 0) {
011603ca 846 goto err;
72e0996c
MK
847 }
848 }
72e0996c
MK
849 if (is_data_obj(aio_req->oid)) {
850 aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
851 data_oid_to_idx(aio_req->oid));
852 } else {
853 aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
854 }
a37dcdf9 855 resend_aioreq(s, aio_req);
e80ab33d 856 return;
13c31de2 857 default:
33b1db1c 858 acb->ret = -EIO;
6daf194d 859 error_report("%s", sd_strerror(rsp.result));
13c31de2 860 break;
33b1db1c
MK
861 }
862
c4080e93
PB
863 g_free(aio_req);
864
865 if (!--acb->nr_pending) {
33b1db1c
MK
866 /*
867 * We've finished all requests which belong to the AIOCB, so
2df46246 868 * we can switch back to sd_co_readv/writev now.
33b1db1c 869 */
9d456654 870 aio_co_wake(acb->coroutine);
33b1db1c 871 }
e80ab33d 872
011603ca 873 return;
e80ab33d 874
011603ca 875err:
011603ca 876 reconnect_to_sdog(opaque);
2df46246
MK
877}
878
879static void co_read_response(void *opaque)
880{
881 BDRVSheepdogState *s = opaque;
882
883 if (!s->co_recv) {
0b8b8753 884 s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
2df46246
MK
885 }
886
9d456654 887 aio_co_wake(s->co_recv);
2df46246
MK
888}
889
890static void co_write_request(void *opaque)
891{
892 BDRVSheepdogState *s = opaque;
893
9d456654 894 aio_co_wake(s->co_send);
33b1db1c
MK
895}
896
33b1db1c 897/*
dc6fb73d 898 * Return a socket descriptor to read/write objects.
33b1db1c 899 *
dc6fb73d 900 * We cannot use this descriptor for other operations because
33b1db1c
MK
901 * the block driver may be on waiting response from the server.
902 */
356b4ca2 903static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
33b1db1c 904{
1b8bbb46 905 int fd;
33b1db1c 906
356b4ca2 907 fd = connect_to_sdog(s, errp);
33b1db1c 908 if (fd < 0) {
cb595887 909 return fd;
33b1db1c
MK
910 }
911
dca21ef2 912 aio_set_fd_handler(s->aio_context, fd, false,
f6a51c84 913 co_read_response, NULL, NULL, s);
33b1db1c
MK
914 return fd;
915}
916
89e2a31d
MA
917/*
918 * Parse numeric snapshot ID in @str
919 * If @str can't be parsed as number, return false.
920 * Else, if the number is zero or too large, set *@snapid to zero and
921 * return true.
922 * Else, set *@snapid to the number and return true.
923 */
924static bool sd_parse_snapid(const char *str, uint32_t *snapid)
925{
926 unsigned long ul;
927 int ret;
928
929 ret = qemu_strtoul(str, NULL, 10, &ul);
930 if (ret == -ERANGE) {
931 ul = ret = 0;
932 }
933 if (ret) {
934 return false;
935 }
936 if (ul > UINT32_MAX) {
937 ul = 0;
938 }
939
940 *snapid = ul;
941 return true;
942}
943
944static bool sd_parse_snapid_or_tag(const char *str,
945 uint32_t *snapid, char tag[])
946{
947 if (!sd_parse_snapid(str, snapid)) {
948 *snapid = 0;
949 if (g_strlcpy(tag, str, SD_MAX_VDI_TAG_LEN) >= SD_MAX_VDI_TAG_LEN) {
950 return false;
951 }
952 } else if (!*snapid) {
953 return false;
954 } else {
955 tag[0] = 0;
956 }
957 return true;
958}
959
5d6768e3
MK
960static int sd_parse_uri(BDRVSheepdogState *s, const char *filename,
961 char *vdi, uint32_t *snapid, char *tag)
962{
963 URI *uri;
964 QueryParams *qp = NULL;
965 int ret = 0;
966
967 uri = uri_parse(filename);
968 if (!uri) {
969 return -EINVAL;
970 }
971
1b8bbb46
MK
972 /* transport */
973 if (!strcmp(uri->scheme, "sheepdog")) {
974 s->is_unix = false;
975 } else if (!strcmp(uri->scheme, "sheepdog+tcp")) {
976 s->is_unix = false;
977 } else if (!strcmp(uri->scheme, "sheepdog+unix")) {
978 s->is_unix = true;
979 } else {
980 ret = -EINVAL;
981 goto out;
982 }
983
5d6768e3
MK
984 if (uri->path == NULL || !strcmp(uri->path, "/")) {
985 ret = -EINVAL;
986 goto out;
987 }
daa0b0d4
MA
988 if (g_strlcpy(vdi, uri->path + 1, SD_MAX_VDI_LEN) >= SD_MAX_VDI_LEN) {
989 ret = -EINVAL;
990 goto out;
991 }
5d6768e3 992
1b8bbb46
MK
993 qp = query_params_parse(uri->query);
994 if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) {
995 ret = -EINVAL;
996 goto out;
997 }
998
999 if (s->is_unix) {
1000 /* sheepdog+unix:///vdiname?socket=path */
1001 if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
1002 ret = -EINVAL;
1003 goto out;
1004 }
1005 s->host_spec = g_strdup(qp->p[0].value);
1006 } else {
1007 /* sheepdog[+tcp]://[host:port]/vdiname */
1008 s->host_spec = g_strdup_printf("%s:%d", uri->server ?: SD_DEFAULT_ADDR,
1009 uri->port ?: SD_DEFAULT_PORT);
1010 }
5d6768e3
MK
1011
1012 /* snapshot tag */
1013 if (uri->fragment) {
89e2a31d
MA
1014 if (!sd_parse_snapid_or_tag(uri->fragment, snapid, tag)) {
1015 ret = -EINVAL;
1016 goto out;
5d6768e3
MK
1017 }
1018 } else {
1019 *snapid = CURRENT_VDI_ID; /* search current vdi */
1020 }
1021
1022out:
1023 if (qp) {
1024 query_params_free(qp);
1025 }
1026 uri_free(uri);
1027 return ret;
1028}
1029
33b1db1c 1030/*
5d6768e3 1031 * Parse a filename (old syntax)
33b1db1c
MK
1032 *
1033 * filename must be one of the following formats:
1034 * 1. [vdiname]
1035 * 2. [vdiname]:[snapid]
1036 * 3. [vdiname]:[tag]
1037 * 4. [hostname]:[port]:[vdiname]
1038 * 5. [hostname]:[port]:[vdiname]:[snapid]
1039 * 6. [hostname]:[port]:[vdiname]:[tag]
1040 *
1041 * You can boot from the snapshot images by specifying `snapid` or
1042 * `tag'.
1043 *
1044 * You can run VMs outside the Sheepdog cluster by specifying
1045 * `hostname' and `port' (experimental).
1046 */
1047static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
1048 char *vdi, uint32_t *snapid, char *tag)
1049{
5d6768e3
MK
1050 char *p, *q, *uri;
1051 const char *host_spec, *vdi_spec;
1052 int nr_sep, ret;
33b1db1c 1053
11d816a5 1054 strstart(filename, "sheepdog:", &filename);
7267c094 1055 p = q = g_strdup(filename);
33b1db1c
MK
1056
1057 /* count the number of separators */
1058 nr_sep = 0;
1059 while (*p) {
1060 if (*p == ':') {
1061 nr_sep++;
1062 }
1063 p++;
1064 }
1065 p = q;
1066
5d6768e3 1067 /* use the first two tokens as host_spec. */
33b1db1c 1068 if (nr_sep >= 2) {
5d6768e3 1069 host_spec = p;
33b1db1c 1070 p = strchr(p, ':');
5d6768e3 1071 p++;
33b1db1c
MK
1072 p = strchr(p, ':');
1073 *p++ = '\0';
1074 } else {
5d6768e3 1075 host_spec = "";
33b1db1c
MK
1076 }
1077
5d6768e3 1078 vdi_spec = p;
33b1db1c 1079
5d6768e3 1080 p = strchr(vdi_spec, ':');
33b1db1c 1081 if (p) {
5d6768e3 1082 *p++ = '#';
33b1db1c
MK
1083 }
1084
5d6768e3 1085 uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);
33b1db1c 1086
5d6768e3
MK
1087 ret = sd_parse_uri(s, uri, vdi, snapid, tag);
1088
1089 g_free(q);
1090 g_free(uri);
1091
1092 return ret;
33b1db1c
MK
1093}
1094
982dcbf4
MK
1095static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
1096 uint32_t snapid, const char *tag, uint32_t *vid,
dc83cd42 1097 bool lock, Error **errp)
33b1db1c
MK
1098{
1099 int ret, fd;
1100 SheepdogVdiReq hdr;
1101 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1102 unsigned int wlen, rlen = 0;
1103 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
1104
dc83cd42 1105 fd = connect_to_sdog(s, errp);
33b1db1c 1106 if (fd < 0) {
cb595887 1107 return fd;
33b1db1c
MK
1108 }
1109
3178e275
JM
1110 /* This pair of strncpy calls ensures that the buffer is zero-filled,
1111 * which is desirable since we'll soon be sending those bytes, and
1112 * don't want the send_req to read uninitialized data.
1113 */
33b1db1c
MK
1114 strncpy(buf, filename, SD_MAX_VDI_LEN);
1115 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
1116
1117 memset(&hdr, 0, sizeof(hdr));
982dcbf4 1118 if (lock) {
33b1db1c 1119 hdr.opcode = SD_OP_LOCK_VDI;
1dbfafed 1120 hdr.type = LOCK_TYPE_NORMAL;
982dcbf4
MK
1121 } else {
1122 hdr.opcode = SD_OP_GET_VDI_INFO;
33b1db1c
MK
1123 }
1124 wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
1125 hdr.proto_ver = SD_PROTO_VER;
1126 hdr.data_length = wlen;
1127 hdr.snapid = snapid;
1128 hdr.flags = SD_FLAG_CMD_WRITE;
1129
f11672db 1130 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
33b1db1c 1131 if (ret) {
dc83cd42 1132 error_setg_errno(errp, -ret, "cannot get vdi info");
33b1db1c
MK
1133 goto out;
1134 }
1135
1136 if (rsp->result != SD_RES_SUCCESS) {
dc83cd42
MA
1137 error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s",
1138 sd_strerror(rsp->result), filename, snapid, tag);
cb595887
MK
1139 if (rsp->result == SD_RES_NO_VDI) {
1140 ret = -ENOENT;
38890b24
HM
1141 } else if (rsp->result == SD_RES_VDI_LOCKED) {
1142 ret = -EBUSY;
cb595887
MK
1143 } else {
1144 ret = -EIO;
1145 }
33b1db1c
MK
1146 goto out;
1147 }
1148 *vid = rsp->vdi_id;
1149
1150 ret = 0;
1151out:
1152 closesocket(fd);
1153 return ret;
1154}
1155
a37dcdf9 1156static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
b544c1ab
HM
1157 struct iovec *iov, int niov,
1158 enum AIOCBState aiocb_type)
33b1db1c
MK
1159{
1160 int nr_copies = s->inode.nr_copies;
1161 SheepdogObjReq hdr;
47783072 1162 unsigned int wlen = 0;
33b1db1c
MK
1163 int ret;
1164 uint64_t oid = aio_req->oid;
1165 unsigned int datalen = aio_req->data_len;
1166 uint64_t offset = aio_req->offset;
1167 uint8_t flags = aio_req->flags;
1168 uint64_t old_oid = aio_req->base_oid;
b544c1ab 1169 bool create = aio_req->create;
33b1db1c 1170
c4080e93
PB
1171 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
1172
33b1db1c 1173 if (!nr_copies) {
6daf194d 1174 error_report("bug");
33b1db1c
MK
1175 }
1176
1177 memset(&hdr, 0, sizeof(hdr));
1178
47783072
LY
1179 switch (aiocb_type) {
1180 case AIOCB_FLUSH_CACHE:
1181 hdr.opcode = SD_OP_FLUSH_VDI;
1182 break;
1183 case AIOCB_READ_UDATA:
33b1db1c
MK
1184 hdr.opcode = SD_OP_READ_OBJ;
1185 hdr.flags = flags;
47783072
LY
1186 break;
1187 case AIOCB_WRITE_UDATA:
1188 if (create) {
1189 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1190 } else {
1191 hdr.opcode = SD_OP_WRITE_OBJ;
1192 }
33b1db1c 1193 wlen = datalen;
33b1db1c 1194 hdr.flags = SD_FLAG_CMD_WRITE | flags;
47783072 1195 break;
cac8f4a6 1196 case AIOCB_DISCARD_OBJ:
e6fd57ea
HM
1197 hdr.opcode = SD_OP_WRITE_OBJ;
1198 hdr.flags = SD_FLAG_CMD_WRITE | flags;
1199 s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0;
1200 offset = offsetof(SheepdogInode,
1201 data_vdi_id[data_oid_to_idx(oid)]);
1202 oid = vid_to_vdi_oid(s->inode.vdi_id);
1203 wlen = datalen = sizeof(uint32_t);
cac8f4a6 1204 break;
33b1db1c
MK
1205 }
1206
0e7106d8
LY
1207 if (s->cache_flags) {
1208 hdr.flags |= s->cache_flags;
47622c44
LY
1209 }
1210
33b1db1c
MK
1211 hdr.oid = oid;
1212 hdr.cow_oid = old_oid;
1213 hdr.copies = s->inode.nr_copies;
1214
1215 hdr.data_length = datalen;
1216 hdr.offset = offset;
1217
1218 hdr.id = aio_req->id;
1219
2df46246
MK
1220 qemu_co_mutex_lock(&s->lock);
1221 s->co_send = qemu_coroutine_self();
dca21ef2 1222 aio_set_fd_handler(s->aio_context, s->fd, false,
f6a51c84 1223 co_read_response, co_write_request, NULL, s);
128aa589 1224 socket_set_cork(s->fd, 1);
33b1db1c
MK
1225
1226 /* send a header */
8c5135f9 1227 ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
80731d9d 1228 if (ret != sizeof(hdr)) {
6daf194d 1229 error_report("failed to send a req, %s", strerror(errno));
011603ca 1230 goto out;
33b1db1c
MK
1231 }
1232
1233 if (wlen) {
2fc8ae1d 1234 ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen);
80731d9d 1235 if (ret != wlen) {
6daf194d 1236 error_report("failed to send a data, %s", strerror(errno));
33b1db1c
MK
1237 }
1238 }
011603ca 1239out:
128aa589 1240 socket_set_cork(s->fd, 0);
dca21ef2 1241 aio_set_fd_handler(s->aio_context, s->fd, false,
f6a51c84 1242 co_read_response, NULL, NULL, s);
011603ca 1243 s->co_send = NULL;
2df46246 1244 qemu_co_mutex_unlock(&s->lock);
33b1db1c
MK
1245}
1246
f11672db 1247static int read_write_object(int fd, BlockDriverState *bs, char *buf,
84390bed 1248 uint64_t oid, uint8_t copies,
33b1db1c 1249 unsigned int datalen, uint64_t offset,
0e7106d8 1250 bool write, bool create, uint32_t cache_flags)
33b1db1c
MK
1251{
1252 SheepdogObjReq hdr;
1253 SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1254 unsigned int wlen, rlen;
1255 int ret;
1256
1257 memset(&hdr, 0, sizeof(hdr));
1258
1259 if (write) {
1260 wlen = datalen;
1261 rlen = 0;
1262 hdr.flags = SD_FLAG_CMD_WRITE;
1263 if (create) {
1264 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1265 } else {
1266 hdr.opcode = SD_OP_WRITE_OBJ;
1267 }
1268 } else {
1269 wlen = 0;
1270 rlen = datalen;
1271 hdr.opcode = SD_OP_READ_OBJ;
1272 }
47622c44 1273
0e7106d8 1274 hdr.flags |= cache_flags;
47622c44 1275
33b1db1c
MK
1276 hdr.oid = oid;
1277 hdr.data_length = datalen;
1278 hdr.offset = offset;
1279 hdr.copies = copies;
1280
f11672db 1281 ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
33b1db1c 1282 if (ret) {
6daf194d 1283 error_report("failed to send a request to the sheep");
cb595887 1284 return ret;
33b1db1c
MK
1285 }
1286
1287 switch (rsp->result) {
1288 case SD_RES_SUCCESS:
1289 return 0;
1290 default:
6daf194d 1291 error_report("%s", sd_strerror(rsp->result));
cb595887 1292 return -EIO;
33b1db1c
MK
1293 }
1294}
1295
f11672db 1296static int read_object(int fd, BlockDriverState *bs, char *buf,
84390bed 1297 uint64_t oid, uint8_t copies,
0e7106d8
LY
1298 unsigned int datalen, uint64_t offset,
1299 uint32_t cache_flags)
33b1db1c 1300{
f11672db 1301 return read_write_object(fd, bs, buf, oid, copies,
84390bed 1302 datalen, offset, false,
0e7106d8 1303 false, cache_flags);
33b1db1c
MK
1304}
1305
f11672db 1306static int write_object(int fd, BlockDriverState *bs, char *buf,
84390bed 1307 uint64_t oid, uint8_t copies,
2f536801 1308 unsigned int datalen, uint64_t offset, bool create,
0e7106d8 1309 uint32_t cache_flags)
33b1db1c 1310{
f11672db 1311 return read_write_object(fd, bs, buf, oid, copies,
84390bed 1312 datalen, offset, true,
0e7106d8 1313 create, cache_flags);
33b1db1c
MK
1314}
1315
9ff53a0e
MK
1316/* update inode with the latest state */
1317static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
1318{
dfb12bf8 1319 Error *local_err = NULL;
9ff53a0e
MK
1320 SheepdogInode *inode;
1321 int ret = 0, fd;
1322 uint32_t vid = 0;
1323
dfb12bf8 1324 fd = connect_to_sdog(s, &local_err);
9ff53a0e 1325 if (fd < 0) {
565f65d2 1326 error_report_err(local_err);
9ff53a0e
MK
1327 return -EIO;
1328 }
1329
5d039bab 1330 inode = g_malloc(SD_INODE_HEADER_SIZE);
9ff53a0e 1331
dc83cd42 1332 ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err);
9ff53a0e 1333 if (ret) {
565f65d2 1334 error_report_err(local_err);
9ff53a0e
MK
1335 goto out;
1336 }
1337
f11672db 1338 ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid),
5d039bab
HM
1339 s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
1340 s->cache_flags);
9ff53a0e
MK
1341 if (ret < 0) {
1342 goto out;
1343 }
1344
1345 if (inode->vdi_id != s->inode.vdi_id) {
5d039bab 1346 memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE);
9ff53a0e
MK
1347 }
1348
1349out:
1350 g_free(inode);
1351 closesocket(fd);
1352
1353 return ret;
1354}
1355
a37dcdf9 1356static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
13c31de2
MK
1357{
1358 SheepdogAIOCB *acb = aio_req->aiocb;
b544c1ab
HM
1359
1360 aio_req->create = false;
13c31de2
MK
1361
1362 /* check whether this request becomes a CoW one */
2412aec7 1363 if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) {
13c31de2 1364 int idx = data_oid_to_idx(aio_req->oid);
13c31de2 1365
13c31de2
MK
1366 if (is_data_obj_writable(&s->inode, idx)) {
1367 goto out;
1368 }
1369
80308d33
MK
1370 if (s->inode.data_vdi_id[idx]) {
1371 aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
1372 aio_req->flags |= SD_FLAG_CMD_COW;
1373 }
b544c1ab 1374 aio_req->create = true;
13c31de2
MK
1375 }
1376out:
2412aec7 1377 if (is_data_obj(aio_req->oid)) {
b544c1ab 1378 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
a37dcdf9 1379 acb->aiocb_type);
2412aec7
MK
1380 } else {
1381 struct iovec iov;
1382 iov.iov_base = &s->inode;
1383 iov.iov_len = sizeof(s->inode);
b544c1ab 1384 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
2412aec7 1385 }
13c31de2
MK
1386}
1387
84390bed
SH
1388static void sd_detach_aio_context(BlockDriverState *bs)
1389{
1390 BDRVSheepdogState *s = bs->opaque;
1391
dca21ef2 1392 aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
f6a51c84 1393 NULL, NULL, NULL);
84390bed
SH
1394}
1395
1396static void sd_attach_aio_context(BlockDriverState *bs,
1397 AioContext *new_context)
1398{
1399 BDRVSheepdogState *s = bs->opaque;
1400
1401 s->aio_context = new_context;
dca21ef2 1402 aio_set_fd_handler(new_context, s->fd, false,
f6a51c84 1403 co_read_response, NULL, NULL, s);
84390bed
SH
1404}
1405
c8c96350
KW
1406/* TODO Convert to fine grained options */
1407static QemuOptsList runtime_opts = {
1408 .name = "sheepdog",
1409 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
1410 .desc = {
1411 {
1412 .name = "filename",
1413 .type = QEMU_OPT_STRING,
1414 .help = "URL to the sheepdog image",
1415 },
1416 { /* end of list */ }
1417 },
1418};
1419
015a1036
HR
1420static int sd_open(BlockDriverState *bs, QDict *options, int flags,
1421 Error **errp)
33b1db1c
MK
1422{
1423 int ret, fd;
1424 uint32_t vid = 0;
1425 BDRVSheepdogState *s = bs->opaque;
1426 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1427 uint32_t snapid;
1428 char *buf = NULL;
c8c96350
KW
1429 QemuOpts *opts;
1430 Error *local_err = NULL;
1431 const char *filename;
1432
011603ca 1433 s->bs = bs;
84390bed 1434 s->aio_context = bdrv_get_aio_context(bs);
011603ca 1435
87ea75d5 1436 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
c8c96350 1437 qemu_opts_absorb_qdict(opts, options, &local_err);
84d18f06 1438 if (local_err) {
e67c3993 1439 error_propagate(errp, local_err);
c8c96350 1440 ret = -EINVAL;
cbc488ee 1441 goto err_no_fd;
c8c96350
KW
1442 }
1443
1444 filename = qemu_opt_get(opts, "filename");
33b1db1c 1445
c292ee6a 1446 QLIST_INIT(&s->inflight_aio_head);
011603ca 1447 QLIST_INIT(&s->failed_aio_head);
6a55c82c 1448 QLIST_INIT(&s->inflight_aiocb_head);
33b1db1c
MK
1449 s->fd = -1;
1450
1451 memset(vdi, 0, sizeof(vdi));
1452 memset(tag, 0, sizeof(tag));
5d6768e3
MK
1453
1454 if (strstr(filename, "://")) {
1455 ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
1456 } else {
1457 ret = parse_vdiname(s, filename, vdi, &snapid, tag);
1458 }
1459 if (ret < 0) {
efde4b62 1460 error_setg(errp, "Can't parse filename");
cbc488ee 1461 goto err_no_fd;
33b1db1c 1462 }
e67c3993 1463 s->fd = get_sheep_fd(s, errp);
33b1db1c 1464 if (s->fd < 0) {
cb595887 1465 ret = s->fd;
cbc488ee 1466 goto err_no_fd;
33b1db1c
MK
1467 }
1468
e67c3993 1469 ret = find_vdi_name(s, vdi, snapid, tag, &vid, true, errp);
33b1db1c 1470 if (ret) {
cbc488ee 1471 goto err;
33b1db1c
MK
1472 }
1473
0e7106d8
LY
1474 /*
1475 * QEMU block layer emulates writethrough cache as 'writeback + flush', so
1476 * we always set SD_FLAG_CMD_CACHE (writeback cache) as default.
1477 */
1478 s->cache_flags = SD_FLAG_CMD_CACHE;
1479 if (flags & BDRV_O_NOCACHE) {
1480 s->cache_flags = SD_FLAG_CMD_DIRECT;
1481 }
cac8f4a6 1482 s->discard_supported = true;
0e7106d8 1483
622b6057 1484 if (snapid || tag[0] != '\0') {
2440a2c3 1485 DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid);
2f536801 1486 s->is_snapshot = true;
33b1db1c
MK
1487 }
1488
e67c3993 1489 fd = connect_to_sdog(s, errp);
33b1db1c 1490 if (fd < 0) {
cb595887 1491 ret = fd;
cbc488ee 1492 goto err;
33b1db1c
MK
1493 }
1494
7267c094 1495 buf = g_malloc(SD_INODE_SIZE);
f11672db 1496 ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
84390bed 1497 0, SD_INODE_SIZE, 0, s->cache_flags);
33b1db1c
MK
1498
1499 closesocket(fd);
1500
1501 if (ret) {
efde4b62 1502 error_setg(errp, "Can't read snapshot inode");
cbc488ee 1503 goto err;
33b1db1c
MK
1504 }
1505
1506 memcpy(&s->inode, buf, sizeof(s->inode));
33b1db1c 1507
e8bfaa2f 1508 bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
3178e275 1509 pstrcpy(s->name, sizeof(s->name), vdi);
2df46246 1510 qemu_co_mutex_init(&s->lock);
498f2140 1511 qemu_co_queue_init(&s->overlapping_queue);
c8c96350 1512 qemu_opts_del(opts);
7267c094 1513 g_free(buf);
33b1db1c 1514 return 0;
cbc488ee
MA
1515
1516err:
dca21ef2 1517 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
f6a51c84 1518 false, NULL, NULL, NULL, NULL);
cbc488ee
MA
1519 closesocket(s->fd);
1520err_no_fd:
c8c96350 1521 qemu_opts_del(opts);
7267c094 1522 g_free(buf);
cb595887 1523 return ret;
33b1db1c
MK
1524}
1525
4da65c80
LY
1526static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue,
1527 Error **errp)
1528{
1529 BDRVSheepdogState *s = state->bs->opaque;
1530 BDRVSheepdogReopenState *re_s;
1531 int ret = 0;
1532
1533 re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1);
1534
1535 re_s->cache_flags = SD_FLAG_CMD_CACHE;
1536 if (state->flags & BDRV_O_NOCACHE) {
1537 re_s->cache_flags = SD_FLAG_CMD_DIRECT;
1538 }
1539
1540 re_s->fd = get_sheep_fd(s, errp);
1541 if (re_s->fd < 0) {
1542 ret = re_s->fd;
1543 return ret;
1544 }
1545
1546 return ret;
1547}
1548
1549static void sd_reopen_commit(BDRVReopenState *state)
1550{
1551 BDRVSheepdogReopenState *re_s = state->opaque;
1552 BDRVSheepdogState *s = state->bs->opaque;
1553
1554 if (s->fd) {
dca21ef2 1555 aio_set_fd_handler(s->aio_context, s->fd, false,
f6a51c84 1556 NULL, NULL, NULL, NULL);
4da65c80
LY
1557 closesocket(s->fd);
1558 }
1559
1560 s->fd = re_s->fd;
1561 s->cache_flags = re_s->cache_flags;
1562
1563 g_free(state->opaque);
1564 state->opaque = NULL;
1565
1566 return;
1567}
1568
1569static void sd_reopen_abort(BDRVReopenState *state)
1570{
1571 BDRVSheepdogReopenState *re_s = state->opaque;
1572 BDRVSheepdogState *s = state->bs->opaque;
1573
1574 if (re_s == NULL) {
1575 return;
1576 }
1577
1578 if (re_s->fd) {
dca21ef2 1579 aio_set_fd_handler(s->aio_context, re_s->fd, false,
f6a51c84 1580 NULL, NULL, NULL, NULL);
4da65c80
LY
1581 closesocket(re_s->fd);
1582 }
1583
1584 g_free(state->opaque);
1585 state->opaque = NULL;
1586
1587 return;
1588}
1589
7d2d3e74
MA
1590static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
1591 Error **errp)
33b1db1c
MK
1592{
1593 SheepdogVdiReq hdr;
1594 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1595 int fd, ret;
1596 unsigned int wlen, rlen = 0;
1597 char buf[SD_MAX_VDI_LEN];
1598
7d2d3e74 1599 fd = connect_to_sdog(s, errp);
33b1db1c 1600 if (fd < 0) {
cb595887 1601 return fd;
33b1db1c
MK
1602 }
1603
3178e275
JM
1604 /* FIXME: would it be better to fail (e.g., return -EIO) when filename
1605 * does not fit in buf? For now, just truncate and avoid buffer overrun.
1606 */
33b1db1c 1607 memset(buf, 0, sizeof(buf));
c31d482f 1608 pstrcpy(buf, sizeof(buf), s->name);
33b1db1c
MK
1609
1610 memset(&hdr, 0, sizeof(hdr));
1611 hdr.opcode = SD_OP_NEW_VDI;
9f23fce7 1612 hdr.base_vdi_id = s->inode.vdi_id;
33b1db1c
MK
1613
1614 wlen = SD_MAX_VDI_LEN;
1615
1616 hdr.flags = SD_FLAG_CMD_WRITE;
1617 hdr.snapid = snapshot;
1618
1619 hdr.data_length = wlen;
c31d482f
LY
1620 hdr.vdi_size = s->inode.vdi_size;
1621 hdr.copy_policy = s->inode.copy_policy;
b3af018f 1622 hdr.copies = s->inode.nr_copies;
876eb1b0 1623 hdr.block_size_shift = s->inode.block_size_shift;
33b1db1c 1624
f11672db 1625 ret = do_req(fd, NULL, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
33b1db1c
MK
1626
1627 closesocket(fd);
1628
1629 if (ret) {
7d2d3e74 1630 error_setg_errno(errp, -ret, "create failed");
cb595887 1631 return ret;
33b1db1c
MK
1632 }
1633
1634 if (rsp->result != SD_RES_SUCCESS) {
7d2d3e74 1635 error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name);
33b1db1c
MK
1636 return -EIO;
1637 }
1638
1639 if (vdi_id) {
1640 *vdi_id = rsp->vdi_id;
1641 }
1642
1643 return 0;
1644}
1645
318df29e 1646static int sd_prealloc(const char *filename, Error **errp)
a8e0fdd7 1647{
fba98d45 1648 BlockBackend *blk = NULL;
876eb1b0
TI
1649 BDRVSheepdogState *base = NULL;
1650 unsigned long buf_size;
a8e0fdd7 1651 uint32_t idx, max_idx;
876eb1b0 1652 uint32_t object_size;
a8e0fdd7 1653 int64_t vdi_size;
876eb1b0 1654 void *buf = NULL;
a8e0fdd7
MK
1655 int ret;
1656
efaa7c4e 1657 blk = blk_new_open(filename, NULL, NULL,
55880601 1658 BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
fba98d45
KW
1659 if (blk == NULL) {
1660 ret = -EIO;
318df29e 1661 goto out_with_err_set;
a8e0fdd7
MK
1662 }
1663
fba98d45
KW
1664 blk_set_allow_write_beyond_eof(blk, true);
1665
1666 vdi_size = blk_getlength(blk);
a8e0fdd7
MK
1667 if (vdi_size < 0) {
1668 ret = vdi_size;
1669 goto out;
1670 }
876eb1b0 1671
fba98d45 1672 base = blk_bs(blk)->opaque;
876eb1b0
TI
1673 object_size = (UINT32_C(1) << base->inode.block_size_shift);
1674 buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
1675 buf = g_malloc0(buf_size);
1676
1677 max_idx = DIV_ROUND_UP(vdi_size, buf_size);
a8e0fdd7
MK
1678
1679 for (idx = 0; idx < max_idx; idx++) {
1680 /*
1681 * The created image can be a cloned image, so we need to read
1682 * a data from the source image.
1683 */
fba98d45 1684 ret = blk_pread(blk, idx * buf_size, buf, buf_size);
a8e0fdd7
MK
1685 if (ret < 0) {
1686 goto out;
1687 }
8341f00d 1688 ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0);
a8e0fdd7
MK
1689 if (ret < 0) {
1690 goto out;
1691 }
1692 }
318df29e 1693
fba98d45 1694 ret = 0;
a8e0fdd7 1695out:
318df29e
MA
1696 if (ret < 0) {
1697 error_setg_errno(errp, -ret, "Can't pre-allocate");
1698 }
1699out_with_err_set:
fba98d45
KW
1700 if (blk) {
1701 blk_unref(blk);
a8e0fdd7 1702 }
7267c094 1703 g_free(buf);
a8e0fdd7
MK
1704
1705 return ret;
1706}
1707
b3af018f
LY
1708/*
1709 * Sheepdog support two kinds of redundancy, full replication and erasure
1710 * coding.
1711 *
1712 * # create a fully replicated vdi with x copies
1713 * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
1714 *
1715 * # create a erasure coded vdi with x data strips and y parity strips
1716 * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
1717 */
1718static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
1719{
1720 struct SheepdogInode *inode = &s->inode;
1721 const char *n1, *n2;
1722 long copy, parity;
1723 char p[10];
1724
1725 pstrcpy(p, sizeof(p), opt);
1726 n1 = strtok(p, ":");
1727 n2 = strtok(NULL, ":");
1728
1729 if (!n1) {
1730 return -EINVAL;
1731 }
1732
1733 copy = strtol(n1, NULL, 10);
89e2a31d 1734 /* FIXME fix error checking by switching to qemu_strtol() */
b3af018f
LY
1735 if (copy > SD_MAX_COPIES || copy < 1) {
1736 return -EINVAL;
1737 }
1738 if (!n2) {
1739 inode->copy_policy = 0;
1740 inode->nr_copies = copy;
1741 return 0;
1742 }
1743
1744 if (copy != 2 && copy != 4 && copy != 8 && copy != 16) {
1745 return -EINVAL;
1746 }
1747
1748 parity = strtol(n2, NULL, 10);
89e2a31d 1749 /* FIXME fix error checking by switching to qemu_strtol() */
b3af018f
LY
1750 if (parity >= SD_EC_MAX_STRIP || parity < 1) {
1751 return -EINVAL;
1752 }
1753
1754 /*
1755 * 4 bits for parity and 4 bits for data.
1756 * We have to compress upper data bits because it can't represent 16
1757 */
1758 inode->copy_policy = ((copy / 2) << 4) + parity;
1759 inode->nr_copies = copy + parity;
1760
1761 return 0;
1762}
1763
876eb1b0
TI
1764static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
1765{
1766 struct SheepdogInode *inode = &s->inode;
1767 uint64_t object_size;
1768 int obj_order;
1769
1770 object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0);
1771 if (object_size) {
1772 if ((object_size - 1) & object_size) { /* not a power of 2? */
1773 return -EINVAL;
1774 }
786a4ea8 1775 obj_order = ctz32(object_size);
876eb1b0
TI
1776 if (obj_order < 20 || obj_order > 31) {
1777 return -EINVAL;
1778 }
1779 inode->block_size_shift = (uint8_t)obj_order;
1780 }
1781
1782 return 0;
1783}
1784
b222237b 1785static int sd_create(const char *filename, QemuOpts *opts,
d5124c00 1786 Error **errp)
33b1db1c 1787{
b6fc8245 1788 int ret = 0;
c31d482f 1789 uint32_t vid = 0;
33b1db1c 1790 char *backing_file = NULL;
b222237b 1791 char *buf = NULL;
b6fc8245 1792 BDRVSheepdogState *s;
c31d482f 1793 char tag[SD_MAX_VDI_TAG_LEN];
b4447363 1794 uint32_t snapid;
876eb1b0 1795 uint64_t max_vdi_size;
2f536801 1796 bool prealloc = false;
33b1db1c 1797
5839e53b 1798 s = g_new0(BDRVSheepdogState, 1);
b6fc8245 1799
b4447363 1800 memset(tag, 0, sizeof(tag));
5d6768e3 1801 if (strstr(filename, "://")) {
c31d482f 1802 ret = sd_parse_uri(s, filename, s->name, &snapid, tag);
5d6768e3 1803 } else {
c31d482f 1804 ret = parse_vdiname(s, filename, s->name, &snapid, tag);
5d6768e3
MK
1805 }
1806 if (ret < 0) {
efde4b62 1807 error_setg(errp, "Can't parse filename");
b6fc8245 1808 goto out;
b4447363
MK
1809 }
1810
c2eb918e
HT
1811 s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
1812 BDRV_SECTOR_SIZE);
b222237b
CL
1813 backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
1814 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
1815 if (!buf || !strcmp(buf, "off")) {
1816 prealloc = false;
1817 } else if (!strcmp(buf, "full")) {
1818 prealloc = true;
1819 } else {
1820 error_setg(errp, "Invalid preallocation mode: '%s'", buf);
1821 ret = -EINVAL;
1822 goto out;
1823 }
1824
1825 g_free(buf);
1826 buf = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY);
1827 if (buf) {
1828 ret = parse_redundancy(s, buf);
1829 if (ret < 0) {
1830 error_setg(errp, "Invalid redundancy mode: '%s'", buf);
1831 goto out;
33b1db1c 1832 }
33b1db1c 1833 }
876eb1b0
TI
1834 ret = parse_block_size_shift(s, opts);
1835 if (ret < 0) {
1836 error_setg(errp, "Invalid object_size."
1837 " obect_size needs to be power of 2"
1838 " and be limited from 2^20 to 2^31");
b6fc8245 1839 goto out;
33b1db1c
MK
1840 }
1841
1842 if (backing_file) {
fba98d45 1843 BlockBackend *blk;
9f23fce7 1844 BDRVSheepdogState *base;
33b1db1c
MK
1845 BlockDriver *drv;
1846
1847 /* Currently, only Sheepdog backing image is supported. */
b65a5e12 1848 drv = bdrv_find_protocol(backing_file, true, NULL);
33b1db1c 1849 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
e67c3993 1850 error_setg(errp, "backing_file must be a sheepdog image");
b6fc8245
MK
1851 ret = -EINVAL;
1852 goto out;
33b1db1c
MK
1853 }
1854
efaa7c4e 1855 blk = blk_new_open(backing_file, NULL, NULL,
72e775c7 1856 BDRV_O_PROTOCOL, errp);
fba98d45
KW
1857 if (blk == NULL) {
1858 ret = -EIO;
b6fc8245 1859 goto out;
cb595887 1860 }
33b1db1c 1861
fba98d45 1862 base = blk_bs(blk)->opaque;
33b1db1c 1863
9f23fce7 1864 if (!is_snapshot(&base->inode)) {
e67c3993 1865 error_setg(errp, "cannot clone from a non snapshot vdi");
fba98d45 1866 blk_unref(blk);
b6fc8245
MK
1867 ret = -EINVAL;
1868 goto out;
33b1db1c 1869 }
9f23fce7 1870 s->inode.vdi_id = base->inode.vdi_id;
fba98d45 1871 blk_unref(blk);
33b1db1c
MK
1872 }
1873
5d5da114 1874 s->aio_context = qemu_get_aio_context();
876eb1b0
TI
1875
1876 /* if block_size_shift is not specified, get cluster default value */
1877 if (s->inode.block_size_shift == 0) {
1878 SheepdogVdiReq hdr;
1879 SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
876eb1b0
TI
1880 int fd;
1881 unsigned int wlen = 0, rlen = 0;
1882
48d7c4af 1883 fd = connect_to_sdog(s, errp);
876eb1b0 1884 if (fd < 0) {
48d7c4af 1885 ret = fd;
876eb1b0
TI
1886 goto out;
1887 }
1888
1889 memset(&hdr, 0, sizeof(hdr));
1890 hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
1891 hdr.proto_ver = SD_PROTO_VER;
1892
f11672db 1893 ret = do_req(fd, NULL, (SheepdogReq *)&hdr,
876eb1b0
TI
1894 NULL, &wlen, &rlen);
1895 closesocket(fd);
1896 if (ret) {
1897 error_setg_errno(errp, -ret, "failed to get cluster default");
1898 goto out;
1899 }
1900 if (rsp->result == SD_RES_SUCCESS) {
1901 s->inode.block_size_shift = rsp->block_size_shift;
1902 } else {
1903 s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT;
1904 }
1905 }
1906
1907 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
1908
1909 if (s->inode.vdi_size > max_vdi_size) {
1910 error_setg(errp, "An image is too large."
1911 " The maximum image size is %"PRIu64 "GB",
1912 max_vdi_size / 1024 / 1024 / 1024);
1913 ret = -EINVAL;
1914 goto out;
1915 }
1916
e67c3993 1917 ret = do_sd_create(s, &vid, 0, errp);
7d2d3e74 1918 if (ret) {
b6fc8245 1919 goto out;
a8e0fdd7
MK
1920 }
1921
7d2d3e74 1922 if (prealloc) {
e67c3993 1923 ret = sd_prealloc(filename, errp);
318df29e 1924 }
b6fc8245 1925out:
b222237b
CL
1926 g_free(backing_file);
1927 g_free(buf);
b6fc8245
MK
1928 g_free(s);
1929 return ret;
33b1db1c
MK
1930}
1931
1932static void sd_close(BlockDriverState *bs)
1933{
dfb12bf8 1934 Error *local_err = NULL;
33b1db1c
MK
1935 BDRVSheepdogState *s = bs->opaque;
1936 SheepdogVdiReq hdr;
1937 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1938 unsigned int wlen, rlen = 0;
1939 int fd, ret;
1940
2440a2c3 1941 DPRINTF("%s\n", s->name);
33b1db1c 1942
dfb12bf8 1943 fd = connect_to_sdog(s, &local_err);
33b1db1c 1944 if (fd < 0) {
565f65d2 1945 error_report_err(local_err);
33b1db1c
MK
1946 return;
1947 }
1948
1949 memset(&hdr, 0, sizeof(hdr));
1950
1951 hdr.opcode = SD_OP_RELEASE_VDI;
1dbfafed 1952 hdr.type = LOCK_TYPE_NORMAL;
9f23fce7 1953 hdr.base_vdi_id = s->inode.vdi_id;
33b1db1c
MK
1954 wlen = strlen(s->name) + 1;
1955 hdr.data_length = wlen;
1956 hdr.flags = SD_FLAG_CMD_WRITE;
1957
f11672db 1958 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
84390bed 1959 s->name, &wlen, &rlen);
33b1db1c
MK
1960
1961 closesocket(fd);
1962
1963 if (!ret && rsp->result != SD_RES_SUCCESS &&
1964 rsp->result != SD_RES_VDI_NOT_LOCKED) {
6daf194d 1965 error_report("%s, %s", sd_strerror(rsp->result), s->name);
33b1db1c
MK
1966 }
1967
dca21ef2 1968 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
f6a51c84 1969 false, NULL, NULL, NULL, NULL);
33b1db1c 1970 closesocket(s->fd);
25af257d 1971 g_free(s->host_spec);
33b1db1c
MK
1972}
1973
1974static int64_t sd_getlength(BlockDriverState *bs)
1975{
1976 BDRVSheepdogState *s = bs->opaque;
1977
1978 return s->inode.vdi_size;
1979}
1980
1981static int sd_truncate(BlockDriverState *bs, int64_t offset)
1982{
dfb12bf8 1983 Error *local_err = NULL;
33b1db1c
MK
1984 BDRVSheepdogState *s = bs->opaque;
1985 int ret, fd;
1986 unsigned int datalen;
876eb1b0 1987 uint64_t max_vdi_size;
33b1db1c 1988
876eb1b0 1989 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
33b1db1c 1990 if (offset < s->inode.vdi_size) {
6daf194d 1991 error_report("shrinking is not supported");
33b1db1c 1992 return -EINVAL;
876eb1b0 1993 } else if (offset > max_vdi_size) {
6daf194d 1994 error_report("too big image size");
33b1db1c
MK
1995 return -EINVAL;
1996 }
1997
dfb12bf8 1998 fd = connect_to_sdog(s, &local_err);
33b1db1c 1999 if (fd < 0) {
565f65d2 2000 error_report_err(local_err);
cb595887 2001 return fd;
33b1db1c
MK
2002 }
2003
2004 /* we don't need to update entire object */
2005 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
2006 s->inode.vdi_size = offset;
f11672db 2007 ret = write_object(fd, s->bs, (char *)&s->inode,
84390bed
SH
2008 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
2009 datalen, 0, false, s->cache_flags);
33b1db1c
MK
2010 close(fd);
2011
2012 if (ret < 0) {
6daf194d 2013 error_report("failed to update an inode.");
33b1db1c
MK
2014 }
2015
cb595887 2016 return ret;
33b1db1c
MK
2017}
2018
2019/*
2020 * This function is called after writing data objects. If we need to
2021 * update metadata, this sends a write request to the vdi object.
33b1db1c 2022 */
d8716b41 2023static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
33b1db1c 2024{
28ddd08c 2025 BDRVSheepdogState *s = acb->s;
33b1db1c
MK
2026 struct iovec iov;
2027 AIOReq *aio_req;
2028 uint32_t offset, data_len, mn, mx;
2029
498f2140
HM
2030 mn = acb->min_dirty_data_idx;
2031 mx = acb->max_dirty_data_idx;
33b1db1c
MK
2032 if (mn <= mx) {
2033 /* we need to update the vdi object. */
e80ab33d 2034 ++acb->nr_pending;
33b1db1c
MK
2035 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
2036 mn * sizeof(s->inode.data_vdi_id[0]);
2037 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
2038
498f2140
HM
2039 acb->min_dirty_data_idx = UINT32_MAX;
2040 acb->max_dirty_data_idx = 0;
33b1db1c
MK
2041
2042 iov.iov_base = &s->inode;
2043 iov.iov_len = sizeof(s->inode);
2044 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
b544c1ab 2045 data_len, offset, 0, false, 0, offset);
b544c1ab 2046 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
e80ab33d
PB
2047 if (--acb->nr_pending) {
2048 qemu_coroutine_yield();
2049 }
33b1db1c 2050 }
33b1db1c
MK
2051}
2052
859e5553
LY
2053/* Delete current working VDI on the snapshot chain */
2054static bool sd_delete(BDRVSheepdogState *s)
2055{
dfb12bf8 2056 Error *local_err = NULL;
859e5553
LY
2057 unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
2058 SheepdogVdiReq hdr = {
2059 .opcode = SD_OP_DEL_VDI,
9f23fce7 2060 .base_vdi_id = s->inode.vdi_id,
859e5553
LY
2061 .data_length = wlen,
2062 .flags = SD_FLAG_CMD_WRITE,
2063 };
2064 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2065 int fd, ret;
2066
dfb12bf8 2067 fd = connect_to_sdog(s, &local_err);
859e5553 2068 if (fd < 0) {
565f65d2 2069 error_report_err(local_err);
859e5553
LY
2070 return false;
2071 }
2072
f11672db 2073 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
84390bed 2074 s->name, &wlen, &rlen);
859e5553
LY
2075 closesocket(fd);
2076 if (ret) {
2077 return false;
2078 }
2079 switch (rsp->result) {
2080 case SD_RES_NO_VDI:
2081 error_report("%s was already deleted", s->name);
2082 /* fall through */
2083 case SD_RES_SUCCESS:
2084 break;
2085 default:
2086 error_report("%s, %s", sd_strerror(rsp->result), s->name);
2087 return false;
2088 }
2089
2090 return true;
2091}
2092
33b1db1c
MK
2093/*
2094 * Create a writable VDI from a snapshot
2095 */
2096static int sd_create_branch(BDRVSheepdogState *s)
2097{
dfb12bf8 2098 Error *local_err = NULL;
33b1db1c
MK
2099 int ret, fd;
2100 uint32_t vid;
2101 char *buf;
859e5553 2102 bool deleted;
33b1db1c 2103
2440a2c3 2104 DPRINTF("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
33b1db1c 2105
7267c094 2106 buf = g_malloc(SD_INODE_SIZE);
33b1db1c 2107
859e5553
LY
2108 /*
2109 * Even If deletion fails, we will just create extra snapshot based on
dc6fb73d 2110 * the working VDI which was supposed to be deleted. So no need to
859e5553
LY
2111 * false bail out.
2112 */
2113 deleted = sd_delete(s);
7d2d3e74 2114 ret = do_sd_create(s, &vid, !deleted, &local_err);
33b1db1c 2115 if (ret) {
565f65d2 2116 error_report_err(local_err);
33b1db1c
MK
2117 goto out;
2118 }
2119
2440a2c3 2120 DPRINTF("%" PRIx32 " is created.\n", vid);
33b1db1c 2121
dfb12bf8 2122 fd = connect_to_sdog(s, &local_err);
33b1db1c 2123 if (fd < 0) {
565f65d2 2124 error_report_err(local_err);
cb595887 2125 ret = fd;
33b1db1c
MK
2126 goto out;
2127 }
2128
f11672db 2129 ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
84390bed 2130 s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags);
33b1db1c
MK
2131
2132 closesocket(fd);
2133
2134 if (ret < 0) {
2135 goto out;
2136 }
2137
2138 memcpy(&s->inode, buf, sizeof(s->inode));
2139
2f536801 2140 s->is_snapshot = false;
33b1db1c 2141 ret = 0;
2440a2c3 2142 DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
33b1db1c
MK
2143
2144out:
7267c094 2145 g_free(buf);
33b1db1c
MK
2146
2147 return ret;
2148}
2149
2150/*
2151 * Send I/O requests to the server.
2152 *
2153 * This function sends requests to the server, links the requests to
c292ee6a 2154 * the inflight_list in BDRVSheepdogState, and exits without
33b1db1c
MK
2155 * waiting the response. The responses are received in the
2156 * `aio_read_response' function which is called from the main loop as
2157 * a fd handler.
2df46246
MK
2158 *
2159 * Returns 1 when we need to wait a response, 0 when there is no sent
2160 * request and -errno in error cases.
33b1db1c 2161 */
28ddd08c 2162static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
33b1db1c 2163{
33b1db1c 2164 int ret = 0;
e8bfaa2f 2165 unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
876eb1b0
TI
2166 unsigned long idx;
2167 uint32_t object_size;
33b1db1c 2168 uint64_t oid;
876eb1b0 2169 uint64_t offset;
28ddd08c 2170 BDRVSheepdogState *s = acb->s;
33b1db1c
MK
2171 SheepdogInode *inode = &s->inode;
2172 AIOReq *aio_req;
2173
33b1db1c
MK
2174 if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
2175 /*
2176 * In the case we open the snapshot VDI, Sheepdog creates the
2177 * writable VDI when we do a write operation first.
2178 */
2179 ret = sd_create_branch(s);
2180 if (ret) {
2181 acb->ret = -EIO;
e80ab33d 2182 return;
33b1db1c
MK
2183 }
2184 }
2185
876eb1b0
TI
2186 object_size = (UINT32_C(1) << inode->block_size_shift);
2187 idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
2188 offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;
2189
1d732d7d
MK
2190 /*
2191 * Make sure we don't free the aiocb before we are done with all requests.
2192 * This additional reference is dropped at the end of this function.
2193 */
2194 acb->nr_pending++;
2195
33b1db1c
MK
2196 while (done != total) {
2197 uint8_t flags = 0;
2198 uint64_t old_oid = 0;
2f536801 2199 bool create = false;
33b1db1c
MK
2200
2201 oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
2202
876eb1b0 2203 len = MIN(total - done, object_size - offset);
33b1db1c 2204
19db9b90
CH
2205 switch (acb->aiocb_type) {
2206 case AIOCB_READ_UDATA:
2207 if (!inode->data_vdi_id[idx]) {
2208 qemu_iovec_memset(acb->qiov, done, 0, len);
33b1db1c
MK
2209 goto done;
2210 }
19db9b90
CH
2211 break;
2212 case AIOCB_WRITE_UDATA:
2213 if (!inode->data_vdi_id[idx]) {
2f536801 2214 create = true;
19db9b90
CH
2215 } else if (!is_data_obj_writable(inode, idx)) {
2216 /* Copy-On-Write */
2f536801 2217 create = true;
19db9b90
CH
2218 old_oid = oid;
2219 flags = SD_FLAG_CMD_COW;
2220 }
2221 break;
cac8f4a6
LY
2222 case AIOCB_DISCARD_OBJ:
2223 /*
2224 * We discard the object only when the whole object is
2225 * 1) allocated 2) trimmed. Otherwise, simply skip it.
2226 */
876eb1b0 2227 if (len != object_size || inode->data_vdi_id[idx] == 0) {
cac8f4a6
LY
2228 goto done;
2229 }
2230 break;
19db9b90
CH
2231 default:
2232 break;
33b1db1c
MK
2233 }
2234
2235 if (create) {
2440a2c3 2236 DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
1b6ac998 2237 inode->vdi_id, oid,
33b1db1c
MK
2238 vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
2239 oid = vid_to_data_oid(inode->vdi_id, idx);
2440a2c3 2240 DPRINTF("new oid %" PRIx64 "\n", oid);
33b1db1c
MK
2241 }
2242
b544c1ab 2243 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
e6fd57ea
HM
2244 old_oid,
2245 acb->aiocb_type == AIOCB_DISCARD_OBJ ?
2246 0 : done);
b544c1ab 2247 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
a37dcdf9 2248 acb->aiocb_type);
33b1db1c
MK
2249 done:
2250 offset = 0;
2251 idx++;
2252 done += len;
2253 }
e80ab33d
PB
2254 if (--acb->nr_pending) {
2255 qemu_coroutine_yield();
33b1db1c
MK
2256 }
2257}
2258
acf6e5f0 2259static void sd_aio_complete(SheepdogAIOCB *acb)
6a55c82c 2260{
acf6e5f0
PB
2261 if (acb->aiocb_type == AIOCB_FLUSH_CACHE) {
2262 return;
6a55c82c
HM
2263 }
2264
acf6e5f0
PB
2265 QLIST_REMOVE(acb, aiocb_siblings);
2266 qemu_co_queue_restart_all(&acb->s->overlapping_queue);
6a55c82c
HM
2267}
2268
a968168c 2269static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
2df46246 2270 int nb_sectors, QEMUIOVector *qiov)
33b1db1c 2271{
28ddd08c 2272 SheepdogAIOCB acb;
2df46246 2273 int ret;
e50d7607
LY
2274 int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
2275 BDRVSheepdogState *s = bs->opaque;
33b1db1c 2276
c0191e76 2277 if (offset > s->inode.vdi_size) {
e50d7607 2278 ret = sd_truncate(bs, offset);
cb595887
MK
2279 if (ret < 0) {
2280 return ret;
33b1db1c 2281 }
33b1db1c
MK
2282 }
2283
28ddd08c 2284 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
28ddd08c
PB
2285 sd_co_rw_vector(&acb);
2286 sd_write_done(&acb);
acf6e5f0 2287 sd_aio_complete(&acb);
2df46246 2288
28ddd08c 2289 return acb.ret;
33b1db1c
MK
2290}
2291
a968168c 2292static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
2df46246 2293 int nb_sectors, QEMUIOVector *qiov)
33b1db1c 2294{
28ddd08c 2295 SheepdogAIOCB acb;
6a55c82c 2296 BDRVSheepdogState *s = bs->opaque;
33b1db1c 2297
28ddd08c 2298 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
28ddd08c 2299 sd_co_rw_vector(&acb);
acf6e5f0 2300 sd_aio_complete(&acb);
2df46246 2301
28ddd08c 2302 return acb.ret;
33b1db1c
MK
2303}
2304
47622c44
LY
2305static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
2306{
2307 BDRVSheepdogState *s = bs->opaque;
28ddd08c 2308 SheepdogAIOCB acb;
47783072 2309 AIOReq *aio_req;
47622c44 2310
0e7106d8 2311 if (s->cache_flags != SD_FLAG_CMD_CACHE) {
47622c44
LY
2312 return 0;
2313 }
2314
28ddd08c 2315 sd_aio_setup(&acb, s, NULL, 0, 0, AIOCB_FLUSH_CACHE);
47622c44 2316
28ddd08c
PB
2317 acb.nr_pending++;
2318 aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
b544c1ab 2319 0, 0, 0, false, 0, 0);
28ddd08c 2320 add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
47622c44 2321
28ddd08c 2322 if (--acb.nr_pending) {
e80ab33d
PB
2323 qemu_coroutine_yield();
2324 }
acf6e5f0
PB
2325
2326 sd_aio_complete(&acb);
28ddd08c 2327 return acb.ret;
47622c44
LY
2328}
2329
33b1db1c
MK
2330static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
2331{
dfb12bf8 2332 Error *local_err = NULL;
33b1db1c
MK
2333 BDRVSheepdogState *s = bs->opaque;
2334 int ret, fd;
2335 uint32_t new_vid;
2336 SheepdogInode *inode;
2337 unsigned int datalen;
2338
2440a2c3 2339 DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
33b1db1c
MK
2340 "is_snapshot %d\n", sn_info->name, sn_info->id_str,
2341 s->name, sn_info->vm_state_size, s->is_snapshot);
2342
2343 if (s->is_snapshot) {
2344 error_report("You can't create a snapshot of a snapshot VDI, "
6daf194d 2345 "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);
33b1db1c
MK
2346
2347 return -EINVAL;
2348 }
2349
2440a2c3 2350 DPRINTF("%s %s\n", sn_info->name, sn_info->id_str);
33b1db1c
MK
2351
2352 s->inode.vm_state_size = sn_info->vm_state_size;
2353 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
3178e275
JM
2354 /* It appears that inode.tag does not require a NUL terminator,
2355 * which means this use of strncpy is ok.
2356 */
33b1db1c
MK
2357 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
2358 /* we don't need to update entire object */
2359 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
2df5fee2 2360 inode = g_malloc(datalen);
33b1db1c
MK
2361
2362 /* refresh inode. */
dfb12bf8 2363 fd = connect_to_sdog(s, &local_err);
33b1db1c 2364 if (fd < 0) {
565f65d2 2365 error_report_err(local_err);
cb595887 2366 ret = fd;
33b1db1c
MK
2367 goto cleanup;
2368 }
2369
f11672db 2370 ret = write_object(fd, s->bs, (char *)&s->inode,
84390bed
SH
2371 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
2372 datalen, 0, false, s->cache_flags);
33b1db1c 2373 if (ret < 0) {
6daf194d 2374 error_report("failed to write snapshot's inode.");
33b1db1c
MK
2375 goto cleanup;
2376 }
2377
7d2d3e74 2378 ret = do_sd_create(s, &new_vid, 1, &local_err);
33b1db1c 2379 if (ret < 0) {
c29b77f9
MA
2380 error_reportf_err(local_err,
2381 "failed to create inode for snapshot: ");
33b1db1c
MK
2382 goto cleanup;
2383 }
2384
f11672db 2385 ret = read_object(fd, s->bs, (char *)inode,
84390bed
SH
2386 vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0,
2387 s->cache_flags);
33b1db1c
MK
2388
2389 if (ret < 0) {
6daf194d 2390 error_report("failed to read new inode info. %s", strerror(errno));
33b1db1c
MK
2391 goto cleanup;
2392 }
2393
2394 memcpy(&s->inode, inode, datalen);
2440a2c3 2395 DPRINTF("s->inode: name %s snap_id %x oid %x\n",
33b1db1c
MK
2396 s->inode.name, s->inode.snap_id, s->inode.vdi_id);
2397
2398cleanup:
2df5fee2 2399 g_free(inode);
33b1db1c
MK
2400 closesocket(fd);
2401 return ret;
2402}
2403
859e5553
LY
2404/*
2405 * We implement rollback(loadvm) operation to the specified snapshot by
2406 * 1) switch to the snapshot
2407 * 2) rely on sd_create_branch to delete working VDI and
dc6fb73d 2408 * 3) create a new working VDI based on the specified snapshot
859e5553 2409 */
33b1db1c
MK
2410static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
2411{
2412 BDRVSheepdogState *s = bs->opaque;
2413 BDRVSheepdogState *old_s;
9ff53a0e 2414 char tag[SD_MAX_VDI_TAG_LEN];
33b1db1c 2415 uint32_t snapid = 0;
89e2a31d
MA
2416 int ret;
2417
2418 if (!sd_parse_snapid_or_tag(snapshot_id, &snapid, tag)) {
2419 return -EINVAL;
2420 }
33b1db1c 2421
5839e53b 2422 old_s = g_new(BDRVSheepdogState, 1);
33b1db1c
MK
2423
2424 memcpy(old_s, s, sizeof(BDRVSheepdogState));
2425
9ff53a0e 2426 ret = reload_inode(s, snapid, tag);
33b1db1c 2427 if (ret) {
33b1db1c
MK
2428 goto out;
2429 }
2430
cede621f
LY
2431 ret = sd_create_branch(s);
2432 if (ret) {
33b1db1c
MK
2433 goto out;
2434 }
2435
7267c094 2436 g_free(old_s);
33b1db1c
MK
2437
2438 return 0;
2439out:
2440 /* recover bdrv_sd_state */
2441 memcpy(s, old_s, sizeof(BDRVSheepdogState));
7267c094 2442 g_free(old_s);
33b1db1c 2443
6daf194d 2444 error_report("failed to open. recover old bdrv_sd_state.");
33b1db1c
MK
2445
2446 return ret;
2447}
2448
eab8eb8d
VT
2449#define NR_BATCHED_DISCARD 128
2450
e25cad69 2451static int remove_objects(BDRVSheepdogState *s, Error **errp)
eab8eb8d
VT
2452{
2453 int fd, i = 0, nr_objs = 0;
e25cad69 2454 int ret;
eab8eb8d
VT
2455 SheepdogInode *inode = &s->inode;
2456
e25cad69 2457 fd = connect_to_sdog(s, errp);
eab8eb8d 2458 if (fd < 0) {
e25cad69 2459 return fd;
eab8eb8d
VT
2460 }
2461
2462 nr_objs = count_data_objs(inode);
2463 while (i < nr_objs) {
2464 int start_idx, nr_filled_idx;
2465
2466 while (i < nr_objs && !inode->data_vdi_id[i]) {
2467 i++;
2468 }
2469 start_idx = i;
2470
2471 nr_filled_idx = 0;
2472 while (i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD) {
2473 if (inode->data_vdi_id[i]) {
2474 inode->data_vdi_id[i] = 0;
2475 nr_filled_idx++;
2476 }
2477
2478 i++;
2479 }
2480
f11672db 2481 ret = write_object(fd, s->bs,
eab8eb8d
VT
2482 (char *)&inode->data_vdi_id[start_idx],
2483 vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
2484 (i - start_idx) * sizeof(uint32_t),
2485 offsetof(struct SheepdogInode,
2486 data_vdi_id[start_idx]),
2487 false, s->cache_flags);
2488 if (ret < 0) {
e25cad69 2489 error_setg(errp, "Failed to discard snapshot inode");
eab8eb8d
VT
2490 goto out;
2491 }
2492 }
2493
e25cad69 2494 ret = 0;
eab8eb8d
VT
2495out:
2496 closesocket(fd);
e25cad69 2497 return ret;
eab8eb8d
VT
2498}
2499
a89d89d3
WX
2500static int sd_snapshot_delete(BlockDriverState *bs,
2501 const char *snapshot_id,
2502 const char *name,
2503 Error **errp)
33b1db1c 2504{
a0dc0e2b
MA
2505 /*
2506 * FIXME should delete the snapshot matching both @snapshot_id and
2507 * @name, but @name not used here
2508 */
03c698f0 2509 unsigned long snap_id = 0;
eab8eb8d 2510 char snap_tag[SD_MAX_VDI_TAG_LEN];
eab8eb8d
VT
2511 int fd, ret;
2512 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
2513 BDRVSheepdogState *s = bs->opaque;
2514 unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0;
2515 uint32_t vid;
2516 SheepdogVdiReq hdr = {
2517 .opcode = SD_OP_DEL_VDI,
2518 .data_length = wlen,
2519 .flags = SD_FLAG_CMD_WRITE,
2520 };
2521 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2522
e25cad69
MA
2523 ret = remove_objects(s, errp);
2524 if (ret) {
2525 return ret;
eab8eb8d
VT
2526 }
2527
2528 memset(buf, 0, sizeof(buf));
2529 memset(snap_tag, 0, sizeof(snap_tag));
2530 pstrcpy(buf, SD_MAX_VDI_LEN, s->name);
89e2a31d 2531 /* TODO Use sd_parse_snapid() once this mess is cleaned up */
03c698f0
JC
2532 ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id);
2533 if (ret || snap_id > UINT32_MAX) {
a0dc0e2b
MA
2534 /*
2535 * FIXME Since qemu_strtoul() returns -EINVAL when
2536 * @snapshot_id is null, @snapshot_id is mandatory. Correct
2537 * would be to require at least one of @snapshot_id and @name.
2538 */
03c698f0
JC
2539 error_setg(errp, "Invalid snapshot ID: %s",
2540 snapshot_id ? snapshot_id : "<null>");
2541 return -EINVAL;
eab8eb8d
VT
2542 }
2543
2544 if (snap_id) {
03c698f0 2545 hdr.snapid = (uint32_t) snap_id;
eab8eb8d 2546 } else {
a0dc0e2b 2547 /* FIXME I suspect we should use @name here */
89e2a31d 2548 /* FIXME don't truncate silently */
eab8eb8d
VT
2549 pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id);
2550 pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag);
2551 }
2552
e25cad69 2553 ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, errp);
eab8eb8d
VT
2554 if (ret) {
2555 return ret;
2556 }
2557
e25cad69 2558 fd = connect_to_sdog(s, errp);
eab8eb8d 2559 if (fd < 0) {
e25cad69 2560 return fd;
eab8eb8d
VT
2561 }
2562
f11672db 2563 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
eab8eb8d
VT
2564 buf, &wlen, &rlen);
2565 closesocket(fd);
2566 if (ret) {
e25cad69 2567 error_setg_errno(errp, -ret, "Couldn't send request to server");
eab8eb8d
VT
2568 return ret;
2569 }
2570
2571 switch (rsp->result) {
2572 case SD_RES_NO_VDI:
e25cad69
MA
2573 error_setg(errp, "Can't find the snapshot");
2574 return -ENOENT;
eab8eb8d
VT
2575 case SD_RES_SUCCESS:
2576 break;
2577 default:
e25cad69
MA
2578 error_setg(errp, "%s", sd_strerror(rsp->result));
2579 return -EIO;
eab8eb8d
VT
2580 }
2581
e25cad69 2582 return 0;
33b1db1c
MK
2583}
2584
33b1db1c
MK
2585static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
2586{
dfb12bf8 2587 Error *local_err = NULL;
33b1db1c
MK
2588 BDRVSheepdogState *s = bs->opaque;
2589 SheepdogReq req;
2590 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
2591 QEMUSnapshotInfo *sn_tab = NULL;
2592 unsigned wlen, rlen;
2593 int found = 0;
2594 static SheepdogInode inode;
2595 unsigned long *vdi_inuse;
2596 unsigned int start_nr;
2597 uint64_t hval;
2598 uint32_t vid;
2599
7267c094 2600 vdi_inuse = g_malloc(max);
33b1db1c 2601
dfb12bf8 2602 fd = connect_to_sdog(s, &local_err);
33b1db1c 2603 if (fd < 0) {
565f65d2 2604 error_report_err(local_err);
cb595887 2605 ret = fd;
33b1db1c
MK
2606 goto out;
2607 }
2608
2609 rlen = max;
2610 wlen = 0;
2611
2612 memset(&req, 0, sizeof(req));
2613
2614 req.opcode = SD_OP_READ_VDIS;
2615 req.data_length = max;
2616
f11672db 2617 ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen);
33b1db1c
MK
2618
2619 closesocket(fd);
2620 if (ret) {
2621 goto out;
2622 }
2623
02c4f26b 2624 sn_tab = g_new0(QEMUSnapshotInfo, nr);
33b1db1c
MK
2625
2626 /* calculate a vdi id with hash function */
2627 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
2628 start_nr = hval & (SD_NR_VDIS - 1);
2629
dfb12bf8 2630 fd = connect_to_sdog(s, &local_err);
33b1db1c 2631 if (fd < 0) {
565f65d2 2632 error_report_err(local_err);
cb595887 2633 ret = fd;
33b1db1c
MK
2634 goto out;
2635 }
2636
2637 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
2638 if (!test_bit(vid, vdi_inuse)) {
2639 break;
2640 }
2641
2642 /* we don't need to read entire object */
f11672db 2643 ret = read_object(fd, s->bs, (char *)&inode,
84390bed 2644 vid_to_vdi_oid(vid),
47622c44 2645 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
0e7106d8 2646 s->cache_flags);
33b1db1c
MK
2647
2648 if (ret) {
2649 continue;
2650 }
2651
2652 if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
2653 sn_tab[found].date_sec = inode.snap_ctime >> 32;
2654 sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
2655 sn_tab[found].vm_state_size = inode.vm_state_size;
2656 sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
2657
521b2b5d
HR
2658 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
2659 "%" PRIu32, inode.snap_id);
3178e275
JM
2660 pstrcpy(sn_tab[found].name,
2661 MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
2662 inode.tag);
33b1db1c
MK
2663 found++;
2664 }
2665 }
2666
2667 closesocket(fd);
2668out:
2669 *psn_tab = sn_tab;
2670
7267c094 2671 g_free(vdi_inuse);
33b1db1c 2672
cb595887
MK
2673 if (ret < 0) {
2674 return ret;
2675 }
2676
33b1db1c
MK
2677 return found;
2678}
2679
2680static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
2681 int64_t pos, int size, int load)
2682{
dfb12bf8 2683 Error *local_err = NULL;
2f536801
MK
2684 bool create;
2685 int fd, ret = 0, remaining = size;
33b1db1c
MK
2686 unsigned int data_len;
2687 uint64_t vmstate_oid;
33b1db1c 2688 uint64_t offset;
cede621f
LY
2689 uint32_t vdi_index;
2690 uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
876eb1b0 2691 uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);
33b1db1c 2692
dfb12bf8 2693 fd = connect_to_sdog(s, &local_err);
33b1db1c 2694 if (fd < 0) {
565f65d2 2695 error_report_err(local_err);
cb595887 2696 return fd;
33b1db1c
MK
2697 }
2698
6f3c714e 2699 while (remaining) {
876eb1b0
TI
2700 vdi_index = pos / object_size;
2701 offset = pos % object_size;
33b1db1c 2702
876eb1b0 2703 data_len = MIN(remaining, object_size - offset);
33b1db1c 2704
cede621f 2705 vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
33b1db1c
MK
2706
2707 create = (offset == 0);
2708 if (load) {
f11672db 2709 ret = read_object(fd, s->bs, (char *)data, vmstate_oid,
47622c44 2710 s->inode.nr_copies, data_len, offset,
0e7106d8 2711 s->cache_flags);
33b1db1c 2712 } else {
f11672db 2713 ret = write_object(fd, s->bs, (char *)data, vmstate_oid,
47622c44 2714 s->inode.nr_copies, data_len, offset, create,
0e7106d8 2715 s->cache_flags);
33b1db1c
MK
2716 }
2717
2718 if (ret < 0) {
6daf194d 2719 error_report("failed to save vmstate %s", strerror(errno));
33b1db1c
MK
2720 goto cleanup;
2721 }
2722
2723 pos += data_len;
1f7a48de 2724 data += data_len;
6f3c714e 2725 remaining -= data_len;
33b1db1c 2726 }
6f3c714e 2727 ret = size;
33b1db1c
MK
2728cleanup:
2729 closesocket(fd);
2730 return ret;
2731}
2732
cf8074b3
KW
2733static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
2734 int64_t pos)
33b1db1c
MK
2735{
2736 BDRVSheepdogState *s = bs->opaque;
cf8074b3
KW
2737 void *buf;
2738 int ret;
33b1db1c 2739
cf8074b3
KW
2740 buf = qemu_blockalign(bs, qiov->size);
2741 qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
2742 ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0);
2743 qemu_vfree(buf);
2744
2745 return ret;
33b1db1c
MK
2746}
2747
5ddda0b8
KW
2748static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
2749 int64_t pos)
33b1db1c
MK
2750{
2751 BDRVSheepdogState *s = bs->opaque;
5ddda0b8
KW
2752 void *buf;
2753 int ret;
33b1db1c 2754
5ddda0b8
KW
2755 buf = qemu_blockalign(bs, qiov->size);
2756 ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1);
2757 qemu_iovec_from_buf(qiov, 0, buf, qiov->size);
2758 qemu_vfree(buf);
2759
2760 return ret;
33b1db1c
MK
2761}
2762
2763
dde47537
EB
2764static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
2765 int count)
cac8f4a6 2766{
28ddd08c 2767 SheepdogAIOCB acb;
cac8f4a6 2768 BDRVSheepdogState *s = bs->opaque;
e6fd57ea
HM
2769 QEMUIOVector discard_iov;
2770 struct iovec iov;
2771 uint32_t zero = 0;
cac8f4a6
LY
2772
2773 if (!s->discard_supported) {
dde47537 2774 return 0;
cac8f4a6
LY
2775 }
2776
e6fd57ea
HM
2777 memset(&discard_iov, 0, sizeof(discard_iov));
2778 memset(&iov, 0, sizeof(iov));
2779 iov.iov_base = &zero;
2780 iov.iov_len = sizeof(zero);
2781 discard_iov.iov = &iov;
2782 discard_iov.niov = 1;
49228d1e
EB
2783 if (!QEMU_IS_ALIGNED(offset | count, BDRV_SECTOR_SIZE)) {
2784 return -ENOTSUP;
2785 }
28ddd08c
PB
2786 sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
2787 count >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
28ddd08c 2788 sd_co_rw_vector(&acb);
acf6e5f0 2789 sd_aio_complete(&acb);
cac8f4a6 2790
28ddd08c 2791 return acb.ret;
cac8f4a6
LY
2792}
2793
b6b8a333
PB
2794static coroutine_fn int64_t
2795sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
67a0fd2a 2796 int *pnum, BlockDriverState **file)
8d71c631
LY
2797{
2798 BDRVSheepdogState *s = bs->opaque;
2799 SheepdogInode *inode = &s->inode;
876eb1b0 2800 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
9cd76737 2801 uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
876eb1b0 2802 unsigned long start = offset / object_size,
8d71c631 2803 end = DIV_ROUND_UP((sector_num + nb_sectors) *
876eb1b0 2804 BDRV_SECTOR_SIZE, object_size);
8d71c631 2805 unsigned long idx;
9cd76737 2806 int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
8d71c631
LY
2807
2808 for (idx = start; idx < end; idx++) {
2809 if (inode->data_vdi_id[idx] == 0) {
2810 break;
2811 }
2812 }
2813 if (idx == start) {
2814 /* Get the longest length of unallocated sectors */
2815 ret = 0;
2816 for (idx = start + 1; idx < end; idx++) {
2817 if (inode->data_vdi_id[idx] != 0) {
2818 break;
2819 }
2820 }
2821 }
2822
876eb1b0 2823 *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
8d71c631
LY
2824 if (*pnum > nb_sectors) {
2825 *pnum = nb_sectors;
2826 }
d234c929
FZ
2827 if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
2828 *file = bs;
2829 }
8d71c631
LY
2830 return ret;
2831}
2832
85829722
LY
2833static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
2834{
2835 BDRVSheepdogState *s = bs->opaque;
2836 SheepdogInode *inode = &s->inode;
876eb1b0
TI
2837 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
2838 unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size);
85829722
LY
2839 uint64_t size = 0;
2840
2841 for (i = 0; i < last; i++) {
2842 if (inode->data_vdi_id[i] == 0) {
2843 continue;
2844 }
876eb1b0 2845 size += object_size;
85829722
LY
2846 }
2847 return size;
2848}
2849
b222237b
CL
2850static QemuOptsList sd_create_opts = {
2851 .name = "sheepdog-create-opts",
2852 .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head),
2853 .desc = {
2854 {
2855 .name = BLOCK_OPT_SIZE,
2856 .type = QEMU_OPT_SIZE,
2857 .help = "Virtual disk size"
2858 },
2859 {
2860 .name = BLOCK_OPT_BACKING_FILE,
2861 .type = QEMU_OPT_STRING,
2862 .help = "File name of a base image"
2863 },
2864 {
2865 .name = BLOCK_OPT_PREALLOC,
2866 .type = QEMU_OPT_STRING,
2867 .help = "Preallocation mode (allowed values: off, full)"
2868 },
2869 {
2870 .name = BLOCK_OPT_REDUNDANCY,
2871 .type = QEMU_OPT_STRING,
2872 .help = "Redundancy of the image"
2873 },
876eb1b0
TI
2874 {
2875 .name = BLOCK_OPT_OBJECT_SIZE,
2876 .type = QEMU_OPT_SIZE,
2877 .help = "Object size of the image"
2878 },
b222237b
CL
2879 { /* end of list */ }
2880 }
33b1db1c
MK
2881};
2882
5d6768e3 2883static BlockDriver bdrv_sheepdog = {
33b1db1c
MK
2884 .format_name = "sheepdog",
2885 .protocol_name = "sheepdog",
2886 .instance_size = sizeof(BDRVSheepdogState),
030be321 2887 .bdrv_needs_filename = true,
33b1db1c 2888 .bdrv_file_open = sd_open,
4da65c80
LY
2889 .bdrv_reopen_prepare = sd_reopen_prepare,
2890 .bdrv_reopen_commit = sd_reopen_commit,
2891 .bdrv_reopen_abort = sd_reopen_abort,
33b1db1c 2892 .bdrv_close = sd_close,
c282e1fd 2893 .bdrv_create = sd_create,
e4f5c1bf 2894 .bdrv_has_zero_init = bdrv_has_zero_init_1,
33b1db1c 2895 .bdrv_getlength = sd_getlength,
85829722 2896 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
33b1db1c
MK
2897 .bdrv_truncate = sd_truncate,
2898
2df46246
MK
2899 .bdrv_co_readv = sd_co_readv,
2900 .bdrv_co_writev = sd_co_writev,
47622c44 2901 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
dde47537 2902 .bdrv_co_pdiscard = sd_co_pdiscard,
b6b8a333 2903 .bdrv_co_get_block_status = sd_co_get_block_status,
33b1db1c
MK
2904
2905 .bdrv_snapshot_create = sd_snapshot_create,
2906 .bdrv_snapshot_goto = sd_snapshot_goto,
2907 .bdrv_snapshot_delete = sd_snapshot_delete,
2908 .bdrv_snapshot_list = sd_snapshot_list,
2909
2910 .bdrv_save_vmstate = sd_save_vmstate,
2911 .bdrv_load_vmstate = sd_load_vmstate,
2912
84390bed
SH
2913 .bdrv_detach_aio_context = sd_detach_aio_context,
2914 .bdrv_attach_aio_context = sd_attach_aio_context,
2915
b222237b 2916 .create_opts = &sd_create_opts,
33b1db1c
MK
2917};
2918
5d6768e3
MK
2919static BlockDriver bdrv_sheepdog_tcp = {
2920 .format_name = "sheepdog",
2921 .protocol_name = "sheepdog+tcp",
2922 .instance_size = sizeof(BDRVSheepdogState),
030be321 2923 .bdrv_needs_filename = true,
5d6768e3 2924 .bdrv_file_open = sd_open,
4da65c80
LY
2925 .bdrv_reopen_prepare = sd_reopen_prepare,
2926 .bdrv_reopen_commit = sd_reopen_commit,
2927 .bdrv_reopen_abort = sd_reopen_abort,
5d6768e3 2928 .bdrv_close = sd_close,
c282e1fd 2929 .bdrv_create = sd_create,
e4f5c1bf 2930 .bdrv_has_zero_init = bdrv_has_zero_init_1,
5d6768e3 2931 .bdrv_getlength = sd_getlength,
85829722 2932 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
5d6768e3
MK
2933 .bdrv_truncate = sd_truncate,
2934
2935 .bdrv_co_readv = sd_co_readv,
2936 .bdrv_co_writev = sd_co_writev,
2937 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
dde47537 2938 .bdrv_co_pdiscard = sd_co_pdiscard,
b6b8a333 2939 .bdrv_co_get_block_status = sd_co_get_block_status,
5d6768e3
MK
2940
2941 .bdrv_snapshot_create = sd_snapshot_create,
2942 .bdrv_snapshot_goto = sd_snapshot_goto,
2943 .bdrv_snapshot_delete = sd_snapshot_delete,
2944 .bdrv_snapshot_list = sd_snapshot_list,
2945
2946 .bdrv_save_vmstate = sd_save_vmstate,
2947 .bdrv_load_vmstate = sd_load_vmstate,
2948
84390bed
SH
2949 .bdrv_detach_aio_context = sd_detach_aio_context,
2950 .bdrv_attach_aio_context = sd_attach_aio_context,
2951
b222237b 2952 .create_opts = &sd_create_opts,
5d6768e3
MK
2953};
2954
1b8bbb46
MK
2955static BlockDriver bdrv_sheepdog_unix = {
2956 .format_name = "sheepdog",
2957 .protocol_name = "sheepdog+unix",
2958 .instance_size = sizeof(BDRVSheepdogState),
030be321 2959 .bdrv_needs_filename = true,
1b8bbb46 2960 .bdrv_file_open = sd_open,
4da65c80
LY
2961 .bdrv_reopen_prepare = sd_reopen_prepare,
2962 .bdrv_reopen_commit = sd_reopen_commit,
2963 .bdrv_reopen_abort = sd_reopen_abort,
1b8bbb46 2964 .bdrv_close = sd_close,
c282e1fd 2965 .bdrv_create = sd_create,
3ac21627 2966 .bdrv_has_zero_init = bdrv_has_zero_init_1,
1b8bbb46 2967 .bdrv_getlength = sd_getlength,
85829722 2968 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
1b8bbb46
MK
2969 .bdrv_truncate = sd_truncate,
2970
2971 .bdrv_co_readv = sd_co_readv,
2972 .bdrv_co_writev = sd_co_writev,
2973 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
dde47537 2974 .bdrv_co_pdiscard = sd_co_pdiscard,
b6b8a333 2975 .bdrv_co_get_block_status = sd_co_get_block_status,
1b8bbb46
MK
2976
2977 .bdrv_snapshot_create = sd_snapshot_create,
2978 .bdrv_snapshot_goto = sd_snapshot_goto,
2979 .bdrv_snapshot_delete = sd_snapshot_delete,
2980 .bdrv_snapshot_list = sd_snapshot_list,
2981
2982 .bdrv_save_vmstate = sd_save_vmstate,
2983 .bdrv_load_vmstate = sd_load_vmstate,
2984
84390bed
SH
2985 .bdrv_detach_aio_context = sd_detach_aio_context,
2986 .bdrv_attach_aio_context = sd_attach_aio_context,
2987
b222237b 2988 .create_opts = &sd_create_opts,
1b8bbb46
MK
2989};
2990
33b1db1c
MK
2991static void bdrv_sheepdog_init(void)
2992{
2993 bdrv_register(&bdrv_sheepdog);
5d6768e3 2994 bdrv_register(&bdrv_sheepdog_tcp);
1b8bbb46 2995 bdrv_register(&bdrv_sheepdog_unix);
33b1db1c
MK
2996}
2997block_init(bdrv_sheepdog_init);