]> git.proxmox.com Git - mirror_qemu.git/blame - block/sheepdog.c
rbd: Fix to cleanly reject -drive without pool or image
[mirror_qemu.git] / block / sheepdog.c
CommitLineData
33b1db1c
MK
1/*
2 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License version
6 * 2 as published by the Free Software Foundation.
7 *
8 * You should have received a copy of the GNU General Public License
9 * along with this program. If not, see <http://www.gnu.org/licenses/>.
6b620ca3
PB
10 *
11 * Contributions after 2012-01-13 are licensed under the terms of the
12 * GNU GPL, version 2 or (at your option) any later version.
33b1db1c 13 */
33b1db1c 14
80c71a24 15#include "qemu/osdep.h"
da34e65c 16#include "qapi/error.h"
831acdc9
MA
17#include "qapi/qmp/qdict.h"
18#include "qapi/qmp/qint.h"
5d6768e3 19#include "qemu/uri.h"
1de7afc9
PB
20#include "qemu/error-report.h"
21#include "qemu/sockets.h"
737e150e 22#include "block/block_int.h"
fba98d45 23#include "sysemu/block-backend.h"
1de7afc9 24#include "qemu/bitops.h"
f348b6d1 25#include "qemu/cutils.h"
33b1db1c
MK
26
/* Protocol version placed in every request header. */
#define SD_PROTO_VER 0x01

/* Defaults used when the user gives no server address. */
#define SD_DEFAULT_ADDR "localhost"
#define SD_DEFAULT_PORT 7000

/* Object operations. */
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ              0x02
#define SD_OP_WRITE_OBJ             0x03
/* 0x04 is used internally by Sheepdog */

/* VDI and cluster operations. */
#define SD_OP_NEW_VDI               0x11
#define SD_OP_LOCK_VDI              0x12
#define SD_OP_RELEASE_VDI           0x13
#define SD_OP_GET_VDI_INFO          0x14
#define SD_OP_READ_VDIS             0x15
#define SD_OP_FLUSH_VDI             0x16
#define SD_OP_DEL_VDI               0x17
#define SD_OP_GET_CLUSTER_DEFAULT   0x18

/* Request flags. */
#define SD_FLAG_CMD_WRITE    0x01
#define SD_FLAG_CMD_COW      0x02
#define SD_FLAG_CMD_CACHE    0x04 /* Writeback mode for cache */
#define SD_FLAG_CMD_DIRECT   0x08 /* Don't use cache */

/* Result codes carried in the response header. */
#define SD_RES_SUCCESS          0x00 /* Success */
#define SD_RES_UNKNOWN          0x01 /* Unknown error */
#define SD_RES_NO_OBJ           0x02 /* No object found */
#define SD_RES_EIO              0x03 /* I/O error */
#define SD_RES_VDI_EXIST        0x04 /* Vdi exists already */
#define SD_RES_INVALID_PARMS    0x05 /* Invalid parameters */
#define SD_RES_SYSTEM_ERROR     0x06 /* System error */
#define SD_RES_VDI_LOCKED       0x07 /* Vdi is locked */
#define SD_RES_NO_VDI           0x08 /* No vdi found */
#define SD_RES_NO_BASE_VDI      0x09 /* No base vdi found */
#define SD_RES_VDI_READ         0x0A /* Cannot read requested vdi */
#define SD_RES_VDI_WRITE        0x0B /* Cannot write requested vdi */
#define SD_RES_BASE_VDI_READ    0x0C /* Cannot read base vdi */
#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base vdi */
#define SD_RES_NO_TAG           0x0E /* Requested tag is not found */
#define SD_RES_STARTUP          0x0F /* Sheepdog is starting up */
#define SD_RES_VDI_NOT_LOCKED   0x10 /* Vdi is not locked */
#define SD_RES_SHUTDOWN         0x11 /* Sheepdog is shutting down */
#define SD_RES_NO_MEM           0x12 /* Cannot allocate memory */
#define SD_RES_FULL_VDI         0x13 /* we already have the maximum vdis */
#define SD_RES_VER_MISMATCH     0x14 /* Protocol version mismatch */
#define SD_RES_NO_SPACE         0x15 /* Server has no room for new objects */
#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN    0x17 /* Waiting for other nodes joining */
#define SD_RES_JOIN_FAILED      0x18 /* Target node had failed to join sheepdog */
#define SD_RES_HALT             0x19 /* Sheepdog has stopped serving IO requests */
#define SD_RES_READONLY         0x1A /* Object is read-only */
33b1db1c
MK
78
/*
 * Object ID rules
 *
 *  0 - 19 (20 bits): data object space
 * 20 - 31 (12 bits): reserved data object space
 * 32 - 55 (24 bits): vdi object space
 * 56 - 59 ( 4 bits): reserved vdi object space
 * 60 - 63 ( 4 bits): object type identifier space
 */

#define VDI_SPACE_SHIFT   32
#define VDI_BIT           (UINT64_C(1) << 63)
#define VMSTATE_BIT       (UINT64_C(1) << 62)
#define MAX_DATA_OBJS     (UINT64_C(1) << 20)
#define MAX_CHILDREN      1024
#define SD_MAX_VDI_LEN    256
#define SD_MAX_VDI_TAG_LEN 256
#define SD_NR_VDIS        (1U << 24)
#define SD_DATA_OBJ_SIZE  (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE   (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22

/*
 * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
 * (SD_EC_MAX_STRIP - 1) for parity strips
 *
 * SD_MAX_COPIES is sum of number of data strips and parity strips.
 */
#define SD_EC_MAX_STRIP 16
#define SD_MAX_COPIES   (SD_EC_MAX_STRIP * 2 - 1)

#define SD_INODE_SIZE   (sizeof(SheepdogInode))
#define CURRENT_VDI_ID  0

/* Values for the 'type' field of a VDI lock request. */
#define LOCK_TYPE_NORMAL 0
#define LOCK_TYPE_SHARED 1 /* for iSCSI multipath */
114
33b1db1c
MK
/*
 * Generic request header.  The first four 32-bit words have the same
 * layout in every request/response structure in this file; the
 * remaining 32 bytes are opcode specific.
 */
typedef struct SheepdogReq {
    /* common header */
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;    /* do_co_req() clamps *rlen to this value */
    /* per-opcode payload */
    uint32_t    opcode_specific[8];
} SheepdogReq;
124
/*
 * Generic response header: the common header fields followed by a
 * result code and 28 opcode-specific bytes.
 */
typedef struct SheepdogRsp {
    /* common header */
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    /* response part */
    uint32_t    result;
    uint32_t    opcode_specific[7];
} SheepdogRsp;
135
/* Request header for operations on a single object. */
typedef struct SheepdogObjReq {
    /* common header */
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    /* object request part */
    uint64_t    oid;
    uint64_t    cow_oid;
    uint8_t     copies;
    uint8_t     copy_policy;
    uint8_t     reserved[6];
    uint64_t    offset;
} SheepdogObjReq;
150
/*
 * Response header for object operations.  aio_read_response() receives
 * one of these and matches 'id' against the in-flight requests.
 */
typedef struct SheepdogObjRsp {
    /* common header */
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    /* object response part */
    uint32_t    result;         /* one of the SD_RES_* codes */
    uint8_t     copies;
    uint8_t     copy_policy;
    uint8_t     reserved[2];
    uint32_t    pad[6];
} SheepdogObjRsp;
164
/* Request header for VDI-level operations. */
typedef struct SheepdogVdiReq {
    /* common header */
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    /* vdi request part */
    uint64_t    vdi_size;
    uint32_t    base_vdi_id;
    uint8_t     copies;
    uint8_t     copy_policy;
    uint8_t     store_policy;
    uint8_t     block_size_shift;
    uint32_t    snapid;
    uint32_t    type;
    uint32_t    pad[2];
} SheepdogVdiReq;
182
/* Response header for VDI-level operations. */
typedef struct SheepdogVdiRsp {
    /* common header */
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    /* vdi response part */
    uint32_t    result;
    uint32_t    rsvd;
    uint32_t    vdi_id;
    uint32_t    pad[5];
} SheepdogVdiRsp;
195
876eb1b0
TI
/* Response header carrying cluster-wide settings. */
typedef struct SheepdogClusterRsp {
    /* common header */
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    /* cluster response part */
    uint32_t    result;
    uint8_t     nr_copies;
    uint8_t     copy_policy;
    uint8_t     block_size_shift;
    uint8_t     __pad1;
    uint32_t    __pad2[6];
} SheepdogClusterRsp;
210
33b1db1c
MK
211typedef struct SheepdogInode {
212 char name[SD_MAX_VDI_LEN];
213 char tag[SD_MAX_VDI_TAG_LEN];
214 uint64_t ctime;
215 uint64_t snap_ctime;
216 uint64_t vm_clock_nsec;
217 uint64_t vdi_size;
218 uint64_t vm_state_size;
219 uint16_t copy_policy;
220 uint8_t nr_copies;
221 uint8_t block_size_shift;
222 uint32_t snap_id;
223 uint32_t vdi_id;
224 uint32_t parent_vdi_id;
225 uint32_t child_vdi_id[MAX_CHILDREN];
226 uint32_t data_vdi_id[MAX_DATA_OBJS];
227} SheepdogInode;
228
5d039bab
HM
229#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id)
230
33b1db1c
MK
/*
 * 64 bit FNV-1a non-zero initial basis
 */
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)

/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Folds the @len bytes at @buf into @hval and returns the new hash.
 * Pass FNV1A_64_INIT as @hval for the first buffer; pass the previous
 * return value to continue hashing over several buffers.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t i;

    for (i = 0; i < len; i++) {
        hval ^= (uint64_t)bytes[i];
        /*
         * Multiply by the 64 bit FNV prime 2^40 + 2^8 + 0xb3 (mod 2^64);
         * this equals hval + hval << {1, 4, 5, 7, 8, 40}.
         */
        hval *= UINT64_C(0x100000001b3);
    }
    return hval;
}
250
2f536801 251static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
33b1db1c
MK
252{
253 return inode->vdi_id == inode->data_vdi_id[idx];
254}
255
2f536801 256static inline bool is_data_obj(uint64_t oid)
33b1db1c
MK
257{
258 return !(VDI_BIT & oid);
259}
260
261static inline uint64_t data_oid_to_idx(uint64_t oid)
262{
263 return oid & (MAX_DATA_OBJS - 1);
264}
265
72e0996c
MK
266static inline uint32_t oid_to_vid(uint64_t oid)
267{
268 return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
269}
270
33b1db1c
MK
271static inline uint64_t vid_to_vdi_oid(uint32_t vid)
272{
273 return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
274}
275
276static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
277{
278 return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
279}
280
281static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
282{
283 return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
284}
285
2f536801 286static inline bool is_snapshot(struct SheepdogInode *inode)
33b1db1c
MK
287{
288 return !!inode->snap_ctime;
289}
290
eab8eb8d
VT
291static inline size_t count_data_objs(const struct SheepdogInode *inode)
292{
293 return DIV_ROUND_UP(inode->vdi_size,
294 (1UL << inode->block_size_shift));
295}
296
/*
 * Debug tracing.  DEBUG_SDOG_PRINT is a compile-time constant, so the
 * fprintf() below is always type-checked by the compiler even when
 * DEBUG_SDOG is not defined and nothing is printed.
 */
#undef DPRINTF
#ifndef DEBUG_SDOG
#define DEBUG_SDOG_PRINT 0
#else
#define DEBUG_SDOG_PRINT 1
#endif
#define DPRINTF(fmt, args...)                                           \
    do {                                                                \
        if (DEBUG_SDOG_PRINT) {                                         \
            fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
        }                                                               \
    } while (0)
33b1db1c
MK
309
310typedef struct SheepdogAIOCB SheepdogAIOCB;
28ddd08c 311typedef struct BDRVSheepdogState BDRVSheepdogState;
33b1db1c
MK
312
313typedef struct AIOReq {
314 SheepdogAIOCB *aiocb;
315 unsigned int iov_offset;
316
317 uint64_t oid;
318 uint64_t base_oid;
319 uint64_t offset;
320 unsigned int data_len;
321 uint8_t flags;
322 uint32_t id;
b544c1ab 323 bool create;
33b1db1c 324
c292ee6a 325 QLIST_ENTRY(AIOReq) aio_siblings;
33b1db1c
MK
326} AIOReq;
327
/* Kind of operation a SheepdogAIOCB performs (stored in aiocb_type). */
enum AIOCBState {
    AIOCB_WRITE_UDATA = 0,
    AIOCB_READ_UDATA  = 1,
    AIOCB_FLUSH_CACHE = 2,
    AIOCB_DISCARD_OBJ = 3,
};
334
/*
 * True when the [min_affect_data_idx, max_affect_data_idx] ranges of the
 * two AIOCBs pointed to by x and y intersect.  Both bounds are inclusive.
 *
 * The arguments are parenthesized so that any pointer expression
 * (e.g. "array + 1") can be passed; each argument is evaluated twice,
 * so it must not have side effects.
 */
#define AIOCBOverlapping(x, y)                                    \
    (!((x)->max_affect_data_idx < (y)->min_affect_data_idx        \
       || (y)->max_affect_data_idx < (x)->min_affect_data_idx))
338
33b1db1c 339struct SheepdogAIOCB {
28ddd08c 340 BDRVSheepdogState *s;
33b1db1c
MK
341
342 QEMUIOVector *qiov;
343
344 int64_t sector_num;
345 int nb_sectors;
346
347 int ret;
348 enum AIOCBState aiocb_type;
349
2df46246 350 Coroutine *coroutine;
1d732d7d 351 int nr_pending;
6a55c82c
HM
352
353 uint32_t min_affect_data_idx;
354 uint32_t max_affect_data_idx;
355
498f2140
HM
356 /*
357 * The difference between affect_data_idx and dirty_data_idx:
358 * affect_data_idx represents range of index of all request types.
359 * dirty_data_idx represents range of index updated by COW requests.
360 * dirty_data_idx is used for updating an inode object.
361 */
362 uint32_t min_dirty_data_idx;
363 uint32_t max_dirty_data_idx;
364
6a55c82c 365 QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
33b1db1c
MK
366};
367
28ddd08c 368struct BDRVSheepdogState {
011603ca 369 BlockDriverState *bs;
84390bed 370 AioContext *aio_context;
011603ca 371
33b1db1c
MK
372 SheepdogInode inode;
373
33b1db1c 374 char name[SD_MAX_VDI_LEN];
2f536801 375 bool is_snapshot;
0e7106d8 376 uint32_t cache_flags;
cac8f4a6 377 bool discard_supported;
33b1db1c 378
8ecc2f9e 379 SocketAddress *addr;
33b1db1c
MK
380 int fd;
381
2df46246
MK
382 CoMutex lock;
383 Coroutine *co_send;
384 Coroutine *co_recv;
385
33b1db1c 386 uint32_t aioreq_seq_num;
011603ca
MK
387
388 /* Every aio request must be linked to either of these queues. */
c292ee6a 389 QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
011603ca 390 QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
6a55c82c 391
498f2140 392 CoQueue overlapping_queue;
6a55c82c 393 QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
28ddd08c 394};
33b1db1c 395
4da65c80
LY
/* Values staged while a reopen is in progress. */
typedef struct BDRVSheepdogReopenState {
    int fd;             /* socket descriptor */
    int cache_flags;    /* cache flags */
} BDRVSheepdogReopenState;
400
33b1db1c
MK
401static const char * sd_strerror(int err)
402{
403 int i;
404
405 static const struct {
406 int err;
407 const char *desc;
408 } errors[] = {
409 {SD_RES_SUCCESS, "Success"},
410 {SD_RES_UNKNOWN, "Unknown error"},
411 {SD_RES_NO_OBJ, "No object found"},
412 {SD_RES_EIO, "I/O error"},
413 {SD_RES_VDI_EXIST, "VDI exists already"},
414 {SD_RES_INVALID_PARMS, "Invalid parameters"},
415 {SD_RES_SYSTEM_ERROR, "System error"},
416 {SD_RES_VDI_LOCKED, "VDI is already locked"},
417 {SD_RES_NO_VDI, "No vdi found"},
418 {SD_RES_NO_BASE_VDI, "No base VDI found"},
419 {SD_RES_VDI_READ, "Failed read the requested VDI"},
420 {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
421 {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
422 {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
423 {SD_RES_NO_TAG, "Failed to find the requested tag"},
424 {SD_RES_STARTUP, "The system is still booting"},
425 {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
426 {SD_RES_SHUTDOWN, "The system is shutting down"},
427 {SD_RES_NO_MEM, "Out of memory on the server"},
428 {SD_RES_FULL_VDI, "We already have the maximum vdis"},
429 {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
430 {SD_RES_NO_SPACE, "Server has no space for new objects"},
431 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
432 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
433 {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
fca23f0a 434 {SD_RES_HALT, "Sheepdog is stopped serving IO request"},
6a0b5490 435 {SD_RES_READONLY, "Object is read-only"},
33b1db1c
MK
436 };
437
438 for (i = 0; i < ARRAY_SIZE(errors); ++i) {
439 if (errors[i].err == err) {
440 return errors[i].desc;
441 }
442 }
443
444 return "Invalid error code";
445}
446
447/*
448 * Sheepdog I/O handling:
449 *
2df46246 450 * 1. In sd_co_rw_vector, we send the I/O requests to the server and
c292ee6a 451 * link the requests to the inflight_list in the
e80ab33d 452 * BDRVSheepdogState. The function yields while waiting for
2df46246 453 * receiving the response.
33b1db1c 454 *
2df46246 455 * 2. We receive the response in aio_read_response, the fd handler to
e80ab33d
PB
456 * the sheepdog connection. We switch back to sd_co_readv/sd_co_writev
457 * after all the requests belonging to the AIOCB are finished. If
458 * needed, sd_co_writev will send further requests for the vdi object.
33b1db1c
MK
459 */
460
461static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
462 uint64_t oid, unsigned int data_len,
b544c1ab 463 uint64_t offset, uint8_t flags, bool create,
33b1db1c
MK
464 uint64_t base_oid, unsigned int iov_offset)
465{
466 AIOReq *aio_req;
467
7267c094 468 aio_req = g_malloc(sizeof(*aio_req));
33b1db1c
MK
469 aio_req->aiocb = acb;
470 aio_req->iov_offset = iov_offset;
471 aio_req->oid = oid;
472 aio_req->base_oid = base_oid;
473 aio_req->offset = offset;
474 aio_req->data_len = data_len;
475 aio_req->flags = flags;
476 aio_req->id = s->aioreq_seq_num++;
b544c1ab 477 aio_req->create = create;
33b1db1c 478
1d732d7d 479 acb->nr_pending++;
33b1db1c
MK
480 return aio_req;
481}
482
acf6e5f0
PB
483static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
484{
485 SheepdogAIOCB *cb;
486
487retry:
488 QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
489 if (AIOCBOverlapping(acb, cb)) {
1ace7cea 490 qemu_co_queue_wait(&s->overlapping_queue, NULL);
acf6e5f0
PB
491 goto retry;
492 }
493 }
494}
495
28ddd08c
PB
496static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
497 QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
498 int type)
33b1db1c 499{
6a55c82c 500 uint32_t object_size;
6a55c82c
HM
501
502 object_size = (UINT32_C(1) << s->inode.block_size_shift);
33b1db1c 503
28ddd08c 504 acb->s = s;
33b1db1c
MK
505
506 acb->qiov = qiov;
507
508 acb->sector_num = sector_num;
509 acb->nb_sectors = nb_sectors;
510
2df46246 511 acb->coroutine = qemu_coroutine_self();
33b1db1c 512 acb->ret = 0;
1d732d7d 513 acb->nr_pending = 0;
6a55c82c
HM
514
515 acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
516 acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
517 acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;
518
498f2140
HM
519 acb->min_dirty_data_idx = UINT32_MAX;
520 acb->max_dirty_data_idx = 0;
28ddd08c 521 acb->aiocb_type = type;
acf6e5f0
PB
522
523 if (type == AIOCB_FLUSH_CACHE) {
524 return;
525 }
526
527 wait_for_overlapping_aiocb(s, acb);
528 QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
33b1db1c
MK
529}
530
831acdc9
MA
531static SocketAddress *sd_socket_address(const char *path,
532 const char *host, const char *port)
533{
534 SocketAddress *addr = g_new0(SocketAddress, 1);
535
536 if (path) {
537 addr->type = SOCKET_ADDRESS_KIND_UNIX;
538 addr->u.q_unix.data = g_new0(UnixSocketAddress, 1);
539 addr->u.q_unix.data->path = g_strdup(path);
540 } else {
541 addr->type = SOCKET_ADDRESS_KIND_INET;
542 addr->u.inet.data = g_new0(InetSocketAddress, 1);
543 addr->u.inet.data->host = g_strdup(host ?: SD_DEFAULT_ADDR);
544 addr->u.inet.data->port = g_strdup(port ?: stringify(SD_DEFAULT_PORT));
545 }
546
547 return addr;
548}
549
833a7cc3 550/* Return -EIO in case of error, file descriptor on success */
dfb12bf8 551static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
33b1db1c 552{
25af257d 553 int fd;
33b1db1c 554
8ecc2f9e 555 fd = socket_connect(s->addr, errp, NULL, NULL);
1b8bbb46 556
8ecc2f9e
MA
557 if (s->addr->type == SOCKET_ADDRESS_KIND_INET && fd >= 0) {
558 int ret = socket_set_nodelay(fd);
559 if (ret < 0) {
560 error_report("%s", strerror(errno));
1b8bbb46
MK
561 }
562 }
33b1db1c 563
dfb12bf8 564 if (fd >= 0) {
f9e8cacc 565 qemu_set_nonblock(fd);
833a7cc3
LY
566 } else {
567 fd = -EIO;
33b1db1c
MK
568 }
569
33b1db1c
MK
570 return fd;
571}
572
833a7cc3 573/* Return 0 on success and -errno in case of error */
e0d93a89
MK
574static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
575 unsigned int *wlen)
47622c44
LY
576{
577 int ret;
578
579 ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
80731d9d 580 if (ret != sizeof(*hdr)) {
47622c44 581 error_report("failed to send a req, %s", strerror(errno));
b16a44e1 582 return -errno;
47622c44
LY
583 }
584
585 ret = qemu_co_send(sockfd, data, *wlen);
80731d9d 586 if (ret != *wlen) {
47622c44 587 error_report("failed to send a req, %s", strerror(errno));
b16a44e1 588 return -errno;
47622c44
LY
589 }
590
591 return ret;
592}
e0d93a89 593
cddd4ac7
MK
594typedef struct SheepdogReqCo {
595 int sockfd;
f11672db 596 BlockDriverState *bs;
84390bed 597 AioContext *aio_context;
cddd4ac7
MK
598 SheepdogReq *hdr;
599 void *data;
600 unsigned int *wlen;
601 unsigned int *rlen;
602 int ret;
603 bool finished;
9d456654 604 Coroutine *co;
cddd4ac7
MK
605} SheepdogReqCo;
606
9d456654
PB
607static void restart_co_req(void *opaque)
608{
609 SheepdogReqCo *srco = opaque;
610
611 aio_co_wake(srco->co);
612}
613
cddd4ac7 614static coroutine_fn void do_co_req(void *opaque)
47622c44
LY
615{
616 int ret;
cddd4ac7
MK
617 SheepdogReqCo *srco = opaque;
618 int sockfd = srco->sockfd;
619 SheepdogReq *hdr = srco->hdr;
620 void *data = srco->data;
621 unsigned int *wlen = srco->wlen;
622 unsigned int *rlen = srco->rlen;
2dfcca3b 623
9d456654 624 srco->co = qemu_coroutine_self();
dca21ef2 625 aio_set_fd_handler(srco->aio_context, sockfd, false,
9d456654 626 NULL, restart_co_req, NULL, srco);
47622c44 627
47622c44
LY
628 ret = send_co_req(sockfd, hdr, data, wlen);
629 if (ret < 0) {
630 goto out;
631 }
632
dca21ef2 633 aio_set_fd_handler(srco->aio_context, sockfd, false,
9d456654 634 restart_co_req, NULL, NULL, srco);
2dfcca3b 635
47622c44 636 ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
80731d9d 637 if (ret != sizeof(*hdr)) {
47622c44 638 error_report("failed to get a rsp, %s", strerror(errno));
cb595887 639 ret = -errno;
47622c44
LY
640 goto out;
641 }
642
643 if (*rlen > hdr->data_length) {
644 *rlen = hdr->data_length;
645 }
646
647 if (*rlen) {
648 ret = qemu_co_recv(sockfd, data, *rlen);
80731d9d 649 if (ret != *rlen) {
47622c44 650 error_report("failed to get the data, %s", strerror(errno));
cb595887 651 ret = -errno;
47622c44
LY
652 goto out;
653 }
654 }
655 ret = 0;
656out:
ed9ba724
MK
657 /* there is at most one request for this sockfd, so it is safe to
658 * set each handler to NULL. */
dca21ef2 659 aio_set_fd_handler(srco->aio_context, sockfd, false,
f6a51c84 660 NULL, NULL, NULL, NULL);
cddd4ac7 661
9d456654 662 srco->co = NULL;
cddd4ac7
MK
663 srco->ret = ret;
664 srco->finished = true;
c9d1a561
PB
665 if (srco->bs) {
666 bdrv_wakeup(srco->bs);
667 }
cddd4ac7
MK
668}
669
833a7cc3
LY
670/*
671 * Send the request to the sheep in a synchronous manner.
672 *
673 * Return 0 on success, -errno in case of error.
674 */
f11672db 675static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
84390bed 676 void *data, unsigned int *wlen, unsigned int *rlen)
cddd4ac7
MK
677{
678 Coroutine *co;
679 SheepdogReqCo srco = {
680 .sockfd = sockfd,
f11672db
PB
681 .aio_context = bs ? bdrv_get_aio_context(bs) : qemu_get_aio_context(),
682 .bs = bs,
cddd4ac7
MK
683 .hdr = hdr,
684 .data = data,
685 .wlen = wlen,
686 .rlen = rlen,
687 .ret = 0,
688 .finished = false,
689 };
690
691 if (qemu_in_coroutine()) {
692 do_co_req(&srco);
693 } else {
0b8b8753 694 co = qemu_coroutine_create(do_co_req, &srco);
f11672db
PB
695 if (bs) {
696 qemu_coroutine_enter(co);
697 BDRV_POLL_WHILE(bs, !srco.finished);
698 } else {
699 qemu_coroutine_enter(co);
700 while (!srco.finished) {
701 aio_poll(qemu_get_aio_context(), true);
702 }
cddd4ac7
MK
703 }
704 }
705
706 return srco.ret;
47622c44
LY
707}
708
a37dcdf9 709static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
b544c1ab
HM
710 struct iovec *iov, int niov,
711 enum AIOCBState aiocb_type);
a37dcdf9 712static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
72e0996c 713static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
356b4ca2 714static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
011603ca 715static void co_write_request(void *opaque);
7dc1cde0 716
011603ca
MK
717static coroutine_fn void reconnect_to_sdog(void *opaque)
718{
719 BDRVSheepdogState *s = opaque;
720 AIOReq *aio_req, *next;
721
dca21ef2 722 aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
f6a51c84 723 NULL, NULL, NULL);
011603ca
MK
724 close(s->fd);
725 s->fd = -1;
726
727 /* Wait for outstanding write requests to be completed. */
728 while (s->co_send != NULL) {
729 co_write_request(opaque);
730 }
731
732 /* Try to reconnect the sheepdog server every one second. */
733 while (s->fd < 0) {
a780dea0 734 Error *local_err = NULL;
356b4ca2 735 s->fd = get_sheep_fd(s, &local_err);
011603ca
MK
736 if (s->fd < 0) {
737 DPRINTF("Wait for connection to be established\n");
565f65d2 738 error_report_err(local_err);
011603ca
MK
739 co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME,
740 1000000000ULL);
741 }
742 };
743
744 /*
745 * Now we have to resend all the request in the inflight queue. However,
746 * resend_aioreq() can yield and newly created requests can be added to the
747 * inflight queue before the coroutine is resumed. To avoid mixing them, we
748 * have to move all the inflight requests to the failed queue before
749 * resend_aioreq() is called.
750 */
751 QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) {
752 QLIST_REMOVE(aio_req, aio_siblings);
753 QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings);
754 }
755
756 /* Resend all the failed aio requests. */
757 while (!QLIST_EMPTY(&s->failed_aio_head)) {
758 aio_req = QLIST_FIRST(&s->failed_aio_head);
759 QLIST_REMOVE(aio_req, aio_siblings);
011603ca
MK
760 resend_aioreq(s, aio_req);
761 }
762}
763
33b1db1c
MK
764/*
765 * Receive responses of the I/O requests.
766 *
767 * This function is registered as a fd handler, and called from the
768 * main loop when s->fd is ready for reading responses.
769 */
d8716b41 770static void coroutine_fn aio_read_response(void *opaque)
33b1db1c
MK
771{
772 SheepdogObjRsp rsp;
773 BDRVSheepdogState *s = opaque;
774 int fd = s->fd;
775 int ret;
776 AIOReq *aio_req = NULL;
777 SheepdogAIOCB *acb;
cac8f4a6 778 uint64_t idx;
33b1db1c 779
33b1db1c 780 /* read a header */
8c5135f9 781 ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
80731d9d 782 if (ret != sizeof(rsp)) {
6daf194d 783 error_report("failed to get the header, %s", strerror(errno));
011603ca 784 goto err;
33b1db1c
MK
785 }
786
c292ee6a
MK
787 /* find the right aio_req from the inflight aio list */
788 QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
33b1db1c
MK
789 if (aio_req->id == rsp.id) {
790 break;
791 }
792 }
793 if (!aio_req) {
6daf194d 794 error_report("cannot find aio_req %x", rsp.id);
011603ca 795 goto err;
33b1db1c
MK
796 }
797
798 acb = aio_req->aiocb;
799
800 switch (acb->aiocb_type) {
801 case AIOCB_WRITE_UDATA:
802 if (!is_data_obj(aio_req->oid)) {
803 break;
804 }
805 idx = data_oid_to_idx(aio_req->oid);
806
b544c1ab 807 if (aio_req->create) {
33b1db1c
MK
808 /*
809 * If the object is newly created one, we need to update
810 * the vdi object (metadata object). min_dirty_data_idx
811 * and max_dirty_data_idx are changed to include updated
812 * index between them.
813 */
bd751f22
LY
814 if (rsp.result == SD_RES_SUCCESS) {
815 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
498f2140
HM
816 acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx);
817 acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx);
bd751f22 818 }
33b1db1c
MK
819 }
820 break;
821 case AIOCB_READ_UDATA:
2fc8ae1d
MT
822 ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
823 aio_req->iov_offset, rsp.data_length);
80731d9d 824 if (ret != rsp.data_length) {
6daf194d 825 error_report("failed to get the data, %s", strerror(errno));
011603ca 826 goto err;
33b1db1c
MK
827 }
828 break;
47783072
LY
829 case AIOCB_FLUSH_CACHE:
830 if (rsp.result == SD_RES_INVALID_PARMS) {
2440a2c3 831 DPRINTF("disable cache since the server doesn't support it\n");
47783072
LY
832 s->cache_flags = SD_FLAG_CMD_DIRECT;
833 rsp.result = SD_RES_SUCCESS;
834 }
835 break;
cac8f4a6
LY
836 case AIOCB_DISCARD_OBJ:
837 switch (rsp.result) {
838 case SD_RES_INVALID_PARMS:
8ecc2f9e 839 error_report("server doesn't support discard command");
cac8f4a6
LY
840 rsp.result = SD_RES_SUCCESS;
841 s->discard_supported = false;
842 break;
cac8f4a6
LY
843 default:
844 break;
845 }
33b1db1c
MK
846 }
847
e80ab33d
PB
848 /* No more data for this aio_req (reload_inode below uses its own file
849 * descriptor handler which doesn't use co_recv).
850 */
851 s->co_recv = NULL;
852
c4080e93 853 QLIST_REMOVE(aio_req, aio_siblings);
13c31de2
MK
854 switch (rsp.result) {
855 case SD_RES_SUCCESS:
856 break;
857 case SD_RES_READONLY:
72e0996c
MK
858 if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) {
859 ret = reload_inode(s, 0, "");
860 if (ret < 0) {
011603ca 861 goto err;
72e0996c
MK
862 }
863 }
72e0996c
MK
864 if (is_data_obj(aio_req->oid)) {
865 aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
866 data_oid_to_idx(aio_req->oid));
867 } else {
868 aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
869 }
a37dcdf9 870 resend_aioreq(s, aio_req);
e80ab33d 871 return;
13c31de2 872 default:
33b1db1c 873 acb->ret = -EIO;
6daf194d 874 error_report("%s", sd_strerror(rsp.result));
13c31de2 875 break;
33b1db1c
MK
876 }
877
c4080e93
PB
878 g_free(aio_req);
879
880 if (!--acb->nr_pending) {
33b1db1c
MK
881 /*
882 * We've finished all requests which belong to the AIOCB, so
2df46246 883 * we can switch back to sd_co_readv/writev now.
33b1db1c 884 */
9d456654 885 aio_co_wake(acb->coroutine);
33b1db1c 886 }
e80ab33d 887
011603ca 888 return;
e80ab33d 889
011603ca 890err:
011603ca 891 reconnect_to_sdog(opaque);
2df46246
MK
892}
893
894static void co_read_response(void *opaque)
895{
896 BDRVSheepdogState *s = opaque;
897
898 if (!s->co_recv) {
0b8b8753 899 s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
2df46246
MK
900 }
901
9d456654 902 aio_co_wake(s->co_recv);
2df46246
MK
903}
904
905static void co_write_request(void *opaque)
906{
907 BDRVSheepdogState *s = opaque;
908
9d456654 909 aio_co_wake(s->co_send);
33b1db1c
MK
910}
911
33b1db1c 912/*
dc6fb73d 913 * Return a socket descriptor to read/write objects.
33b1db1c 914 *
dc6fb73d 915 * We cannot use this descriptor for other operations because
33b1db1c
MK
916 * the block driver may be on waiting response from the server.
917 */
356b4ca2 918static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
33b1db1c 919{
1b8bbb46 920 int fd;
33b1db1c 921
356b4ca2 922 fd = connect_to_sdog(s, errp);
33b1db1c 923 if (fd < 0) {
cb595887 924 return fd;
33b1db1c
MK
925 }
926
dca21ef2 927 aio_set_fd_handler(s->aio_context, fd, false,
f6a51c84 928 co_read_response, NULL, NULL, s);
33b1db1c
MK
929 return fd;
930}
931
89e2a31d
MA
932/*
933 * Parse numeric snapshot ID in @str
934 * If @str can't be parsed as number, return false.
935 * Else, if the number is zero or too large, set *@snapid to zero and
936 * return true.
937 * Else, set *@snapid to the number and return true.
938 */
939static bool sd_parse_snapid(const char *str, uint32_t *snapid)
940{
941 unsigned long ul;
942 int ret;
943
944 ret = qemu_strtoul(str, NULL, 10, &ul);
945 if (ret == -ERANGE) {
946 ul = ret = 0;
947 }
948 if (ret) {
949 return false;
950 }
951 if (ul > UINT32_MAX) {
952 ul = 0;
953 }
954
955 *snapid = ul;
956 return true;
957}
958
959static bool sd_parse_snapid_or_tag(const char *str,
960 uint32_t *snapid, char tag[])
961{
962 if (!sd_parse_snapid(str, snapid)) {
963 *snapid = 0;
964 if (g_strlcpy(tag, str, SD_MAX_VDI_TAG_LEN) >= SD_MAX_VDI_TAG_LEN) {
965 return false;
966 }
967 } else if (!*snapid) {
968 return false;
969 } else {
970 tag[0] = 0;
971 }
972 return true;
973}
974
831acdc9
MA
975typedef struct {
976 const char *path; /* non-null iff transport is tcp */
977 const char *host; /* valid when transport is tcp */
978 int port; /* valid when transport is tcp */
979 char vdi[SD_MAX_VDI_LEN];
980 char tag[SD_MAX_VDI_TAG_LEN];
981 uint32_t snap_id;
982 /* Remainder is only for sd_config_done() */
983 URI *uri;
984 QueryParams *qp;
985} SheepdogConfig;
986
987static void sd_config_done(SheepdogConfig *cfg)
988{
989 if (cfg->qp) {
990 query_params_free(cfg->qp);
991 }
992 uri_free(cfg->uri);
993}
994
995static void sd_parse_uri(SheepdogConfig *cfg, const char *filename,
36bcac16 996 Error **errp)
5d6768e3 997{
36bcac16 998 Error *err = NULL;
5d6768e3 999 QueryParams *qp = NULL;
8ecc2f9e
MA
1000 bool is_unix;
1001 URI *uri;
5d6768e3 1002
831acdc9
MA
1003 memset(cfg, 0, sizeof(*cfg));
1004
1005 cfg->uri = uri = uri_parse(filename);
5d6768e3 1006 if (!uri) {
36bcac16
MA
1007 error_setg(&err, "invalid URI");
1008 goto out;
5d6768e3
MK
1009 }
1010
1b8bbb46
MK
1011 /* transport */
1012 if (!strcmp(uri->scheme, "sheepdog")) {
8ecc2f9e 1013 is_unix = false;
1b8bbb46 1014 } else if (!strcmp(uri->scheme, "sheepdog+tcp")) {
8ecc2f9e 1015 is_unix = false;
1b8bbb46 1016 } else if (!strcmp(uri->scheme, "sheepdog+unix")) {
8ecc2f9e 1017 is_unix = true;
1b8bbb46 1018 } else {
36bcac16
MA
1019 error_setg(&err, "URI scheme must be 'sheepdog', 'sheepdog+tcp',"
1020 " or 'sheepdog+unix'");
1b8bbb46
MK
1021 goto out;
1022 }
1023
5d6768e3 1024 if (uri->path == NULL || !strcmp(uri->path, "/")) {
36bcac16 1025 error_setg(&err, "missing file path in URI");
5d6768e3
MK
1026 goto out;
1027 }
831acdc9
MA
1028 if (g_strlcpy(cfg->vdi, uri->path + 1, SD_MAX_VDI_LEN)
1029 >= SD_MAX_VDI_LEN) {
36bcac16 1030 error_setg(&err, "VDI name is too long");
daa0b0d4
MA
1031 goto out;
1032 }
5d6768e3 1033
831acdc9 1034 cfg->qp = qp = query_params_parse(uri->query);
1b8bbb46 1035
8ecc2f9e 1036 if (is_unix) {
1b8bbb46 1037 /* sheepdog+unix:///vdiname?socket=path */
36bcac16
MA
1038 if (uri->server || uri->port) {
1039 error_setg(&err, "URI scheme %s doesn't accept a server address",
1040 uri->scheme);
1041 goto out;
1042 }
1043 if (!qp->n) {
1044 error_setg(&err,
1045 "URI scheme %s requires query parameter 'socket'",
1046 uri->scheme);
1047 goto out;
1048 }
1049 if (qp->n != 1 || strcmp(qp->p[0].name, "socket")) {
1050 error_setg(&err, "unexpected query parameters");
1b8bbb46
MK
1051 goto out;
1052 }
831acdc9 1053 cfg->path = qp->p[0].value;
1b8bbb46
MK
1054 } else {
1055 /* sheepdog[+tcp]://[host:port]/vdiname */
36bcac16
MA
1056 if (qp->n) {
1057 error_setg(&err, "unexpected query parameters");
1058 goto out;
1059 }
831acdc9
MA
1060 cfg->host = uri->server;
1061 cfg->port = uri->port;
1b8bbb46 1062 }
5d6768e3
MK
1063
1064 /* snapshot tag */
1065 if (uri->fragment) {
831acdc9
MA
1066 if (!sd_parse_snapid_or_tag(uri->fragment,
1067 &cfg->snap_id, cfg->tag)) {
36bcac16
MA
1068 error_setg(&err, "'%s' is not a valid snapshot ID",
1069 uri->fragment);
89e2a31d 1070 goto out;
5d6768e3
MK
1071 }
1072 } else {
831acdc9 1073 cfg->snap_id = CURRENT_VDI_ID; /* search current vdi */
5d6768e3
MK
1074 }
1075
1076out:
8ecc2f9e
MA
1077 if (err) {
1078 error_propagate(errp, err);
831acdc9 1079 sd_config_done(cfg);
5d6768e3 1080 }
5d6768e3
MK
1081}
1082
33b1db1c 1083/*
5d6768e3 1084 * Parse a filename (old syntax)
33b1db1c
MK
1085 *
1086 * filename must be one of the following formats:
1087 * 1. [vdiname]
1088 * 2. [vdiname]:[snapid]
1089 * 3. [vdiname]:[tag]
1090 * 4. [hostname]:[port]:[vdiname]
1091 * 5. [hostname]:[port]:[vdiname]:[snapid]
1092 * 6. [hostname]:[port]:[vdiname]:[tag]
1093 *
1094 * You can boot from the snapshot images by specifying `snapid` or
1095 * `tag'.
1096 *
1097 * You can run VMs outside the Sheepdog cluster by specifying
1098 * `hostname' and `port' (experimental).
1099 */
831acdc9 1100static void parse_vdiname(SheepdogConfig *cfg, const char *filename,
36bcac16 1101 Error **errp)
33b1db1c 1102{
36bcac16 1103 Error *err = NULL;
5d6768e3
MK
1104 char *p, *q, *uri;
1105 const char *host_spec, *vdi_spec;
36bcac16 1106 int nr_sep;
33b1db1c 1107
11d816a5 1108 strstart(filename, "sheepdog:", &filename);
7267c094 1109 p = q = g_strdup(filename);
33b1db1c
MK
1110
1111 /* count the number of separators */
1112 nr_sep = 0;
1113 while (*p) {
1114 if (*p == ':') {
1115 nr_sep++;
1116 }
1117 p++;
1118 }
1119 p = q;
1120
5d6768e3 1121 /* use the first two tokens as host_spec. */
33b1db1c 1122 if (nr_sep >= 2) {
5d6768e3 1123 host_spec = p;
33b1db1c 1124 p = strchr(p, ':');
5d6768e3 1125 p++;
33b1db1c
MK
1126 p = strchr(p, ':');
1127 *p++ = '\0';
1128 } else {
5d6768e3 1129 host_spec = "";
33b1db1c
MK
1130 }
1131
5d6768e3 1132 vdi_spec = p;
33b1db1c 1133
5d6768e3 1134 p = strchr(vdi_spec, ':');
33b1db1c 1135 if (p) {
5d6768e3 1136 *p++ = '#';
33b1db1c
MK
1137 }
1138
5d6768e3 1139 uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);
33b1db1c 1140
36bcac16
MA
1141 /*
1142 * FIXME We to escape URI meta-characters, e.g. "x?y=z"
1143 * produces "sheepdog://x?y=z". Because of that ...
1144 */
831acdc9 1145 sd_parse_uri(cfg, uri, &err);
36bcac16
MA
1146 if (err) {
1147 /*
1148 * ... this can fail, but the error message is misleading.
1149 * Replace it by the traditional useless one until the
1150 * escaping is fixed.
1151 */
1152 error_free(err);
1153 error_setg(errp, "Can't parse filename");
1154 }
5d6768e3
MK
1155
1156 g_free(q);
1157 g_free(uri);
33b1db1c
MK
1158}
1159
831acdc9
MA
1160static void sd_parse_filename(const char *filename, QDict *options,
1161 Error **errp)
1162{
1163 Error *err = NULL;
1164 SheepdogConfig cfg;
1165 char buf[32];
1166
1167 if (strstr(filename, "://")) {
1168 sd_parse_uri(&cfg, filename, &err);
1169 } else {
1170 parse_vdiname(&cfg, filename, &err);
1171 }
1172 if (err) {
1173 error_propagate(errp, err);
1174 return;
1175 }
1176
1177 if (cfg.host) {
1178 qdict_set_default_str(options, "host", cfg.host);
1179 }
1180 if (cfg.port) {
1181 snprintf(buf, sizeof(buf), "%d", cfg.port);
1182 qdict_set_default_str(options, "port", buf);
1183 }
1184 if (cfg.path) {
1185 qdict_set_default_str(options, "path", cfg.path);
1186 }
1187 qdict_set_default_str(options, "vdi", cfg.vdi);
1188 qdict_set_default_str(options, "tag", cfg.tag);
1189 if (cfg.snap_id) {
1190 snprintf(buf, sizeof(buf), "%d", cfg.snap_id);
1191 qdict_set_default_str(options, "snap-id", buf);
1192 }
1193
1194 sd_config_done(&cfg);
1195}
1196
982dcbf4
MK
1197static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
1198 uint32_t snapid, const char *tag, uint32_t *vid,
dc83cd42 1199 bool lock, Error **errp)
33b1db1c
MK
1200{
1201 int ret, fd;
1202 SheepdogVdiReq hdr;
1203 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1204 unsigned int wlen, rlen = 0;
1205 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
1206
dc83cd42 1207 fd = connect_to_sdog(s, errp);
33b1db1c 1208 if (fd < 0) {
cb595887 1209 return fd;
33b1db1c
MK
1210 }
1211
3178e275
JM
1212 /* This pair of strncpy calls ensures that the buffer is zero-filled,
1213 * which is desirable since we'll soon be sending those bytes, and
1214 * don't want the send_req to read uninitialized data.
1215 */
33b1db1c
MK
1216 strncpy(buf, filename, SD_MAX_VDI_LEN);
1217 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
1218
1219 memset(&hdr, 0, sizeof(hdr));
982dcbf4 1220 if (lock) {
33b1db1c 1221 hdr.opcode = SD_OP_LOCK_VDI;
1dbfafed 1222 hdr.type = LOCK_TYPE_NORMAL;
982dcbf4
MK
1223 } else {
1224 hdr.opcode = SD_OP_GET_VDI_INFO;
33b1db1c
MK
1225 }
1226 wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
1227 hdr.proto_ver = SD_PROTO_VER;
1228 hdr.data_length = wlen;
1229 hdr.snapid = snapid;
1230 hdr.flags = SD_FLAG_CMD_WRITE;
1231
f11672db 1232 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
33b1db1c 1233 if (ret) {
dc83cd42 1234 error_setg_errno(errp, -ret, "cannot get vdi info");
33b1db1c
MK
1235 goto out;
1236 }
1237
1238 if (rsp->result != SD_RES_SUCCESS) {
dc83cd42
MA
1239 error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s",
1240 sd_strerror(rsp->result), filename, snapid, tag);
cb595887
MK
1241 if (rsp->result == SD_RES_NO_VDI) {
1242 ret = -ENOENT;
38890b24
HM
1243 } else if (rsp->result == SD_RES_VDI_LOCKED) {
1244 ret = -EBUSY;
cb595887
MK
1245 } else {
1246 ret = -EIO;
1247 }
33b1db1c
MK
1248 goto out;
1249 }
1250 *vid = rsp->vdi_id;
1251
1252 ret = 0;
1253out:
1254 closesocket(fd);
1255 return ret;
1256}
1257
a37dcdf9 1258static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
b544c1ab
HM
1259 struct iovec *iov, int niov,
1260 enum AIOCBState aiocb_type)
33b1db1c
MK
1261{
1262 int nr_copies = s->inode.nr_copies;
1263 SheepdogObjReq hdr;
47783072 1264 unsigned int wlen = 0;
33b1db1c
MK
1265 int ret;
1266 uint64_t oid = aio_req->oid;
1267 unsigned int datalen = aio_req->data_len;
1268 uint64_t offset = aio_req->offset;
1269 uint8_t flags = aio_req->flags;
1270 uint64_t old_oid = aio_req->base_oid;
b544c1ab 1271 bool create = aio_req->create;
33b1db1c 1272
c4080e93
PB
1273 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
1274
33b1db1c 1275 if (!nr_copies) {
6daf194d 1276 error_report("bug");
33b1db1c
MK
1277 }
1278
1279 memset(&hdr, 0, sizeof(hdr));
1280
47783072
LY
1281 switch (aiocb_type) {
1282 case AIOCB_FLUSH_CACHE:
1283 hdr.opcode = SD_OP_FLUSH_VDI;
1284 break;
1285 case AIOCB_READ_UDATA:
33b1db1c
MK
1286 hdr.opcode = SD_OP_READ_OBJ;
1287 hdr.flags = flags;
47783072
LY
1288 break;
1289 case AIOCB_WRITE_UDATA:
1290 if (create) {
1291 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1292 } else {
1293 hdr.opcode = SD_OP_WRITE_OBJ;
1294 }
33b1db1c 1295 wlen = datalen;
33b1db1c 1296 hdr.flags = SD_FLAG_CMD_WRITE | flags;
47783072 1297 break;
cac8f4a6 1298 case AIOCB_DISCARD_OBJ:
e6fd57ea
HM
1299 hdr.opcode = SD_OP_WRITE_OBJ;
1300 hdr.flags = SD_FLAG_CMD_WRITE | flags;
1301 s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0;
1302 offset = offsetof(SheepdogInode,
1303 data_vdi_id[data_oid_to_idx(oid)]);
1304 oid = vid_to_vdi_oid(s->inode.vdi_id);
1305 wlen = datalen = sizeof(uint32_t);
cac8f4a6 1306 break;
33b1db1c
MK
1307 }
1308
0e7106d8
LY
1309 if (s->cache_flags) {
1310 hdr.flags |= s->cache_flags;
47622c44
LY
1311 }
1312
33b1db1c
MK
1313 hdr.oid = oid;
1314 hdr.cow_oid = old_oid;
1315 hdr.copies = s->inode.nr_copies;
1316
1317 hdr.data_length = datalen;
1318 hdr.offset = offset;
1319
1320 hdr.id = aio_req->id;
1321
2df46246
MK
1322 qemu_co_mutex_lock(&s->lock);
1323 s->co_send = qemu_coroutine_self();
dca21ef2 1324 aio_set_fd_handler(s->aio_context, s->fd, false,
f6a51c84 1325 co_read_response, co_write_request, NULL, s);
128aa589 1326 socket_set_cork(s->fd, 1);
33b1db1c
MK
1327
1328 /* send a header */
8c5135f9 1329 ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
80731d9d 1330 if (ret != sizeof(hdr)) {
6daf194d 1331 error_report("failed to send a req, %s", strerror(errno));
011603ca 1332 goto out;
33b1db1c
MK
1333 }
1334
1335 if (wlen) {
2fc8ae1d 1336 ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen);
80731d9d 1337 if (ret != wlen) {
6daf194d 1338 error_report("failed to send a data, %s", strerror(errno));
33b1db1c
MK
1339 }
1340 }
011603ca 1341out:
128aa589 1342 socket_set_cork(s->fd, 0);
dca21ef2 1343 aio_set_fd_handler(s->aio_context, s->fd, false,
f6a51c84 1344 co_read_response, NULL, NULL, s);
011603ca 1345 s->co_send = NULL;
2df46246 1346 qemu_co_mutex_unlock(&s->lock);
33b1db1c
MK
1347}
1348
f11672db 1349static int read_write_object(int fd, BlockDriverState *bs, char *buf,
84390bed 1350 uint64_t oid, uint8_t copies,
33b1db1c 1351 unsigned int datalen, uint64_t offset,
0e7106d8 1352 bool write, bool create, uint32_t cache_flags)
33b1db1c
MK
1353{
1354 SheepdogObjReq hdr;
1355 SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1356 unsigned int wlen, rlen;
1357 int ret;
1358
1359 memset(&hdr, 0, sizeof(hdr));
1360
1361 if (write) {
1362 wlen = datalen;
1363 rlen = 0;
1364 hdr.flags = SD_FLAG_CMD_WRITE;
1365 if (create) {
1366 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1367 } else {
1368 hdr.opcode = SD_OP_WRITE_OBJ;
1369 }
1370 } else {
1371 wlen = 0;
1372 rlen = datalen;
1373 hdr.opcode = SD_OP_READ_OBJ;
1374 }
47622c44 1375
0e7106d8 1376 hdr.flags |= cache_flags;
47622c44 1377
33b1db1c
MK
1378 hdr.oid = oid;
1379 hdr.data_length = datalen;
1380 hdr.offset = offset;
1381 hdr.copies = copies;
1382
f11672db 1383 ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
33b1db1c 1384 if (ret) {
6daf194d 1385 error_report("failed to send a request to the sheep");
cb595887 1386 return ret;
33b1db1c
MK
1387 }
1388
1389 switch (rsp->result) {
1390 case SD_RES_SUCCESS:
1391 return 0;
1392 default:
6daf194d 1393 error_report("%s", sd_strerror(rsp->result));
cb595887 1394 return -EIO;
33b1db1c
MK
1395 }
1396}
1397
f11672db 1398static int read_object(int fd, BlockDriverState *bs, char *buf,
84390bed 1399 uint64_t oid, uint8_t copies,
0e7106d8
LY
1400 unsigned int datalen, uint64_t offset,
1401 uint32_t cache_flags)
33b1db1c 1402{
f11672db 1403 return read_write_object(fd, bs, buf, oid, copies,
84390bed 1404 datalen, offset, false,
0e7106d8 1405 false, cache_flags);
33b1db1c
MK
1406}
1407
f11672db 1408static int write_object(int fd, BlockDriverState *bs, char *buf,
84390bed 1409 uint64_t oid, uint8_t copies,
2f536801 1410 unsigned int datalen, uint64_t offset, bool create,
0e7106d8 1411 uint32_t cache_flags)
33b1db1c 1412{
f11672db 1413 return read_write_object(fd, bs, buf, oid, copies,
84390bed 1414 datalen, offset, true,
0e7106d8 1415 create, cache_flags);
33b1db1c
MK
1416}
1417
9ff53a0e
MK
1418/* update inode with the latest state */
1419static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
1420{
dfb12bf8 1421 Error *local_err = NULL;
9ff53a0e
MK
1422 SheepdogInode *inode;
1423 int ret = 0, fd;
1424 uint32_t vid = 0;
1425
dfb12bf8 1426 fd = connect_to_sdog(s, &local_err);
9ff53a0e 1427 if (fd < 0) {
565f65d2 1428 error_report_err(local_err);
9ff53a0e
MK
1429 return -EIO;
1430 }
1431
5d039bab 1432 inode = g_malloc(SD_INODE_HEADER_SIZE);
9ff53a0e 1433
dc83cd42 1434 ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err);
9ff53a0e 1435 if (ret) {
565f65d2 1436 error_report_err(local_err);
9ff53a0e
MK
1437 goto out;
1438 }
1439
f11672db 1440 ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid),
5d039bab
HM
1441 s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
1442 s->cache_flags);
9ff53a0e
MK
1443 if (ret < 0) {
1444 goto out;
1445 }
1446
1447 if (inode->vdi_id != s->inode.vdi_id) {
5d039bab 1448 memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE);
9ff53a0e
MK
1449 }
1450
1451out:
1452 g_free(inode);
1453 closesocket(fd);
1454
1455 return ret;
1456}
1457
a37dcdf9 1458static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
13c31de2
MK
1459{
1460 SheepdogAIOCB *acb = aio_req->aiocb;
b544c1ab
HM
1461
1462 aio_req->create = false;
13c31de2
MK
1463
1464 /* check whether this request becomes a CoW one */
2412aec7 1465 if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) {
13c31de2 1466 int idx = data_oid_to_idx(aio_req->oid);
13c31de2 1467
13c31de2
MK
1468 if (is_data_obj_writable(&s->inode, idx)) {
1469 goto out;
1470 }
1471
80308d33
MK
1472 if (s->inode.data_vdi_id[idx]) {
1473 aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
1474 aio_req->flags |= SD_FLAG_CMD_COW;
1475 }
b544c1ab 1476 aio_req->create = true;
13c31de2
MK
1477 }
1478out:
2412aec7 1479 if (is_data_obj(aio_req->oid)) {
b544c1ab 1480 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
a37dcdf9 1481 acb->aiocb_type);
2412aec7
MK
1482 } else {
1483 struct iovec iov;
1484 iov.iov_base = &s->inode;
1485 iov.iov_len = sizeof(s->inode);
b544c1ab 1486 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
2412aec7 1487 }
13c31de2
MK
1488}
1489
84390bed
SH
1490static void sd_detach_aio_context(BlockDriverState *bs)
1491{
1492 BDRVSheepdogState *s = bs->opaque;
1493
dca21ef2 1494 aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
f6a51c84 1495 NULL, NULL, NULL);
84390bed
SH
1496}
1497
1498static void sd_attach_aio_context(BlockDriverState *bs,
1499 AioContext *new_context)
1500{
1501 BDRVSheepdogState *s = bs->opaque;
1502
1503 s->aio_context = new_context;
dca21ef2 1504 aio_set_fd_handler(new_context, s->fd, false,
f6a51c84 1505 co_read_response, NULL, NULL, s);
84390bed
SH
1506}
1507
c8c96350
KW
1508static QemuOptsList runtime_opts = {
1509 .name = "sheepdog",
1510 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
1511 .desc = {
1512 {
831acdc9
MA
1513 .name = "host",
1514 .type = QEMU_OPT_STRING,
1515 },
1516 {
1517 .name = "port",
1518 .type = QEMU_OPT_STRING,
1519 },
1520 {
1521 .name = "path",
1522 .type = QEMU_OPT_STRING,
1523 },
1524 {
1525 .name = "vdi",
1526 .type = QEMU_OPT_STRING,
1527 },
1528 {
1529 .name = "snap-id",
1530 .type = QEMU_OPT_NUMBER,
1531 },
1532 {
1533 .name = "tag",
c8c96350 1534 .type = QEMU_OPT_STRING,
c8c96350
KW
1535 },
1536 { /* end of list */ }
1537 },
1538};
1539
015a1036
HR
1540static int sd_open(BlockDriverState *bs, QDict *options, int flags,
1541 Error **errp)
33b1db1c
MK
1542{
1543 int ret, fd;
1544 uint32_t vid = 0;
1545 BDRVSheepdogState *s = bs->opaque;
831acdc9
MA
1546 const char *host, *port, *path, *vdi, *snap_id_str, *tag;
1547 uint64_t snap_id;
33b1db1c 1548 char *buf = NULL;
c8c96350
KW
1549 QemuOpts *opts;
1550 Error *local_err = NULL;
c8c96350 1551
011603ca 1552 s->bs = bs;
84390bed 1553 s->aio_context = bdrv_get_aio_context(bs);
011603ca 1554
87ea75d5 1555 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
c8c96350 1556 qemu_opts_absorb_qdict(opts, options, &local_err);
84d18f06 1557 if (local_err) {
e67c3993 1558 error_propagate(errp, local_err);
c8c96350 1559 ret = -EINVAL;
cbc488ee 1560 goto err_no_fd;
c8c96350
KW
1561 }
1562
831acdc9
MA
1563 host = qemu_opt_get(opts, "host");
1564 port = qemu_opt_get(opts, "port");
1565 path = qemu_opt_get(opts, "path");
1566 vdi = qemu_opt_get(opts, "vdi");
1567 snap_id_str = qemu_opt_get(opts, "snap-id");
1568 snap_id = qemu_opt_get_number(opts, "snap-id", CURRENT_VDI_ID);
1569 tag = qemu_opt_get(opts, "tag");
33b1db1c 1570
831acdc9
MA
1571 if ((host || port) && path) {
1572 error_setg(errp, "can't use 'path' together with 'host' or 'port'");
1573 ret = -EINVAL;
1574 goto err_no_fd;
1575 }
1576
1577 if (!vdi) {
1578 error_setg(errp, "parameter 'vdi' is missing");
1579 ret = -EINVAL;
1580 goto err_no_fd;
1581 }
1582 if (strlen(vdi) >= SD_MAX_VDI_LEN) {
1583 error_setg(errp, "value of parameter 'vdi' is too long");
1584 ret = -EINVAL;
1585 goto err_no_fd;
1586 }
33b1db1c 1587
831acdc9
MA
1588 if (snap_id > UINT32_MAX) {
1589 snap_id = 0;
1590 }
1591 if (snap_id_str && !snap_id) {
1592 error_setg(errp, "'snap-id=%s' is not a valid snapshot ID",
1593 snap_id_str);
1594 ret = -EINVAL;
1595 goto err_no_fd;
1596 }
5d6768e3 1597
831acdc9
MA
1598 if (!tag) {
1599 tag = "";
5d6768e3 1600 }
831acdc9
MA
1601 if (tag && strlen(tag) >= SD_MAX_VDI_TAG_LEN) {
1602 error_setg(errp, "value of parameter 'tag' is too long");
36bcac16 1603 ret = -EINVAL;
cbc488ee 1604 goto err_no_fd;
33b1db1c 1605 }
831acdc9
MA
1606
1607 s->addr = sd_socket_address(path, host, port);
1608
1609 QLIST_INIT(&s->inflight_aio_head);
1610 QLIST_INIT(&s->failed_aio_head);
1611 QLIST_INIT(&s->inflight_aiocb_head);
1612
e67c3993 1613 s->fd = get_sheep_fd(s, errp);
33b1db1c 1614 if (s->fd < 0) {
cb595887 1615 ret = s->fd;
cbc488ee 1616 goto err_no_fd;
33b1db1c
MK
1617 }
1618
831acdc9 1619 ret = find_vdi_name(s, vdi, (uint32_t)snap_id, tag, &vid, true, errp);
33b1db1c 1620 if (ret) {
cbc488ee 1621 goto err;
33b1db1c
MK
1622 }
1623
0e7106d8
LY
1624 /*
1625 * QEMU block layer emulates writethrough cache as 'writeback + flush', so
1626 * we always set SD_FLAG_CMD_CACHE (writeback cache) as default.
1627 */
1628 s->cache_flags = SD_FLAG_CMD_CACHE;
1629 if (flags & BDRV_O_NOCACHE) {
1630 s->cache_flags = SD_FLAG_CMD_DIRECT;
1631 }
cac8f4a6 1632 s->discard_supported = true;
0e7106d8 1633
831acdc9 1634 if (snap_id || tag[0]) {
2440a2c3 1635 DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid);
2f536801 1636 s->is_snapshot = true;
33b1db1c
MK
1637 }
1638
e67c3993 1639 fd = connect_to_sdog(s, errp);
33b1db1c 1640 if (fd < 0) {
cb595887 1641 ret = fd;
cbc488ee 1642 goto err;
33b1db1c
MK
1643 }
1644
7267c094 1645 buf = g_malloc(SD_INODE_SIZE);
f11672db 1646 ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
84390bed 1647 0, SD_INODE_SIZE, 0, s->cache_flags);
33b1db1c
MK
1648
1649 closesocket(fd);
1650
1651 if (ret) {
efde4b62 1652 error_setg(errp, "Can't read snapshot inode");
cbc488ee 1653 goto err;
33b1db1c
MK
1654 }
1655
1656 memcpy(&s->inode, buf, sizeof(s->inode));
33b1db1c 1657
e8bfaa2f 1658 bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
3178e275 1659 pstrcpy(s->name, sizeof(s->name), vdi);
2df46246 1660 qemu_co_mutex_init(&s->lock);
498f2140 1661 qemu_co_queue_init(&s->overlapping_queue);
c8c96350 1662 qemu_opts_del(opts);
7267c094 1663 g_free(buf);
33b1db1c 1664 return 0;
cbc488ee
MA
1665
1666err:
dca21ef2 1667 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
f6a51c84 1668 false, NULL, NULL, NULL, NULL);
cbc488ee
MA
1669 closesocket(s->fd);
1670err_no_fd:
c8c96350 1671 qemu_opts_del(opts);
7267c094 1672 g_free(buf);
cb595887 1673 return ret;
33b1db1c
MK
1674}
1675
4da65c80
LY
1676static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue,
1677 Error **errp)
1678{
1679 BDRVSheepdogState *s = state->bs->opaque;
1680 BDRVSheepdogReopenState *re_s;
1681 int ret = 0;
1682
1683 re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1);
1684
1685 re_s->cache_flags = SD_FLAG_CMD_CACHE;
1686 if (state->flags & BDRV_O_NOCACHE) {
1687 re_s->cache_flags = SD_FLAG_CMD_DIRECT;
1688 }
1689
1690 re_s->fd = get_sheep_fd(s, errp);
1691 if (re_s->fd < 0) {
1692 ret = re_s->fd;
1693 return ret;
1694 }
1695
1696 return ret;
1697}
1698
1699static void sd_reopen_commit(BDRVReopenState *state)
1700{
1701 BDRVSheepdogReopenState *re_s = state->opaque;
1702 BDRVSheepdogState *s = state->bs->opaque;
1703
1704 if (s->fd) {
dca21ef2 1705 aio_set_fd_handler(s->aio_context, s->fd, false,
f6a51c84 1706 NULL, NULL, NULL, NULL);
4da65c80
LY
1707 closesocket(s->fd);
1708 }
1709
1710 s->fd = re_s->fd;
1711 s->cache_flags = re_s->cache_flags;
1712
1713 g_free(state->opaque);
1714 state->opaque = NULL;
1715
1716 return;
1717}
1718
1719static void sd_reopen_abort(BDRVReopenState *state)
1720{
1721 BDRVSheepdogReopenState *re_s = state->opaque;
1722 BDRVSheepdogState *s = state->bs->opaque;
1723
1724 if (re_s == NULL) {
1725 return;
1726 }
1727
1728 if (re_s->fd) {
dca21ef2 1729 aio_set_fd_handler(s->aio_context, re_s->fd, false,
f6a51c84 1730 NULL, NULL, NULL, NULL);
4da65c80
LY
1731 closesocket(re_s->fd);
1732 }
1733
1734 g_free(state->opaque);
1735 state->opaque = NULL;
1736
1737 return;
1738}
1739
7d2d3e74
MA
1740static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
1741 Error **errp)
33b1db1c
MK
1742{
1743 SheepdogVdiReq hdr;
1744 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1745 int fd, ret;
1746 unsigned int wlen, rlen = 0;
1747 char buf[SD_MAX_VDI_LEN];
1748
7d2d3e74 1749 fd = connect_to_sdog(s, errp);
33b1db1c 1750 if (fd < 0) {
cb595887 1751 return fd;
33b1db1c
MK
1752 }
1753
3178e275
JM
1754 /* FIXME: would it be better to fail (e.g., return -EIO) when filename
1755 * does not fit in buf? For now, just truncate and avoid buffer overrun.
1756 */
33b1db1c 1757 memset(buf, 0, sizeof(buf));
c31d482f 1758 pstrcpy(buf, sizeof(buf), s->name);
33b1db1c
MK
1759
1760 memset(&hdr, 0, sizeof(hdr));
1761 hdr.opcode = SD_OP_NEW_VDI;
9f23fce7 1762 hdr.base_vdi_id = s->inode.vdi_id;
33b1db1c
MK
1763
1764 wlen = SD_MAX_VDI_LEN;
1765
1766 hdr.flags = SD_FLAG_CMD_WRITE;
1767 hdr.snapid = snapshot;
1768
1769 hdr.data_length = wlen;
c31d482f
LY
1770 hdr.vdi_size = s->inode.vdi_size;
1771 hdr.copy_policy = s->inode.copy_policy;
b3af018f 1772 hdr.copies = s->inode.nr_copies;
876eb1b0 1773 hdr.block_size_shift = s->inode.block_size_shift;
33b1db1c 1774
f11672db 1775 ret = do_req(fd, NULL, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
33b1db1c
MK
1776
1777 closesocket(fd);
1778
1779 if (ret) {
7d2d3e74 1780 error_setg_errno(errp, -ret, "create failed");
cb595887 1781 return ret;
33b1db1c
MK
1782 }
1783
1784 if (rsp->result != SD_RES_SUCCESS) {
7d2d3e74 1785 error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name);
33b1db1c
MK
1786 return -EIO;
1787 }
1788
1789 if (vdi_id) {
1790 *vdi_id = rsp->vdi_id;
1791 }
1792
1793 return 0;
1794}
1795
318df29e 1796static int sd_prealloc(const char *filename, Error **errp)
a8e0fdd7 1797{
fba98d45 1798 BlockBackend *blk = NULL;
876eb1b0
TI
1799 BDRVSheepdogState *base = NULL;
1800 unsigned long buf_size;
a8e0fdd7 1801 uint32_t idx, max_idx;
876eb1b0 1802 uint32_t object_size;
a8e0fdd7 1803 int64_t vdi_size;
876eb1b0 1804 void *buf = NULL;
a8e0fdd7
MK
1805 int ret;
1806
efaa7c4e 1807 blk = blk_new_open(filename, NULL, NULL,
55880601 1808 BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
fba98d45
KW
1809 if (blk == NULL) {
1810 ret = -EIO;
318df29e 1811 goto out_with_err_set;
a8e0fdd7
MK
1812 }
1813
fba98d45
KW
1814 blk_set_allow_write_beyond_eof(blk, true);
1815
1816 vdi_size = blk_getlength(blk);
a8e0fdd7
MK
1817 if (vdi_size < 0) {
1818 ret = vdi_size;
1819 goto out;
1820 }
876eb1b0 1821
fba98d45 1822 base = blk_bs(blk)->opaque;
876eb1b0
TI
1823 object_size = (UINT32_C(1) << base->inode.block_size_shift);
1824 buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
1825 buf = g_malloc0(buf_size);
1826
1827 max_idx = DIV_ROUND_UP(vdi_size, buf_size);
a8e0fdd7
MK
1828
1829 for (idx = 0; idx < max_idx; idx++) {
1830 /*
1831 * The created image can be a cloned image, so we need to read
1832 * a data from the source image.
1833 */
fba98d45 1834 ret = blk_pread(blk, idx * buf_size, buf, buf_size);
a8e0fdd7
MK
1835 if (ret < 0) {
1836 goto out;
1837 }
8341f00d 1838 ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0);
a8e0fdd7
MK
1839 if (ret < 0) {
1840 goto out;
1841 }
1842 }
318df29e 1843
fba98d45 1844 ret = 0;
a8e0fdd7 1845out:
318df29e
MA
1846 if (ret < 0) {
1847 error_setg_errno(errp, -ret, "Can't pre-allocate");
1848 }
1849out_with_err_set:
fba98d45
KW
1850 if (blk) {
1851 blk_unref(blk);
a8e0fdd7 1852 }
7267c094 1853 g_free(buf);
a8e0fdd7
MK
1854
1855 return ret;
1856}
1857
b3af018f
LY
1858/*
1859 * Sheepdog support two kinds of redundancy, full replication and erasure
1860 * coding.
1861 *
1862 * # create a fully replicated vdi with x copies
1863 * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
1864 *
1865 * # create a erasure coded vdi with x data strips and y parity strips
1866 * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
1867 */
1868static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
1869{
1870 struct SheepdogInode *inode = &s->inode;
1871 const char *n1, *n2;
1872 long copy, parity;
1873 char p[10];
1874
1875 pstrcpy(p, sizeof(p), opt);
1876 n1 = strtok(p, ":");
1877 n2 = strtok(NULL, ":");
1878
1879 if (!n1) {
1880 return -EINVAL;
1881 }
1882
1883 copy = strtol(n1, NULL, 10);
89e2a31d 1884 /* FIXME fix error checking by switching to qemu_strtol() */
b3af018f
LY
1885 if (copy > SD_MAX_COPIES || copy < 1) {
1886 return -EINVAL;
1887 }
1888 if (!n2) {
1889 inode->copy_policy = 0;
1890 inode->nr_copies = copy;
1891 return 0;
1892 }
1893
1894 if (copy != 2 && copy != 4 && copy != 8 && copy != 16) {
1895 return -EINVAL;
1896 }
1897
1898 parity = strtol(n2, NULL, 10);
89e2a31d 1899 /* FIXME fix error checking by switching to qemu_strtol() */
b3af018f
LY
1900 if (parity >= SD_EC_MAX_STRIP || parity < 1) {
1901 return -EINVAL;
1902 }
1903
1904 /*
1905 * 4 bits for parity and 4 bits for data.
1906 * We have to compress upper data bits because it can't represent 16
1907 */
1908 inode->copy_policy = ((copy / 2) << 4) + parity;
1909 inode->nr_copies = copy + parity;
1910
1911 return 0;
1912}
1913
876eb1b0
TI
1914static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
1915{
1916 struct SheepdogInode *inode = &s->inode;
1917 uint64_t object_size;
1918 int obj_order;
1919
1920 object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0);
1921 if (object_size) {
1922 if ((object_size - 1) & object_size) { /* not a power of 2? */
1923 return -EINVAL;
1924 }
786a4ea8 1925 obj_order = ctz32(object_size);
876eb1b0
TI
1926 if (obj_order < 20 || obj_order > 31) {
1927 return -EINVAL;
1928 }
1929 inode->block_size_shift = (uint8_t)obj_order;
1930 }
1931
1932 return 0;
1933}
1934
b222237b 1935static int sd_create(const char *filename, QemuOpts *opts,
d5124c00 1936 Error **errp)
33b1db1c 1937{
36bcac16 1938 Error *err = NULL;
b6fc8245 1939 int ret = 0;
c31d482f 1940 uint32_t vid = 0;
33b1db1c 1941 char *backing_file = NULL;
b222237b 1942 char *buf = NULL;
b6fc8245 1943 BDRVSheepdogState *s;
831acdc9 1944 SheepdogConfig cfg;
876eb1b0 1945 uint64_t max_vdi_size;
2f536801 1946 bool prealloc = false;
33b1db1c 1947
5839e53b 1948 s = g_new0(BDRVSheepdogState, 1);
b6fc8245 1949
5d6768e3 1950 if (strstr(filename, "://")) {
831acdc9 1951 sd_parse_uri(&cfg, filename, &err);
5d6768e3 1952 } else {
831acdc9 1953 parse_vdiname(&cfg, filename, &err);
5d6768e3 1954 }
36bcac16
MA
1955 if (err) {
1956 error_propagate(errp, err);
b6fc8245 1957 goto out;
b4447363
MK
1958 }
1959
831acdc9
MA
1960 buf = cfg.port ? g_strdup_printf("%d", cfg.port) : NULL;
1961 s->addr = sd_socket_address(cfg.path, cfg.host, buf);
1962 g_free(buf);
1963 strcpy(s->name, cfg.vdi);
1964 sd_config_done(&cfg);
1965
c2eb918e
HT
1966 s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
1967 BDRV_SECTOR_SIZE);
b222237b
CL
1968 backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
1969 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
1970 if (!buf || !strcmp(buf, "off")) {
1971 prealloc = false;
1972 } else if (!strcmp(buf, "full")) {
1973 prealloc = true;
1974 } else {
1975 error_setg(errp, "Invalid preallocation mode: '%s'", buf);
1976 ret = -EINVAL;
1977 goto out;
1978 }
1979
1980 g_free(buf);
1981 buf = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY);
1982 if (buf) {
1983 ret = parse_redundancy(s, buf);
1984 if (ret < 0) {
1985 error_setg(errp, "Invalid redundancy mode: '%s'", buf);
1986 goto out;
33b1db1c 1987 }
33b1db1c 1988 }
876eb1b0
TI
1989 ret = parse_block_size_shift(s, opts);
1990 if (ret < 0) {
1991 error_setg(errp, "Invalid object_size."
1992 " obect_size needs to be power of 2"
1993 " and be limited from 2^20 to 2^31");
b6fc8245 1994 goto out;
33b1db1c
MK
1995 }
1996
1997 if (backing_file) {
fba98d45 1998 BlockBackend *blk;
9f23fce7 1999 BDRVSheepdogState *base;
33b1db1c
MK
2000 BlockDriver *drv;
2001
2002 /* Currently, only Sheepdog backing image is supported. */
b65a5e12 2003 drv = bdrv_find_protocol(backing_file, true, NULL);
33b1db1c 2004 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
e67c3993 2005 error_setg(errp, "backing_file must be a sheepdog image");
b6fc8245
MK
2006 ret = -EINVAL;
2007 goto out;
33b1db1c
MK
2008 }
2009
efaa7c4e 2010 blk = blk_new_open(backing_file, NULL, NULL,
72e775c7 2011 BDRV_O_PROTOCOL, errp);
fba98d45
KW
2012 if (blk == NULL) {
2013 ret = -EIO;
b6fc8245 2014 goto out;
cb595887 2015 }
33b1db1c 2016
fba98d45 2017 base = blk_bs(blk)->opaque;
33b1db1c 2018
9f23fce7 2019 if (!is_snapshot(&base->inode)) {
e67c3993 2020 error_setg(errp, "cannot clone from a non snapshot vdi");
fba98d45 2021 blk_unref(blk);
b6fc8245
MK
2022 ret = -EINVAL;
2023 goto out;
33b1db1c 2024 }
9f23fce7 2025 s->inode.vdi_id = base->inode.vdi_id;
fba98d45 2026 blk_unref(blk);
33b1db1c
MK
2027 }
2028
5d5da114 2029 s->aio_context = qemu_get_aio_context();
876eb1b0
TI
2030
2031 /* if block_size_shift is not specified, get cluster default value */
2032 if (s->inode.block_size_shift == 0) {
2033 SheepdogVdiReq hdr;
2034 SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
876eb1b0
TI
2035 int fd;
2036 unsigned int wlen = 0, rlen = 0;
2037
48d7c4af 2038 fd = connect_to_sdog(s, errp);
876eb1b0 2039 if (fd < 0) {
48d7c4af 2040 ret = fd;
876eb1b0
TI
2041 goto out;
2042 }
2043
2044 memset(&hdr, 0, sizeof(hdr));
2045 hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
2046 hdr.proto_ver = SD_PROTO_VER;
2047
f11672db 2048 ret = do_req(fd, NULL, (SheepdogReq *)&hdr,
876eb1b0
TI
2049 NULL, &wlen, &rlen);
2050 closesocket(fd);
2051 if (ret) {
2052 error_setg_errno(errp, -ret, "failed to get cluster default");
2053 goto out;
2054 }
2055 if (rsp->result == SD_RES_SUCCESS) {
2056 s->inode.block_size_shift = rsp->block_size_shift;
2057 } else {
2058 s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT;
2059 }
2060 }
2061
2062 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
2063
2064 if (s->inode.vdi_size > max_vdi_size) {
2065 error_setg(errp, "An image is too large."
2066 " The maximum image size is %"PRIu64 "GB",
2067 max_vdi_size / 1024 / 1024 / 1024);
2068 ret = -EINVAL;
2069 goto out;
2070 }
2071
e67c3993 2072 ret = do_sd_create(s, &vid, 0, errp);
7d2d3e74 2073 if (ret) {
b6fc8245 2074 goto out;
a8e0fdd7
MK
2075 }
2076
7d2d3e74 2077 if (prealloc) {
e67c3993 2078 ret = sd_prealloc(filename, errp);
318df29e 2079 }
b6fc8245 2080out:
b222237b
CL
2081 g_free(backing_file);
2082 g_free(buf);
b6fc8245
MK
2083 g_free(s);
2084 return ret;
33b1db1c
MK
2085}
2086
2087static void sd_close(BlockDriverState *bs)
2088{
dfb12bf8 2089 Error *local_err = NULL;
33b1db1c
MK
2090 BDRVSheepdogState *s = bs->opaque;
2091 SheepdogVdiReq hdr;
2092 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2093 unsigned int wlen, rlen = 0;
2094 int fd, ret;
2095
2440a2c3 2096 DPRINTF("%s\n", s->name);
33b1db1c 2097
dfb12bf8 2098 fd = connect_to_sdog(s, &local_err);
33b1db1c 2099 if (fd < 0) {
565f65d2 2100 error_report_err(local_err);
33b1db1c
MK
2101 return;
2102 }
2103
2104 memset(&hdr, 0, sizeof(hdr));
2105
2106 hdr.opcode = SD_OP_RELEASE_VDI;
1dbfafed 2107 hdr.type = LOCK_TYPE_NORMAL;
9f23fce7 2108 hdr.base_vdi_id = s->inode.vdi_id;
33b1db1c
MK
2109 wlen = strlen(s->name) + 1;
2110 hdr.data_length = wlen;
2111 hdr.flags = SD_FLAG_CMD_WRITE;
2112
f11672db 2113 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
84390bed 2114 s->name, &wlen, &rlen);
33b1db1c
MK
2115
2116 closesocket(fd);
2117
2118 if (!ret && rsp->result != SD_RES_SUCCESS &&
2119 rsp->result != SD_RES_VDI_NOT_LOCKED) {
6daf194d 2120 error_report("%s, %s", sd_strerror(rsp->result), s->name);
33b1db1c
MK
2121 }
2122
dca21ef2 2123 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
f6a51c84 2124 false, NULL, NULL, NULL, NULL);
33b1db1c 2125 closesocket(s->fd);
8ecc2f9e 2126 qapi_free_SocketAddress(s->addr);
33b1db1c
MK
2127}
2128
2129static int64_t sd_getlength(BlockDriverState *bs)
2130{
2131 BDRVSheepdogState *s = bs->opaque;
2132
2133 return s->inode.vdi_size;
2134}
2135
2136static int sd_truncate(BlockDriverState *bs, int64_t offset)
2137{
dfb12bf8 2138 Error *local_err = NULL;
33b1db1c
MK
2139 BDRVSheepdogState *s = bs->opaque;
2140 int ret, fd;
2141 unsigned int datalen;
876eb1b0 2142 uint64_t max_vdi_size;
33b1db1c 2143
876eb1b0 2144 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
33b1db1c 2145 if (offset < s->inode.vdi_size) {
6daf194d 2146 error_report("shrinking is not supported");
33b1db1c 2147 return -EINVAL;
876eb1b0 2148 } else if (offset > max_vdi_size) {
6daf194d 2149 error_report("too big image size");
33b1db1c
MK
2150 return -EINVAL;
2151 }
2152
dfb12bf8 2153 fd = connect_to_sdog(s, &local_err);
33b1db1c 2154 if (fd < 0) {
565f65d2 2155 error_report_err(local_err);
cb595887 2156 return fd;
33b1db1c
MK
2157 }
2158
2159 /* we don't need to update entire object */
2160 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
2161 s->inode.vdi_size = offset;
f11672db 2162 ret = write_object(fd, s->bs, (char *)&s->inode,
84390bed
SH
2163 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
2164 datalen, 0, false, s->cache_flags);
33b1db1c
MK
2165 close(fd);
2166
2167 if (ret < 0) {
6daf194d 2168 error_report("failed to update an inode.");
33b1db1c
MK
2169 }
2170
cb595887 2171 return ret;
33b1db1c
MK
2172}
2173
2174/*
2175 * This function is called after writing data objects. If we need to
2176 * update metadata, this sends a write request to the vdi object.
33b1db1c 2177 */
d8716b41 2178static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
33b1db1c 2179{
28ddd08c 2180 BDRVSheepdogState *s = acb->s;
33b1db1c
MK
2181 struct iovec iov;
2182 AIOReq *aio_req;
2183 uint32_t offset, data_len, mn, mx;
2184
498f2140
HM
2185 mn = acb->min_dirty_data_idx;
2186 mx = acb->max_dirty_data_idx;
33b1db1c
MK
2187 if (mn <= mx) {
2188 /* we need to update the vdi object. */
e80ab33d 2189 ++acb->nr_pending;
33b1db1c
MK
2190 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
2191 mn * sizeof(s->inode.data_vdi_id[0]);
2192 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
2193
498f2140
HM
2194 acb->min_dirty_data_idx = UINT32_MAX;
2195 acb->max_dirty_data_idx = 0;
33b1db1c
MK
2196
2197 iov.iov_base = &s->inode;
2198 iov.iov_len = sizeof(s->inode);
2199 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
b544c1ab 2200 data_len, offset, 0, false, 0, offset);
b544c1ab 2201 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
e80ab33d
PB
2202 if (--acb->nr_pending) {
2203 qemu_coroutine_yield();
2204 }
33b1db1c 2205 }
33b1db1c
MK
2206}
2207
859e5553
LY
2208/* Delete current working VDI on the snapshot chain */
2209static bool sd_delete(BDRVSheepdogState *s)
2210{
dfb12bf8 2211 Error *local_err = NULL;
859e5553
LY
2212 unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
2213 SheepdogVdiReq hdr = {
2214 .opcode = SD_OP_DEL_VDI,
9f23fce7 2215 .base_vdi_id = s->inode.vdi_id,
859e5553
LY
2216 .data_length = wlen,
2217 .flags = SD_FLAG_CMD_WRITE,
2218 };
2219 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2220 int fd, ret;
2221
dfb12bf8 2222 fd = connect_to_sdog(s, &local_err);
859e5553 2223 if (fd < 0) {
565f65d2 2224 error_report_err(local_err);
859e5553
LY
2225 return false;
2226 }
2227
f11672db 2228 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
84390bed 2229 s->name, &wlen, &rlen);
859e5553
LY
2230 closesocket(fd);
2231 if (ret) {
2232 return false;
2233 }
2234 switch (rsp->result) {
2235 case SD_RES_NO_VDI:
2236 error_report("%s was already deleted", s->name);
2237 /* fall through */
2238 case SD_RES_SUCCESS:
2239 break;
2240 default:
2241 error_report("%s, %s", sd_strerror(rsp->result), s->name);
2242 return false;
2243 }
2244
2245 return true;
2246}
2247
33b1db1c
MK
2248/*
2249 * Create a writable VDI from a snapshot
2250 */
2251static int sd_create_branch(BDRVSheepdogState *s)
2252{
dfb12bf8 2253 Error *local_err = NULL;
33b1db1c
MK
2254 int ret, fd;
2255 uint32_t vid;
2256 char *buf;
859e5553 2257 bool deleted;
33b1db1c 2258
2440a2c3 2259 DPRINTF("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
33b1db1c 2260
7267c094 2261 buf = g_malloc(SD_INODE_SIZE);
33b1db1c 2262
859e5553
LY
2263 /*
2264 * Even If deletion fails, we will just create extra snapshot based on
dc6fb73d 2265 * the working VDI which was supposed to be deleted. So no need to
859e5553
LY
2266 * false bail out.
2267 */
2268 deleted = sd_delete(s);
7d2d3e74 2269 ret = do_sd_create(s, &vid, !deleted, &local_err);
33b1db1c 2270 if (ret) {
565f65d2 2271 error_report_err(local_err);
33b1db1c
MK
2272 goto out;
2273 }
2274
2440a2c3 2275 DPRINTF("%" PRIx32 " is created.\n", vid);
33b1db1c 2276
dfb12bf8 2277 fd = connect_to_sdog(s, &local_err);
33b1db1c 2278 if (fd < 0) {
565f65d2 2279 error_report_err(local_err);
cb595887 2280 ret = fd;
33b1db1c
MK
2281 goto out;
2282 }
2283
f11672db 2284 ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
84390bed 2285 s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags);
33b1db1c
MK
2286
2287 closesocket(fd);
2288
2289 if (ret < 0) {
2290 goto out;
2291 }
2292
2293 memcpy(&s->inode, buf, sizeof(s->inode));
2294
2f536801 2295 s->is_snapshot = false;
33b1db1c 2296 ret = 0;
2440a2c3 2297 DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
33b1db1c
MK
2298
2299out:
7267c094 2300 g_free(buf);
33b1db1c
MK
2301
2302 return ret;
2303}
2304
2305/*
2306 * Send I/O requests to the server.
2307 *
2308 * This function sends requests to the server, links the requests to
c292ee6a 2309 * the inflight_list in BDRVSheepdogState, and exits without
33b1db1c
MK
2310 * waiting the response. The responses are received in the
2311 * `aio_read_response' function which is called from the main loop as
2312 * a fd handler.
2df46246
MK
2313 *
2314 * Returns 1 when we need to wait a response, 0 when there is no sent
2315 * request and -errno in error cases.
33b1db1c 2316 */
28ddd08c 2317static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
33b1db1c 2318{
33b1db1c 2319 int ret = 0;
e8bfaa2f 2320 unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
876eb1b0
TI
2321 unsigned long idx;
2322 uint32_t object_size;
33b1db1c 2323 uint64_t oid;
876eb1b0 2324 uint64_t offset;
28ddd08c 2325 BDRVSheepdogState *s = acb->s;
33b1db1c
MK
2326 SheepdogInode *inode = &s->inode;
2327 AIOReq *aio_req;
2328
33b1db1c
MK
2329 if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
2330 /*
2331 * In the case we open the snapshot VDI, Sheepdog creates the
2332 * writable VDI when we do a write operation first.
2333 */
2334 ret = sd_create_branch(s);
2335 if (ret) {
2336 acb->ret = -EIO;
e80ab33d 2337 return;
33b1db1c
MK
2338 }
2339 }
2340
876eb1b0
TI
2341 object_size = (UINT32_C(1) << inode->block_size_shift);
2342 idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
2343 offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;
2344
1d732d7d
MK
2345 /*
2346 * Make sure we don't free the aiocb before we are done with all requests.
2347 * This additional reference is dropped at the end of this function.
2348 */
2349 acb->nr_pending++;
2350
33b1db1c
MK
2351 while (done != total) {
2352 uint8_t flags = 0;
2353 uint64_t old_oid = 0;
2f536801 2354 bool create = false;
33b1db1c
MK
2355
2356 oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
2357
876eb1b0 2358 len = MIN(total - done, object_size - offset);
33b1db1c 2359
19db9b90
CH
2360 switch (acb->aiocb_type) {
2361 case AIOCB_READ_UDATA:
2362 if (!inode->data_vdi_id[idx]) {
2363 qemu_iovec_memset(acb->qiov, done, 0, len);
33b1db1c
MK
2364 goto done;
2365 }
19db9b90
CH
2366 break;
2367 case AIOCB_WRITE_UDATA:
2368 if (!inode->data_vdi_id[idx]) {
2f536801 2369 create = true;
19db9b90
CH
2370 } else if (!is_data_obj_writable(inode, idx)) {
2371 /* Copy-On-Write */
2f536801 2372 create = true;
19db9b90
CH
2373 old_oid = oid;
2374 flags = SD_FLAG_CMD_COW;
2375 }
2376 break;
cac8f4a6
LY
2377 case AIOCB_DISCARD_OBJ:
2378 /*
2379 * We discard the object only when the whole object is
2380 * 1) allocated 2) trimmed. Otherwise, simply skip it.
2381 */
876eb1b0 2382 if (len != object_size || inode->data_vdi_id[idx] == 0) {
cac8f4a6
LY
2383 goto done;
2384 }
2385 break;
19db9b90
CH
2386 default:
2387 break;
33b1db1c
MK
2388 }
2389
2390 if (create) {
2440a2c3 2391 DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
1b6ac998 2392 inode->vdi_id, oid,
33b1db1c
MK
2393 vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
2394 oid = vid_to_data_oid(inode->vdi_id, idx);
2440a2c3 2395 DPRINTF("new oid %" PRIx64 "\n", oid);
33b1db1c
MK
2396 }
2397
b544c1ab 2398 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
e6fd57ea
HM
2399 old_oid,
2400 acb->aiocb_type == AIOCB_DISCARD_OBJ ?
2401 0 : done);
b544c1ab 2402 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
a37dcdf9 2403 acb->aiocb_type);
33b1db1c
MK
2404 done:
2405 offset = 0;
2406 idx++;
2407 done += len;
2408 }
e80ab33d
PB
2409 if (--acb->nr_pending) {
2410 qemu_coroutine_yield();
33b1db1c
MK
2411 }
2412}
2413
acf6e5f0 2414static void sd_aio_complete(SheepdogAIOCB *acb)
6a55c82c 2415{
acf6e5f0
PB
2416 if (acb->aiocb_type == AIOCB_FLUSH_CACHE) {
2417 return;
6a55c82c
HM
2418 }
2419
acf6e5f0
PB
2420 QLIST_REMOVE(acb, aiocb_siblings);
2421 qemu_co_queue_restart_all(&acb->s->overlapping_queue);
6a55c82c
HM
2422}
2423
a968168c 2424static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
2df46246 2425 int nb_sectors, QEMUIOVector *qiov)
33b1db1c 2426{
28ddd08c 2427 SheepdogAIOCB acb;
2df46246 2428 int ret;
e50d7607
LY
2429 int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
2430 BDRVSheepdogState *s = bs->opaque;
33b1db1c 2431
c0191e76 2432 if (offset > s->inode.vdi_size) {
e50d7607 2433 ret = sd_truncate(bs, offset);
cb595887
MK
2434 if (ret < 0) {
2435 return ret;
33b1db1c 2436 }
33b1db1c
MK
2437 }
2438
28ddd08c 2439 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
28ddd08c
PB
2440 sd_co_rw_vector(&acb);
2441 sd_write_done(&acb);
acf6e5f0 2442 sd_aio_complete(&acb);
2df46246 2443
28ddd08c 2444 return acb.ret;
33b1db1c
MK
2445}
2446
a968168c 2447static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
2df46246 2448 int nb_sectors, QEMUIOVector *qiov)
33b1db1c 2449{
28ddd08c 2450 SheepdogAIOCB acb;
6a55c82c 2451 BDRVSheepdogState *s = bs->opaque;
33b1db1c 2452
28ddd08c 2453 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
28ddd08c 2454 sd_co_rw_vector(&acb);
acf6e5f0 2455 sd_aio_complete(&acb);
2df46246 2456
28ddd08c 2457 return acb.ret;
33b1db1c
MK
2458}
2459
47622c44
LY
2460static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
2461{
2462 BDRVSheepdogState *s = bs->opaque;
28ddd08c 2463 SheepdogAIOCB acb;
47783072 2464 AIOReq *aio_req;
47622c44 2465
0e7106d8 2466 if (s->cache_flags != SD_FLAG_CMD_CACHE) {
47622c44
LY
2467 return 0;
2468 }
2469
28ddd08c 2470 sd_aio_setup(&acb, s, NULL, 0, 0, AIOCB_FLUSH_CACHE);
47622c44 2471
28ddd08c
PB
2472 acb.nr_pending++;
2473 aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
b544c1ab 2474 0, 0, 0, false, 0, 0);
28ddd08c 2475 add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
47622c44 2476
28ddd08c 2477 if (--acb.nr_pending) {
e80ab33d
PB
2478 qemu_coroutine_yield();
2479 }
acf6e5f0
PB
2480
2481 sd_aio_complete(&acb);
28ddd08c 2482 return acb.ret;
47622c44
LY
2483}
2484
33b1db1c
MK
2485static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
2486{
dfb12bf8 2487 Error *local_err = NULL;
33b1db1c
MK
2488 BDRVSheepdogState *s = bs->opaque;
2489 int ret, fd;
2490 uint32_t new_vid;
2491 SheepdogInode *inode;
2492 unsigned int datalen;
2493
2440a2c3 2494 DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
33b1db1c
MK
2495 "is_snapshot %d\n", sn_info->name, sn_info->id_str,
2496 s->name, sn_info->vm_state_size, s->is_snapshot);
2497
2498 if (s->is_snapshot) {
2499 error_report("You can't create a snapshot of a snapshot VDI, "
6daf194d 2500 "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);
33b1db1c
MK
2501
2502 return -EINVAL;
2503 }
2504
2440a2c3 2505 DPRINTF("%s %s\n", sn_info->name, sn_info->id_str);
33b1db1c
MK
2506
2507 s->inode.vm_state_size = sn_info->vm_state_size;
2508 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
3178e275
JM
2509 /* It appears that inode.tag does not require a NUL terminator,
2510 * which means this use of strncpy is ok.
2511 */
33b1db1c
MK
2512 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
2513 /* we don't need to update entire object */
2514 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
2df5fee2 2515 inode = g_malloc(datalen);
33b1db1c
MK
2516
2517 /* refresh inode. */
dfb12bf8 2518 fd = connect_to_sdog(s, &local_err);
33b1db1c 2519 if (fd < 0) {
565f65d2 2520 error_report_err(local_err);
cb595887 2521 ret = fd;
33b1db1c
MK
2522 goto cleanup;
2523 }
2524
f11672db 2525 ret = write_object(fd, s->bs, (char *)&s->inode,
84390bed
SH
2526 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
2527 datalen, 0, false, s->cache_flags);
33b1db1c 2528 if (ret < 0) {
6daf194d 2529 error_report("failed to write snapshot's inode.");
33b1db1c
MK
2530 goto cleanup;
2531 }
2532
7d2d3e74 2533 ret = do_sd_create(s, &new_vid, 1, &local_err);
33b1db1c 2534 if (ret < 0) {
c29b77f9
MA
2535 error_reportf_err(local_err,
2536 "failed to create inode for snapshot: ");
33b1db1c
MK
2537 goto cleanup;
2538 }
2539
f11672db 2540 ret = read_object(fd, s->bs, (char *)inode,
84390bed
SH
2541 vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0,
2542 s->cache_flags);
33b1db1c
MK
2543
2544 if (ret < 0) {
6daf194d 2545 error_report("failed to read new inode info. %s", strerror(errno));
33b1db1c
MK
2546 goto cleanup;
2547 }
2548
2549 memcpy(&s->inode, inode, datalen);
2440a2c3 2550 DPRINTF("s->inode: name %s snap_id %x oid %x\n",
33b1db1c
MK
2551 s->inode.name, s->inode.snap_id, s->inode.vdi_id);
2552
2553cleanup:
2df5fee2 2554 g_free(inode);
33b1db1c
MK
2555 closesocket(fd);
2556 return ret;
2557}
2558
859e5553
LY
2559/*
2560 * We implement rollback(loadvm) operation to the specified snapshot by
2561 * 1) switch to the snapshot
2562 * 2) rely on sd_create_branch to delete working VDI and
dc6fb73d 2563 * 3) create a new working VDI based on the specified snapshot
859e5553 2564 */
33b1db1c
MK
2565static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
2566{
2567 BDRVSheepdogState *s = bs->opaque;
2568 BDRVSheepdogState *old_s;
9ff53a0e 2569 char tag[SD_MAX_VDI_TAG_LEN];
33b1db1c 2570 uint32_t snapid = 0;
89e2a31d
MA
2571 int ret;
2572
2573 if (!sd_parse_snapid_or_tag(snapshot_id, &snapid, tag)) {
2574 return -EINVAL;
2575 }
33b1db1c 2576
5839e53b 2577 old_s = g_new(BDRVSheepdogState, 1);
33b1db1c
MK
2578
2579 memcpy(old_s, s, sizeof(BDRVSheepdogState));
2580
9ff53a0e 2581 ret = reload_inode(s, snapid, tag);
33b1db1c 2582 if (ret) {
33b1db1c
MK
2583 goto out;
2584 }
2585
cede621f
LY
2586 ret = sd_create_branch(s);
2587 if (ret) {
33b1db1c
MK
2588 goto out;
2589 }
2590
7267c094 2591 g_free(old_s);
33b1db1c
MK
2592
2593 return 0;
2594out:
2595 /* recover bdrv_sd_state */
2596 memcpy(s, old_s, sizeof(BDRVSheepdogState));
7267c094 2597 g_free(old_s);
33b1db1c 2598
6daf194d 2599 error_report("failed to open. recover old bdrv_sd_state.");
33b1db1c
MK
2600
2601 return ret;
2602}
2603
eab8eb8d
VT
2604#define NR_BATCHED_DISCARD 128
2605
e25cad69 2606static int remove_objects(BDRVSheepdogState *s, Error **errp)
eab8eb8d
VT
2607{
2608 int fd, i = 0, nr_objs = 0;
e25cad69 2609 int ret;
eab8eb8d
VT
2610 SheepdogInode *inode = &s->inode;
2611
e25cad69 2612 fd = connect_to_sdog(s, errp);
eab8eb8d 2613 if (fd < 0) {
e25cad69 2614 return fd;
eab8eb8d
VT
2615 }
2616
2617 nr_objs = count_data_objs(inode);
2618 while (i < nr_objs) {
2619 int start_idx, nr_filled_idx;
2620
2621 while (i < nr_objs && !inode->data_vdi_id[i]) {
2622 i++;
2623 }
2624 start_idx = i;
2625
2626 nr_filled_idx = 0;
2627 while (i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD) {
2628 if (inode->data_vdi_id[i]) {
2629 inode->data_vdi_id[i] = 0;
2630 nr_filled_idx++;
2631 }
2632
2633 i++;
2634 }
2635
f11672db 2636 ret = write_object(fd, s->bs,
eab8eb8d
VT
2637 (char *)&inode->data_vdi_id[start_idx],
2638 vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
2639 (i - start_idx) * sizeof(uint32_t),
2640 offsetof(struct SheepdogInode,
2641 data_vdi_id[start_idx]),
2642 false, s->cache_flags);
2643 if (ret < 0) {
e25cad69 2644 error_setg(errp, "Failed to discard snapshot inode");
eab8eb8d
VT
2645 goto out;
2646 }
2647 }
2648
e25cad69 2649 ret = 0;
eab8eb8d
VT
2650out:
2651 closesocket(fd);
e25cad69 2652 return ret;
eab8eb8d
VT
2653}
2654
a89d89d3
WX
2655static int sd_snapshot_delete(BlockDriverState *bs,
2656 const char *snapshot_id,
2657 const char *name,
2658 Error **errp)
33b1db1c 2659{
a0dc0e2b
MA
2660 /*
2661 * FIXME should delete the snapshot matching both @snapshot_id and
2662 * @name, but @name not used here
2663 */
03c698f0 2664 unsigned long snap_id = 0;
eab8eb8d 2665 char snap_tag[SD_MAX_VDI_TAG_LEN];
eab8eb8d
VT
2666 int fd, ret;
2667 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
2668 BDRVSheepdogState *s = bs->opaque;
2669 unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0;
2670 uint32_t vid;
2671 SheepdogVdiReq hdr = {
2672 .opcode = SD_OP_DEL_VDI,
2673 .data_length = wlen,
2674 .flags = SD_FLAG_CMD_WRITE,
2675 };
2676 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2677
e25cad69
MA
2678 ret = remove_objects(s, errp);
2679 if (ret) {
2680 return ret;
eab8eb8d
VT
2681 }
2682
2683 memset(buf, 0, sizeof(buf));
2684 memset(snap_tag, 0, sizeof(snap_tag));
2685 pstrcpy(buf, SD_MAX_VDI_LEN, s->name);
89e2a31d 2686 /* TODO Use sd_parse_snapid() once this mess is cleaned up */
03c698f0
JC
2687 ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id);
2688 if (ret || snap_id > UINT32_MAX) {
a0dc0e2b
MA
2689 /*
2690 * FIXME Since qemu_strtoul() returns -EINVAL when
2691 * @snapshot_id is null, @snapshot_id is mandatory. Correct
2692 * would be to require at least one of @snapshot_id and @name.
2693 */
03c698f0
JC
2694 error_setg(errp, "Invalid snapshot ID: %s",
2695 snapshot_id ? snapshot_id : "<null>");
2696 return -EINVAL;
eab8eb8d
VT
2697 }
2698
2699 if (snap_id) {
03c698f0 2700 hdr.snapid = (uint32_t) snap_id;
eab8eb8d 2701 } else {
a0dc0e2b 2702 /* FIXME I suspect we should use @name here */
89e2a31d 2703 /* FIXME don't truncate silently */
eab8eb8d
VT
2704 pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id);
2705 pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag);
2706 }
2707
e25cad69 2708 ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, errp);
eab8eb8d
VT
2709 if (ret) {
2710 return ret;
2711 }
2712
e25cad69 2713 fd = connect_to_sdog(s, errp);
eab8eb8d 2714 if (fd < 0) {
e25cad69 2715 return fd;
eab8eb8d
VT
2716 }
2717
f11672db 2718 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
eab8eb8d
VT
2719 buf, &wlen, &rlen);
2720 closesocket(fd);
2721 if (ret) {
e25cad69 2722 error_setg_errno(errp, -ret, "Couldn't send request to server");
eab8eb8d
VT
2723 return ret;
2724 }
2725
2726 switch (rsp->result) {
2727 case SD_RES_NO_VDI:
e25cad69
MA
2728 error_setg(errp, "Can't find the snapshot");
2729 return -ENOENT;
eab8eb8d
VT
2730 case SD_RES_SUCCESS:
2731 break;
2732 default:
e25cad69
MA
2733 error_setg(errp, "%s", sd_strerror(rsp->result));
2734 return -EIO;
eab8eb8d
VT
2735 }
2736
e25cad69 2737 return 0;
33b1db1c
MK
2738}
2739
33b1db1c
MK
2740static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
2741{
dfb12bf8 2742 Error *local_err = NULL;
33b1db1c
MK
2743 BDRVSheepdogState *s = bs->opaque;
2744 SheepdogReq req;
2745 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
2746 QEMUSnapshotInfo *sn_tab = NULL;
2747 unsigned wlen, rlen;
2748 int found = 0;
2749 static SheepdogInode inode;
2750 unsigned long *vdi_inuse;
2751 unsigned int start_nr;
2752 uint64_t hval;
2753 uint32_t vid;
2754
7267c094 2755 vdi_inuse = g_malloc(max);
33b1db1c 2756
dfb12bf8 2757 fd = connect_to_sdog(s, &local_err);
33b1db1c 2758 if (fd < 0) {
565f65d2 2759 error_report_err(local_err);
cb595887 2760 ret = fd;
33b1db1c
MK
2761 goto out;
2762 }
2763
2764 rlen = max;
2765 wlen = 0;
2766
2767 memset(&req, 0, sizeof(req));
2768
2769 req.opcode = SD_OP_READ_VDIS;
2770 req.data_length = max;
2771
f11672db 2772 ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen);
33b1db1c
MK
2773
2774 closesocket(fd);
2775 if (ret) {
2776 goto out;
2777 }
2778
02c4f26b 2779 sn_tab = g_new0(QEMUSnapshotInfo, nr);
33b1db1c
MK
2780
2781 /* calculate a vdi id with hash function */
2782 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
2783 start_nr = hval & (SD_NR_VDIS - 1);
2784
dfb12bf8 2785 fd = connect_to_sdog(s, &local_err);
33b1db1c 2786 if (fd < 0) {
565f65d2 2787 error_report_err(local_err);
cb595887 2788 ret = fd;
33b1db1c
MK
2789 goto out;
2790 }
2791
2792 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
2793 if (!test_bit(vid, vdi_inuse)) {
2794 break;
2795 }
2796
2797 /* we don't need to read entire object */
f11672db 2798 ret = read_object(fd, s->bs, (char *)&inode,
84390bed 2799 vid_to_vdi_oid(vid),
47622c44 2800 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
0e7106d8 2801 s->cache_flags);
33b1db1c
MK
2802
2803 if (ret) {
2804 continue;
2805 }
2806
2807 if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
2808 sn_tab[found].date_sec = inode.snap_ctime >> 32;
2809 sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
2810 sn_tab[found].vm_state_size = inode.vm_state_size;
2811 sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
2812
521b2b5d
HR
2813 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
2814 "%" PRIu32, inode.snap_id);
3178e275
JM
2815 pstrcpy(sn_tab[found].name,
2816 MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
2817 inode.tag);
33b1db1c
MK
2818 found++;
2819 }
2820 }
2821
2822 closesocket(fd);
2823out:
2824 *psn_tab = sn_tab;
2825
7267c094 2826 g_free(vdi_inuse);
33b1db1c 2827
cb595887
MK
2828 if (ret < 0) {
2829 return ret;
2830 }
2831
33b1db1c
MK
2832 return found;
2833}
2834
2835static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
2836 int64_t pos, int size, int load)
2837{
dfb12bf8 2838 Error *local_err = NULL;
2f536801
MK
2839 bool create;
2840 int fd, ret = 0, remaining = size;
33b1db1c
MK
2841 unsigned int data_len;
2842 uint64_t vmstate_oid;
33b1db1c 2843 uint64_t offset;
cede621f
LY
2844 uint32_t vdi_index;
2845 uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
876eb1b0 2846 uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);
33b1db1c 2847
dfb12bf8 2848 fd = connect_to_sdog(s, &local_err);
33b1db1c 2849 if (fd < 0) {
565f65d2 2850 error_report_err(local_err);
cb595887 2851 return fd;
33b1db1c
MK
2852 }
2853
6f3c714e 2854 while (remaining) {
876eb1b0
TI
2855 vdi_index = pos / object_size;
2856 offset = pos % object_size;
33b1db1c 2857
876eb1b0 2858 data_len = MIN(remaining, object_size - offset);
33b1db1c 2859
cede621f 2860 vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
33b1db1c
MK
2861
2862 create = (offset == 0);
2863 if (load) {
f11672db 2864 ret = read_object(fd, s->bs, (char *)data, vmstate_oid,
47622c44 2865 s->inode.nr_copies, data_len, offset,
0e7106d8 2866 s->cache_flags);
33b1db1c 2867 } else {
f11672db 2868 ret = write_object(fd, s->bs, (char *)data, vmstate_oid,
47622c44 2869 s->inode.nr_copies, data_len, offset, create,
0e7106d8 2870 s->cache_flags);
33b1db1c
MK
2871 }
2872
2873 if (ret < 0) {
6daf194d 2874 error_report("failed to save vmstate %s", strerror(errno));
33b1db1c
MK
2875 goto cleanup;
2876 }
2877
2878 pos += data_len;
1f7a48de 2879 data += data_len;
6f3c714e 2880 remaining -= data_len;
33b1db1c 2881 }
6f3c714e 2882 ret = size;
33b1db1c
MK
2883cleanup:
2884 closesocket(fd);
2885 return ret;
2886}
2887
cf8074b3
KW
2888static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
2889 int64_t pos)
33b1db1c
MK
2890{
2891 BDRVSheepdogState *s = bs->opaque;
cf8074b3
KW
2892 void *buf;
2893 int ret;
33b1db1c 2894
cf8074b3
KW
2895 buf = qemu_blockalign(bs, qiov->size);
2896 qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
2897 ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0);
2898 qemu_vfree(buf);
2899
2900 return ret;
33b1db1c
MK
2901}
2902
5ddda0b8
KW
2903static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
2904 int64_t pos)
33b1db1c
MK
2905{
2906 BDRVSheepdogState *s = bs->opaque;
5ddda0b8
KW
2907 void *buf;
2908 int ret;
33b1db1c 2909
5ddda0b8
KW
2910 buf = qemu_blockalign(bs, qiov->size);
2911 ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1);
2912 qemu_iovec_from_buf(qiov, 0, buf, qiov->size);
2913 qemu_vfree(buf);
2914
2915 return ret;
33b1db1c
MK
2916}
2917
2918
dde47537
EB
2919static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
2920 int count)
cac8f4a6 2921{
28ddd08c 2922 SheepdogAIOCB acb;
cac8f4a6 2923 BDRVSheepdogState *s = bs->opaque;
e6fd57ea
HM
2924 QEMUIOVector discard_iov;
2925 struct iovec iov;
2926 uint32_t zero = 0;
cac8f4a6
LY
2927
2928 if (!s->discard_supported) {
dde47537 2929 return 0;
cac8f4a6
LY
2930 }
2931
e6fd57ea
HM
2932 memset(&discard_iov, 0, sizeof(discard_iov));
2933 memset(&iov, 0, sizeof(iov));
2934 iov.iov_base = &zero;
2935 iov.iov_len = sizeof(zero);
2936 discard_iov.iov = &iov;
2937 discard_iov.niov = 1;
49228d1e
EB
2938 if (!QEMU_IS_ALIGNED(offset | count, BDRV_SECTOR_SIZE)) {
2939 return -ENOTSUP;
2940 }
28ddd08c
PB
2941 sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
2942 count >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
28ddd08c 2943 sd_co_rw_vector(&acb);
acf6e5f0 2944 sd_aio_complete(&acb);
cac8f4a6 2945
28ddd08c 2946 return acb.ret;
cac8f4a6
LY
2947}
2948
b6b8a333
PB
2949static coroutine_fn int64_t
2950sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
67a0fd2a 2951 int *pnum, BlockDriverState **file)
8d71c631
LY
2952{
2953 BDRVSheepdogState *s = bs->opaque;
2954 SheepdogInode *inode = &s->inode;
876eb1b0 2955 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
9cd76737 2956 uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
876eb1b0 2957 unsigned long start = offset / object_size,
8d71c631 2958 end = DIV_ROUND_UP((sector_num + nb_sectors) *
876eb1b0 2959 BDRV_SECTOR_SIZE, object_size);
8d71c631 2960 unsigned long idx;
9cd76737 2961 int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
8d71c631
LY
2962
2963 for (idx = start; idx < end; idx++) {
2964 if (inode->data_vdi_id[idx] == 0) {
2965 break;
2966 }
2967 }
2968 if (idx == start) {
2969 /* Get the longest length of unallocated sectors */
2970 ret = 0;
2971 for (idx = start + 1; idx < end; idx++) {
2972 if (inode->data_vdi_id[idx] != 0) {
2973 break;
2974 }
2975 }
2976 }
2977
876eb1b0 2978 *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
8d71c631
LY
2979 if (*pnum > nb_sectors) {
2980 *pnum = nb_sectors;
2981 }
d234c929
FZ
2982 if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
2983 *file = bs;
2984 }
8d71c631
LY
2985 return ret;
2986}
2987
85829722
LY
2988static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
2989{
2990 BDRVSheepdogState *s = bs->opaque;
2991 SheepdogInode *inode = &s->inode;
876eb1b0
TI
2992 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
2993 unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size);
85829722
LY
2994 uint64_t size = 0;
2995
2996 for (i = 0; i < last; i++) {
2997 if (inode->data_vdi_id[i] == 0) {
2998 continue;
2999 }
876eb1b0 3000 size += object_size;
85829722
LY
3001 }
3002 return size;
3003}
3004
b222237b
CL
3005static QemuOptsList sd_create_opts = {
3006 .name = "sheepdog-create-opts",
3007 .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head),
3008 .desc = {
3009 {
3010 .name = BLOCK_OPT_SIZE,
3011 .type = QEMU_OPT_SIZE,
3012 .help = "Virtual disk size"
3013 },
3014 {
3015 .name = BLOCK_OPT_BACKING_FILE,
3016 .type = QEMU_OPT_STRING,
3017 .help = "File name of a base image"
3018 },
3019 {
3020 .name = BLOCK_OPT_PREALLOC,
3021 .type = QEMU_OPT_STRING,
3022 .help = "Preallocation mode (allowed values: off, full)"
3023 },
3024 {
3025 .name = BLOCK_OPT_REDUNDANCY,
3026 .type = QEMU_OPT_STRING,
3027 .help = "Redundancy of the image"
3028 },
876eb1b0
TI
3029 {
3030 .name = BLOCK_OPT_OBJECT_SIZE,
3031 .type = QEMU_OPT_SIZE,
3032 .help = "Object size of the image"
3033 },
b222237b
CL
3034 { /* end of list */ }
3035 }
33b1db1c
MK
3036};
3037
5d6768e3 3038static BlockDriver bdrv_sheepdog = {
33b1db1c
MK
3039 .format_name = "sheepdog",
3040 .protocol_name = "sheepdog",
3041 .instance_size = sizeof(BDRVSheepdogState),
831acdc9 3042 .bdrv_parse_filename = sd_parse_filename,
33b1db1c 3043 .bdrv_file_open = sd_open,
4da65c80
LY
3044 .bdrv_reopen_prepare = sd_reopen_prepare,
3045 .bdrv_reopen_commit = sd_reopen_commit,
3046 .bdrv_reopen_abort = sd_reopen_abort,
33b1db1c 3047 .bdrv_close = sd_close,
c282e1fd 3048 .bdrv_create = sd_create,
e4f5c1bf 3049 .bdrv_has_zero_init = bdrv_has_zero_init_1,
33b1db1c 3050 .bdrv_getlength = sd_getlength,
85829722 3051 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
33b1db1c
MK
3052 .bdrv_truncate = sd_truncate,
3053
2df46246
MK
3054 .bdrv_co_readv = sd_co_readv,
3055 .bdrv_co_writev = sd_co_writev,
47622c44 3056 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
dde47537 3057 .bdrv_co_pdiscard = sd_co_pdiscard,
b6b8a333 3058 .bdrv_co_get_block_status = sd_co_get_block_status,
33b1db1c
MK
3059
3060 .bdrv_snapshot_create = sd_snapshot_create,
3061 .bdrv_snapshot_goto = sd_snapshot_goto,
3062 .bdrv_snapshot_delete = sd_snapshot_delete,
3063 .bdrv_snapshot_list = sd_snapshot_list,
3064
3065 .bdrv_save_vmstate = sd_save_vmstate,
3066 .bdrv_load_vmstate = sd_load_vmstate,
3067
84390bed
SH
3068 .bdrv_detach_aio_context = sd_detach_aio_context,
3069 .bdrv_attach_aio_context = sd_attach_aio_context,
3070
b222237b 3071 .create_opts = &sd_create_opts,
33b1db1c
MK
3072};
3073
5d6768e3
MK
3074static BlockDriver bdrv_sheepdog_tcp = {
3075 .format_name = "sheepdog",
3076 .protocol_name = "sheepdog+tcp",
3077 .instance_size = sizeof(BDRVSheepdogState),
831acdc9 3078 .bdrv_parse_filename = sd_parse_filename,
5d6768e3 3079 .bdrv_file_open = sd_open,
4da65c80
LY
3080 .bdrv_reopen_prepare = sd_reopen_prepare,
3081 .bdrv_reopen_commit = sd_reopen_commit,
3082 .bdrv_reopen_abort = sd_reopen_abort,
5d6768e3 3083 .bdrv_close = sd_close,
c282e1fd 3084 .bdrv_create = sd_create,
e4f5c1bf 3085 .bdrv_has_zero_init = bdrv_has_zero_init_1,
5d6768e3 3086 .bdrv_getlength = sd_getlength,
85829722 3087 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
5d6768e3
MK
3088 .bdrv_truncate = sd_truncate,
3089
3090 .bdrv_co_readv = sd_co_readv,
3091 .bdrv_co_writev = sd_co_writev,
3092 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
dde47537 3093 .bdrv_co_pdiscard = sd_co_pdiscard,
b6b8a333 3094 .bdrv_co_get_block_status = sd_co_get_block_status,
5d6768e3
MK
3095
3096 .bdrv_snapshot_create = sd_snapshot_create,
3097 .bdrv_snapshot_goto = sd_snapshot_goto,
3098 .bdrv_snapshot_delete = sd_snapshot_delete,
3099 .bdrv_snapshot_list = sd_snapshot_list,
3100
3101 .bdrv_save_vmstate = sd_save_vmstate,
3102 .bdrv_load_vmstate = sd_load_vmstate,
3103
84390bed
SH
3104 .bdrv_detach_aio_context = sd_detach_aio_context,
3105 .bdrv_attach_aio_context = sd_attach_aio_context,
3106
b222237b 3107 .create_opts = &sd_create_opts,
5d6768e3
MK
3108};
3109
1b8bbb46
MK
3110static BlockDriver bdrv_sheepdog_unix = {
3111 .format_name = "sheepdog",
3112 .protocol_name = "sheepdog+unix",
3113 .instance_size = sizeof(BDRVSheepdogState),
831acdc9 3114 .bdrv_parse_filename = sd_parse_filename,
1b8bbb46 3115 .bdrv_file_open = sd_open,
4da65c80
LY
3116 .bdrv_reopen_prepare = sd_reopen_prepare,
3117 .bdrv_reopen_commit = sd_reopen_commit,
3118 .bdrv_reopen_abort = sd_reopen_abort,
1b8bbb46 3119 .bdrv_close = sd_close,
c282e1fd 3120 .bdrv_create = sd_create,
3ac21627 3121 .bdrv_has_zero_init = bdrv_has_zero_init_1,
1b8bbb46 3122 .bdrv_getlength = sd_getlength,
85829722 3123 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
1b8bbb46
MK
3124 .bdrv_truncate = sd_truncate,
3125
3126 .bdrv_co_readv = sd_co_readv,
3127 .bdrv_co_writev = sd_co_writev,
3128 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
dde47537 3129 .bdrv_co_pdiscard = sd_co_pdiscard,
b6b8a333 3130 .bdrv_co_get_block_status = sd_co_get_block_status,
1b8bbb46
MK
3131
3132 .bdrv_snapshot_create = sd_snapshot_create,
3133 .bdrv_snapshot_goto = sd_snapshot_goto,
3134 .bdrv_snapshot_delete = sd_snapshot_delete,
3135 .bdrv_snapshot_list = sd_snapshot_list,
3136
3137 .bdrv_save_vmstate = sd_save_vmstate,
3138 .bdrv_load_vmstate = sd_load_vmstate,
3139
84390bed
SH
3140 .bdrv_detach_aio_context = sd_detach_aio_context,
3141 .bdrv_attach_aio_context = sd_attach_aio_context,
3142
b222237b 3143 .create_opts = &sd_create_opts,
1b8bbb46
MK
3144};
3145
33b1db1c
MK
3146static void bdrv_sheepdog_init(void)
3147{
3148 bdrv_register(&bdrv_sheepdog);
5d6768e3 3149 bdrv_register(&bdrv_sheepdog_tcp);
1b8bbb46 3150 bdrv_register(&bdrv_sheepdog_unix);
33b1db1c
MK
3151}
3152block_init(bdrv_sheepdog_init);