]> git.proxmox.com Git - mirror_qemu.git/blob - hw/i386/kvm/xenstore_impl.c
Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into...
[mirror_qemu.git] / hw / i386 / kvm / xenstore_impl.c
1 /*
2 * QEMU Xen emulation: The actual implementation of XenStore
3 *
4 * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5 *
6 * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org>
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
9 * See the COPYING file in the top-level directory.
10 */
11
12 #include "qemu/osdep.h"
13 #include "qom/object.h"
14
15 #include "hw/xen/xen.h"
16
17 #include "xen_xenstore.h"
18 #include "xenstore_impl.h"
19
20 #include "hw/xen/interface/io/xs_wire.h"
21
22 #define XS_MAX_WATCHES 128
23 #define XS_MAX_DOMAIN_NODES 1000
24 #define XS_MAX_NODE_SIZE 2048
25 #define XS_MAX_TRANSACTIONS 10
26 #define XS_MAX_PERMS_PER_NODE 5
27
28 #define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
29 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
30 "0123456789-/_"
31
32 typedef struct XsNode {
33 uint32_t ref;
34 GByteArray *content;
35 GList *perms;
36 GHashTable *children;
37 uint64_t gencnt;
38 bool deleted_in_tx;
39 bool modified_in_tx;
40 unsigned int serialized_tx;
41 #ifdef XS_NODE_UNIT_TEST
42 gchar *name; /* debug only */
43 #endif
44 } XsNode;
45
46 typedef struct XsWatch {
47 struct XsWatch *next;
48 xs_impl_watch_fn *cb;
49 void *cb_opaque;
50 char *token;
51 unsigned int dom_id;
52 int rel_prefix;
53 } XsWatch;
54
55 typedef struct XsTransaction {
56 XsNode *root;
57 unsigned int nr_nodes;
58 unsigned int base_tx;
59 unsigned int tx_id;
60 unsigned int dom_id;
61 } XsTransaction;
62
63 struct XenstoreImplState {
64 XsNode *root;
65 unsigned int nr_nodes;
66 GHashTable *watches;
67 unsigned int nr_domu_watches;
68 GHashTable *transactions;
69 unsigned int nr_domu_transactions;
70 unsigned int root_tx;
71 unsigned int last_tx;
72 bool serialized;
73 };
74
75
76 static void nobble_tx(gpointer key, gpointer value, gpointer user_data)
77 {
78 unsigned int *new_tx_id = user_data;
79 XsTransaction *tx = value;
80
81 if (tx->base_tx == *new_tx_id) {
82 /* Transactions based on XBT_NULL will always fail */
83 tx->base_tx = XBT_NULL;
84 }
85 }
86
87 static inline unsigned int next_tx(struct XenstoreImplState *s)
88 {
89 unsigned int tx_id;
90
91 /* Find the next TX id which isn't either XBT_NULL or in use. */
92 do {
93 tx_id = ++s->last_tx;
94 } while (tx_id == XBT_NULL || tx_id == s->root_tx ||
95 g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id)));
96
97 /*
98 * It is vanishingly unlikely, but ensure that no outstanding transaction
99 * is based on the (previous incarnation of the) newly-allocated TX id.
100 */
101 g_hash_table_foreach(s->transactions, nobble_tx, &tx_id);
102
103 return tx_id;
104 }
105
106 static inline XsNode *xs_node_new(void)
107 {
108 XsNode *n = g_new0(XsNode, 1);
109 n->ref = 1;
110
111 #ifdef XS_NODE_UNIT_TEST
112 nr_xs_nodes++;
113 xs_node_list = g_list_prepend(xs_node_list, n);
114 #endif
115 return n;
116 }
117
118 static inline XsNode *xs_node_ref(XsNode *n)
119 {
120 /* With just 10 transactions, it can never get anywhere near this. */
121 g_assert(n->ref < INT_MAX);
122
123 g_assert(n->ref);
124 n->ref++;
125 return n;
126 }
127
128 static inline void xs_node_unref(XsNode *n)
129 {
130 if (!n) {
131 return;
132 }
133 g_assert(n->ref);
134 if (--n->ref) {
135 return;
136 }
137
138 if (n->content) {
139 g_byte_array_unref(n->content);
140 }
141 if (n->perms) {
142 g_list_free_full(n->perms, g_free);
143 }
144 if (n->children) {
145 g_hash_table_unref(n->children);
146 }
147 #ifdef XS_NODE_UNIT_TEST
148 g_free(n->name);
149 nr_xs_nodes--;
150 xs_node_list = g_list_remove(xs_node_list, n);
151 #endif
152 g_free(n);
153 }
154
155 char *xs_perm_as_string(unsigned int perm, unsigned int domid)
156 {
157 char letter;
158
159 switch (perm) {
160 case XS_PERM_READ | XS_PERM_WRITE:
161 letter = 'b';
162 break;
163 case XS_PERM_READ:
164 letter = 'r';
165 break;
166 case XS_PERM_WRITE:
167 letter = 'w';
168 break;
169 case XS_PERM_NONE:
170 default:
171 letter = 'n';
172 break;
173 }
174
175 return g_strdup_printf("%c%u", letter, domid);
176 }
177
178 static gpointer do_perm_copy(gconstpointer src, gpointer user_data)
179 {
180 return g_strdup(src);
181 }
182
183 static XsNode *xs_node_create(const char *name, GList *perms)
184 {
185 XsNode *n = xs_node_new();
186
187 #ifdef XS_NODE_UNIT_TEST
188 if (name) {
189 n->name = g_strdup(name);
190 }
191 #endif
192
193 n->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
194
195 return n;
196 }
197
198 /* For copying from one hash table to another using g_hash_table_foreach() */
199 static void do_child_insert(gpointer key, gpointer value, gpointer user_data)
200 {
201 g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value));
202 }
203
204 static XsNode *xs_node_copy(XsNode *old)
205 {
206 XsNode *n = xs_node_new();
207
208 n->gencnt = old->gencnt;
209
210 #ifdef XS_NODE_UNIT_TEST
211 if (n->name) {
212 n->name = g_strdup(old->name);
213 }
214 #endif
215
216 assert(old);
217 if (old->children) {
218 n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
219 (GDestroyNotify)xs_node_unref);
220 g_hash_table_foreach(old->children, do_child_insert, n->children);
221 }
222 if (old->perms) {
223 n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
224 }
225 if (old->content) {
226 n->content = g_byte_array_ref(old->content);
227 }
228 return n;
229 }
230
231 /* Returns true if it made a change to the hash table */
232 static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child)
233 {
234 assert(!strchr(path_elem, '/'));
235
236 if (!child) {
237 assert(n->children);
238 return g_hash_table_remove(n->children, path_elem);
239 }
240
241 #ifdef XS_NODE_UNIT_TEST
242 g_free(child->name);
243 child->name = g_strdup(path_elem);
244 #endif
245 if (!n->children) {
246 n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
247 (GDestroyNotify)xs_node_unref);
248 }
249
250 /*
251 * The documentation for g_hash_table_insert() says that it "returns a
252 * boolean value to indicate whether the newly added value was already
253 * in the hash table or not."
254 *
255 * It could perhaps be clearer that returning TRUE means it wasn't,
256 */
257 return g_hash_table_insert(n->children, g_strdup(path_elem), child);
258 }
259
260 struct walk_op {
261 struct XenstoreImplState *s;
262 char path[XENSTORE_ABS_PATH_MAX + 2]; /* Two NUL terminators */
263 int (*op_fn)(XsNode **n, struct walk_op *op);
264 void *op_opaque;
265 void *op_opaque2;
266
267 GList *watches;
268 unsigned int dom_id;
269 unsigned int tx_id;
270
271 /* The number of nodes which will exist in the tree if this op succeeds. */
272 unsigned int new_nr_nodes;
273
274 /*
275 * This is maintained on the way *down* the walk to indicate
276 * whether nodes can be modified in place or whether COW is
277 * required. It starts off being true, as we're always going to
278 * replace the root node. If we walk into a shared subtree it
279 * becomes false. If we start *creating* new nodes for a write,
280 * it becomes true again.
281 *
282 * Do not use it on the way back up.
283 */
284 bool inplace;
285 bool mutating;
286 bool create_dirs;
287 bool in_transaction;
288
289 /* Tracking during recursion so we know which is first. */
290 bool deleted_in_tx;
291 };
292
293 static void fire_watches(struct walk_op *op, bool parents)
294 {
295 GList *l = NULL;
296 XsWatch *w;
297
298 if (!op->mutating || op->in_transaction) {
299 return;
300 }
301
302 if (parents) {
303 l = op->watches;
304 }
305
306 w = g_hash_table_lookup(op->s->watches, op->path);
307 while (w || l) {
308 if (!w) {
309 /* Fire the parent nodes from 'op' if asked to */
310 w = l->data;
311 l = l->next;
312 continue;
313 }
314
315 assert(strlen(op->path) > w->rel_prefix);
316 w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token);
317
318 w = w->next;
319 }
320 }
321
322 static int xs_node_add_content(XsNode **n, struct walk_op *op)
323 {
324 GByteArray *data = op->op_opaque;
325
326 if (op->dom_id) {
327 /*
328 * The real XenStored includes permissions and names of child nodes
329 * in the calculated datasize but life's too short. For a single
330 * tenant internal XenStore, we don't have to be quite as pedantic.
331 */
332 if (data->len > XS_MAX_NODE_SIZE) {
333 return E2BIG;
334 }
335 }
336 /* We *are* the node to be written. Either this or a copy. */
337 if (!op->inplace) {
338 XsNode *old = *n;
339 *n = xs_node_copy(old);
340 xs_node_unref(old);
341 }
342
343 if ((*n)->content) {
344 g_byte_array_unref((*n)->content);
345 }
346 (*n)->content = g_byte_array_ref(data);
347 if (op->tx_id != XBT_NULL) {
348 (*n)->modified_in_tx = true;
349 }
350 return 0;
351 }
352
353 static int xs_node_get_content(XsNode **n, struct walk_op *op)
354 {
355 GByteArray *data = op->op_opaque;
356 GByteArray *node_data;
357
358 assert(op->inplace);
359 assert(*n);
360
361 node_data = (*n)->content;
362 if (node_data) {
363 g_byte_array_append(data, node_data->data, node_data->len);
364 }
365
366 return 0;
367 }
368
369 static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data)
370 {
371 struct walk_op *op = user_data;
372 int path_len = strlen(op->path);
373 int key_len = strlen(key);
374 XsNode *n = value;
375 bool this_inplace = op->inplace;
376
377 if (n->ref != 1) {
378 op->inplace = 0;
379 }
380
381 assert(key_len + path_len + 2 <= sizeof(op->path));
382 op->path[path_len] = '/';
383 memcpy(op->path + path_len + 1, key, key_len + 1);
384
385 if (n->children) {
386 g_hash_table_foreach_remove(n->children, node_rm_recurse, op);
387 }
388 op->new_nr_nodes--;
389
390 /*
391 * Fire watches on *this* node but not the parents because they are
392 * going to be deleted too, so the watch will fire for them anyway.
393 */
394 fire_watches(op, false);
395 op->path[path_len] = '\0';
396
397 /*
398 * Actually deleting the child here is just an optimisation; if we
399 * don't then the final unref on the topmost victim will just have
400 * to cascade down again repeating all the g_hash_table_foreach()
401 * calls.
402 */
403 return this_inplace;
404 }
405
406 static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op);
407 static void copy_deleted_recurse(gpointer key, gpointer value,
408 gpointer user_data)
409 {
410 struct walk_op *op = user_data;
411 GHashTable *siblings = op->op_opaque2;
412 XsNode *n = xs_node_copy_deleted(value, op);
413
414 /*
415 * Reinsert the deleted_in_tx copy of the node into the parent's
416 * 'children' hash table. Having stashed it from op->op_opaque2
417 * before the recursive call to xs_node_copy_deleted() scribbled
418 * over it.
419 */
420 g_hash_table_insert(siblings, g_strdup(key), n);
421 }
422
423 static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op)
424 {
425 XsNode *n = xs_node_new();
426
427 n->gencnt = old->gencnt;
428
429 #ifdef XS_NODE_UNIT_TEST
430 if (old->name) {
431 n->name = g_strdup(old->name);
432 }
433 #endif
434
435 if (old->children) {
436 n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
437 (GDestroyNotify)xs_node_unref);
438 op->op_opaque2 = n->children;
439 g_hash_table_foreach(old->children, copy_deleted_recurse, op);
440 }
441 if (old->perms) {
442 n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
443 }
444 n->deleted_in_tx = true;
445 /* If it gets resurrected we only fire a watch if it lost its content */
446 if (old->content) {
447 n->modified_in_tx = true;
448 }
449 op->new_nr_nodes--;
450 return n;
451 }
452
453 static int xs_node_rm(XsNode **n, struct walk_op *op)
454 {
455 bool this_inplace = op->inplace;
456
457 if (op->tx_id != XBT_NULL) {
458 /* It's not trivial to do inplace handling for this one */
459 XsNode *old = *n;
460 *n = xs_node_copy_deleted(old, op);
461 xs_node_unref(old);
462 return 0;
463 }
464
465 /* Fire watches for, and count, nodes in the subtree which get deleted */
466 if ((*n)->children) {
467 g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
468 }
469 op->new_nr_nodes--;
470
471 if (this_inplace) {
472 xs_node_unref(*n);
473 }
474 *n = NULL;
475 return 0;
476 }
477
478 static int xs_node_get_perms(XsNode **n, struct walk_op *op)
479 {
480 GList **perms = op->op_opaque;
481
482 assert(op->inplace);
483 assert(*n);
484
485 *perms = g_list_copy_deep((*n)->perms, do_perm_copy, NULL);
486 return 0;
487 }
488
489 static void parse_perm(const char *perm, char *letter, unsigned int *dom_id)
490 {
491 unsigned int n = sscanf(perm, "%c%u", letter, dom_id);
492
493 assert(n == 2);
494 }
495
496 static bool can_access(unsigned int dom_id, GList *perms, const char *letters)
497 {
498 unsigned int i, n;
499 char perm_letter;
500 unsigned int perm_dom_id;
501 bool access;
502
503 if (dom_id == 0) {
504 return true;
505 }
506
507 n = g_list_length(perms);
508 assert(n >= 1);
509
510 /*
511 * The dom_id of the first perm is the owner, and the owner always has
512 * read-write access.
513 */
514 parse_perm(g_list_nth_data(perms, 0), &perm_letter, &perm_dom_id);
515 if (dom_id == perm_dom_id) {
516 return true;
517 }
518
519 /*
520 * The letter of the first perm specified the default access for all other
521 * domains.
522 */
523 access = !!strchr(letters, perm_letter);
524 for (i = 1; i < n; i++) {
525 parse_perm(g_list_nth_data(perms, i), &perm_letter, &perm_dom_id);
526 if (dom_id != perm_dom_id) {
527 continue;
528 }
529 access = !!strchr(letters, perm_letter);
530 }
531
532 return access;
533 }
534
535 static int xs_node_set_perms(XsNode **n, struct walk_op *op)
536 {
537 GList *perms = op->op_opaque;
538
539 if (op->dom_id) {
540 unsigned int perm_dom_id;
541 char perm_letter;
542
543 /* A guest may not change permissions on nodes it does not own */
544 if (!can_access(op->dom_id, (*n)->perms, "")) {
545 return EPERM;
546 }
547
548 /* A guest may not change the owner of a node it owns. */
549 parse_perm(perms->data, &perm_letter, &perm_dom_id);
550 if (perm_dom_id != op->dom_id) {
551 return EPERM;
552 }
553
554 if (g_list_length(perms) > XS_MAX_PERMS_PER_NODE) {
555 return ENOSPC;
556 }
557 }
558
559 /* We *are* the node to be written. Either this or a copy. */
560 if (!op->inplace) {
561 XsNode *old = *n;
562 *n = xs_node_copy(old);
563 xs_node_unref(old);
564 }
565
566 if ((*n)->perms) {
567 g_list_free_full((*n)->perms, g_free);
568 }
569 (*n)->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
570 if (op->tx_id != XBT_NULL) {
571 (*n)->modified_in_tx = true;
572 }
573 return 0;
574 }
575
576 /*
577 * Passed a full reference in *n which it may free if it needs to COW.
578 *
579 * When changing the tree, the op->inplace flag indicates whether this
580 * node may be modified in place (i.e. it and all its parents had a
581 * refcount of one). If walking down the tree we find a node whose
582 * refcount is higher, we must clear op->inplace and COW from there
583 * down. Unless we are creating new nodes as scaffolding for a write
584 * (which works like 'mkdir -p' does). In which case those newly
585 * created nodes can (and must) be modified in place again.
586 */
587 static int xs_node_walk(XsNode **n, struct walk_op *op)
588 {
589 char *child_name = NULL;
590 size_t namelen;
591 XsNode *old = *n, *child = NULL;
592 bool stole_child = false;
593 bool this_inplace;
594 XsWatch *watch;
595 int err;
596
597 namelen = strlen(op->path);
598 watch = g_hash_table_lookup(op->s->watches, op->path);
599
600 /* Is there a child, or do we hit the double-NUL termination? */
601 if (op->path[namelen + 1]) {
602 char *slash;
603 child_name = op->path + namelen + 1;
604 slash = strchr(child_name, '/');
605 if (slash) {
606 *slash = '\0';
607 }
608 op->path[namelen] = '/';
609 }
610
611 /* If we walk into a subtree which is shared, we must COW */
612 if (op->mutating && old->ref != 1) {
613 op->inplace = false;
614 }
615
616 if (!child_name) {
617 const char *letters = op->mutating ? "wb" : "rb";
618
619 if (!can_access(op->dom_id, old->perms, letters)) {
620 err = EACCES;
621 goto out;
622 }
623
624 /* This is the actual node on which the operation shall be performed */
625 err = op->op_fn(n, op);
626 if (!err) {
627 fire_watches(op, true);
628 }
629 goto out;
630 }
631
632 /* op->inplace will be further modified during the recursion */
633 this_inplace = op->inplace;
634
635 if (old && old->children) {
636 child = g_hash_table_lookup(old->children, child_name);
637 /* This is a *weak* reference to 'child', owned by the hash table */
638 }
639
640 if (child) {
641 if (child->deleted_in_tx) {
642 assert(child->ref == 1);
643 /* Cannot actually set child->deleted_in_tx = false until later */
644 }
645 xs_node_ref(child);
646 /*
647 * Now we own it too. But if we can modify inplace, that's going to
648 * foil the check and force it to COW. We want to be the *only* owner
649 * so that it can be modified in place, so remove it from the hash
650 * table in that case. We'll add it (or its replacement) back later.
651 */
652 if (op->mutating && this_inplace) {
653 g_hash_table_remove(old->children, child_name);
654 stole_child = true;
655 }
656 } else if (op->create_dirs) {
657 assert(op->mutating);
658
659 if (!can_access(op->dom_id, old->perms, "wb")) {
660 err = EACCES;
661 goto out;
662 }
663
664 if (op->dom_id && op->new_nr_nodes >= XS_MAX_DOMAIN_NODES) {
665 err = ENOSPC;
666 goto out;
667 }
668
669 child = xs_node_create(child_name, old->perms);
670 op->new_nr_nodes++;
671
672 /*
673 * If we're creating a new child, we can clearly modify it (and its
674 * children) in place from here on down.
675 */
676 op->inplace = true;
677 } else {
678 err = ENOENT;
679 goto out;
680 }
681
682 /*
683 * If there's a watch on this node, add it to the list to be fired
684 * (with the correct full pathname for the modified node) at the end.
685 */
686 if (watch) {
687 op->watches = g_list_append(op->watches, watch);
688 }
689
690 /*
691 * Except for the temporary child-stealing as noted, our node has not
692 * changed yet. We don't yet know the overall operation will complete.
693 */
694 err = xs_node_walk(&child, op);
695
696 if (watch) {
697 op->watches = g_list_remove(op->watches, watch);
698 }
699
700 if (err || !op->mutating) {
701 if (stole_child) {
702 /* Put it back as it was. */
703 g_hash_table_replace(old->children, g_strdup(child_name), child);
704 } else {
705 xs_node_unref(child);
706 }
707 goto out;
708 }
709
710 /*
711 * Now we know the operation has completed successfully and we're on
712 * the way back up. Make the change, substituting 'child' in the
713 * node at our level.
714 */
715 if (!this_inplace) {
716 *n = xs_node_copy(old);
717 xs_node_unref(old);
718 }
719
720 /*
721 * If we resurrected a deleted_in_tx node, we can mark it as no longer
722 * deleted now that we know the overall operation has succeeded.
723 */
724 if (op->create_dirs && child && child->deleted_in_tx) {
725 op->new_nr_nodes++;
726 child->deleted_in_tx = false;
727 }
728
729 /*
730 * The child may be NULL here, for a remove operation. Either way,
731 * xs_node_add_child() will do the right thing and return a value
732 * indicating whether it changed the parent's hash table or not.
733 *
734 * We bump the parent gencnt if it adds a child that we *didn't*
735 * steal from it in the first place, or if child==NULL and was
736 * thus removed (whether we stole it earlier and didn't put it
737 * back, or xs_node_add_child() actually removed it now).
738 */
739 if ((xs_node_add_child(*n, child_name, child) && !stole_child) || !child) {
740 (*n)->gencnt++;
741 }
742
743 out:
744 op->path[namelen] = '\0';
745 if (!namelen) {
746 assert(!op->watches);
747 /*
748 * On completing the recursion back up the path walk and reaching the
749 * top, assign the new node count if the operation was successful. If
750 * the main tree was changed, bump its tx ID so that outstanding
751 * transactions correctly fail. But don't bump it every time; only
752 * if it makes a difference.
753 */
754 if (!err && op->mutating) {
755 if (!op->in_transaction) {
756 if (op->s->root_tx != op->s->last_tx) {
757 op->s->root_tx = next_tx(op->s);
758 }
759 op->s->nr_nodes = op->new_nr_nodes;
760 } else {
761 XsTransaction *tx = g_hash_table_lookup(op->s->transactions,
762 GINT_TO_POINTER(op->tx_id));
763 assert(tx);
764 tx->nr_nodes = op->new_nr_nodes;
765 }
766 }
767 }
768 return err;
769 }
770
771 static void append_directory_item(gpointer key, gpointer value,
772 gpointer user_data)
773 {
774 GList **items = user_data;
775
776 *items = g_list_insert_sorted(*items, g_strdup(key), (GCompareFunc)strcmp);
777 }
778
779 /* Populates items with char * names which caller must free. */
780 static int xs_node_directory(XsNode **n, struct walk_op *op)
781 {
782 GList **items = op->op_opaque;
783
784 assert(op->inplace);
785 assert(*n);
786
787 if ((*n)->children) {
788 g_hash_table_foreach((*n)->children, append_directory_item, items);
789 }
790
791 if (op->op_opaque2) {
792 *(uint64_t *)op->op_opaque2 = (*n)->gencnt;
793 }
794
795 return 0;
796 }
797
798 static int validate_path(char *outpath, const char *userpath,
799 unsigned int dom_id)
800 {
801 size_t i, pathlen = strlen(userpath);
802
803 if (!pathlen || userpath[pathlen] == '/' || strstr(userpath, "//")) {
804 return EINVAL;
805 }
806 for (i = 0; i < pathlen; i++) {
807 if (!strchr(XS_VALID_CHARS, userpath[i])) {
808 return EINVAL;
809 }
810 }
811 if (userpath[0] == '/') {
812 if (pathlen > XENSTORE_ABS_PATH_MAX) {
813 return E2BIG;
814 }
815 memcpy(outpath, userpath, pathlen + 1);
816 } else {
817 if (pathlen > XENSTORE_REL_PATH_MAX) {
818 return E2BIG;
819 }
820 snprintf(outpath, XENSTORE_ABS_PATH_MAX, "/local/domain/%u/%s", dom_id,
821 userpath);
822 }
823 return 0;
824 }
825
826
827 static int init_walk_op(XenstoreImplState *s, struct walk_op *op,
828 xs_transaction_t tx_id, unsigned int dom_id,
829 const char *path, XsNode ***rootp)
830 {
831 int ret = validate_path(op->path, path, dom_id);
832 if (ret) {
833 return ret;
834 }
835
836 /*
837 * We use *two* NUL terminators at the end of the path, as during the walk
838 * we will temporarily turn each '/' into a NUL to allow us to use that
839 * path element for the lookup.
840 */
841 op->path[strlen(op->path) + 1] = '\0';
842 op->watches = NULL;
843 op->path[0] = '\0';
844 op->inplace = true;
845 op->mutating = false;
846 op->create_dirs = false;
847 op->in_transaction = false;
848 op->dom_id = dom_id;
849 op->tx_id = tx_id;
850 op->s = s;
851
852 if (tx_id == XBT_NULL) {
853 *rootp = &s->root;
854 op->new_nr_nodes = s->nr_nodes;
855 } else {
856 XsTransaction *tx = g_hash_table_lookup(s->transactions,
857 GINT_TO_POINTER(tx_id));
858 if (!tx) {
859 return ENOENT;
860 }
861 *rootp = &tx->root;
862 op->new_nr_nodes = tx->nr_nodes;
863 op->in_transaction = true;
864 }
865
866 return 0;
867 }
868
869 int xs_impl_read(XenstoreImplState *s, unsigned int dom_id,
870 xs_transaction_t tx_id, const char *path, GByteArray *data)
871 {
872 /*
873 * The data GByteArray shall exist, and will be freed by caller.
874 * Just g_byte_array_append() to it.
875 */
876 struct walk_op op;
877 XsNode **n;
878 int ret;
879
880 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
881 if (ret) {
882 return ret;
883 }
884 op.op_fn = xs_node_get_content;
885 op.op_opaque = data;
886 return xs_node_walk(n, &op);
887 }
888
889 int xs_impl_write(XenstoreImplState *s, unsigned int dom_id,
890 xs_transaction_t tx_id, const char *path, GByteArray *data)
891 {
892 /*
893 * The data GByteArray shall exist, will be freed by caller. You are
894 * free to use g_byte_array_steal() and keep the data. Or just ref it.
895 */
896 struct walk_op op;
897 XsNode **n;
898 int ret;
899
900 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
901 if (ret) {
902 return ret;
903 }
904 op.op_fn = xs_node_add_content;
905 op.op_opaque = data;
906 op.mutating = true;
907 op.create_dirs = true;
908 return xs_node_walk(n, &op);
909 }
910
911 int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id,
912 xs_transaction_t tx_id, const char *path,
913 uint64_t *gencnt, GList **items)
914 {
915 /*
916 * The items are (char *) to be freed by caller. Although it's consumed
917 * immediately so if you want to change it to (const char *) and keep
918 * them, go ahead and change the caller.
919 */
920 struct walk_op op;
921 XsNode **n;
922 int ret;
923
924 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
925 if (ret) {
926 return ret;
927 }
928 op.op_fn = xs_node_directory;
929 op.op_opaque = items;
930 op.op_opaque2 = gencnt;
931 return xs_node_walk(n, &op);
932 }
933
934 int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id,
935 xs_transaction_t *tx_id)
936 {
937 XsTransaction *tx;
938
939 if (*tx_id != XBT_NULL) {
940 return EINVAL;
941 }
942
943 if (dom_id && s->nr_domu_transactions >= XS_MAX_TRANSACTIONS) {
944 return ENOSPC;
945 }
946
947 tx = g_new0(XsTransaction, 1);
948
949 tx->nr_nodes = s->nr_nodes;
950 tx->tx_id = next_tx(s);
951 tx->base_tx = s->root_tx;
952 tx->root = xs_node_ref(s->root);
953 tx->dom_id = dom_id;
954
955 g_hash_table_insert(s->transactions, GINT_TO_POINTER(tx->tx_id), tx);
956 if (dom_id) {
957 s->nr_domu_transactions++;
958 }
959 *tx_id = tx->tx_id;
960 return 0;
961 }
962
963 static gboolean tx_commit_walk(gpointer key, gpointer value,
964 gpointer user_data)
965 {
966 struct walk_op *op = user_data;
967 int path_len = strlen(op->path);
968 int key_len = strlen(key);
969 bool fire_parents = true;
970 XsWatch *watch;
971 XsNode *n = value;
972
973 if (n->ref != 1) {
974 return false;
975 }
976
977 if (n->deleted_in_tx) {
978 /*
979 * We fire watches on our parents if we are the *first* node
980 * to be deleted (the topmost one). This matches the behaviour
981 * when deleting in the live tree.
982 */
983 fire_parents = !op->deleted_in_tx;
984
985 /* Only used on the way down so no need to clear it later */
986 op->deleted_in_tx = true;
987 }
988
989 assert(key_len + path_len + 2 <= sizeof(op->path));
990 op->path[path_len] = '/';
991 memcpy(op->path + path_len + 1, key, key_len + 1);
992
993 watch = g_hash_table_lookup(op->s->watches, op->path);
994 if (watch) {
995 op->watches = g_list_append(op->watches, watch);
996 }
997
998 if (n->children) {
999 g_hash_table_foreach_remove(n->children, tx_commit_walk, op);
1000 }
1001
1002 if (watch) {
1003 op->watches = g_list_remove(op->watches, watch);
1004 }
1005
1006 /*
1007 * Don't fire watches if this node was only copied because a
1008 * descendent was changed. The modified_in_tx flag indicates the
1009 * ones which were really changed.
1010 */
1011 if (n->modified_in_tx || n->deleted_in_tx) {
1012 fire_watches(op, fire_parents);
1013 n->modified_in_tx = false;
1014 }
1015 op->path[path_len] = '\0';
1016
1017 /* Deleted nodes really do get expunged when we commit */
1018 return n->deleted_in_tx;
1019 }
1020
1021 static int transaction_commit(XenstoreImplState *s, XsTransaction *tx)
1022 {
1023 struct walk_op op;
1024 XsNode **n;
1025
1026 if (s->root_tx != tx->base_tx) {
1027 return EAGAIN;
1028 }
1029 xs_node_unref(s->root);
1030 s->root = tx->root;
1031 tx->root = NULL;
1032 s->root_tx = tx->tx_id;
1033 s->nr_nodes = tx->nr_nodes;
1034
1035 init_walk_op(s, &op, XBT_NULL, tx->dom_id, "/", &n);
1036 op.deleted_in_tx = false;
1037 op.mutating = true;
1038
1039 /*
1040 * Walk the new root and fire watches on any node which has a
1041 * refcount of one (which is therefore unique to this transaction).
1042 */
1043 if (s->root->children) {
1044 g_hash_table_foreach_remove(s->root->children, tx_commit_walk, &op);
1045 }
1046
1047 return 0;
1048 }
1049
1050 int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id,
1051 xs_transaction_t tx_id, bool commit)
1052 {
1053 int ret = 0;
1054 XsTransaction *tx = g_hash_table_lookup(s->transactions,
1055 GINT_TO_POINTER(tx_id));
1056
1057 if (!tx || tx->dom_id != dom_id) {
1058 return ENOENT;
1059 }
1060
1061 if (commit) {
1062 ret = transaction_commit(s, tx);
1063 }
1064
1065 g_hash_table_remove(s->transactions, GINT_TO_POINTER(tx_id));
1066 if (dom_id) {
1067 assert(s->nr_domu_transactions);
1068 s->nr_domu_transactions--;
1069 }
1070 return ret;
1071 }
1072
1073 int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id,
1074 xs_transaction_t tx_id, const char *path)
1075 {
1076 struct walk_op op;
1077 XsNode **n;
1078 int ret;
1079
1080 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1081 if (ret) {
1082 return ret;
1083 }
1084 op.op_fn = xs_node_rm;
1085 op.mutating = true;
1086 return xs_node_walk(n, &op);
1087 }
1088
1089 int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id,
1090 xs_transaction_t tx_id, const char *path, GList **perms)
1091 {
1092 struct walk_op op;
1093 XsNode **n;
1094 int ret;
1095
1096 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1097 if (ret) {
1098 return ret;
1099 }
1100 op.op_fn = xs_node_get_perms;
1101 op.op_opaque = perms;
1102 return xs_node_walk(n, &op);
1103 }
1104
1105 static void is_valid_perm(gpointer data, gpointer user_data)
1106 {
1107 char *perm = data;
1108 bool *valid = user_data;
1109 char letter;
1110 unsigned int dom_id;
1111
1112 if (!*valid) {
1113 return;
1114 }
1115
1116 if (sscanf(perm, "%c%u", &letter, &dom_id) != 2) {
1117 *valid = false;
1118 return;
1119 }
1120
1121 switch (letter) {
1122 case 'n':
1123 case 'r':
1124 case 'w':
1125 case 'b':
1126 break;
1127
1128 default:
1129 *valid = false;
1130 break;
1131 }
1132 }
1133
1134 int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id,
1135 xs_transaction_t tx_id, const char *path, GList *perms)
1136 {
1137 struct walk_op op;
1138 XsNode **n;
1139 bool valid = true;
1140 int ret;
1141
1142 if (!g_list_length(perms)) {
1143 return EINVAL;
1144 }
1145
1146 g_list_foreach(perms, is_valid_perm, &valid);
1147 if (!valid) {
1148 return EINVAL;
1149 }
1150
1151 ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1152 if (ret) {
1153 return ret;
1154 }
1155 op.op_fn = xs_node_set_perms;
1156 op.op_opaque = perms;
1157 op.mutating = true;
1158 return xs_node_walk(n, &op);
1159 }
1160
1161 static int do_xs_impl_watch(XenstoreImplState *s, unsigned int dom_id,
1162 const char *path, const char *token,
1163 xs_impl_watch_fn fn, void *opaque)
1164
1165 {
1166 char abspath[XENSTORE_ABS_PATH_MAX + 1];
1167 XsWatch *w, *l;
1168 int ret;
1169
1170 ret = validate_path(abspath, path, dom_id);
1171 if (ret) {
1172 return ret;
1173 }
1174
1175 /* Check for duplicates */
1176 l = w = g_hash_table_lookup(s->watches, abspath);
1177 while (w) {
1178 if (!g_strcmp0(token, w->token) && opaque == w->cb_opaque &&
1179 fn == w->cb && dom_id == w->dom_id) {
1180 return EEXIST;
1181 }
1182 w = w->next;
1183 }
1184
1185 if (dom_id && s->nr_domu_watches >= XS_MAX_WATCHES) {
1186 return E2BIG;
1187 }
1188
1189 w = g_new0(XsWatch, 1);
1190 w->token = g_strdup(token);
1191 w->cb = fn;
1192 w->cb_opaque = opaque;
1193 w->dom_id = dom_id;
1194 w->rel_prefix = strlen(abspath) - strlen(path);
1195
1196 /* l was looked up above when checking for duplicates */
1197 if (l) {
1198 w->next = l->next;
1199 l->next = w;
1200 } else {
1201 g_hash_table_insert(s->watches, g_strdup(abspath), w);
1202 }
1203 if (dom_id) {
1204 s->nr_domu_watches++;
1205 }
1206
1207 return 0;
1208 }
1209
1210 int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
1211 const char *token, xs_impl_watch_fn fn, void *opaque)
1212 {
1213 int ret = do_xs_impl_watch(s, dom_id, path, token, fn, opaque);
1214
1215 if (!ret) {
1216 /* A new watch should fire immediately */
1217 fn(opaque, path, token);
1218 }
1219
1220 return ret;
1221 }
1222
1223 static XsWatch *free_watch(XenstoreImplState *s, XsWatch *w)
1224 {
1225 XsWatch *next = w->next;
1226
1227 if (w->dom_id) {
1228 assert(s->nr_domu_watches);
1229 s->nr_domu_watches--;
1230 }
1231
1232 g_free(w->token);
1233 g_free(w);
1234
1235 return next;
1236 }
1237
1238 int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id,
1239 const char *path, const char *token,
1240 xs_impl_watch_fn fn, void *opaque)
1241 {
1242 char abspath[XENSTORE_ABS_PATH_MAX + 1];
1243 XsWatch *w, **l;
1244 int ret;
1245
1246 ret = validate_path(abspath, path, dom_id);
1247 if (ret) {
1248 return ret;
1249 }
1250
1251 w = g_hash_table_lookup(s->watches, abspath);
1252 if (!w) {
1253 return ENOENT;
1254 }
1255
1256 /*
1257 * The hash table contains the first element of a list of
1258 * watches. Removing the first element in the list is a
1259 * special case because we have to update the hash table to
1260 * point to the next (or remove it if there's nothing left).
1261 */
1262 if (!g_strcmp0(token, w->token) && fn == w->cb && opaque == w->cb_opaque &&
1263 dom_id == w->dom_id) {
1264 if (w->next) {
1265 /* Insert the previous 'next' into the hash table */
1266 g_hash_table_insert(s->watches, g_strdup(abspath), w->next);
1267 } else {
1268 /* Nothing left; remove from hash table */
1269 g_hash_table_remove(s->watches, abspath);
1270 }
1271 free_watch(s, w);
1272 return 0;
1273 }
1274
1275 /*
1276 * We're all done messing with the hash table because the element
1277 * it points to has survived the cull. Now it's just a simple
1278 * linked list removal operation.
1279 */
1280 for (l = &w->next; *l; l = &w->next) {
1281 w = *l;
1282
1283 if (!g_strcmp0(token, w->token) && fn == w->cb &&
1284 opaque != w->cb_opaque && dom_id == w->dom_id) {
1285 *l = free_watch(s, w);
1286 return 0;
1287 }
1288 }
1289
1290 return ENOENT;
1291 }
1292
1293 int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id)
1294 {
1295 char **watch_paths;
1296 guint nr_watch_paths;
1297 guint i;
1298
1299 watch_paths = (char **)g_hash_table_get_keys_as_array(s->watches,
1300 &nr_watch_paths);
1301
1302 for (i = 0; i < nr_watch_paths; i++) {
1303 XsWatch *w1 = g_hash_table_lookup(s->watches, watch_paths[i]);
1304 XsWatch *w2, *w, **l;
1305
1306 /*
1307 * w1 is the original list. The hash table has this pointer.
1308 * w2 is the head of our newly-filtered list.
1309 * w and l are temporary for processing. w is somewhat redundant
1310 * with *l but makes my eyes bleed less.
1311 */
1312
1313 w = w2 = w1;
1314 l = &w;
1315 while (w) {
1316 if (w->dom_id == dom_id) {
1317 /* If we're freeing the head of the list, bump w2 */
1318 if (w2 == w) {
1319 w2 = w->next;
1320 }
1321 *l = free_watch(s, w);
1322 } else {
1323 l = &w->next;
1324 }
1325 w = *l;
1326 }
1327 /*
1328 * If the head of the list survived the cull, we don't need to
1329 * touch the hash table and we're done with this path. Else...
1330 */
1331 if (w1 != w2) {
1332 g_hash_table_steal(s->watches, watch_paths[i]);
1333
1334 /*
1335 * It was already freed. (Don't worry, this whole thing is
1336 * single-threaded and nobody saw it in the meantime). And
1337 * having *stolen* it, we now own the watch_paths[i] string
1338 * so if we don't give it back to the hash table, we need
1339 * to free it.
1340 */
1341 if (w2) {
1342 g_hash_table_insert(s->watches, watch_paths[i], w2);
1343 } else {
1344 g_free(watch_paths[i]);
1345 }
1346 }
1347 }
1348 g_free(watch_paths);
1349 return 0;
1350 }
1351
1352 static void xs_tx_free(void *_tx)
1353 {
1354 XsTransaction *tx = _tx;
1355 if (tx->root) {
1356 xs_node_unref(tx->root);
1357 }
1358 g_free(tx);
1359 }
1360
1361 XenstoreImplState *xs_impl_create(unsigned int dom_id)
1362 {
1363 XenstoreImplState *s = g_new0(XenstoreImplState, 1);
1364 GList *perms;
1365
1366 s->watches = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
1367 s->transactions = g_hash_table_new_full(g_direct_hash, g_direct_equal,
1368 NULL, xs_tx_free);
1369
1370 perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, 0));
1371 s->root = xs_node_create("/", perms);
1372 g_list_free_full(perms, g_free);
1373 s->nr_nodes = 1;
1374
1375 s->root_tx = s->last_tx = 1;
1376 return s;
1377 }
1378
1379
1380 static void clear_serialized_tx(gpointer key, gpointer value, gpointer opaque)
1381 {
1382 XsNode *n = value;
1383
1384 n->serialized_tx = XBT_NULL;
1385 if (n->children) {
1386 g_hash_table_foreach(n->children, clear_serialized_tx, NULL);
1387 }
1388 }
1389
1390 static void clear_tx_serialized_tx(gpointer key, gpointer value,
1391 gpointer opaque)
1392 {
1393 XsTransaction *t = value;
1394
1395 clear_serialized_tx(NULL, t->root, NULL);
1396 }
1397
1398 static void write_be32(GByteArray *save, uint32_t val)
1399 {
1400 uint32_t be = htonl(val);
1401 g_byte_array_append(save, (void *)&be, sizeof(be));
1402 }
1403
1404
1405 struct save_state {
1406 GByteArray *bytes;
1407 unsigned int tx_id;
1408 };
1409
1410 #define MODIFIED_IN_TX (1U << 0)
1411 #define DELETED_IN_TX (1U << 1)
1412 #define NODE_REF (1U << 2)
1413
1414 static void save_node(gpointer key, gpointer value, gpointer opaque)
1415 {
1416 struct save_state *ss = opaque;
1417 XsNode *n = value;
1418 char *name = key;
1419 uint8_t flag = 0;
1420
1421 /* Child nodes (i.e. anything but the root) have a name */
1422 if (name) {
1423 g_byte_array_append(ss->bytes, key, strlen(key) + 1);
1424 }
1425
1426 /*
1427 * If we already wrote this node, refer to the previous copy.
1428 * There's no rename/move in XenStore, so all we need to find
1429 * it is the tx_id of the transation in which it exists. Which
1430 * may be the root tx.
1431 */
1432 if (n->serialized_tx != XBT_NULL) {
1433 flag = NODE_REF;
1434 g_byte_array_append(ss->bytes, &flag, 1);
1435 write_be32(ss->bytes, n->serialized_tx);
1436 } else {
1437 GList *l;
1438 n->serialized_tx = ss->tx_id;
1439
1440 if (n->modified_in_tx) {
1441 flag |= MODIFIED_IN_TX;
1442 }
1443 if (n->deleted_in_tx) {
1444 flag |= DELETED_IN_TX;
1445 }
1446 g_byte_array_append(ss->bytes, &flag, 1);
1447
1448 if (n->content) {
1449 write_be32(ss->bytes, n->content->len);
1450 g_byte_array_append(ss->bytes, n->content->data, n->content->len);
1451 } else {
1452 write_be32(ss->bytes, 0);
1453 }
1454
1455 for (l = n->perms; l; l = l->next) {
1456 g_byte_array_append(ss->bytes, l->data, strlen(l->data) + 1);
1457 }
1458 /* NUL termination after perms */
1459 g_byte_array_append(ss->bytes, (void *)"", 1);
1460
1461 if (n->children) {
1462 g_hash_table_foreach(n->children, save_node, ss);
1463 }
1464 /* NUL termination after children (child name is NUL) */
1465 g_byte_array_append(ss->bytes, (void *)"", 1);
1466 }
1467 }
1468
1469 static void save_tree(struct save_state *ss, uint32_t tx_id, XsNode *root)
1470 {
1471 write_be32(ss->bytes, tx_id);
1472 ss->tx_id = tx_id;
1473 save_node(NULL, root, ss);
1474 }
1475
1476 static void save_tx(gpointer key, gpointer value, gpointer opaque)
1477 {
1478 uint32_t tx_id = GPOINTER_TO_INT(key);
1479 struct save_state *ss = opaque;
1480 XsTransaction *n = value;
1481
1482 write_be32(ss->bytes, n->base_tx);
1483 write_be32(ss->bytes, n->dom_id);
1484
1485 save_tree(ss, tx_id, n->root);
1486 }
1487
1488 static void save_watch(gpointer key, gpointer value, gpointer opaque)
1489 {
1490 struct save_state *ss = opaque;
1491 XsWatch *w = value;
1492
1493 /* We only save the *guest* watches. */
1494 if (w->dom_id) {
1495 gpointer relpath = key + w->rel_prefix;
1496 g_byte_array_append(ss->bytes, relpath, strlen(relpath) + 1);
1497 g_byte_array_append(ss->bytes, (void *)w->token, strlen(w->token) + 1);
1498 }
1499 }
1500
1501 GByteArray *xs_impl_serialize(XenstoreImplState *s)
1502 {
1503 struct save_state ss;
1504
1505 ss.bytes = g_byte_array_new();
1506
1507 /*
1508 * node = flags [ real_node / node_ref ]
1509 * flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF)
1510 * node_ref = tx_id (in which the original version of this node exists)
1511 * real_node = content perms child* NUL
1512 * content = len data
1513 * len = uint32_t
1514 * data = uint8_t{len}
1515 * perms = perm* NUL
1516 * perm = asciiz
1517 * child = name node
1518 * name = asciiz
1519 *
1520 * tree = tx_id node
1521 * tx_id = uint32_t
1522 *
1523 * transaction = base_tx_id dom_id tree
1524 * base_tx_id = uint32_t
1525 * dom_id = uint32_t
1526 *
1527 * tx_list = tree transaction* XBT_NULL
1528 *
1529 * watch = path token
1530 * path = asciiz
1531 * token = asciiz
1532 *
1533 * watch_list = watch* NUL
1534 *
1535 * xs_serialize_stream = last_tx tx_list watch_list
1536 * last_tx = uint32_t
1537 */
1538
1539 /* Clear serialized_tx in every node. */
1540 if (s->serialized) {
1541 clear_serialized_tx(NULL, s->root, NULL);
1542 g_hash_table_foreach(s->transactions, clear_tx_serialized_tx, NULL);
1543 }
1544
1545 s->serialized = true;
1546
1547 write_be32(ss.bytes, s->last_tx);
1548 save_tree(&ss, s->root_tx, s->root);
1549 g_hash_table_foreach(s->transactions, save_tx, &ss);
1550
1551 write_be32(ss.bytes, XBT_NULL);
1552
1553 g_hash_table_foreach(s->watches, save_watch, &ss);
1554 g_byte_array_append(ss.bytes, (void *)"", 1);
1555
1556 return ss.bytes;
1557 }
1558
1559 struct unsave_state {
1560 char path[XENSTORE_ABS_PATH_MAX + 1];
1561 XenstoreImplState *s;
1562 GByteArray *bytes;
1563 uint8_t *d;
1564 size_t l;
1565 bool root_walk;
1566 };
1567
1568 static int consume_be32(struct unsave_state *us, unsigned int *val)
1569 {
1570 uint32_t d;
1571
1572 if (us->l < sizeof(d)) {
1573 return -EINVAL;
1574 }
1575 memcpy(&d, us->d, sizeof(d));
1576 *val = ntohl(d);
1577 us->d += sizeof(d);
1578 us->l -= sizeof(d);
1579 return 0;
1580 }
1581
1582 static int consume_string(struct unsave_state *us, char **str, size_t *len)
1583 {
1584 size_t l;
1585
1586 if (!us->l) {
1587 return -EINVAL;
1588 }
1589
1590 l = strnlen((void *)us->d, us->l);
1591 if (l == us->l) {
1592 return -EINVAL;
1593 }
1594
1595 if (str) {
1596 *str = (void *)us->d;
1597 }
1598 if (len) {
1599 *len = l;
1600 }
1601
1602 us->d += l + 1;
1603 us->l -= l + 1;
1604 return 0;
1605 }
1606
1607 static XsNode *lookup_node(XsNode *n, char *path)
1608 {
1609 char *slash = strchr(path, '/');
1610 XsNode *child;
1611
1612 if (path[0] == '\0') {
1613 return n;
1614 }
1615
1616 if (slash) {
1617 *slash = '\0';
1618 }
1619
1620 if (!n->children) {
1621 return NULL;
1622 }
1623 child = g_hash_table_lookup(n->children, path);
1624 if (!slash) {
1625 return child;
1626 }
1627
1628 *slash = '/';
1629 if (!child) {
1630 return NULL;
1631 }
1632 return lookup_node(child, slash + 1);
1633 }
1634
1635 static XsNode *lookup_tx_node(struct unsave_state *us, unsigned int tx_id)
1636 {
1637 XsTransaction *t;
1638 if (tx_id == us->s->root_tx) {
1639 return lookup_node(us->s->root, us->path + 1);
1640 }
1641
1642 t = g_hash_table_lookup(us->s->transactions, GINT_TO_POINTER(tx_id));
1643 if (!t) {
1644 return NULL;
1645 }
1646 g_assert(t->root);
1647 return lookup_node(t->root, us->path + 1);
1648 }
1649
1650 static void count_child_nodes(gpointer key, gpointer value, gpointer user_data)
1651 {
1652 unsigned int *nr_nodes = user_data;
1653 XsNode *n = value;
1654
1655 (*nr_nodes)++;
1656
1657 if (n->children) {
1658 g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
1659 }
1660 }
1661
1662 static int consume_node(struct unsave_state *us, XsNode **nodep,
1663 unsigned int *nr_nodes)
1664 {
1665 XsNode *n = NULL;
1666 uint8_t flags;
1667 int ret;
1668
1669 if (us->l < 1) {
1670 return -EINVAL;
1671 }
1672 flags = us->d[0];
1673 us->d++;
1674 us->l--;
1675
1676 if (flags == NODE_REF) {
1677 unsigned int tx;
1678
1679 ret = consume_be32(us, &tx);
1680 if (ret) {
1681 return ret;
1682 }
1683
1684 n = lookup_tx_node(us, tx);
1685 if (!n) {
1686 return -EINVAL;
1687 }
1688 n->ref++;
1689 if (n->children) {
1690 g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
1691 }
1692 } else {
1693 uint32_t datalen;
1694
1695 if (flags & ~(DELETED_IN_TX | MODIFIED_IN_TX)) {
1696 return -EINVAL;
1697 }
1698 n = xs_node_new();
1699
1700 if (flags & DELETED_IN_TX) {
1701 n->deleted_in_tx = true;
1702 }
1703 if (flags & MODIFIED_IN_TX) {
1704 n->modified_in_tx = true;
1705 }
1706 ret = consume_be32(us, &datalen);
1707 if (ret) {
1708 xs_node_unref(n);
1709 return -EINVAL;
1710 }
1711 if (datalen) {
1712 if (datalen > us->l) {
1713 xs_node_unref(n);
1714 return -EINVAL;
1715 }
1716
1717 GByteArray *node_data = g_byte_array_new();
1718 g_byte_array_append(node_data, us->d, datalen);
1719 us->d += datalen;
1720 us->l -= datalen;
1721 n->content = node_data;
1722
1723 if (us->root_walk) {
1724 n->modified_in_tx = true;
1725 }
1726 }
1727 while (1) {
1728 char *perm = NULL;
1729 size_t permlen = 0;
1730
1731 ret = consume_string(us, &perm, &permlen);
1732 if (ret) {
1733 xs_node_unref(n);
1734 return ret;
1735 }
1736
1737 if (!permlen) {
1738 break;
1739 }
1740
1741 n->perms = g_list_append(n->perms, g_strdup(perm));
1742 }
1743
1744 /* Now children */
1745 while (1) {
1746 size_t childlen;
1747 char *childname;
1748 char *pathend;
1749 XsNode *child = NULL;
1750
1751 ret = consume_string(us, &childname, &childlen);
1752 if (ret) {
1753 xs_node_unref(n);
1754 return ret;
1755 }
1756
1757 if (!childlen) {
1758 break;
1759 }
1760
1761 pathend = us->path + strlen(us->path);
1762 strncat(us->path, "/", sizeof(us->path) - 1);
1763 strncat(us->path, childname, sizeof(us->path) - 1);
1764
1765 ret = consume_node(us, &child, nr_nodes);
1766 *pathend = '\0';
1767 if (ret) {
1768 xs_node_unref(n);
1769 return ret;
1770 }
1771 g_assert(child);
1772 xs_node_add_child(n, childname, child);
1773 }
1774
1775 /*
1776 * If the node has no data and no children we still want to fire
1777 * a watch on it.
1778 */
1779 if (us->root_walk && !n->children) {
1780 n->modified_in_tx = true;
1781 }
1782 }
1783
1784 if (!n->deleted_in_tx) {
1785 (*nr_nodes)++;
1786 }
1787
1788 *nodep = n;
1789 return 0;
1790 }
1791
1792 static int consume_tree(struct unsave_state *us, XsTransaction *t)
1793 {
1794 int ret;
1795
1796 ret = consume_be32(us, &t->tx_id);
1797 if (ret) {
1798 return ret;
1799 }
1800
1801 if (t->tx_id > us->s->last_tx) {
1802 return -EINVAL;
1803 }
1804
1805 us->path[0] = '\0';
1806
1807 return consume_node(us, &t->root, &t->nr_nodes);
1808 }
1809
1810 int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes,
1811 unsigned int dom_id, xs_impl_watch_fn watch_fn,
1812 void *watch_opaque)
1813 {
1814 struct unsave_state us;
1815 XsTransaction base_t = { 0 };
1816 int ret;
1817
1818 us.s = s;
1819 us.bytes = bytes;
1820 us.d = bytes->data;
1821 us.l = bytes->len;
1822
1823 xs_impl_reset_watches(s, dom_id);
1824 g_hash_table_remove_all(s->transactions);
1825
1826 xs_node_unref(s->root);
1827 s->root = NULL;
1828 s->root_tx = s->last_tx = XBT_NULL;
1829
1830 ret = consume_be32(&us, &s->last_tx);
1831 if (ret) {
1832 return ret;
1833 }
1834
1835 /*
1836 * Consume the base tree into a transaction so that watches can be
1837 * fired as we commit it. By setting us.root_walk we cause the nodes
1838 * to be marked as 'modified_in_tx' as they are created, so that the
1839 * watches are triggered on them.
1840 */
1841 base_t.dom_id = dom_id;
1842 base_t.base_tx = XBT_NULL;
1843 us.root_walk = true;
1844 ret = consume_tree(&us, &base_t);
1845 if (ret) {
1846 return ret;
1847 }
1848 us.root_walk = false;
1849
1850 /*
1851 * Commit the transaction now while the refcount on all nodes is 1.
1852 * Note that we haven't yet reinstated the *guest* watches but that's
1853 * OK because we don't want the guest to see any changes. Even any
1854 * backend nodes which get recreated should be *precisely* as they
1855 * were before the migration. Back ends may have been instantiated
1856 * already, and will see the frontend magically blink into existence
1857 * now (well, from the aio_bh which fires the watches). It's their
1858 * responsibility to rebuild everything precisely as it was before.
1859 */
1860 ret = transaction_commit(s, &base_t);
1861 if (ret) {
1862 return ret;
1863 }
1864
1865 while (1) {
1866 unsigned int base_tx;
1867 XsTransaction *t;
1868
1869 ret = consume_be32(&us, &base_tx);
1870 if (ret) {
1871 return ret;
1872 }
1873 if (base_tx == XBT_NULL) {
1874 break;
1875 }
1876
1877 t = g_new0(XsTransaction, 1);
1878 t->base_tx = base_tx;
1879
1880 ret = consume_be32(&us, &t->dom_id);
1881 if (!ret) {
1882 ret = consume_tree(&us, t);
1883 }
1884 if (ret) {
1885 g_free(t);
1886 return ret;
1887 }
1888 g_assert(t->root);
1889 if (t->dom_id) {
1890 s->nr_domu_transactions++;
1891 }
1892 g_hash_table_insert(s->transactions, GINT_TO_POINTER(t->tx_id), t);
1893 }
1894
1895 while (1) {
1896 char *path, *token;
1897 size_t pathlen, toklen;
1898
1899 ret = consume_string(&us, &path, &pathlen);
1900 if (ret) {
1901 return ret;
1902 }
1903 if (!pathlen) {
1904 break;
1905 }
1906
1907 ret = consume_string(&us, &token, &toklen);
1908 if (ret) {
1909 return ret;
1910 }
1911
1912 if (!watch_fn) {
1913 continue;
1914 }
1915
1916 ret = do_xs_impl_watch(s, dom_id, path, token, watch_fn, watch_opaque);
1917 if (ret) {
1918 return ret;
1919 }
1920 }
1921
1922 if (us.l) {
1923 return -EINVAL;
1924 }
1925
1926 return 0;
1927 }