2 * QEMU Xen emulation: The actual implementation of XenStore
4 * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6 * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org>
8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
9 * See the COPYING file in the top-level directory.
12 #include "qemu/osdep.h"
13 #include "qom/object.h"
15 #include "hw/xen/xen.h"
17 #include "xen_xenstore.h"
18 #include "xenstore_impl.h"
20 #include "hw/xen/interface/io/xs_wire.h"
22 #define XS_MAX_WATCHES 128
23 #define XS_MAX_DOMAIN_NODES 1000
24 #define XS_MAX_NODE_SIZE 2048
25 #define XS_MAX_TRANSACTIONS 10
26 #define XS_MAX_PERMS_PER_NODE 5
28 #define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
29 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
32 typedef struct XsNode
{
40 unsigned int serialized_tx
;
41 #ifdef XS_NODE_UNIT_TEST
42 gchar
*name
; /* debug only */
46 typedef struct XsWatch
{
55 typedef struct XsTransaction
{
57 unsigned int nr_nodes
;
63 struct XenstoreImplState
{
65 unsigned int nr_nodes
;
67 unsigned int nr_domu_watches
;
68 GHashTable
*transactions
;
69 unsigned int nr_domu_transactions
;
76 static void nobble_tx(gpointer key
, gpointer value
, gpointer user_data
)
78 unsigned int *new_tx_id
= user_data
;
79 XsTransaction
*tx
= value
;
81 if (tx
->base_tx
== *new_tx_id
) {
82 /* Transactions based on XBT_NULL will always fail */
83 tx
->base_tx
= XBT_NULL
;
87 static inline unsigned int next_tx(struct XenstoreImplState
*s
)
91 /* Find the next TX id which isn't either XBT_NULL or in use. */
94 } while (tx_id
== XBT_NULL
|| tx_id
== s
->root_tx
||
95 g_hash_table_lookup(s
->transactions
, GINT_TO_POINTER(tx_id
)));
98 * It is vanishingly unlikely, but ensure that no outstanding transaction
99 * is based on the (previous incarnation of the) newly-allocated TX id.
101 g_hash_table_foreach(s
->transactions
, nobble_tx
, &tx_id
);
106 static inline XsNode
*xs_node_new(void)
108 XsNode
*n
= g_new0(XsNode
, 1);
111 #ifdef XS_NODE_UNIT_TEST
113 xs_node_list
= g_list_prepend(xs_node_list
, n
);
118 static inline XsNode
*xs_node_ref(XsNode
*n
)
120 /* With just 10 transactions, it can never get anywhere near this. */
121 g_assert(n
->ref
< INT_MAX
);
128 static inline void xs_node_unref(XsNode
*n
)
139 g_byte_array_unref(n
->content
);
142 g_list_free_full(n
->perms
, g_free
);
145 g_hash_table_unref(n
->children
);
147 #ifdef XS_NODE_UNIT_TEST
150 xs_node_list
= g_list_remove(xs_node_list
, n
);
155 char *xs_perm_as_string(unsigned int perm
, unsigned int domid
)
160 case XS_PERM_READ
| XS_PERM_WRITE
:
175 return g_strdup_printf("%c%u", letter
, domid
);
178 static gpointer
do_perm_copy(gconstpointer src
, gpointer user_data
)
180 return g_strdup(src
);
183 static XsNode
*xs_node_create(const char *name
, GList
*perms
)
185 XsNode
*n
= xs_node_new();
187 #ifdef XS_NODE_UNIT_TEST
189 n
->name
= g_strdup(name
);
193 n
->perms
= g_list_copy_deep(perms
, do_perm_copy
, NULL
);
198 /* For copying from one hash table to another using g_hash_table_foreach() */
199 static void do_child_insert(gpointer key
, gpointer value
, gpointer user_data
)
201 g_hash_table_insert(user_data
, g_strdup(key
), xs_node_ref(value
));
204 static XsNode
*xs_node_copy(XsNode
*old
)
206 XsNode
*n
= xs_node_new();
208 n
->gencnt
= old
->gencnt
;
210 #ifdef XS_NODE_UNIT_TEST
212 n
->name
= g_strdup(old
->name
);
218 n
->children
= g_hash_table_new_full(g_str_hash
, g_str_equal
, g_free
,
219 (GDestroyNotify
)xs_node_unref
);
220 g_hash_table_foreach(old
->children
, do_child_insert
, n
->children
);
223 n
->perms
= g_list_copy_deep(old
->perms
, do_perm_copy
, NULL
);
226 n
->content
= g_byte_array_ref(old
->content
);
231 /* Returns true if it made a change to the hash table */
232 static bool xs_node_add_child(XsNode
*n
, const char *path_elem
, XsNode
*child
)
234 assert(!strchr(path_elem
, '/'));
238 return g_hash_table_remove(n
->children
, path_elem
);
241 #ifdef XS_NODE_UNIT_TEST
243 child
->name
= g_strdup(path_elem
);
246 n
->children
= g_hash_table_new_full(g_str_hash
, g_str_equal
, g_free
,
247 (GDestroyNotify
)xs_node_unref
);
251 * The documentation for g_hash_table_insert() says that it "returns a
252 * boolean value to indicate whether the newly added value was already
253 * in the hash table or not."
255 * It could perhaps be clearer that returning TRUE means it wasn't,
257 return g_hash_table_insert(n
->children
, g_strdup(path_elem
), child
);
261 struct XenstoreImplState
*s
;
262 char path
[XENSTORE_ABS_PATH_MAX
+ 2]; /* Two NUL terminators */
263 int (*op_fn
)(XsNode
**n
, struct walk_op
*op
);
271 /* The number of nodes which will exist in the tree if this op succeeds. */
272 unsigned int new_nr_nodes
;
275 * This is maintained on the way *down* the walk to indicate
276 * whether nodes can be modified in place or whether COW is
277 * required. It starts off being true, as we're always going to
278 * replace the root node. If we walk into a shared subtree it
279 * becomes false. If we start *creating* new nodes for a write,
280 * it becomes true again.
282 * Do not use it on the way back up.
289 /* Tracking during recursion so we know which is first. */
293 static void fire_watches(struct walk_op
*op
, bool parents
)
298 if (!op
->mutating
|| op
->in_transaction
) {
306 w
= g_hash_table_lookup(op
->s
->watches
, op
->path
);
309 /* Fire the parent nodes from 'op' if asked to */
315 assert(strlen(op
->path
) > w
->rel_prefix
);
316 w
->cb(w
->cb_opaque
, op
->path
+ w
->rel_prefix
, w
->token
);
322 static int xs_node_add_content(XsNode
**n
, struct walk_op
*op
)
324 GByteArray
*data
= op
->op_opaque
;
328 * The real XenStored includes permissions and names of child nodes
329 * in the calculated datasize but life's too short. For a single
330 * tenant internal XenStore, we don't have to be quite as pedantic.
332 if (data
->len
> XS_MAX_NODE_SIZE
) {
336 /* We *are* the node to be written. Either this or a copy. */
339 *n
= xs_node_copy(old
);
344 g_byte_array_unref((*n
)->content
);
346 (*n
)->content
= g_byte_array_ref(data
);
347 if (op
->tx_id
!= XBT_NULL
) {
348 (*n
)->modified_in_tx
= true;
353 static int xs_node_get_content(XsNode
**n
, struct walk_op
*op
)
355 GByteArray
*data
= op
->op_opaque
;
356 GByteArray
*node_data
;
361 node_data
= (*n
)->content
;
363 g_byte_array_append(data
, node_data
->data
, node_data
->len
);
369 static int node_rm_recurse(gpointer key
, gpointer value
, gpointer user_data
)
371 struct walk_op
*op
= user_data
;
372 int path_len
= strlen(op
->path
);
373 int key_len
= strlen(key
);
375 bool this_inplace
= op
->inplace
;
381 assert(key_len
+ path_len
+ 2 <= sizeof(op
->path
));
382 op
->path
[path_len
] = '/';
383 memcpy(op
->path
+ path_len
+ 1, key
, key_len
+ 1);
386 g_hash_table_foreach_remove(n
->children
, node_rm_recurse
, op
);
391 * Fire watches on *this* node but not the parents because they are
392 * going to be deleted too, so the watch will fire for them anyway.
394 fire_watches(op
, false);
395 op
->path
[path_len
] = '\0';
398 * Actually deleting the child here is just an optimisation; if we
399 * don't then the final unref on the topmost victim will just have
400 * to cascade down again repeating all the g_hash_table_foreach()
406 static XsNode
*xs_node_copy_deleted(XsNode
*old
, struct walk_op
*op
);
407 static void copy_deleted_recurse(gpointer key
, gpointer value
,
410 struct walk_op
*op
= user_data
;
411 GHashTable
*siblings
= op
->op_opaque2
;
412 XsNode
*n
= xs_node_copy_deleted(value
, op
);
415 * Reinsert the deleted_in_tx copy of the node into the parent's
416 * 'children' hash table. Having stashed it from op->op_opaque2
417 * before the recursive call to xs_node_copy_deleted() scribbled
420 g_hash_table_insert(siblings
, g_strdup(key
), n
);
423 static XsNode
*xs_node_copy_deleted(XsNode
*old
, struct walk_op
*op
)
425 XsNode
*n
= xs_node_new();
427 n
->gencnt
= old
->gencnt
;
429 #ifdef XS_NODE_UNIT_TEST
431 n
->name
= g_strdup(old
->name
);
436 n
->children
= g_hash_table_new_full(g_str_hash
, g_str_equal
, g_free
,
437 (GDestroyNotify
)xs_node_unref
);
438 op
->op_opaque2
= n
->children
;
439 g_hash_table_foreach(old
->children
, copy_deleted_recurse
, op
);
442 n
->perms
= g_list_copy_deep(old
->perms
, do_perm_copy
, NULL
);
444 n
->deleted_in_tx
= true;
445 /* If it gets resurrected we only fire a watch if it lost its content */
447 n
->modified_in_tx
= true;
453 static int xs_node_rm(XsNode
**n
, struct walk_op
*op
)
455 bool this_inplace
= op
->inplace
;
457 if (op
->tx_id
!= XBT_NULL
) {
458 /* It's not trivial to do inplace handling for this one */
460 *n
= xs_node_copy_deleted(old
, op
);
465 /* Fire watches for, and count, nodes in the subtree which get deleted */
466 if ((*n
)->children
) {
467 g_hash_table_foreach_remove((*n
)->children
, node_rm_recurse
, op
);
478 static int xs_node_get_perms(XsNode
**n
, struct walk_op
*op
)
480 GList
**perms
= op
->op_opaque
;
485 *perms
= g_list_copy_deep((*n
)->perms
, do_perm_copy
, NULL
);
489 static void parse_perm(const char *perm
, char *letter
, unsigned int *dom_id
)
491 unsigned int n
= sscanf(perm
, "%c%u", letter
, dom_id
);
496 static bool can_access(unsigned int dom_id
, GList
*perms
, const char *letters
)
500 unsigned int perm_dom_id
;
507 n
= g_list_length(perms
);
511 * The dom_id of the first perm is the owner, and the owner always has
514 parse_perm(g_list_nth_data(perms
, 0), &perm_letter
, &perm_dom_id
);
515 if (dom_id
== perm_dom_id
) {
520 * The letter of the first perm specified the default access for all other
523 access
= !!strchr(letters
, perm_letter
);
524 for (i
= 1; i
< n
; i
++) {
525 parse_perm(g_list_nth_data(perms
, i
), &perm_letter
, &perm_dom_id
);
526 if (dom_id
!= perm_dom_id
) {
529 access
= !!strchr(letters
, perm_letter
);
535 static int xs_node_set_perms(XsNode
**n
, struct walk_op
*op
)
537 GList
*perms
= op
->op_opaque
;
540 unsigned int perm_dom_id
;
543 /* A guest may not change permissions on nodes it does not own */
544 if (!can_access(op
->dom_id
, (*n
)->perms
, "")) {
548 /* A guest may not change the owner of a node it owns. */
549 parse_perm(perms
->data
, &perm_letter
, &perm_dom_id
);
550 if (perm_dom_id
!= op
->dom_id
) {
554 if (g_list_length(perms
) > XS_MAX_PERMS_PER_NODE
) {
559 /* We *are* the node to be written. Either this or a copy. */
562 *n
= xs_node_copy(old
);
567 g_list_free_full((*n
)->perms
, g_free
);
569 (*n
)->perms
= g_list_copy_deep(perms
, do_perm_copy
, NULL
);
570 if (op
->tx_id
!= XBT_NULL
) {
571 (*n
)->modified_in_tx
= true;
577 * Passed a full reference in *n which it may free if it needs to COW.
579 * When changing the tree, the op->inplace flag indicates whether this
580 * node may be modified in place (i.e. it and all its parents had a
581 * refcount of one). If walking down the tree we find a node whose
582 * refcount is higher, we must clear op->inplace and COW from there
583 * down. Unless we are creating new nodes as scaffolding for a write
584 * (which works like 'mkdir -p' does). In which case those newly
585 * created nodes can (and must) be modified in place again.
587 static int xs_node_walk(XsNode
**n
, struct walk_op
*op
)
589 char *child_name
= NULL
;
591 XsNode
*old
= *n
, *child
= NULL
;
592 bool stole_child
= false;
597 namelen
= strlen(op
->path
);
598 watch
= g_hash_table_lookup(op
->s
->watches
, op
->path
);
600 /* Is there a child, or do we hit the double-NUL termination? */
601 if (op
->path
[namelen
+ 1]) {
603 child_name
= op
->path
+ namelen
+ 1;
604 slash
= strchr(child_name
, '/');
608 op
->path
[namelen
] = '/';
611 /* If we walk into a subtree which is shared, we must COW */
612 if (op
->mutating
&& old
->ref
!= 1) {
617 const char *letters
= op
->mutating
? "wb" : "rb";
619 if (!can_access(op
->dom_id
, old
->perms
, letters
)) {
624 /* This is the actual node on which the operation shall be performed */
625 err
= op
->op_fn(n
, op
);
627 fire_watches(op
, true);
632 /* op->inplace will be further modified during the recursion */
633 this_inplace
= op
->inplace
;
635 if (old
&& old
->children
) {
636 child
= g_hash_table_lookup(old
->children
, child_name
);
637 /* This is a *weak* reference to 'child', owned by the hash table */
641 if (child
->deleted_in_tx
) {
642 assert(child
->ref
== 1);
643 /* Cannot actually set child->deleted_in_tx = false until later */
647 * Now we own it too. But if we can modify inplace, that's going to
648 * foil the check and force it to COW. We want to be the *only* owner
649 * so that it can be modified in place, so remove it from the hash
650 * table in that case. We'll add it (or its replacement) back later.
652 if (op
->mutating
&& this_inplace
) {
653 g_hash_table_remove(old
->children
, child_name
);
656 } else if (op
->create_dirs
) {
657 assert(op
->mutating
);
659 if (!can_access(op
->dom_id
, old
->perms
, "wb")) {
664 if (op
->dom_id
&& op
->new_nr_nodes
>= XS_MAX_DOMAIN_NODES
) {
669 child
= xs_node_create(child_name
, old
->perms
);
673 * If we're creating a new child, we can clearly modify it (and its
674 * children) in place from here on down.
683 * If there's a watch on this node, add it to the list to be fired
684 * (with the correct full pathname for the modified node) at the end.
687 op
->watches
= g_list_append(op
->watches
, watch
);
691 * Except for the temporary child-stealing as noted, our node has not
692 * changed yet. We don't yet know the overall operation will complete.
694 err
= xs_node_walk(&child
, op
);
697 op
->watches
= g_list_remove(op
->watches
, watch
);
700 if (err
|| !op
->mutating
) {
702 /* Put it back as it was. */
703 g_hash_table_replace(old
->children
, g_strdup(child_name
), child
);
705 xs_node_unref(child
);
711 * Now we know the operation has completed successfully and we're on
712 * the way back up. Make the change, substituting 'child' in the
716 *n
= xs_node_copy(old
);
721 * If we resurrected a deleted_in_tx node, we can mark it as no longer
722 * deleted now that we know the overall operation has succeeded.
724 if (op
->create_dirs
&& child
&& child
->deleted_in_tx
) {
726 child
->deleted_in_tx
= false;
730 * The child may be NULL here, for a remove operation. Either way,
731 * xs_node_add_child() will do the right thing and return a value
732 * indicating whether it changed the parent's hash table or not.
734 * We bump the parent gencnt if it adds a child that we *didn't*
735 * steal from it in the first place, or if child==NULL and was
736 * thus removed (whether we stole it earlier and didn't put it
737 * back, or xs_node_add_child() actually removed it now).
739 if ((xs_node_add_child(*n
, child_name
, child
) && !stole_child
) || !child
) {
744 op
->path
[namelen
] = '\0';
746 assert(!op
->watches
);
748 * On completing the recursion back up the path walk and reaching the
749 * top, assign the new node count if the operation was successful. If
750 * the main tree was changed, bump its tx ID so that outstanding
751 * transactions correctly fail. But don't bump it every time; only
752 * if it makes a difference.
754 if (!err
&& op
->mutating
) {
755 if (!op
->in_transaction
) {
756 if (op
->s
->root_tx
!= op
->s
->last_tx
) {
757 op
->s
->root_tx
= next_tx(op
->s
);
759 op
->s
->nr_nodes
= op
->new_nr_nodes
;
761 XsTransaction
*tx
= g_hash_table_lookup(op
->s
->transactions
,
762 GINT_TO_POINTER(op
->tx_id
));
764 tx
->nr_nodes
= op
->new_nr_nodes
;
771 static void append_directory_item(gpointer key
, gpointer value
,
774 GList
**items
= user_data
;
776 *items
= g_list_insert_sorted(*items
, g_strdup(key
), (GCompareFunc
)strcmp
);
779 /* Populates items with char * names which caller must free. */
780 static int xs_node_directory(XsNode
**n
, struct walk_op
*op
)
782 GList
**items
= op
->op_opaque
;
787 if ((*n
)->children
) {
788 g_hash_table_foreach((*n
)->children
, append_directory_item
, items
);
791 if (op
->op_opaque2
) {
792 *(uint64_t *)op
->op_opaque2
= (*n
)->gencnt
;
798 static int validate_path(char *outpath
, const char *userpath
,
801 size_t i
, pathlen
= strlen(userpath
);
803 if (!pathlen
|| userpath
[pathlen
] == '/' || strstr(userpath
, "//")) {
806 for (i
= 0; i
< pathlen
; i
++) {
807 if (!strchr(XS_VALID_CHARS
, userpath
[i
])) {
811 if (userpath
[0] == '/') {
812 if (pathlen
> XENSTORE_ABS_PATH_MAX
) {
815 memcpy(outpath
, userpath
, pathlen
+ 1);
817 if (pathlen
> XENSTORE_REL_PATH_MAX
) {
820 snprintf(outpath
, XENSTORE_ABS_PATH_MAX
, "/local/domain/%u/%s", dom_id
,
827 static int init_walk_op(XenstoreImplState
*s
, struct walk_op
*op
,
828 xs_transaction_t tx_id
, unsigned int dom_id
,
829 const char *path
, XsNode
***rootp
)
831 int ret
= validate_path(op
->path
, path
, dom_id
);
837 * We use *two* NUL terminators at the end of the path, as during the walk
838 * we will temporarily turn each '/' into a NUL to allow us to use that
839 * path element for the lookup.
841 op
->path
[strlen(op
->path
) + 1] = '\0';
845 op
->mutating
= false;
846 op
->create_dirs
= false;
847 op
->in_transaction
= false;
852 if (tx_id
== XBT_NULL
) {
854 op
->new_nr_nodes
= s
->nr_nodes
;
856 XsTransaction
*tx
= g_hash_table_lookup(s
->transactions
,
857 GINT_TO_POINTER(tx_id
));
862 op
->new_nr_nodes
= tx
->nr_nodes
;
863 op
->in_transaction
= true;
869 int xs_impl_read(XenstoreImplState
*s
, unsigned int dom_id
,
870 xs_transaction_t tx_id
, const char *path
, GByteArray
*data
)
873 * The data GByteArray shall exist, and will be freed by caller.
874 * Just g_byte_array_append() to it.
880 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
884 op
.op_fn
= xs_node_get_content
;
886 return xs_node_walk(n
, &op
);
889 int xs_impl_write(XenstoreImplState
*s
, unsigned int dom_id
,
890 xs_transaction_t tx_id
, const char *path
, GByteArray
*data
)
893 * The data GByteArray shall exist, will be freed by caller. You are
894 * free to use g_byte_array_steal() and keep the data. Or just ref it.
900 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
904 op
.op_fn
= xs_node_add_content
;
907 op
.create_dirs
= true;
908 return xs_node_walk(n
, &op
);
911 int xs_impl_directory(XenstoreImplState
*s
, unsigned int dom_id
,
912 xs_transaction_t tx_id
, const char *path
,
913 uint64_t *gencnt
, GList
**items
)
916 * The items are (char *) to be freed by caller. Although it's consumed
917 * immediately so if you want to change it to (const char *) and keep
918 * them, go ahead and change the caller.
924 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
928 op
.op_fn
= xs_node_directory
;
929 op
.op_opaque
= items
;
930 op
.op_opaque2
= gencnt
;
931 return xs_node_walk(n
, &op
);
934 int xs_impl_transaction_start(XenstoreImplState
*s
, unsigned int dom_id
,
935 xs_transaction_t
*tx_id
)
939 if (*tx_id
!= XBT_NULL
) {
943 if (dom_id
&& s
->nr_domu_transactions
>= XS_MAX_TRANSACTIONS
) {
947 tx
= g_new0(XsTransaction
, 1);
949 tx
->nr_nodes
= s
->nr_nodes
;
950 tx
->tx_id
= next_tx(s
);
951 tx
->base_tx
= s
->root_tx
;
952 tx
->root
= xs_node_ref(s
->root
);
955 g_hash_table_insert(s
->transactions
, GINT_TO_POINTER(tx
->tx_id
), tx
);
957 s
->nr_domu_transactions
++;
963 static gboolean
tx_commit_walk(gpointer key
, gpointer value
,
966 struct walk_op
*op
= user_data
;
967 int path_len
= strlen(op
->path
);
968 int key_len
= strlen(key
);
969 bool fire_parents
= true;
977 if (n
->deleted_in_tx
) {
979 * We fire watches on our parents if we are the *first* node
980 * to be deleted (the topmost one). This matches the behaviour
981 * when deleting in the live tree.
983 fire_parents
= !op
->deleted_in_tx
;
985 /* Only used on the way down so no need to clear it later */
986 op
->deleted_in_tx
= true;
989 assert(key_len
+ path_len
+ 2 <= sizeof(op
->path
));
990 op
->path
[path_len
] = '/';
991 memcpy(op
->path
+ path_len
+ 1, key
, key_len
+ 1);
993 watch
= g_hash_table_lookup(op
->s
->watches
, op
->path
);
995 op
->watches
= g_list_append(op
->watches
, watch
);
999 g_hash_table_foreach_remove(n
->children
, tx_commit_walk
, op
);
1003 op
->watches
= g_list_remove(op
->watches
, watch
);
1007 * Don't fire watches if this node was only copied because a
1008 * descendent was changed. The modified_in_tx flag indicates the
1009 * ones which were really changed.
1011 if (n
->modified_in_tx
|| n
->deleted_in_tx
) {
1012 fire_watches(op
, fire_parents
);
1013 n
->modified_in_tx
= false;
1015 op
->path
[path_len
] = '\0';
1017 /* Deleted nodes really do get expunged when we commit */
1018 return n
->deleted_in_tx
;
1021 static int transaction_commit(XenstoreImplState
*s
, XsTransaction
*tx
)
1026 if (s
->root_tx
!= tx
->base_tx
) {
1029 xs_node_unref(s
->root
);
1032 s
->root_tx
= tx
->tx_id
;
1033 s
->nr_nodes
= tx
->nr_nodes
;
1035 init_walk_op(s
, &op
, XBT_NULL
, tx
->dom_id
, "/", &n
);
1036 op
.deleted_in_tx
= false;
1040 * Walk the new root and fire watches on any node which has a
1041 * refcount of one (which is therefore unique to this transaction).
1043 if (s
->root
->children
) {
1044 g_hash_table_foreach_remove(s
->root
->children
, tx_commit_walk
, &op
);
1050 int xs_impl_transaction_end(XenstoreImplState
*s
, unsigned int dom_id
,
1051 xs_transaction_t tx_id
, bool commit
)
1054 XsTransaction
*tx
= g_hash_table_lookup(s
->transactions
,
1055 GINT_TO_POINTER(tx_id
));
1057 if (!tx
|| tx
->dom_id
!= dom_id
) {
1062 ret
= transaction_commit(s
, tx
);
1065 g_hash_table_remove(s
->transactions
, GINT_TO_POINTER(tx_id
));
1067 assert(s
->nr_domu_transactions
);
1068 s
->nr_domu_transactions
--;
1073 int xs_impl_rm(XenstoreImplState
*s
, unsigned int dom_id
,
1074 xs_transaction_t tx_id
, const char *path
)
1080 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
1084 op
.op_fn
= xs_node_rm
;
1086 return xs_node_walk(n
, &op
);
1089 int xs_impl_get_perms(XenstoreImplState
*s
, unsigned int dom_id
,
1090 xs_transaction_t tx_id
, const char *path
, GList
**perms
)
1096 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
1100 op
.op_fn
= xs_node_get_perms
;
1101 op
.op_opaque
= perms
;
1102 return xs_node_walk(n
, &op
);
1105 static void is_valid_perm(gpointer data
, gpointer user_data
)
1108 bool *valid
= user_data
;
1110 unsigned int dom_id
;
1116 if (sscanf(perm
, "%c%u", &letter
, &dom_id
) != 2) {
1134 int xs_impl_set_perms(XenstoreImplState
*s
, unsigned int dom_id
,
1135 xs_transaction_t tx_id
, const char *path
, GList
*perms
)
1142 if (!g_list_length(perms
)) {
1146 g_list_foreach(perms
, is_valid_perm
, &valid
);
1151 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
1155 op
.op_fn
= xs_node_set_perms
;
1156 op
.op_opaque
= perms
;
1158 return xs_node_walk(n
, &op
);
1161 static int do_xs_impl_watch(XenstoreImplState
*s
, unsigned int dom_id
,
1162 const char *path
, const char *token
,
1163 xs_impl_watch_fn fn
, void *opaque
)
1166 char abspath
[XENSTORE_ABS_PATH_MAX
+ 1];
1170 ret
= validate_path(abspath
, path
, dom_id
);
1175 /* Check for duplicates */
1176 l
= w
= g_hash_table_lookup(s
->watches
, abspath
);
1178 if (!g_strcmp0(token
, w
->token
) && opaque
== w
->cb_opaque
&&
1179 fn
== w
->cb
&& dom_id
== w
->dom_id
) {
1185 if (dom_id
&& s
->nr_domu_watches
>= XS_MAX_WATCHES
) {
1189 w
= g_new0(XsWatch
, 1);
1190 w
->token
= g_strdup(token
);
1192 w
->cb_opaque
= opaque
;
1194 w
->rel_prefix
= strlen(abspath
) - strlen(path
);
1196 /* l was looked up above when checking for duplicates */
1201 g_hash_table_insert(s
->watches
, g_strdup(abspath
), w
);
1204 s
->nr_domu_watches
++;
1210 int xs_impl_watch(XenstoreImplState
*s
, unsigned int dom_id
, const char *path
,
1211 const char *token
, xs_impl_watch_fn fn
, void *opaque
)
1213 int ret
= do_xs_impl_watch(s
, dom_id
, path
, token
, fn
, opaque
);
1216 /* A new watch should fire immediately */
1217 fn(opaque
, path
, token
);
1223 static XsWatch
*free_watch(XenstoreImplState
*s
, XsWatch
*w
)
1225 XsWatch
*next
= w
->next
;
1228 assert(s
->nr_domu_watches
);
1229 s
->nr_domu_watches
--;
1238 int xs_impl_unwatch(XenstoreImplState
*s
, unsigned int dom_id
,
1239 const char *path
, const char *token
,
1240 xs_impl_watch_fn fn
, void *opaque
)
1242 char abspath
[XENSTORE_ABS_PATH_MAX
+ 1];
1246 ret
= validate_path(abspath
, path
, dom_id
);
1251 w
= g_hash_table_lookup(s
->watches
, abspath
);
1257 * The hash table contains the first element of a list of
1258 * watches. Removing the first element in the list is a
1259 * special case because we have to update the hash table to
1260 * point to the next (or remove it if there's nothing left).
1262 if (!g_strcmp0(token
, w
->token
) && fn
== w
->cb
&& opaque
== w
->cb_opaque
&&
1263 dom_id
== w
->dom_id
) {
1265 /* Insert the previous 'next' into the hash table */
1266 g_hash_table_insert(s
->watches
, g_strdup(abspath
), w
->next
);
1268 /* Nothing left; remove from hash table */
1269 g_hash_table_remove(s
->watches
, abspath
);
1276 * We're all done messing with the hash table because the element
1277 * it points to has survived the cull. Now it's just a simple
1278 * linked list removal operation.
1280 for (l
= &w
->next
; *l
; l
= &w
->next
) {
1283 if (!g_strcmp0(token
, w
->token
) && fn
== w
->cb
&&
1284 opaque
!= w
->cb_opaque
&& dom_id
== w
->dom_id
) {
1285 *l
= free_watch(s
, w
);
1293 int xs_impl_reset_watches(XenstoreImplState
*s
, unsigned int dom_id
)
1296 guint nr_watch_paths
;
1299 watch_paths
= (char **)g_hash_table_get_keys_as_array(s
->watches
,
1302 for (i
= 0; i
< nr_watch_paths
; i
++) {
1303 XsWatch
*w1
= g_hash_table_lookup(s
->watches
, watch_paths
[i
]);
1304 XsWatch
*w2
, *w
, **l
;
1307 * w1 is the original list. The hash table has this pointer.
1308 * w2 is the head of our newly-filtered list.
1309 * w and l are temporary for processing. w is somewhat redundant
1310 * with *l but makes my eyes bleed less.
1316 if (w
->dom_id
== dom_id
) {
1317 /* If we're freeing the head of the list, bump w2 */
1321 *l
= free_watch(s
, w
);
1328 * If the head of the list survived the cull, we don't need to
1329 * touch the hash table and we're done with this path. Else...
1332 g_hash_table_steal(s
->watches
, watch_paths
[i
]);
1335 * It was already freed. (Don't worry, this whole thing is
1336 * single-threaded and nobody saw it in the meantime). And
1337 * having *stolen* it, we now own the watch_paths[i] string
1338 * so if we don't give it back to the hash table, we need
1342 g_hash_table_insert(s
->watches
, watch_paths
[i
], w2
);
1344 g_free(watch_paths
[i
]);
1348 g_free(watch_paths
);
1352 static void xs_tx_free(void *_tx
)
1354 XsTransaction
*tx
= _tx
;
1356 xs_node_unref(tx
->root
);
1361 XenstoreImplState
*xs_impl_create(unsigned int dom_id
)
1363 XenstoreImplState
*s
= g_new0(XenstoreImplState
, 1);
1366 s
->watches
= g_hash_table_new_full(g_str_hash
, g_str_equal
, g_free
, NULL
);
1367 s
->transactions
= g_hash_table_new_full(g_direct_hash
, g_direct_equal
,
1370 perms
= g_list_append(NULL
, xs_perm_as_string(XS_PERM_NONE
, 0));
1371 s
->root
= xs_node_create("/", perms
);
1372 g_list_free_full(perms
, g_free
);
1375 s
->root_tx
= s
->last_tx
= 1;
1380 static void clear_serialized_tx(gpointer key
, gpointer value
, gpointer opaque
)
1384 n
->serialized_tx
= XBT_NULL
;
1386 g_hash_table_foreach(n
->children
, clear_serialized_tx
, NULL
);
1390 static void clear_tx_serialized_tx(gpointer key
, gpointer value
,
1393 XsTransaction
*t
= value
;
1395 clear_serialized_tx(NULL
, t
->root
, NULL
);
1398 static void write_be32(GByteArray
*save
, uint32_t val
)
1400 uint32_t be
= htonl(val
);
1401 g_byte_array_append(save
, (void *)&be
, sizeof(be
));
1410 #define MODIFIED_IN_TX (1U << 0)
1411 #define DELETED_IN_TX (1U << 1)
1412 #define NODE_REF (1U << 2)
1414 static void save_node(gpointer key
, gpointer value
, gpointer opaque
)
1416 struct save_state
*ss
= opaque
;
1421 /* Child nodes (i.e. anything but the root) have a name */
1423 g_byte_array_append(ss
->bytes
, key
, strlen(key
) + 1);
1427 * If we already wrote this node, refer to the previous copy.
1428 * There's no rename/move in XenStore, so all we need to find
1429 * it is the tx_id of the transation in which it exists. Which
1430 * may be the root tx.
1432 if (n
->serialized_tx
!= XBT_NULL
) {
1434 g_byte_array_append(ss
->bytes
, &flag
, 1);
1435 write_be32(ss
->bytes
, n
->serialized_tx
);
1438 n
->serialized_tx
= ss
->tx_id
;
1440 if (n
->modified_in_tx
) {
1441 flag
|= MODIFIED_IN_TX
;
1443 if (n
->deleted_in_tx
) {
1444 flag
|= DELETED_IN_TX
;
1446 g_byte_array_append(ss
->bytes
, &flag
, 1);
1449 write_be32(ss
->bytes
, n
->content
->len
);
1450 g_byte_array_append(ss
->bytes
, n
->content
->data
, n
->content
->len
);
1452 write_be32(ss
->bytes
, 0);
1455 for (l
= n
->perms
; l
; l
= l
->next
) {
1456 g_byte_array_append(ss
->bytes
, l
->data
, strlen(l
->data
) + 1);
1458 /* NUL termination after perms */
1459 g_byte_array_append(ss
->bytes
, (void *)"", 1);
1462 g_hash_table_foreach(n
->children
, save_node
, ss
);
1464 /* NUL termination after children (child name is NUL) */
1465 g_byte_array_append(ss
->bytes
, (void *)"", 1);
1469 static void save_tree(struct save_state
*ss
, uint32_t tx_id
, XsNode
*root
)
1471 write_be32(ss
->bytes
, tx_id
);
1473 save_node(NULL
, root
, ss
);
1476 static void save_tx(gpointer key
, gpointer value
, gpointer opaque
)
1478 uint32_t tx_id
= GPOINTER_TO_INT(key
);
1479 struct save_state
*ss
= opaque
;
1480 XsTransaction
*n
= value
;
1482 write_be32(ss
->bytes
, n
->base_tx
);
1483 write_be32(ss
->bytes
, n
->dom_id
);
1485 save_tree(ss
, tx_id
, n
->root
);
1488 static void save_watch(gpointer key
, gpointer value
, gpointer opaque
)
1490 struct save_state
*ss
= opaque
;
1493 /* We only save the *guest* watches. */
1495 gpointer relpath
= key
+ w
->rel_prefix
;
1496 g_byte_array_append(ss
->bytes
, relpath
, strlen(relpath
) + 1);
1497 g_byte_array_append(ss
->bytes
, (void *)w
->token
, strlen(w
->token
) + 1);
1501 GByteArray
*xs_impl_serialize(XenstoreImplState
*s
)
1503 struct save_state ss
;
1505 ss
.bytes
= g_byte_array_new();
1508 * node = flags [ real_node / node_ref ]
1509 * flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF)
1510 * node_ref = tx_id (in which the original version of this node exists)
1511 * real_node = content perms child* NUL
1512 * content = len data
1514 * data = uint8_t{len}
1523 * transaction = base_tx_id dom_id tree
1524 * base_tx_id = uint32_t
1527 * tx_list = tree transaction* XBT_NULL
1529 * watch = path token
1533 * watch_list = watch* NUL
1535 * xs_serialize_stream = last_tx tx_list watch_list
1536 * last_tx = uint32_t
1539 /* Clear serialized_tx in every node. */
1540 if (s
->serialized
) {
1541 clear_serialized_tx(NULL
, s
->root
, NULL
);
1542 g_hash_table_foreach(s
->transactions
, clear_tx_serialized_tx
, NULL
);
1545 s
->serialized
= true;
1547 write_be32(ss
.bytes
, s
->last_tx
);
1548 save_tree(&ss
, s
->root_tx
, s
->root
);
1549 g_hash_table_foreach(s
->transactions
, save_tx
, &ss
);
1551 write_be32(ss
.bytes
, XBT_NULL
);
1553 g_hash_table_foreach(s
->watches
, save_watch
, &ss
);
1554 g_byte_array_append(ss
.bytes
, (void *)"", 1);
1559 struct unsave_state
{
1560 char path
[XENSTORE_ABS_PATH_MAX
+ 1];
1561 XenstoreImplState
*s
;
1568 static int consume_be32(struct unsave_state
*us
, unsigned int *val
)
1572 if (us
->l
< sizeof(d
)) {
1575 memcpy(&d
, us
->d
, sizeof(d
));
1582 static int consume_string(struct unsave_state
*us
, char **str
, size_t *len
)
1590 l
= strnlen((void *)us
->d
, us
->l
);
1596 *str
= (void *)us
->d
;
1607 static XsNode
*lookup_node(XsNode
*n
, char *path
)
1609 char *slash
= strchr(path
, '/');
1612 if (path
[0] == '\0') {
1623 child
= g_hash_table_lookup(n
->children
, path
);
1632 return lookup_node(child
, slash
+ 1);
1635 static XsNode
*lookup_tx_node(struct unsave_state
*us
, unsigned int tx_id
)
1638 if (tx_id
== us
->s
->root_tx
) {
1639 return lookup_node(us
->s
->root
, us
->path
+ 1);
1642 t
= g_hash_table_lookup(us
->s
->transactions
, GINT_TO_POINTER(tx_id
));
1647 return lookup_node(t
->root
, us
->path
+ 1);
1650 static void count_child_nodes(gpointer key
, gpointer value
, gpointer user_data
)
1652 unsigned int *nr_nodes
= user_data
;
1658 g_hash_table_foreach(n
->children
, count_child_nodes
, nr_nodes
);
1662 static int consume_node(struct unsave_state
*us
, XsNode
**nodep
,
1663 unsigned int *nr_nodes
)
1676 if (flags
== NODE_REF
) {
1679 ret
= consume_be32(us
, &tx
);
1684 n
= lookup_tx_node(us
, tx
);
1690 g_hash_table_foreach(n
->children
, count_child_nodes
, nr_nodes
);
1695 if (flags
& ~(DELETED_IN_TX
| MODIFIED_IN_TX
)) {
1700 if (flags
& DELETED_IN_TX
) {
1701 n
->deleted_in_tx
= true;
1703 if (flags
& MODIFIED_IN_TX
) {
1704 n
->modified_in_tx
= true;
1706 ret
= consume_be32(us
, &datalen
);
1712 if (datalen
> us
->l
) {
1717 GByteArray
*node_data
= g_byte_array_new();
1718 g_byte_array_append(node_data
, us
->d
, datalen
);
1721 n
->content
= node_data
;
1723 if (us
->root_walk
) {
1724 n
->modified_in_tx
= true;
1731 ret
= consume_string(us
, &perm
, &permlen
);
1741 n
->perms
= g_list_append(n
->perms
, g_strdup(perm
));
1749 XsNode
*child
= NULL
;
1751 ret
= consume_string(us
, &childname
, &childlen
);
1761 pathend
= us
->path
+ strlen(us
->path
);
1762 strncat(us
->path
, "/", sizeof(us
->path
) - 1);
1763 strncat(us
->path
, childname
, sizeof(us
->path
) - 1);
1765 ret
= consume_node(us
, &child
, nr_nodes
);
1772 xs_node_add_child(n
, childname
, child
);
1776 * If the node has no data and no children we still want to fire
1779 if (us
->root_walk
&& !n
->children
) {
1780 n
->modified_in_tx
= true;
1784 if (!n
->deleted_in_tx
) {
1792 static int consume_tree(struct unsave_state
*us
, XsTransaction
*t
)
1796 ret
= consume_be32(us
, &t
->tx_id
);
1801 if (t
->tx_id
> us
->s
->last_tx
) {
1807 return consume_node(us
, &t
->root
, &t
->nr_nodes
);
1810 int xs_impl_deserialize(XenstoreImplState
*s
, GByteArray
*bytes
,
1811 unsigned int dom_id
, xs_impl_watch_fn watch_fn
,
1814 struct unsave_state us
;
1815 XsTransaction base_t
= { 0 };
1823 xs_impl_reset_watches(s
, dom_id
);
1824 g_hash_table_remove_all(s
->transactions
);
1826 xs_node_unref(s
->root
);
1828 s
->root_tx
= s
->last_tx
= XBT_NULL
;
1830 ret
= consume_be32(&us
, &s
->last_tx
);
1836 * Consume the base tree into a transaction so that watches can be
1837 * fired as we commit it. By setting us.root_walk we cause the nodes
1838 * to be marked as 'modified_in_tx' as they are created, so that the
1839 * watches are triggered on them.
1841 base_t
.dom_id
= dom_id
;
1842 base_t
.base_tx
= XBT_NULL
;
1843 us
.root_walk
= true;
1844 ret
= consume_tree(&us
, &base_t
);
1848 us
.root_walk
= false;
1851 * Commit the transaction now while the refcount on all nodes is 1.
1852 * Note that we haven't yet reinstated the *guest* watches but that's
1853 * OK because we don't want the guest to see any changes. Even any
1854 * backend nodes which get recreated should be *precisely* as they
1855 * were before the migration. Back ends may have been instantiated
1856 * already, and will see the frontend magically blink into existence
1857 * now (well, from the aio_bh which fires the watches). It's their
1858 * responsibility to rebuild everything precisely as it was before.
1860 ret
= transaction_commit(s
, &base_t
);
1866 unsigned int base_tx
;
1869 ret
= consume_be32(&us
, &base_tx
);
1873 if (base_tx
== XBT_NULL
) {
1877 t
= g_new0(XsTransaction
, 1);
1878 t
->base_tx
= base_tx
;
1880 ret
= consume_be32(&us
, &t
->dom_id
);
1882 ret
= consume_tree(&us
, t
);
1890 s
->nr_domu_transactions
++;
1892 g_hash_table_insert(s
->transactions
, GINT_TO_POINTER(t
->tx_id
), t
);
1897 size_t pathlen
, toklen
;
1899 ret
= consume_string(&us
, &path
, &pathlen
);
1907 ret
= consume_string(&us
, &token
, &toklen
);
1916 ret
= do_xs_impl_watch(s
, dom_id
, path
, token
, watch_fn
, watch_opaque
);