2 Copyright (C) 2010 - 2020 Proxmox Server Solutions GmbH
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Affero General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Affero General Public License for more details.
14 You should have received a copy of the GNU Affero General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Author: Dietmar Maurer <dietmar@proxmox.com>
21 #define G_LOG_DOMAIN "database"
25 #endif /* HAVE_CONFIG_H */
39 #include "cfs-utils.h"
45 sqlite3_stmt
*stmt_insert_entry
;
46 sqlite3_stmt
*stmt_update_entry
;
47 sqlite3_stmt
*stmt_replace_entry
;
48 sqlite3_stmt
*stmt_delete_entry
;
49 sqlite3_stmt
*stmt_begin
;
50 sqlite3_stmt
*stmt_commit
;
51 sqlite3_stmt
*stmt_rollback
;
52 sqlite3_stmt
*stmt_load_all
;
/* Name stored for the inode-0 row: writes with inode 0 use it as the entry
 * name, and the index loader copies version/writer/mtime of the inode-0 row
 * with this name into the root entry. */
55 #define VERSIONFILENAME "__version__"
57 /* column type "INTEGER PRIMARY KEY" is a special case, because sqlite
58 * uses the internal ROWID. So only real integers are allowed, and
59 * there is no need to add an additional check
61 static const char *sql_create_db
=
62 "CREATE TABLE IF NOT EXISTS tree ("
63 " inode INTEGER PRIMARY KEY NOT NULL,"
64 " parent INTEGER NOT NULL CHECK(typeof(parent)=='integer'),"
65 " version INTEGER NOT NULL CHECK(typeof(version)=='integer'),"
66 " writer INTEGER NOT NULL CHECK(typeof(writer)=='integer'),"
67 " mtime INTEGER NOT NULL CHECK(typeof(mtime)=='integer'),"
68 " type INTEGER NOT NULL CHECK(typeof(type)=='integer'),"
69 " name TEXT NOT NULL,"
/* Selects all eight columns of every row in the tree table. Prepared as
 * stmt_load_all and stepped row by row by bdb_backend_load_index, which reads
 * the columns by position (0 = inode ... 7 = data). */
72 static const char *sql_load_all
=
73 "SELECT inode, parent, version, writer, mtime, type, name, data FROM tree;";
75 static char *sql_insert_entry
=
77 "inode, parent, version, writer, mtime, type, name, data) "
78 "VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8);";
/*
 * Updates an existing row of the tree table in place.
 *
 * Bind parameters: ?1 = inode (row selector in the WHERE clause),
 * ?2 = parent, ?3 = version, ?4 = writer, ?5 = mtime, ?6 = type,
 * ?7 = name, ?8 = data.
 *
 * Declared const (like sql_create_db and sql_load_all) because it points
 * at a string literal, which must never be written through.
 */
static const char *sql_update_entry =
    "UPDATE tree SET parent = ?2, version = ?3, writer = ?4, mtime = ?5, "
    "type = ?6, name = ?7, data = ?8 WHERE inode = ?1;";
/*
 * Inserts a row into the tree table, replacing any existing row with the
 * same inode (the table's primary key).
 *
 * Bind parameters: ?1 = inode, ?2 = parent, ?3 = version, ?4 = writer,
 * ?5 = mtime, ?6 = type, ?7 = name, ?8 = data.
 *
 * Declared const (like sql_create_db and sql_load_all) because it points
 * at a string literal, which must never be written through.
 */
static const char *sql_replace_entry =
    "REPLACE INTO tree (inode, parent, version, writer, mtime, type, "
    "name, data) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8);";
/*
 * Deletes the row whose inode equals bind parameter ?1.
 *
 * Declared const (like sql_create_db and sql_load_all) because it points
 * at a string literal, which must never be written through.
 */
static const char *sql_delete_entry =
    "DELETE FROM tree WHERE inode = ?1;";
/* Opens a transaction; const because it points at a string literal. */
static const char *sql_begin = "BEGIN TRANSACTION;";
/* Commits the open transaction; const because it points at a string literal. */
static const char *sql_commit = "COMMIT TRANSACTION;";
/* Rolls back the open transaction; const because it points at a string literal. */
static const char *sql_rollback = "ROLLBACK TRANSACTION;";
95 static sqlite3
*bdb_create(
101 int flags
= SQLITE_OPEN_READWRITE
|SQLITE_OPEN_CREATE
;
102 rc
= sqlite3_open_v2(filename
, &db
, flags
, NULL
);
103 if (rc
!= SQLITE_OK
) {
104 cfs_critical("splite3_open_v2 failed: %d\n", rc
);
109 if (chmod(filename
, 0600) == -1) {
110 cfs_critical("chmod failed: %s", strerror(errno
));
114 /* use WAL mode - to allow concurrent reads */
115 rc
= sqlite3_exec(db
, "PRAGMA journal_mode=WAL;", NULL
, NULL
, NULL
);
116 if (rc
!= SQLITE_OK
) {
117 cfs_critical("unable to set WAL mode: %s\n", sqlite3_errmsg(db
));
122 /* NORMAL is good enough when using WAL */
123 rc
= sqlite3_exec(db
, "PRAGMA synchronous=NORMAL", NULL
, NULL
, NULL
);
124 if (rc
!= SQLITE_OK
) {
125 cfs_critical("unable to set synchronous mode: %s\n", sqlite3_errmsg(db
));
130 sqlite3_busy_timeout(db
, 10000); /* 10 seconds */
132 rc
= sqlite3_exec(db
, sql_create_db
, NULL
, NULL
, NULL
);
133 if (rc
!= SQLITE_OK
) {
134 cfs_critical("init database failed: %s\n", sqlite3_errmsg(db
));
142 static int backend_write_inode(
157 cfs_debug("enter backend_write_inode %016" PRIX64
" '%s', size %"PRIu32
"", inode
, name
, size
);
159 if ((rc
= sqlite3_bind_int64(stmt
, 1, inode
)) != SQLITE_OK
) {
160 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
163 if ((rc
= sqlite3_bind_int64(stmt
, 2, parent
)) != SQLITE_OK
) {
164 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
167 if ((rc
= sqlite3_bind_int64(stmt
, 3, version
)) != SQLITE_OK
) {
168 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
171 if ((rc
= sqlite3_bind_int64(stmt
, 4, writer
)) != SQLITE_OK
) {
172 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
175 if ((rc
= sqlite3_bind_int64(stmt
, 5, mtime
)) != SQLITE_OK
) {
176 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
179 if ((rc
= sqlite3_bind_int64(stmt
, 6, type
)) != SQLITE_OK
) {
180 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
183 if ((rc
= sqlite3_bind_text(stmt
, 7, name
, -1, SQLITE_STATIC
)) != SQLITE_OK
) {
184 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
187 if ((rc
= sqlite3_bind_blob(stmt
, 8, value
, size
, SQLITE_STATIC
)) != SQLITE_OK
) {
188 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
192 if ((rc
= sqlite3_step(stmt
)) != SQLITE_DONE
) {
193 cfs_critical("sqlite3_step failed: %s\n", sqlite3_errmsg(db
));
203 static int bdb_backend_delete_inode(
209 cfs_debug("enter dbd_backend_delete_inode");
211 sqlite3_stmt
*stmt
= bdb
->stmt_delete_entry
;
213 if ((rc
= sqlite3_bind_int64(stmt
, 1, inode
)) != SQLITE_OK
) {
214 cfs_critical("delete_inode/sqlite3_bind failed: %s\n", sqlite3_errmsg(bdb
->db
));
218 if ((rc
= sqlite3_step(stmt
)) != SQLITE_DONE
) {
219 cfs_critical("delete_inode failed: %s\n", sqlite3_errmsg(bdb
->db
));
229 int bdb_backend_write(
240 guint64 delete_inode
)
242 g_return_val_if_fail(bdb
!= NULL
, SQLITE_PERM
);
243 g_return_val_if_fail(inode
== 0 || (name
!= NULL
&& name
[0]), SQLITE_PERM
);
244 g_return_val_if_fail(type
== DT_REG
|| type
== DT_DIR
, SQLITE_PERM
);
247 gboolean need_txn
= (inode
!= 0 || delete_inode
!= 0);
250 rc
= sqlite3_step(bdb
->stmt_begin
);
251 sqlite3_reset(bdb
->stmt_begin
);
252 if (rc
!= SQLITE_DONE
) {
253 cfs_critical("begin transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
258 if (delete_inode
!= 0) {
259 if ((rc
= bdb_backend_delete_inode(bdb
, delete_inode
)) != SQLITE_OK
)
265 sqlite3_stmt
*stmt
= (inode
> version
) ?
266 bdb
->stmt_insert_entry
: bdb
->stmt_replace_entry
;
268 rc
= backend_write_inode(bdb
->db
, stmt
, inode
, parent
, version
,
269 writer
, mtime
, size
, type
, name
, value
);
273 if (sqlite3_changes(bdb
->db
) != 1) {
274 cfs_critical("no such inode %016" PRIX64
, inode
);
279 rc
= backend_write_inode(bdb
->db
, bdb
->stmt_replace_entry
, 0, 0, version
,
280 writer
, mtime
, 0, DT_REG
, VERSIONFILENAME
, NULL
);
287 rc
= sqlite3_step(bdb
->stmt_commit
);
288 sqlite3_reset(bdb
->stmt_commit
);
289 if (rc
!= SQLITE_DONE
) {
290 cfs_critical("commit transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
302 int rbrc
= sqlite3_step(bdb
->stmt_rollback
);
303 sqlite3_reset(bdb
->stmt_rollback
);
304 if (rbrc
!= SQLITE_DONE
) {
305 cfs_critical("rollback transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
312 static gboolean
bdb_backend_load_index(
314 memdb_tree_entry_t
*root
,
317 g_return_val_if_fail(bdb
!= NULL
, FALSE
);
318 g_return_val_if_fail(root
!= NULL
, FALSE
);
319 g_return_val_if_fail(index
!= NULL
, FALSE
);
320 g_return_val_if_fail(root
->version
== 0, FALSE
);
321 g_return_val_if_fail(g_hash_table_size(index
) == 1, FALSE
);
323 sqlite3_stmt
*stmt
= bdb
->stmt_load_all
;
326 while ((rc
= sqlite3_step(stmt
)) == SQLITE_ROW
) {
328 memdb_tree_entry_t
*te
;
330 guint64 inode
= sqlite3_column_int64(stmt
, 0);
331 const char *name
= (const char *)sqlite3_column_text(stmt
, 6);
332 int namelen
= sqlite3_column_bytes(stmt
, 6);
333 if (name
== NULL
|| namelen
== 0) {
334 cfs_critical("inode has no name (inode = %016" PRIX64
")", inode
);
337 te
= g_malloc0(sizeof(memdb_tree_entry_t
) + namelen
+ 1);
338 strcpy(te
->name
, name
);
341 te
->parent
= sqlite3_column_int64(stmt
, 1);
342 te
->version
= sqlite3_column_int64(stmt
, 2);
343 te
->writer
= sqlite3_column_int64(stmt
, 3) & 0x0ffffffff;
344 te
->mtime
= sqlite3_column_int64(stmt
, 4) & 0x0ffffffff;
345 te
->type
= sqlite3_column_int64(stmt
, 5) & 255;
347 gconstpointer value
= sqlite3_column_blob(stmt
, 7);
349 int size
= sqlite3_column_bytes(stmt
, 7);
352 if (te
->type
== DT_REG
) {
354 te
->data
.value
= g_memdup2(value
, size
);
355 } else if (te
->type
== DT_DIR
) {
357 cfs_critical("directory inode contains data (inode = %016" PRIX64
")",
362 te
->data
.entries
= NULL
;
364 cfs_critical("inode has unknown type (inode = %016" PRIX64
", type = %d)",
365 te
->inode
, te
->type
);
370 cfs_debug("name %s (inode = %016" PRIX64
", parent = %016" PRIX64
")",
371 te
->name
, te
->inode
, te
->parent
);
373 if (te
->inode
== 0) {
374 if (!strcmp(te
->name
, VERSIONFILENAME
)) {
375 root
->version
= te
->version
;
376 root
->writer
= te
->writer
;
377 root
->mtime
= te
->mtime
;
378 memdb_tree_entry_free(te
);
380 cfs_critical("root inode has unexpected name '%s'", te
->name
);
381 memdb_tree_entry_free(te
);
385 memdb_tree_entry_t
*pte
;
387 if (!(pte
= g_hash_table_lookup(index
, &te
->parent
))) {
388 /* allocate placeholder (type == 0)
389 * this is simply replaced if we find a real inode later
391 pte
= g_malloc0(sizeof(memdb_tree_entry_t
));
392 pte
->inode
= te
->parent
;
393 pte
->data
.entries
= g_hash_table_new(g_str_hash
, g_str_equal
);
394 g_hash_table_replace(index
, &pte
->inode
, pte
);
396 } else if (!(pte
->type
== DT_DIR
|| pte
->type
== 0)) {
397 cfs_critical("parent is not a directory "
398 "(inode = %016" PRIX64
", parent = %016" PRIX64
", name = '%s')",
399 te
->inode
, te
->parent
, te
->name
);
400 memdb_tree_entry_free(te
);
404 if (te
->type
== DT_DIR
) {
405 memdb_tree_entry_t
*tmpte
;
406 /* test if there is a placeholder entry */
407 if ((tmpte
= g_hash_table_lookup(index
, &te
->inode
))) {
408 if (tmpte
->type
!= 0) {
409 cfs_critical("found strange placeholder for "
410 "(inode = %016" PRIX64
", parent = %016" PRIX64
", name = '%s', type = '%d')",
411 te
->inode
, te
->parent
, te
->name
, tmpte
->type
);
412 memdb_tree_entry_free(te
);
415 /* copy entries from placeholder */
416 te
->data
.entries
= tmpte
->data
.entries
;
417 tmpte
->data
.entries
= NULL
;
419 te
->data
.entries
= g_hash_table_new(g_str_hash
, g_str_equal
);
423 memdb_tree_entry_t
*existing
;
424 if ((existing
= g_hash_table_lookup(pte
->data
.entries
, te
->name
))) {
426 "found entry with duplicate name '%s' - "
427 "A:(inode = 0x%016"PRIX64
", parent = 0x%016"PRIX64
", v./mtime = 0x%"PRIX64
"/0x%"PRIi32
")"
429 "B:(inode = 0x%016"PRIX64
", parent = 0x%016"PRIX64
", v./mtime = 0x%"PRIX64
"/0x%"PRIi32
")",
431 existing
->inode
, existing
->parent
, existing
->version
, existing
->mtime
,
432 te
->inode
, te
->parent
, te
->version
, te
->mtime
437 g_hash_table_replace(pte
->data
.entries
, te
->name
, te
);
438 g_hash_table_replace(index
, &te
->inode
, te
);
441 if (rc
!= SQLITE_DONE
) {
442 cfs_critical("select returned error: %s", sqlite3_errmsg(bdb
->db
));
446 /* check if all inodes have parents (there must be no placeholders) */
449 g_hash_table_iter_init (&iter
, index
);
450 while (g_hash_table_iter_next (&iter
, &key
, &value
)) {
451 memdb_tree_entry_t
*te
= (memdb_tree_entry_t
*)value
;
453 cfs_critical("missing directory inode (inode = %016" PRIX64
")", te
->inode
);
465 cfs_critical("DB load failed");
470 gboolean
bdb_backend_commit_update(
472 memdb_index_t
*master
,
473 memdb_index_t
*slave
,
476 g_return_val_if_fail(memdb
!= NULL
, FALSE
);
477 g_return_val_if_fail(memdb
->bdb
!= NULL
, FALSE
);
478 g_return_val_if_fail(master
!= NULL
, FALSE
);
479 g_return_val_if_fail(slave
!= NULL
, FALSE
);
481 cfs_debug("enter bdb_backend_commit_update");
483 memdb_tree_entry_t
*root
= NULL
;
484 GHashTable
*index
= NULL
;
486 db_backend_t
*bdb
= (db_backend_t
*)memdb
->bdb
;
487 gboolean result
= FALSE
;
491 rc
= sqlite3_step(bdb
->stmt_begin
);
492 sqlite3_reset(bdb
->stmt_begin
);
493 if (rc
!= SQLITE_DONE
) {
494 cfs_critical("begin transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
498 g_mutex_lock (&memdb
->mutex
);
500 /* first, delete anything not found in master index */
505 for (i
= 0; i
< master
->size
; i
++) {
506 guint64 inode
= master
->entries
[i
].inode
;
508 while (j
< slave
->size
&& (slave_inode
= slave
->entries
[j
].inode
) <= inode
) {
510 if (slave_inode
< inode
) {
511 if (bdb_backend_delete_inode(bdb
, slave_inode
) != SQLITE_OK
)
514 cfs_debug("deleted inode %016" PRIX64
, slave_inode
);
518 if (j
>= slave
->size
)
522 while (j
< slave
->size
) {
523 guint64 slave_inode
= slave
->entries
[j
].inode
;
525 if (bdb_backend_delete_inode(bdb
, slave_inode
) != SQLITE_OK
)
528 cfs_debug("deleted inode %016" PRIX64
, slave_inode
);
533 /* now add all updates */
537 memdb_tree_entry_t
*te
= (memdb_tree_entry_t
*)l
->data
;
539 tree_entry_debug(te
);
541 if (backend_write_inode(
542 bdb
->db
, bdb
->stmt_replace_entry
, te
->inode
, te
->parent
, te
->version
,
543 te
->writer
, te
->mtime
, te
->size
, te
->type
,
544 te
->inode
? te
->name
: VERSIONFILENAME
, te
->data
.value
) != SQLITE_OK
) {
551 /* now try to reload */
552 root
= memdb_tree_entry_new("");
553 root
->data
.entries
= g_hash_table_new(g_str_hash
, g_str_equal
);
556 index
= g_hash_table_new_full(g_int64_hash
, g_int64_equal
, NULL
,
557 (GDestroyNotify
)memdb_tree_entry_free
);
559 g_hash_table_replace(index
, &root
->inode
, root
);
561 if (!bdb_backend_load_index(bdb
, root
, index
))
564 if (!memdb
->root
->version
) {
565 cfs_critical("new index has version 0 - internal error");
569 memdb_index_t
*new_idx
= memdb_encode_index(index
, root
);
571 cfs_critical("cant encode new index - internal error");
575 int idx_equal
= (new_idx
->bytes
== master
->bytes
&&
576 (memcmp(master
, new_idx
, new_idx
->bytes
) == 0));
581 cfs_critical("new index does not match master index - internal error");
585 rc
= sqlite3_step(bdb
->stmt_commit
);
586 sqlite3_reset(bdb
->stmt_commit
);
587 if (rc
!= SQLITE_DONE
) {
588 cfs_critical("commit transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
592 g_hash_table_destroy(memdb
->index
);
593 memdb
->index
= index
;
598 record_memdb_reload();
600 if (!memdb_recreate_vmlist(memdb
)) {
601 cfs_critical("memdb_recreate_vmlist failed");
607 memdb_update_locks(memdb
);
612 g_mutex_unlock (&memdb
->mutex
);
615 g_hash_table_destroy(index
);
617 cfs_debug("leave bdb_backend_commit_update (%d)", result
);
625 rc
= sqlite3_step(bdb
->stmt_rollback
);
626 sqlite3_reset(bdb
->stmt_rollback
);
627 if (rc
!= SQLITE_DONE
)
628 cfs_critical("rollback transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
635 void bdb_backend_close(db_backend_t
*bdb
)
637 g_return_if_fail(bdb
!= NULL
);
639 sqlite3_finalize(bdb
->stmt_insert_entry
);
640 sqlite3_finalize(bdb
->stmt_replace_entry
);
641 sqlite3_finalize(bdb
->stmt_update_entry
);
642 sqlite3_finalize(bdb
->stmt_delete_entry
);
643 sqlite3_finalize(bdb
->stmt_begin
);
644 sqlite3_finalize(bdb
->stmt_commit
);
645 sqlite3_finalize(bdb
->stmt_rollback
);
646 sqlite3_finalize(bdb
->stmt_load_all
);
649 if ((rc
= sqlite3_close(bdb
->db
)) != SQLITE_OK
) {
650 cfs_critical("sqlite3_close failed: %d\n", rc
);
658 db_backend_t
*bdb_backend_open(
659 const char *filename
,
660 memdb_tree_entry_t
*root
,
663 g_return_val_if_fail(filename
!= NULL
, NULL
);
664 g_return_val_if_fail(root
!= NULL
, NULL
);
665 g_return_val_if_fail(index
!= NULL
, NULL
);
667 db_backend_t
*bdb
= g_new0(db_backend_t
, 1);
668 g_return_val_if_fail(bdb
!= NULL
, NULL
);
672 sqlite3_initialize();
674 if (!(bdb
->db
= bdb_create(filename
)))
677 // tell the query planner that the prepared statement will be retained for a long time and
678 // probably reused many times
679 const unsigned int flags
= SQLITE_PREPARE_PERSISTENT
;
681 rc
= sqlite3_prepare_v3(bdb
->db
, sql_insert_entry
, -1, flags
, &bdb
->stmt_insert_entry
, NULL
);
682 if (rc
!= SQLITE_OK
) {
683 cfs_critical("sqlite3_prepare 'sql_insert_entry' failed: %s\n",
684 sqlite3_errmsg(bdb
->db
));
687 rc
= sqlite3_prepare_v3(bdb
->db
, sql_update_entry
, -1, flags
, &bdb
->stmt_update_entry
, NULL
);
688 if (rc
!= SQLITE_OK
) {
689 cfs_critical("sqlite3_prepare 'sql_update_entry' failed: %s\n",
690 sqlite3_errmsg(bdb
->db
));
693 rc
= sqlite3_prepare_v3(bdb
->db
, sql_replace_entry
, -1, flags
, &bdb
->stmt_replace_entry
, NULL
);
694 if (rc
!= SQLITE_OK
) {
695 cfs_critical("sqlite3_prepare 'sql_replace_entry' failed: %s\n",
696 sqlite3_errmsg(bdb
->db
));
699 rc
= sqlite3_prepare_v3(bdb
->db
, sql_delete_entry
, -1, flags
, &bdb
->stmt_delete_entry
, NULL
);
700 if (rc
!= SQLITE_OK
) {
701 cfs_critical("sqlite3_prepare 'sql_delete_entry' failed: %s\n",
702 sqlite3_errmsg(bdb
->db
));
705 rc
= sqlite3_prepare_v3(bdb
->db
, sql_begin
, -1, flags
, &bdb
->stmt_begin
, NULL
);
706 if (rc
!= SQLITE_OK
) {
707 cfs_critical("sqlite3_prepare 'sql_begin' failed: %s\n",
708 sqlite3_errmsg(bdb
->db
));
711 rc
= sqlite3_prepare_v3(bdb
->db
, sql_commit
, -1, flags
, &bdb
->stmt_commit
, NULL
);
712 if (rc
!= SQLITE_OK
) {
713 cfs_critical("sqlite3_prepare 'sql_commit' failed: %s\n",
714 sqlite3_errmsg(bdb
->db
));
717 rc
= sqlite3_prepare_v3(bdb
->db
, sql_rollback
, -1, flags
, &bdb
->stmt_rollback
, NULL
);
718 if (rc
!= SQLITE_OK
) {
719 cfs_critical("sqlite3_prepare 'sql_rollback' failed: %s\n",
720 sqlite3_errmsg(bdb
->db
));
723 rc
= sqlite3_prepare_v3(bdb
->db
, sql_load_all
, -1, flags
, &bdb
->stmt_load_all
, NULL
);
724 if (rc
!= SQLITE_OK
) {
725 cfs_critical("sqlite3_prepare 'sql_load_all' failed: %s\n",
726 sqlite3_errmsg(bdb
->db
));
730 if (!bdb_backend_load_index(bdb
, root
, index
))
733 if (!root
->version
) {
736 guint32 mtime
= time(NULL
);
738 if (bdb_backend_write(bdb
, 0, 0, root
->version
, 0, mtime
,
739 0, DT_REG
, NULL
, NULL
, 0) != SQLITE_OK
)
746 bdb_backend_close(bdb
);