]>
git.proxmox.com Git - pve-cluster.git/blob - data/src/database.c
2 Copyright (C) 2010 Proxmox Server Solutions GmbH
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Affero General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Affero General Public License for more details.
14 You should have received a copy of the GNU Affero General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Author: Dietmar Maurer <dietmar@proxmox.com>
21 #define G_LOG_DOMAIN "database"
25 #endif /* HAVE_CONFIG_H */
39 #include "cfs-utils.h"
45 sqlite3_stmt
*stmt_insert_entry
;
46 sqlite3_stmt
*stmt_update_entry
;
47 sqlite3_stmt
*stmt_replace_entry
;
48 sqlite3_stmt
*stmt_delete_entry
;
49 sqlite3_stmt
*stmt_begin
;
50 sqlite3_stmt
*stmt_commit
;
51 sqlite3_stmt
*stmt_rollback
;
52 sqlite3_stmt
*stmt_load_all
;
55 #define VERSIONFILENAME "__version__"
57 /* column type "INTEGER PRIMARY KEY" is a special case, because sqlite
58 * uses the internal ROWID. So only real integers are allowed, and
59 * there is no need to add an additional check
61 static const char *sql_create_db
=
62 "CREATE TABLE IF NOT EXISTS tree ("
63 " inode INTEGER PRIMARY KEY NOT NULL,"
64 " parent INTEGER NOT NULL CHECK(typeof(parent)=='integer'),"
65 " version INTEGER NOT NULL CHECK(typeof(version)=='integer'),"
66 " writer INTEGER NOT NULL CHECK(typeof(writer)=='integer'),"
67 " mtime INTEGER NOT NULL CHECK(typeof(mtime)=='integer'),"
68 " type INTEGER NOT NULL CHECK(typeof(type)=='integer'),"
69 " name TEXT NOT NULL,"
72 static const char *sql_load_all
=
73 "SELECT inode, parent, version, writer, mtime, type, name, data FROM tree;";
75 static char *sql_insert_entry
=
77 "inode, parent, version, writer, mtime, type, name, data) "
78 "VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8);";
80 static char *sql_update_entry
=
81 "UPDATE tree SET parent = ?2, version = ?3, writer = ?4, mtime = ?5, "
82 "type = ?6, name = ?7, data = ?8 WHERE inode = ?1;";
84 static char *sql_replace_entry
=
85 "REPLACE INTO tree (inode, parent, version, writer, mtime, type, "
86 "name, data) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8);";
88 static char *sql_delete_entry
=
89 "DELETE FROM tree WHERE inode = ?1;";
91 static char *sql_begin
= "BEGIN TRANSACTION;";
92 static char *sql_commit
= "COMMIT TRANSACTION;";
93 static char *sql_rollback
= "ROLLBACK TRANSACTION;";
95 static sqlite3
*bdb_create(
101 int flags
= SQLITE_OPEN_READWRITE
|SQLITE_OPEN_CREATE
;
102 rc
= sqlite3_open_v2(filename
, &db
, flags
, NULL
);
103 if (rc
!= SQLITE_OK
) {
104 cfs_critical("splite3_open_v2 failed: %d\n", rc
);
109 if (chmod(filename
, 0600) == -1) {
110 cfs_critical("chmod failed: %s", strerror(errno
));
114 /* use WAL mode - to allow concurrent reads */
115 rc
= sqlite3_exec(db
, "PRAGMA journal_mode=WAL;", NULL
, NULL
, NULL
);
116 if (rc
!= SQLITE_OK
) {
117 cfs_critical("unable to set WAL mode: %s\n", sqlite3_errmsg(db
));
122 /* NORMAL is good enough when using WAL */
123 rc
= sqlite3_exec(db
, "PRAGMA synchronous=NORMAL", NULL
, NULL
, NULL
);
124 if (rc
!= SQLITE_OK
) {
125 cfs_critical("unable to set synchronous mode: %s\n", sqlite3_errmsg(db
));
130 sqlite3_busy_timeout(db
, 10000); /* 10 seconds */
132 rc
= sqlite3_exec(db
, sql_create_db
, NULL
, NULL
, NULL
);
133 if (rc
!= SQLITE_OK
) {
134 cfs_critical("init database failed: %s\n", sqlite3_errmsg(db
));
142 static int backend_write_inode(
157 cfs_debug("enter backend_write_inode %016" PRIX64
, inode
);
159 if ((rc
= sqlite3_bind_int64(stmt
, 1, inode
)) != SQLITE_OK
) {
160 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
163 if ((rc
= sqlite3_bind_int64(stmt
, 2, parent
)) != SQLITE_OK
) {
164 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
167 if ((rc
= sqlite3_bind_int64(stmt
, 3, version
)) != SQLITE_OK
) {
168 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
171 if ((rc
= sqlite3_bind_int64(stmt
, 4, writer
)) != SQLITE_OK
) {
172 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
175 if ((rc
= sqlite3_bind_int64(stmt
, 5, mtime
)) != SQLITE_OK
) {
176 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
179 if ((rc
= sqlite3_bind_int64(stmt
, 6, type
)) != SQLITE_OK
) {
180 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
183 /* question: can we use SQLITE_STATIC instead? */
184 if ((rc
= sqlite3_bind_text(stmt
, 7, name
, -1, SQLITE_TRANSIENT
)) != SQLITE_OK
) {
185 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
188 if ((rc
= sqlite3_bind_blob(stmt
, 8, value
, size
, SQLITE_TRANSIENT
)) != SQLITE_OK
) {
189 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
193 if ((rc
= sqlite3_step(stmt
)) != SQLITE_DONE
) {
194 cfs_critical("sqlite3_step failed: %s\n", sqlite3_errmsg(db
));
204 static int bdb_backend_delete_inode(
210 cfs_debug("enter dbd_backend_delete_inode");
212 sqlite3_stmt
*stmt
= bdb
->stmt_delete_entry
;
214 if ((rc
= sqlite3_bind_int64(stmt
, 1, inode
)) != SQLITE_OK
) {
215 cfs_critical("delete_inode/sqlite3_bind failed: %s\n", sqlite3_errmsg(bdb
->db
));
219 if ((rc
= sqlite3_step(stmt
)) != SQLITE_DONE
) {
220 cfs_critical("delete_inode failed: %s\n", sqlite3_errmsg(bdb
->db
));
230 int bdb_backend_write(
241 guint64 delete_inode
)
243 g_return_val_if_fail(bdb
!= NULL
, SQLITE_PERM
);
244 g_return_val_if_fail(inode
== 0 || (name
!= NULL
&& name
[0]), SQLITE_PERM
);
245 g_return_val_if_fail(type
== DT_REG
|| type
== DT_DIR
, SQLITE_PERM
);
248 gboolean need_txn
= (inode
!= 0 || delete_inode
!= 0);
251 rc
= sqlite3_step(bdb
->stmt_begin
);
252 sqlite3_reset(bdb
->stmt_begin
);
253 if (rc
!= SQLITE_DONE
) {
254 cfs_critical("begin transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
259 if (delete_inode
!= 0) {
260 if ((rc
= bdb_backend_delete_inode(bdb
, delete_inode
)) != SQLITE_OK
)
266 sqlite3_stmt
*stmt
= (inode
> version
) ?
267 bdb
->stmt_insert_entry
: bdb
->stmt_replace_entry
;
269 rc
= backend_write_inode(bdb
->db
, stmt
, inode
, parent
, version
,
270 writer
, mtime
, size
, type
, name
, value
);
274 if (sqlite3_changes(bdb
->db
) != 1) {
275 cfs_critical("no such inode %016" PRIX64
, inode
);
280 rc
= backend_write_inode(bdb
->db
, bdb
->stmt_replace_entry
, 0, 0, version
,
281 writer
, mtime
, 0, DT_REG
, VERSIONFILENAME
, NULL
);
288 rc
= sqlite3_step(bdb
->stmt_commit
);
289 sqlite3_reset(bdb
->stmt_commit
);
290 if (rc
!= SQLITE_DONE
) {
291 cfs_critical("commit transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
303 int rbrc
= sqlite3_step(bdb
->stmt_rollback
);
304 sqlite3_reset(bdb
->stmt_rollback
);
305 if (rbrc
!= SQLITE_DONE
) {
306 cfs_critical("rollback transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
313 static gboolean
bdb_backend_load_index(
315 memdb_tree_entry_t
*root
,
318 g_return_val_if_fail(bdb
!= NULL
, FALSE
);
319 g_return_val_if_fail(root
!= NULL
, FALSE
);
320 g_return_val_if_fail(index
!= NULL
, FALSE
);
321 g_return_val_if_fail(root
->version
== 0, FALSE
);
322 g_return_val_if_fail(g_hash_table_size(index
) == 1, FALSE
);
326 sqlite3_stmt
*stmt
= bdb
->stmt_load_all
;
328 while ((rc
= sqlite3_step(stmt
)) == SQLITE_ROW
) {
330 memdb_tree_entry_t
*te
;
332 guint64 inode
= sqlite3_column_int64(stmt
, 0);
333 const char *name
= (const char *)sqlite3_column_text(stmt
, 6);
334 int namelen
= sqlite3_column_bytes(stmt
, 6);
335 if (name
== NULL
|| namelen
== 0) {
336 cfs_critical("inode has no name (inode = %016" PRIX64
")", inode
);
339 te
= g_malloc0(sizeof(memdb_tree_entry_t
) + namelen
+ 1);
340 strcpy(te
->name
, name
);
343 te
->parent
= sqlite3_column_int64(stmt
, 1);
344 te
->version
= sqlite3_column_int64(stmt
, 2);
345 te
->writer
= sqlite3_column_int64(stmt
, 3) & 0x0ffffffff;
346 te
->mtime
= sqlite3_column_int64(stmt
, 4) & 0x0ffffffff;
347 te
->type
= sqlite3_column_int64(stmt
, 5) & 255;
349 gconstpointer value
= sqlite3_column_blob(stmt
, 7);
351 int size
= sqlite3_column_bytes(stmt
, 7);
354 if (te
->type
== DT_REG
) {
356 te
->data
.value
= g_memdup(value
, size
);
357 } else if (te
->type
== DT_DIR
) {
359 cfs_critical("directory inode contains data (inode = %016" PRIX64
")",
364 te
->data
.entries
= NULL
;
366 cfs_critical("inode has unknown type (inode = %016" PRIX64
", type = %d)",
367 te
->inode
, te
->type
);
372 cfs_debug("name %s (inode = %016" PRIX64
", parent = %016" PRIX64
")",
373 te
->name
, te
->inode
, te
->parent
);
375 if (te
->inode
== 0) {
376 if (te
->name
&& !strcmp(te
->name
, VERSIONFILENAME
)) {
377 root
->version
= te
->version
;
378 root
->writer
= te
->writer
;
379 root
->mtime
= te
->mtime
;
380 memdb_tree_entry_free(te
);
382 cfs_critical("root inode has unexpected name '%s'", te
->name
);
383 memdb_tree_entry_free(te
);
387 memdb_tree_entry_t
*pte
;
389 if (!(pte
= g_hash_table_lookup(index
, &te
->parent
))) {
390 /* allocate placeholder (type == 0)
391 * this is simply replaced if we find a real inode later
393 pte
= g_malloc0(sizeof(memdb_tree_entry_t
));
394 pte
->inode
= te
->parent
;
395 pte
->data
.entries
= g_hash_table_new(g_str_hash
, g_str_equal
);
396 g_hash_table_replace(index
, &pte
->inode
, pte
);
398 } else if (!(pte
->type
== DT_DIR
|| pte
->type
== 0)) {
399 cfs_critical("parent is not a directory "
400 "(inode = %016" PRIX64
", parent = %016" PRIX64
", name = '%s')",
401 te
->inode
, te
->parent
, te
->name
);
402 memdb_tree_entry_free(te
);
406 if (te
->type
== DT_DIR
) {
407 memdb_tree_entry_t
*tmpte
;
408 /* test if there is a placeholder entry */
409 if ((tmpte
= g_hash_table_lookup(index
, &te
->inode
))) {
410 if (tmpte
->type
!= 0) {
411 cfs_critical("found strange placeholder for "
412 "(inode = %016" PRIX64
", parent = %016" PRIX64
", name = '%s', type = '%d')",
413 te
->inode
, te
->parent
, te
->name
, tmpte
->type
);
414 memdb_tree_entry_free(te
);
417 /* copy entries from placeholder */
418 te
->data
.entries
= tmpte
->data
.entries
;
419 tmpte
->data
.entries
= NULL
;
421 te
->data
.entries
= g_hash_table_new(g_str_hash
, g_str_equal
);
425 if (g_hash_table_lookup(pte
->data
.entries
, te
->name
)) {
426 cfs_critical("found entry with duplicate name "
427 "(inode = %016" PRIX64
", parent = %016" PRIX64
", name = '%s')",
428 te
->inode
, te
->parent
, te
->name
);
432 g_hash_table_replace(pte
->data
.entries
, te
->name
, te
);
433 g_hash_table_replace(index
, &te
->inode
, te
);
436 if (rc
!= SQLITE_DONE
) {
437 cfs_critical("select returned error: %s", sqlite3_errmsg(bdb
->db
));
441 /* check if all inodes have parents (there must be no placeholders) */
444 g_hash_table_iter_init (&iter
, index
);
445 while (g_hash_table_iter_next (&iter
, &key
, &value
)) {
446 memdb_tree_entry_t
*te
= (memdb_tree_entry_t
*)value
;
448 cfs_critical("missing directory inode (inode = %016" PRIX64
")", te
->inode
);
460 cfs_critical("DB load failed");
465 gboolean
bdb_backend_commit_update(
467 memdb_index_t
*master
,
468 memdb_index_t
*slave
,
471 g_return_val_if_fail(memdb
!= NULL
, FALSE
);
472 g_return_val_if_fail(memdb
->bdb
!= NULL
, FALSE
);
473 g_return_val_if_fail(master
!= NULL
, FALSE
);
474 g_return_val_if_fail(slave
!= NULL
, FALSE
);
476 cfs_debug("enter bdb_backend_commit_update");
478 memdb_tree_entry_t
*root
= NULL
;
479 GHashTable
*index
= NULL
;
481 db_backend_t
*bdb
= (db_backend_t
*)memdb
->bdb
;
482 gboolean result
= FALSE
;
486 rc
= sqlite3_step(bdb
->stmt_begin
);
487 sqlite3_reset(bdb
->stmt_begin
);
488 if (rc
!= SQLITE_DONE
) {
489 cfs_critical("begin transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
493 g_mutex_lock (&memdb
->mutex
);
495 /* first, delete anything not found in master index */
500 for (i
= 0; i
< master
->size
; i
++) {
501 guint64 inode
= master
->entries
[i
].inode
;
503 while (j
< slave
->size
&& (slave_inode
= slave
->entries
[j
].inode
) <= inode
) {
505 if (slave_inode
< inode
) {
506 if (bdb_backend_delete_inode(bdb
, slave_inode
) != SQLITE_OK
)
509 cfs_debug("deleted inode %016" PRIX64
, slave_inode
);
513 if (j
>= slave
->size
)
517 while (j
< slave
->size
) {
518 guint64 slave_inode
= slave
->entries
[j
].inode
;
520 if (bdb_backend_delete_inode(bdb
, slave_inode
) != SQLITE_OK
)
523 cfs_debug("deleted inode %016" PRIX64
, slave_inode
);
528 /* now add all updates */
532 memdb_tree_entry_t
*te
= (memdb_tree_entry_t
*)l
->data
;
534 tree_entry_debug(te
);
536 if (backend_write_inode(
537 bdb
->db
, bdb
->stmt_replace_entry
, te
->inode
, te
->parent
, te
->version
,
538 te
->writer
, te
->mtime
, te
->size
, te
->type
,
539 te
->inode
? te
->name
: VERSIONFILENAME
, te
->data
.value
) != SQLITE_OK
) {
546 /* now try to reload */
547 root
= memdb_tree_entry_new("");
548 root
->data
.entries
= g_hash_table_new(g_str_hash
, g_str_equal
);
551 index
= g_hash_table_new_full(g_int64_hash
, g_int64_equal
, NULL
,
552 (GDestroyNotify
)memdb_tree_entry_free
);
554 g_hash_table_replace(index
, &root
->inode
, root
);
556 if (!bdb_backend_load_index(bdb
, root
, index
))
559 if (!memdb
->root
->version
) {
560 cfs_critical("new index has version 0 - internal error");
564 memdb_index_t
*new_idx
= memdb_encode_index(index
, root
);
566 cfs_critical("cant encode new index - internal error");
570 int idx_equal
= (new_idx
->bytes
== master
->bytes
&&
571 (memcmp(master
, new_idx
, new_idx
->bytes
) == 0));
576 cfs_critical("new index does not match master index - internal error");
580 rc
= sqlite3_step(bdb
->stmt_commit
);
581 sqlite3_reset(bdb
->stmt_commit
);
582 if (rc
!= SQLITE_DONE
) {
583 cfs_critical("commit transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
587 g_hash_table_destroy(memdb
->index
);
588 memdb
->index
= index
;
593 record_memdb_reload();
595 if (!memdb_recreate_vmlist(memdb
)) {
596 cfs_critical("memdb_recreate_vmlist failed");
602 memdb_update_locks(memdb
);
607 g_mutex_unlock (&memdb
->mutex
);
610 g_hash_table_destroy(index
);
612 cfs_debug("leave bdb_backend_commit_update (%d)", result
);
620 rc
= sqlite3_step(bdb
->stmt_rollback
);
621 sqlite3_reset(bdb
->stmt_rollback
);
622 if (rc
!= SQLITE_DONE
)
623 cfs_critical("rollback transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
630 void bdb_backend_close(db_backend_t
*bdb
)
632 g_return_if_fail(bdb
!= NULL
);
634 sqlite3_finalize(bdb
->stmt_insert_entry
);
635 sqlite3_finalize(bdb
->stmt_replace_entry
);
636 sqlite3_finalize(bdb
->stmt_update_entry
);
637 sqlite3_finalize(bdb
->stmt_delete_entry
);
638 sqlite3_finalize(bdb
->stmt_begin
);
639 sqlite3_finalize(bdb
->stmt_commit
);
640 sqlite3_finalize(bdb
->stmt_rollback
);
641 sqlite3_finalize(bdb
->stmt_load_all
);
644 if ((rc
= sqlite3_close(bdb
->db
)) != SQLITE_OK
) {
645 cfs_critical("sqlite3_close failed: %d\n", rc
);
653 db_backend_t
*bdb_backend_open(
654 const char *filename
,
655 memdb_tree_entry_t
*root
,
658 g_return_val_if_fail(filename
!= NULL
, NULL
);
659 g_return_val_if_fail(root
!= NULL
, NULL
);
660 g_return_val_if_fail(index
!= NULL
, NULL
);
662 db_backend_t
*bdb
= g_new0(db_backend_t
, 1);
663 g_return_val_if_fail(bdb
!= NULL
, NULL
);
667 sqlite3_initialize();
669 if (!(bdb
->db
= bdb_create(filename
)))
672 rc
= sqlite3_prepare_v2(bdb
->db
, sql_insert_entry
, -1, &bdb
->stmt_insert_entry
, NULL
);
673 if (rc
!= SQLITE_OK
) {
674 cfs_critical("sqlite3_prepare 'sql_insert_entry' failed: %s\n",
675 sqlite3_errmsg(bdb
->db
));
678 rc
= sqlite3_prepare_v2(bdb
->db
, sql_update_entry
, -1, &bdb
->stmt_update_entry
, NULL
);
679 if (rc
!= SQLITE_OK
) {
680 cfs_critical("sqlite3_prepare 'sql_update_entry' failed: %s\n",
681 sqlite3_errmsg(bdb
->db
));
684 rc
= sqlite3_prepare_v2(bdb
->db
, sql_replace_entry
, -1, &bdb
->stmt_replace_entry
, NULL
);
685 if (rc
!= SQLITE_OK
) {
686 cfs_critical("sqlite3_prepare 'sql_replace_entry' failed: %s\n",
687 sqlite3_errmsg(bdb
->db
));
690 rc
= sqlite3_prepare_v2(bdb
->db
, sql_delete_entry
, -1, &bdb
->stmt_delete_entry
, NULL
);
691 if (rc
!= SQLITE_OK
) {
692 cfs_critical("sqlite3_prepare 'sql_delete_entry' failed: %s\n",
693 sqlite3_errmsg(bdb
->db
));
696 rc
= sqlite3_prepare_v2(bdb
->db
, sql_begin
, -1, &bdb
->stmt_begin
, NULL
);
697 if (rc
!= SQLITE_OK
) {
698 cfs_critical("sqlite3_prepare 'sql_begin' failed: %s\n",
699 sqlite3_errmsg(bdb
->db
));
702 rc
= sqlite3_prepare_v2(bdb
->db
, sql_commit
, -1, &bdb
->stmt_commit
, NULL
);
703 if (rc
!= SQLITE_OK
) {
704 cfs_critical("sqlite3_prepare 'sql_commit' failed: %s\n",
705 sqlite3_errmsg(bdb
->db
));
708 rc
= sqlite3_prepare_v2(bdb
->db
, sql_rollback
, -1, &bdb
->stmt_rollback
, NULL
);
709 if (rc
!= SQLITE_OK
) {
710 cfs_critical("sqlite3_prepare 'sql_rollback' failed: %s\n",
711 sqlite3_errmsg(bdb
->db
));
714 rc
= sqlite3_prepare_v2(bdb
->db
, sql_load_all
, -1, &bdb
->stmt_load_all
, NULL
);
715 if (rc
!= SQLITE_OK
) {
716 cfs_critical("sqlite3_prepare 'sql_load_all' failed: %s\n",
717 sqlite3_errmsg(bdb
->db
));
721 if (!bdb_backend_load_index(bdb
, root
, index
))
724 if (!root
->version
) {
727 guint32 mtime
= time(NULL
);
729 if (bdb_backend_write(bdb
, 0, 0, root
->version
, 0, mtime
,
730 0, DT_REG
, NULL
, NULL
, 0) != SQLITE_OK
)
738 bdb_backend_close(bdb
);