]>
git.proxmox.com Git - pve-cluster.git/blob - data/src/database.c
2 Copyright (C) 2010 Proxmox Server Solutions GmbH
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Affero General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Affero General Public License for more details.
14 You should have received a copy of the GNU Affero General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Author: Dietmar Maurer <dietmar@proxmox.com>
21 #define G_LOG_DOMAIN "database"
25 #endif /* HAVE_CONFIG_H */
38 #include "cfs-utils.h"
44 sqlite3_stmt
*stmt_insert_entry
;
45 sqlite3_stmt
*stmt_update_entry
;
46 sqlite3_stmt
*stmt_replace_entry
;
47 sqlite3_stmt
*stmt_delete_entry
;
48 sqlite3_stmt
*stmt_begin
;
49 sqlite3_stmt
*stmt_commit
;
50 sqlite3_stmt
*stmt_rollback
;
51 sqlite3_stmt
*stmt_load_all
;
54 #define VERSIONFILENAME "__version__"
56 /* column type "INTEGER PRIMARY KEY" is a special case, because sqlite
57 * uses the internal ROWID. So only real integers are allowed, and
58 * there is no need to add an additional check
60 static const char *sql_create_db
=
61 "CREATE TABLE IF NOT EXISTS tree ("
62 " inode INTEGER PRIMARY KEY NOT NULL,"
63 " parent INTEGER NOT NULL CHECK(typeof(parent)=='integer'),"
64 " version INTEGER NOT NULL CHECK(typeof(version)=='integer'),"
65 " writer INTEGER NOT NULL CHECK(typeof(writer)=='integer'),"
66 " mtime INTEGER NOT NULL CHECK(typeof(mtime)=='integer'),"
67 " type INTEGER NOT NULL CHECK(typeof(type)=='integer'),"
68 " name TEXT NOT NULL,"
/* Selects all eight columns of every row in the tree table; this text is
 * compiled into the stmt_load_all prepared statement when the backend is
 * opened. */
71 static const char *sql_load_all
=
72 "SELECT inode, parent, version, writer, mtime, type, name, data FROM tree;";
74 static char *sql_insert_entry
=
76 "inode, parent, version, writer, mtime, type, name, data) "
77 "VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8);";
/*
 * Overwrites all non-key columns of the row whose inode equals ?1.
 * Parameters: ?1 inode, ?2 parent, ?3 version, ?4 writer, ?5 mtime,
 * ?6 type, ?7 name, ?8 data.
 *
 * The pointer targets a string literal that is only ever read, so it is
 * declared const like sql_create_db and sql_load_all.
 */
static const char *sql_update_entry =
	"UPDATE tree SET parent = ?2, version = ?3, writer = ?4, mtime = ?5, "
	"type = ?6, name = ?7, data = ?8 WHERE inode = ?1;";
/*
 * Inserts a row, or replaces the existing row with the same inode.
 * Parameters: ?1 inode, ?2 parent, ?3 version, ?4 writer, ?5 mtime,
 * ?6 type, ?7 name, ?8 data.
 *
 * The pointer targets a string literal that is only ever read, so it is
 * declared const like sql_create_db and sql_load_all.
 */
static const char *sql_replace_entry =
	"REPLACE INTO tree (inode, parent, version, writer, mtime, type, "
	"name, data) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8);";
/*
 * Removes the row whose inode equals ?1.
 *
 * The pointer targets a string literal that is only ever read, so it is
 * declared const like sql_create_db and sql_load_all.
 */
static const char *sql_delete_entry =
	"DELETE FROM tree WHERE inode = ?1;";
/*
 * Transaction control statements. Each one is compiled once into a
 * prepared statement (stmt_begin / stmt_commit / stmt_rollback) and then
 * stepped and reset around groups of writes.
 *
 * The pointers target string literals that are only ever read, so they
 * are declared const like sql_create_db and sql_load_all.
 */
static const char *sql_begin = "BEGIN TRANSACTION;";
static const char *sql_commit = "COMMIT TRANSACTION;";
static const char *sql_rollback = "ROLLBACK TRANSACTION;";
94 static sqlite3
*bdb_create(
100 int flags
= SQLITE_OPEN_READWRITE
|SQLITE_OPEN_CREATE
;
101 rc
= sqlite3_open_v2(filename
, &db
, flags
, NULL
);
102 if (rc
!= SQLITE_OK
) {
103 cfs_critical("splite3_open_v2 failed: %d\n", rc
);
108 if (chmod(filename
, 0600) == -1) {
109 cfs_critical("chmod failed: %s", strerror(errno
));
113 /* use WAL mode - to allow concurrent reads */
114 rc
= sqlite3_exec(db
, "PRAGMA journal_mode=WAL;", NULL
, NULL
, NULL
);
115 if (rc
!= SQLITE_OK
) {
116 cfs_critical("unable to set WAL mode: %s\n", sqlite3_errmsg(db
));
121 /* NORMAL is good enough when using WAL */
122 rc
= sqlite3_exec(db
, "PRAGMA synchronous=NORMAL", NULL
, NULL
, NULL
);
123 if (rc
!= SQLITE_OK
) {
124 cfs_critical("unable to set synchronous mode: %s\n", sqlite3_errmsg(db
));
129 sqlite3_busy_timeout(db
, 10000); /* 10 seconds */
131 rc
= sqlite3_exec(db
, sql_create_db
, NULL
, NULL
, NULL
);
132 if (rc
!= SQLITE_OK
) {
133 cfs_critical("init database failed: %s\n", sqlite3_errmsg(db
));
141 static int backend_write_inode(
156 cfs_debug("enter backend_write_inode %016zX", inode
);
158 if ((rc
= sqlite3_bind_int64(stmt
, 1, inode
)) != SQLITE_OK
) {
159 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
162 if ((rc
= sqlite3_bind_int64(stmt
, 2, parent
)) != SQLITE_OK
) {
163 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
166 if ((rc
= sqlite3_bind_int64(stmt
, 3, version
)) != SQLITE_OK
) {
167 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
170 if ((rc
= sqlite3_bind_int64(stmt
, 4, writer
)) != SQLITE_OK
) {
171 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
174 if ((rc
= sqlite3_bind_int64(stmt
, 5, mtime
)) != SQLITE_OK
) {
175 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
178 if ((rc
= sqlite3_bind_int64(stmt
, 6, type
)) != SQLITE_OK
) {
179 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
182 /* question: can we use SQLITE_STATIC instead? */
183 if ((rc
= sqlite3_bind_text(stmt
, 7, name
, -1, SQLITE_TRANSIENT
)) != SQLITE_OK
) {
184 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
187 if ((rc
= sqlite3_bind_blob(stmt
, 8, value
, size
, SQLITE_TRANSIENT
)) != SQLITE_OK
) {
188 cfs_critical("sqlite3_bind failed: %s\n", sqlite3_errmsg(db
));
192 if ((rc
= sqlite3_step(stmt
)) != SQLITE_DONE
) {
193 cfs_critical("sqlite3_step failed: %s\n", sqlite3_errmsg(db
));
203 static int bdb_backend_delete_inode(
209 cfs_debug("enter dbd_backend_delete_inode");
211 sqlite3_stmt
*stmt
= bdb
->stmt_delete_entry
;
213 if ((rc
= sqlite3_bind_int64(stmt
, 1, inode
)) != SQLITE_OK
) {
214 cfs_critical("delete_inode/sqlite3_bind failed: %s\n", sqlite3_errmsg(bdb
->db
));
218 if ((rc
= sqlite3_step(stmt
)) != SQLITE_DONE
) {
219 cfs_critical("delete_inode failed: %s\n", sqlite3_errmsg(bdb
->db
));
229 int bdb_backend_write(
240 guint64 delete_inode
)
242 g_return_val_if_fail(bdb
!= NULL
, SQLITE_PERM
);
243 g_return_val_if_fail(inode
== 0 || (name
!= NULL
&& name
[0]), SQLITE_PERM
);
244 g_return_val_if_fail(type
== DT_REG
|| type
== DT_DIR
, SQLITE_PERM
);
247 gboolean need_txn
= (inode
!= 0 || delete_inode
!= 0);
250 rc
= sqlite3_step(bdb
->stmt_begin
);
251 sqlite3_reset(bdb
->stmt_begin
);
252 if (rc
!= SQLITE_DONE
) {
253 cfs_critical("begin transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
258 if (delete_inode
!= 0) {
259 if ((rc
= bdb_backend_delete_inode(bdb
, delete_inode
)) != SQLITE_OK
)
265 sqlite3_stmt
*stmt
= (inode
> version
) ?
266 bdb
->stmt_insert_entry
: bdb
->stmt_replace_entry
;
268 rc
= backend_write_inode(bdb
->db
, stmt
, inode
, parent
, version
,
269 writer
, mtime
, size
, type
, name
, value
);
273 if (sqlite3_changes(bdb
->db
) != 1) {
274 cfs_critical("no such inode %016zX", inode
);
279 rc
= backend_write_inode(bdb
->db
, bdb
->stmt_replace_entry
, 0, 0, version
,
280 writer
, mtime
, 0, DT_REG
, VERSIONFILENAME
, NULL
);
287 rc
= sqlite3_step(bdb
->stmt_commit
);
288 sqlite3_reset(bdb
->stmt_commit
);
289 if (rc
!= SQLITE_DONE
) {
290 cfs_critical("commit transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
302 int rbrc
= sqlite3_step(bdb
->stmt_rollback
);
303 sqlite3_reset(bdb
->stmt_rollback
);
304 if (rbrc
!= SQLITE_DONE
) {
305 cfs_critical("rollback transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
312 static gboolean
bdb_backend_load_index(
314 memdb_tree_entry_t
*root
,
317 g_return_val_if_fail(bdb
!= NULL
, FALSE
);
318 g_return_val_if_fail(root
!= NULL
, FALSE
);
319 g_return_val_if_fail(index
!= NULL
, FALSE
);
320 g_return_val_if_fail(root
->version
== 0, FALSE
);
321 g_return_val_if_fail(g_hash_table_size(index
) == 1, FALSE
);
325 sqlite3_stmt
*stmt
= bdb
->stmt_load_all
;
327 while ((rc
= sqlite3_step(stmt
)) == SQLITE_ROW
) {
329 memdb_tree_entry_t
*te
;
331 guint64 inode
= sqlite3_column_int64(stmt
, 0);
332 const char *name
= (const char *)sqlite3_column_text(stmt
, 6);
333 int namelen
= sqlite3_column_bytes(stmt
, 6);
334 if (name
== NULL
|| namelen
== 0) {
335 cfs_critical("inode has no name (inode = %016zX)", inode
);
338 te
= g_malloc0(sizeof(memdb_tree_entry_t
) + namelen
+ 1);
339 strcpy(te
->name
, name
);
342 te
->parent
= sqlite3_column_int64(stmt
, 1);
343 te
->version
= sqlite3_column_int64(stmt
, 2);
344 te
->writer
= sqlite3_column_int64(stmt
, 3) & 0x0ffffffff;
345 te
->mtime
= sqlite3_column_int64(stmt
, 4) & 0x0ffffffff;
346 te
->type
= sqlite3_column_int64(stmt
, 5) & 255;
348 gconstpointer value
= sqlite3_column_blob(stmt
, 7);
350 int size
= sqlite3_column_bytes(stmt
, 7);
353 if (te
->type
== DT_REG
) {
355 te
->data
.value
= g_memdup(value
, size
);
356 } else if (te
->type
== DT_DIR
) {
358 cfs_critical("directory inode contains data (inode = %016zX)",
363 te
->data
.entries
= NULL
;
365 cfs_critical("inode has unknown type (inode = %016zX, type = %d)",
366 te
->inode
, te
->type
);
371 cfs_debug("name %s (inode = %016zX, parent = %016zX)",
372 te
->name
, te
->inode
, te
->parent
);
374 if (te
->inode
== 0) {
375 if (te
->name
&& !strcmp(te
->name
, VERSIONFILENAME
)) {
376 root
->version
= te
->version
;
377 root
->writer
= te
->writer
;
378 root
->mtime
= te
->mtime
;
379 memdb_tree_entry_free(te
);
381 cfs_critical("root inode has unexpected name '%s'", te
->name
);
382 memdb_tree_entry_free(te
);
386 memdb_tree_entry_t
*pte
;
388 if (!(pte
= g_hash_table_lookup(index
, &te
->parent
))) {
390 /* allocate placeholder (type == 0)
391 * this is simply replaced if we find a real inode later
393 pte
= g_malloc0(sizeof(memdb_tree_entry_t
));
394 pte
->inode
= te
->parent
;
395 pte
->data
.entries
= g_hash_table_new(g_str_hash
, g_str_equal
);
396 g_hash_table_replace(index
, &pte
->inode
, pte
);
398 } else if (pte
->type
!= DT_DIR
) {
399 cfs_critical("parent is not a directory "
400 "(inode = %016zX, parent = %016zX, name = '%s')",
401 te
->inode
, te
->parent
, te
->name
);
402 memdb_tree_entry_free(te
);
406 if (te
->type
== DT_DIR
) {
407 te
->data
.entries
= g_hash_table_new(g_str_hash
, g_str_equal
);
410 if (g_hash_table_lookup(pte
->data
.entries
, te
->name
)) {
411 cfs_critical("found entry with duplicate name "
412 "(inode = %016zX, parent = %016zX, name = '%s')",
413 te
->inode
, te
->parent
, te
->name
);
417 g_hash_table_replace(pte
->data
.entries
, te
->name
, te
);
418 g_hash_table_replace(index
, &te
->inode
, te
);
421 if (rc
!= SQLITE_DONE
) {
422 cfs_critical("select returned error: %s", sqlite3_errmsg(bdb
->db
));
426 /* now, check if all inodes have parents (there must be no placeholders) */
429 g_hash_table_iter_init (&iter
, index
);
430 while (g_hash_table_iter_next (&iter
, &key
, &value
)) {
431 memdb_tree_entry_t
*te
= (memdb_tree_entry_t
*)value
;
433 cfs_critical("missing directory inode (inode = %016zX)", te
->inode
);
445 cfs_critical("DB load failed");
450 gboolean
bdb_backend_commit_update(
452 memdb_index_t
*master
,
453 memdb_index_t
*slave
,
456 g_return_val_if_fail(memdb
!= NULL
, FALSE
);
457 g_return_val_if_fail(memdb
->bdb
!= NULL
, FALSE
);
458 g_return_val_if_fail(master
!= NULL
, FALSE
);
459 g_return_val_if_fail(slave
!= NULL
, FALSE
);
461 cfs_debug("enter bdb_backend_commit_update");
463 memdb_tree_entry_t
*root
= NULL
;
464 GHashTable
*index
= NULL
;
466 db_backend_t
*bdb
= (db_backend_t
*)memdb
->bdb
;
467 gboolean result
= FALSE
;
471 rc
= sqlite3_step(bdb
->stmt_begin
);
472 sqlite3_reset(bdb
->stmt_begin
);
473 if (rc
!= SQLITE_DONE
) {
474 cfs_critical("begin transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
478 g_mutex_lock (memdb
->mutex
);
480 /* first, delete anything not found in master index */
485 for (i
= 0; i
< master
->size
; i
++) {
486 guint64 inode
= master
->entries
[i
].inode
;
488 while (j
< slave
->size
&& (slave_inode
= slave
->entries
[j
].inode
) <= inode
) {
490 if (slave_inode
< inode
) {
491 if (bdb_backend_delete_inode(bdb
, slave_inode
) != SQLITE_OK
)
494 cfs_debug("deleted inode %016zX", slave_inode
);
498 if (j
>= slave
->size
)
502 while (j
< slave
->size
) {
503 guint64 slave_inode
= slave
->entries
[j
].inode
;
505 if (bdb_backend_delete_inode(bdb
, slave_inode
) != SQLITE_OK
)
508 cfs_debug("deleted inode %016zX", slave_inode
);
513 /* now add all updates */
517 memdb_tree_entry_t
*te
= (memdb_tree_entry_t
*)l
->data
;
519 tree_entry_debug(te
);
521 if (backend_write_inode(
522 bdb
->db
, bdb
->stmt_replace_entry
, te
->inode
, te
->parent
, te
->version
,
523 te
->writer
, te
->mtime
, te
->size
, te
->type
,
524 te
->inode
? te
->name
: VERSIONFILENAME
, te
->data
.value
) != SQLITE_OK
) {
531 /* now try to reload */
532 root
= memdb_tree_entry_new("");
533 root
->data
.entries
= g_hash_table_new(g_str_hash
, g_str_equal
);
536 index
= g_hash_table_new_full(g_int64_hash
, g_int64_equal
, NULL
,
537 (GDestroyNotify
)memdb_tree_entry_free
);
539 g_hash_table_replace(index
, &root
->inode
, root
);
541 if (!bdb_backend_load_index(bdb
, root
, index
))
544 if (!memdb
->root
->version
) {
545 cfs_critical("new index has version 0 - internal error");
549 memdb_index_t
*new_idx
= memdb_encode_index(index
, root
);
551 cfs_critical("cant encode new index - internal error");
555 int idx_equal
= (new_idx
->bytes
== master
->bytes
&&
556 (memcmp(master
, new_idx
, new_idx
->bytes
) == 0));
561 cfs_critical("new index does not match master index - internal error");
565 rc
= sqlite3_step(bdb
->stmt_commit
);
566 sqlite3_reset(bdb
->stmt_commit
);
567 if (rc
!= SQLITE_DONE
) {
568 cfs_critical("commit transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
572 g_hash_table_destroy(memdb
->index
);
573 memdb
->index
= index
;
578 record_memdb_reload();
580 if (!memdb_recreate_vmlist(memdb
)) {
581 cfs_critical("memdb_recreate_vmlist failed");
587 memdb_update_locks(memdb
);
592 g_mutex_unlock (memdb
->mutex
);
595 g_hash_table_destroy(index
);
597 cfs_debug("leave bdb_backend_commit_update (%d)", result
);
605 rc
= sqlite3_step(bdb
->stmt_rollback
);
606 sqlite3_reset(bdb
->stmt_rollback
);
607 if (rc
!= SQLITE_DONE
)
608 cfs_critical("rollback transaction failed: %s\n", sqlite3_errmsg(bdb
->db
));
615 void bdb_backend_close(db_backend_t
*bdb
)
617 g_return_if_fail(bdb
!= NULL
);
619 sqlite3_finalize(bdb
->stmt_insert_entry
);
620 sqlite3_finalize(bdb
->stmt_replace_entry
);
621 sqlite3_finalize(bdb
->stmt_update_entry
);
622 sqlite3_finalize(bdb
->stmt_delete_entry
);
623 sqlite3_finalize(bdb
->stmt_begin
);
624 sqlite3_finalize(bdb
->stmt_commit
);
625 sqlite3_finalize(bdb
->stmt_rollback
);
626 sqlite3_finalize(bdb
->stmt_load_all
);
629 if ((rc
= sqlite3_close(bdb
->db
)) != SQLITE_OK
) {
630 cfs_critical("sqlite3_close failed: %d\n", rc
);
638 db_backend_t
*bdb_backend_open(
639 const char *filename
,
640 memdb_tree_entry_t
*root
,
643 g_return_val_if_fail(filename
!= NULL
, NULL
);
644 g_return_val_if_fail(root
!= NULL
, NULL
);
645 g_return_val_if_fail(index
!= NULL
, NULL
);
647 db_backend_t
*bdb
= g_new0(db_backend_t
, 1);
648 g_return_val_if_fail(bdb
!= NULL
, NULL
);
652 sqlite3_initialize();
654 if (!(bdb
->db
= bdb_create(filename
)))
657 rc
= sqlite3_prepare_v2(bdb
->db
, sql_insert_entry
, -1, &bdb
->stmt_insert_entry
, NULL
);
658 if (rc
!= SQLITE_OK
) {
659 cfs_critical("sqlite3_prepare 'sql_insert_entry' failed: %s\n",
660 sqlite3_errmsg(bdb
->db
));
663 rc
= sqlite3_prepare_v2(bdb
->db
, sql_update_entry
, -1, &bdb
->stmt_update_entry
, NULL
);
664 if (rc
!= SQLITE_OK
) {
665 cfs_critical("sqlite3_prepare 'sql_update_entry' failed: %s\n",
666 sqlite3_errmsg(bdb
->db
));
669 rc
= sqlite3_prepare_v2(bdb
->db
, sql_replace_entry
, -1, &bdb
->stmt_replace_entry
, NULL
);
670 if (rc
!= SQLITE_OK
) {
671 cfs_critical("sqlite3_prepare 'sql_replace_entry' failed: %s\n",
672 sqlite3_errmsg(bdb
->db
));
675 rc
= sqlite3_prepare_v2(bdb
->db
, sql_delete_entry
, -1, &bdb
->stmt_delete_entry
, NULL
);
676 if (rc
!= SQLITE_OK
) {
677 cfs_critical("sqlite3_prepare 'sql_delete_entry' failed: %s\n",
678 sqlite3_errmsg(bdb
->db
));
681 rc
= sqlite3_prepare_v2(bdb
->db
, sql_begin
, -1, &bdb
->stmt_begin
, NULL
);
682 if (rc
!= SQLITE_OK
) {
683 cfs_critical("sqlite3_prepare 'sql_begin' failed: %s\n",
684 sqlite3_errmsg(bdb
->db
));
687 rc
= sqlite3_prepare_v2(bdb
->db
, sql_commit
, -1, &bdb
->stmt_commit
, NULL
);
688 if (rc
!= SQLITE_OK
) {
689 cfs_critical("sqlite3_prepare 'sql_commit' failed: %s\n",
690 sqlite3_errmsg(bdb
->db
));
693 rc
= sqlite3_prepare_v2(bdb
->db
, sql_rollback
, -1, &bdb
->stmt_rollback
, NULL
);
694 if (rc
!= SQLITE_OK
) {
695 cfs_critical("sqlite3_prepare 'sql_rollback' failed: %s\n",
696 sqlite3_errmsg(bdb
->db
));
699 rc
= sqlite3_prepare_v2(bdb
->db
, sql_load_all
, -1, &bdb
->stmt_load_all
, NULL
);
700 if (rc
!= SQLITE_OK
) {
701 cfs_critical("sqlite3_prepare 'sql_load_all' failed: %s\n",
702 sqlite3_errmsg(bdb
->db
));
706 if (!bdb_backend_load_index(bdb
, root
, index
))
709 if (!root
->version
) {
712 guint32 mtime
= time(NULL
);
714 if (bdb_backend_write(bdb
, 0, 0, root
->version
, 0, mtime
,
715 0, DT_REG
, NULL
, NULL
, 0) != SQLITE_OK
)
723 bdb_backend_close(bdb
);