1 /* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
28 #include "openvswitch/dynamic-string.h"
29 #include "openvswitch/json.h"
30 #include "openvswitch/vlog.h"
31 #include "ovs-atomic.h"
33 #include "ovs-thread.h"
34 #include "ovsdb-error.h"
36 #include "openvswitch/poll-loop.h"
39 #include "socket-util.h"
40 #include "transaction.h"
43 VLOG_DEFINE_THIS_MODULE(ovsdb_log
);
45 /* State in a log's state machine.
47 * OVSDB_LOG_READ is the initial state for a newly opened log. Log records may
48 * be read in this state only. Reaching end of file does not cause a state
49 * transition. A read error transitions to OVSDB_LOG_READ_ERROR.
51 * OVSDB_LOG_READ_ERROR prevents further reads from succeeding; they will
52 * report the same error as before. A log write transitions away to
53 * OVSDB_LOG_WRITE or OVSDB_LOG_WRITE_ERROR.
55 * OVSDB_LOG_WRITE is the state following a call to ovsdb_log_write(), when all
56 * goes well. Any state other than OVSDB_LOG_BROKEN may transition to this
57 * state. A write error transitions to OVSDB_LOG_WRITE_ERROR.
59 * OVSDB_LOG_WRITE_ERROR is the state following a write error. Further writes
60 * retry and might transition back to OVSDB_LOG_WRITE.
62 * OVSDB_LOG_BROKEN is the state following a call to ovsdb_log_replace() or
63 * ovsdb_log_replace_commit(), if it fails in a spectacular enough way that no
64 * further reads or writes can succeed. This is a terminal state.
66 enum ovsdb_log_state
{
67 OVSDB_LOG_READ
, /* Ready to read. */
68 OVSDB_LOG_READ_ERROR
, /* Read failed, see 'error' for details. */
69 OVSDB_LOG_WRITE
, /* Ready to write. */
70 OVSDB_LOG_WRITE_ERROR
, /* Write failed, see 'error' for details. */
71 OVSDB_LOG_BROKEN
, /* Disk on fire, see 'error' for details. */
75 enum ovsdb_log_state state
;
76 struct ovsdb_error
*error
;
80 char *name
; /* Absolute name of file. */
81 char *display_name
; /* For use in log messages, etc. */
83 struct lockfile
*lockfile
;
86 struct afsync
*afsync
;
89 /* Whether the OS supports renaming open files.
91 * (Making this a variable makes it easier to test both strategies on Unix-like
94 static bool rename_open_files
= false;
96 static bool rename_open_files
= true;
99 static bool parse_header(char *header
, const char **magicp
,
100 unsigned long int *length
,
101 uint8_t sha1
[SHA1_DIGEST_SIZE
]);
102 static bool is_magic_ok(const char *needle
, const char *haystack
);
104 static struct afsync
*afsync_create(int fd
, uint64_t initial_ticket
);
105 static uint64_t afsync_destroy(struct afsync
*);
107 /* Attempts to open 'name' with the specified 'open_mode'. On success, stores
108 * the new log into '*filep' and returns NULL; otherwise returns an error and
109 * stores NULL into '*filep'.
111 * 'magic' is a short text string put at the beginning of every record and used
112 * to distinguish one kind of log file from another. For a conventional OVSDB
113 * log file, use the OVSDB_MAGIC macro. To accept more than one magic string,
114 * separate them with "|", e.g. "MAGIC 1|MAGIC 2".
116 * Whether the file will be locked using lockfile_lock() depends on 'locking':
117 * use true to lock it, false not to lock it, or -1 to lock it only if
118 * 'open_mode' is a mode that allows writing.
120 * A log consists of a series of records. After opening or creating a log with
121 * this function, the client may use ovsdb_log_read() to read any existing
122 * records, one by one. The client may also use ovsdb_log_write() to write new
123 * records (if some records have not yet been read at this point, then the
124 * first write truncates them).
127 ovsdb_log_open(const char *name
, const char *magic
,
128 enum ovsdb_log_open_mode open_mode
,
129 int locking
, struct ovsdb_log
**filep
)
131 struct lockfile
*lockfile
;
132 struct ovsdb_error
*error
;
138 /* If we can create a new file, we need to know what kind of magic to
139 * use, so there must be only one kind. */
140 if (open_mode
== OVSDB_LOG_CREATE_EXCL
|| open_mode
== OVSDB_LOG_CREATE
) {
141 ovs_assert(!strchr(magic
, '|'));
146 /* Get the absolute name of the file because we might need to access it by
147 * name again later after the process has changed directory (e.g. because
148 * daemonize() chdirs to "/").
150 * We save the user-provided name of the file for use in log messages, to
151 * reduce user confusion. */
152 char *abs_name
= abs_file_name(NULL
, name
);
154 error
= ovsdb_io_error(0, "could not determine current "
155 "working directory");
159 ovs_assert(locking
== -1 || locking
== false || locking
== true);
161 locking
= open_mode
!= OVSDB_LOG_READ_ONLY
;
164 int retval
= lockfile_lock(name
, &lockfile
);
166 error
= ovsdb_io_error(retval
, "%s: failed to lock lockfile",
175 case OVSDB_LOG_READ_ONLY
:
179 case OVSDB_LOG_READ_WRITE
:
183 case OVSDB_LOG_CREATE_EXCL
:
185 if (stat(name
, &s
) == -1 && errno
== ENOENT
186 && lstat(name
, &s
) == 0 && S_ISLNK(s
.st_mode
)) {
187 /* 'name' is a dangling symlink. We want to create the file that
188 * the symlink points to, but POSIX says that open() with O_EXCL
189 * must fail with EEXIST if the named file is a symlink. So, we
190 * have to leave off O_EXCL and accept the race. */
191 flags
= O_RDWR
| O_CREAT
;
193 flags
= O_RDWR
| O_CREAT
| O_EXCL
;
196 flags
= O_RDWR
| O_CREAT
| O_EXCL
;
200 case OVSDB_LOG_CREATE
:
201 flags
= O_RDWR
| O_CREAT
;
208 flags
= flags
| O_BINARY
;
210 /* Special case for /dev/stdin to make it work even if the operating system
211 * doesn't support it under that name. */
212 if (!strcmp(name
, "/dev/stdin") && open_mode
== OVSDB_LOG_READ_ONLY
) {
213 fd
= dup(STDIN_FILENO
);
215 fd
= open(name
, flags
, 0666);
218 const char *op
= (open_mode
== OVSDB_LOG_CREATE_EXCL
? "create"
219 : open_mode
== OVSDB_LOG_CREATE
? "create or open"
221 error
= ovsdb_io_error(errno
, "%s: %s failed", name
, op
);
225 stream
= fdopen(fd
, open_mode
== OVSDB_LOG_READ_ONLY
? "rb" : "w+b");
227 error
= ovsdb_io_error(errno
, "%s: fdopen failed", name
);
232 /* Read the magic from the first log record. */
234 const char *actual_magic
;
235 if (!fgets(header
, sizeof header
, stream
)) {
236 if (ferror(stream
)) {
237 error
= ovsdb_io_error(errno
, "%s: read error", name
);
241 /* We need to be able to report what kind of file this is but we can't
242 * if it's empty and we accept more than one. */
243 if (strchr(magic
, '|')) {
244 error
= ovsdb_error(NULL
, "%s: cannot identify file type", name
);
247 actual_magic
= magic
;
249 /* It's an empty file and therefore probably a new file, so fsync()
250 * its parent directory to ensure that its directory entry is
251 * committed to disk. */
252 fsync_parent_dir(name
);
254 unsigned long int length
;
255 uint8_t sha1
[SHA1_DIGEST_SIZE
];
256 if (!parse_header(header
, &actual_magic
, &length
, sha1
)) {
257 error
= ovsdb_error(NULL
, "%s: unexpected file format", name
);
259 } else if (!is_magic_ok(actual_magic
, magic
)) {
260 error
= ovsdb_error(NULL
, "%s: cannot identify file type", name
);
265 if (fseek(stream
, 0, SEEK_SET
)) {
266 error
= ovsdb_io_error(errno
, "%s: seek failed", name
);
270 struct ovsdb_log
*file
= xmalloc(sizeof *file
);
271 file
->state
= OVSDB_LOG_READ
;
273 file
->name
= abs_name
;
274 file
->display_name
= xstrdup(name
);
275 file
->magic
= xstrdup(actual_magic
);
276 file
->lockfile
= lockfile
;
277 file
->stream
= stream
;
278 file
->prev_offset
= 0;
288 lockfile_unlock(lockfile
);
294 /* Returns true if 'needle' is one of the |-delimited words in 'haystack'. */
296 is_magic_ok(const char *needle
, const char *haystack
)
298 /* 'needle' can't be multiple words. */
299 if (strchr(needle
, '|')) {
/* Length of 'needle'; only this many leading bytes of 'haystack' are
 * compared below. */
303 size_t n
= strlen(needle
);
/* The current word matches if 'haystack' starts with 'needle' and the byte
 * right after it is either '|' or the terminating null (strchr() treats the
 * terminator of "|" as part of the string, so it matches '\0' too). */
305 if (!strncmp(needle
, haystack
, n
) && strchr("|", haystack
[n
])) {
/* Look for the next '|' separator in 'haystack'. */
308 haystack
= strchr(haystack
, '|');
/* Releases resources held by 'file', including its saved error, its 'afsync'
 * state, its display name, its stdio stream, and its lock file. */
317 ovsdb_log_close(struct ovsdb_log
*file
)
320 ovsdb_error_destroy(file
->error
);
/* Shut down the fsync helper state that belongs to this log. */
321 afsync_destroy(file
->afsync
);
323 free(file
->display_name
);
326 fclose(file
->stream
);
/* Give up the lock taken on the log file. */
328 lockfile_unlock(file
->lockfile
);
/* NOTE(review): presumably returns the magic string recorded for 'log' (the
 * 'magic' member set by ovsdb_log_open()) -- confirm against the body. */
334 ovsdb_log_get_magic(const struct ovsdb_log
*log
)
339 /* Attempts to parse 'header' as a header line for an OVSDB log record (as
340 * described in ovsdb(5)). Stores a pointer to the magic string in '*magicp',
341 * the length in *length, and the parsed sha1 value in sha1[].
343 * Modifies 'header' and points '*magicp' inside it.
345 * Returns true if successful, false on failure. */
347 parse_header(char *header
, const char **magicp
,
348 unsigned long int *length
, uint8_t sha1
[SHA1_DIGEST_SIZE
])
350 /* 'header' must consist of "OVSDB "... */
351 const char lead
[] = "OVSDB ";
352 if (strncmp(lead
, header
, strlen(lead
))) {
356 /* ...followed by a magic string... */
357 char *magic
= header
+ strlen(lead
);
/* strcspn() stops at the first space or at the string terminator; the magic
 * is accepted only if it stopped at a space. */
358 size_t magic_len
= strcspn(magic
, " ");
359 if (magic
[magic_len
] != ' ') {
/* Null-terminate the magic in place.  This is the modification of 'header'
 * that the function comment mentions. */
362 magic
[magic_len
] = '\0';
365 /* ...followed by a length in bytes... */
367 *length
= strtoul(magic
+ magic_len
+ 1, &p
, 10);
/* Reject a zero length, ULONG_MAX (which strtoul() also returns on
 * overflow), and a length that is not followed by a space. */
368 if (!*length
|| *length
== ULONG_MAX
|| *p
!= ' ') {
373 /* ...followed by a SHA-1 hash... */
374 if (!sha1_from_hex(sha1
, p
)) {
/* Advance past the hexadecimal digits of the hash. */
377 p
+= SHA1_HEX_DIGEST_LEN
;
379 /* ...and ended by a new-line. */
/* Reads record data of 'length' bytes from the stream of 'file', feeding the
 * bytes both to a SHA-1 context and to a JSON parser.  Stores the resulting
 * digest in 'sha1' and the parser's result in '*jsonp'.
 *
 * If fread() returns fewer bytes than requested, aborts the parser and returns
 * an I/O error.  'offset' is used in the visible code only to build that error
 * message. */
387 static struct ovsdb_error
*
388 parse_body(struct ovsdb_log
*file
, off_t offset
, unsigned long int length
,
389 uint8_t sha1
[SHA1_DIGEST_SIZE
], struct json
**jsonp
)
391 struct json_parser
*parser
;
395 parser
= json_parser_create(JSPF_TRAILER
);
/* Never read more than fits in 'input' at one time. */
401 chunk
= MIN(length
, sizeof input
);
402 if (fread(input
, 1, chunk
, file
->stream
) != chunk
) {
403 json_parser_abort(parser
);
/* Report errno for a stream error and EOF for a premature end of file. */
404 return ovsdb_io_error(ferror(file
->stream
) ? errno
: EOF
,
405 "%s: error reading %lu bytes "
406 "starting at offset %lld",
407 file
->display_name
, length
,
408 (long long int) offset
);
/* The same bytes go to the hash and to the JSON parser. */
410 sha1_update(&ctx
, input
, chunk
);
411 json_parser_feed(parser
, input
, chunk
);
415 sha1_final(&ctx
, sha1
);
416 *jsonp
= json_parser_finish(parser
);
420 /* Attempts to read a log record from 'file'.
422 * If successful, returns NULL and stores in '*jsonp' the JSON object that the
423 * record contains. The caller owns the data and must eventually free it (with
426 * If a read error occurs, returns the error and stores NULL in '*jsonp'.
428 * If the read reaches end of file, returns NULL and stores NULL in
431 ovsdb_log_read(struct ovsdb_log
*file
, struct json
**jsonp
)
434 switch (file
->state
) {
438 case OVSDB_LOG_READ_ERROR
:
439 case OVSDB_LOG_WRITE_ERROR
:
440 case OVSDB_LOG_BROKEN
:
441 return ovsdb_error_clone(file
->error
);
443 case OVSDB_LOG_WRITE
:
447 uint8_t expected_sha1
[SHA1_DIGEST_SIZE
];
448 uint8_t actual_sha1
[SHA1_DIGEST_SIZE
];
449 struct ovsdb_error
*error
;
450 unsigned long data_length
;
456 if (!fgets(header
, sizeof header
, file
->stream
)) {
457 if (feof(file
->stream
)) {
460 error
= ovsdb_io_error(errno
, "%s: read failed", file
->display_name
);
463 off_t data_offset
= file
->offset
+ strlen(header
);
466 if (!parse_header(header
, &magic
, &data_length
, expected_sha1
)
467 || strcmp(magic
, file
->magic
)) {
468 error
= ovsdb_syntax_error(NULL
, NULL
, "%s: parse error at offset "
469 "%lld in header line \"%.*s\"",
471 (long long int) file
->offset
,
472 (int) strcspn(header
, "\n"), header
);
476 error
= parse_body(file
, data_offset
, data_length
, actual_sha1
, &json
);
481 if (memcmp(expected_sha1
, actual_sha1
, SHA1_DIGEST_SIZE
)) {
482 error
= ovsdb_syntax_error(NULL
, NULL
, "%s: %lu bytes starting at "
483 "offset %lld have SHA-1 hash "SHA1_FMT
" "
484 "but should have hash "SHA1_FMT
,
485 file
->display_name
, data_length
,
486 (long long int) data_offset
,
487 SHA1_ARGS(actual_sha1
),
488 SHA1_ARGS(expected_sha1
));
492 if (json
->type
== JSON_STRING
) {
493 error
= ovsdb_syntax_error(NULL
, NULL
, "%s: %lu bytes starting at "
494 "offset %lld are not valid JSON (%s)",
495 file
->display_name
, data_length
,
496 (long long int) data_offset
,
500 if (json
->type
!= JSON_OBJECT
) {
501 error
= ovsdb_syntax_error(NULL
, NULL
, "%s: %lu bytes starting at "
502 "offset %lld are not a JSON object",
503 file
->display_name
, data_length
,
504 (long long int) data_offset
);
508 file
->prev_offset
= file
->offset
;
509 file
->offset
= data_offset
+ data_length
;
514 file
->state
= OVSDB_LOG_READ_ERROR
;
515 file
->error
= ovsdb_error_clone(error
);
520 /* Causes the log record read by the previous call to ovsdb_log_read() to be
521 * effectively discarded. The next call to ovsdb_log_write() will overwrite
522 * that previously read record.
524 * Calling this function more than once has no additional effect.
526 * This function is useful when ovsdb_log_read() successfully reads a record
527 * but that record does not make sense at a higher level (e.g. it specifies an
528 * invalid transaction). */
530 ovsdb_log_unread(struct ovsdb_log
*file
)
/* Only allowed while the log is still in the read state. */
532 ovs_assert(file
->state
== OVSDB_LOG_READ
);
/* Move the log's position back to where the previously read record began
 * (ovsdb_log_read() saves that position in 'prev_offset'). */
533 file
->offset
= file
->prev_offset
;
/* Switches 'file' to the OVSDB_LOG_WRITE state, positions its stream at
 * 'file->offset', and truncates the underlying file to that length, which
 * drops anything stored beyond the current offset.  A failed seek or truncate
 * is recorded in 'error' as an I/O error; 'error' stays NULL otherwise. */
536 static struct ovsdb_error
*
537 ovsdb_log_truncate(struct ovsdb_log
*file
)
539 file
->state
= OVSDB_LOG_WRITE
;
541 struct ovsdb_error
*error
= NULL
;
/* Seek first; the truncation is attempted only if the seek succeeds. */
542 if (fseeko(file
->stream
, file
->offset
, SEEK_SET
)) {
543 error
= ovsdb_io_error(errno
, "%s: cannot seek to offset %lld",
545 (long long int) file
->offset
);
546 } else if (ftruncate(fileno(file
->stream
), file
->offset
)) {
547 error
= ovsdb_io_error(errno
, "%s: cannot truncate to length %lld",
549 (long long int) file
->offset
);
554 /* Composes a log record for 'json' by filling 'header' with a header line and
555 * 'data' with a data line (each ending with a new-line). To write the record
556 * to a file, write 'header' followed by 'data'.
558 * 'magic' is the magic to use in the header record, e.g. OVSDB_MAGIC.
560 * The caller must initialize 'header' and 'data' to empty strings. */
562 ovsdb_log_compose_record(const struct json
*json
,
563 const char *magic
, struct ds
*header
, struct ds
*data
)
/* Only JSON objects and arrays may be logged, and both output strings must
 * start out empty. */
565 ovs_assert(json
->type
== JSON_OBJECT
|| json
->type
== JSON_ARRAY
);
566 ovs_assert(!header
->length
);
567 ovs_assert(!data
->length
);
569 /* Compose content. */
570 json_to_ds(json
, 0, data
);
571 ds_put_char(data
, '\n');
573 /* Compose header. */
574 uint8_t sha1
[SHA1_DIGEST_SIZE
];
/* The hash covers the whole data line, including its trailing new-line. */
575 sha1_bytes(data
->string
, data
->length
, sha1
);
/* The length written in the header likewise counts the trailing new-line. */
576 ds_put_format(header
, "OVSDB %s %"PRIuSIZE
" "SHA1_FMT
"\n",
577 magic
, data
->length
, SHA1_ARGS(sha1
));
580 /* Writes log record 'json' to 'file'. Returns NULL if successful or an error
581 * (which the caller must eventually destroy) on failure.
583 * If the log contains some records that have not yet been read, then calling
584 * this function truncates them.
586 * Log writes are atomic. A client may use ovsdb_log_commit() to ensure that
590 ovsdb_log_write(struct ovsdb_log
*file
, const struct json
*json
)
592 switch (file
->state
) {
593 case OVSDB_LOG_WRITE
:
597 case OVSDB_LOG_READ_ERROR
:
598 case OVSDB_LOG_WRITE_ERROR
:
599 ovsdb_error_destroy(file
->error
);
600 file
->error
= ovsdb_log_truncate(file
);
602 file
->state
= OVSDB_LOG_WRITE_ERROR
;
603 return ovsdb_error_clone(file
->error
);
605 file
->state
= OVSDB_LOG_WRITE
;
608 case OVSDB_LOG_BROKEN
:
609 return ovsdb_error_clone(file
->error
);
612 if (json
->type
!= JSON_OBJECT
&& json
->type
!= JSON_ARRAY
) {
613 return OVSDB_BUG("bad JSON type");
616 struct ds header
= DS_EMPTY_INITIALIZER
;
617 struct ds data
= DS_EMPTY_INITIALIZER
;
618 ovsdb_log_compose_record(json
, file
->magic
, &header
, &data
);
619 size_t total_length
= header
.length
+ data
.length
;
622 bool ok
= (fwrite(header
.string
, header
.length
, 1, file
->stream
) == 1
623 && fwrite(data
.string
, data
.length
, 1, file
->stream
) == 1
624 && fflush(file
->stream
) == 0);
630 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 5);
631 VLOG_WARN_RL(&rl
, "%s: write failed (%s)",
632 file
->name
, ovs_strerror(error
));
634 /* Remove any partially written data, ignoring errors since there is
635 * nothing further we can do. */
636 ignore(ftruncate(fileno(file
->stream
), file
->offset
));
638 file
->error
= ovsdb_io_error(error
, "%s: write failed",
640 file
->state
= OVSDB_LOG_WRITE_ERROR
;
641 return ovsdb_error_clone(file
->error
);
644 file
->offset
+= total_length
;
/* Writes 'json' to 'log' with ovsdb_log_write().
 *
 * NOTE(review): the name suggests that 'json' is also destroyed here and that
 * the error from ovsdb_log_write() is returned -- confirm against the body. */
648 struct ovsdb_error
* OVS_WARN_UNUSED_RESULT
649 ovsdb_log_write_and_free(struct ovsdb_log
*log
, struct json
*json
)
651 struct ovsdb_error
*error
= ovsdb_log_write(log
, json
);
656 /* Attempts to commit 'file' to disk. Waits for the commit to succeed or fail.
657 * Returns NULL if successful, otherwise the error that occurred. */
659 ovsdb_log_commit_block(struct ovsdb_log
*file
)
/* fsync() is attempted only when the log has a stream. */
661 if (file
->stream
&& fsync(fileno(file
->stream
))) {
662 return ovsdb_io_error(errno
, "%s: fsync failed", file
->display_name
);
667 /* Sets the current position in 'log' as the "base", that is, the initial size
668 * of the log that ovsdb_log_grew_lots() uses to determine whether the log has
669 * grown enough to make compacting worthwhile. */
671 ovsdb_log_mark_base(struct ovsdb_log
*log
)
/* 'offset' is the log's current position; remember it as the base. */
673 log
->base
= log
->offset
;
676 /* Returns true if 'log' has grown enough above the base that it's worthwhile
677 * to compact it, false otherwise. */
679 ovsdb_log_grew_lots(const struct ovsdb_log
*log
)
/* Both conditions must hold: the log's offset exceeds 10 MiB, and half of
 * the offset is greater than the base (the log has more than doubled). */
681 return log
->offset
> 10 * 1024 * 1024 && log
->offset
/ 2 > log
->base
;
684 /* Attempts to atomically replace the contents of 'log', on disk, by the 'n'
685 * entries in 'entries'. If successful, returns NULL, otherwise returns an
686 * error (which the caller must eventually free).
688 * If successful, 'log' will be in write mode at the end of the log. */
689 struct ovsdb_error
* OVS_WARN_UNUSED_RESULT
690 ovsdb_log_replace(struct ovsdb_log
*log
, struct json
**entries
, size_t n
)
692 struct ovsdb_error
*error
;
693 struct ovsdb_log
*new;
/* Open the temporary replacement log. */
695 error
= ovsdb_log_replace_start(log
, &new);
/* Write every entry, in order, to the replacement log. */
700 for (size_t i
= 0; i
< n
; i
++) {
701 error
= ovsdb_log_write(new, entries
[i
]);
/* Discard the replacement log.  NOTE(review): presumably reached only when
 * the write above failed -- confirm. */
703 ovsdb_log_replace_abort(new);
/* The freshly written contents become the base that ovsdb_log_grew_lots()
 * measures growth against. */
707 ovsdb_log_mark_base(new);
/* Put the replacement in place of 'log'. */
709 return ovsdb_log_replace_commit(log
, new);
/* Begins replacing the contents of 'old'.  Removes any existing temporary
 * file named after the symlink-dereferenced log file with ".tmp" appended,
 * then creates a new one with ovsdb_log_open() in OVSDB_LOG_CREATE_EXCL mode,
 * using the same magic as 'old'.
 *
 * NOTE(review): presumably the new log is handed back through '*newp' and an
 * error is returned on failure -- confirm against the full body. */
712 struct ovsdb_error
* OVS_WARN_UNUSED_RESULT
713 ovsdb_log_replace_start(struct ovsdb_log
*old
,
714 struct ovsdb_log
**newp
)
716 /* If old->name is a symlink, then we want the new file to be in the same
717 * directory as the symlink's referent. */
718 char *deref_name
= follow_symlinks(old
->name
);
719 char *tmp_name
= xasprintf("%s.tmp", deref_name
);
722 struct ovsdb_error
*error
;
/* 'old' must have been opened with a lock file. */
724 ovs_assert(old
->lockfile
);
726 /* Remove temporary file. (It might not exist.) */
727 if (unlink(tmp_name
) < 0 && errno
!= ENOENT
) {
728 error
= ovsdb_io_error(errno
, "failed to remove %s", tmp_name
);
734 /* Create temporary file. */
735 error
= ovsdb_log_open(tmp_name
, old
->magic
, OVSDB_LOG_CREATE_EXCL
,
741 /* Rename 'old' to 'new', replacing 'new' if it exists. Returns NULL if
742 * successful, otherwise an ovsdb_error that the caller must destroy. */
743 static struct ovsdb_error
* OVS_WARN_UNUSED_RESULT
744 ovsdb_rename(const char *old
, const char *new)
747 /* Avoid rename() here because it fails if the destination exists; use
 * MoveFileEx() with MOVEFILE_REPLACE_EXISTING instead. */
748 int error
= (MoveFileEx(old
, new, MOVEFILE_REPLACE_EXISTING
749 | MOVEFILE_WRITE_THROUGH
| MOVEFILE_COPY_ALLOWED
)
/* Alternative definition of 'error' based on rename(): errno on failure, 0
 * on success.  NOTE(review): the two definitions are presumably selected by
 * a platform conditional -- confirm. */
752 int error
= rename(old
, new) ? errno
: 0;
756 ? ovsdb_io_error(error
, "failed to rename \"%s\" to \"%s\"",
761 struct ovsdb_error
* OVS_WARN_UNUSED_RESULT
762 ovsdb_log_replace_commit(struct ovsdb_log
*old
, struct ovsdb_log
*new)
764 struct ovsdb_error
*error
= ovsdb_log_commit_block(new);
766 ovsdb_log_replace_abort(new);
770 /* Replace original file by the temporary file.
772 * We support two strategies:
774 * - The preferred strategy is to rename the temporary file over the
775 * original one in-place, then close the original one. This works on
776 * Unix-like systems. It does not work on Windows, which does not
777 * allow open files to be renamed. The approach has the advantage
778 * that, at any point, we can drop back to something that already
781 * - Alternatively, we can close both files, rename, then open the new
782 * file (which now has the original name). This works on all
783 * systems, but if reopening the file fails then 'old' is broken.
785 * We make the strategy a variable instead of an #ifdef to make it easier
786 * to test both strategies on Unix-like systems, and to make the code
788 if (!rename_open_files
) {
796 /* Rename 'new' over 'old'. We dereference the old name because, if it is a
797 * symlink, we want to replace the referent of the symlink instead of the
799 char *deref_name
= follow_symlinks(old
->name
);
800 error
= ovsdb_rename(new->name
, deref_name
);
804 ovsdb_log_replace_abort(new);
807 if (rename_open_files
) {
808 fsync_parent_dir(old
->name
);
810 old
->stream
= new->stream
;
813 old
->stream
= fopen(old
->name
, "r+b");
815 old
->error
= ovsdb_io_error(errno
, "%s: could not reopen log",
817 old
->state
= OVSDB_LOG_BROKEN
;
818 return ovsdb_error_clone(old
->error
);
821 if (fseek(old
->stream
, new->offset
, SEEK_SET
)) {
822 old
->error
= ovsdb_io_error(errno
, "%s: seek failed", old
->name
);
823 old
->state
= OVSDB_LOG_BROKEN
;
824 return ovsdb_error_clone(old
->error
);
828 /* Replace 'old' by 'new' in memory.
830 * 'old' transitions to OVSDB_LOG_WRITE (it was probably in that mode
832 old
->state
= OVSDB_LOG_WRITE
;
833 ovsdb_error_destroy(old
->error
);
835 /* prev_offset only matters for OVSDB_LOG_READ. */
837 uint64_t ticket
= afsync_destroy(old
->afsync
);
838 old
->afsync
= afsync_create(fileno(old
->stream
), ticket
+ 1);
840 old
->offset
= new->offset
;
841 /* Keep old->name. */
843 old
->magic
= new->magic
;
845 /* Keep old->lockfile. */
846 old
->base
= new->base
;
849 ovsdb_log_close(new);
/* Abandons the replacement log 'new': closes it and, as the comment in the
 * body explains, unlinks its file afterward. */
855 ovsdb_log_replace_abort(struct ovsdb_log
*new)
858 /* Unlink the new file, but only after we close it (because Windows
859 * does not allow removing an open file). */
/* Copy the name before closing 'new' so that it is still available
 * afterward. */
860 char *name
= xstrdup(new->name
);
861 ovsdb_log_close(new);
/* Clears 'rename_open_files', so that ovsdb_log_replace_commit() takes the
 * branches guarded by !rename_open_files from then on. */
868 ovsdb_log_disable_renaming_open_files(void)
870 rename_open_files
= false;
/* Commit tickets.  ovsdb_log_commit_start() increments 'next';
 * afsync_destroy() stores UINT64_MAX in it before joining the thread.
 * afsync_thread() stores its progress in 'cur', which
 * ovsdb_log_commit_progress() reads. */
875 atomic_uint64_t cur
, next
;
/* 'request' is changed to wake afsync_thread(); the thread changes 'complete'
 * after updating 'cur'. */
876 struct seq
*request
, *complete
;
/* Thread function started by afsync_create(); 'afsync_' is the 'struct afsync'
 * to serve.  It reads the requested ticket from 'next', calls fsync() on the
 * file descriptor when the tickets differ, publishes progress through 'cur'
 * and the 'complete' seq, and waits on the 'request' seq for more work. */
881 afsync_thread(void *afsync_
)
883 struct afsync
*afsync
= afsync_
;
886 ovsrcu_quiesce_start();
/* Sample the request seq before reading 'next'; this is the value later
 * handed to seq_wait(). */
888 uint64_t request_seq
= seq_read(afsync
->request
);
891 atomic_read_explicit(&afsync
->next
, &next
, memory_order_acquire
);
/* afsync_destroy() stores UINT64_MAX in 'next' before joining this thread. */
892 if (next
== UINT64_MAX
) {
/* Sync only when the tickets differ and there is a real file descriptor. */
896 if (cur
!= next
&& afsync
->fd
!= -1) {
897 int error
= fsync(afsync
->fd
) ? errno
: 0;
/* Publish the new progress, then notify waiters on 'complete'. */
900 atomic_store_explicit(&afsync
->cur
, cur
, memory_order_release
);
901 seq_change(afsync
->complete
);
903 VLOG_WARN("fsync failed (%s)", ovs_strerror(error
));
907 seq_wait(afsync
->request
, request_seq
);
/* Allocates a zeroed 'struct afsync', starts both ticket counters at
 * 'initial_ticket', creates the 'request' and 'complete' seqs, and starts a
 * "log_fsync" thread that runs afsync_thread() on the new structure. */
913 static struct afsync
*
914 afsync_create(int fd
, uint64_t initial_ticket
)
916 struct afsync
*afsync
= xzalloc(sizeof *afsync
);
/* 'cur' == 'next' initially, so no commit is outstanding. */
917 atomic_init(&afsync
->cur
, initial_ticket
);
918 atomic_init(&afsync
->next
, initial_ticket
);
919 afsync
->request
= seq_create();
920 afsync
->complete
= seq_create();
921 afsync
->thread
= ovs_thread_create("log_fsync", afsync_thread
, afsync
);
/* Shuts down 'afsync': reads the current value of 'next', stores UINT64_MAX
 * there, changes the 'request' seq to wake the thread, joins the thread, and
 * destroys both seqs.
 *
 * NOTE(review): presumably returns the ticket read from 'next';
 * ovsdb_log_replace_commit() passes the return value plus one to
 * afsync_create() -- confirm against the full body. */
927 afsync_destroy(struct afsync
*afsync
)
934 atomic_read(&afsync
->next
, &next
);
/* afsync_thread() compares 'next' against UINT64_MAX. */
935 atomic_store(&afsync
->next
, UINT64_MAX
);
936 seq_change(afsync
->request
);
937 xpthread_join(afsync
->thread
, NULL
);
/* The seqs are destroyed only after the thread has been joined. */
939 seq_destroy(afsync
->request
);
940 seq_destroy(afsync
->complete
);
/* Creates 'log->afsync' with an initial ticket of 0, passing the file
 * descriptor of the log's stream, or -1 if the log has no stream.
 *
 * NOTE(review): presumably the creation happens only when 'log->afsync' is
 * still null and the afsync is then returned -- confirm against the body. */
947 static struct afsync
*
948 ovsdb_log_get_afsync(struct ovsdb_log
*log
)
951 log
->afsync
= afsync_create(log
->stream
? fileno(log
->stream
) : -1, 0);
956 /* Starts committing 'log' to disk. Returns a ticket that can be passed to
957 * ovsdb_log_commit_wait() or compared against the return value of
958 * ovsdb_log_commit_progress() later. */
960 ovsdb_log_commit_start(struct ovsdb_log
*log
)
962 struct afsync
*afsync
= ovsdb_log_get_afsync(log
);
/* Atomically add 1 to 'next', saving its previous value in 'orig'. */
965 atomic_add_explicit(&afsync
->next
, 1, &orig
, memory_order_acq_rel
);
/* Wake afsync_thread(), which waits on the 'request' seq. */
967 seq_change(afsync
->request
);
972 /* Returns a ticket value that represents the current progress of commits to
973 * 'log'. Suppose that some call to ovsdb_log_commit_start() returns X and any
974 * call to ovsdb_log_commit_progress() returns Y, for the same 'log'. Then
975 * commit X is complete if and only if X <= Y. */
977 ovsdb_log_commit_progress(struct ovsdb_log
*log
)
979 struct afsync
*afsync
= ovsdb_log_get_afsync(log
);
/* afsync_thread() stores 'cur' with release ordering; read it with the
 * matching acquire ordering. */
981 atomic_read_explicit(&afsync
->cur
, &cur
, memory_order_acquire
);
985 /* Causes poll_block() to wake up if and when ovsdb_log_commit_progress(log)
986 * would return at least 'goal'. */
988 ovsdb_log_commit_wait(struct ovsdb_log
*log
, uint64_t goal
)
990 struct afsync
*afsync
= ovsdb_log_get_afsync(log
);
/* Sample the 'complete' seq before reading the progress; seq_wait() below is
 * given this earlier value. */
991 uint64_t complete
= seq_read(afsync
->complete
);
992 uint64_t cur
= ovsdb_log_commit_progress(log
);
/* NOTE(review): presumably waits on 'complete' while 'cur' is below 'goal'
 * and otherwise wakes immediately -- confirm the condition that selects
 * between the two calls below. */
994 seq_wait(afsync
->complete
, complete
);
996 poll_immediate_wake();