]> git.proxmox.com Git - mirror_ovs.git/blame - ovsdb/log.c
Eliminate "whitelist" and "blacklist" terms.
[mirror_ovs.git] / ovsdb / log.c
CommitLineData
19b276cb 1/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc.
f85f8ebb
BP
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include <config.h>
17
41709ccc 18#include "log.h"
f85f8ebb 19
f85f8ebb
BP
20#include <errno.h>
21#include <fcntl.h>
22#include <stdlib.h>
23#include <string.h>
3762274e 24#include <sys/stat.h>
f85f8ebb
BP
25#include <unistd.h>
26
f70b61d3 27#include "lockfile.h"
c9167341 28#include "openvswitch/dynamic-string.h"
ee89ea7b 29#include "openvswitch/json.h"
c7007aa7 30#include "openvswitch/vlog.h"
f70b61d3
BP
31#include "ovs-atomic.h"
32#include "ovs-rcu.h"
33#include "ovs-thread.h"
f85f8ebb 34#include "ovsdb-error.h"
f70b61d3
BP
35#include "ovsdb.h"
36#include "openvswitch/poll-loop.h"
37#include "seq.h"
f85f8ebb 38#include "sha1.h"
8e71cf88 39#include "socket-util.h"
41709ccc 40#include "transaction.h"
f85f8ebb 41#include "util.h"
5136ce49 42
c7007aa7
BP
43VLOG_DEFINE_THIS_MODULE(ovsdb_log);
44
421fc8a1
BP
/* The states that a log moves through.
 *
 *   - OVSDB_LOG_READ: where every newly opened log starts.  Only reading is
 *     possible here.  Hitting end of file leaves the state alone; a read
 *     error moves the log to OVSDB_LOG_READ_ERROR.
 *
 *   - OVSDB_LOG_READ_ERROR: every later read fails with the same error that
 *     got us here.  Writing leaves this state for OVSDB_LOG_WRITE or
 *     OVSDB_LOG_WRITE_ERROR.
 *
 *   - OVSDB_LOG_WRITE: entered after a successful ovsdb_log_write(), from
 *     any state except OVSDB_LOG_BROKEN.  A write error moves the log to
 *     OVSDB_LOG_WRITE_ERROR.
 *
 *   - OVSDB_LOG_WRITE_ERROR: entered after a failed write.  Later writes are
 *     retried and may bring the log back to OVSDB_LOG_WRITE.
 *
 *   - OVSDB_LOG_BROKEN: entered when ovsdb_log_replace() or
 *     ovsdb_log_replace_commit() fails so badly that no read or write can
 *     ever succeed again.  Nothing leaves this state.
 */
enum ovsdb_log_state {
    OVSDB_LOG_READ,             /* Records may be read. */
    OVSDB_LOG_READ_ERROR,       /* A read failed; 'error' says why. */
    OVSDB_LOG_WRITE,            /* Records may be written. */
    OVSDB_LOG_WRITE_ERROR,      /* A write failed; 'error' says why. */
    OVSDB_LOG_BROKEN,           /* Unusable for good; 'error' says why. */
};
73
41709ccc 74struct ovsdb_log {
421fc8a1
BP
75 enum ovsdb_log_state state;
76 struct ovsdb_error *error;
77
43675e26 78 off_t prev_offset;
f85f8ebb 79 off_t offset;
02f4f231
BP
80 char *name; /* Absolute name of file. */
81 char *display_name; /* For use in log messages, etc. */
19b276cb 82 char *magic;
f85f8ebb
BP
83 struct lockfile *lockfile;
84 FILE *stream;
4cc9d1f0 85 off_t base;
f70b61d3 86 struct afsync *afsync;
f85f8ebb
BP
87};
88
421fc8a1
BP
/* True if the OS lets us rename a file while it is open.
 *
 * (This is a variable rather than a compile-time constant so that both
 * strategies can be exercised on Unix-like systems.) */
static bool rename_open_files =
#ifdef _WIN32
    false;
#else
    true;
#endif
98
fed27759
BP
99static bool parse_header(char *header, const char **magicp,
100 unsigned long int *length,
101 uint8_t sha1[SHA1_DIGEST_SIZE]);
102static bool is_magic_ok(const char *needle, const char *haystack);
103
f70b61d3
BP
104static struct afsync *afsync_create(int fd, uint64_t initial_ticket);
105static uint64_t afsync_destroy(struct afsync *);
106
7446f148
BP
107/* Attempts to open 'name' with the specified 'open_mode'. On success, stores
108 * the new log into '*filep' and returns NULL; otherwise returns NULL and
109 * stores NULL into '*filep'.
110 *
19b276cb
BP
111 * 'magic' is a short text string put at the beginning of every record and used
112 * to distinguish one kind of log file from another. For a conventional OVSDB
fed27759
BP
113 * log file, use the OVSDB_MAGIC macro. To accept more than one magic string,
114 * separate them with "|", e.g. "MAGIC 1|MAGIC 2".
19b276cb 115 *
7446f148
BP
116 * Whether the file will be locked using lockfile_lock() depends on 'locking':
117 * use true to lock it, false not to lock it, or -1 to lock it only if
118 * 'open_mode' is a mode that allows writing.
4cc9d1f0
BP
119 *
120 * A log consists of a series of records. After opening or creating a log with
121 * this function, the client may use ovsdb_log_read() to read any existing
122 * records, one by one. The client may also use ovsdb_log_write() to write new
123 * records (if some records have not yet been read at this point, then the
124 * first write truncates them).
7446f148 125 */
f85f8ebb 126struct ovsdb_error *
19b276cb
BP
127ovsdb_log_open(const char *name, const char *magic,
128 enum ovsdb_log_open_mode open_mode,
7446f148 129 int locking, struct ovsdb_log **filep)
f85f8ebb
BP
130{
131 struct lockfile *lockfile;
132 struct ovsdb_error *error;
f85f8ebb
BP
133 struct stat s;
134 FILE *stream;
7446f148 135 int flags;
f85f8ebb
BP
136 int fd;
137
fed27759
BP
138 /* If we can create a new file, we need to know what kind of magic to
139 * use, so there must be only one kind. */
140 if (open_mode == OVSDB_LOG_CREATE_EXCL || open_mode == OVSDB_LOG_CREATE) {
141 ovs_assert(!strchr(magic, '|'));
142 }
02f4f231 143
f85f8ebb
BP
144 *filep = NULL;
145
02f4f231
BP
146 /* Get the absolute name of the file because we might need to access it by
147 * name again later after the process has changed directory (e.g. because
148 * daemonize() chdirs to "/").
149 *
150 * We save the user-provided name of the file for use in log messages, to
151 * reduce user confusion. */
152 char *abs_name = abs_file_name(NULL, name);
153 if (!abs_name) {
154 error = ovsdb_io_error(0, "could not determine current "
155 "working directory");
156 goto error;
157 }
158
cb22974d 159 ovs_assert(locking == -1 || locking == false || locking == true);
7446f148
BP
160 if (locking < 0) {
161 locking = open_mode != OVSDB_LOG_READ_ONLY;
162 }
163 if (locking) {
4770e795 164 int retval = lockfile_lock(name, &lockfile);
f85f8ebb
BP
165 if (retval) {
166 error = ovsdb_io_error(retval, "%s: failed to lock lockfile",
167 name);
168 goto error;
169 }
170 } else {
171 lockfile = NULL;
172 }
173
1e0b7e94
BP
174 switch (open_mode) {
175 case OVSDB_LOG_READ_ONLY:
7446f148 176 flags = O_RDONLY;
1e0b7e94
BP
177 break;
178
179 case OVSDB_LOG_READ_WRITE:
7446f148 180 flags = O_RDWR;
1e0b7e94
BP
181 break;
182
183 case OVSDB_LOG_CREATE_EXCL:
7470e8e6 184#ifndef _WIN32
01ca539f
BP
185 if (stat(name, &s) == -1 && errno == ENOENT
186 && lstat(name, &s) == 0 && S_ISLNK(s.st_mode)) {
187 /* 'name' is a dangling symlink. We want to create the file that
188 * the symlink points to, but POSIX says that open() with O_EXCL
189 * must fail with EEXIST if the named file is a symlink. So, we
190 * have to leave off O_EXCL and accept the race. */
191 flags = O_RDWR | O_CREAT;
192 } else {
193 flags = O_RDWR | O_CREAT | O_EXCL;
194 }
7470e8e6
GS
195#else
196 flags = O_RDWR | O_CREAT | O_EXCL;
197#endif
1e0b7e94
BP
198 break;
199
200 case OVSDB_LOG_CREATE:
201 flags = O_RDWR | O_CREAT;
202 break;
203
204 default:
428b2edd 205 OVS_NOT_REACHED();
7446f148 206 }
3e94784c
GS
207#ifdef _WIN32
208 flags = flags | O_BINARY;
209#endif
71e4030f
BP
210 /* Special case for /dev/stdin to make it work even if the operating system
211 * doesn't support it under that name. */
212 if (!strcmp(name, "/dev/stdin") && open_mode == OVSDB_LOG_READ_ONLY) {
213 fd = dup(STDIN_FILENO);
214 } else {
215 fd = open(name, flags, 0666);
216 }
f85f8ebb 217 if (fd < 0) {
1e0b7e94
BP
218 const char *op = (open_mode == OVSDB_LOG_CREATE_EXCL ? "create"
219 : open_mode == OVSDB_LOG_CREATE ? "create or open"
220 : "open");
a6be657b 221 error = ovsdb_io_error(errno, "%s: %s failed", name, op);
f85f8ebb
BP
222 goto error_unlock;
223 }
224
7446f148 225 stream = fdopen(fd, open_mode == OVSDB_LOG_READ_ONLY ? "rb" : "w+b");
f85f8ebb
BP
226 if (!stream) {
227 error = ovsdb_io_error(errno, "%s: fdopen failed", name);
fed27759
BP
228 close(fd);
229 goto error_unlock;
230 }
231
232 /* Read the magic from the first log record. */
233 char header[128];
234 const char *actual_magic;
235 if (!fgets(header, sizeof header, stream)) {
236 if (ferror(stream)) {
237 error = ovsdb_io_error(errno, "%s: read error", name);
238 goto error_fclose;
239 }
240
241 /* We need to be able to report what kind of file this is but we can't
242 * if it's empty and we accept more than one. */
243 if (strchr(magic, '|')) {
244 error = ovsdb_error(NULL, "%s: cannot identify file type", name);
245 goto error_fclose;
246 }
247 actual_magic = magic;
248
249 /* It's an empty file and therefore probably a new file, so fsync()
250 * its parent directory to ensure that its directory entry is
251 * committed to disk. */
252 fsync_parent_dir(name);
253 } else {
254 unsigned long int length;
255 uint8_t sha1[SHA1_DIGEST_SIZE];
256 if (!parse_header(header, &actual_magic, &length, sha1)) {
257 error = ovsdb_error(NULL, "%s: unexpected file format", name);
258 goto error_fclose;
259 } else if (!is_magic_ok(actual_magic, magic)) {
260 error = ovsdb_error(NULL, "%s: cannot identify file type", name);
261 goto error_fclose;
262 }
263 }
264
265 if (fseek(stream, 0, SEEK_SET)) {
266 error = ovsdb_io_error(errno, "%s: seek failed", name);
267 goto error_fclose;
f85f8ebb
BP
268 }
269
fed27759 270 struct ovsdb_log *file = xmalloc(sizeof *file);
421fc8a1
BP
271 file->state = OVSDB_LOG_READ;
272 file->error = NULL;
02f4f231
BP
273 file->name = abs_name;
274 file->display_name = xstrdup(name);
fed27759 275 file->magic = xstrdup(actual_magic);
f85f8ebb
BP
276 file->lockfile = lockfile;
277 file->stream = stream;
43675e26 278 file->prev_offset = 0;
f85f8ebb 279 file->offset = 0;
4cc9d1f0 280 file->base = 0;
f70b61d3 281 file->afsync = NULL;
f85f8ebb
BP
282 *filep = file;
283 return NULL;
284
fed27759
BP
285error_fclose:
286 fclose(stream);
f85f8ebb
BP
287error_unlock:
288 lockfile_unlock(lockfile);
289error:
02f4f231 290 free(abs_name);
f85f8ebb
BP
291 return error;
292}
293
fed27759
BP
/* Tests whether 'needle' exactly equals one of the words in 'haystack', where
 * the words in 'haystack' are separated by "|".  A 'needle' that itself
 * contains "|" never matches. */
static bool
is_magic_ok(const char *needle, const char *haystack)
{
    if (strchr(needle, '|') != NULL) {
        return false;
    }

    const size_t needle_len = strlen(needle);
    for (const char *word = haystack; word != NULL; ) {
        /* A match must cover the whole word, so the character right after
         * the matched prefix has to be a separator or the end of string. */
        if (strncmp(needle, word, needle_len) == 0) {
            char terminator = word[needle_len];
            if (terminator == '|' || terminator == '\0') {
                return true;
            }
        }

        const char *separator = strchr(word, '|');
        word = separator ? separator + 1 : NULL;
    }
    return false;
}
315
f85f8ebb 316void
41709ccc 317ovsdb_log_close(struct ovsdb_log *file)
f85f8ebb
BP
318{
319 if (file) {
421fc8a1 320 ovsdb_error_destroy(file->error);
f70b61d3 321 afsync_destroy(file->afsync);
f85f8ebb 322 free(file->name);
02f4f231 323 free(file->display_name);
19b276cb 324 free(file->magic);
421fc8a1
BP
325 if (file->stream) {
326 fclose(file->stream);
327 }
f85f8ebb 328 lockfile_unlock(file->lockfile);
f85f8ebb
BP
329 free(file);
330 }
331}
332
fed27759
BP
333const char *
334ovsdb_log_get_magic(const struct ovsdb_log *log)
335{
336 return log->magic;
337}
338
339/* Attempts to parse 'header' as a header line for an OVSDB log record (as
340 * described in ovsdb(5)). Stores a pointer to the magic string in '*magicp',
341 * the length in *length, and the parsed sha1 value in sha1[].
342 *
343 * Modifies 'header' and points '*magicp' inside it.
344 *
345 * Returns true if successful, false on failure. */
f85f8ebb 346static bool
fed27759
BP
347parse_header(char *header, const char **magicp,
348 unsigned long int *length, uint8_t sha1[SHA1_DIGEST_SIZE])
f85f8ebb 349{
fed27759
BP
350 /* 'header' must consist of "OVSDB "... */
351 const char lead[] = "OVSDB ";
352 if (strncmp(lead, header, strlen(lead))) {
353 return false;
354 }
f85f8ebb 355
fed27759
BP
356 /* ...followed by a magic string... */
357 char *magic = header + strlen(lead);
358 size_t magic_len = strcspn(magic, " ");
359 if (magic[magic_len] != ' ') {
f85f8ebb
BP
360 return false;
361 }
fed27759
BP
362 magic[magic_len] = '\0';
363 *magicp = magic;
f85f8ebb
BP
364
365 /* ...followed by a length in bytes... */
fed27759
BP
366 char *p;
367 *length = strtoul(magic + magic_len + 1, &p, 10);
f85f8ebb
BP
368 if (!*length || *length == ULONG_MAX || *p != ' ') {
369 return false;
370 }
371 p++;
372
373 /* ...followed by a SHA-1 hash... */
374 if (!sha1_from_hex(sha1, p)) {
375 return false;
376 }
377 p += SHA1_HEX_DIGEST_LEN;
378
379 /* ...and ended by a new-line. */
380 if (*p != '\n') {
381 return false;
382 }
383
384 return true;
385}
386
f85f8ebb 387static struct ovsdb_error *
41709ccc 388parse_body(struct ovsdb_log *file, off_t offset, unsigned long int length,
f85f8ebb
BP
389 uint8_t sha1[SHA1_DIGEST_SIZE], struct json **jsonp)
390{
f85f8ebb
BP
391 struct json_parser *parser;
392 struct sha1_ctx ctx;
393
394 sha1_init(&ctx);
395 parser = json_parser_create(JSPF_TRAILER);
396
f85f8ebb
BP
397 while (length > 0) {
398 char input[BUFSIZ];
399 int chunk;
400
401 chunk = MIN(length, sizeof input);
402 if (fread(input, 1, chunk, file->stream) != chunk) {
403 json_parser_abort(parser);
404 return ovsdb_io_error(ferror(file->stream) ? errno : EOF,
405 "%s: error reading %lu bytes "
02f4f231
BP
406 "starting at offset %lld",
407 file->display_name, length,
408 (long long int) offset);
f85f8ebb
BP
409 }
410 sha1_update(&ctx, input, chunk);
411 json_parser_feed(parser, input, chunk);
412 length -= chunk;
413 }
414
415 sha1_final(&ctx, sha1);
416 *jsonp = json_parser_finish(parser);
417 return NULL;
418}
419
4407aa48
BP
420/* Attempts to read a log record from 'file'.
421 *
422 * If successful, returns NULL and stores in '*jsonp' the JSON object that the
423 * record contains. The caller owns the data and must eventually free it (with
424 * json_destroy()).
425 *
426 * If a read error occurs, returns the error and stores NULL in '*jsonp'.
427 *
428 * If the read reaches end of file, returns NULL and stores NULL in
429 * '*jsonp'. */
f85f8ebb 430struct ovsdb_error *
41709ccc 431ovsdb_log_read(struct ovsdb_log *file, struct json **jsonp)
f85f8ebb 432{
421fc8a1
BP
433 *jsonp = NULL;
434 switch (file->state) {
435 case OVSDB_LOG_READ:
436 break;
437
438 case OVSDB_LOG_READ_ERROR:
439 case OVSDB_LOG_WRITE_ERROR:
440 case OVSDB_LOG_BROKEN:
441 return ovsdb_error_clone(file->error);
442
443 case OVSDB_LOG_WRITE:
444 return NULL;
445 }
446
f85f8ebb
BP
447 uint8_t expected_sha1[SHA1_DIGEST_SIZE];
448 uint8_t actual_sha1[SHA1_DIGEST_SIZE];
449 struct ovsdb_error *error;
f85f8ebb
BP
450 unsigned long data_length;
451 struct json *json;
452 char header[128];
453
421fc8a1 454 json = NULL;
f85f8ebb
BP
455
456 if (!fgets(header, sizeof header, file->stream)) {
457 if (feof(file->stream)) {
421fc8a1 458 return NULL;
f85f8ebb 459 }
02f4f231 460 error = ovsdb_io_error(errno, "%s: read failed", file->display_name);
f85f8ebb
BP
461 goto error;
462 }
fed27759 463 off_t data_offset = file->offset + strlen(header);
f85f8ebb 464
fed27759
BP
465 const char *magic;
466 if (!parse_header(header, &magic, &data_length, expected_sha1)
467 || strcmp(magic, file->magic)) {
f85f8ebb
BP
468 error = ovsdb_syntax_error(NULL, NULL, "%s: parse error at offset "
469 "%lld in header line \"%.*s\"",
02f4f231
BP
470 file->display_name,
471 (long long int) file->offset,
f85f8ebb
BP
472 (int) strcspn(header, "\n"), header);
473 goto error;
474 }
475
f85f8ebb
BP
476 error = parse_body(file, data_offset, data_length, actual_sha1, &json);
477 if (error) {
478 goto error;
479 }
480
481 if (memcmp(expected_sha1, actual_sha1, SHA1_DIGEST_SIZE)) {
482 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
483 "offset %lld have SHA-1 hash "SHA1_FMT" "
484 "but should have hash "SHA1_FMT,
02f4f231 485 file->display_name, data_length,
f85f8ebb
BP
486 (long long int) data_offset,
487 SHA1_ARGS(actual_sha1),
488 SHA1_ARGS(expected_sha1));
489 goto error;
490 }
491
492 if (json->type == JSON_STRING) {
493 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
494 "offset %lld are not valid JSON (%s)",
02f4f231 495 file->display_name, data_length,
f85f8ebb 496 (long long int) data_offset,
fa37affa 497 json->string);
f85f8ebb
BP
498 goto error;
499 }
4407aa48
BP
500 if (json->type != JSON_OBJECT) {
501 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
502 "offset %lld are not a JSON object",
02f4f231 503 file->display_name, data_length,
4407aa48
BP
504 (long long int) data_offset);
505 goto error;
506 }
f85f8ebb 507
43675e26 508 file->prev_offset = file->offset;
f85f8ebb
BP
509 file->offset = data_offset + data_length;
510 *jsonp = json;
e3c17733 511 return NULL;
f85f8ebb
BP
512
513error:
421fc8a1
BP
514 file->state = OVSDB_LOG_READ_ERROR;
515 file->error = ovsdb_error_clone(error);
f85f8ebb
BP
516 json_destroy(json);
517 return error;
518}
519
43675e26
BP
520/* Causes the log record read by the previous call to ovsdb_log_read() to be
521 * effectively discarded. The next call to ovsdb_log_write() will overwrite
522 * that previously read record.
523 *
524 * Calling this function more than once has no additional effect.
525 *
526 * This function is useful when ovsdb_log_read() successfully reads a record
527 * but that record does not make sense at a higher level (e.g. it specifies an
528 * invalid transaction). */
529void
530ovsdb_log_unread(struct ovsdb_log *file)
531{
421fc8a1 532 ovs_assert(file->state == OVSDB_LOG_READ);
43675e26
BP
533 file->offset = file->prev_offset;
534}
535
02f4f231
BP
536static struct ovsdb_error *
537ovsdb_log_truncate(struct ovsdb_log *file)
538{
539 file->state = OVSDB_LOG_WRITE;
540
541 struct ovsdb_error *error = NULL;
542 if (fseeko(file->stream, file->offset, SEEK_SET)) {
543 error = ovsdb_io_error(errno, "%s: cannot seek to offset %lld",
544 file->display_name,
545 (long long int) file->offset);
546 } else if (ftruncate(fileno(file->stream), file->offset)) {
547 error = ovsdb_io_error(errno, "%s: cannot truncate to length %lld",
548 file->display_name,
549 (long long int) file->offset);
550 }
551 return error;
552}
553
c9167341
BP
554/* Composes a log record for 'json' by filling 'header' with a header line and
555 * 'data' with a data line (each ending with a new-line). To write the record
556 * to a file, write 'header' followed by 'data'.
557 *
02f4f231
BP
558 * 'magic' is the magic to use in the header record, e.g. OVSDB_MAGIC.
559 *
c9167341
BP
560 * The caller must initialize 'header' and 'data' to empty strings. */
561void
562ovsdb_log_compose_record(const struct json *json,
19b276cb 563 const char *magic, struct ds *header, struct ds *data)
c9167341
BP
564{
565 ovs_assert(json->type == JSON_OBJECT || json->type == JSON_ARRAY);
566 ovs_assert(!header->length);
567 ovs_assert(!data->length);
568
569 /* Compose content. */
570 json_to_ds(json, 0, data);
571 ds_put_char(data, '\n');
572
573 /* Compose header. */
574 uint8_t sha1[SHA1_DIGEST_SIZE];
575 sha1_bytes(data->string, data->length, sha1);
fed27759 576 ds_put_format(header, "OVSDB %s %"PRIuSIZE" "SHA1_FMT"\n",
c9167341
BP
577 magic, data->length, SHA1_ARGS(sha1));
578}
579
226600d9 580/* Writes log record 'json' to 'file'. Returns NULL if successful or an error
4cc9d1f0
BP
581 * (which the caller must eventually destroy) on failure.
582 *
583 * If the log contains some records that have not yet been read, then calling
584 * this function truncates them.
585 *
586 * Log writes are atomic. A client may use ovsdb_log_commit() to ensure that
587 * they are durable.
588 */
f85f8ebb 589struct ovsdb_error *
226600d9 590ovsdb_log_write(struct ovsdb_log *file, const struct json *json)
f85f8ebb 591{
421fc8a1
BP
592 switch (file->state) {
593 case OVSDB_LOG_WRITE:
594 break;
595
596 case OVSDB_LOG_READ:
597 case OVSDB_LOG_READ_ERROR:
598 case OVSDB_LOG_WRITE_ERROR:
421fc8a1 599 ovsdb_error_destroy(file->error);
02f4f231
BP
600 file->error = ovsdb_log_truncate(file);
601 if (file->error) {
602 file->state = OVSDB_LOG_WRITE_ERROR;
603 return ovsdb_error_clone(file->error);
f85f8ebb 604 }
02f4f231 605 file->state = OVSDB_LOG_WRITE;
421fc8a1
BP
606 break;
607
608 case OVSDB_LOG_BROKEN:
609 return ovsdb_error_clone(file->error);
f85f8ebb
BP
610 }
611
612 if (json->type != JSON_OBJECT && json->type != JSON_ARRAY) {
02f4f231 613 return OVSDB_BUG("bad JSON type");
f85f8ebb
BP
614 }
615
c9167341
BP
616 struct ds header = DS_EMPTY_INITIALIZER;
617 struct ds data = DS_EMPTY_INITIALIZER;
19b276cb 618 ovsdb_log_compose_record(json, file->magic, &header, &data);
c9167341 619 size_t total_length = header.length + data.length;
f85f8ebb
BP
620
621 /* Write. */
c9167341
BP
622 bool ok = (fwrite(header.string, header.length, 1, file->stream) == 1
623 && fwrite(data.string, data.length, 1, file->stream) == 1
624 && fflush(file->stream) == 0);
625 ds_destroy(&header);
626 ds_destroy(&data);
627 if (!ok) {
02f4f231 628 int error = errno;
f85f8ebb 629
c7007aa7
BP
630 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
631 VLOG_WARN_RL(&rl, "%s: write failed (%s)",
02f4f231 632 file->name, ovs_strerror(error));
c7007aa7 633
f85f8ebb
BP
634 /* Remove any partially written data, ignoring errors since there is
635 * nothing further we can do. */
18b9283b 636 ignore(ftruncate(fileno(file->stream), file->offset));
f85f8ebb 637
02f4f231
BP
638 file->error = ovsdb_io_error(error, "%s: write failed",
639 file->display_name);
640 file->state = OVSDB_LOG_WRITE_ERROR;
641 return ovsdb_error_clone(file->error);
f85f8ebb
BP
642 }
643
c9167341 644 file->offset += total_length;
e3c17733 645 return NULL;
f85f8ebb
BP
646}
647
1b1d2e6d
BP
648struct ovsdb_error * OVS_WARN_UNUSED_RESULT
649ovsdb_log_write_and_free(struct ovsdb_log *log, struct json *json)
650{
651 struct ovsdb_error *error = ovsdb_log_write(log, json);
652 json_destroy(json);
653 return error;
654}
655
f70b61d3
BP
656/* Attempts to commit 'file' to disk. Waits for the commit to succeed or fail.
657 * Returns NULL if successful, otherwise the error that occurred. */
f85f8ebb 658struct ovsdb_error *
f70b61d3 659ovsdb_log_commit_block(struct ovsdb_log *file)
f85f8ebb 660{
3738d929
AI
661#if (_POSIX_C_SOURCE >= 199309L || _XOPEN_SOURCE >= 500)
662 /* we do not check metadata - mtime, atime, anywhere, so we
663 * do not need to update it every time we sync the log.
664 * if the system supports it, the log update should be
665 * data only
666 */
667 if (file->stream && fdatasync(fileno(file->stream))) {
668#else
421fc8a1 669 if (file->stream && fsync(fileno(file->stream))) {
3738d929 670#endif
02f4f231 671 return ovsdb_io_error(errno, "%s: fsync failed", file->display_name);
f85f8ebb 672 }
e3c17733 673 return NULL;
f85f8ebb 674}
41709ccc 675
4cc9d1f0
BP
676/* Sets the current position in 'log' as the "base", that is, the initial size
677 * of the log that ovsdb_log_grew_lots() uses to determine whether the log has
678 * grown enough to make compacting worthwhile. */
679void
680ovsdb_log_mark_base(struct ovsdb_log *log)
ada496b5 681{
4cc9d1f0
BP
682 log->base = log->offset;
683}
684
685/* Returns true if 'log' has grown enough above the base that it's worthwhile
686 * to compact it, false otherwise. */
687bool
688ovsdb_log_grew_lots(const struct ovsdb_log *log)
689{
1cfdc175 690 return log->offset > 10 * 1024 * 1024 && log->offset / 2 > log->base;
ada496b5 691}
421fc8a1
BP
692\f
693/* Attempts to atomically replace the contents of 'log', on disk, by the 'n'
694 * entries in 'entries'. If successful, returns NULL, otherwise returns an
695 * error (which the caller must eventually free).
696 *
697 * If successful, 'log' will be in write mode at the end of the log. */
698struct ovsdb_error * OVS_WARN_UNUSED_RESULT
699ovsdb_log_replace(struct ovsdb_log *log, struct json **entries, size_t n)
700{
701 struct ovsdb_error *error;
702 struct ovsdb_log *new;
703
704 error = ovsdb_log_replace_start(log, &new);
705 if (error) {
706 return error;
707 }
708
709 for (size_t i = 0; i < n; i++) {
710 error = ovsdb_log_write(new, entries[i]);
711 if (error) {
712 ovsdb_log_replace_abort(new);
713 return error;
714 }
715 }
4cc9d1f0 716 ovsdb_log_mark_base(new);
421fc8a1
BP
717
718 return ovsdb_log_replace_commit(log, new);
719}
720
721struct ovsdb_error * OVS_WARN_UNUSED_RESULT
722ovsdb_log_replace_start(struct ovsdb_log *old,
723 struct ovsdb_log **newp)
724{
725 /* If old->name is a symlink, then we want the new file to be in the same
726 * directory as the symlink's referent. */
727 char *deref_name = follow_symlinks(old->name);
728 char *tmp_name = xasprintf("%s.tmp", deref_name);
729 free(deref_name);
730
731 struct ovsdb_error *error;
732
733 ovs_assert(old->lockfile);
734
735 /* Remove temporary file. (It might not exist.) */
736 if (unlink(tmp_name) < 0 && errno != ENOENT) {
737 error = ovsdb_io_error(errno, "failed to remove %s", tmp_name);
738 free(tmp_name);
739 *newp = NULL;
740 return error;
741 }
742
743 /* Create temporary file. */
744 error = ovsdb_log_open(tmp_name, old->magic, OVSDB_LOG_CREATE_EXCL,
745 false, newp);
746 free(tmp_name);
747 return error;
748}
749
750/* Rename 'old' to 'new', replacing 'new' if it exists. Returns NULL if
751 * successful, otherwise an ovsdb_error that the caller must destroy. */
752static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
753ovsdb_rename(const char *old, const char *new)
754{
755#ifdef _WIN32
756 /* Avoid rename() because it fails if the destination exists. */
757 int error = (MoveFileEx(old, new, MOVEFILE_REPLACE_EXISTING
758 | MOVEFILE_WRITE_THROUGH | MOVEFILE_COPY_ALLOWED)
759 ? 0 : EACCES);
760#else
761 int error = rename(old, new) ? errno : 0;
762#endif
763
764 return (error
765 ? ovsdb_io_error(error, "failed to rename \"%s\" to \"%s\"",
766 old, new)
767 : NULL);
768}
769
770struct ovsdb_error * OVS_WARN_UNUSED_RESULT
771ovsdb_log_replace_commit(struct ovsdb_log *old, struct ovsdb_log *new)
772{
f70b61d3 773 struct ovsdb_error *error = ovsdb_log_commit_block(new);
421fc8a1
BP
774 if (error) {
775 ovsdb_log_replace_abort(new);
776 return error;
777 }
778
779 /* Replace original file by the temporary file.
780 *
781 * We support two strategies:
782 *
783 * - The preferred strategy is to rename the temporary file over the
784 * original one in-place, then close the original one. This works on
785 * Unix-like systems. It does not work on Windows, which does not
786 * allow open files to be renamed. The approach has the advantage
787 * that, at any point, we can drop back to something that already
788 * works.
789 *
790 * - Alternatively, we can close both files, rename, then open the new
791 * file (which now has the original name). This works on all
792 * systems, but if reopening the file fails then 'old' is broken.
793 *
794 * We make the strategy a variable instead of an #ifdef to make it easier
795 * to test both strategies on Unix-like systems, and to make the code
796 * easier to read. */
797 if (!rename_open_files) {
798 fclose(old->stream);
799 old->stream = NULL;
800
801 fclose(new->stream);
802 new->stream = NULL;
803 }
804
805 /* Rename 'old' to 'new'. We dereference the old name because, if it is a
806 * symlink, we want to replace the referent of the symlink instead of the
807 * symlink itself. */
808 char *deref_name = follow_symlinks(old->name);
809 error = ovsdb_rename(new->name, deref_name);
810 free(deref_name);
811
812 if (error) {
813 ovsdb_log_replace_abort(new);
814 return error;
815 }
816 if (rename_open_files) {
817 fsync_parent_dir(old->name);
818 fclose(old->stream);
819 old->stream = new->stream;
820 new->stream = NULL;
821 } else {
822 old->stream = fopen(old->name, "r+b");
823 if (!old->stream) {
824 old->error = ovsdb_io_error(errno, "%s: could not reopen log",
825 old->name);
826 old->state = OVSDB_LOG_BROKEN;
827 return ovsdb_error_clone(old->error);
828 }
829
830 if (fseek(old->stream, new->offset, SEEK_SET)) {
831 old->error = ovsdb_io_error(errno, "%s: seek failed", old->name);
832 old->state = OVSDB_LOG_BROKEN;
833 return ovsdb_error_clone(old->error);
834 }
835 }
836
837 /* Replace 'old' by 'new' in memory.
838 *
839 * 'old' transitions to OVSDB_LOG_WRITE (it was probably in that mode
840 * anyway). */
841 old->state = OVSDB_LOG_WRITE;
842 ovsdb_error_destroy(old->error);
843 old->error = NULL;
844 /* prev_offset only matters for OVSDB_LOG_READ. */
f70b61d3
BP
845 if (old->afsync) {
846 uint64_t ticket = afsync_destroy(old->afsync);
847 old->afsync = afsync_create(fileno(old->stream), ticket + 1);
848 }
421fc8a1
BP
849 old->offset = new->offset;
850 /* Keep old->name. */
851 free(old->magic);
852 old->magic = new->magic;
853 new->magic = NULL;
854 /* Keep old->lockfile. */
4cc9d1f0
BP
855 old->base = new->base;
856
421fc8a1
BP
857 /* Free 'new'. */
858 ovsdb_log_close(new);
859
860 return NULL;
861}
862
863void
864ovsdb_log_replace_abort(struct ovsdb_log *new)
865{
866 if (new) {
867 /* Unlink the new file, but only after we close it (because Windows
868 * does not allow removing an open file). */
869 char *name = xstrdup(new->name);
870 ovsdb_log_close(new);
871 unlink(name);
872 free(name);
873 }
874}
875
876void
877ovsdb_log_disable_renaming_open_files(void)
878{
879 rename_open_files = false;
880}
f70b61d3
BP
881\f
882struct afsync {
883 pthread_t thread;
884 atomic_uint64_t cur, next;
885 struct seq *request, *complete;
886 int fd;
887};
888
889static void *
890afsync_thread(void *afsync_)
891{
892 struct afsync *afsync = afsync_;
893 uint64_t cur = 0;
894 for (;;) {
895 ovsrcu_quiesce_start();
896
897 uint64_t request_seq = seq_read(afsync->request);
898
899 uint64_t next;
900 atomic_read_explicit(&afsync->next, &next, memory_order_acquire);
901 if (next == UINT64_MAX) {
902 break;
903 }
904
905 if (cur != next && afsync->fd != -1) {
906 int error = fsync(afsync->fd) ? errno : 0;
907 if (!error) {
908 cur = next;
909 atomic_store_explicit(&afsync->cur, cur, memory_order_release);
910 seq_change(afsync->complete);
911 } else {
912 VLOG_WARN("fsync failed (%s)", ovs_strerror(error));
913 }
914 }
915
916 seq_wait(afsync->request, request_seq);
917 poll_block();
918 }
919 return NULL;
920}
921
922static struct afsync *
923afsync_create(int fd, uint64_t initial_ticket)
924{
925 struct afsync *afsync = xzalloc(sizeof *afsync);
926 atomic_init(&afsync->cur, initial_ticket);
927 atomic_init(&afsync->next, initial_ticket);
928 afsync->request = seq_create();
929 afsync->complete = seq_create();
930 afsync->thread = ovs_thread_create("log_fsync", afsync_thread, afsync);
931 afsync->fd = fd;
932 return afsync;
933}
934
935static uint64_t
936afsync_destroy(struct afsync *afsync)
937{
938 if (!afsync) {
939 return 0;
940 }
941
942 uint64_t next;
943 atomic_read(&afsync->next, &next);
944 atomic_store(&afsync->next, UINT64_MAX);
945 seq_change(afsync->request);
946 xpthread_join(afsync->thread, NULL);
947
948 seq_destroy(afsync->request);
949 seq_destroy(afsync->complete);
950
951 free(afsync);
952
953 return next;
954}
955
956static struct afsync *
957ovsdb_log_get_afsync(struct ovsdb_log *log)
958{
959 if (!log->afsync) {
960 log->afsync = afsync_create(log->stream ? fileno(log->stream) : -1, 0);
961 }
962 return log->afsync;
963}
964
965/* Starts committing 'log' to disk. Returns a ticket that can be passed to
966 * ovsdb_log_commit_wait() or compared against the return value of
967 * ovsdb_log_commit_progress() later. */
968uint64_t
969ovsdb_log_commit_start(struct ovsdb_log *log)
970{
971 struct afsync *afsync = ovsdb_log_get_afsync(log);
972
973 uint64_t orig;
974 atomic_add_explicit(&afsync->next, 1, &orig, memory_order_acq_rel);
975
976 seq_change(afsync->request);
977
978 return orig + 1;
979}
980
981/* Returns a ticket value that represents the current progress of commits to
982 * 'log'. Suppose that some call to ovsdb_log_commit_start() returns X and any
983 * call ovsdb_log_commit_progress() returns Y, for the same 'log'. Then commit
984 * X is complete if and only if X <= Y. */
985uint64_t
986ovsdb_log_commit_progress(struct ovsdb_log *log)
987{
988 struct afsync *afsync = ovsdb_log_get_afsync(log);
989 uint64_t cur;
990 atomic_read_explicit(&afsync->cur, &cur, memory_order_acquire);
991 return cur;
992}
993
994/* Causes poll_block() to wake up if and when ovsdb_log_commit_progress(log)
995 * would return at least 'goal'. */
996void
997ovsdb_log_commit_wait(struct ovsdb_log *log, uint64_t goal)
998{
999 struct afsync *afsync = ovsdb_log_get_afsync(log);
1000 uint64_t complete = seq_read(afsync->complete);
1001 uint64_t cur = ovsdb_log_commit_progress(log);
1002 if (cur < goal) {
1003 seq_wait(afsync->complete, complete);
1004 } else {
1005 poll_immediate_wake();
1006 }
1007}