]> git.proxmox.com Git - mirror_ovs.git/blame - ovsdb/log.c
stream: Allow timeout configuration for open_block.
[mirror_ovs.git] / ovsdb / log.c
CommitLineData
19b276cb 1/* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc.
f85f8ebb
BP
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include <config.h>
17
41709ccc 18#include "log.h"
f85f8ebb 19
f85f8ebb
BP
20#include <errno.h>
21#include <fcntl.h>
22#include <stdlib.h>
23#include <string.h>
3762274e 24#include <sys/stat.h>
f85f8ebb
BP
25#include <unistd.h>
26
f70b61d3 27#include "lockfile.h"
c9167341 28#include "openvswitch/dynamic-string.h"
ee89ea7b 29#include "openvswitch/json.h"
c7007aa7 30#include "openvswitch/vlog.h"
f70b61d3
BP
31#include "ovs-atomic.h"
32#include "ovs-rcu.h"
33#include "ovs-thread.h"
f85f8ebb 34#include "ovsdb-error.h"
f70b61d3
BP
35#include "ovsdb.h"
36#include "openvswitch/poll-loop.h"
37#include "seq.h"
f85f8ebb 38#include "sha1.h"
8e71cf88 39#include "socket-util.h"
41709ccc 40#include "transaction.h"
f85f8ebb 41#include "util.h"
5136ce49 42
c7007aa7
BP
43VLOG_DEFINE_THIS_MODULE(ovsdb_log);
44
421fc8a1
BP
/* The states of a log's state machine:
 *
 *   - OVSDB_LOG_READ: The initial state of a newly opened log, and the only
 *     state in which log records may be read.  Reaching end of file does not
 *     change the state.  A read error moves to OVSDB_LOG_READ_ERROR.
 *
 *   - OVSDB_LOG_READ_ERROR: No further read can succeed; each one reports
 *     the same error as before.  A log write moves to OVSDB_LOG_WRITE or
 *     OVSDB_LOG_WRITE_ERROR.
 *
 *   - OVSDB_LOG_WRITE: The state after a call to ovsdb_log_write() in which
 *     all went well.  It can be entered from any state other than
 *     OVSDB_LOG_BROKEN.  A write error moves to OVSDB_LOG_WRITE_ERROR.
 *
 *   - OVSDB_LOG_WRITE_ERROR: The state after a write error.  Further writes
 *     retry and might move back to OVSDB_LOG_WRITE.
 *
 *   - OVSDB_LOG_BROKEN: The state after ovsdb_log_replace() or
 *     ovsdb_log_replace_commit() fails so badly that no further read or
 *     write can succeed.  This state is terminal.
 */
enum ovsdb_log_state {
    OVSDB_LOG_READ,             /* Ready to read. */
    OVSDB_LOG_READ_ERROR,       /* Read failed, see 'error' for details. */
    OVSDB_LOG_WRITE,            /* Ready to write. */
    OVSDB_LOG_WRITE_ERROR,      /* Write failed, see 'error' for details. */
    OVSDB_LOG_BROKEN,           /* Disk on fire, see 'error' for details. */
};
73
41709ccc 74struct ovsdb_log {
421fc8a1
BP
75 enum ovsdb_log_state state;
76 struct ovsdb_error *error;
77
43675e26 78 off_t prev_offset;
f85f8ebb 79 off_t offset;
02f4f231
BP
80 char *name; /* Absolute name of file. */
81 char *display_name; /* For use in log messages, etc. */
19b276cb 82 char *magic;
f85f8ebb
BP
83 struct lockfile *lockfile;
84 FILE *stream;
4cc9d1f0 85 off_t base;
f70b61d3 86 struct afsync *afsync;
f85f8ebb
BP
87};
88
421fc8a1
BP
/* True if the OS lets us rename a file while it is open, false otherwise.
 *
 * (This is a variable rather than a compile-time constant so that both
 * strategies can be tested on Unix-like systems.) */
static bool rename_open_files =
#ifdef _WIN32
    false;
#else
    true;
#endif
98
fed27759
BP
99static bool parse_header(char *header, const char **magicp,
100 unsigned long int *length,
101 uint8_t sha1[SHA1_DIGEST_SIZE]);
102static bool is_magic_ok(const char *needle, const char *haystack);
103
f70b61d3
BP
104static struct afsync *afsync_create(int fd, uint64_t initial_ticket);
105static uint64_t afsync_destroy(struct afsync *);
106
7446f148
BP
107/* Attempts to open 'name' with the specified 'open_mode'. On success, stores
108 * the new log into '*filep' and returns NULL; otherwise returns NULL and
109 * stores NULL into '*filep'.
110 *
19b276cb
BP
111 * 'magic' is a short text string put at the beginning of every record and used
112 * to distinguish one kind of log file from another. For a conventional OVSDB
fed27759
BP
113 * log file, use the OVSDB_MAGIC macro. To accept more than one magic string,
114 * separate them with "|", e.g. "MAGIC 1|MAGIC 2".
19b276cb 115 *
7446f148
BP
116 * Whether the file will be locked using lockfile_lock() depends on 'locking':
117 * use true to lock it, false not to lock it, or -1 to lock it only if
118 * 'open_mode' is a mode that allows writing.
4cc9d1f0
BP
119 *
120 * A log consists of a series of records. After opening or creating a log with
121 * this function, the client may use ovsdb_log_read() to read any existing
122 * records, one by one. The client may also use ovsdb_log_write() to write new
123 * records (if some records have not yet been read at this point, then the
124 * first write truncates them).
7446f148 125 */
f85f8ebb 126struct ovsdb_error *
19b276cb
BP
127ovsdb_log_open(const char *name, const char *magic,
128 enum ovsdb_log_open_mode open_mode,
7446f148 129 int locking, struct ovsdb_log **filep)
f85f8ebb
BP
130{
131 struct lockfile *lockfile;
132 struct ovsdb_error *error;
f85f8ebb
BP
133 struct stat s;
134 FILE *stream;
7446f148 135 int flags;
f85f8ebb
BP
136 int fd;
137
fed27759
BP
138 /* If we can create a new file, we need to know what kind of magic to
139 * use, so there must be only one kind. */
140 if (open_mode == OVSDB_LOG_CREATE_EXCL || open_mode == OVSDB_LOG_CREATE) {
141 ovs_assert(!strchr(magic, '|'));
142 }
02f4f231 143
f85f8ebb
BP
144 *filep = NULL;
145
02f4f231
BP
146 /* Get the absolute name of the file because we might need to access it by
147 * name again later after the process has changed directory (e.g. because
148 * daemonize() chdirs to "/").
149 *
150 * We save the user-provided name of the file for use in log messages, to
151 * reduce user confusion. */
152 char *abs_name = abs_file_name(NULL, name);
153 if (!abs_name) {
154 error = ovsdb_io_error(0, "could not determine current "
155 "working directory");
156 goto error;
157 }
158
cb22974d 159 ovs_assert(locking == -1 || locking == false || locking == true);
7446f148
BP
160 if (locking < 0) {
161 locking = open_mode != OVSDB_LOG_READ_ONLY;
162 }
163 if (locking) {
4770e795 164 int retval = lockfile_lock(name, &lockfile);
f85f8ebb
BP
165 if (retval) {
166 error = ovsdb_io_error(retval, "%s: failed to lock lockfile",
167 name);
168 goto error;
169 }
170 } else {
171 lockfile = NULL;
172 }
173
1e0b7e94
BP
174 switch (open_mode) {
175 case OVSDB_LOG_READ_ONLY:
7446f148 176 flags = O_RDONLY;
1e0b7e94
BP
177 break;
178
179 case OVSDB_LOG_READ_WRITE:
7446f148 180 flags = O_RDWR;
1e0b7e94
BP
181 break;
182
183 case OVSDB_LOG_CREATE_EXCL:
7470e8e6 184#ifndef _WIN32
01ca539f
BP
185 if (stat(name, &s) == -1 && errno == ENOENT
186 && lstat(name, &s) == 0 && S_ISLNK(s.st_mode)) {
187 /* 'name' is a dangling symlink. We want to create the file that
188 * the symlink points to, but POSIX says that open() with O_EXCL
189 * must fail with EEXIST if the named file is a symlink. So, we
190 * have to leave off O_EXCL and accept the race. */
191 flags = O_RDWR | O_CREAT;
192 } else {
193 flags = O_RDWR | O_CREAT | O_EXCL;
194 }
7470e8e6
GS
195#else
196 flags = O_RDWR | O_CREAT | O_EXCL;
197#endif
1e0b7e94
BP
198 break;
199
200 case OVSDB_LOG_CREATE:
201 flags = O_RDWR | O_CREAT;
202 break;
203
204 default:
428b2edd 205 OVS_NOT_REACHED();
7446f148 206 }
3e94784c
GS
207#ifdef _WIN32
208 flags = flags | O_BINARY;
209#endif
71e4030f
BP
210 /* Special case for /dev/stdin to make it work even if the operating system
211 * doesn't support it under that name. */
212 if (!strcmp(name, "/dev/stdin") && open_mode == OVSDB_LOG_READ_ONLY) {
213 fd = dup(STDIN_FILENO);
214 } else {
215 fd = open(name, flags, 0666);
216 }
f85f8ebb 217 if (fd < 0) {
1e0b7e94
BP
218 const char *op = (open_mode == OVSDB_LOG_CREATE_EXCL ? "create"
219 : open_mode == OVSDB_LOG_CREATE ? "create or open"
220 : "open");
a6be657b 221 error = ovsdb_io_error(errno, "%s: %s failed", name, op);
f85f8ebb
BP
222 goto error_unlock;
223 }
224
7446f148 225 stream = fdopen(fd, open_mode == OVSDB_LOG_READ_ONLY ? "rb" : "w+b");
f85f8ebb
BP
226 if (!stream) {
227 error = ovsdb_io_error(errno, "%s: fdopen failed", name);
fed27759
BP
228 close(fd);
229 goto error_unlock;
230 }
231
232 /* Read the magic from the first log record. */
233 char header[128];
234 const char *actual_magic;
235 if (!fgets(header, sizeof header, stream)) {
236 if (ferror(stream)) {
237 error = ovsdb_io_error(errno, "%s: read error", name);
238 goto error_fclose;
239 }
240
241 /* We need to be able to report what kind of file this is but we can't
242 * if it's empty and we accept more than one. */
243 if (strchr(magic, '|')) {
244 error = ovsdb_error(NULL, "%s: cannot identify file type", name);
245 goto error_fclose;
246 }
247 actual_magic = magic;
248
249 /* It's an empty file and therefore probably a new file, so fsync()
250 * its parent directory to ensure that its directory entry is
251 * committed to disk. */
252 fsync_parent_dir(name);
253 } else {
254 unsigned long int length;
255 uint8_t sha1[SHA1_DIGEST_SIZE];
256 if (!parse_header(header, &actual_magic, &length, sha1)) {
257 error = ovsdb_error(NULL, "%s: unexpected file format", name);
258 goto error_fclose;
259 } else if (!is_magic_ok(actual_magic, magic)) {
260 error = ovsdb_error(NULL, "%s: cannot identify file type", name);
261 goto error_fclose;
262 }
263 }
264
265 if (fseek(stream, 0, SEEK_SET)) {
266 error = ovsdb_io_error(errno, "%s: seek failed", name);
267 goto error_fclose;
f85f8ebb
BP
268 }
269
fed27759 270 struct ovsdb_log *file = xmalloc(sizeof *file);
421fc8a1
BP
271 file->state = OVSDB_LOG_READ;
272 file->error = NULL;
02f4f231
BP
273 file->name = abs_name;
274 file->display_name = xstrdup(name);
fed27759 275 file->magic = xstrdup(actual_magic);
f85f8ebb
BP
276 file->lockfile = lockfile;
277 file->stream = stream;
43675e26 278 file->prev_offset = 0;
f85f8ebb 279 file->offset = 0;
4cc9d1f0 280 file->base = 0;
f70b61d3 281 file->afsync = NULL;
f85f8ebb
BP
282 *filep = file;
283 return NULL;
284
fed27759
BP
285error_fclose:
286 fclose(stream);
f85f8ebb
BP
287error_unlock:
288 lockfile_unlock(lockfile);
289error:
02f4f231 290 free(abs_name);
f85f8ebb
BP
291 return error;
292}
293
fed27759
BP
/* Returns true if 'needle' is exactly one of the words in 'haystack', whose
 * words are delimited by "|".  Returns false otherwise, including whenever
 * 'needle' itself contains "|". */
static bool
is_magic_ok(const char *needle, const char *haystack)
{
    /* 'needle' must be a single word. */
    if (strchr(needle, '|')) {
        return false;
    }

    const size_t needle_len = strlen(needle);
    for (const char *word = haystack; word; ) {
        /* A word matches if it starts with 'needle' and ends right after it,
         * either at a delimiter or at the end of the string. */
        if (!strncmp(needle, word, needle_len)
            && (word[needle_len] == '|' || word[needle_len] == '\0')) {
            return true;
        }

        /* Advance to the word after the next delimiter, if there is one. */
        const char *delimiter = strchr(word, '|');
        word = delimiter ? delimiter + 1 : NULL;
    }
    return false;
}
315
f85f8ebb 316void
41709ccc 317ovsdb_log_close(struct ovsdb_log *file)
f85f8ebb
BP
318{
319 if (file) {
421fc8a1 320 ovsdb_error_destroy(file->error);
f70b61d3 321 afsync_destroy(file->afsync);
f85f8ebb 322 free(file->name);
02f4f231 323 free(file->display_name);
19b276cb 324 free(file->magic);
421fc8a1
BP
325 if (file->stream) {
326 fclose(file->stream);
327 }
f85f8ebb 328 lockfile_unlock(file->lockfile);
f85f8ebb
BP
329 free(file);
330 }
331}
332
fed27759
BP
333const char *
334ovsdb_log_get_magic(const struct ovsdb_log *log)
335{
336 return log->magic;
337}
338
339/* Attempts to parse 'header' as a header line for an OVSDB log record (as
340 * described in ovsdb(5)). Stores a pointer to the magic string in '*magicp',
341 * the length in *length, and the parsed sha1 value in sha1[].
342 *
343 * Modifies 'header' and points '*magicp' inside it.
344 *
345 * Returns true if successful, false on failure. */
f85f8ebb 346static bool
fed27759
BP
347parse_header(char *header, const char **magicp,
348 unsigned long int *length, uint8_t sha1[SHA1_DIGEST_SIZE])
f85f8ebb 349{
fed27759
BP
350 /* 'header' must consist of "OVSDB "... */
351 const char lead[] = "OVSDB ";
352 if (strncmp(lead, header, strlen(lead))) {
353 return false;
354 }
f85f8ebb 355
fed27759
BP
356 /* ...followed by a magic string... */
357 char *magic = header + strlen(lead);
358 size_t magic_len = strcspn(magic, " ");
359 if (magic[magic_len] != ' ') {
f85f8ebb
BP
360 return false;
361 }
fed27759
BP
362 magic[magic_len] = '\0';
363 *magicp = magic;
f85f8ebb
BP
364
365 /* ...followed by a length in bytes... */
fed27759
BP
366 char *p;
367 *length = strtoul(magic + magic_len + 1, &p, 10);
f85f8ebb
BP
368 if (!*length || *length == ULONG_MAX || *p != ' ') {
369 return false;
370 }
371 p++;
372
373 /* ...followed by a SHA-1 hash... */
374 if (!sha1_from_hex(sha1, p)) {
375 return false;
376 }
377 p += SHA1_HEX_DIGEST_LEN;
378
379 /* ...and ended by a new-line. */
380 if (*p != '\n') {
381 return false;
382 }
383
384 return true;
385}
386
f85f8ebb 387static struct ovsdb_error *
41709ccc 388parse_body(struct ovsdb_log *file, off_t offset, unsigned long int length,
f85f8ebb
BP
389 uint8_t sha1[SHA1_DIGEST_SIZE], struct json **jsonp)
390{
f85f8ebb
BP
391 struct json_parser *parser;
392 struct sha1_ctx ctx;
393
394 sha1_init(&ctx);
395 parser = json_parser_create(JSPF_TRAILER);
396
f85f8ebb
BP
397 while (length > 0) {
398 char input[BUFSIZ];
399 int chunk;
400
401 chunk = MIN(length, sizeof input);
402 if (fread(input, 1, chunk, file->stream) != chunk) {
403 json_parser_abort(parser);
404 return ovsdb_io_error(ferror(file->stream) ? errno : EOF,
405 "%s: error reading %lu bytes "
02f4f231
BP
406 "starting at offset %lld",
407 file->display_name, length,
408 (long long int) offset);
f85f8ebb
BP
409 }
410 sha1_update(&ctx, input, chunk);
411 json_parser_feed(parser, input, chunk);
412 length -= chunk;
413 }
414
415 sha1_final(&ctx, sha1);
416 *jsonp = json_parser_finish(parser);
417 return NULL;
418}
419
4407aa48
BP
420/* Attempts to read a log record from 'file'.
421 *
422 * If successful, returns NULL and stores in '*jsonp' the JSON object that the
423 * record contains. The caller owns the data and must eventually free it (with
424 * json_destroy()).
425 *
426 * If a read error occurs, returns the error and stores NULL in '*jsonp'.
427 *
428 * If the read reaches end of file, returns NULL and stores NULL in
429 * '*jsonp'. */
f85f8ebb 430struct ovsdb_error *
41709ccc 431ovsdb_log_read(struct ovsdb_log *file, struct json **jsonp)
f85f8ebb 432{
421fc8a1
BP
433 *jsonp = NULL;
434 switch (file->state) {
435 case OVSDB_LOG_READ:
436 break;
437
438 case OVSDB_LOG_READ_ERROR:
439 case OVSDB_LOG_WRITE_ERROR:
440 case OVSDB_LOG_BROKEN:
441 return ovsdb_error_clone(file->error);
442
443 case OVSDB_LOG_WRITE:
444 return NULL;
445 }
446
f85f8ebb
BP
447 uint8_t expected_sha1[SHA1_DIGEST_SIZE];
448 uint8_t actual_sha1[SHA1_DIGEST_SIZE];
449 struct ovsdb_error *error;
f85f8ebb
BP
450 unsigned long data_length;
451 struct json *json;
452 char header[128];
453
421fc8a1 454 json = NULL;
f85f8ebb
BP
455
456 if (!fgets(header, sizeof header, file->stream)) {
457 if (feof(file->stream)) {
421fc8a1 458 return NULL;
f85f8ebb 459 }
02f4f231 460 error = ovsdb_io_error(errno, "%s: read failed", file->display_name);
f85f8ebb
BP
461 goto error;
462 }
fed27759 463 off_t data_offset = file->offset + strlen(header);
f85f8ebb 464
fed27759
BP
465 const char *magic;
466 if (!parse_header(header, &magic, &data_length, expected_sha1)
467 || strcmp(magic, file->magic)) {
f85f8ebb
BP
468 error = ovsdb_syntax_error(NULL, NULL, "%s: parse error at offset "
469 "%lld in header line \"%.*s\"",
02f4f231
BP
470 file->display_name,
471 (long long int) file->offset,
f85f8ebb
BP
472 (int) strcspn(header, "\n"), header);
473 goto error;
474 }
475
f85f8ebb
BP
476 error = parse_body(file, data_offset, data_length, actual_sha1, &json);
477 if (error) {
478 goto error;
479 }
480
481 if (memcmp(expected_sha1, actual_sha1, SHA1_DIGEST_SIZE)) {
482 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
483 "offset %lld have SHA-1 hash "SHA1_FMT" "
484 "but should have hash "SHA1_FMT,
02f4f231 485 file->display_name, data_length,
f85f8ebb
BP
486 (long long int) data_offset,
487 SHA1_ARGS(actual_sha1),
488 SHA1_ARGS(expected_sha1));
489 goto error;
490 }
491
492 if (json->type == JSON_STRING) {
493 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
494 "offset %lld are not valid JSON (%s)",
02f4f231 495 file->display_name, data_length,
f85f8ebb 496 (long long int) data_offset,
fa37affa 497 json->string);
f85f8ebb
BP
498 goto error;
499 }
4407aa48
BP
500 if (json->type != JSON_OBJECT) {
501 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
502 "offset %lld are not a JSON object",
02f4f231 503 file->display_name, data_length,
4407aa48
BP
504 (long long int) data_offset);
505 goto error;
506 }
f85f8ebb 507
43675e26 508 file->prev_offset = file->offset;
f85f8ebb
BP
509 file->offset = data_offset + data_length;
510 *jsonp = json;
e3c17733 511 return NULL;
f85f8ebb
BP
512
513error:
421fc8a1
BP
514 file->state = OVSDB_LOG_READ_ERROR;
515 file->error = ovsdb_error_clone(error);
f85f8ebb
BP
516 json_destroy(json);
517 return error;
518}
519
43675e26
BP
520/* Causes the log record read by the previous call to ovsdb_log_read() to be
521 * effectively discarded. The next call to ovsdb_log_write() will overwrite
522 * that previously read record.
523 *
524 * Calling this function more than once has no additional effect.
525 *
526 * This function is useful when ovsdb_log_read() successfully reads a record
527 * but that record does not make sense at a higher level (e.g. it specifies an
528 * invalid transaction). */
529void
530ovsdb_log_unread(struct ovsdb_log *file)
531{
421fc8a1 532 ovs_assert(file->state == OVSDB_LOG_READ);
43675e26
BP
533 file->offset = file->prev_offset;
534}
535
02f4f231
BP
536static struct ovsdb_error *
537ovsdb_log_truncate(struct ovsdb_log *file)
538{
539 file->state = OVSDB_LOG_WRITE;
540
541 struct ovsdb_error *error = NULL;
542 if (fseeko(file->stream, file->offset, SEEK_SET)) {
543 error = ovsdb_io_error(errno, "%s: cannot seek to offset %lld",
544 file->display_name,
545 (long long int) file->offset);
546 } else if (ftruncate(fileno(file->stream), file->offset)) {
547 error = ovsdb_io_error(errno, "%s: cannot truncate to length %lld",
548 file->display_name,
549 (long long int) file->offset);
550 }
551 return error;
552}
553
c9167341
BP
554/* Composes a log record for 'json' by filling 'header' with a header line and
555 * 'data' with a data line (each ending with a new-line). To write the record
556 * to a file, write 'header' followed by 'data'.
557 *
02f4f231
BP
558 * 'magic' is the magic to use in the header record, e.g. OVSDB_MAGIC.
559 *
c9167341
BP
560 * The caller must initialize 'header' and 'data' to empty strings. */
561void
562ovsdb_log_compose_record(const struct json *json,
19b276cb 563 const char *magic, struct ds *header, struct ds *data)
c9167341
BP
564{
565 ovs_assert(json->type == JSON_OBJECT || json->type == JSON_ARRAY);
566 ovs_assert(!header->length);
567 ovs_assert(!data->length);
568
569 /* Compose content. */
570 json_to_ds(json, 0, data);
571 ds_put_char(data, '\n');
572
573 /* Compose header. */
574 uint8_t sha1[SHA1_DIGEST_SIZE];
575 sha1_bytes(data->string, data->length, sha1);
fed27759 576 ds_put_format(header, "OVSDB %s %"PRIuSIZE" "SHA1_FMT"\n",
c9167341
BP
577 magic, data->length, SHA1_ARGS(sha1));
578}
579
226600d9 580/* Writes log record 'json' to 'file'. Returns NULL if successful or an error
4cc9d1f0
BP
581 * (which the caller must eventually destroy) on failure.
582 *
583 * If the log contains some records that have not yet been read, then calling
584 * this function truncates them.
585 *
586 * Log writes are atomic. A client may use ovsdb_log_commit() to ensure that
587 * they are durable.
588 */
f85f8ebb 589struct ovsdb_error *
226600d9 590ovsdb_log_write(struct ovsdb_log *file, const struct json *json)
f85f8ebb 591{
421fc8a1
BP
592 switch (file->state) {
593 case OVSDB_LOG_WRITE:
594 break;
595
596 case OVSDB_LOG_READ:
597 case OVSDB_LOG_READ_ERROR:
598 case OVSDB_LOG_WRITE_ERROR:
421fc8a1 599 ovsdb_error_destroy(file->error);
02f4f231
BP
600 file->error = ovsdb_log_truncate(file);
601 if (file->error) {
602 file->state = OVSDB_LOG_WRITE_ERROR;
603 return ovsdb_error_clone(file->error);
f85f8ebb 604 }
02f4f231 605 file->state = OVSDB_LOG_WRITE;
421fc8a1
BP
606 break;
607
608 case OVSDB_LOG_BROKEN:
609 return ovsdb_error_clone(file->error);
f85f8ebb
BP
610 }
611
612 if (json->type != JSON_OBJECT && json->type != JSON_ARRAY) {
02f4f231 613 return OVSDB_BUG("bad JSON type");
f85f8ebb
BP
614 }
615
c9167341
BP
616 struct ds header = DS_EMPTY_INITIALIZER;
617 struct ds data = DS_EMPTY_INITIALIZER;
19b276cb 618 ovsdb_log_compose_record(json, file->magic, &header, &data);
c9167341 619 size_t total_length = header.length + data.length;
f85f8ebb
BP
620
621 /* Write. */
c9167341
BP
622 bool ok = (fwrite(header.string, header.length, 1, file->stream) == 1
623 && fwrite(data.string, data.length, 1, file->stream) == 1
624 && fflush(file->stream) == 0);
625 ds_destroy(&header);
626 ds_destroy(&data);
627 if (!ok) {
02f4f231 628 int error = errno;
f85f8ebb 629
c7007aa7
BP
630 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
631 VLOG_WARN_RL(&rl, "%s: write failed (%s)",
02f4f231 632 file->name, ovs_strerror(error));
c7007aa7 633
f85f8ebb
BP
634 /* Remove any partially written data, ignoring errors since there is
635 * nothing further we can do. */
18b9283b 636 ignore(ftruncate(fileno(file->stream), file->offset));
f85f8ebb 637
02f4f231
BP
638 file->error = ovsdb_io_error(error, "%s: write failed",
639 file->display_name);
640 file->state = OVSDB_LOG_WRITE_ERROR;
641 return ovsdb_error_clone(file->error);
f85f8ebb
BP
642 }
643
c9167341 644 file->offset += total_length;
e3c17733 645 return NULL;
f85f8ebb
BP
646}
647
1b1d2e6d
BP
648struct ovsdb_error * OVS_WARN_UNUSED_RESULT
649ovsdb_log_write_and_free(struct ovsdb_log *log, struct json *json)
650{
651 struct ovsdb_error *error = ovsdb_log_write(log, json);
652 json_destroy(json);
653 return error;
654}
655
f70b61d3
BP
656/* Attempts to commit 'file' to disk. Waits for the commit to succeed or fail.
657 * Returns NULL if successful, otherwise the error that occurred. */
f85f8ebb 658struct ovsdb_error *
f70b61d3 659ovsdb_log_commit_block(struct ovsdb_log *file)
f85f8ebb 660{
421fc8a1 661 if (file->stream && fsync(fileno(file->stream))) {
02f4f231 662 return ovsdb_io_error(errno, "%s: fsync failed", file->display_name);
f85f8ebb 663 }
e3c17733 664 return NULL;
f85f8ebb 665}
41709ccc 666
4cc9d1f0
BP
667/* Sets the current position in 'log' as the "base", that is, the initial size
668 * of the log that ovsdb_log_grew_lots() uses to determine whether the log has
669 * grown enough to make compacting worthwhile. */
670void
671ovsdb_log_mark_base(struct ovsdb_log *log)
ada496b5 672{
4cc9d1f0
BP
673 log->base = log->offset;
674}
675
676/* Returns true if 'log' has grown enough above the base that it's worthwhile
677 * to compact it, false otherwise. */
678bool
679ovsdb_log_grew_lots(const struct ovsdb_log *log)
680{
1cfdc175 681 return log->offset > 10 * 1024 * 1024 && log->offset / 2 > log->base;
ada496b5 682}
421fc8a1
BP
683\f
684/* Attempts to atomically replace the contents of 'log', on disk, by the 'n'
685 * entries in 'entries'. If successful, returns NULL, otherwise returns an
686 * error (which the caller must eventually free).
687 *
688 * If successful, 'log' will be in write mode at the end of the log. */
689struct ovsdb_error * OVS_WARN_UNUSED_RESULT
690ovsdb_log_replace(struct ovsdb_log *log, struct json **entries, size_t n)
691{
692 struct ovsdb_error *error;
693 struct ovsdb_log *new;
694
695 error = ovsdb_log_replace_start(log, &new);
696 if (error) {
697 return error;
698 }
699
700 for (size_t i = 0; i < n; i++) {
701 error = ovsdb_log_write(new, entries[i]);
702 if (error) {
703 ovsdb_log_replace_abort(new);
704 return error;
705 }
706 }
4cc9d1f0 707 ovsdb_log_mark_base(new);
421fc8a1
BP
708
709 return ovsdb_log_replace_commit(log, new);
710}
711
712struct ovsdb_error * OVS_WARN_UNUSED_RESULT
713ovsdb_log_replace_start(struct ovsdb_log *old,
714 struct ovsdb_log **newp)
715{
716 /* If old->name is a symlink, then we want the new file to be in the same
717 * directory as the symlink's referent. */
718 char *deref_name = follow_symlinks(old->name);
719 char *tmp_name = xasprintf("%s.tmp", deref_name);
720 free(deref_name);
721
722 struct ovsdb_error *error;
723
724 ovs_assert(old->lockfile);
725
726 /* Remove temporary file. (It might not exist.) */
727 if (unlink(tmp_name) < 0 && errno != ENOENT) {
728 error = ovsdb_io_error(errno, "failed to remove %s", tmp_name);
729 free(tmp_name);
730 *newp = NULL;
731 return error;
732 }
733
734 /* Create temporary file. */
735 error = ovsdb_log_open(tmp_name, old->magic, OVSDB_LOG_CREATE_EXCL,
736 false, newp);
737 free(tmp_name);
738 return error;
739}
740
741/* Rename 'old' to 'new', replacing 'new' if it exists. Returns NULL if
742 * successful, otherwise an ovsdb_error that the caller must destroy. */
743static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
744ovsdb_rename(const char *old, const char *new)
745{
746#ifdef _WIN32
747 /* Avoid rename() because it fails if the destination exists. */
748 int error = (MoveFileEx(old, new, MOVEFILE_REPLACE_EXISTING
749 | MOVEFILE_WRITE_THROUGH | MOVEFILE_COPY_ALLOWED)
750 ? 0 : EACCES);
751#else
752 int error = rename(old, new) ? errno : 0;
753#endif
754
755 return (error
756 ? ovsdb_io_error(error, "failed to rename \"%s\" to \"%s\"",
757 old, new)
758 : NULL);
759}
760
761struct ovsdb_error * OVS_WARN_UNUSED_RESULT
762ovsdb_log_replace_commit(struct ovsdb_log *old, struct ovsdb_log *new)
763{
f70b61d3 764 struct ovsdb_error *error = ovsdb_log_commit_block(new);
421fc8a1
BP
765 if (error) {
766 ovsdb_log_replace_abort(new);
767 return error;
768 }
769
770 /* Replace original file by the temporary file.
771 *
772 * We support two strategies:
773 *
774 * - The preferred strategy is to rename the temporary file over the
775 * original one in-place, then close the original one. This works on
776 * Unix-like systems. It does not work on Windows, which does not
777 * allow open files to be renamed. The approach has the advantage
778 * that, at any point, we can drop back to something that already
779 * works.
780 *
781 * - Alternatively, we can close both files, rename, then open the new
782 * file (which now has the original name). This works on all
783 * systems, but if reopening the file fails then 'old' is broken.
784 *
785 * We make the strategy a variable instead of an #ifdef to make it easier
786 * to test both strategies on Unix-like systems, and to make the code
787 * easier to read. */
788 if (!rename_open_files) {
789 fclose(old->stream);
790 old->stream = NULL;
791
792 fclose(new->stream);
793 new->stream = NULL;
794 }
795
796 /* Rename 'old' to 'new'. We dereference the old name because, if it is a
797 * symlink, we want to replace the referent of the symlink instead of the
798 * symlink itself. */
799 char *deref_name = follow_symlinks(old->name);
800 error = ovsdb_rename(new->name, deref_name);
801 free(deref_name);
802
803 if (error) {
804 ovsdb_log_replace_abort(new);
805 return error;
806 }
807 if (rename_open_files) {
808 fsync_parent_dir(old->name);
809 fclose(old->stream);
810 old->stream = new->stream;
811 new->stream = NULL;
812 } else {
813 old->stream = fopen(old->name, "r+b");
814 if (!old->stream) {
815 old->error = ovsdb_io_error(errno, "%s: could not reopen log",
816 old->name);
817 old->state = OVSDB_LOG_BROKEN;
818 return ovsdb_error_clone(old->error);
819 }
820
821 if (fseek(old->stream, new->offset, SEEK_SET)) {
822 old->error = ovsdb_io_error(errno, "%s: seek failed", old->name);
823 old->state = OVSDB_LOG_BROKEN;
824 return ovsdb_error_clone(old->error);
825 }
826 }
827
828 /* Replace 'old' by 'new' in memory.
829 *
830 * 'old' transitions to OVSDB_LOG_WRITE (it was probably in that mode
831 * anyway). */
832 old->state = OVSDB_LOG_WRITE;
833 ovsdb_error_destroy(old->error);
834 old->error = NULL;
835 /* prev_offset only matters for OVSDB_LOG_READ. */
f70b61d3
BP
836 if (old->afsync) {
837 uint64_t ticket = afsync_destroy(old->afsync);
838 old->afsync = afsync_create(fileno(old->stream), ticket + 1);
839 }
421fc8a1
BP
840 old->offset = new->offset;
841 /* Keep old->name. */
842 free(old->magic);
843 old->magic = new->magic;
844 new->magic = NULL;
845 /* Keep old->lockfile. */
4cc9d1f0
BP
846 old->base = new->base;
847
421fc8a1
BP
848 /* Free 'new'. */
849 ovsdb_log_close(new);
850
851 return NULL;
852}
853
854void
855ovsdb_log_replace_abort(struct ovsdb_log *new)
856{
857 if (new) {
858 /* Unlink the new file, but only after we close it (because Windows
859 * does not allow removing an open file). */
860 char *name = xstrdup(new->name);
861 ovsdb_log_close(new);
862 unlink(name);
863 free(name);
864 }
865}
866
867void
868ovsdb_log_disable_renaming_open_files(void)
869{
870 rename_open_files = false;
871}
f70b61d3
BP
872\f
873struct afsync {
874 pthread_t thread;
875 atomic_uint64_t cur, next;
876 struct seq *request, *complete;
877 int fd;
878};
879
880static void *
881afsync_thread(void *afsync_)
882{
883 struct afsync *afsync = afsync_;
884 uint64_t cur = 0;
885 for (;;) {
886 ovsrcu_quiesce_start();
887
888 uint64_t request_seq = seq_read(afsync->request);
889
890 uint64_t next;
891 atomic_read_explicit(&afsync->next, &next, memory_order_acquire);
892 if (next == UINT64_MAX) {
893 break;
894 }
895
896 if (cur != next && afsync->fd != -1) {
897 int error = fsync(afsync->fd) ? errno : 0;
898 if (!error) {
899 cur = next;
900 atomic_store_explicit(&afsync->cur, cur, memory_order_release);
901 seq_change(afsync->complete);
902 } else {
903 VLOG_WARN("fsync failed (%s)", ovs_strerror(error));
904 }
905 }
906
907 seq_wait(afsync->request, request_seq);
908 poll_block();
909 }
910 return NULL;
911}
912
913static struct afsync *
914afsync_create(int fd, uint64_t initial_ticket)
915{
916 struct afsync *afsync = xzalloc(sizeof *afsync);
917 atomic_init(&afsync->cur, initial_ticket);
918 atomic_init(&afsync->next, initial_ticket);
919 afsync->request = seq_create();
920 afsync->complete = seq_create();
921 afsync->thread = ovs_thread_create("log_fsync", afsync_thread, afsync);
922 afsync->fd = fd;
923 return afsync;
924}
925
926static uint64_t
927afsync_destroy(struct afsync *afsync)
928{
929 if (!afsync) {
930 return 0;
931 }
932
933 uint64_t next;
934 atomic_read(&afsync->next, &next);
935 atomic_store(&afsync->next, UINT64_MAX);
936 seq_change(afsync->request);
937 xpthread_join(afsync->thread, NULL);
938
939 seq_destroy(afsync->request);
940 seq_destroy(afsync->complete);
941
942 free(afsync);
943
944 return next;
945}
946
947static struct afsync *
948ovsdb_log_get_afsync(struct ovsdb_log *log)
949{
950 if (!log->afsync) {
951 log->afsync = afsync_create(log->stream ? fileno(log->stream) : -1, 0);
952 }
953 return log->afsync;
954}
955
956/* Starts committing 'log' to disk. Returns a ticket that can be passed to
957 * ovsdb_log_commit_wait() or compared against the return value of
958 * ovsdb_log_commit_progress() later. */
959uint64_t
960ovsdb_log_commit_start(struct ovsdb_log *log)
961{
962 struct afsync *afsync = ovsdb_log_get_afsync(log);
963
964 uint64_t orig;
965 atomic_add_explicit(&afsync->next, 1, &orig, memory_order_acq_rel);
966
967 seq_change(afsync->request);
968
969 return orig + 1;
970}
971
972/* Returns a ticket value that represents the current progress of commits to
973 * 'log'. Suppose that some call to ovsdb_log_commit_start() returns X and any
974 * call ovsdb_log_commit_progress() returns Y, for the same 'log'. Then commit
975 * X is complete if and only if X <= Y. */
976uint64_t
977ovsdb_log_commit_progress(struct ovsdb_log *log)
978{
979 struct afsync *afsync = ovsdb_log_get_afsync(log);
980 uint64_t cur;
981 atomic_read_explicit(&afsync->cur, &cur, memory_order_acquire);
982 return cur;
983}
984
985/* Causes poll_block() to wake up if and when ovsdb_log_commit_progress(log)
986 * would return at least 'goal'. */
987void
988ovsdb_log_commit_wait(struct ovsdb_log *log, uint64_t goal)
989{
990 struct afsync *afsync = ovsdb_log_get_afsync(log);
991 uint64_t complete = seq_read(afsync->complete);
992 uint64_t cur = ovsdb_log_commit_progress(log);
993 if (cur < goal) {
994 seq_wait(afsync->complete, complete);
995 } else {
996 poll_immediate_wake();
997 }
998}