]> git.proxmox.com Git - mirror_ovs.git/blob - ovsdb/log.c
ovsdb: Loosen requirements for automatically compacting databases.
[mirror_ovs.git] / ovsdb / log.c
1 /* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2017 Nicira, Inc.
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include <config.h>
17
18 #include "log.h"
19
20 #include <errno.h>
21 #include <fcntl.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <sys/stat.h>
25 #include <unistd.h>
26
27 #include "openvswitch/dynamic-string.h"
28 #include "openvswitch/json.h"
29 #include "openvswitch/vlog.h"
30 #include "lockfile.h"
31 #include "ovsdb.h"
32 #include "ovsdb-error.h"
33 #include "sha1.h"
34 #include "socket-util.h"
35 #include "transaction.h"
36 #include "util.h"
37
38 VLOG_DEFINE_THIS_MODULE(ovsdb_log);
39
/* The states that an open log moves through.
 *
 *   - OVSDB_LOG_READ: where every newly opened log starts.  Only reads are
 *     possible here.  Hitting end of file leaves the state unchanged; a failed
 *     read moves the log to OVSDB_LOG_READ_ERROR.
 *
 *   - OVSDB_LOG_READ_ERROR: every further read fails with the same error that
 *     was reported originally.  A write moves the log to OVSDB_LOG_WRITE or
 *     OVSDB_LOG_WRITE_ERROR.
 *
 *   - OVSDB_LOG_WRITE: entered after a successful ovsdb_log_write().  It can
 *     be reached from any state except OVSDB_LOG_BROKEN.  A failed write moves
 *     the log to OVSDB_LOG_WRITE_ERROR.
 *
 *   - OVSDB_LOG_WRITE_ERROR: entered after a failed write.  Later writes retry
 *     and may bring the log back to OVSDB_LOG_WRITE.
 *
 *   - OVSDB_LOG_BROKEN: entered when ovsdb_log_replace() or
 *     ovsdb_log_replace_commit() fails so badly that no further read or write
 *     can succeed.  Nothing leaves this state.
 */
enum ovsdb_log_state {
    OVSDB_LOG_READ,             /* Ready to read. */
    OVSDB_LOG_READ_ERROR,       /* Read failed, see 'error' for details. */
    OVSDB_LOG_WRITE,            /* Ready to write. */
    OVSDB_LOG_WRITE_ERROR,      /* Write failed, see 'error' for details. */
    OVSDB_LOG_BROKEN,           /* Disk on fire, see 'error' for details. */
};
68
69 struct ovsdb_log {
70 enum ovsdb_log_state state;
71 struct ovsdb_error *error;
72
73 off_t prev_offset;
74 off_t offset;
75 char *name; /* Absolute name of file. */
76 char *display_name; /* For use in log messages, etc. */
77 char *magic;
78 struct lockfile *lockfile;
79 FILE *stream;
80 off_t base;
81 };
82
/* True if the OS lets a file be renamed while it is open.
 *
 * (This is a variable rather than a compile-time constant so that both
 * replacement strategies can be exercised on Unix-like systems.) */
#ifndef _WIN32
static bool rename_open_files = true;
#else
static bool rename_open_files = false;
#endif
92
93 static bool parse_header(char *header, const char **magicp,
94 unsigned long int *length,
95 uint8_t sha1[SHA1_DIGEST_SIZE]);
96 static bool is_magic_ok(const char *needle, const char *haystack);
97
98 /* Attempts to open 'name' with the specified 'open_mode'. On success, stores
99 * the new log into '*filep' and returns NULL; otherwise returns NULL and
100 * stores NULL into '*filep'.
101 *
102 * 'magic' is a short text string put at the beginning of every record and used
103 * to distinguish one kind of log file from another. For a conventional OVSDB
104 * log file, use the OVSDB_MAGIC macro. To accept more than one magic string,
105 * separate them with "|", e.g. "MAGIC 1|MAGIC 2".
106 *
107 * Whether the file will be locked using lockfile_lock() depends on 'locking':
108 * use true to lock it, false not to lock it, or -1 to lock it only if
109 * 'open_mode' is a mode that allows writing.
110 *
111 * A log consists of a series of records. After opening or creating a log with
112 * this function, the client may use ovsdb_log_read() to read any existing
113 * records, one by one. The client may also use ovsdb_log_write() to write new
114 * records (if some records have not yet been read at this point, then the
115 * first write truncates them).
116 */
117 struct ovsdb_error *
118 ovsdb_log_open(const char *name, const char *magic,
119 enum ovsdb_log_open_mode open_mode,
120 int locking, struct ovsdb_log **filep)
121 {
122 struct lockfile *lockfile;
123 struct ovsdb_error *error;
124 struct stat s;
125 FILE *stream;
126 int flags;
127 int fd;
128
129 /* If we can create a new file, we need to know what kind of magic to
130 * use, so there must be only one kind. */
131 if (open_mode == OVSDB_LOG_CREATE_EXCL || open_mode == OVSDB_LOG_CREATE) {
132 ovs_assert(!strchr(magic, '|'));
133 }
134
135 *filep = NULL;
136
137 /* Get the absolute name of the file because we might need to access it by
138 * name again later after the process has changed directory (e.g. because
139 * daemonize() chdirs to "/").
140 *
141 * We save the user-provided name of the file for use in log messages, to
142 * reduce user confusion. */
143 char *abs_name = abs_file_name(NULL, name);
144 if (!abs_name) {
145 error = ovsdb_io_error(0, "could not determine current "
146 "working directory");
147 goto error;
148 }
149
150 ovs_assert(locking == -1 || locking == false || locking == true);
151 if (locking < 0) {
152 locking = open_mode != OVSDB_LOG_READ_ONLY;
153 }
154 if (locking) {
155 int retval = lockfile_lock(name, &lockfile);
156 if (retval) {
157 error = ovsdb_io_error(retval, "%s: failed to lock lockfile",
158 name);
159 goto error;
160 }
161 } else {
162 lockfile = NULL;
163 }
164
165 switch (open_mode) {
166 case OVSDB_LOG_READ_ONLY:
167 flags = O_RDONLY;
168 break;
169
170 case OVSDB_LOG_READ_WRITE:
171 flags = O_RDWR;
172 break;
173
174 case OVSDB_LOG_CREATE_EXCL:
175 #ifndef _WIN32
176 if (stat(name, &s) == -1 && errno == ENOENT
177 && lstat(name, &s) == 0 && S_ISLNK(s.st_mode)) {
178 /* 'name' is a dangling symlink. We want to create the file that
179 * the symlink points to, but POSIX says that open() with O_EXCL
180 * must fail with EEXIST if the named file is a symlink. So, we
181 * have to leave off O_EXCL and accept the race. */
182 flags = O_RDWR | O_CREAT;
183 } else {
184 flags = O_RDWR | O_CREAT | O_EXCL;
185 }
186 #else
187 flags = O_RDWR | O_CREAT | O_EXCL;
188 #endif
189 break;
190
191 case OVSDB_LOG_CREATE:
192 flags = O_RDWR | O_CREAT;
193 break;
194
195 default:
196 OVS_NOT_REACHED();
197 }
198 #ifdef _WIN32
199 flags = flags | O_BINARY;
200 #endif
201 /* Special case for /dev/stdin to make it work even if the operating system
202 * doesn't support it under that name. */
203 if (!strcmp(name, "/dev/stdin") && open_mode == OVSDB_LOG_READ_ONLY) {
204 fd = dup(STDIN_FILENO);
205 } else {
206 fd = open(name, flags, 0666);
207 }
208 if (fd < 0) {
209 const char *op = (open_mode == OVSDB_LOG_CREATE_EXCL ? "create"
210 : open_mode == OVSDB_LOG_CREATE ? "create or open"
211 : "open");
212 error = ovsdb_io_error(errno, "%s: %s failed", name, op);
213 goto error_unlock;
214 }
215
216 stream = fdopen(fd, open_mode == OVSDB_LOG_READ_ONLY ? "rb" : "w+b");
217 if (!stream) {
218 error = ovsdb_io_error(errno, "%s: fdopen failed", name);
219 close(fd);
220 goto error_unlock;
221 }
222
223 /* Read the magic from the first log record. */
224 char header[128];
225 const char *actual_magic;
226 if (!fgets(header, sizeof header, stream)) {
227 if (ferror(stream)) {
228 error = ovsdb_io_error(errno, "%s: read error", name);
229 goto error_fclose;
230 }
231
232 /* We need to be able to report what kind of file this is but we can't
233 * if it's empty and we accept more than one. */
234 if (strchr(magic, '|')) {
235 error = ovsdb_error(NULL, "%s: cannot identify file type", name);
236 goto error_fclose;
237 }
238 actual_magic = magic;
239
240 /* It's an empty file and therefore probably a new file, so fsync()
241 * its parent directory to ensure that its directory entry is
242 * committed to disk. */
243 fsync_parent_dir(name);
244 } else {
245 unsigned long int length;
246 uint8_t sha1[SHA1_DIGEST_SIZE];
247 if (!parse_header(header, &actual_magic, &length, sha1)) {
248 error = ovsdb_error(NULL, "%s: unexpected file format", name);
249 goto error_fclose;
250 } else if (!is_magic_ok(actual_magic, magic)) {
251 error = ovsdb_error(NULL, "%s: cannot identify file type", name);
252 goto error_fclose;
253 }
254 }
255
256 if (fseek(stream, 0, SEEK_SET)) {
257 error = ovsdb_io_error(errno, "%s: seek failed", name);
258 goto error_fclose;
259 }
260
261 struct ovsdb_log *file = xmalloc(sizeof *file);
262 file->state = OVSDB_LOG_READ;
263 file->error = NULL;
264 file->name = abs_name;
265 file->display_name = xstrdup(name);
266 file->magic = xstrdup(actual_magic);
267 file->lockfile = lockfile;
268 file->stream = stream;
269 file->prev_offset = 0;
270 file->offset = 0;
271 file->base = 0;
272 *filep = file;
273 return NULL;
274
275 error_fclose:
276 fclose(stream);
277 error_unlock:
278 lockfile_unlock(lockfile);
279 error:
280 free(abs_name);
281 return error;
282 }
283
/* Reports whether 'needle' is exactly equal to one of the words in 'haystack',
 * where the words in 'haystack' are separated by '|'.  A 'needle' that itself
 * contains '|' never matches. */
static bool
is_magic_ok(const char *needle, const char *haystack)
{
    if (strchr(needle, '|') != NULL) {
        return false;
    }

    const size_t needle_len = strlen(needle);
    const char *word = haystack;
    while (word != NULL) {
        /* A successful strncmp() proves that 'word' is at least 'needle_len'
         * bytes long, so looking at word[needle_len] is in bounds.  The word
         * matches only if it ends right there. */
        if (strncmp(needle, word, needle_len) == 0) {
            char after = word[needle_len];
            if (after == '|' || after == '\0') {
                return true;
            }
        }

        const char *bar = strchr(word, '|');
        word = bar != NULL ? bar + 1 : NULL;
    }
    return false;
}
305
306 void
307 ovsdb_log_close(struct ovsdb_log *file)
308 {
309 if (file) {
310 ovsdb_error_destroy(file->error);
311 free(file->name);
312 free(file->display_name);
313 free(file->magic);
314 if (file->stream) {
315 fclose(file->stream);
316 }
317 lockfile_unlock(file->lockfile);
318 free(file);
319 }
320 }
321
322 const char *
323 ovsdb_log_get_magic(const struct ovsdb_log *log)
324 {
325 return log->magic;
326 }
327
328 /* Attempts to parse 'header' as a header line for an OVSDB log record (as
329 * described in ovsdb(5)). Stores a pointer to the magic string in '*magicp',
330 * the length in *length, and the parsed sha1 value in sha1[].
331 *
332 * Modifies 'header' and points '*magicp' inside it.
333 *
334 * Returns true if successful, false on failure. */
335 static bool
336 parse_header(char *header, const char **magicp,
337 unsigned long int *length, uint8_t sha1[SHA1_DIGEST_SIZE])
338 {
339 /* 'header' must consist of "OVSDB "... */
340 const char lead[] = "OVSDB ";
341 if (strncmp(lead, header, strlen(lead))) {
342 return false;
343 }
344
345 /* ...followed by a magic string... */
346 char *magic = header + strlen(lead);
347 size_t magic_len = strcspn(magic, " ");
348 if (magic[magic_len] != ' ') {
349 return false;
350 }
351 magic[magic_len] = '\0';
352 *magicp = magic;
353
354 /* ...followed by a length in bytes... */
355 char *p;
356 *length = strtoul(magic + magic_len + 1, &p, 10);
357 if (!*length || *length == ULONG_MAX || *p != ' ') {
358 return false;
359 }
360 p++;
361
362 /* ...followed by a SHA-1 hash... */
363 if (!sha1_from_hex(sha1, p)) {
364 return false;
365 }
366 p += SHA1_HEX_DIGEST_LEN;
367
368 /* ...and ended by a new-line. */
369 if (*p != '\n') {
370 return false;
371 }
372
373 return true;
374 }
375
376 static struct ovsdb_error *
377 parse_body(struct ovsdb_log *file, off_t offset, unsigned long int length,
378 uint8_t sha1[SHA1_DIGEST_SIZE], struct json **jsonp)
379 {
380 struct json_parser *parser;
381 struct sha1_ctx ctx;
382
383 sha1_init(&ctx);
384 parser = json_parser_create(JSPF_TRAILER);
385
386 while (length > 0) {
387 char input[BUFSIZ];
388 int chunk;
389
390 chunk = MIN(length, sizeof input);
391 if (fread(input, 1, chunk, file->stream) != chunk) {
392 json_parser_abort(parser);
393 return ovsdb_io_error(ferror(file->stream) ? errno : EOF,
394 "%s: error reading %lu bytes "
395 "starting at offset %lld",
396 file->display_name, length,
397 (long long int) offset);
398 }
399 sha1_update(&ctx, input, chunk);
400 json_parser_feed(parser, input, chunk);
401 length -= chunk;
402 }
403
404 sha1_final(&ctx, sha1);
405 *jsonp = json_parser_finish(parser);
406 return NULL;
407 }
408
409 /* Attempts to read a log record from 'file'.
410 *
411 * If successful, returns NULL and stores in '*jsonp' the JSON object that the
412 * record contains. The caller owns the data and must eventually free it (with
413 * json_destroy()).
414 *
415 * If a read error occurs, returns the error and stores NULL in '*jsonp'.
416 *
417 * If the read reaches end of file, returns NULL and stores NULL in
418 * '*jsonp'. */
419 struct ovsdb_error *
420 ovsdb_log_read(struct ovsdb_log *file, struct json **jsonp)
421 {
422 *jsonp = NULL;
423 switch (file->state) {
424 case OVSDB_LOG_READ:
425 break;
426
427 case OVSDB_LOG_READ_ERROR:
428 case OVSDB_LOG_WRITE_ERROR:
429 case OVSDB_LOG_BROKEN:
430 return ovsdb_error_clone(file->error);
431
432 case OVSDB_LOG_WRITE:
433 return NULL;
434 }
435
436 uint8_t expected_sha1[SHA1_DIGEST_SIZE];
437 uint8_t actual_sha1[SHA1_DIGEST_SIZE];
438 struct ovsdb_error *error;
439 unsigned long data_length;
440 struct json *json;
441 char header[128];
442
443 json = NULL;
444
445 if (!fgets(header, sizeof header, file->stream)) {
446 if (feof(file->stream)) {
447 return NULL;
448 }
449 error = ovsdb_io_error(errno, "%s: read failed", file->display_name);
450 goto error;
451 }
452 off_t data_offset = file->offset + strlen(header);
453
454 const char *magic;
455 if (!parse_header(header, &magic, &data_length, expected_sha1)
456 || strcmp(magic, file->magic)) {
457 error = ovsdb_syntax_error(NULL, NULL, "%s: parse error at offset "
458 "%lld in header line \"%.*s\"",
459 file->display_name,
460 (long long int) file->offset,
461 (int) strcspn(header, "\n"), header);
462 goto error;
463 }
464
465 error = parse_body(file, data_offset, data_length, actual_sha1, &json);
466 if (error) {
467 goto error;
468 }
469
470 if (memcmp(expected_sha1, actual_sha1, SHA1_DIGEST_SIZE)) {
471 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
472 "offset %lld have SHA-1 hash "SHA1_FMT" "
473 "but should have hash "SHA1_FMT,
474 file->display_name, data_length,
475 (long long int) data_offset,
476 SHA1_ARGS(actual_sha1),
477 SHA1_ARGS(expected_sha1));
478 goto error;
479 }
480
481 if (json->type == JSON_STRING) {
482 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
483 "offset %lld are not valid JSON (%s)",
484 file->display_name, data_length,
485 (long long int) data_offset,
486 json->u.string);
487 goto error;
488 }
489 if (json->type != JSON_OBJECT) {
490 error = ovsdb_syntax_error(NULL, NULL, "%s: %lu bytes starting at "
491 "offset %lld are not a JSON object",
492 file->display_name, data_length,
493 (long long int) data_offset);
494 goto error;
495 }
496
497 file->prev_offset = file->offset;
498 file->offset = data_offset + data_length;
499 *jsonp = json;
500 return NULL;
501
502 error:
503 file->state = OVSDB_LOG_READ_ERROR;
504 file->error = ovsdb_error_clone(error);
505 json_destroy(json);
506 return error;
507 }
508
509 /* Causes the log record read by the previous call to ovsdb_log_read() to be
510 * effectively discarded. The next call to ovsdb_log_write() will overwrite
511 * that previously read record.
512 *
513 * Calling this function more than once has no additional effect.
514 *
515 * This function is useful when ovsdb_log_read() successfully reads a record
516 * but that record does not make sense at a higher level (e.g. it specifies an
517 * invalid transaction). */
518 void
519 ovsdb_log_unread(struct ovsdb_log *file)
520 {
521 ovs_assert(file->state == OVSDB_LOG_READ);
522 file->offset = file->prev_offset;
523 }
524
525 static struct ovsdb_error *
526 ovsdb_log_truncate(struct ovsdb_log *file)
527 {
528 file->state = OVSDB_LOG_WRITE;
529
530 struct ovsdb_error *error = NULL;
531 if (fseeko(file->stream, file->offset, SEEK_SET)) {
532 error = ovsdb_io_error(errno, "%s: cannot seek to offset %lld",
533 file->display_name,
534 (long long int) file->offset);
535 } else if (ftruncate(fileno(file->stream), file->offset)) {
536 error = ovsdb_io_error(errno, "%s: cannot truncate to length %lld",
537 file->display_name,
538 (long long int) file->offset);
539 }
540 return error;
541 }
542
543 /* Composes a log record for 'json' by filling 'header' with a header line and
544 * 'data' with a data line (each ending with a new-line). To write the record
545 * to a file, write 'header' followed by 'data'.
546 *
547 * 'magic' is the magic to use in the header record, e.g. OVSDB_MAGIC.
548 *
549 * The caller must initialize 'header' and 'data' to empty strings. */
550 void
551 ovsdb_log_compose_record(const struct json *json,
552 const char *magic, struct ds *header, struct ds *data)
553 {
554 ovs_assert(json->type == JSON_OBJECT || json->type == JSON_ARRAY);
555 ovs_assert(!header->length);
556 ovs_assert(!data->length);
557
558 /* Compose content. */
559 json_to_ds(json, 0, data);
560 ds_put_char(data, '\n');
561
562 /* Compose header. */
563 uint8_t sha1[SHA1_DIGEST_SIZE];
564 sha1_bytes(data->string, data->length, sha1);
565 ds_put_format(header, "OVSDB %s %"PRIuSIZE" "SHA1_FMT"\n",
566 magic, data->length, SHA1_ARGS(sha1));
567 }
568
569 /* Writes log record 'json' to 'file'. Returns NULL if successful or an error
570 * (which the caller must eventually destroy) on failure.
571 *
572 * If the log contains some records that have not yet been read, then calling
573 * this function truncates them.
574 *
575 * Log writes are atomic. A client may use ovsdb_log_commit() to ensure that
576 * they are durable.
577 */
578 struct ovsdb_error *
579 ovsdb_log_write(struct ovsdb_log *file, const struct json *json)
580 {
581 switch (file->state) {
582 case OVSDB_LOG_WRITE:
583 break;
584
585 case OVSDB_LOG_READ:
586 case OVSDB_LOG_READ_ERROR:
587 case OVSDB_LOG_WRITE_ERROR:
588 ovsdb_error_destroy(file->error);
589 file->error = ovsdb_log_truncate(file);
590 if (file->error) {
591 file->state = OVSDB_LOG_WRITE_ERROR;
592 return ovsdb_error_clone(file->error);
593 }
594 file->state = OVSDB_LOG_WRITE;
595 break;
596
597 case OVSDB_LOG_BROKEN:
598 return ovsdb_error_clone(file->error);
599 }
600
601 if (json->type != JSON_OBJECT && json->type != JSON_ARRAY) {
602 return OVSDB_BUG("bad JSON type");
603 }
604
605 struct ds header = DS_EMPTY_INITIALIZER;
606 struct ds data = DS_EMPTY_INITIALIZER;
607 ovsdb_log_compose_record(json, file->magic, &header, &data);
608 size_t total_length = header.length + data.length;
609
610 /* Write. */
611 bool ok = (fwrite(header.string, header.length, 1, file->stream) == 1
612 && fwrite(data.string, data.length, 1, file->stream) == 1
613 && fflush(file->stream) == 0);
614 ds_destroy(&header);
615 ds_destroy(&data);
616 if (!ok) {
617 int error = errno;
618
619 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
620 VLOG_WARN_RL(&rl, "%s: write failed (%s)",
621 file->name, ovs_strerror(error));
622
623 /* Remove any partially written data, ignoring errors since there is
624 * nothing further we can do. */
625 ignore(ftruncate(fileno(file->stream), file->offset));
626
627 file->error = ovsdb_io_error(error, "%s: write failed",
628 file->display_name);
629 file->state = OVSDB_LOG_WRITE_ERROR;
630 return ovsdb_error_clone(file->error);
631 }
632
633 file->offset += total_length;
634 return NULL;
635 }
636
637 struct ovsdb_error *
638 ovsdb_log_commit(struct ovsdb_log *file)
639 {
640 if (file->stream && fsync(fileno(file->stream))) {
641 return ovsdb_io_error(errno, "%s: fsync failed", file->display_name);
642 }
643 return NULL;
644 }
645
646 /* Sets the current position in 'log' as the "base", that is, the initial size
647 * of the log that ovsdb_log_grew_lots() uses to determine whether the log has
648 * grown enough to make compacting worthwhile. */
649 void
650 ovsdb_log_mark_base(struct ovsdb_log *log)
651 {
652 log->base = log->offset;
653 }
654
655 /* Returns true if 'log' has grown enough above the base that it's worthwhile
656 * to compact it, false otherwise. */
657 bool
658 ovsdb_log_grew_lots(const struct ovsdb_log *log)
659 {
660 return log->offset > 10 * 1024 * 1024 && log->offset / 2 > log->base;
661 }
662 \f
663 /* Attempts to atomically replace the contents of 'log', on disk, by the 'n'
664 * entries in 'entries'. If successful, returns NULL, otherwise returns an
665 * error (which the caller must eventually free).
666 *
667 * If successful, 'log' will be in write mode at the end of the log. */
668 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
669 ovsdb_log_replace(struct ovsdb_log *log, struct json **entries, size_t n)
670 {
671 struct ovsdb_error *error;
672 struct ovsdb_log *new;
673
674 error = ovsdb_log_replace_start(log, &new);
675 if (error) {
676 return error;
677 }
678
679 for (size_t i = 0; i < n; i++) {
680 error = ovsdb_log_write(new, entries[i]);
681 if (error) {
682 ovsdb_log_replace_abort(new);
683 return error;
684 }
685 }
686 ovsdb_log_mark_base(new);
687
688 return ovsdb_log_replace_commit(log, new);
689 }
690
691 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
692 ovsdb_log_replace_start(struct ovsdb_log *old,
693 struct ovsdb_log **newp)
694 {
695 /* If old->name is a symlink, then we want the new file to be in the same
696 * directory as the symlink's referent. */
697 char *deref_name = follow_symlinks(old->name);
698 char *tmp_name = xasprintf("%s.tmp", deref_name);
699 free(deref_name);
700
701 struct ovsdb_error *error;
702
703 ovs_assert(old->lockfile);
704
705 /* Remove temporary file. (It might not exist.) */
706 if (unlink(tmp_name) < 0 && errno != ENOENT) {
707 error = ovsdb_io_error(errno, "failed to remove %s", tmp_name);
708 free(tmp_name);
709 *newp = NULL;
710 return error;
711 }
712
713 /* Create temporary file. */
714 error = ovsdb_log_open(tmp_name, old->magic, OVSDB_LOG_CREATE_EXCL,
715 false, newp);
716 free(tmp_name);
717 return error;
718 }
719
720 /* Rename 'old' to 'new', replacing 'new' if it exists. Returns NULL if
721 * successful, otherwise an ovsdb_error that the caller must destroy. */
722 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
723 ovsdb_rename(const char *old, const char *new)
724 {
725 #ifdef _WIN32
726 /* Avoid rename() because it fails if the destination exists. */
727 int error = (MoveFileEx(old, new, MOVEFILE_REPLACE_EXISTING
728 | MOVEFILE_WRITE_THROUGH | MOVEFILE_COPY_ALLOWED)
729 ? 0 : EACCES);
730 #else
731 int error = rename(old, new) ? errno : 0;
732 #endif
733
734 return (error
735 ? ovsdb_io_error(error, "failed to rename \"%s\" to \"%s\"",
736 old, new)
737 : NULL);
738 }
739
740 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
741 ovsdb_log_replace_commit(struct ovsdb_log *old, struct ovsdb_log *new)
742 {
743 struct ovsdb_error *error = ovsdb_log_commit(new);
744 if (error) {
745 ovsdb_log_replace_abort(new);
746 return error;
747 }
748
749 /* Replace original file by the temporary file.
750 *
751 * We support two strategies:
752 *
753 * - The preferred strategy is to rename the temporary file over the
754 * original one in-place, then close the original one. This works on
755 * Unix-like systems. It does not work on Windows, which does not
756 * allow open files to be renamed. The approach has the advantage
757 * that, at any point, we can drop back to something that already
758 * works.
759 *
760 * - Alternatively, we can close both files, rename, then open the new
761 * file (which now has the original name). This works on all
762 * systems, but if reopening the file fails then 'old' is broken.
763 *
764 * We make the strategy a variable instead of an #ifdef to make it easier
765 * to test both strategies on Unix-like systems, and to make the code
766 * easier to read. */
767 if (!rename_open_files) {
768 fclose(old->stream);
769 old->stream = NULL;
770
771 fclose(new->stream);
772 new->stream = NULL;
773 }
774
775 /* Rename 'old' to 'new'. We dereference the old name because, if it is a
776 * symlink, we want to replace the referent of the symlink instead of the
777 * symlink itself. */
778 char *deref_name = follow_symlinks(old->name);
779 error = ovsdb_rename(new->name, deref_name);
780 free(deref_name);
781
782 if (error) {
783 ovsdb_log_replace_abort(new);
784 return error;
785 }
786 if (rename_open_files) {
787 fsync_parent_dir(old->name);
788 fclose(old->stream);
789 old->stream = new->stream;
790 new->stream = NULL;
791 } else {
792 old->stream = fopen(old->name, "r+b");
793 if (!old->stream) {
794 old->error = ovsdb_io_error(errno, "%s: could not reopen log",
795 old->name);
796 old->state = OVSDB_LOG_BROKEN;
797 return ovsdb_error_clone(old->error);
798 }
799
800 if (fseek(old->stream, new->offset, SEEK_SET)) {
801 old->error = ovsdb_io_error(errno, "%s: seek failed", old->name);
802 old->state = OVSDB_LOG_BROKEN;
803 return ovsdb_error_clone(old->error);
804 }
805 }
806
807 /* Replace 'old' by 'new' in memory.
808 *
809 * 'old' transitions to OVSDB_LOG_WRITE (it was probably in that mode
810 * anyway). */
811 old->state = OVSDB_LOG_WRITE;
812 ovsdb_error_destroy(old->error);
813 old->error = NULL;
814 /* prev_offset only matters for OVSDB_LOG_READ. */
815 old->offset = new->offset;
816 /* Keep old->name. */
817 free(old->magic);
818 old->magic = new->magic;
819 new->magic = NULL;
820 /* Keep old->lockfile. */
821 old->base = new->base;
822
823 /* Free 'new'. */
824 ovsdb_log_close(new);
825
826 return NULL;
827 }
828
829 void
830 ovsdb_log_replace_abort(struct ovsdb_log *new)
831 {
832 if (new) {
833 /* Unlink the new file, but only after we close it (because Windows
834 * does not allow removing an open file). */
835 char *name = xstrdup(new->name);
836 ovsdb_log_close(new);
837 unlink(name);
838 free(name);
839 }
840 }
841
842 void
843 ovsdb_log_disable_renaming_open_files(void)
844 {
845 rename_open_files = false;
846 }