lib/daemon-unix.c

   1 /*
   2  * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "backtrace.h"
  19 #include "daemon.h"
  20 #include "daemon-private.h"
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <grp.h>
  24 #include <pwd.h>
  25 #include <signal.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <sys/resource.h>
  29 #include <sys/wait.h>
  30 #include <sys/stat.h>
  31 #include <unistd.h>
  32 #if HAVE_LIBCAPNG
  33 #include <cap-ng.h>
  34 #endif
  35 #include "command-line.h"
  36 #include "fatal-signal.h"
  37 #include "dirs.h"
  38 #include "lockfile.h"
  39 #include "ovs-thread.h"
  40 #include "process.h"
  41 #include "socket-util.h"
  42 #include "timeval.h"
  43 #include "util.h"
  44 #include "openvswitch/vlog.h"
  45
  VLOG_DEFINE_THIS_MODULE(daemon_unix);

  /* LINUX is 1 when compiling for Linux and 0 otherwise, so that it can be
   * tested with an ordinary 'if' rather than a preprocessor conditional. */
  #ifdef __linux__
  #define LINUX 1
  #else
  #define LINUX 0
  #endif

  /* LIBCAPNG is 1 when HAVE_LIBCAPNG is set at build time and 0 otherwise,
   * again so that it can be tested with an ordinary 'if'. */
  #if HAVE_LIBCAPNG
  #define LIBCAPNG 1
  #else
  #define LIBCAPNG 0
  #endif

  /* --detach: Should we run in the background? */
  bool detach;                    /* Was --detach specified? */
  static bool detached;           /* Have we already detached? */

  /* --pidfile: Name of pidfile (null if none). */
  char *pidfile;

  /* Device and inode of pidfile, so we can avoid reopening it.  Both are
   * filled in by make_pidfile() and consulted by read_pidfile__(). */
  static dev_t pidfile_dev;
  static ino_t pidfile_ino;

  /* --overwrite-pidfile: Create pidfile even if one already exists and is
     locked? */
  static bool overwrite_pidfile;

  /* --no-chdir: Should we chdir to "/"? */
  static bool chdir_ = true;

  /* File descriptor used by daemonize_start() and daemonize_complete().
   * -1 means that there is nobody to notify. */
  int daemonize_fd = -1;

  /* --monitor: Should a supervisory process monitor the daemon and restart it if
   * it dies due to an error signal? */
  bool monitor;

  /* --user: Only root can use this option. Switch to new uid:gid after
   * initially running as root.  */
  static bool switch_user = false;    /* Is a user switch still pending? */
  static uid_t uid;                   /* Target user id. */
  static gid_t gid;                   /* Target group id. */
  static char *user = NULL;           /* Target user name, if known. */
  static void daemon_become_new_user__(bool access_datapath);

  /* Forward declarations of helpers defined later in this file. */
  static void check_already_running(void);
  static int lock_pidfile(FILE *, int command);
  static pid_t fork_and_clean_up(void);
  static void daemonize_post_detach(void);
  97
  98 /* Returns the file name that would be used for a pidfile if 'name' were
  99  * provided to set_pidfile().  The caller must free the returned string. */
 100 char *
 101 make_pidfile_name(const char *name)
 102 {
 103     return (!name
 104             ? xasprintf("%s/%s.pid", ovs_rundir(), program_name)
 105             : abs_file_name(ovs_rundir(), name));
 106 }
 107
 108 /* Sets that we do not chdir to "/". */
 109 void
 110 set_no_chdir(void)
 111 {
 112     chdir_ = false;
 113 }
 114
 115 /* Normally, daemonize() or damonize_start() will terminate the program with a
 116  * message if a locked pidfile already exists.  If this function is called, an
 117  * existing pidfile will be replaced, with a warning. */
 118 void
 119 ignore_existing_pidfile(void)
 120 {
 121     overwrite_pidfile = true;
 122 }
 123
 124 /* Sets up a following call to daemonize() to detach from the foreground
 125  * session, running this process in the background.  */
 126 void
 127 set_detach(void)
 128 {
 129     detach = true;
 130 }
 131
 132 /* Sets up a following call to daemonize() to fork a supervisory process to
 133  * monitor the daemon and restart it if it dies due to an error signal.  */
 134 void
 135 daemon_set_monitor(void)
 136 {
 137     monitor = true;
 138 }
 139
 140 /* If a pidfile has been configured, creates it and stores the running
 141  * process's pid in it.  Ensures that the pidfile will be deleted when the
 142  * process exits. */
 143 static void
 144 make_pidfile(void)
 145 {
 146     long int pid = getpid();
 147     struct stat s;
 148     char *tmpfile;
 149     FILE *file;
 150     int error;
 151
 152     /* Create a temporary pidfile. */
 153     if (overwrite_pidfile) {
 154         tmpfile = xasprintf("%s.tmp%ld", pidfile, pid);
 155         fatal_signal_add_file_to_unlink(tmpfile);
 156     } else {
 157         /* Everyone shares the same file which will be treated as a lock.  To
 158          * avoid some uncomfortable race conditions, we can't set up the fatal
 159          * signal unlink until we've acquired it. */
 160         tmpfile = xasprintf("%s.tmp", pidfile);
 161     }
 162
 163     file = fopen(tmpfile, "a+");
 164     if (!file) {
 165         VLOG_FATAL("%s: create failed (%s)", tmpfile, ovs_strerror(errno));
 166     }
 167
 168     error = lock_pidfile(file, F_SETLK);
 169     if (error) {
 170         /* Looks like we failed to acquire the lock.  Note that, if we failed
 171          * for some other reason (and '!overwrite_pidfile'), we will have
 172          * left 'tmpfile' as garbage in the file system. */
 173         VLOG_FATAL("%s: fcntl(F_SETLK) failed (%s)", tmpfile,
 174                    ovs_strerror(error));
 175     }
 176
 177     if (!overwrite_pidfile) {
 178         /* We acquired the lock.  Make sure to clean up on exit, and verify
 179          * that we're allowed to create the actual pidfile. */
 180         fatal_signal_add_file_to_unlink(tmpfile);
 181         check_already_running();
 182     }
 183
 184     if (fstat(fileno(file), &s) == -1) {
 185         VLOG_FATAL("%s: fstat failed (%s)", tmpfile, ovs_strerror(errno));
 186     }
 187
 188     if (ftruncate(fileno(file), 0) == -1) {
 189         VLOG_FATAL("%s: truncate failed (%s)", tmpfile, ovs_strerror(errno));
 190     }
 191
 192     fprintf(file, "%ld\n", pid);
 193     if (fflush(file) == EOF) {
 194         VLOG_FATAL("%s: write failed (%s)", tmpfile, ovs_strerror(errno));
 195     }
 196
 197     error = rename(tmpfile, pidfile);
 198
 199     /* Due to a race, 'tmpfile' may be owned by a different process, so we
 200      * shouldn't delete it on exit. */
 201     fatal_signal_remove_file_to_unlink(tmpfile);
 202
 203     if (error < 0) {
 204         VLOG_FATAL("failed to rename \"%s\" to \"%s\" (%s)",
 205                    tmpfile, pidfile, ovs_strerror(errno));
 206     }
 207
 208     /* Ensure that the pidfile will get deleted on exit. */
 209     fatal_signal_add_file_to_unlink(pidfile);
 210
 211     /* Clean up.
 212      *
 213      * We don't close 'file' because its file descriptor must remain open to
 214      * hold the lock. */
 215     pidfile_dev = s.st_dev;
 216     pidfile_ino = s.st_ino;
 217     free(tmpfile);
 218 }
 219
 /* Forks through xfork() and hands its return value back to the caller: 0 in
  * the child, the child's pid in the parent.
  *
  * Before returning, each side runs the post-fork hook that matters when the
  * child is not going to exec a new program: lockfile_postfork() in the
  * child, fatal_signal_fork() in the parent.
  *
  * NOTE(review): fork failure is expected to be handled inside xfork() (log
  * and exit); a negative return, should one occur, is passed through
  * untouched. */
 static pid_t
 fork_and_clean_up(void)
 {
     pid_t child = xfork();

     if (child == 0) {
         /* Child process. */
         lockfile_postfork();
     } else if (child > 0) {
         /* Parent process. */
         fatal_signal_fork();
     }
     return child;
 }
 239
 /* Forks, then:
  *
  *   - In the parent, waits for the child to signal that it has completed its
  *     startup sequence.  Then stores the read end of the notification pipe
  *     in '*fdp' (it stays open) and the child's pid in '*child_pid'.
  *
  *   - In the child, stores the write end of the pipe in '*fdp' and 0 in
  *     '*child_pid'.  The caller should pass the fd to fork_notify_startup()
  *     after it finishes its startup sequence.
  *
  * Returns 0 on success.  If the child terminates without calling
  * fork_notify_startup():
  *
  *   - If it exited with a nonzero exit code, this process exits with the
  *     same code.
  *
  *   - Otherwise the event is logged and -1 is returned; '*child_pid' is
  *     still set to the child's pid. */
 static int
 fork_and_wait_for_startup(int *fdp, pid_t *child_pid)
 {
     int fds[2];
     pid_t pid;
     int ret = 0;

     xpipe(fds);

     pid = fork_and_clean_up();
     if (pid > 0) {
         /* Running in parent process. */
         size_t bytes_read;
         char c;

         close(fds[1]);
         if (read_fully(fds[0], &c, 1, &bytes_read) != 0) {
             /* No notification byte arrived: find out what became of the
              * child. */
             int retval;
             int status;

             do {
                 retval = waitpid(pid, &status, 0);
             } while (retval == -1 && errno == EINTR);

             if (retval == pid) {
                 if (WIFEXITED(status) && WEXITSTATUS(status)) {
                     /* Child exited with an error.  Convey the same error
                      * to our parent process as a courtesy. */
                     exit(WEXITSTATUS(status));
                 } else {
                     char *status_msg = process_status_msg(status);
                     VLOG_ERR("fork child died before signaling startup (%s)",
                              status_msg);
                     /* The message is heap-allocated (other callers in this
                      * file free it), so release it here too. */
                     free(status_msg);
                     ret = -1;
                 }
             } else if (retval < 0) {
                 VLOG_FATAL("waitpid failed (%s)", ovs_strerror(errno));
             } else {
                 OVS_NOT_REACHED();
             }
         }
         *fdp = fds[0];
     } else if (!pid) {
         /* Running in child process. */
         close(fds[0]);
         *fdp = fds[1];
     }
     *child_pid = pid;
     return ret;
 }
 304
 /* Tells the process at the other end of 'fd' that startup has finished, by
  * writing a single null byte to it.  Does nothing when 'fd' is -1.  A failed
  * write is fatal. */
 static void
 fork_notify_startup(int fd)
 {
     size_t n_written;
     int err;

     if (fd == -1) {
         return;
     }

     err = write_fully(fd, "", 1, &n_written);
     if (err) {
         VLOG_FATAL("pipe write failed (%s)", ovs_strerror(err));
     }
 }
 318
 /* Decides whether a daemon that terminated with wait status 'status'
  * deserves a restart: true only if it was killed by one of the "error"
  * signals listed below, false for any other kind of termination. */
 static bool
 should_restart(int status)
 {
     /* This list of signals is documented in daemon.man.  If you
      * change the list, update the documentation too. */
     static const int error_signals[] = {
         SIGABRT, SIGALRM, SIGBUS, SIGFPE, SIGILL, SIGPIPE, SIGSEGV,
         SIGXCPU, SIGXFSZ
     };
     int sig;
     size_t k;

     if (!WIFSIGNALED(status)) {
         return false;
     }

     sig = WTERMSIG(status);
     for (k = 0; k < ARRAY_SIZE(error_signals); k++) {
         if (sig == error_signals[k]) {
             return true;
         }
     }
     return false;
 }
 340
 341 static void
 342 monitor_daemon(pid_t daemon_pid)
 343 {
 344     /* XXX Should log daemon's stderr output at startup time. */
 345     time_t last_restart;
 346     char *status_msg;
 347     int crashes;
 348     bool child_ready = true;
 349
 350     set_subprogram_name("monitor");
 351     status_msg = xstrdup("healthy");
 352     last_restart = TIME_MIN;
 353     crashes = 0;
 354     for (;;) {
 355         int retval;
 356         int status;
 357
 358         ovs_cmdl_proctitle_set("monitoring pid %lu (%s)",
 359                                (unsigned long int) daemon_pid, status_msg);
 360
 361         if (child_ready) {
 362             int error;
 363             do {
 364                 retval = waitpid(daemon_pid, &status, 0);
 365                 error = retval == -1 ? errno : 0;
 366             } while (error == EINTR);
 367             vlog_reopen_log_file();
 368             if (error) {
 369                 VLOG_FATAL("waitpid failed (%s)", ovs_strerror(error));
 370             }
 371         }
 372
 373         if (!child_ready || retval == daemon_pid) {
 374             char *s = process_status_msg(status);
 375             if (should_restart(status)) {
 376                 free(status_msg);
 377                 status_msg = xasprintf("%d crashes: pid %lu died, %s",
 378                                        ++crashes,
 379                                        (unsigned long int) daemon_pid, s);
 380                 free(s);
 381
 382                 if (WCOREDUMP(status)) {
 383                     /* Disable further core dumps to save disk space. */
 384                     struct rlimit r;
 385
 386                     r.rlim_cur = 0;
 387                     r.rlim_max = 0;
 388                     if (setrlimit(RLIMIT_CORE, &r) == -1) {
 389                         VLOG_WARN("failed to disable core dumps: %s",
 390                                   ovs_strerror(errno));
 391                     }
 392                 }
 393
 394                 log_received_backtrace(daemonize_fd);
 395
 396                 /* Throttle restarts to no more than once every 10 seconds. */
 397                 if (time(NULL) < last_restart + 10) {
 398                     VLOG_WARN("%s, waiting until 10 seconds since last "
 399                               "restart", status_msg);
 400                     for (;;) {
 401                         time_t now = time(NULL);
 402                         time_t wakeup = last_restart + 10;
 403                         if (now >= wakeup) {
 404                             break;
 405                         }
 406                         xsleep(wakeup - now);
 407                     }
 408                 }
 409                 last_restart = time(NULL);
 410
 411                 VLOG_ERR("%s, restarting", status_msg);
 412                 child_ready = !fork_and_wait_for_startup(&daemonize_fd,
 413                                                          &daemon_pid);
 414                 if (child_ready && !daemon_pid) {
 415                     /* Child process needs to break out of monitoring
 416                      * loop. */
 417                     break;
 418                 }
 419             } else {
 420                 VLOG_INFO("pid %lu died, %s, exiting",
 421                           (unsigned long int) daemon_pid, s);
 422                 free(s);
 423                 exit(0);
 424             }
 425         }
 426     }
 427     free(status_msg);
 428
 429     /* Running in new daemon process. */
 430     ovs_cmdl_proctitle_restore();
 431     set_subprogram_name(program_name);
 432 }
 433
 434 /* If daemonization is configured, then starts daemonization, by forking and
 435  * returning in the child process.  The parent process hangs around until the
 436  * child lets it know either that it completed startup successfully (by calling
 437  * daemonize_complete()) or that it failed to start up (by exiting with a
 438  * nonzero exit code). */
 439 void
 440 daemonize_start(bool access_datapath)
 441 {
 442     assert_single_threaded();
 443     daemonize_fd = -1;
 444
 445     if (switch_user) {
 446         daemon_become_new_user__(access_datapath);
 447         switch_user = false;
 448     }
 449
 450     if (detach) {
 451         pid_t pid;
 452
 453         if (fork_and_wait_for_startup(&daemonize_fd, &pid)) {
 454             VLOG_FATAL("could not detach from foreground session");
 455         }
 456         if (pid > 0) {
 457             /* Running in parent process. */
 458             exit(0);
 459         }
 460
 461         /* Running in daemon or monitor process. */
 462         setsid();
 463     }
 464
 465     if (monitor) {
 466         int saved_daemonize_fd = daemonize_fd;
 467         pid_t daemon_pid;
 468
 469         if (fork_and_wait_for_startup(&daemonize_fd, &daemon_pid)) {
 470             VLOG_FATAL("could not initiate process monitoring");
 471         }
 472         if (daemon_pid > 0) {
 473             /* Running in monitor process. */
 474             fork_notify_startup(saved_daemonize_fd);
 475             if (detach) {
 476                 close_standard_fds();
 477             }
 478             monitor_daemon(daemon_pid);
 479         }
 480         /* Running in daemon process. */
 481     }
 482
 483     forbid_forking("running in daemon process");
 484
 485     if (pidfile) {
 486         make_pidfile();
 487     }
 488
 489     /* Make sure that the unixctl commands for vlog get registered in a
 490      * daemon, even before the first log message. */
 491     vlog_init();
 492 }
 493
 494 /* If daemonization is configured, then this function notifies the parent
 495  * process that the child process has completed startup successfully.  It also
 496  * call daemonize_post_detach().
 497  *
 498  * Calling this function more than once has no additional effect. */
 499 void
 500 daemonize_complete(void)
 501 {
 502     if (pidfile) {
 503         free(pidfile);
 504         pidfile = NULL;
 505     }
 506
 507     if (!detached) {
 508         detached = true;
 509
 510         fork_notify_startup(daemonize_fd);
 511         daemonize_post_detach();
 512     }
 513 }
 514
 515 /* If daemonization is configured, then this function does traditional Unix
 516  * daemonization behavior: join a new session, chdir to the root (if not
 517  * disabled), and close the standard file descriptors.
 518  *
 519  * It only makes sense to call this function as part of an implementation of a
 520  * special daemon subprocess.  A normal daemon should just call
 521  * daemonize_complete(). */
 522 static void
 523 daemonize_post_detach(void)
 524 {
 525     if (detach) {
 526         if (chdir_) {
 527             ignore(chdir("/"));
 528         }
 529         close_standard_fds();
 530     }
 531 }
 532
 533 void
 534 daemon_usage(void)
 535 {
 536     printf(
 537         "\nDaemon options:\n"
 538         "  --detach                run in background as daemon\n"
 539         "  --monitor               creates a process to monitor this daemon\n"
 540         "  --user=username[:group] changes the effective daemon user:group\n"
 541         "  --no-chdir              do not chdir to '/'\n"
 542         "  --pidfile[=FILE]        create pidfile (default: %s/%s.pid)\n"
 543         "  --overwrite-pidfile     with --pidfile, start even if already "
 544                                    "running\n",
 545         ovs_rundir(), program_name);
 546 }
 547
 /* Issues fcntl() 'command' (F_SETLK or F_GETLK in this file) on the
  * descriptor behind 'file', using '*lck' as the lock description.
  *
  * '*lck' is first set up to describe a write lock on the whole file; after
  * F_GETLK the caller can inspect what the kernel stored in it.
  *
  * The call is retried for as long as it fails with EINTR.  Returns 0 on
  * success, otherwise the errno value of the failure. */
 static int
 lock_pidfile__(FILE *file, int command, struct flock *lck)
 {
     const int fd = fileno(file);

     lck->l_type = F_WRLCK;
     lck->l_whence = SEEK_SET;
     lck->l_start = 0;
     lck->l_len = 0;             /* Zero length: through end of file. */
     lck->l_pid = 0;

     for (;;) {
         if (fcntl(fd, command, lck) != -1) {
             return 0;
         }
         if (errno != EINTR) {
             return errno;
         }
     }
 }
 564
 /* Same as lock_pidfile__(), for callers that have no use for the resulting
  * lock description.  Returns 0 on success, otherwise a positive errno
  * value. */
 static int
 lock_pidfile(FILE *file, int command)
 {
     struct flock discarded;

     return lock_pidfile__(file, command, &discarded);
 }
 572
 573 static pid_t
 574 read_pidfile__(const char *pidfile_, bool delete_if_stale)
 575 {
 576     struct stat s, s2;
 577     struct flock lck;
 578     char line[128];
 579     FILE *file;
 580     int error;
 581
 582     if ((pidfile_ino || pidfile_dev)
 583         && !stat(pidfile_, &s)
 584         && s.st_ino == pidfile_ino && s.st_dev == pidfile_dev) {
 585         /* It's our own pidfile.  We can't afford to open it, because closing
 586          * *any* fd for a file that a process has locked also releases all the
 587          * locks on that file.
 588          *
 589          * Fortunately, we know the associated pid anyhow: */
 590         return getpid();
 591     }
 592
 593     file = fopen(pidfile_, "r+");
 594     if (!file) {
 595         if (errno == ENOENT && delete_if_stale) {
 596             return 0;
 597         }
 598         error = errno;
 599         VLOG_WARN("%s: open: %s", pidfile_, ovs_strerror(error));
 600         goto error;
 601     }
 602
 603     error = lock_pidfile__(file, F_GETLK, &lck);
 604     if (error) {
 605         VLOG_WARN("%s: fcntl: %s", pidfile_, ovs_strerror(error));
 606         goto error;
 607     }
 608     if (lck.l_type == F_UNLCK) {
 609         /* pidfile exists but it isn't locked by anyone.  We need to delete it
 610          * so that a new pidfile can go in its place.  But just calling
 611          * unlink(pidfile) makes a nasty race: what if someone else unlinks it
 612          * before we do and then replaces it by a valid pidfile?  We'd unlink
 613          * their valid pidfile.  We do a little dance to avoid the race, by
 614          * locking the invalid pidfile.  Only one process can have the invalid
 615          * pidfile locked, and only that process has the right to unlink it. */
 616         if (!delete_if_stale) {
 617             error = ESRCH;
 618             VLOG_DBG("%s: pid file is stale", pidfile_);
 619             goto error;
 620         }
 621
 622         /* Get the lock. */
 623         error = lock_pidfile(file, F_SETLK);
 624         if (error) {
 625             /* We lost a race with someone else doing the same thing. */
 626             VLOG_WARN("%s: lost race to lock pidfile", pidfile_);
 627             goto error;
 628         }
 629
 630         /* Is the file we have locked still named 'pidfile_'? */
 631         if (stat(pidfile_, &s) || fstat(fileno(file), &s2)
 632             || s.st_ino != s2.st_ino || s.st_dev != s2.st_dev) {
 633             /* No.  We lost a race with someone else who got the lock before
 634              * us, deleted the pidfile, and closed it (releasing the lock). */
 635             error = EALREADY;
 636             VLOG_WARN("%s: lost race to delete pidfile", pidfile_);
 637             goto error;
 638         }
 639
 640         /* We won the right to delete the stale pidfile. */
 641         if (unlink(pidfile_)) {
 642             error = errno;
 643             VLOG_WARN("%s: failed to delete stale pidfile (%s)",
 644                       pidfile_, ovs_strerror(error));
 645             goto error;
 646         }
 647         VLOG_DBG("%s: deleted stale pidfile", pidfile_);
 648         fclose(file);
 649         return 0;
 650     }
 651
 652     if (!fgets(line, sizeof line, file)) {
 653         if (ferror(file)) {
 654             error = errno;
 655             VLOG_WARN("%s: read: %s", pidfile_, ovs_strerror(error));
 656         } else {
 657             error = ESRCH;
 658             VLOG_WARN("%s: read: unexpected end of file", pidfile_);
 659         }
 660         goto error;
 661     }
 662
 663     if (lck.l_pid != strtoul(line, NULL, 10)) {
 664         /* The process that has the pidfile locked is not the process that
 665          * created it.  It must be stale, with the process that has it locked
 666          * preparing to delete it. */
 667         error = ESRCH;
 668         VLOG_WARN("%s: stale pidfile for pid %s being deleted by pid %ld",
 669                   pidfile_, line, (long int) lck.l_pid);
 670         goto error;
 671     }
 672
 673     fclose(file);
 674     return lck.l_pid;
 675
 676 error:
 677     if (file) {
 678         fclose(file);
 679     }
 680     return -error;
 681 }
 682
 /* Reads the pid stored in the pidfile named 'pidfile_'.  A stale pidfile is
  * reported as an error and is never deleted by this function.
  *
  * Returns the positive pid on success, otherwise a negative errno value. */
 pid_t
 read_pidfile(const char *pidfile_)
 {
     const bool delete_if_stale = false;

     return read_pidfile__(pidfile_, delete_if_stale);
 }
 690
 691 /* Checks whether a process with the given 'pidfile' is already running and,
 692  * if so, aborts.  If 'pidfile' is stale, deletes it. */
 693 static void
 694 check_already_running(void)
 695 {
 696     long int pid = read_pidfile__(pidfile, true);
 697     if (pid > 0) {
 698         VLOG_FATAL("%s: already running as pid %ld, aborting", pidfile, pid);
 699     } else if (pid < 0) {
 700         VLOG_FATAL("%s: pidfile check failed (%s), aborting",
 701                    pidfile, ovs_strerror(-pid));
 702     }
 703 }
 704
 705 \f
 706 /* stub functions for non-windows platform. */
 707
 708 void
 709 service_start(int *argc OVS_UNUSED, char **argv[] OVS_UNUSED)
 710 {
 711 }
 712
 /* Stub: there is nothing to stop on this platform. */
 void
 service_stop(void)
 {
     /* Intentionally empty. */
 }
 717
 /* Stub: a stop is never requested through this interface on this platform,
  * so the answer is always false. */
 bool
 should_service_stop(void)
 {
     const bool stop_requested = false;

     return stop_requested;
 }
 723
 724 \f
 /* Returns true if 'value' is acceptable where group id 'expected' is
  * wanted: either the two are equal, or 'expected' is the wildcard
  * (gid_t) -1, which matches any group id.
  *
  * The wildcard is written as (gid_t) -1 rather than a bare -1 so that the
  * comparison happens in gid_t and does not depend on how the integer
  * promotions treat gid_t on a given platform. */
 static bool
 gid_matches(gid_t expected, gid_t value)
 {
     return expected == (gid_t) -1 || expected == value;
 }
 730
 /* Returns true if both the real and the effective group id of this process
  * match 'gid_' in the sense of gid_matches(). */
 static bool
 gid_verify(gid_t gid_)
 {
     const gid_t real = getgid();
     const gid_t effective = getegid();

     if (!gid_matches(gid_, real)) {
         return false;
     }
     return gid_matches(gid_, effective);
 }
 741
 742 static void
 743 daemon_switch_group(gid_t gid_)
 744 {
 745     if ((setgid(gid_) == -1) || !gid_verify(gid_)) {
 746         VLOG_FATAL("%s: fail to switch group to gid as %d, aborting",
 747                    pidfile, gid_);
 748     }
 749 }
 750
 /* Returns true if 'value' is acceptable where user id 'expected' is
  * wanted: either the two are equal, or 'expected' is the wildcard
  * (uid_t) -1, which matches any user id.
  *
  * The wildcard is written as (uid_t) -1 rather than a bare -1 so that the
  * comparison happens in uid_t and does not depend on how the integer
  * promotions treat uid_t on a given platform. */
 static bool
 uid_matches(uid_t expected, uid_t value)
 {
     return expected == (uid_t) -1 || expected == value;
 }
 756
 /* Returns true if both the real and the effective user id of this process
  * match 'uid_' in the sense of uid_matches(). */
 static bool
 uid_verify(const uid_t uid_)
 {
     const uid_t real = getuid();
     const uid_t effective = geteuid();

     if (!uid_matches(uid_, real)) {
         return false;
     }
     return uid_matches(uid_, effective);
 }
 767
 768 static void
 769 daemon_switch_user(const uid_t uid_, const char *user_)
 770 {
 771     if ((setuid(uid_) == -1) || !uid_verify(uid_)) {
 772         VLOG_FATAL("%s: fail to switch user to %s, aborting",
 773                    pidfile, user_);
 774     }
 775 }
 776
 777 /* Use portable Unix APIs to switch uid:gid, when datapath
 778  * access is not required.  On Linux systems, all capabilities
 779  * will be dropped.  */
 780 static void
 781 daemon_become_new_user_unix(void)
 782 {
 783     /* "Setuid Demystified" by Hao Chen, etc outlines some caveats of
 784      * around unix system call setuid() and friends. This implementation
 785      * mostly follow the advice given by the paper.  The paper is
 786      * published in 2002, so things could have changed.  */
 787
 788     /* Change both real and effective uid and gid will permanently
 789      * drop the process' privilege.  "Setuid Demystified" suggested
 790      * that calling getuid() after each setuid() call to verify they
 791      * are actually set, because checking return code alone is not
 792      * sufficient.  */
 793     daemon_switch_group(gid);
 794     if (user && initgroups(user, gid) == -1) {
 795         VLOG_FATAL("%s: fail to add supplementary group gid %d, "
 796                    "aborting", pidfile, gid);
 797     }
 798     daemon_switch_user(uid, user);
 799 }
 800
 801 /* Linux specific implementation of daemon_become_new_user()
 802  * using libcap-ng.   */
 803 static void
 804 daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
 805 {
 806 #if defined __linux__ &&  HAVE_LIBCAPNG
 807     int ret;
 808
 809     ret = capng_get_caps_process();
 810
 811     if (!ret) {
 812         if (capng_have_capabilities(CAPNG_SELECT_CAPS) > CAPNG_NONE) {
 813             const capng_type_t cap_sets = CAPNG_EFFECTIVE|CAPNG_PERMITTED;
 814
 815             capng_clear(CAPNG_SELECT_BOTH);
 816
 817             ret = capng_update(CAPNG_ADD, cap_sets, CAP_IPC_LOCK)
 818                   || capng_update(CAPNG_ADD, cap_sets, CAP_NET_BIND_SERVICE);
 819
 820             if (access_datapath && !ret) {
 821                 ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN)
 822                       || capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW)
 823                       || capng_update(CAPNG_ADD, cap_sets, CAP_NET_BROADCAST);
 824             }
 825         } else {
 826             ret = -1;
 827         }
 828     }
 829
 830     if (!ret) {
 831         /* CAPNG_INIT_SUPP_GRP will be a better choice than
 832          * CAPNG_DROP_SUPP_GRP. However this enum value is only defined
 833          * with libcap-ng higher than version 0.7.4, which is not wildly
 834          * available on many Linux distributions yet. Taking a more
 835          * conservative approach to make sure OVS behaves consistently.
 836          *
 837          * XXX We may change this for future OVS releases.
 838          */
 839         ret = capng_change_id(uid, gid, CAPNG_DROP_SUPP_GRP
 840                               | CAPNG_CLEAR_BOUNDING);
 841     }
 842
 843     if (ret) {
 844         VLOG_FATAL("%s: libcap-ng fail to switch to user and group "
 845                    "%d:%d, aborting", pidfile, uid, gid);
 846     }
 847 #endif
 848 }
 849
 850 static void
 851 daemon_become_new_user__(bool access_datapath)
 852 {
 853     /* If vlog file has been created, change its owner to the non-root user
 854      * as specifed by the --user option.  */
 855     vlog_change_owner_unix(uid, gid);
 856
 857     if (LINUX) {
 858         if (LIBCAPNG) {
 859             daemon_become_new_user_linux(access_datapath);
 860         } else {
 861             VLOG_FATAL("%s: fail to downgrade user using libcap-ng. "
 862                        "(libcap-ng is not configured at compile time), "
 863                        "aborting.", pidfile);
 864         }
 865     } else {
 866         daemon_become_new_user_unix();
 867     }
 868 }
 869
 870 /* Noramlly, user switch is embedded within daemonize_start().
 871  * However, there in case the user switch needs to be done
 872  * before daemonize_start(), the following API can be used.  */
 873 void
 874 daemon_become_new_user(bool access_datapath)
 875 {
 876     assert_single_threaded();
 877     if (switch_user) {
 878         daemon_become_new_user__(access_datapath);
 879         /* daemonize_start() should not switch user again. */
 880         switch_user = false;
 881     }
 882 }
 883
 884 /* Return the maximun suggested buffer size for both getpwname_r()
 885  * and getgrnam_r().
 886  *
 887  * This size may still not be big enough. in case getpwname_r()
 888  * and friends return ERANGE, a larger buffer should be supplied to
 889  * retry. (The man page did not specify the max size to stop at, we
 890  * will keep trying with doubling the buffer size for each round until
 891  * the size wrapps around size_t.  */
 892 static size_t
 893 get_sysconf_buffer_size(void)
 894 {
 895     size_t bufsize, pwd_bs = 0, grp_bs = 0;
 896     const size_t default_bufsize = 1024;
 897
 898     errno = 0;
 899     if ((pwd_bs = sysconf(_SC_GETPW_R_SIZE_MAX)) == -1) {
 900         if (errno) {
 901             VLOG_FATAL("%s: Read initial passwordd struct size "
 902                        "failed (%s), aborting. ", pidfile,
 903                        ovs_strerror(errno));
 904         }
 905     }
 906
 907     if ((grp_bs = sysconf(_SC_GETGR_R_SIZE_MAX)) == -1) {
 908         if (errno) {
 909             VLOG_FATAL("%s: Read initial group struct size "
 910                        "failed (%s), aborting. ", pidfile,
 911                        ovs_strerror(errno));
 912         }
 913     }
 914
 915     bufsize = MAX(pwd_bs, grp_bs);
 916     return bufsize ? bufsize : default_bufsize;
 917 }
 918
 /* Attempts to double the size of the buffer '*buf', whose current size is
  * '*sizep'.
  *
  * On success, reallocates '*buf', stores the new size in '*sizep', and
  * returns true.  If doubling does not yield a larger value (the size is 0,
  * or the multiplication wrapped around), leaves both untouched and returns
  * false. */
 static bool
 enlarge_buffer(char **buf, size_t *sizep)
 {
     const size_t old_size = *sizep;
     const size_t doubled = old_size * 2;

     if (doubled <= old_size) {
         return false;
     }

     *buf = xrealloc(*buf, doubled);
     *sizep = doubled;
     return true;
 }
 935
 936 /* Parse and sanity check user_spec.
 937  *
 938  * If successful, set global variables 'uid' and 'gid'
 939  * with the parsed results. Global variable 'user'
 940  * will be pointing to a string that stores the name
 941  * of the user to be switched into.
 942  *
 943  * Also set 'switch_to_new_user' to true, The actual
 944  * user switching is done as soon as daemonize_start()
 945  * is called. I/O access before calling daemonize_start()
 946  * will still be with root's credential.  */
 947 void
 948 daemon_set_new_user(const char *user_spec)
 949 {
 950     char *pos = strchr(user_spec, ':');
 951     size_t init_bufsize, bufsize;
 952
 953     init_bufsize = get_sysconf_buffer_size();
 954     uid = getuid();
 955     gid = getgid();
 956
 957     if (geteuid() || uid) {
 958         VLOG_FATAL("%s: only root can use --user option", pidfile);
 959     }
 960
 961     user_spec += strspn(user_spec, " \t\r\n");
 962     size_t len = pos ? pos - user_spec : strlen(user_spec);
 963     char *buf;
 964     struct passwd pwd, *res;
 965     int e;
 966
 967     bufsize = init_bufsize;
 968     buf = xmalloc(bufsize);
 969     if (len) {
 970         user = xmemdup0(user_spec, len);
 971
 972         while ((e = getpwnam_r(user, &pwd, buf, bufsize, &res)) == ERANGE) {
 973             if (!enlarge_buffer(&buf, &bufsize)) {
 974                 break;
 975             }
 976         }
 977
 978         if (e != 0) {
 979             VLOG_FATAL("%s: Failed to retrive user %s's uid (%s), aborting.",
 980                        pidfile, user, ovs_strerror(e));
 981         }
 982         if (res == NULL) {
 983             VLOG_FATAL("%s: user %s not found, aborting.", pidfile, user);
 984         }
 985     } else {
 986         /* User name is not specified, use current user.  */
 987         while ((e = getpwuid_r(uid, &pwd, buf, bufsize, &res)) == ERANGE) {
 988             if (!enlarge_buffer(&buf, &bufsize)) {
 989                 break;
 990             }
 991         }
 992
 993         if (e != 0) {
 994             VLOG_FATAL("%s: Failed to retrive current user's name "
 995                        "(%s), aborting.", pidfile, ovs_strerror(e));
 996         }
 997         user = xstrdup(pwd.pw_name);
 998     }
 999
1000     uid = pwd.pw_uid;
1001     gid = pwd.pw_gid;
1002     free(buf);
1003
1004     if (pos) {
1005         char *grpstr = pos + 1;
1006         grpstr += strspn(grpstr, " \t\r\n");
1007
1008         if (*grpstr) {
1009             struct group grp, *gres;
1010
1011             bufsize = init_bufsize;
1012             buf = xmalloc(bufsize);
1013             while ((e = getgrnam_r(grpstr, &grp, buf, bufsize, &gres))
1014                          == ERANGE) {
1015                 if (!enlarge_buffer(&buf, &bufsize)) {
1016                     break;
1017                 }
1018             }
1019
1020             if (e) {
1021                 VLOG_FATAL("%s: Failed to get group entry for %s, "
1022                            "(%s), aborting.", pidfile, grpstr,
1023                            ovs_strerror(e));
1024             }
1025             if (gres == NULL) {
1026                 VLOG_FATAL("%s: group %s not found, aborting.", pidfile,
1027                            grpstr);
1028             }
1029
1030             if (gid != grp.gr_gid) {
1031                 char **mem;
1032
1033                 for (mem = grp.gr_mem; *mem; ++mem) {
1034                     if (!strcmp(*mem, user)) {
1035                         break;
1036                     }
1037                 }
1038
1039                 if (!*mem) {
1040                     VLOG_FATAL("%s: Invalid --user option %s (user %s is "
1041                                "not in group %s), aborting.", pidfile,
1042                                user_spec, user, grpstr);
1043                 }
1044                 gid = grp.gr_gid;
1045             }
1046             free(buf);
1047         }
1048     }
1049
1050     switch_user = true;
1051 }