]> git.proxmox.com Git - systemd.git/commitdiff
New upstream version 248.2
authorMichael Biebl <biebl@debian.org>
Sun, 9 May 2021 19:13:46 +0000 (21:13 +0200)
committerMichael Biebl <biebl@debian.org>
Sun, 9 May 2021 19:13:46 +0000 (21:13 +0200)
15 files changed:
man/oomd.conf.xml
src/basic/mountpoint-util.c
src/core/execute.c
src/modules-load/modules-load.c
src/nss-systemd/nss-systemd.c
src/nss-systemd/userdb-glue.c
src/oom/oomd-manager.c
src/oom/oomd-manager.h
src/oom/oomd-util.c
src/oom/oomd-util.h
src/oom/oomd.c
src/oom/test-oomd-util.c
src/shared/mount-util.c
src/shared/userdb.c
src/test/test-mount-util.c

index 6156c98fbd90dc04976a888deef5bf1fbc68d6c9..27914e06999137a04cd1bd0ece21b70e878be42b 100644 (file)
@@ -52,9 +52,9 @@
 
         <listitem><para>Sets the limit for swap usage on the system before <command>systemd-oomd</command>
         will take action. If the fraction of swap used on the system is more than what is defined here,
-        <command>systemd-oomd</command> will act on eligible descendant control groups, starting from the
-        ones with the highest swap usage to the lowest swap usage. Which control groups are monitored and
-        what action gets taken depends on what the unit has configured for
+        <command>systemd-oomd</command> will act on eligible descendant control groups with swap usage greater
+        than 5% of total swap, starting from the ones with the highest swap usage. Which
+        control groups are monitored and what action gets taken depends on what the unit has configured for
         <varname>ManagedOOMSwap=</varname>.  Takes a value specified in percent (when suffixed with "%"),
         permille ("‰") or permyriad ("‱"), between 0% and 100%, inclusive. Defaults to 90%.</para></listitem>
       </varlistentry>
@@ -81,7 +81,7 @@
         <listitem><para>Sets the amount of time a unit's control group needs to have exceeded memory pressure
         limits before <command>systemd-oomd</command> will take action. Memory pressure limits are defined by
         <varname>DefaultMemoryPressureLimit=</varname> and <varname>ManagedOOMMemoryPressureLimit=</varname>.
-        Defaults to 30 seconds when this property is unset or set to 0.</para></listitem>
+        Must be set to 0, or at least 1 second. Defaults to 30 seconds when unset or 0.</para></listitem>
       </varlistentry>
 
     </variablelist>
index c8f6675eb8676b88855c70115f86ba52ffffcc0d..1d617e87b276a97946230feb350e157bf5163ea4 100644 (file)
@@ -196,13 +196,15 @@ int fd_is_mount_point(int fd, const char *filename, int flags) {
 
         if (statx(fd, filename, (FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : AT_SYMLINK_NOFOLLOW) |
                                 (flags & AT_EMPTY_PATH) |
-                                AT_NO_AUTOMOUNT, 0, &sx) < 0) {
+                                AT_NO_AUTOMOUNT, STATX_TYPE, &sx) < 0) {
                 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                         return -errno;
 
                 /* If statx() is not available or forbidden, fall back to name_to_handle_at() below */
         } else if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) /* yay! */
                 return FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT);
+        else if (FLAGS_SET(sx.stx_mask, STATX_TYPE) && S_ISLNK(sx.stx_mode))
+                return false; /* symlinks are never mount points */
 
         r = name_to_handle_at_loop(fd, filename, &h, &mount_id, flags);
         if (IN_SET(r, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL))
@@ -231,16 +233,13 @@ int fd_is_mount_point(int fd, const char *filename, int flags) {
         } else if (r < 0)
                 return r;
 
-        /* The parent can do name_to_handle_at() but the
-         * directory we are interested in can't? If so, it
-         * must be a mount point. */
+        /* The parent can do name_to_handle_at() but the directory we are interested in can't? If so, it must
+         * be a mount point. */
         if (nosupp)
                 return 1;
 
-        /* If the file handle for the directory we are
-         * interested in and its parent are identical, we
-         * assume this is the root directory, which is a mount
-         * point. */
+        /* If the file handle for the directory we are interested in and its parent are identical, we assume
+         * this is the root directory, which is a mount point. */
 
         if (h->handle_bytes == h_parent->handle_bytes &&
             h->handle_type == h_parent->handle_type &&
@@ -263,23 +262,22 @@ fallback_fdinfo:
         if (mount_id != mount_id_parent)
                 return 1;
 
-        /* Hmm, so, the mount ids are the same. This leaves one
-         * special case though for the root file system. For that,
-         * let's see if the parent directory has the same inode as we
-         * are interested in. Hence, let's also do fstat() checks now,
-         * too, but avoid the st_dev comparisons, since they aren't
-         * that useful on unionfs mounts. */
+        /* Hmm, so, the mount ids are the same. This leaves one special case though for the root file
+         * system. For that, let's see if the parent directory has the same inode as we are interested
+         * in. Hence, let's also do fstat() checks now, too, but avoid the st_dev comparisons, since they
+         * aren't that useful on unionfs mounts. */
         check_st_dev = false;
 
 fallback_fstat:
-        /* yay for fstatat() taking a different set of flags than the other
-         * _at() above */
+        /* yay for fstatat() taking a different set of flags than the other _at() above */
         if (flags & AT_SYMLINK_FOLLOW)
                 flags &= ~AT_SYMLINK_FOLLOW;
         else
                 flags |= AT_SYMLINK_NOFOLLOW;
         if (fstatat(fd, filename, &a, flags) < 0)
                 return -errno;
+        if (S_ISLNK(a.st_mode)) /* Symlinks are never mount points */
+                return false;
 
         if (fstatat(fd, "", &b, AT_EMPTY_PATH) < 0)
                 return -errno;
index d5954cac07ac6c1e330a788e713b73cb7a5cf5ab..922913e7afbc80ab43e664071508cc29657fe0f6 100644 (file)
@@ -5222,7 +5222,7 @@ static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv
         assert(name);
 
         if (!strv_isempty(strv)) {
-                fprintf(f, "%s%s:", name, prefix);
+                fprintf(f, "%s%s:", prefix, name);
                 strv_fprintf(f, strv);
                 fputs("\n", f);
         }
index 9cfd292011602d00427d83b5308b3b8e6a415a3a..dafd62ca636f7fee834ff553775325baaa66927a 100644 (file)
@@ -185,9 +185,7 @@ static int run(int argc, char *argv[]) {
         r = 0;
 
         if (argc > optind) {
-                int i;
-
-                for (i = optind; i < argc; i++) {
+                for (int i = optind; i < argc; i++) {
                         k = apply_file(ctx, argv[i], false);
                         if (k < 0 && r == 0)
                                 r = k;
index 0b716d22dddaa400fd639bd4fd7b1d30fd8d2e24..84f94f500f33750c34d1bf2a4cf68e60aebf83fb 100644 (file)
@@ -348,7 +348,7 @@ enum nss_status _nss_systemd_setgrent(int stayopen) {
         _l = pthread_mutex_lock_assert(&getgrent_data.mutex);
 
         getgrent_data.iterator = userdb_iterator_free(getgrent_data.iterator);
-        getpwent_data.by_membership = false;
+        getgrent_data.by_membership = false;
 
         /* See _nss_systemd_setpwent() for an explanation why we use USERDB_DONT_SYNTHESIZE here */
         r = groupdb_all(nss_glue_userdb_flags() | USERDB_DONT_SYNTHESIZE, &getgrent_data.iterator);
@@ -441,7 +441,7 @@ enum nss_status _nss_systemd_getgrent_r(
                         getgrent_data.iterator = userdb_iterator_free(getgrent_data.iterator);
 
                         r = membershipdb_all(nss_glue_userdb_flags(), &getgrent_data.iterator);
-                        if (r < 0) {
+                        if (r < 0 && r != -ESRCH) {
                                 UNPROTECT_ERRNO;
                                 *errnop = -r;
                                 return NSS_STATUS_UNAVAIL;
@@ -454,7 +454,7 @@ enum nss_status _nss_systemd_getgrent_r(
                         return NSS_STATUS_UNAVAIL;
                 } else if (!STR_IN_SET(gr->group_name, root_group.gr_name, nobody_group.gr_name)) {
                         r = membershipdb_by_group_strv(gr->group_name, nss_glue_userdb_flags(), &members);
-                        if (r < 0) {
+                        if (r < 0 && r != -ESRCH) {
                                 UNPROTECT_ERRNO;
                                 *errnop = -r;
                                 return NSS_STATUS_UNAVAIL;
@@ -465,6 +465,9 @@ enum nss_status _nss_systemd_getgrent_r(
         if (getgrent_data.by_membership) {
                 _cleanup_(_nss_systemd_unblockp) bool blocked = false;
 
+                if (!getgrent_data.iterator)
+                        return NSS_STATUS_NOTFOUND;
+
                 for (;;) {
                         _cleanup_free_ char *user_name = NULL, *group_name = NULL;
 
index 22af0fde60176e50f524bb7c2128c21db0bae373..8ad7ef608eb99b150490eb19092a7eba82c769a2 100644 (file)
@@ -215,7 +215,7 @@ enum nss_status userdb_getgrnam(
         }
 
         r = membershipdb_by_group_strv(name, nss_glue_userdb_flags(), &members);
-        if (r < 0) {
+        if (r < 0 && r != -ESRCH) {
                 *errnop = -r;
                 return NSS_STATUS_UNAVAIL;
         }
@@ -308,7 +308,7 @@ enum nss_status userdb_getgrgid(
                 from_nss = false;
 
         r = membershipdb_by_group_strv(g->group_name, nss_glue_userdb_flags(), &members);
-        if (r < 0) {
+        if (r < 0 && r != -ESRCH) {
                 *errnop = -r;
                 return NSS_STATUS_UNAVAIL;
         }
index c3e84aadde98183fae6c6b5a9d3658807f25be2b..49dc5eb26e84480a2ded8aff77fb8638c8d625c9 100644 (file)
@@ -299,8 +299,7 @@ static int acquire_managed_oom_connect(Manager *m) {
         return 0;
 }
 
-static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
-        _cleanup_set_free_ Set *targets = NULL;
+static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
         Manager *m = userdata;
         usec_t usec_now;
         int r;
@@ -313,7 +312,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
         if (r < 0)
                 return log_error_errno(r, "Failed to reset event timer: %m");
 
-        r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
+        r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
         if (r < 0)
                 return log_error_errno(r, "Failed to set relative time for timer: %m");
 
@@ -324,97 +323,29 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
                         return log_error_errno(r, "Failed to acquire varlink connection: %m");
         }
 
-        /* Update the cgroups used for detection/action */
-        r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
-        if (r == -ENOMEM)
-                return log_oom();
-        if (r < 0)
-                log_debug_errno(r, "Failed to update monitored swap cgroup contexts, ignoring: %m");
-
-        r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
-        if (r == -ENOMEM)
-                return log_oom();
-        if (r < 0)
-                log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");
-
-        r = update_monitored_cgroup_contexts_candidates(
-                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
-        if (r == -ENOMEM)
-                return log_oom();
-        if (r < 0)
-                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
-
+        /* We still try to acquire swap information for oomctl even if no units want swap monitoring */
         r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
-        /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
+        /* If there are no units depending on swap actions, the only error we exit on is ENOMEM.
          * Allow ENOENT in the event that swap is disabled on the system. */
-        if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
-                return log_error_errno(r, "Failed to acquire system context: %m");
-        else if (r == -ENOENT)
+        if (r == -ENOENT) {
                 zero(m->system_context);
+                return 0;
+        } else if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
+                return log_error_errno(r, "Failed to acquire system context: %m");
 
-        if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
-                m->last_reclaim_at = usec_now;
+        /* Return early if nothing is requesting swap monitoring */
+        if (hashmap_isempty(m->monitored_swap_cgroup_contexts))
+                return 0;
 
-        /* If we're still recovering from a kill, don't try to kill again yet */
-        if (m->post_action_delay_start > 0) {
-                if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
-                        return 0;
-                else
-                        m->post_action_delay_start = 0;
-        }
-
-        r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
-        if (r == -ENOMEM)
-                return log_oom();
-        if (r < 0)
-                log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
-        else if (r == 1) {
-                /* Check if there was reclaim activity in the given interval. The concern is the following case:
-                 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
-                 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
-                 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
-                 * to kill something (it won't help anyways). */
-                if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
-                        OomdCGroupContext *t;
-
-                        SET_FOREACH(t, targets) {
-                                _cleanup_free_ char *selected = NULL;
-                                char ts[FORMAT_TIMESPAN_MAX];
-
-                                log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
-                                          t->path,
-                                          LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
-                                          LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
-                                          format_timespan(ts, sizeof ts,
-                                                          m->default_mem_pressure_duration_usec,
-                                                          USEC_PER_SEC));
-
-                                r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
-                                if (r == -ENOMEM)
-                                        return log_oom();
-                                if (r < 0)
-                                        log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
-                                else {
-                                        /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
-                                        m->post_action_delay_start = usec_now;
-                                        if (selected)
-                                                log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
-                                                           " for > %s with reclaim activity",
-                                                           selected, t->path,
-                                                           LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
-                                                           LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
-                                                           format_timespan(ts, sizeof ts,
-                                                                           m->default_mem_pressure_duration_usec,
-                                                                           USEC_PER_SEC));
-                                        return 0;
-                                }
-                        }
-                }
-        }
+        /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
+         * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
+         * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent
+         * nodes are the ones that matter). */
 
         if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
                 _cleanup_hashmap_free_ Hashmap *candidates = NULL;
                 _cleanup_free_ char *selected = NULL;
+                uint64_t threshold;
 
                 log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
                           m->system_context.swap_used, m->system_context.swap_total,
@@ -426,13 +357,13 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
                 if (r < 0)
                         log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
 
-                r = oomd_kill_by_swap_usage(candidates, m->dry_run, &selected);
+                threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
+                r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
                 if (r == -ENOMEM)
                         return log_oom();
                 if (r < 0)
                         log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
                 else {
-                        m->post_action_delay_start = usec_now;
                         if (selected)
                                 log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
                                            PERMYRIAD_AS_PERCENT_FORMAT_STR,
@@ -445,14 +376,183 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
         return 0;
 }
 
-static int monitor_cgroup_contexts(Manager *m) {
+static void clear_candidate_hashmapp(Manager **m) {
+        if (*m)
+                hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates);
+}
+
+static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
+        /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we
+         * update the candidate data (in which case clear_candidates will be NULL). */
+        _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata;
+        _cleanup_set_free_ Set *targets = NULL;
+        bool in_post_action_delay = false;
+        Manager *m = userdata;
+        usec_t usec_now;
+        int r;
+
+        assert(s);
+        assert(userdata);
+
+        /* Reset timer */
+        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
+        if (r < 0)
+                return log_error_errno(r, "Failed to reset event timer: %m");
+
+        r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set relative time for timer: %m");
+
+        /* Reconnect if our connection dropped */
+        if (!m->varlink) {
+                r = acquire_managed_oom_connect(m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
+        }
+
+        /* Return early if nothing is requesting memory pressure monitoring */
+        if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts))
+                return 0;
+
+        /* Update the cgroups used for detection/action */
+        r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0)
+                log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");
+
+        r = update_monitored_cgroup_contexts_candidates(
+                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0)
+                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
+
+        /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
+         * values and go on a kill storm. */
+        if (m->mem_pressure_post_action_delay_start > 0) {
+                if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
+                        in_post_action_delay = true;
+                else
+                        m->mem_pressure_post_action_delay_start = 0;
+        }
+
+        r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0)
+                log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
+        else if (r == 1 && !in_post_action_delay) {
+                OomdCGroupContext *t;
+                SET_FOREACH(t, targets) {
+                        _cleanup_free_ char *selected = NULL;
+                        char ts[FORMAT_TIMESPAN_MAX];
+
+                        /* Check if there was reclaim activity in the given interval. The concern is the following case:
+                         * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
+                         * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
+                         * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
+                         * to kill something (it won't help anyways). */
+                        if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
+                                continue;
+
+                        log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
+                                  t->path,
+                                  LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
+                                  LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
+                                  format_timespan(ts, sizeof ts,
+                                                  m->default_mem_pressure_duration_usec,
+                                                  USEC_PER_SEC));
+
+                        r = update_monitored_cgroup_contexts_candidates(
+                                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
+                        else
+                                clear_candidates = NULL;
+
+                        r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
+                        else {
+                                /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
+                                m->mem_pressure_post_action_delay_start = usec_now;
+                                if (selected)
+                                        log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
+                                                   " for > %s with reclaim activity",
+                                                   selected, t->path,
+                                                   LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
+                                                   LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
+                                                   format_timespan(ts, sizeof ts,
+                                                                   m->default_mem_pressure_duration_usec,
+                                                                   USEC_PER_SEC));
+                                return 0;
+                        }
+                }
+        } else {
+                /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every
+                 * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill
+                 * might happen.
+                 * Candidate cgroup data will continue to get updated during the post-action delay period in case
+                 * pressure continues to be high after a kill. */
+                OomdCGroupContext *c;
+                HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) {
+                        if (c->mem_pressure_limit_hit_start == 0)
+                                continue;
+
+                        r = update_monitored_cgroup_contexts_candidates(
+                                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
+                        else {
+                                clear_candidates = NULL;
+                                break;
+                        }
+                }
+        }
+
+        return 0;
+}
+
+static int monitor_swap_contexts(Manager *m) {
+        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+        int r;
+
+        assert(m);
+        assert(m->event);
+
+        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_exit_on_failure(s, true);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_enabled(s, SD_EVENT_ON);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(s, "oomd-swap-timer");
+
+        m->swap_context_event_source = TAKE_PTR(s);
+        return 0;
+}
+
+static int monitor_memory_pressure_contexts(Manager *m) {
         _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
         int r;
 
         assert(m);
         assert(m->event);
 
-        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
+        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
         if (r < 0)
                 return r;
 
@@ -464,9 +564,9 @@ static int monitor_cgroup_contexts(Manager *m) {
         if (r < 0)
                 return r;
 
-        (void) sd_event_source_set_description(s, "oomd-timer");
+        (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");
 
-        m->cgroup_context_event_source = TAKE_PTR(s);
+        m->mem_pressure_context_event_source = TAKE_PTR(s);
         return 0;
 }
 
@@ -474,7 +574,8 @@ Manager* manager_free(Manager *m) {
         assert(m);
 
         varlink_close_unref(m->varlink);
-        sd_event_source_unref(m->cgroup_context_event_source);
+        sd_event_source_unref(m->swap_context_event_source);
+        sd_event_source_unref(m->mem_pressure_context_event_source);
         sd_event_unref(m->event);
 
         bus_verify_polkit_async_registry_free(m->polkit_registry);
@@ -596,7 +697,11 @@ int manager_start(
         if (r < 0)
                 return r;
 
-        r = monitor_cgroup_contexts(m);
+        r = monitor_memory_pressure_contexts(m);
+        if (r < 0)
+                return r;
+
+        r = monitor_swap_contexts(m);
         if (r < 0)
                 return r;
 
index 9c580c8a249c5bd7f1841077ed69d9633fb58a02..dc170f2bdaac0dc4f94fbee49570c0596ce374cc 100644 (file)
@@ -7,10 +7,9 @@
 #include "varlink.h"
 
 /* Polling interval for monitoring stats */
-#define INTERVAL_USEC (1 * USEC_PER_SEC)
-
-/* Used to weight the averages */
-#define AVERAGE_SIZE_DECAY 4
+#define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */
+/* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */
+#define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC)
 
 /* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the
  * percentage of time all tasks were delayed (i.e. unproductive).
@@ -20,6 +19,9 @@
 #define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60
 #define DEFAULT_SWAP_USED_LIMIT_PERCENT 90
 
+/* Only tackle candidates with large swap usage. */
+#define THRESHOLD_SWAP_USED_PERCENT 5
+
 #define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC)
 #define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC)
 
@@ -44,10 +46,10 @@ struct Manager {
 
         OomdSystemContext system_context;
 
-        usec_t last_reclaim_at;
-        usec_t post_action_delay_start;
+        usec_t mem_pressure_post_action_delay_start;
 
-        sd_event_source *cgroup_context_event_source;
+        sd_event_source *swap_context_event_source;
+        sd_event_source *mem_pressure_context_event_source;
 
         Varlink *varlink;
 };
index 894d23a83a4a19746efec90d6de105fc622ed9bc..5bf81479c9f44b61acda229b2b1d279d55800151 100644 (file)
@@ -82,17 +82,17 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
                 if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) {
                         usec_t diff;
 
-                        if (ctx->last_hit_mem_pressure_limit == 0)
-                                ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC);
+                        if (ctx->mem_pressure_limit_hit_start == 0)
+                                ctx->mem_pressure_limit_hit_start = now(CLOCK_MONOTONIC);
 
-                        diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit;
+                        diff = now(CLOCK_MONOTONIC) - ctx->mem_pressure_limit_hit_start;
                         if (diff >= duration) {
                                 r = set_put(targets, ctx);
                                 if (r < 0)
                                         return -ENOMEM;
                         }
                 } else
-                        ctx->last_hit_mem_pressure_limit = 0;
+                        ctx->mem_pressure_limit_hit_start = 0;
         }
 
         if (!set_isempty(targets)) {
@@ -104,34 +104,21 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
         return 0;
 }
 
-bool oomd_memory_reclaim(Hashmap *h) {
-        uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0;
-        OomdCGroupContext *ctx;
-
-        assert(h);
-
-        /* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values,
-         * there was reclaim activity. Used along with pressure checks to decide whether to take action. */
+uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) {
+        uint64_t last_pgscan;
 
-        HASHMAP_FOREACH(ctx, h) {
-                uint64_t sum;
+        assert(c);
 
-                sum = pgscan + ctx->pgscan;
-                if (sum < pgscan || sum < ctx->pgscan)
-                        pgscan_of++; /* count overflows */
-                pgscan = sum;
-
-                sum = last_pgscan + ctx->last_pgscan;
-                if (sum < last_pgscan || sum < ctx->last_pgscan)
-                        last_pgscan_of++; /* count overflows */
-                last_pgscan = sum;
+        /* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero.
+         * pgscan is monotonic and in practice should not decrease (except in the recreation case). */
+        last_pgscan = c->last_pgscan;
+        if (c->last_pgscan > c->pgscan) {
+                log_debug("Last pgscan %"PRIu64" greater than current pgscan %"PRIu64" for %s. Using last pgscan of zero.",
+                                c->last_pgscan, c->pgscan, c->path);
+                last_pgscan = 0;
         }
 
-        /* overflow counts are the same, return sums comparison */
-        if (last_pgscan_of == pgscan_of)
-                return pgscan > last_pgscan;
-
-        return pgscan_of > last_pgscan_of;
+        return c->pgscan - last_pgscan;
 }
 
 bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) {
@@ -246,7 +233,7 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char
         return ret;
 }
 
-int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) {
+int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) {
         _cleanup_free_ OomdCGroupContext **sorted = NULL;
         int n, r, ret = 0;
 
@@ -257,12 +244,12 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) {
         if (n < 0)
                 return n;
 
-        /* Try to kill cgroups with non-zero swap usage until we either succeed in
-         * killing or we get to a cgroup with no swap usage. */
+        /* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with
+         * no swap usage. Threshold killing only cgroups with more than threshold swap usage. */
         for (int i = 0; i < n; i++) {
-                /* Skip over cgroups with no resource usage.
-                 * Continue break since there might be "avoid" cgroups at the end. */
-                if (sorted[i]->swap_usage == 0)
+                /* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid"
+                 * cgroups at the end. */
+                if (sorted[i]->swap_usage <= threshold_usage)
                         continue;
 
                 r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
@@ -430,9 +417,13 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path)
         if (old_ctx) {
                 curr_ctx->last_pgscan = old_ctx->pgscan;
                 curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
-                curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+                curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
+                curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
         }
 
+        if (oomd_pgscan_rate(curr_ctx) > 0)
+                curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
+
         r = hashmap_put(new_h, curr_ctx->path, curr_ctx);
         if (r < 0)
                 return r;
@@ -456,7 +447,11 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_
 
                 ctx->last_pgscan = old_ctx->pgscan;
                 ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
-                ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+                ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
+                ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
+
+                if (oomd_pgscan_rate(ctx) > 0)
+                        ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
         }
 }
 
index 51423130d1b8c8c964d3b608f04fa0841e02148c..81fdc5e088c03b79a332b4ddcb15d1bf4b3a8bc5 100644 (file)
@@ -32,10 +32,10 @@ struct OomdCGroupContext {
 
         ManagedOOMPreference preference;
 
-        /* These are only used by oomd_pressure_above for acting on high memory pressure. */
+        /* These are only used for acting on high memory pressure. */
         loadavg_t mem_pressure_limit;
-        usec_t mem_pressure_duration_usec;
-        usec_t last_hit_mem_pressure_limit;
+        usec_t mem_pressure_limit_hit_start;
+        usec_t last_had_mem_reclaim;
 };
 
 struct OomdSystemContext {
@@ -51,23 +51,22 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
 
 /* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret`
  * if any of them have exceeded their supplied memory pressure limits for the `duration` length of time.
- * `last_hit_mem_pressure_limit` is updated accordingly for each entry when the limit is exceeded, and when it returns
+ * `mem_pressure_limit_hit_start` is updated accordingly for the first time the limit is exceeded, and when it returns
  * below the limit.
  * Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `duration`.
  * Returns -ENOMEM for allocation errors. */
 int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
 
-/* Sum up current OomdCGroupContexts' pgscan values and last interval's pgscan values in `h`. Returns true if the
- * current sum is higher than the last interval's sum (there was some reclaim activity). */
-bool oomd_memory_reclaim(Hashmap *h);
-
 /* Returns true if the amount of swap free is below the permyriad of swap specified by `threshold_permyriad`. */
 bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad);
 
+/* Returns pgscan - last_pgscan, accounting for corner cases. */
+uint64_t oomd_pgscan_rate(const OomdCGroupContext *c);
+
 /* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
  * (after the smallest values). */
 static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
-        uint64_t last1, last2;
+        uint64_t diff1, diff2;
         int r;
 
         assert(c1);
@@ -77,22 +76,9 @@ static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const
         if (r != 0)
                 return r;
 
-        /* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero. */
-        last2 = (*c2)->last_pgscan;
-        if ((*c2)->last_pgscan > (*c2)->pgscan) {
-                log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.",
-                                (*c2)->last_pgscan, (*c2)->pgscan, (*c2)->path);
-                last2 = 0;
-        }
-
-        last1 = (*c1)->last_pgscan;
-        if ((*c1)->last_pgscan > (*c1)->pgscan) {
-                log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.",
-                                (*c1)->last_pgscan, (*c1)->pgscan, (*c1)->path);
-                last1 = 0;
-        }
-
-        r = CMP((*c2)->pgscan - last2, (*c1)->pgscan - last1);
+        diff1 = oomd_pgscan_rate(*c1);
+        diff2 = oomd_pgscan_rate(*c2);
+        r = CMP(diff2, diff1);
         if (r != 0)
                 return r;
 
@@ -125,7 +111,7 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run);
  * everything in `h` is a candidate.
  * Returns the killed cgroup in ret_selected. */
 int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected);
-int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected);
+int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected);
 
 int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
 int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret);
index 6e2a5889d1e276bd8a8be810f6b5bfd811cdc5bb..deb7b094d504b49858f78f57a1ff9bf5361ac859 100644 (file)
@@ -155,6 +155,9 @@ static int run(int argc, char *argv[]) {
 
         assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
 
+        if (arg_mem_pressure_usec > 0 && arg_mem_pressure_usec < 1 * USEC_PER_SEC)
+                log_error_errno(SYNTHETIC_ERRNO(EINVAL), "DefaultMemoryPressureDurationSec= must be 0 or at least 1s");
+
         r = manager_new(&m);
         if (r < 0)
                 return log_error_errno(r, "Failed to create manager: %m");
index 278fc305b3e518fee696c0ae7c7aa8c9d4d87a6e..7ebea29c1a6b3e3b2e044b27d9321b3ab354b080 100644 (file)
@@ -160,9 +160,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == -EEXIST);
 
          /* make sure certain values from h1 get updated in h2 */
-        c1->pgscan = 5555;
+        c1->pgscan = UINT64_MAX;
         c1->mem_pressure_limit = 6789;
-        c1->last_hit_mem_pressure_limit = 42;
+        c1->mem_pressure_limit_hit_start = 42;
+        c1->last_had_mem_reclaim = 888;
         assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
         assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0);
         c1 = hashmap_get(h1, cgroup);
@@ -170,9 +171,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         assert_se(c1);
         assert_se(c2);
         assert_se(c1 != c2);
-        assert_se(c2->last_pgscan == 5555);
+        assert_se(c2->last_pgscan == UINT64_MAX);
         assert_se(c2->mem_pressure_limit == 6789);
-        assert_se(c2->last_hit_mem_pressure_limit == 42);
+        assert_se(c2->mem_pressure_limit_hit_start == 42);
+        assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */
 
         /* Assert that avoid/omit are not set if the cgroup is not owned by root */
         if (test_xattrs) {
@@ -189,20 +191,22 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
         char **paths = STRV_MAKE("/0.slice",
                                  "/1.slice");
 
-        OomdCGroupContext ctx_old[3] = {
+        OomdCGroupContext ctx_old[2] = {
                 { .path = paths[0],
                   .mem_pressure_limit = 5,
-                  .last_hit_mem_pressure_limit = 777,
+                  .mem_pressure_limit_hit_start = 777,
+                  .last_had_mem_reclaim = 888,
                   .pgscan = 57 },
                 { .path = paths[1],
                   .mem_pressure_limit = 6,
-                  .last_hit_mem_pressure_limit = 888,
+                  .mem_pressure_limit_hit_start = 888,
+                  .last_had_mem_reclaim = 888,
                   .pgscan = 42 },
         };
 
-        OomdCGroupContext ctx_new[3] = {
+        OomdCGroupContext ctx_new[2] = {
                 { .path = paths[0],
-                  .pgscan = 100 },
+                  .pgscan = 57 },
                 { .path = paths[1],
                   .pgscan = 101 },
         };
@@ -221,13 +225,15 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
         assert_se(c_new = hashmap_get(h_new, "/0.slice"));
         assert_se(c_old->pgscan == c_new->last_pgscan);
         assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
-        assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
+        assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start);
+        assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim);
 
         assert_se(c_old = hashmap_get(h_old, "/1.slice"));
         assert_se(c_new = hashmap_get(h_new, "/1.slice"));
         assert_se(c_old->pgscan == c_new->last_pgscan);
         assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
-        assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
+        assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start);
+        assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim);
 }
 
 static void test_oomd_system_context_acquire(void) {
@@ -290,7 +296,7 @@ static void test_oomd_pressure_above(void) {
         assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1);
         assert_se(set_contains(t1, &ctx[0]) == true);
         assert_se(c = hashmap_get(h1, "/herp.slice"));
-        assert_se(c->last_hit_mem_pressure_limit > 0);
+        assert_se(c->mem_pressure_limit_hit_start > 0);
 
         /* Low memory pressure */
         assert_se(h2 = hashmap_new(&string_hash_ops));
@@ -298,7 +304,7 @@ static void test_oomd_pressure_above(void) {
         assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0);
         assert_se(t2 == NULL);
         assert_se(c = hashmap_get(h2, "/derp.slice"));
-        assert_se(c->last_hit_mem_pressure_limit == 0);
+        assert_se(c->mem_pressure_limit_hit_start == 0);
 
         /* High memory pressure w/ multiple cgroups */
         assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0);
@@ -306,50 +312,9 @@ static void test_oomd_pressure_above(void) {
         assert_se(set_contains(t3, &ctx[0]) == true);
         assert_se(set_size(t3) == 1);
         assert_se(c = hashmap_get(h1, "/herp.slice"));
-        assert_se(c->last_hit_mem_pressure_limit > 0);
+        assert_se(c->mem_pressure_limit_hit_start > 0);
         assert_se(c = hashmap_get(h1, "/derp.slice"));
-        assert_se(c->last_hit_mem_pressure_limit == 0);
-}
-
-static void test_oomd_memory_reclaim(void) {
-        _cleanup_hashmap_free_ Hashmap *h1 = NULL;
-        char **paths = STRV_MAKE("/0.slice",
-                                 "/1.slice",
-                                 "/2.slice",
-                                 "/3.slice",
-                                 "/4.slice");
-
-        OomdCGroupContext ctx[5] = {
-                { .path = paths[0],
-                  .last_pgscan = 100,
-                  .pgscan = 100 },
-                { .path = paths[1],
-                  .last_pgscan = 100,
-                  .pgscan = 100 },
-                { .path = paths[2],
-                  .last_pgscan = 77,
-                  .pgscan = 33 },
-                { .path = paths[3],
-                  .last_pgscan = UINT64_MAX,
-                  .pgscan = 100 },
-                { .path = paths[4],
-                  .last_pgscan = 100,
-                  .pgscan = UINT64_MAX },
-        };
-
-        assert_se(h1 = hashmap_new(&string_hash_ops));
-        assert_se(hashmap_put(h1, paths[0], &ctx[0]) >= 0);
-        assert_se(hashmap_put(h1, paths[1], &ctx[1]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
-
-        assert_se(hashmap_put(h1, paths[2], &ctx[2]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
-
-        assert_se(hashmap_put(h1, paths[4], &ctx[4]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == true);
-
-        assert_se(hashmap_put(h1, paths[3], &ctx[3]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
+        assert_se(c->mem_pressure_limit_hit_start == 0);
 }
 
 static void test_oomd_swap_free_below(void) {
@@ -468,7 +433,6 @@ int main(void) {
         test_oomd_update_cgroup_contexts_between_hashmaps();
         test_oomd_system_context_acquire();
         test_oomd_pressure_above();
-        test_oomd_memory_reclaim();
         test_oomd_swap_free_below();
         test_oomd_sort_cgroups();
 
index 576e4054c2944d2e1179761f19d78655e7b46a14..13f202d7e7dfb241fb2eda925b9d2ecc7d289c1c 100644 (file)
@@ -84,9 +84,8 @@ int umount_recursive(const char *prefix, int flags) {
         int n = 0, r;
         bool again;
 
-        /* Try to umount everything recursively below a
-         * directory. Also, take care of stacked mounts, and keep
-         * unmounting them until they are gone. */
+        /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and
+         * keep unmounting them until they are gone. */
 
         do {
                 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
@@ -132,68 +131,6 @@ int umount_recursive(const char *prefix, int flags) {
         return n;
 }
 
-static int get_mount_flags(
-                struct libmnt_table *table,
-                const char *path,
-                unsigned long *ret) {
-
-        _cleanup_close_ int fd = -1;
-        struct libmnt_fs *fs;
-        struct statvfs buf;
-        const char *opts;
-        int r;
-
-        /* Get the mount flags for the mountpoint at "path" from "table". We have a fallback using statvfs()
-         * in place (which provides us with mostly the same info), but it's just a fallback, since using it
-         * means triggering autofs or NFS mounts, which we'd rather avoid needlessly.
-         *
-         * This generally doesn't follow symlinks. */
-
-        fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
-        if (!fs) {
-                log_debug("Could not find '%s' in mount table, ignoring.", path);
-                goto fallback;
-        }
-
-        opts = mnt_fs_get_vfs_options(fs);
-        if (!opts) {
-                *ret = 0;
-                return 0;
-        }
-
-        r = mnt_optstr_get_flags(opts, ret, mnt_get_builtin_optmap(MNT_LINUX_MAP));
-        if (r != 0) {
-                log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
-                goto fallback;
-        }
-
-        /* MS_RELATIME is default and trying to set it in an unprivileged container causes EPERM */
-        *ret &= ~MS_RELATIME;
-        return 0;
-
-fallback:
-        fd = open(path, O_PATH|O_CLOEXEC|O_NOFOLLOW);
-        if (fd < 0)
-                return -errno;
-
-        if (fstatvfs(fd, &buf) < 0)
-                return -errno;
-
-        /* The statvfs() flags and the mount flags mostly have the same values, but for some cases do
-         * not. Hence map the flags manually. (Strictly speaking, ST_RELATIME/MS_RELATIME is the most
-         * prominent one that doesn't match, but that's the one we mask away anyway, see above.) */
-
-        *ret =
-                FLAGS_SET(buf.f_flag, ST_RDONLY) * MS_RDONLY |
-                FLAGS_SET(buf.f_flag, ST_NODEV) * MS_NODEV |
-                FLAGS_SET(buf.f_flag, ST_NOEXEC) * MS_NOEXEC |
-                FLAGS_SET(buf.f_flag, ST_NOSUID) * MS_NOSUID |
-                FLAGS_SET(buf.f_flag, ST_NOATIME) * MS_NOATIME |
-                FLAGS_SET(buf.f_flag, ST_NODIRATIME) * MS_NODIRATIME;
-
-        return 0;
-}
-
 /* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
  * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
 int bind_remount_recursive_with_mountinfo(
@@ -203,8 +140,8 @@ int bind_remount_recursive_with_mountinfo(
                 char **deny_list,
                 FILE *proc_self_mountinfo) {
 
-        _cleanup_set_free_free_ Set *done = NULL;
-        _cleanup_free_ char *simplified = NULL;
+        _cleanup_set_free_ Set *done = NULL;
+        unsigned n_tries = 0;
         int r;
 
         assert(prefix);
@@ -217,32 +154,21 @@ int bind_remount_recursive_with_mountinfo(
          * access, too. When mounts are stacked on the same mount point we only care for each individual
          * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
          * not have any effect on future submounts that might get propagated, they might be writable
-         * etc. This includes future submounts that have been triggered via autofs.
+         * etc. This includes future submounts that have been triggered via autofs. Also note that we can't
+         * operate atomically here. Mounts established while we process the tree might or might not get
+         * noticed and thus might or might not be covered.
          *
          * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
          * remount operation. Note that we'll ignore the deny list for the top-level path. */
 
-        simplified = strdup(prefix);
-        if (!simplified)
-                return -ENOMEM;
-
-        path_simplify(simplified, false);
-
-        done = set_new(&path_hash_ops);
-        if (!done)
-                return -ENOMEM;
-
         for (;;) {
-                _cleanup_set_free_free_ Set *todo = NULL;
                 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
                 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
+                _cleanup_hashmap_free_ Hashmap *todo = NULL;
                 bool top_autofs = false;
-                char *x;
-                unsigned long orig_flags;
 
-                todo = set_new(&path_hash_ops);
-                if (!todo)
-                        return -ENOMEM;
+                if (n_tries++ >= 32) /* Let's not retry this loop forever */
+                        return -EBUSY;
 
                 rewind(proc_self_mountinfo);
 
@@ -251,130 +177,159 @@ int bind_remount_recursive_with_mountinfo(
                         return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
 
                 for (;;) {
+                        _cleanup_free_ char *d = NULL;
+                        const char *path, *type, *opts;
+                        unsigned long flags = 0;
                         struct libmnt_fs *fs;
-                        const char *path, *type;
 
                         r = mnt_table_next_fs(table, iter, &fs);
-                        if (r == 1)
+                        if (r == 1) /* EOF */
                                 break;
                         if (r < 0)
                                 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
 
                         path = mnt_fs_get_target(fs);
+                        if (!path)
+                                continue;
+
+                        if (!path_startswith(path, prefix))
+                                continue;
+
                         type = mnt_fs_get_fstype(fs);
-                        if (!path || !type)
+                        if (!type)
+                                continue;
+
+                        /* Let's ignore autofs mounts. If they aren't triggered yet, we want to avoid
+                         * triggering them, as we don't make any guarantees for future submounts anyway. If
+                         * they are already triggered, then we will find another entry for this. */
+                        if (streq(type, "autofs")) {
+                                top_autofs = top_autofs || path_equal(path, prefix);
                                 continue;
+                        }
 
-                        if (!path_startswith(path, simplified))
+                        if (set_contains(done, path))
                                 continue;
 
                         /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
                          * we shall operate on. */
-                        if (!path_equal(path, simplified)) {
+                        if (!path_equal(path, prefix)) {
                                 bool deny_listed = false;
                                 char **i;
 
                                 STRV_FOREACH(i, deny_list) {
-                                        if (path_equal(*i, simplified))
+                                        if (path_equal(*i, prefix))
                                                 continue;
 
-                                        if (!path_startswith(*i, simplified))
+                                        if (!path_startswith(*i, prefix))
                                                 continue;
 
                                         if (path_startswith(path, *i)) {
                                                 deny_listed = true;
-                                                log_debug("Not remounting %s deny-listed by %s, called for %s",
-                                                          path, *i, simplified);
+                                                log_debug("Not remounting %s deny-listed by %s, called for %s", path, *i, prefix);
                                                 break;
                                         }
                                 }
+
                                 if (deny_listed)
                                         continue;
                         }
 
-                        /* Let's ignore autofs mounts.  If they aren't
-                         * triggered yet, we want to avoid triggering
-                         * them, as we don't make any guarantees for
-                         * future submounts anyway.  If they are
-                         * already triggered, then we will find
-                         * another entry for this. */
-                        if (streq(type, "autofs")) {
-                                top_autofs = top_autofs || path_equal(path, simplified);
-                                continue;
-                        }
-
-                        if (!set_contains(done, path)) {
-                                r = set_put_strdup(&todo, path);
+                        opts = mnt_fs_get_vfs_options(fs);
+                        if (opts) {
+                                r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
                                 if (r < 0)
-                                        return r;
+                                        log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
                         }
-                }
 
-                /* If we have no submounts to process anymore and if
-                 * the root is either already done, or an autofs, we
-                 * are done */
-                if (set_isempty(todo) &&
-                    (top_autofs || set_contains(done, simplified)))
-                        return 0;
+                        d = strdup(path);
+                        if (!d)
+                                return -ENOMEM;
 
-                if (!set_contains(done, simplified) &&
-                    !set_contains(todo, simplified)) {
-                        /* The prefix directory itself is not yet a mount, make it one. */
-                        r = mount_nofollow(simplified, simplified, NULL, MS_BIND|MS_REC, NULL);
+                        r = hashmap_ensure_put(&todo, &path_hash_ops_free, d, ULONG_TO_PTR(flags));
+                        if (r == -EEXIST)
+                                continue;
                         if (r < 0)
                                 return r;
+                        if (r > 0)
+                                TAKE_PTR(d);
+                }
 
-                        orig_flags = 0;
-                        (void) get_mount_flags(table, simplified, &orig_flags);
+                /* Check if the top-level directory was among what we have seen so far. For that check both
+                 * 'done' and 'todo'. Also check 'top_autofs' because if the top-level dir is an autofs we'll
+                 * not include it in either set but will set this bool. */
+                if (!set_contains(done, prefix) &&
+                    !(top_autofs || hashmap_contains(todo, prefix))) {
 
-                        r = mount_nofollow(NULL, simplified, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
+                        /* The prefix directory itself is not yet a mount, make it one. */
+                        r = mount_nofollow(prefix, prefix, NULL, MS_BIND|MS_REC, NULL);
                         if (r < 0)
                                 return r;
 
-                        log_debug("Made top-level directory %s a mount point.", prefix);
-
-                        r = set_put_strdup(&done, simplified);
-                        if (r < 0)
-                                return r;
+                        /* Immediately rescan, so that we pick up the new mount's flags */
+                        continue;
                 }
 
-                while ((x = set_steal_first(todo))) {
+                /* If we have no submounts to process anymore, we are done */
+                if (hashmap_isempty(todo))
+                        return 0;
+
+                for (;;) {
+                        unsigned long flags;
+                        char *x = NULL;
+
+                        /* Take the first mount from our list of mounts to still process */
+                        flags = PTR_TO_ULONG(hashmap_steal_first_key_and_value(todo, (void**) &x));
+                        if (!x)
+                                break;
 
-                        r = set_consume(done, x);
+                        r = set_ensure_consume(&done, &path_hash_ops_free, x);
                         if (IN_SET(r, 0, -EEXIST))
-                                continue;
+                                continue; /* Already done */
                         if (r < 0)
                                 return r;
 
-                        /* Deal with mount points that are obstructed by a later mount */
-                        r = path_is_mount_point(x, NULL, 0);
-                        if (IN_SET(r, 0, -ENOENT))
-                                continue;
+                        /* Now, remount this with the new flags set, but exclude MS_RELATIME from it. (It's
+                         * the default anyway, thus redundant, and in userns we'll get an error if we try to
+                         * explicitly enable it) */
+                        r = mount_nofollow(NULL, x, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
                         if (r < 0) {
-                                if (!ERRNO_IS_PRIVILEGE(r))
+                                int q;
+
+                                /* OK, so the remount of this entry failed. We'll ultimately ignore this in
+                                 * almost all cases (there are simply so many reasons why this can fail,
+                                 * think autofs, NFS, FUSE, …), but let's generate useful debug messages at
+                                 * the very least. */
+
+                                q = path_is_mount_point(x, NULL, 0);
+                                if (IN_SET(q, 0, -ENOENT)) {
+                                        /* Hmm, whaaaa? The mount point is not actually a mount point? Then
+                                         * it is either obstructed by a later mount or somebody has been
+                                         * racing against us and removed it. Either way the mount point
+                                         * doesn't matter to us, let's ignore it hence. */
+                                        log_debug_errno(r, "Mount point '%s' to remount is not a mount point anymore, ignoring remount failure: %m", x);
+                                        continue;
+                                }
+                                if (q < 0) /* Any other error on this? Just log and continue */
+                                        log_debug_errno(q, "Failed to determine whether '%s' is a mount point or not, ignoring: %m", x);
+
+                                if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) == 0) { /* ignore MS_RELATIME while comparing */
+                                        log_debug_errno(r, "Couldn't remount '%s', but the flags already match what we want, hence ignoring: %m", x);
+                                        continue;
+                                }
+
+                                /* Make this fatal if this is the top-level mount */
+                                if (path_equal(x, prefix))
                                         return r;
 
-                                /* Even if root user invoke this, submounts under private FUSE or NFS mount points
-                                 * may not be acceessed. E.g.,
-                                 *
-                                 * $ bindfs --no-allow-other ~/mnt/mnt ~/mnt/mnt
-                                 * $ bindfs --no-allow-other ~/mnt ~/mnt
-                                 *
-                                 * Then, root user cannot access the mount point ~/mnt/mnt.
-                                 * In such cases, the submounts are ignored, as we have no way to manage them. */
-                                log_debug_errno(r, "Failed to determine '%s' is mount point or not, ignoring: %m", x);
+                                /* If this is not the top-level mount, then handle this gracefully: log but
+                                 * otherwise ignore. With NFS, FUSE, autofs there are just too many reasons
+                                 * this might fail without a chance for us to do anything about it, let's
+                                 * hence be strict on the top-level mount and lenient on the inner ones. */
+                                log_debug_errno(r, "Couldn't remount submount '%s' for unexpected reason, ignoring: %m", x);
                                 continue;
                         }
 
-                        /* Try to reuse the original flag set */
-                        orig_flags = 0;
-                        (void) get_mount_flags(table, x, &orig_flags);
-
-                        r = mount_nofollow(NULL, x, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
-                        if (r < 0)
-                                return r;
-
-                        log_debug("Remounted %s read-only.", x);
+                        log_debug("Remounted %s.", x);
                 }
         }
 }
@@ -402,7 +357,9 @@ int bind_remount_one_with_mountinfo(
                 FILE *proc_self_mountinfo) {
 
         _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
-        unsigned long orig_flags = 0;
+        unsigned long flags = 0;
+        struct libmnt_fs *fs;
+        const char *opts;
         int r;
 
         assert(path);
@@ -418,12 +375,32 @@ int bind_remount_one_with_mountinfo(
         if (r < 0)
                 return r;
 
-        /* Try to reuse the original flag set */
-        (void) get_mount_flags(table, path, &orig_flags);
+        fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
+        if (!fs) {
+                if (laccess(path, F_OK) < 0) /* Hmm, it's not in the mount table, but does it exist at all? */
+                        return -errno;
+
+                return -EINVAL; /* Not a mount point we recognize */
+        }
 
-        r = mount_nofollow(NULL, path, NULL, (orig_flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags, NULL);
-        if (r < 0)
-                return r;
+        opts = mnt_fs_get_vfs_options(fs);
+        if (opts) {
+                r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
+                if (r < 0)
+                        log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
+        }
+
+        r = mount_nofollow(NULL, path, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
+        if (r < 0) {
+                if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) != 0) /* Ignore MS_RELATIME again,
+                                                                             * since kernel adds it in
+                                                                             * everywhere, because it's the
+                                                                             * default. */
+                        return r;
+
+                /* Let's handle redundant remounts gracefully */
+                log_debug_errno(r, "Failed to remount '%s' but flags already match what we want, ignoring: %m", path);
+        }
 
         return 0;
 }
index 0d471a2f942f5ca3eac204f3e93f3fdaf4e2dcd9..613350bd469f580f02fe2b88d324b17a67281170 100644 (file)
@@ -27,6 +27,7 @@ typedef enum LookupWhat {
 
 struct UserDBIterator {
         LookupWhat what;
+        UserDBFlags flags;
         Set *links;
         bool nss_covered:1;
         bool nss_iterating:1;
@@ -92,7 +93,7 @@ UserDBIterator* userdb_iterator_free(UserDBIterator *iterator) {
         return mfree(iterator);
 }
 
-static UserDBIterator* userdb_iterator_new(LookupWhat what) {
+static UserDBIterator* userdb_iterator_new(LookupWhat what, UserDBFlags flags) {
         UserDBIterator *i;
 
         assert(what >= 0);
@@ -104,6 +105,7 @@ static UserDBIterator* userdb_iterator_new(LookupWhat what) {
 
         *i = (UserDBIterator) {
                 .what = what,
+                .flags = flags,
         };
 
         return i;
@@ -608,7 +610,7 @@ int userdb_by_name(const char *name, UserDBFlags flags, UserRecord **ret) {
         if (r < 0)
                 return r;
 
-        iterator = userdb_iterator_new(LOOKUP_USER);
+        iterator = userdb_iterator_new(LOOKUP_USER, flags);
         if (!iterator)
                 return -ENOMEM;
 
@@ -655,7 +657,7 @@ int userdb_by_uid(uid_t uid, UserDBFlags flags, UserRecord **ret) {
         if (r < 0)
                 return r;
 
-        iterator = userdb_iterator_new(LOOKUP_USER);
+        iterator = userdb_iterator_new(LOOKUP_USER, flags);
         if (!iterator)
                 return -ENOMEM;
 
@@ -693,7 +695,7 @@ int userdb_all(UserDBFlags flags, UserDBIterator **ret) {
 
         assert(ret);
 
-        iterator = userdb_iterator_new(LOOKUP_USER);
+        iterator = userdb_iterator_new(LOOKUP_USER, flags);
         if (!iterator)
                 return -ENOMEM;
 
@@ -738,10 +740,15 @@ int userdb_iterator_get(UserDBIterator *iterator, UserRecord **ret) {
                         if (pw->pw_uid == UID_NOBODY)
                                 iterator->synthesize_nobody = false;
 
-                        r = nss_spwd_for_passwd(pw, &spwd, &buffer);
-                        if (r < 0) {
-                                log_debug_errno(r, "Failed to acquire shadow entry for user %s, ignoring: %m", pw->pw_name);
-                                incomplete = ERRNO_IS_PRIVILEGE(r);
+                        if (!FLAGS_SET(iterator->flags, USERDB_AVOID_SHADOW)) {
+                                r = nss_spwd_for_passwd(pw, &spwd, &buffer);
+                                if (r < 0) {
+                                        log_debug_errno(r, "Failed to acquire shadow entry for user %s, ignoring: %m", pw->pw_name);
+                                        incomplete = ERRNO_IS_PRIVILEGE(r);
+                                }
+                        } else {
+                                r = -EUCLEAN;
+                                incomplete = true;
                         }
 
                         r = nss_passwd_to_user_record(pw, r >= 0 ? &spwd : NULL, ret);
@@ -750,6 +757,8 @@ int userdb_iterator_get(UserDBIterator *iterator, UserRecord **ret) {
 
                         if (ret)
                                 (*ret)->incomplete = incomplete;
+
+                        iterator->n_found++;
                         return r;
                 }
 
@@ -774,11 +783,11 @@ int userdb_iterator_get(UserDBIterator *iterator, UserRecord **ret) {
                         iterator->n_found++;
                         return synthetic_nobody_user_build(ret);
                 }
-        }
 
-        /* if we found at least one entry, then ignore errors and indicate that we reached the end */
-        if (r < 0 && iterator->n_found > 0)
-                return -ESRCH;
+                /* if we found at least one entry, then ignore errors and indicate that we reached the end */
+                if (iterator->n_found > 0)
+                        return -ESRCH;
+        }
 
         return r;
 }
@@ -812,7 +821,7 @@ int groupdb_by_name(const char *name, UserDBFlags flags, GroupRecord **ret) {
         if (r < 0)
                 return r;
 
-        iterator = userdb_iterator_new(LOOKUP_GROUP);
+        iterator = userdb_iterator_new(LOOKUP_GROUP, flags);
         if (!iterator)
                 return -ENOMEM;
 
@@ -856,7 +865,7 @@ int groupdb_by_gid(gid_t gid, UserDBFlags flags, GroupRecord **ret) {
         if (r < 0)
                 return r;
 
-        iterator = userdb_iterator_new(LOOKUP_GROUP);
+        iterator = userdb_iterator_new(LOOKUP_GROUP, flags);
         if (!iterator)
                 return -ENOMEM;
 
@@ -893,7 +902,7 @@ int groupdb_all(UserDBFlags flags, UserDBIterator **ret) {
 
         assert(ret);
 
-        iterator = userdb_iterator_new(LOOKUP_GROUP);
+        iterator = userdb_iterator_new(LOOKUP_GROUP, flags);
         if (!iterator)
                 return -ENOMEM;
 
@@ -908,8 +917,8 @@ int groupdb_all(UserDBFlags flags, UserDBIterator **ret) {
 
                 setgrent();
                 iterator->nss_iterating = true;
-        } if (r < 0)
-                  return r;
+        } else if (r < 0)
+                return r;
 
         *ret = TAKE_PTR(iterator);
         return 0;
@@ -936,10 +945,15 @@ int groupdb_iterator_get(UserDBIterator *iterator, GroupRecord **ret) {
                         if (gr->gr_gid == GID_NOBODY)
                                 iterator->synthesize_nobody = false;
 
-                        r = nss_sgrp_for_group(gr, &sgrp, &buffer);
-                        if (r < 0) {
-                                log_debug_errno(r, "Failed to acquire shadow entry for group %s, ignoring: %m", gr->gr_name);
-                                incomplete = ERRNO_IS_PRIVILEGE(r);
+                        if (!FLAGS_SET(iterator->flags, USERDB_AVOID_SHADOW)) {
+                                r = nss_sgrp_for_group(gr, &sgrp, &buffer);
+                                if (r < 0) {
+                                        log_debug_errno(r, "Failed to acquire shadow entry for group %s, ignoring: %m", gr->gr_name);
+                                        incomplete = ERRNO_IS_PRIVILEGE(r);
+                                }
+                        } else {
+                                r = -EUCLEAN;
+                                incomplete = true;
                         }
 
                         r = nss_group_to_group_record(gr, r >= 0 ? &sgrp : NULL, ret);
@@ -948,6 +962,8 @@ int groupdb_iterator_get(UserDBIterator *iterator, GroupRecord **ret) {
 
                         if (ret)
                                 (*ret)->incomplete = incomplete;
+
+                        iterator->n_found++;
                         return r;
                 }
 
@@ -971,11 +987,11 @@ int groupdb_iterator_get(UserDBIterator *iterator, GroupRecord **ret) {
                         iterator->n_found++;
                         return synthetic_nobody_group_build(ret);
                 }
-        }
 
-        /* if we found at least one entry, then ignore errors and indicate that we reached the end */
-        if (r < 0 && iterator->n_found > 0)
-                return -ESRCH;
+                /* if we found at least one entry, then ignore errors and indicate that we reached the end */
+                if (iterator->n_found > 0)
+                        return -ESRCH;
+        }
 
         return r;
 }
@@ -995,7 +1011,7 @@ int membershipdb_by_user(const char *name, UserDBFlags flags, UserDBIterator **r
         if (r < 0)
                 return r;
 
-        iterator = userdb_iterator_new(LOOKUP_MEMBERSHIP);
+        iterator = userdb_iterator_new(LOOKUP_MEMBERSHIP, flags);
         if (!iterator)
                 return -ENOMEM;
 
@@ -1038,7 +1054,7 @@ int membershipdb_by_group(const char *name, UserDBFlags flags, UserDBIterator **
         if (r < 0)
                 return r;
 
-        iterator = userdb_iterator_new(LOOKUP_MEMBERSHIP);
+        iterator = userdb_iterator_new(LOOKUP_MEMBERSHIP, flags);
         if (!iterator)
                 return -ENOMEM;
 
@@ -1079,7 +1095,7 @@ int membershipdb_all(UserDBFlags flags, UserDBIterator **ret) {
 
         assert(ret);
 
-        iterator = userdb_iterator_new(LOOKUP_MEMBERSHIP);
+        iterator = userdb_iterator_new(LOOKUP_MEMBERSHIP, flags);
         if (!iterator)
                 return -ENOMEM;
 
index 41df5588d082948a11a15a1951c1fa6a6ba8bcbc..2e9116d359dbeb9306f99a75a844541dd70d5753 100644 (file)
@@ -1,11 +1,20 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 
 #include <sys/mount.h>
+#include <sys/statvfs.h>
 
 #include "alloc-util.h"
+#include "fd-util.h"
+#include "fileio.h"
 #include "mount-util.h"
+#include "namespace-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "rm-rf.h"
 #include "string-util.h"
+#include "strv.h"
 #include "tests.h"
+#include "tmpfile-util.h"
 
 static void test_mount_option_mangle(void) {
         char *opts = NULL;
@@ -61,10 +70,98 @@ static void test_mount_option_mangle(void) {
         assert_se(mount_option_mangle("rw,relatime,fmask=0022,dmask=0022,\"hogehoge", MS_RDONLY, &f, &opts) < 0);
 }
 
+static void test_bind_remount_recursive(void) {
+        _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL;
+        _cleanup_free_ char *subdir = NULL;
+        const char *p;
+
+        if (geteuid() != 0) {
+                (void) log_tests_skipped("not running as root");
+                return;
+        }
+
+        assert_se(mkdtemp_malloc("/tmp/XXXXXX", &tmp) >= 0);
+        subdir = path_join(tmp, "subdir");
+        assert_se(subdir);
+        assert_se(mkdir(subdir, 0755) >= 0);
+
+        FOREACH_STRING(p, "/usr", "/sys", "/", tmp) {
+                pid_t pid;
+
+                pid = fork();
+                assert_se(pid >= 0);
+
+                if (pid == 0) {
+                        struct statvfs svfs;
+                        /* child */
+                        assert_se(detach_mount_namespace() >= 0);
+
+                        /* Check that the subdir is writable (it must be because it's in /tmp) */
+                        assert_se(statvfs(subdir, &svfs) >= 0);
+                        assert_se(!FLAGS_SET(svfs.f_flag, ST_RDONLY));
+
+                        /* Make the subdir a bind mount */
+                        assert_se(mount_nofollow(subdir, subdir, NULL, MS_BIND|MS_REC, NULL) >= 0);
+
+                        /* Ensure it's still writable */
+                        assert_se(statvfs(subdir, &svfs) >= 0);
+                        assert_se(!FLAGS_SET(svfs.f_flag, ST_RDONLY));
+
+                        /* Now mark the path we currently run for read-only */
+                        assert_se(bind_remount_recursive(p, MS_RDONLY, MS_RDONLY, STRV_MAKE("/sys/kernel")) >= 0);
+
+                        /* Ensure that this worked on the top-level */
+                        assert_se(statvfs(p, &svfs) >= 0);
+                        assert_se(FLAGS_SET(svfs.f_flag, ST_RDONLY));
+
+                        /* And ensure this had an effect on the subdir exactly if we are talking about a path above the subdir */
+                        assert_se(statvfs(subdir, &svfs) >= 0);
+                        assert_se(FLAGS_SET(svfs.f_flag, ST_RDONLY) == !!path_startswith(subdir, p));
+
+                        _exit(EXIT_SUCCESS);
+                }
+
+                assert_se(wait_for_terminate_and_check("test-remount-rec", pid, WAIT_LOG) == EXIT_SUCCESS);
+        }
+}
+
+static void test_bind_remount_one(void) {
+        pid_t pid;
+
+        if (geteuid() != 0) {
+                (void) log_tests_skipped("not running as root");
+                return;
+        }
+
+        pid = fork();
+        assert_se(pid >= 0);
+
+        if (pid == 0) {
+                /* child */
+
+                _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
+
+                assert_se(detach_mount_namespace() >= 0);
+
+                assert_se(fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo) >= 0);
+
+                assert_se(bind_remount_one_with_mountinfo("/run", MS_RDONLY, MS_RDONLY, proc_self_mountinfo) >= 0);
+                assert_se(bind_remount_one_with_mountinfo("/proc/idontexist", MS_RDONLY, MS_RDONLY, proc_self_mountinfo) == -ENOENT);
+                assert_se(bind_remount_one_with_mountinfo("/proc/self", MS_RDONLY, MS_RDONLY, proc_self_mountinfo) == -EINVAL);
+                assert_se(bind_remount_one_with_mountinfo("/", MS_RDONLY, MS_RDONLY, proc_self_mountinfo) >= 0);
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        assert_se(wait_for_terminate_and_check("test-remount-one", pid, WAIT_LOG) == EXIT_SUCCESS);
+}
+
 int main(int argc, char *argv[]) {
         test_setup_logging(LOG_DEBUG);
 
         test_mount_option_mangle();
+        test_bind_remount_recursive();
+        test_bind_remount_one();
 
         return 0;
 }