core/mount: minimize impact on mount storm.

If we create 2000 mounts (on a 1-CPU qemu VM) with

    mkdir -p /MNT/{1..2000}
    time for i in {1..2000}; do mount --bind /etc /MNT/$i ; done

it takes around 20 seconds to complete. Much of this time is taken up by systemd repeatedly processing /proc/self/mountinfo. If I disable the processing, the time drops to about 4 seconds.

I have reports that on a larger system with multiple active user sessions, each with its own systemd, the impact can be higher. One particular use-case where a large number of mounts can be expected in quick succession is when the "clearcase" SCM starts up.

This patch modifies the handling of events from /proc/self/mountinfo so that systemd backs off when a storm is detected. Specifically, the time to process mountinfo is measured, and the process will not be repeated until 10 times that duration has passed. This ensures systemd won't use more than 10% of real time processing mountinfo.

With this patch, my test above takes about 5 seconds.
author: NeilBrown <neilb@suse.com> 2018-10-04 15:49:22 +1000
committer: Lennart Poettering <lennart@poettering.net> 2018-12-16 12:38:40 +0100
commit: 89f9752ea08f516b5d77f8e577bb772073c70c01 (patch)
tree: ba50cc1133b37d85da63dda234971a475833ccb5
parent: Merge pull request #11143 from keszybz/enable-symlink (diff)
download: systemd-89f9752ea08f516b5d77f8e577bb772073c70c01.tar.gz
systemd-89f9752ea08f516b5d77f8e577bb772073c70c01.tar.bz2
systemd-89f9752ea08f516b5d77f8e577bb772073c70c01.zip
2 files changed, 76 insertions, 7 deletions
diff --git a/src/core/manager.h b/src/core/manager.h
index bce8020cf..9f8fc4643 100644
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -227,6 +227,9 @@ struct Manager {
         /* Data specific to the mount subsystem */
         struct libmnt_monitor *mount_monitor;
         sd_event_source *mount_event_source;
+        sd_event_source *mount_timeout_source;
+        usec_t mount_last_read_usec;
+        usec_t mount_last_duration_usec;
 
         /* Data specific to the swap filesystem */
         FILE *proc_swaps;
diff --git a/src/core/mount.c b/src/core/mount.c
index ead9bc1f4..cfdcc6e6f 100644
--- a/src/core/mount.c
+++ b/src/core/mount.c
@@ -55,6 +55,7 @@ static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = {
 
 static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
 static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int mount_dispatch_proc_self_mountinfo_timer(sd_event_source *source, usec_t usec, void *userdata);
 
 static bool MOUNT_STATE_WITH_PROCESS(MountState state) {
         return IN_SET(state,
@@ -1665,6 +1666,7 @@ static int mount_load_proc_self_mountinfo(Manager *m, bool set_flags) {
 static void mount_shutdown(Manager *m) {
         assert(m);
 
+        m->mount_timeout_source = sd_event_source_unref(m->mount_timeout_source);
         m->mount_event_source = sd_event_source_unref(m->mount_event_source);
 
         mnt_unref_monitor(m->mount_monitor);
@@ -1780,13 +1782,50 @@ fail:
         mount_shutdown(m);
 }
 
+static void mount_process_proc_self_mountinfo(Manager *m);
+
 static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
-        _cleanup_set_free_free_ Set *around = NULL, *gone = NULL;
         Manager *m = userdata;
-        const char *what;
-        Iterator i;
-        Unit *u;
         int r;
+        usec_t next_read = usec_add(m->mount_last_read_usec,
+                                    m->mount_last_duration_usec * 10);
+
+        if (now(CLOCK_MONOTONIC) < next_read) {
+                /* The (current) API for getting mount events from the Linux kernel
+                 * involves getting a "something changed" notification, and then having
+                 * to re-read the entire /proc/self/mountinfo file.  When there are lots
+                 * of mountpoints, this file is large and parsing it can take noticeable
+                 * time.  As most of the file won't have changed, this can be seen as wasted time.
+                 * If there is a "mount storm" such as 1000 mount points being created
+                 * in quick succession, this will result in 1000 successive notifications.
+                 * If we respond to every notification, we will do quadratically more work
+                 * than if we respond just once after all the notifications have arrived.
+                 * In this (pathological) case, a delay in scheduling would actually
+                 * improve throughput as we would combine notifications and parse
+                 * the file less often.  We cannot expect the scheduler to notice
+                 * this pathology without help.
+                 * So when the rate of notifications means we are spending more than
+                 * 10% of real time handling them, we set a timer and stop listening
+                 * to notifications for a while.
+                 * If/when Linux provides an API which provides only details of what
+                 * has changed, this rate-limiting can be removed.
+                 */
+
+                r = sd_event_source_set_enabled(source, SD_EVENT_OFF);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to disable monitoring of /proc/self/mounting, ignoring: %m");
+                if (!m->mount_timeout_source) {
+                        r = sd_event_add_time(m->event, &m->mount_timeout_source,
+                                              CLOCK_MONOTONIC,
+                                              next_read,
+                                              0,
+                                              mount_dispatch_proc_self_mountinfo_timer,
+                                              m);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to set timeout to reread /proc/self/mounting, ignoring: %m");
+                }
+                return 0;
+        }
 
         assert(m);
         assert(revents & EPOLLIN);
@@ -1814,13 +1853,40 @@ static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents,
                         return 0;
         }
 
+        mount_process_proc_self_mountinfo(m);
+        return 0;
+}
+
+static int mount_dispatch_proc_self_mountinfo_timer(sd_event_source *source, usec_t usec, void *userdata) {
+        Manager *m = userdata;
+        int r;
+
+        r = sd_event_source_set_enabled(m->mount_event_source, SD_EVENT_ON);
+        if (r < 0)
+                log_warning_errno(r, "Failed to reenable /proc/self/mountinfo monitor, ignoring: %m");
+        m->mount_timeout_source = sd_event_source_unref(source);
+        mount_process_proc_self_mountinfo(m);
+        return 0;
+}
+
+static void mount_process_proc_self_mountinfo(Manager *m) {
+        _cleanup_set_free_free_ Set *around = NULL, *gone = NULL;
+        const char *what;
+        Iterator i;
+        Unit *u;
+        int r;
+
+        m->mount_last_read_usec = now(CLOCK_MONOTONIC);
+        /* If an error occurs, assume 10ms */
+        m->mount_last_duration_usec = 10 * USEC_PER_MSEC;
+
         r = mount_load_proc_self_mountinfo(m, true);
         if (r < 0) {
                 /* Reset flags, just in case, for later calls */
                 LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT])
                         MOUNT(u)->proc_flags = 0;
 
-                return 0;
+                return;
         }
 
         manager_dispatch_load_queue(m);
@@ -1908,8 +1974,8 @@ static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents,
                 /* Let the device units know that the device is no longer mounted */
                 device_found_node(m, what, 0, DEVICE_FOUND_MOUNT);
         }
-
-        return 0;
+        m->mount_last_duration_usec = usec_sub_unsigned(now(CLOCK_MONOTONIC),
+                                                        m->mount_last_read_usec);
 }
 
 static void mount_reset_failed(Unit *u) {
author	NeilBrown <neilb@suse.com>	2018-10-04 15:49:22 +1000
committer	Lennart Poettering <lennart@poettering.net>	2018-12-16 12:38:40 +0100
commit	89f9752ea08f516b5d77f8e577bb772073c70c01 (patch)
tree	ba50cc1133b37d85da63dda234971a475833ccb5
parent	Merge pull request #11143 from keszybz/enable-symlink (diff)
download	systemd-89f9752ea08f516b5d77f8e577bb772073c70c01.tar.gz systemd-89f9752ea08f516b5d77f8e577bb772073c70c01.tar.bz2 systemd-89f9752ea08f516b5d77f8e577bb772073c70c01.zip