From 89f9752ea08f516b5d77f8e577bb772073c70c01 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 4 Oct 2018 15:49:22 +1000
Subject: core/mount: minimize impact on mount storm.

If we create 2000 mounts (on a 1-CPU qemu VM) with
  mkdir -p /MNT/{1..2000}
  time for i in {1..2000}; do mount --bind /etc /MNT/$i ; done

it takes around 20 seconds to complete.  Much of this time is taken up
by systemd repeatedly processing /proc/self/mountinfo.
If I disable the processing, the time drops to about 4 seconds.

I have reports that on a larger system with multiple active user sessions, each
with it's own systemd, the impact can be higher.

One particular use-case where a large number of mounts can be expected in quick
succession is when the "clearcase" SCM starts up.

This patch modifies the handling up events from /proc/self/mountinfo so
that systemd backs off when a storm is detected.  Specifically the time to process
mountinfo is measured, and the process will not be repeated until 10 times
that duration has passed.  This ensures systemd won't use more than 10% of
real time processing mountinfo.

With this patch, my test above takes about 5 seconds.
---
 src/core/manager.h |  3 ++
 src/core/mount.c   | 80 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/src/core/manager.h b/src/core/manager.h
index bce8020cf..9f8fc4643 100644
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -227,6 +227,9 @@ struct Manager {
         /* Data specific to the mount subsystem */
         struct libmnt_monitor *mount_monitor;
         sd_event_source *mount_event_source;
+        sd_event_source *mount_timeout_source;
+        usec_t mount_last_read_usec;
+        usec_t mount_last_duration_usec;
 
         /* Data specific to the swap filesystem */
         FILE *proc_swaps;
diff --git a/src/core/mount.c b/src/core/mount.c
index ead9bc1f4..cfdcc6e6f 100644
--- a/src/core/mount.c
+++ b/src/core/mount.c
@@ -55,6 +55,7 @@ static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = {
 
 static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
 static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int mount_dispatch_proc_self_mountinfo_timer(sd_event_source *source, usec_t usec, void *userdata);
 
 static bool MOUNT_STATE_WITH_PROCESS(MountState state) {
         return IN_SET(state,
@@ -1665,6 +1666,7 @@ static int mount_load_proc_self_mountinfo(Manager *m, bool set_flags) {
 static void mount_shutdown(Manager *m) {
         assert(m);
 
+        m->mount_timeout_source = sd_event_source_unref(m->mount_timeout_source);
         m->mount_event_source = sd_event_source_unref(m->mount_event_source);
 
         mnt_unref_monitor(m->mount_monitor);
@@ -1780,13 +1782,50 @@ fail:
         mount_shutdown(m);
 }
 
+static void mount_process_proc_self_mountinfo(Manager *m);
+
 static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
-        _cleanup_set_free_free_ Set *around = NULL, *gone = NULL;
         Manager *m = userdata;
-        const char *what;
-        Iterator i;
-        Unit *u;
         int r;
+        usec_t next_read = usec_add(m->mount_last_read_usec,
+                                    m->mount_last_duration_usec * 10);
+
+        if (now(CLOCK_MONOTONIC) < next_read) {
+                /* The (current) API for getting mount events from the Linux kernel
+                 * involves getting a "something changed" notification, and then having
+                 * to re-read the entire /proc/self/mountinfo file.  When there are lots
+                 * of mountpoints, this file is large and parsing it can take noticeable
+                 * time.  As most of the file won't have changed, this can be seen as wasted time.
+                 * If there is a "mount storm" such as 1000 mount points being created
+                 * in quick succession, this will result in 1000 successive notification.
+                 * If we respond to every notification, we will do quadratically more work
+                 * than if we respond just once after all the notifications have arrived.
+                 * In this (pathological) case, a delay in scheduling would actually
+                 * improve throughput as we would combine notifications and parse
+                 * the file less often.  We cannot expect the scheduler to notice
+                 * this pathology without help.
+                 * So when the rate of notifications means we are spending more than
+                 * 10% of real time handling them, we set a timer and stop listening
+                 * to notifications for a while.
+                 * If/when Linux provides an API which provides only details of what
+                 * has changed, this rate-limiting can be removed.
+                 */
+
+                r = sd_event_source_set_enabled(source, SD_EVENT_OFF);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to disable monitoring of /proc/self/mounting, ignoring: %m");
+                if (!m->mount_timeout_source) {
+                        r = sd_event_add_time(m->event, &m->mount_timeout_source,
+                                              CLOCK_MONOTONIC,
+                                              next_read,
+                                              0,
+                                              mount_dispatch_proc_self_mountinfo_timer,
+                                              m);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to set timeout to reread /proc/self/mounting, ignoring: %m");
+                }
+                return 0;
+        }
 
         assert(m);
         assert(revents & EPOLLIN);
@@ -1814,13 +1853,40 @@ static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents,
                         return 0;
         }
 
+        mount_process_proc_self_mountinfo(m);
+        return 0;
+}
+
+static int mount_dispatch_proc_self_mountinfo_timer(sd_event_source *source, usec_t usec, void *userdata) {
+        Manager *m = userdata;
+        int r;
+
+        r = sd_event_source_set_enabled(m->mount_event_source, SD_EVENT_ON);
+        if (r < 0)
+                log_warning_errno(r, "Failed to reenable /proc/self/mountinfo monitor, ignoring: %m");
+        m->mount_timeout_source = sd_event_source_unref(source);
+        mount_process_proc_self_mountinfo(m);
+        return 0;
+}
+
+static void mount_process_proc_self_mountinfo(Manager *m) {
+        _cleanup_set_free_free_ Set *around = NULL, *gone = NULL;
+        const char *what;
+        Iterator i;
+        Unit *u;
+        int r;
+
+        m->mount_last_read_usec = now(CLOCK_MONOTONIC);
+        /* If an error occurs, assume 10ms */
+        m->mount_last_duration_usec = 10 * USEC_PER_MSEC;
+
         r = mount_load_proc_self_mountinfo(m, true);
         if (r < 0) {
                 /* Reset flags, just in case, for later calls */
                 LIST_FOREACH(units_by_type, u, m->units_by_type[UNIT_MOUNT])
                         MOUNT(u)->proc_flags = 0;
 
-                return 0;
+                return;
         }
 
         manager_dispatch_load_queue(m);
@@ -1908,8 +1974,8 @@ static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents,
                 /* Let the device units know that the device is no longer mounted */
                 device_found_node(m, what, 0, DEVICE_FOUND_MOUNT);
         }
-
-        return 0;
+        m->mount_last_duration_usec = usec_sub_unsigned(now(CLOCK_MONOTONIC),
+                                                        m->mount_last_read_usec);
 }
 
 static void mount_reset_failed(Unit *u) {
-- 
cgit v1.2.3-65-gdbad