fs: lockless mntns lookup for nsfs
We already made the rbtree lookup lockless for the simple lookup case.
However, walking the list of mount namespaces via nsfs still takes the
read lock, pointlessly blocking concurrent additions of new mount
namespaces. Such additions are rare anyway, so allow lockless lookup of
the previous and next mount namespace by keeping a separate list. This
also lets us simplify parts of the code.

Link: https://lore.kernel.org/r/[email protected]
Reviewed-by: Jeff Layton <[email protected]>
Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Christian Brauner <[email protected]>
brauner committed Jan 9, 2025
1 parent 67d676b commit 4368898
Showing 3 changed files with 34 additions and 26 deletions.
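
For context, the lookup being optimized backs the nsfs ioctls that let userspace iterate over mount namespaces. Below is a rough sketch of such a walk; it assumes the NS_MNT_GET_NEXT ioctl and struct mnt_ns_info from the related mount namespace iteration UAPI in <linux/nsfs.h>, so treat the details as illustrative rather than as part of this commit:

/* walk_mntns.c - hedged example: iterate mount namespaces via nsfs */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nsfs.h>

int main(void)
{
        /* Start the walk from our own mount namespace. */
        int fd = open("/proc/self/ns/mnt", O_RDONLY);
        if (fd < 0)
                return 1;

        for (;;) {
                struct mnt_ns_info info = {};

                /*
                 * Ask for an fd to the next mount namespace; the kernel
                 * skips namespaces we lack CAP_SYS_ADMIN over and fails
                 * with ENOENT at the end of the list. This is the walk
                 * that get_sequential_mnt_ns() now serves locklessly.
                 */
                int next = ioctl(fd, NS_MNT_GET_NEXT, &info);
                close(fd);
                if (next < 0)
                        break;

                printf("mnt_ns_id %llu has %u mounts\n",
                       (unsigned long long)info.mnt_ns_id, info.nr_mounts);
                fd = next;
        }
        return 0;
}
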
13 changes: 4 additions & 9 deletions fs/mount.h
@@ -20,6 +20,7 @@ struct mnt_namespace {
         unsigned int nr_mounts; /* # of mounts in the namespace */
         unsigned int pending_mounts;
         struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+        struct list_head mnt_ns_list; /* entry in the sequential list of mount namespaces */
         refcount_t passive; /* number references not pinning @mounts */
 } __randomize_layout;

@@ -160,15 +161,9 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
 }
 
 bool has_locked_children(struct mount *mnt, struct dentry *dentry);
-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
-static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
-{
-        return __lookup_next_mnt_ns(mntns, false);
-}
-static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
-{
-        return __lookup_next_mnt_ns(mntns, true);
-}
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
+                                            bool previous);
 
 static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
 {
         return container_of(ns, struct mnt_namespace, ns);
42 changes: 29 additions & 13 deletions fs/namespace.c
Expand Up @@ -82,6 +82,7 @@ static DEFINE_RWLOCK(mnt_ns_tree_lock);
 static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
 
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
+static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
 
 struct mount_kattr {
         unsigned int attr_set;
@@ -142,10 +143,19 @@ static inline void mnt_ns_tree_write_unlock(void)

 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-        struct rb_node *node;
+        struct rb_node *node, *prev;
 
         mnt_ns_tree_write_lock();
         node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+        /*
+         * If there's no previous entry, add the new namespace right
+         * after the list head; otherwise add it after its in-order
+         * predecessor.
+         */
+        prev = rb_prev(&ns->mnt_ns_tree_node);
+        if (!prev)
+                list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
+        else
+                list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
         mnt_ns_tree_write_unlock();
 
         WARN_ON_ONCE(node);
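
For intuition, a worked example (with invented ids) of why inserting behind rb_prev() is enough: the rbtree is keyed by the monotonically increasing 64-bit sequence number assigned to mount namespaces, so the in-order predecessor of a freshly added node is exactly the entry the new namespace must follow for the list to match an in-order walk of the tree:

        tree/list before:  3 <-> 5 <-> 9
        insert id 7:       rb_prev() lands on the node for 5
        list after:        3 <-> 5 <-> 7 <-> 9
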
@@ -174,6 +184,7 @@ static void mnt_ns_tree_remove(struct mnt_namespace *ns)
         if (!is_anon_ns(ns)) {
                 mnt_ns_tree_write_lock();
                 rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+                list_bidir_del_rcu(&ns->mnt_ns_list);
                 mnt_ns_tree_write_unlock();
         }
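
A note on the helper: list_bidir_del_rcu(), introduced alongside this change, is used instead of plain list_del_rcu() because the latter poisons the ->prev pointer; here the backward link has to stay usable, since lookups of the previous mount namespace walk the list backwards under RCU.
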

@@ -2086,30 +2097,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
         return &mnt->ns;
 }
 
-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
 {
-        guard(read_lock)(&mnt_ns_tree_lock);
+        guard(rcu)();
 
         for (;;) {
-                struct rb_node *node;
+                struct list_head *list;
 
                 if (previous)
-                        node = rb_prev(&mntns->mnt_ns_tree_node);
+                        list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
                 else
-                        node = rb_next(&mntns->mnt_ns_tree_node);
-                if (!node)
+                        list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
+                if (list_is_head(list, &mnt_ns_list))
                         return ERR_PTR(-ENOENT);
 
-                mntns = node_to_mnt_ns(node);
-                node = &mntns->mnt_ns_tree_node;
+                mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
+
+                /*
+                 * The last passive reference count is put with RCU
+                 * delay so accessing the mount namespace is not just
+                 * safe but all relevant members are still valid.
+                 */
                 if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
                         continue;
 
                 /*
-                 * Holding mnt_ns_tree_lock prevents the mount namespace from
-                 * being freed but it may well be on it's deathbed. We want an
-                 * active reference, not just a passive one here as we're
-                 * persisting the mount namespace.
+                 * We need an active reference count as we're persisting
+                 * the mount namespace and it might already be on its
+                 * deathbed.
                 */
                 if (!refcount_inc_not_zero(&mntns->ns.count))
                         continue;
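
The shape of get_sequential_mnt_ns() is a classic lockless-lookup pattern: walk an RCU-protected list, skip entries you may not or cannot use, and only leave the RCU section once refcount_inc_not_zero() has handed you an active reference. A minimal sketch of that pattern with a hypothetical struct thing (illustrative only, not code from this commit):

#include <linux/cleanup.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>

/* Hypothetical object living on an RCU-protected list. */
struct thing {
        struct list_head node;  /* added/removed under a writer lock */
        refcount_t count;       /* active references; 0 means dying */
};

static LIST_HEAD(things);

/* Return the first entry we manage to pin, or NULL. */
static struct thing *thing_find_live(void)
{
        struct thing *t;

        guard(rcu)();   /* readers never block list modifications */
        list_for_each_entry_rcu(t, &things, node) {
                /*
                 * The entry may already be half torn down; only a
                 * successful non-zero increment gives us a reference
                 * that stays valid after the RCU section ends.
                 */
                if (refcount_inc_not_zero(&t->count))
                        return t;
        }
        return NULL;
}
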
@@ -3926,6 +3941,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
         refcount_set(&new_ns->ns.count, 1);
         refcount_set(&new_ns->passive, 1);
         new_ns->mounts = RB_ROOT;
+        INIT_LIST_HEAD(&new_ns->mnt_ns_list);
         RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
         init_waitqueue_head(&new_ns->poll);
         new_ns->user_ns = get_user_ns(user_ns);
5 changes: 1 addition & 4 deletions fs/nsfs.c
Expand Up @@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
                 if (usize < MNT_NS_INFO_SIZE_VER0)
                         return -EINVAL;
 
-                if (previous)
-                        mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
-                else
-                        mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
+                mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
                 if (IS_ERR(mnt_ns))
                         return PTR_ERR(mnt_ns);

