Skip to content

Commit

Permalink
Merge pull request ceph#36927 from idryomov/wip-krbd-noudev
Browse files Browse the repository at this point in the history
krbd: optionally skip waiting for udev events

Reviewed-by: Jason Dillaman <[email protected]>
Reviewed-by: Sébastien Han <[email protected]>
  • Loading branch information
idryomov authored Sep 21, 2020
2 parents 24e2e66 + d2884ad commit 6827bbb
Show file tree
Hide file tree
Showing 7 changed files with 218 additions and 26 deletions.
13 changes: 13 additions & 0 deletions doc/man/8/rbd.rst
Original file line number Diff line number Diff line change
Expand Up @@ -843,12 +843,25 @@ Per mapping (block device) `rbd device map` options:
backend that the data is incompressible, disabling compression in aggressive
mode (since 5.8).

* udev - Wait for udev device manager to finish executing all matching
"add" rules and release the device before exiting (default). This option
is not passed to the kernel.

* noudev - Don't wait for udev device manager. When enabled, the device may
not be fully usable immediately on exit.

`rbd device unmap` options:

* force - Force the unmapping of a block device that is open (since 4.9). The
driver will wait for running requests to complete and then unmap; requests
sent to the driver after initiating the unmap will be failed.

* udev - Wait for udev device manager to finish executing all matching
"remove" rules and clean up after the device before exiting (default).
This option is not passed to the kernel.

* noudev - Don't wait for udev device manager.


Examples
========
Expand Down
5 changes: 5 additions & 0 deletions qa/suites/krbd/rbd-nomount/tasks/krbd_udev_netns.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
tasks:
- workunit:
clients:
all:
- rbd/krbd_udev_netns.sh
86 changes: 86 additions & 0 deletions qa/workunits/rbd/krbd_udev_netns.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env bash

set -ex

sudo ip netns add ns1
sudo ip link add veth1-ext type veth peer name veth1-int
sudo ip link set veth1-int netns ns1

sudo ip netns exec ns1 ip link set dev lo up
sudo ip netns exec ns1 ip addr add 192.168.1.2/24 dev veth1-int
sudo ip netns exec ns1 ip link set veth1-int up
sudo ip netns exec ns1 ip route add default via 192.168.1.1

sudo ip addr add 192.168.1.1/24 dev veth1-ext
sudo ip link set veth1-ext up

# Enable forwarding between the namespace and the default route
# interface and set up NAT. In case of multiple default routes,
# just pick the first one.
if [[ $(sysctl -n net.ipv4.ip_forward) -eq 0 ]]; then
sudo iptables -P FORWARD DROP
sudo sysctl -w net.ipv4.ip_forward=1
fi
IFACE="$(ip route list 0.0.0.0/0 | head -n 1 | cut -d ' ' -f 5)"
sudo iptables -A FORWARD -i veth1-ext -o "$IFACE" -j ACCEPT
sudo iptables -A FORWARD -i "$IFACE" -o veth1-ext -j ACCEPT
sudo iptables -t nat -A POSTROUTING -s 192.168.1.2 -o "$IFACE" -j MASQUERADE

rbd create --size 300 img

DEV="$(sudo rbd map img)"
mkfs.ext4 "$DEV"
sudo mount "$DEV" /mnt
sudo umount /mnt
sudo rbd unmap "$DEV"

sudo ip netns exec ns1 bash <<'EOF'
set -ex
DEV="/dev/rbd/rbd/img"
[[ ! -e "$DEV" ]]
# In a network namespace, "rbd map" maps the device and hangs waiting
# for udev add uevents. udev runs as usual (in particular creating the
# symlink which is used here because the device node is never printed),
# but the uevents it sends out never come because they don't cross
# network namespace boundaries.
set +e
timeout 30s rbd map img
RET=$?
set -e
[[ $RET -eq 124 ]]
[[ -L "$DEV" ]]
mkfs.ext4 -F "$DEV"
mount "$DEV" /mnt
umount /mnt
# In a network namespace, "rbd unmap" unmaps the device and hangs
# waiting for udev remove uevents. udev runs as usual (removing the
# symlink), but the uevents it sends out never come because they don't
# cross network namespace boundaries.
set +e
timeout 30s rbd unmap "$DEV"
RET=$?
set -e
[[ $RET -eq 124 ]]
[[ ! -e "$DEV" ]]
# Skip waiting for udev uevents with "-o noudev".
DEV="$(rbd map -o noudev img)"
mkfs.ext4 -F "$DEV"
mount "$DEV" /mnt
umount /mnt
rbd unmap -o noudev "$DEV"
EOF

rbd rm img

sudo iptables -t nat -D POSTROUTING -s 192.168.1.2 -o "$IFACE" -j MASQUERADE
sudo iptables -D FORWARD -i "$IFACE" -o veth1-ext -j ACCEPT
sudo iptables -D FORWARD -i veth1-ext -o "$IFACE" -j ACCEPT
sudo ip netns delete ns1

echo OK
35 changes: 34 additions & 1 deletion src/include/krbd.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,46 @@

#include "rados/librados.h"

/*
* Don't wait for udev add uevents in krbd_map() and udev remove
* uevents in krbd_unmap*(). Instead, make do with the respective
* kernel uevents and return as soon as they are received.
*
* systemd-udevd sends out udev uevents after it finishes processing
* the respective kernel uevents, which mostly boils down to executing
* all matching udev rules. With this flag set, on return from
* krbd_map() systemd-udevd may still be poking at the device: it
* may still be open with tools such as blkid and various ioctls to
* be run against it, none of the persistent symlinks to the device
* node may be there, etc. udev used to be responsible for creating
* the device node as well, but that has been handled by devtmpfs in
* the kernel for many years now, so the device node (as returned
* through @pdevnode) is guaranteed to be there.
*
* If set, krbd_map() and krbd_unmap*() can be invoked from any
* network namespace that is owned by the initial user namespace
* (which is a formality because things like loading kernel modules
* and creating block devices are not namespaced and require global
* privileges, i.e. capabilities in the initial user namespace).
* Otherwise, krbd_map() and krbd_unmap*() must be invoked from
* the initial network namespace.
*
* If set, krbd_unmap*() doesn't attempt to settle the udev queue
* before retrying unmap for the last time. Some EBUSY errors due
* to systemd-udevd poking at the device at the time krbd_unmap*()
* is invoked that are otherwise covered by the retry logic may be
* returned.
*/
#define KRBD_CTX_F_NOUDEV (1U << 0)

#ifdef __cplusplus
extern "C" {
#endif

struct krbd_ctx;

int krbd_create_from_context(rados_config_t cct, struct krbd_ctx **pctx);
int krbd_create_from_context(rados_config_t cct, uint32_t flags,
struct krbd_ctx **pctx);
void krbd_destroy(struct krbd_ctx *ctx);

int krbd_map(struct krbd_ctx *ctx,
Expand Down
57 changes: 47 additions & 10 deletions src/krbd.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ DEFINE_UDEV_UPTR(device) /* udev_device_uptr */
struct krbd_ctx {
CephContext *cct;
struct udev *udev;
uint32_t flags; /* KRBD_CTX_F_* */
};

struct krbd_spec {
Expand Down Expand Up @@ -380,14 +381,47 @@ class UdevMapHandler {
std::string *m_pdevnode;
};

static int do_map(struct udev *udev, const krbd_spec& spec, const string& buf,
static const char *get_event_source(const krbd_ctx *ctx)
{
if (ctx->flags & KRBD_CTX_F_NOUDEV) {
/*
* For block devices (unlike network interfaces, they don't
* carry any namespace tags), the kernel broadcasts uevents
* into all network namespaces that are owned by the initial
* user namespace. This restriction is new in 4.18: starting
* with 2.6.35 and through 4.17 the kernel broadcast uevents
* into all network namespaces, period.
*
* However, when invoked from a non-initial user namespace,
* udev_monitor_receive_device() has always ignored both kernel
* and udev uevents by virtue of requiring SCM_CREDENTIALS and
* checking that ucred->uid == 0. When UIDs and GIDs are sent to
* a process in a user namespace, they are translated according
* to that process's UID and GID mappings and, unless root in the
* user namespace is mapped to the global root, that check fails.
* Normally they show up as 65534(nobody) because the global root
* is not mapped.
*/
return "kernel";
}

/*
* Like most netlink messages, udev uevents don't cross network
* namespace boundaries and are therefore confined to the initial
* network namespace.
*/
return "udev";
}

static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf,
string *pname)
{
bool mapped;
int fds[2];
int r;

udev_monitor_uptr mon(udev_monitor_new_from_netlink(udev, "udev"));
udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
get_event_source(ctx)));
if (!mon)
return -ENOMEM;

Expand Down Expand Up @@ -473,7 +507,7 @@ static int map_image(struct krbd_ctx *ctx, const krbd_spec& spec,
if (r < 0)
return r;

return do_map(ctx->udev, spec, buf, pname);
return do_map(ctx, spec, buf, pname);
}

static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid)
Expand Down Expand Up @@ -681,13 +715,14 @@ class UdevUnmapHandler {
dev_t m_devno;
};

static int do_unmap(struct udev *udev, dev_t devno, const string& buf)
static int do_unmap(krbd_ctx *ctx, dev_t devno, const string& buf)
{
bool unmapped;
int fds[2];
int r;

udev_monitor_uptr mon(udev_monitor_new_from_netlink(udev, "udev"));
udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
get_event_source(ctx)));
if (!mon)
return -ENOMEM;

Expand All @@ -710,7 +745,8 @@ static int do_unmap(struct udev *udev, dev_t devno, const string& buf)
if (pipe2(fds, O_NONBLOCK) < 0)
return -errno;

auto unmapper = make_named_thread("unmapper", [&buf, sysfs_r_fd = fds[1]]() {
auto unmapper = make_named_thread(
"unmapper", [&buf, sysfs_r_fd = fds[1], flags = ctx->flags]() {
/*
* On final device close(), kernel sends a block change event, in
* response to which udev apparently runs blkid on the device. This
Expand All @@ -722,7 +758,7 @@ static int do_unmap(struct udev *udev, dev_t devno, const string& buf)
if (sysfs_r == -EBUSY && tries < 2) {
if (!tries) {
usleep(250 * 1000);
} else {
} else if (!(flags & KRBD_CTX_F_NOUDEV)) {
/*
* libudev does not provide the "wait until the queue is empty"
* API or the sufficient amount of primitives to build it from.
Expand Down Expand Up @@ -805,7 +841,7 @@ static int unmap_image(struct krbd_ctx *ctx, const char *devnode,
}

append_unmap_options(&buf, options);
return do_unmap(ctx->udev, wholedevno, buf);
return do_unmap(ctx, wholedevno, buf);
}

static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec,
Expand Down Expand Up @@ -837,7 +873,7 @@ static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec,
}

append_unmap_options(&buf, options);
return do_unmap(ctx->udev, devno, buf);
return do_unmap(ctx, devno, buf);
}

static bool dump_one_image(Formatter *f, TextTable *tbl,
Expand Down Expand Up @@ -955,7 +991,7 @@ static int is_mapped_image(struct udev *udev, const krbd_spec& spec,
return 0; /* not mapped */
}

extern "C" int krbd_create_from_context(rados_config_t cct,
extern "C" int krbd_create_from_context(rados_config_t cct, uint32_t flags,
struct krbd_ctx **pctx)
{
struct krbd_ctx *ctx = new struct krbd_ctx();
Expand All @@ -966,6 +1002,7 @@ extern "C" int krbd_create_from_context(rados_config_t cct,
delete ctx;
return -ENOMEM;
}
ctx->flags = flags;

*pctx = ctx;
return 0;
Expand Down
2 changes: 1 addition & 1 deletion src/test/librbd/fsx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1954,7 +1954,7 @@ create_image()
goto failed_shutdown;
}
#if defined(WITH_KRBD)
r = krbd_create_from_context(rados_cct(cluster), &krbd);
r = krbd_create_from_context(rados_cct(cluster), 0, &krbd);
if (r < 0) {
simple_err("Could not create libkrbd handle", r);
goto failed_shutdown;
Expand Down
Loading

0 comments on commit 6827bbb

Please sign in to comment.