DAOS-14073 dfuse: Move writeback caching from kernel to dfuse. (daos-stack#12729)

Have dfuse implement write-back caching rather than relying on the kernel.

Having the kernel do this means that the kernel assumes it is the single
point of truth for both file size and mtime, so it disregards any updates
from dfuse, which makes working across multiple clients very difficult.

This change removes the kernel flag that allows it to perform writeback
caching; instead, if caching is enabled at the dfuse level, dfuse
acknowledges writes to the kernel before daos/dfs has acknowledged them on
the backend. This gives better performance in the (default) writeback case
because the kernel no longer makes a setattr call to update the mtime after
every write, and it reinstates the semantics where the kernel handles
updates from other clients correctly.

The kernel writeback cache also coalesced smaller writes into larger ones,
and dfuse no longer gets the benefit of that; this is something we need to
add back into dfuse as part of future work.
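
For illustration, a minimal standalone sketch of the locking pattern this
change introduces (the names below are hypothetical stand-ins for the real
dfuse symbols such as ie_wlock, dfc_wb_cache and DFUSE_IE_WFLUSH, and plain
pthread rwlocks are assumed instead of the D_RWLOCK_* wrappers): a write
holds a shared lock while the backend I/O is in flight, and a flush acquires
and immediately releases the exclusive lock, so it blocks until every
pending write has completed.

#include <pthread.h>
#include <stdbool.h>

struct wb_inode {
	pthread_rwlock_t wlock;    /* stand-in for ie_wlock */
	bool             wb_cache; /* stand-in for dfc_wb_cache */
};

/* Write submission under write-back caching: acknowledge the write to the
 * kernel immediately and hold a shared lock until the backend write
 * completes. */
static void wb_write_submit(struct wb_inode *ie)
{
	if (ie->wb_cache)
		pthread_rwlock_rdlock(&ie->wlock);
	/* ... reply to the kernel, then start the asynchronous backend I/O ... */
}

/* Backend completion callback: drop the shared lock taken at submit time. */
static void wb_write_complete(struct wb_inode *ie)
{
	if (ie->wb_cache)
		pthread_rwlock_unlock(&ie->wlock);
}

/* Flush (getattr/read/flush/fsync/release paths): taking the exclusive lock
 * blocks until all shared holders, i.e. all pending writes, have finished;
 * it is released straight away since only the wait matters. */
static void wb_flush(struct wb_inode *ie)
{
	if (ie->wb_cache) {
		pthread_rwlock_wrlock(&ie->wlock);
		pthread_rwlock_unlock(&ie->wlock);
	}
}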

Signed-off-by: Ashley Pittman [email protected]
ashleypittman authored Apr 15, 2024
1 parent fb332b2 commit 4568c2d
Showing 6 changed files with 102 additions and 30 deletions.
18 changes: 18 additions & 0 deletions src/client/dfuse/dfuse.h
@@ -526,6 +526,7 @@ struct dfuse_cont {
double dfc_data_timeout;
bool dfc_data_otoc;
bool dfc_direct_io_disable;
bool dfc_wb_cache;

/* Set to true if the inode was allocated to this structure, so should be kept on close*/
bool dfc_save_ino;
@@ -987,6 +988,10 @@ struct dfuse_inode_entry {
/** File has been unlinked from daos */
bool ie_unlinked;

/* Lock for writes: shared locks are held during write-back writes, and the exclusive lock is
* acquired and released to flush outstanding writes for getattr, close and forget.
*/
pthread_rwlock_t ie_wlock;
/** Last file closed in this directory was read linearly. Directories only.
*
* Set on close() of a file in the directory to the value of linear_read from the fh.
@@ -998,6 +1003,19 @@
d_list_t ie_evict_entry;
};

/* Flush write-back cache writes to an inode. It does this by acquiring and then releasing an
* exclusive lock on the inode. Writes take a shared lock, so this will block until all pending
* writes are complete.
*/

#define DFUSE_IE_WFLUSH(_ie) \
do { \
if ((_ie)->ie_dfs->dfc_wb_cache && S_ISREG((_ie)->ie_stat.st_mode)) { \
D_RWLOCK_WRLOCK(&(_ie)->ie_wlock); \
D_RWLOCK_UNLOCK(&(_ie)->ie_wlock); \
} \
} while (0)

/* Lookup an inode and take a ref on it. */
static inline struct dfuse_inode_entry *
dfuse_inode_lookup(struct dfuse_info *dfuse_info, fuse_ino_t ino)
8 changes: 6 additions & 2 deletions src/client/dfuse/dfuse_core.c
@@ -608,7 +608,7 @@ dfuse_char_disabled(char *addr, size_t len)
* set.
*/
static int
dfuse_cont_get_cache(struct dfuse_cont *dfc)
dfuse_cont_get_cache(struct dfuse_info *dfuse_info, struct dfuse_cont *dfc)
{
size_t sizes[ATTR_COUNT];
char *buff;
@@ -730,6 +730,9 @@ dfuse_cont_get_cache(struct dfuse_cont *dfc)

if (have_dentry && !have_dentry_dir)
dfc->dfc_dentry_dir_timeout = dfc->dfc_dentry_timeout;

if (dfc->dfc_data_timeout != 0 && dfuse_info->di_wb_cache)
dfc->dfc_wb_cache = true;
rc = 0;
out:
D_FREE(buff);
@@ -873,7 +876,7 @@ dfuse_cont_open(struct dfuse_info *dfuse_info, struct dfuse_pool *dfp, const cha
uuid_copy(dfc->dfc_uuid, c_info.ci_uuid);

if (dfuse_info->di_caching) {
rc = dfuse_cont_get_cache(dfc);
rc = dfuse_cont_get_cache(dfuse_info, dfc);
if (rc == ENODATA) {
DFUSE_TRA_INFO(dfc, "Using default caching values");
dfuse_set_default_cont_cache_values(dfc);
@@ -1250,6 +1253,7 @@ dfuse_ie_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie)
atomic_init(&ie->ie_il_count, 0);
atomic_fetch_add_relaxed(&dfuse_info->di_inode_count, 1);
D_INIT_LIST_HEAD(&ie->ie_evict_entry);
D_RWLOCK_INIT(&ie->ie_wlock, 0);
}

void
51 changes: 40 additions & 11 deletions src/client/dfuse/dfuse_fuseops.c
@@ -62,9 +62,8 @@ dfuse_show_flags(void *handle, unsigned int cap, unsigned int want)
DFUSE_TRA_WARNING(handle, "Unknown requested flags %#x", want);
}

/* Called on filesystem init. It has the ability to both observe configuration
* options, but also to modify them. As we do not use the FUSE command line
* parsing this is where we apply tunables.
/* Called on filesystem init. It has the ability to both observe configuration options, but also to
* modify them. As we do not use the FUSE command line parsing this is where we apply tunables.
*/
static void
dfuse_fuse_init(void *arg, struct fuse_conn_info *conn)
@@ -76,8 +75,8 @@ dfuse_fuse_init(void *arg, struct fuse_conn_info *conn)
DFUSE_TRA_INFO(dfuse_info, "Proto %d %d", conn->proto_major, conn->proto_minor);

/* These are requests dfuse makes to the kernel, but are then capped by the kernel itself,
* for max_read zero means "as large as possible" which is what we want, but then dfuse
* does not know how large to pre-allocate any buffers.
* for max_read zero means "as large as possible" which is what we want, but then dfuse does
* not know how large to pre-allocate any buffers.
*/
DFUSE_TRA_INFO(dfuse_info, "max read %#x", conn->max_read);
DFUSE_TRA_INFO(dfuse_info, "max write %#x", conn->max_write);
@@ -91,16 +90,12 @@ dfuse_fuse_init(void *arg, struct fuse_conn_info *conn)
conn->want |= FUSE_CAP_READDIRPLUS;
conn->want |= FUSE_CAP_READDIRPLUS_AUTO;

conn->time_gran = 1;

if (dfuse_info->di_wb_cache)
conn->want |= FUSE_CAP_WRITEBACK_CACHE;

#ifdef FUSE_CAP_CACHE_SYMLINKS
conn->want |= FUSE_CAP_CACHE_SYMLINKS;
#endif
dfuse_show_flags(dfuse_info, conn->capable, conn->want);

conn->time_gran = 1;
conn->max_background = 16;
conn->congestion_threshold = 8;

@@ -170,6 +165,8 @@ df_ll_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
DFUSE_IE_STAT_ADD(inode, DS_GETATTR);
}

DFUSE_IE_WFLUSH(inode);

if (inode->ie_dfs->dfc_attr_timeout &&
(atomic_load_relaxed(&inode->ie_open_write_count) == 0) &&
(atomic_load_relaxed(&inode->ie_il_count) == 0)) {
@@ -209,6 +206,8 @@ df_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, int to_set,
DFUSE_IE_STAT_ADD(inode, DS_SETATTR);
}

DFUSE_IE_WFLUSH(inode);

if (inode->ie_dfs->dfs_ops->setattr)
inode->ie_dfs->dfs_ops->setattr(req, inode, attr, to_set);
else
@@ -541,6 +540,34 @@ df_ll_statfs(fuse_req_t req, fuse_ino_t ino)
DFUSE_REPLY_ERR_RAW(dfuse_info, req, rc);
}

static void
dfuse_cb_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
{
struct dfuse_obj_hdl *oh;
struct dfuse_inode_entry *inode;

D_ASSERT(fi != NULL);
oh = (struct dfuse_obj_hdl *)fi->fh;
inode = oh->doh_ie;

DFUSE_IE_WFLUSH(inode);
DFUSE_REPLY_ZERO(inode, req);
}

static void
dfuse_cb_fdatasync(fuse_req_t req, fuse_ino_t ino, int datasync, struct fuse_file_info *fi)
{
struct dfuse_obj_hdl *oh;
struct dfuse_inode_entry *inode;

D_ASSERT(fi != NULL);
oh = (struct dfuse_obj_hdl *)fi->fh;
inode = oh->doh_ie;

DFUSE_IE_WFLUSH(inode);
DFUSE_REPLY_ZERO(inode, req);
}

/* dfuse ops that are used for accessing dfs mounts */
const struct dfuse_inode_ops dfuse_dfs_ops = {
.lookup = dfuse_cb_lookup,
@@ -598,7 +625,9 @@ const struct dfuse_inode_ops dfuse_pool_ops = {
ACTION(write_buf, dfuse_cb_write, true) \
ACTION(read, dfuse_cb_read, false) \
ACTION(readlink, dfuse_cb_readlink, false) \
ACTION(ioctl, dfuse_cb_ioctl, false)
ACTION(ioctl, dfuse_cb_ioctl, false) \
ACTION(flush, dfuse_cb_flush, true) \
ACTION(fsync, dfuse_cb_fdatasync, true)

#define SET_MEMBER(member, fn, ...) ops.member = fn;

2 changes: 2 additions & 0 deletions src/client/dfuse/ops/open.c
@@ -145,6 +145,8 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)

DFUSE_TRA_DEBUG(oh, "Closing %d", oh->doh_caching);

DFUSE_IE_WFLUSH(oh->doh_ie);

if (oh->doh_readahead) {
struct dfuse_event *ev;

2 changes: 2 additions & 0 deletions src/client/dfuse/ops/read.c
@@ -186,6 +186,8 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct

ev->de_complete_cb = dfuse_cb_read_complete;

DFUSE_IE_WFLUSH(oh->doh_ie);

rc = dfs_read(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_len, &ev->de_ev);
if (rc != 0) {
D_GOTO(err, rc);
51 changes: 34 additions & 17 deletions src/client/dfuse/ops/write.c
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
@@ -10,10 +10,14 @@
static void
dfuse_cb_write_complete(struct dfuse_event *ev)
{
if (ev->de_ev.ev_error == 0)
DFUSE_REPLY_WRITE(ev->de_oh, ev->de_req, ev->de_len);
else
DFUSE_REPLY_ERR_RAW(ev->de_oh, ev->de_req, ev->de_ev.ev_error);
if (ev->de_req) {
if (ev->de_ev.ev_error == 0)
DFUSE_REPLY_WRITE(ev->de_oh, ev->de_req, ev->de_len);
else
DFUSE_REPLY_ERR_RAW(ev->de_oh, ev->de_req, ev->de_ev.ev_error);
} else {
D_RWLOCK_UNLOCK(&ev->de_oh->doh_ie->ie_wlock);
}
daos_event_fini(&ev->de_ev);
d_slab_release(ev->de_eqt->de_write_slab, ev);
}
@@ -22,15 +26,15 @@ void
dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t position,
struct fuse_file_info *fi)
{
struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh;
struct dfuse_info *dfuse_info = fuse_req_userdata(req);
const struct fuse_ctx *fc = fuse_req_ctx(req);
size_t len = fuse_buf_size(bufv);
struct fuse_bufvec ibuf = FUSE_BUFVEC_INIT(len);
struct dfuse_eq *eqt;
int rc;
struct dfuse_event *ev;
uint64_t eqt_idx;
struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh;
struct dfuse_info *dfuse_info = fuse_req_userdata(req);
size_t len = fuse_buf_size(bufv);
struct fuse_bufvec ibuf = FUSE_BUFVEC_INIT(len);
struct dfuse_eq *eqt;
int rc;
struct dfuse_event *ev;
uint64_t eqt_idx;
bool wb_cache = false;

DFUSE_IE_STAT_ADD(oh->doh_ie, DS_WRITE);

@@ -40,8 +44,13 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p

eqt = &dfuse_info->di_eqt[eqt_idx % dfuse_info->di_eq_count];

DFUSE_TRA_DEBUG(oh, "%#zx-%#zx requested flags %#x pid=%d", position, position + len - 1,
bufv->buf[0].flags, fc->pid);
if (oh->doh_ie->ie_dfs->dfc_wb_cache) {
D_RWLOCK_RDLOCK(&oh->doh_ie->ie_wlock);
wb_cache = true;
}

DFUSE_TRA_DEBUG(oh, "%#zx-%#zx requested flags %#x", position, position + len - 1,
bufv->buf[0].flags);

/* Evict the metadata cache here so the lookup doesn't return stale size/time info */
if (atomic_fetch_add_relaxed(&oh->doh_write_count, 1) == 0) {
@@ -66,7 +75,10 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p

ev->de_oh = oh;
ev->de_iov.iov_len = len;
ev->de_req = req;
if (wb_cache)
ev->de_req = 0;
else
ev->de_req = req;
ev->de_len = len;
ev->de_complete_cb = dfuse_cb_write_complete;

@@ -93,6 +105,9 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p
if (rc != 0)
D_GOTO(err, rc);

if (wb_cache)
DFUSE_REPLY_WRITE(oh, req, len);

/* Send a message to the async thread to wake it up and poll for events */
sem_post(&eqt->de_sem);

@@ -102,6 +117,8 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p
return;

err:
if (wb_cache)
D_RWLOCK_UNLOCK(&ev->de_oh->doh_ie->ie_wlock);
DFUSE_REPLY_ERR_RAW(oh, req, rc);
if (ev) {
daos_event_fini(&ev->de_ev);
