DAOS-14073 dfuse: Move writeback caching from kernel to dfuse. (daos-stack#12729)

Have dfuse implement write-back caching rather than relying on the kernel.

Having the kernel do this means that the kernel assumes it is the single
point of truth for both file size and mtime, so it disregards any updates
from dfuse, which makes working across multiple clients very difficult.

This change removes the kernel flag that allows it to perform writeback
caching; instead, if caching is enabled at the dfuse level, dfuse
acknowledges writes to the kernel before daos/dfs has acknowledged them on
the backend. This gives better performance in the (default) writeback case
because the kernel no longer makes a setattr call to update the mtime after
every write, and it reinstates the semantics where the kernel handles
updates from other clients correctly.

The kernel writeback cache also coalesced smaller writes into larger ones,
and dfuse no longer gets the benefit of that; this is something we need to
add back into dfuse as part of future work.
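
For illustration, a minimal standalone sketch of the locking pattern this
change introduces (the names below are hypothetical stand-ins for the real
dfuse symbols such as ie_wlock, dfc_wb_cache and DFUSE_IE_WFLUSH, and plain
pthread rwlocks are assumed instead of the D_RWLOCK_* wrappers): a write
holds a shared lock while the backend I/O is in flight, and a flush acquires
and immediately releases the exclusive lock, so it blocks until every
pending write has completed.

#include <pthread.h>
#include <stdbool.h>

struct wb_inode {
	pthread_rwlock_t wlock;    /* stand-in for ie_wlock */
	bool             wb_cache; /* stand-in for dfc_wb_cache */
};

/* Write submission under write-back caching: acknowledge the write to the
 * kernel immediately and hold a shared lock until the backend write
 * completes. */
static void wb_write_submit(struct wb_inode *ie)
{
	if (ie->wb_cache)
		pthread_rwlock_rdlock(&ie->wlock);
	/* ... reply to the kernel, then start the asynchronous backend I/O ... */
}

/* Backend completion callback: drop the shared lock taken at submit time. */
static void wb_write_complete(struct wb_inode *ie)
{
	if (ie->wb_cache)
		pthread_rwlock_unlock(&ie->wlock);
}

/* Flush (getattr/read/flush/fsync/release paths): taking the exclusive lock
 * blocks until all shared holders, i.e. all pending writes, have finished;
 * it is released straight away since only the wait matters. */
static void wb_flush(struct wb_inode *ie)
{
	if (ie->wb_cache) {
		pthread_rwlock_wrlock(&ie->wlock);
		pthread_rwlock_unlock(&ie->wlock);
	}
}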

Signed-off-by: Ashley Pittman [email protected]
ashleypittman authored Apr 15, 2024
1 parent fb332b2 commit 4568c2d
Showing 6 changed files with 102 additions and 30 deletions.
18 changes: 18 additions & 0 deletions src/client/dfuse/dfuse.h
@@ -526,6 +526,7 @@ struct dfuse_cont {
double dfc_data_timeout;
bool dfc_data_otoc;
bool dfc_direct_io_disable;
bool dfc_wb_cache;

/* Set to true if the inode was allocated to this structure, so should be kept on close*/
bool dfc_save_ino;
@@ -987,6 +988,10 @@ struct dfuse_inode_entry {
/** File has been unlinked from daos */
bool ie_unlinked;

/* Lock for writes: shared locks are held during write-back writes, and the exclusive lock is
* acquired and released to flush outstanding writes for getattr, close and forget.
*/
pthread_rwlock_t ie_wlock;
/** Last file closed in this directory was read linearly. Directories only.
*
* Set on close() of a file in the directory to the value of linear_read from the fh.
@@ -998,6 +1003,19 @@
d_list_t ie_evict_entry;
};

/* Flush write-back cache writes to an inode. It does this by acquiring and then releasing an
* exclusive lock on the inode. Writes take a shared lock, so this will block until all pending
* writes are complete.
*/

#define DFUSE_IE_WFLUSH(_ie) \
do { \
if ((_ie)->ie_dfs->dfc_wb_cache && S_ISREG((_ie)->ie_stat.st_mode)) { \
D_RWLOCK_WRLOCK(&(_ie)->ie_wlock); \
D_RWLOCK_UNLOCK(&(_ie)->ie_wlock); \
} \
} while (0)

/* Lookup an inode and take a ref on it. */
static inline struct dfuse_inode_entry *
dfuse_inode_lookup(struct dfuse_info *dfuse_info, fuse_ino_t ino)
8 changes: 6 additions & 2 deletions src/client/dfuse/dfuse_core.c
@@ -608,7 +608,7 @@ dfuse_char_disabled(char *addr, size_t len)
* set.
*/
static int
dfuse_cont_get_cache(struct dfuse_cont *dfc)
dfuse_cont_get_cache(struct dfuse_info *dfuse_info, struct dfuse_cont *dfc)
{
size_t sizes[ATTR_COUNT];
char *buff;
@@ -730,6 +730,9 @@ dfuse_cont_get_cache(struct dfuse_cont *dfc)

if (have_dentry && !have_dentry_dir)
dfc->dfc_dentry_dir_timeout = dfc->dfc_dentry_timeout;

if (dfc->dfc_data_timeout != 0 && dfuse_info->di_wb_cache)
dfc->dfc_wb_cache = true;
rc = 0;
out:
D_FREE(buff);
@@ -873,7 +876,7 @@ dfuse_cont_open(struct dfuse_info *dfuse_info, struct dfuse_pool *dfp, const cha
uuid_copy(dfc->dfc_uuid, c_info.ci_uuid);

if (dfuse_info->di_caching) {
rc = dfuse_cont_get_cache(dfc);
rc = dfuse_cont_get_cache(dfuse_info, dfc);
if (rc == ENODATA) {
DFUSE_TRA_INFO(dfc, "Using default caching values");
dfuse_set_default_cont_cache_values(dfc);
@@ -1250,6 +1253,7 @@ dfuse_ie_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie)
atomic_init(&ie->ie_il_count, 0);
atomic_fetch_add_relaxed(&dfuse_info->di_inode_count, 1);
D_INIT_LIST_HEAD(&ie->ie_evict_entry);
D_RWLOCK_INIT(&ie->ie_wlock, 0);
}

void
51 changes: 40 additions & 11 deletions src/client/dfuse/dfuse_fuseops.c
@@ -62,9 +62,8 @@ dfuse_show_flags(void *handle, unsigned int cap, unsigned int want)
DFUSE_TRA_WARNING(handle, "Unknown requested flags %#x", want);
}

/* Called on filesystem init. It has the ability to both observe configuration
* options, but also to modify them. As we do not use the FUSE command line
* parsing this is where we apply tunables.
/* Called on filesystem init. It has the ability to both observe configuration options, but also to
* modify them. As we do not use the FUSE command line parsing this is where we apply tunables.
*/
static void
dfuse_fuse_init(void *arg, struct fuse_conn_info *conn)
@@ -76,8 +75,8 @@ dfuse_fuse_init(void *arg, struct fuse_conn_info *conn)
DFUSE_TRA_INFO(dfuse_info, "Proto %d %d", conn->proto_major, conn->proto_minor);

/* These are requests dfuse makes to the kernel, but are then capped by the kernel itself,
* for max_read zero means "as large as possible" which is what we want, but then dfuse
* does not know how large to pre-allocate any buffers.
* for max_read zero means "as large as possible" which is what we want, but then dfuse does
* not know how large to pre-allocate any buffers.
*/
DFUSE_TRA_INFO(dfuse_info, "max read %#x", conn->max_read);
DFUSE_TRA_INFO(dfuse_info, "max write %#x", conn->max_write);
@@ -91,16 +90,12 @@ dfuse_fuse_init(void *arg, struct fuse_conn_info *conn)
conn->want |= FUSE_CAP_READDIRPLUS;
conn->want |= FUSE_CAP_READDIRPLUS_AUTO;

conn->time_gran = 1;

if (dfuse_info->di_wb_cache)
conn->want |= FUSE_CAP_WRITEBACK_CACHE;

#ifdef FUSE_CAP_CACHE_SYMLINKS
conn->want |= FUSE_CAP_CACHE_SYMLINKS;
#endif
dfuse_show_flags(dfuse_info, conn->capable, conn->want);

conn->time_gran = 1;
conn->max_background = 16;
conn->congestion_threshold = 8;

@@ -170,6 +165,8 @@ df_ll_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
DFUSE_IE_STAT_ADD(inode, DS_GETATTR);
}

DFUSE_IE_WFLUSH(inode);

if (inode->ie_dfs->dfc_attr_timeout &&
(atomic_load_relaxed(&inode->ie_open_write_count) == 0) &&
(atomic_load_relaxed(&inode->ie_il_count) == 0)) {
@@ -209,6 +206,8 @@ df_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, int to_set,
DFUSE_IE_STAT_ADD(inode, DS_SETATTR);
}

DFUSE_IE_WFLUSH(inode);

if (inode->ie_dfs->dfs_ops->setattr)
inode->ie_dfs->dfs_ops->setattr(req, inode, attr, to_set);
else
@@ -541,6 +540,34 @@ df_ll_statfs(fuse_req_t req, fuse_ino_t ino)
DFUSE_REPLY_ERR_RAW(dfuse_info, req, rc);
}

static void
dfuse_cb_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
{
struct dfuse_obj_hdl *oh;
struct dfuse_inode_entry *inode;

D_ASSERT(fi != NULL);
oh = (struct dfuse_obj_hdl *)fi->fh;
inode = oh->doh_ie;

DFUSE_IE_WFLUSH(inode);
DFUSE_REPLY_ZERO(inode, req);
}

static void
dfuse_cb_fdatasync(fuse_req_t req, fuse_ino_t ino, int datasync, struct fuse_file_info *fi)
{
struct dfuse_obj_hdl *oh;
struct dfuse_inode_entry *inode;

D_ASSERT(fi != NULL);
oh = (struct dfuse_obj_hdl *)fi->fh;
inode = oh->doh_ie;

DFUSE_IE_WFLUSH(inode);
DFUSE_REPLY_ZERO(inode, req);
}

/* dfuse ops that are used for accessing dfs mounts */
const struct dfuse_inode_ops dfuse_dfs_ops = {
.lookup = dfuse_cb_lookup,
@@ -598,7 +625,9 @@ const struct dfuse_inode_ops dfuse_pool_ops = {
ACTION(write_buf, dfuse_cb_write, true) \
ACTION(read, dfuse_cb_read, false) \
ACTION(readlink, dfuse_cb_readlink, false) \
ACTION(ioctl, dfuse_cb_ioctl, false)
ACTION(ioctl, dfuse_cb_ioctl, false) \
ACTION(flush, dfuse_cb_flush, true) \
ACTION(fsync, dfuse_cb_fdatasync, true)

#define SET_MEMBER(member, fn, ...) ops.member = fn;

2 changes: 2 additions & 0 deletions src/client/dfuse/ops/open.c
@@ -145,6 +145,8 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)

DFUSE_TRA_DEBUG(oh, "Closing %d", oh->doh_caching);

DFUSE_IE_WFLUSH(oh->doh_ie);

if (oh->doh_readahead) {
struct dfuse_event *ev;

2 changes: 2 additions & 0 deletions src/client/dfuse/ops/read.c
@@ -186,6 +186,8 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct

ev->de_complete_cb = dfuse_cb_read_complete;

DFUSE_IE_WFLUSH(oh->doh_ie);

rc = dfs_read(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_len, &ev->de_ev);
if (rc != 0) {
D_GOTO(err, rc);
51 changes: 34 additions & 17 deletions src/client/dfuse/ops/write.c
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
@@ -10,10 +10,14 @@
static void
dfuse_cb_write_complete(struct dfuse_event *ev)
{
if (ev->de_ev.ev_error == 0)
DFUSE_REPLY_WRITE(ev->de_oh, ev->de_req, ev->de_len);
else
DFUSE_REPLY_ERR_RAW(ev->de_oh, ev->de_req, ev->de_ev.ev_error);
if (ev->de_req) {
if (ev->de_ev.ev_error == 0)
DFUSE_REPLY_WRITE(ev->de_oh, ev->de_req, ev->de_len);
else
DFUSE_REPLY_ERR_RAW(ev->de_oh, ev->de_req, ev->de_ev.ev_error);
} else {
D_RWLOCK_UNLOCK(&ev->de_oh->doh_ie->ie_wlock);
}
daos_event_fini(&ev->de_ev);
d_slab_release(ev->de_eqt->de_write_slab, ev);
}
@@ -22,15 +26,15 @@ void
dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t position,
struct fuse_file_info *fi)
{
struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh;
struct dfuse_info *dfuse_info = fuse_req_userdata(req);
const struct fuse_ctx *fc = fuse_req_ctx(req);
size_t len = fuse_buf_size(bufv);
struct fuse_bufvec ibuf = FUSE_BUFVEC_INIT(len);
struct dfuse_eq *eqt;
int rc;
struct dfuse_event *ev;
uint64_t eqt_idx;
struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh;
struct dfuse_info *dfuse_info = fuse_req_userdata(req);
size_t len = fuse_buf_size(bufv);
struct fuse_bufvec ibuf = FUSE_BUFVEC_INIT(len);
struct dfuse_eq *eqt;
int rc;
struct dfuse_event *ev;
uint64_t eqt_idx;
bool wb_cache = false;

DFUSE_IE_STAT_ADD(oh->doh_ie, DS_WRITE);

@@ -40,8 +44,13 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p

eqt = &dfuse_info->di_eqt[eqt_idx % dfuse_info->di_eq_count];

DFUSE_TRA_DEBUG(oh, "%#zx-%#zx requested flags %#x pid=%d", position, position + len - 1,
bufv->buf[0].flags, fc->pid);
if (oh->doh_ie->ie_dfs->dfc_wb_cache) {
D_RWLOCK_RDLOCK(&oh->doh_ie->ie_wlock);
wb_cache = true;
}

DFUSE_TRA_DEBUG(oh, "%#zx-%#zx requested flags %#x", position, position + len - 1,
bufv->buf[0].flags);

/* Evict the metadata cache here so the lookup doesn't return stale size/time info */
if (atomic_fetch_add_relaxed(&oh->doh_write_count, 1) == 0) {
@@ -66,7 +75,10 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p

ev->de_oh = oh;
ev->de_iov.iov_len = len;
ev->de_req = req;
if (wb_cache)
ev->de_req = 0;
else
ev->de_req = req;
ev->de_len = len;
ev->de_complete_cb = dfuse_cb_write_complete;

@@ -93,6 +105,9 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p
if (rc != 0)
D_GOTO(err, rc);

if (wb_cache)
DFUSE_REPLY_WRITE(oh, req, len);

/* Send a message to the async thread to wake it up and poll for events */
sem_post(&eqt->de_sem);

@@ -102,6 +117,8 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p
return;

err:
if (wb_cache)
D_RWLOCK_UNLOCK(&ev->de_oh->doh_ie->ie_wlock);
DFUSE_REPLY_ERR_RAW(oh, req, rc);
if (ev) {
daos_event_fini(&ev->de_ev);
