Skip to content

Commit

Permalink
Merge "Write-behind for XFS"
Browse files Browse the repository at this point in the history
"XFS serializes all I/O when it sees a size-changing operation, like an
append.  This slows down ScyllaDB's writes, because files can only be
written with a concurrency of 1.  Enabling file_output_stream's write-behind
mode causes xfs to block in the reactor thread, destroying performance.

This patchset improves the situation.  We begin by shadowing xfs locking
in seastar, replicating its locking rules using seastar primitives.
Because we use the same rules, xfs will never see a situation in which
it has to block; and because seastar primitives are asynchronous, the
reactor can continue to process events.

We continue by looking ahead at queued writes; if we see several
size-changing writes, we issue a truncate() to extend the file size
(taking care to do this in a way which would not cause xfs to block).
This allows some writes to proceed in parallel, but we still serialize
periodically so we can issue the next truncate.

We also introduce an option, for users that can tolerate an inaccurate
file size while the file is being written, to maintain a "sloppy size".
When this option is enabled, seastar will speculatively extend the file
size to make room for future writes, and truncate it back when the file
is flushed or closed.

Support for ext4 is left as a future exercise.

Performance results on my desktop SSD:

   bufsize        ops    iodepth         IOPS  patch
      4096     150000          1        13614      1
      4096     150000          1        14999      2
      4096     150000          1        14550      3
      4096     150000          1        14678      4
   bufsize        ops    iodepth         IOPS  patch
      4096     150000         10        15008      1
      4096     150000         10        13301      2
      4096     150000         10        14420      3
      4096     150000         10        31475      4"

Reviewed-by: Duarte Nunes <[email protected]>
  • Loading branch information
avikivity committed Aug 8, 2016
2 parents 6a62307 + 85867f7 commit a0b481f
Show file tree
Hide file tree
Showing 5 changed files with 497 additions and 31 deletions.
2 changes: 2 additions & 0 deletions configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ def sanitize_vptr_flag(compiler):
'tests/connect_test',
'tests/chunked_fifo_test',
'tests/scollectd_test',
'tests/perf/perf_fstream',
]

apps = [
Expand Down Expand Up @@ -398,6 +399,7 @@ def have_xen():
'tests/connect_test': ['tests/connect_test.cc'] + core + libnet + boost_test_lib,
'tests/chunked_fifo_test': ['tests/chunked_fifo_test.cc'] + core,
'tests/scollectd_test': ['tests/scollectd_test.cc'] + core + boost_test_lib,
'tests/perf/perf_fstream': ['tests/perf/perf_fstream.cc'] + core,
}

warnings = [
Expand Down
115 changes: 115 additions & 0 deletions core/file-impl.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* This file is open source software, licensed to you under the terms
* of the Apache License, Version 2.0 (the "License"). See the NOTICE file
* distributed with this work for additional information regarding copyright
* ownership. You may not use this file except in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright 2016 ScyllaDB
*/

#pragma once

#include "file.hh"
#include <deque>

// file_impl subclass that carries a file descriptor (_fd).
// Only declarations appear here; the definitions are not part of this header.
// NOTE(review): most operations below carry neither `virtual` nor `override`,
// yet subclasses in this header redeclare them with `override`, so they are
// presumably virtual in file_impl -- confirm against file.hh.
class posix_file_impl : public file_impl {
public:
// File descriptor used by this object; note that it is publicly accessible.
int _fd;
// Builds the object from a file descriptor and the caller's open options.
// NOTE(review): presumably takes ownership of fd -- confirm in the definition.
posix_file_impl(int fd, file_open_options options);
virtual ~posix_file_impl() override;
// Write at offset pos, either from a single buffer/len pair or from an iovec
// list, under the given I/O priority class.
// NOTE(review): the future presumably resolves to the byte count written -- confirm.
future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc);
future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc);
// Read at offset pos, either into a single buffer/len pair or into an iovec
// list, under the given I/O priority class.
// NOTE(review): the future presumably resolves to the byte count read -- confirm.
future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc);
future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc);
// Asynchronous flush; the returned future carries no value.
future<> flush(void);
// Asynchronously produces a `struct stat`.
future<struct stat> stat(void);
// Asynchronously changes the file length to `length`.
future<> truncate(uint64_t length);
// Asynchronous discard of the range described by offset/length.
future<> discard(uint64_t offset, uint64_t length);
// Asynchronous allocation request for the range described by position/length.
virtual future<> allocate(uint64_t position, uint64_t length) override;
// Asynchronously produces the file size.
future<size_t> size(void);
// Asynchronous close; declared noexcept, so failures cannot be reported by throwing.
virtual future<> close() noexcept override;
// Returns a subscription of directory_entry values; `next` is the callback
// supplied for the entries and itself returns a future.
virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override;
private:
// NOTE(review): presumably determines the DMA alignment for _fd -- confirm in the definition.
void query_dma_alignment();
};

// The Linux XFS implementation is challenged with respect to appends: a write that
// changes EOF will be blocked by any other concurrent AIO operation on the same file,
// whether or not that other operation changes the file size. Furthermore, ftruncate()
// will also block, and be blocked by, AIO, so attempts to game the system by calling
// ftruncate() have to be done very carefully.
//
// Other Linux filesystems may have different locking rules, so this may need to be
// adjusted for them.
// posix_file_impl variant that keeps its own queue of pending operations (_q)
// together with counters of in-flight size-changing and non-size-changing
// operations, and tracks two notions of file size (committed vs. logical).
// Only declarations appear here; the scheduling logic lives in the definitions.
// Members declared before the first access specifier are private (class default).
class append_challenged_posix_file_impl : public posix_file_impl {
// File size as a result of completed kernel operations (writes and truncates)
// NOTE(review): no default member initializer -- presumably set by the constructor; confirm.
uint64_t _committed_size;
// File size as a result of seastar API calls
// NOTE(review): no default member initializer -- presumably set by the constructor; confirm.
uint64_t _logical_size;
// Pending operations
enum class opcode {
invalid,
read,
write,
truncate,
flush,
};
// One queued operation: its kind, a position and length, and the
// deferred action (`run`) that returns a future when invoked.
// NOTE(review): pos/len presumably describe the affected file range -- confirm.
struct op {
opcode type;
uint64_t pos;
size_t len;
std::function<future<> ()> run;
};
// Queue of pending operations; processed from front to end to avoid
// starvation, but can issue concurrent operations.
std::deque<op> _q;
// Counters of operations currently in flight, split by whether they change the file size.
unsigned _current_non_size_changing_ops = 0;
unsigned _current_size_changing_ops = 0;
// Set when the user closes the file
bool _done = false;
// NOTE(review): presumably mirrors file_open_options::sloppy_size -- confirm in the constructor.
bool _sloppy_size = false;
// Fulfilled when _done and I/O is complete
promise<> _completed;
private:
// Internal helpers; the descriptions below are inferred from the names only
// -- TODO confirm against the definitions.
// presumably records a new committed size
void commit_size(uint64_t size);
// presumably: would `candidate` change the file size?
bool size_changing(const op& candidate) const;
// presumably: may `candidate` be issued now, given the in-flight counters?
bool may_dispatch(const op& candidate) const;
// presumably issues `candidate`
void dispatch(op& candidate);
void optimize_queue();
void process_queue();
// presumably: closed (_done) and nothing left outstanding?
bool may_quit() const;
// Note: the parameter is named `op`, the same as the nested struct type.
void enqueue(op&& op);
public:
append_challenged_posix_file_impl(int fd, file_open_options options);
~append_challenged_posix_file_impl() override;
// Overrides of the inherited file operations, with the same signatures.
future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override;
future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override;
future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override;
future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override;
future<> flush() override;
future<struct stat> stat() override;
future<> truncate(uint64_t length) override;
future<size_t> size() override;
future<> close() noexcept override;
};

// posix_file_impl variant that overrides truncate, discard, size and allocate.
// NOTE(review): the name suggests it targets block devices -- confirm against the definitions.
class blockdev_file_impl : public posix_file_impl {
public:
// Builds the object from a file descriptor and the caller's open options.
blockdev_file_impl(int fd, file_open_options options);
// Overrides of the corresponding posix_file_impl operations.
future<> truncate(uint64_t length) override;
future<> discard(uint64_t offset, uint64_t length) override;
future<size_t> size(void) override;
virtual future<> allocate(uint64_t position, uint64_t length) override;
};

32 changes: 2 additions & 30 deletions core/file.hh
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ struct directory_entry {
/// \ref file
struct file_open_options {
    /// Allocate this much disk space when extending the file
    uint64_t extent_allocation_size_hint{uint64_t(1) << 20};
    /// Allow the file size not to track the amount of data written until a flush
    bool sloppy_size{false};
    /// Hint as to what the eventual file size will be
    uint64_t sloppy_size_hint{uint64_t(1) << 20};
};

/// \cond internal
Expand Down Expand Up @@ -122,36 +124,6 @@ public:
friend class reactor;
};

// file_impl subclass that carries a file descriptor (_fd).
// Only declarations appear here; the definitions are not part of this header.
// NOTE(review): most operations below carry neither `virtual` nor `override`,
// yet blockdev_file_impl redeclares some of them with `override`, so they are
// presumably virtual in file_impl -- confirm.
class posix_file_impl : public file_impl {
public:
// File descriptor used by this object; note that it is publicly accessible.
int _fd;
// Builds the object from a file descriptor and the caller's open options.
posix_file_impl(int fd, file_open_options options);
virtual ~posix_file_impl() override;
// Write at offset pos from a single buffer/len pair or from an iovec list,
// under the given I/O priority class.
future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc);
future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc);
// Read at offset pos into a single buffer/len pair or into an iovec list,
// under the given I/O priority class.
future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc);
future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc);
// Asynchronous flush; the returned future carries no value.
future<> flush(void);
// Asynchronously produces a `struct stat`.
future<struct stat> stat(void);
// Asynchronously changes the file length to `length`.
future<> truncate(uint64_t length);
// Asynchronous discard of the range described by offset/length.
future<> discard(uint64_t offset, uint64_t length);
// Asynchronous allocation request for the range described by position/length.
virtual future<> allocate(uint64_t position, uint64_t length) override;
// Asynchronously produces the file size.
future<size_t> size(void);
// Asynchronous close; declared noexcept, so failures cannot be reported by throwing.
virtual future<> close() noexcept override;
// Returns a subscription of directory_entry values; `next` is the callback
// supplied for the entries and itself returns a future.
virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override;
private:
// NOTE(review): presumably determines the DMA alignment for _fd -- confirm in the definition.
void query_dma_alignment();
};

// posix_file_impl variant that overrides truncate, discard, size and allocate.
// NOTE(review): the name suggests it targets block devices -- confirm against the definitions.
class blockdev_file_impl : public posix_file_impl {
public:
// Builds the object from a file descriptor and the caller's open options.
blockdev_file_impl(int fd, file_open_options options);
// Overrides of the corresponding posix_file_impl operations.
future<> truncate(uint64_t length) override;
future<> discard(uint64_t offset, uint64_t length) override;
future<size_t> size(void) override;
virtual future<> allocate(uint64_t position, uint64_t length) override;
};

/// \endcond

/// A data file on persistent storage.
Expand Down
Loading

0 comments on commit a0b481f

Please sign in to comment.