forked from scylladb/seastar
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
"XFS serializes all I/O when it sees a size-changing operation, like an append. This slows down ScyllaDB's writes, because files can only be written with a concurrency of 1. Enabling file_output_stream's write-behind mode causes xfs to block in the reactor thread, destroying performance. This patchset improves the situation. We begin by shadowing xfs locking in seastar; replicating its locking rules using seastar primitives. Because we use the same rules, xfs will never see a situation in which it has to block; and because seastar primitives are asynchronous, the reactor can continue to process events. We continue by looking ahead at queued writes; if we see several size-changing writes, we issue a truncate() to extend the file size (taking care to do this in a way which would not cause xfs to block). This allows some writes to proceed in parallel, but we still serialize periodically so we can issue the next truncate. We also introduce an option, for users that can tolerate an inaccurate file size while the file is being written, to maintain a "sloppy size". When this option is enabled, seastar will speculatively extend the file size to make room for future writes, and truncate it back when the file is flushed or closed. Support for ext4 is left as a future exercise. Performance results on my desktop SSD: bufsize ops iodepth IOPS patch 4096 150000 1 13614 1 4096 150000 1 14999 2 4096 150000 1 14550 3 4096 150000 1 14678 4 bufsize ops iodepth IOPS patch 4096 150000 10 15008 1 4096 150000 10 13301 2 4096 150000 10 14420 3 4096 150000 10 31475 4" Reviewed-by: Duarte Nunes <[email protected]>
- Loading branch information
Showing
5 changed files
with
497 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
/* | ||
* This file is open source software, licensed to you under the terms | ||
* of the Apache License, Version 2.0 (the "License"). See the NOTICE file | ||
* distributed with this work for additional information regarding copyright | ||
* ownership. You may not use this file except in compliance with the License. | ||
* | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
/* | ||
* Copyright 2016 ScyllaDB | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include "file.hh" | ||
#include <deque> | ||
|
||
// file_impl implementation that operates on a POSIX file descriptor. | ||
// | ||
// Every member function re-declared from file_impl carries `override`, so the | ||
// compiler verifies each signature against the base class instead of silently | ||
// introducing a new, unrelated function if file_impl ever changes. | ||
class posix_file_impl : public file_impl { | ||
public: | ||
    // Descriptor this object operates on; presumably the `fd` handed to the | ||
    // constructor -- confirm in the implementation file. | ||
    int _fd; | ||
    posix_file_impl(int fd, file_open_options options); | ||
    ~posix_file_impl() override; | ||
    // DMA writes at byte offset `pos`: single-buffer and iovec variants. | ||
    // The future resolves to a size_t (presumably the byte count -- confirm). | ||
    future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override; | ||
    future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override; | ||
    // DMA reads at byte offset `pos`: single-buffer and iovec variants. | ||
    future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override; | ||
    future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override; | ||
    future<> flush() override; | ||
    future<struct stat> stat() override; | ||
    future<> truncate(uint64_t length) override; | ||
    future<> discard(uint64_t offset, uint64_t length) override; | ||
    future<> allocate(uint64_t position, uint64_t length) override; | ||
    future<size_t> size() override; | ||
    future<> close() noexcept override; | ||
    // Invokes `next` for directory entries; the returned subscription is the | ||
    // caller's handle on the listing (lifetime rules not visible here). | ||
    subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override; | ||
private: | ||
    // NOTE(review): definition not visible; by its name it determines the | ||
    // DMA alignment requirements for _fd -- confirm. | ||
    void query_dma_alignment(); | ||
}; | ||
|
||
// The Linux XFS implementation is challenged wrt. append: a write that changes | ||
// eof will be blocked by any other concurrent AIO operation to the same file, whether | ||
// it changes file size or not. Furthermore, ftruncate() will also block and be blocked | ||
// by AIO, so attempts to game the system and call ftruncate() have to be done very carefully. | ||
// | ||
// Other Linux filesystems may have different locking rules, so this may need to be | ||
// adjusted for them. | ||
// posix_file_impl variant that routes reads, writes, truncates and flushes | ||
// through an internal queue (_q) of `op` records rather than declaring them | ||
// as plain pass-throughs. The scheduling rules live in the private helpers | ||
// below, whose definitions are not part of this header. | ||
class append_challenged_posix_file_impl : public posix_file_impl { | ||
// File size as a result of completed kernel operations (writes and truncates) | ||
uint64_t _committed_size; | ||
// File size as a result of seastar API calls | ||
uint64_t _logical_size; | ||
// Pending operations | ||
enum class opcode { | ||
// NOTE(review): presumably a placeholder for an op with no real work -- confirm | ||
invalid, | ||
read, | ||
write, | ||
truncate, | ||
flush, | ||
}; | ||
// One queued operation: its kind, the file range it covers, and a callable | ||
// returning future<> (presumably what actually starts the I/O -- confirm). | ||
struct op { | ||
opcode type; | ||
uint64_t pos; | ||
size_t len; | ||
std::function<future<> ()> run; | ||
}; | ||
// Queue of pending operations; processed from front to end to avoid | ||
// starvation, but can issue concurrent operations. | ||
std::deque<op> _q; | ||
// NOTE(review): presumably counts of operations currently in flight, split | ||
// by whether they change the file size -- confirm against dispatch(). | ||
unsigned _current_non_size_changing_ops = 0; | ||
unsigned _current_size_changing_ops = 0; | ||
// Set when the user closes the file | ||
bool _done = false; | ||
// NOTE(review): presumably enables the "sloppy size" mode from the commit | ||
// description (speculatively extend the file, truncate back on flush/close). | ||
bool _sloppy_size = false; | ||
// Fulfilled when _done and I/O is complete | ||
promise<> _completed; | ||
private: | ||
// Queue-management helpers. Definitions are not visible in this header; | ||
// the notes below are read from names and signatures only -- confirm. | ||
void commit_size(uint64_t size); | ||
// Predicate over a candidate op; by name, whether it would change file size. | ||
bool size_changing(const op& candidate) const; | ||
// Predicate over a candidate op; by name, whether it may be issued now. | ||
bool may_dispatch(const op& candidate) const; | ||
void dispatch(op& candidate); | ||
void optimize_queue(); | ||
void process_queue(); | ||
bool may_quit() const; | ||
// Takes the op by rvalue reference. Note the parameter name shadows the | ||
// `op` type inside the declaration. | ||
void enqueue(op&& op); | ||
public: | ||
append_challenged_posix_file_impl(int fd, file_open_options options); | ||
~append_challenged_posix_file_impl() override; | ||
// Overrides of the file operations declared by posix_file_impl. | ||
future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override; | ||
future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override; | ||
future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override; | ||
future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override; | ||
future<> flush() override; | ||
future<struct stat> stat() override; | ||
future<> truncate(uint64_t length) override; | ||
future<size_t> size() override; | ||
future<> close() noexcept override; | ||
}; | ||
|
||
// posix_file_impl specialization; by its name intended for block devices | ||
// (NOTE(review): confirm against the implementation file). It replaces the | ||
// size-related operations -- truncate, discard, size and allocate -- and | ||
// inherits everything else from posix_file_impl. | ||
class blockdev_file_impl : public posix_file_impl { | ||
public: | ||
    blockdev_file_impl(int fd, file_open_options options); | ||
    future<> truncate(uint64_t length) override; | ||
    future<> discard(uint64_t offset, uint64_t length) override; | ||
    future<size_t> size() override; | ||
    future<> allocate(uint64_t position, uint64_t length) override; | ||
}; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.