Skip to content

Commit

Permalink
Switch Linux-aio implementation away from libaio
Browse files Browse the repository at this point in the history
Drop libaio calls, start using internal implementation. Note that
the internal implementation returns errors in the system call convention
(-1 + errno) rather than the kernel convention (-error).
  • Loading branch information
avikivity committed Jan 2, 2018
1 parent 27158ce commit 499cbf8
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 48 deletions.
41 changes: 21 additions & 20 deletions apps/iotune/iotune.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
#include <memory>
#include <vector>
#include <cmath>
#include <libaio.h>
#include <wordexp.h>
#include <boost/thread/barrier.hpp>
#include <boost/filesystem.hpp>
Expand All @@ -38,13 +37,15 @@
#include <queue>
#include <fstream>
#include <future>
#include "core/linux-aio.hh"
#include "core/sstring.hh"
#include "core/posix.hh"
#include "core/resource.hh"
#include "core/aligned_buffer.hh"
#include "util/defer.hh"

using namespace seastar;
using namespace seastar::internal;
using namespace std::chrono_literals;

namespace seastar {
Expand Down Expand Up @@ -478,8 +479,8 @@ class reader {
{}

iocb* issue() {
io_prep_pread(&_iocb, _file.get(), _buf.get(), iotune_manager::rbuffer_size, _pos_distribution(random_generator) * iotune_manager::rbuffer_size);
_iocb.data = this;
_iocb = make_read_iocb(_file.get(), _pos_distribution(random_generator) * iotune_manager::rbuffer_size, _buf.get(), iotune_manager::rbuffer_size);
set_user_data(_iocb, this);
_tstamp = std::chrono::steady_clock::now();
return &_iocb;
}
Expand Down Expand Up @@ -517,10 +518,10 @@ void sanity_check_ev(const io_event& ev, size_t size) {
}

run_stats iotune_manager::issue_reads(size_t cpu_id, unsigned concurrency) {
io_context_t io_context = {0};
auto r = ::io_setup(concurrency, &io_context);
::aio_context_t io_context = {0};
auto r = io_setup(concurrency, &io_context);
assert(r >= 0);
auto destroyer = defer([&io_context] { ::io_destroy(io_context); });
auto destroyer = defer([&io_context] { io_destroy(io_context); });

unsigned finished = 0;
std::vector<io_event> ev;
Expand All @@ -541,13 +542,13 @@ run_stats iotune_manager::issue_reads(size_t cpu_id, unsigned concurrency) {
iocb_vecptr.push_back(r.issue());
}

r = ::io_submit(io_context, iocb_vecptr.size(), iocb_vecptr.data());
throw_kernel_error(r);
r = io_submit(io_context, iocb_vecptr.size(), iocb_vecptr.data());
throw_system_error_on(r == -1, "io_submit");

struct timespec timeout = {0, 0};
while (finished != concurrency) {
int n = ::io_getevents(io_context, 1, ev.size(), ev.data(), &timeout);
throw_kernel_error(n);
int n = io_getevents(io_context, 1, ev.size(), ev.data(), &timeout);
throw_system_error_on(n == -1, "io_getevents");
unsigned new_req = 0;
for (auto i = 0ul; i < size_t(n); ++i) {
sanity_check_ev(ev[i], iotune_manager::rbuffer_size);
Expand All @@ -560,7 +561,7 @@ run_stats iotune_manager::issue_reads(size_t cpu_id, unsigned concurrency) {
}
}
r = ::io_submit(io_context, new_req, iocb_vecptr.data());
throw_kernel_error(r);
throw_system_error_on(r == -1, "io_submit");
}
struct run_stats result;
for (auto&& r: fds) {
Expand All @@ -579,11 +580,11 @@ void test_file::generate(iotune_manager& iotune_manager, std::chrono::seconds ti
auto start_time = iotune_manager::clock::now();
auto latest_tstamp = start_time;

io_context_t io_context = {0};
aio_context_t io_context = {0};
auto max_aio = 128;
auto r = ::io_setup(max_aio, &io_context);
auto r = io_setup(max_aio, &io_context);
assert(r >= 0);
auto destroyer = defer([&io_context] { ::io_destroy(io_context); });
auto destroyer = defer([&io_context] { io_destroy(io_context); });

auto buf = allocate_aligned_buffer<char>(iotune_manager::wbuffer_size, 4096);
memset(buf.get(), 0, iotune_manager::wbuffer_size);
Expand All @@ -608,19 +609,19 @@ void test_file::generate(iotune_manager& iotune_manager, std::chrono::seconds ti
while (i < max_aio - aio_outstanding && pos < iotune_manager.file_size) {
auto now = std::min(iotune_manager.file_size - pos, iotune_manager::wbuffer_size);
auto& iocb = iocbs[i++];
iocb.data = buf.get();
io_prep_pwrite(&iocb, file.get(), buf.get(), now, pos);
iocb = make_write_iocb(file.get(), pos, buf.get(), now);
set_user_data(iocb, buf.get());
pos += now;
}
if (i) {
r = ::io_submit(io_context, i, iocb_vecptr.data());
throw_kernel_error(r);
r = io_submit(io_context, i, iocb_vecptr.data());
throw_system_error_on(r == -1, "io_submit");
aio_outstanding += r;
}
if (aio_outstanding) {
struct timespec timeout = {0, 0};
int n = ::io_getevents(io_context, 1, ev.size(), ev.data(), &timeout);
throw_kernel_error(n);
int n = io_getevents(io_context, 1, ev.size(), ev.data(), &timeout);
throw_system_error_on(n == -1, "io_getevents");
aio_outstanding -= n;
for (auto i = 0ul; i < size_t(n); ++i) {
if (stopped_on_error) {
Expand Down
5 changes: 2 additions & 3 deletions configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,8 +431,7 @@ def maybe_static(flag, libs):

defines = ['FMT_HEADER_ONLY']
# Include -lgcc_s before -lunwind to work around for https://savannah.nongnu.org/bugs/?48486. See https://github.com/scylladb/scylla/issues/1725.
libs = ' '.join(['-laio',
maybe_static(args.staticboost,
libs = ' '.join([maybe_static(args.staticboost,
'-lboost_program_options -lboost_system -lboost_filesystem'),
'-lstdc++ -lm',
maybe_static(args.staticboost, '-lboost_thread'),
Expand Down Expand Up @@ -497,7 +496,7 @@ def maybe_static(flag, libs):
'tests/fair_queue_test': ['tests/fair_queue_test.cc'] + core,
'apps/seawreck/seawreck': ['apps/seawreck/seawreck.cc', 'http/http_response_parser.rl'] + core + libnet,
'apps/io_tester/io_tester': ['apps/io_tester/io_tester.cc'] + core,
'apps/iotune/iotune': ['apps/iotune/iotune.cc'] + ['core/resource.cc', 'core/fsqual.cc'],
'apps/iotune/iotune': ['apps/iotune/iotune.cc'] + ['core/resource.cc', 'core/fsqual.cc', 'core/linux-aio.cc'],
'tests/blkdiscard_test': ['tests/blkdiscard_test.cc'] + core,
'tests/sstring_test': ['tests/sstring_test.cc'] + core,
'tests/unwind_test': ['tests/unwind_test.cc'] + core,
Expand Down
14 changes: 8 additions & 6 deletions core/fsqual.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

#include "posix.hh"
#include "util/defer.hh"
#include <libaio.h>
#include "core/linux-aio.hh"
#include <sys/time.h>
#include <sys/resource.h>
#include <fcntl.h>
Expand All @@ -33,6 +33,8 @@

namespace seastar {

using namespace seastar::internal;

// Runs func(), and also adds the number of context switches
// that happened during func() to counter.
template <typename Counter, typename Func>
Expand All @@ -57,9 +59,9 @@ with_ctxsw_counting(Counter& counter, Func&& func) {
}

bool filesystem_has_good_aio_support(sstring directory, bool verbose) {
io_context_t ioctx = {};
aio_context_t ioctx = {};
auto r = io_setup(1, &ioctx);
throw_kernel_error(r);
throw_system_error_on(r == -1, "io_setup");
auto cleanup = defer([&] { io_destroy(ioctx); });
auto fname = directory + "/fsqual.tmp";
auto fd = file_desc::open(fname, O_CREAT|O_EXCL|O_RDWR|O_DIRECT, 0600);
Expand All @@ -71,16 +73,16 @@ bool filesystem_has_good_aio_support(sstring directory, bool verbose) {
auto buf = aligned_alloc(4096, 4096);
for (int i = 0; i < nr; ++i) {
struct iocb cmd;
io_prep_pwrite(&cmd, fd.get(), buf, bufsize, bufsize*i);
cmd = make_write_iocb(fd.get(), bufsize*i, buf, bufsize);
struct iocb* cmds[1] = { &cmd };
with_ctxsw_counting(ctxsw, [&] {
auto r = io_submit(ioctx, 1, cmds);
throw_kernel_error(r);
throw_system_error_on(r == -1, "io_submit");
assert(r == 1);
});
struct io_event ioev;
auto n = io_getevents(ioctx, 1, 1, &ioev, nullptr);
throw_kernel_error(n);
throw_system_error_on(n == -1, "io_getevents");
assert(n == 1);
throw_kernel_error(long(ioev.res));
assert(long(ioev.res) == bufsize);
Expand Down
30 changes: 16 additions & 14 deletions core/reactor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ using namespace std::chrono_literals;

using namespace net;

using namespace internal;

seastar::logger seastar_logger("seastar");
seastar::logger sched_logger("scheduler");

Expand Down Expand Up @@ -431,7 +433,7 @@ reactor::reactor(unsigned id)
_task_queues.push_back(std::make_unique<task_queue>(1, "atexit", 1000));
_at_destroy_tasks = _task_queues.back().get();
seastar::thread_impl::init();
auto r = ::io_setup(max_aio, &_io_context);
auto r = io_setup(max_aio, &_io_context);
assert(r >= 0);
#ifdef HAVE_OSV
_timer_thread.start();
Expand Down Expand Up @@ -478,7 +480,7 @@ reactor::~reactor() {
eraser(_expired_timers);
eraser(_expired_lowres_timers);
eraser(_expired_manual_timers);
::io_destroy(_io_context);
io_destroy(_io_context);
}

// Add to an atomic integral non-atomically and returns the previous value
Expand Down Expand Up @@ -876,10 +878,10 @@ reactor::submit_io(Func prepare_io) {
iocb io;
prepare_io(io);
if (_aio_eventfd) {
io_set_eventfd(&io, _aio_eventfd->get_fd());
set_eventfd_notification(io, _aio_eventfd->get_fd());
}
auto f = pr->get_future();
io.data = pr.get();
set_user_data(io, pr.get());
_pending_aio.push_back(io);
pr.release();
if ((_io_queue->queued_requests() > 0) ||
Expand All @@ -899,15 +901,15 @@ reactor::flush_pending_aio() {
for (size_t i = 0; i < nr; ++i) {
iocbs[i] = &_pending_aio[i];
}
auto r = ::io_submit(_io_context, nr, iocbs);
auto r = io_submit(_io_context, nr, iocbs);
size_t nr_consumed;
if (r < 0) {
auto ec = -r;
if (r == -1) {
auto ec = errno;
switch (ec) {
case EAGAIN:
return did_work;
case EBADF: {
auto pr = reinterpret_cast<promise<io_event>*>(iocbs[0]->data);
auto pr = reinterpret_cast<promise<io_event>*>(get_user_data(*iocbs[0]));
try {
throw_kernel_error(r);
} catch (...) {
Expand All @@ -921,7 +923,7 @@ reactor::flush_pending_aio() {
break;
}
default:
throw_kernel_error(r);
throw_system_error_on(true, "io_submit");
abort();
}
} else {
Expand Down Expand Up @@ -965,7 +967,7 @@ bool reactor::process_io()
{
io_event ev[max_aio];
struct timespec timeout = {0, 0};
auto n = ::io_getevents(_io_context, 1, max_aio, ev, &timeout);
auto n = io_getevents(_io_context, 1, max_aio, ev, &timeout);
assert(n >= 0);
for (size_t i = 0; i < size_t(n); ++i) {
auto pr = reinterpret_cast<promise<io_event>*>(ev[i].data);
Expand Down Expand Up @@ -1147,7 +1149,7 @@ posix_file_impl::query_dma_alignment() {
future<size_t>
posix_file_impl::write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& io_priority_class) {
return engine().submit_io_write(io_priority_class, len, [fd = _fd, pos, buffer, len] (iocb& io) {
io_prep_pwrite(&io, fd, const_cast<void*>(buffer), len, pos);
io = make_write_iocb(fd, pos, const_cast<void*>(buffer), len);
}).then([] (io_event ev) {
throw_kernel_error(long(ev.res));
return make_ready_future<size_t>(size_t(ev.res));
Expand All @@ -1161,7 +1163,7 @@ posix_file_impl::write_dma(uint64_t pos, std::vector<iovec> iov, const io_priori
auto size = iov_ptr->size();
auto data = iov_ptr->data();
return engine().submit_io_write(io_priority_class, len, [fd = _fd, pos, data, size] (iocb& io) {
io_prep_pwritev(&io, fd, data, size, pos);
io = make_writev_iocb(fd, pos, data, size);
}).then([iov_ptr = std::move(iov_ptr)] (io_event ev) {
throw_kernel_error(long(ev.res));
return make_ready_future<size_t>(size_t(ev.res));
Expand All @@ -1171,7 +1173,7 @@ posix_file_impl::write_dma(uint64_t pos, std::vector<iovec> iov, const io_priori
future<size_t>
posix_file_impl::read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& io_priority_class) {
return engine().submit_io_read(io_priority_class, len, [fd = _fd, pos, buffer, len] (iocb& io) {
io_prep_pread(&io, fd, buffer, len, pos);
io = make_read_iocb(fd, pos, buffer, len);
}).then([] (io_event ev) {
throw_kernel_error(long(ev.res));
return make_ready_future<size_t>(size_t(ev.res));
Expand All @@ -1185,7 +1187,7 @@ posix_file_impl::read_dma(uint64_t pos, std::vector<iovec> iov, const io_priorit
auto size = iov_ptr->size();
auto data = iov_ptr->data();
return engine().submit_io_read(io_priority_class, len, [fd = _fd, pos, data, size] (iocb& io) {
io_prep_preadv(&io, fd, data, size, pos);
io = make_read_iocb(fd, pos, data, size);
}).then([iov_ptr = std::move(iov_ptr)] (io_event ev) {
throw_kernel_error(long(ev.res));
return make_ready_future<size_t>(size_t(ev.res));
Expand Down
4 changes: 2 additions & 2 deletions core/reactor.hh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
#include "circular_buffer_fixed_capacity.hh"
#include <memory>
#include <type_traits>
#include <libaio.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/socket.h>
Expand All @@ -55,6 +54,7 @@
#include <boost/thread/barrier.hpp>
#include <boost/container/static_vector.hpp>
#include <set>
#include "linux-aio.hh"
#include "util/eclipse.hh"
#include "future.hh"
#include "posix.hh"
Expand Down Expand Up @@ -725,7 +725,7 @@ private:
timer_set<timer<lowres_clock>, &timer<lowres_clock>::_link>::timer_list_t _expired_lowres_timers;
timer_set<timer<manual_clock>, &timer<manual_clock>::_link> _manual_timers;
timer_set<timer<manual_clock>, &timer<manual_clock>::_link>::timer_list_t _expired_manual_timers;
io_context_t _io_context;
::aio_context_t _io_context;
std::vector<struct ::iocb> _pending_aio;
semaphore _io_context_available;
io_stats _io_stats;
Expand Down
6 changes: 3 additions & 3 deletions install-dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ if [ "$ID" = "ubuntu" ] || [ "$ID" = "debian" ]; then
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get -y update
fi
apt-get install -y libaio-dev ninja-build ragel libhwloc-dev libnuma-dev libpciaccess-dev libcrypto++-dev libboost-all-dev libxml2-dev xfslibs-dev libgnutls28-dev liblz4-dev libsctp-dev gcc make libprotobuf-dev protobuf-compiler python3 libunwind8-dev systemtap-sdt-dev libtool cmake libyaml-cpp-dev
apt-get install -y ninja-build ragel libhwloc-dev libnuma-dev libpciaccess-dev libcrypto++-dev libboost-all-dev libxml2-dev xfslibs-dev libgnutls28-dev liblz4-dev libsctp-dev gcc make libprotobuf-dev protobuf-compiler python3 libunwind8-dev systemtap-sdt-dev libtool cmake libyaml-cpp-dev
if [ "$ID" = "ubuntu" ]; then
apt-get install -y g++-5
echo "g++-5 is installed for Seastar. To build Seastar with g++-5, specify '--compiler=g++-5' on configure.py"
Expand All @@ -51,7 +51,7 @@ enabled=1
enabled_metadata=1
EOF
fi
yum install -y libaio-devel hwloc-devel numactl-devel libpciaccess-devel cryptopp-devel libxml2-devel xfsprogs-devel gnutls-devel lksctp-tools-devel lz4-devel gcc make protobuf-devel protobuf-compiler libunwind-devel systemtap-sdt-devel libtool cmake yaml-cpp-devel
yum install -y hwloc-devel numactl-devel libpciaccess-devel cryptopp-devel libxml2-devel xfsprogs-devel gnutls-devel lksctp-tools-devel lz4-devel gcc make protobuf-devel protobuf-compiler libunwind-devel systemtap-sdt-devel libtool cmake yaml-cpp-devel
if [ "$ID" = "fedora" ]; then
dnf install -y gcc-c++ ninja-build ragel boost-devel libubsan libasan
else # centos
Expand All @@ -60,7 +60,7 @@ EOF
echo "Before running ninja-build, execute following command: . /etc/profile.d/scylla.sh"
fi
elif [ "$ID" = "arch" -o "$ID_LIKE" = "arch" ]; then
pacman -Sy --needed gcc ninja ragel boost boost-libs libaio hwloc numactl libpciaccess crypto++ libxml2 xfsprogs gnutls lksctp-tools lz4 make protobuf libunwind systemtap libtool cmake yaml-cpp
pacman -Sy --needed gcc ninja ragel boost boost-libs hwloc numactl libpciaccess crypto++ libxml2 xfsprogs gnutls lksctp-tools lz4 make protobuf libunwind systemtap libtool cmake yaml-cpp
else
echo "Your system ($ID) is not supported by this script. Please install dependencies manually."
exit 1
Expand Down

0 comments on commit 499cbf8

Please sign in to comment.