Skip to content

Commit

Permalink
Merge pull request ceph#2766 from dachary/wip-9408-buffer-alignment-g…
Browse files Browse the repository at this point in the history
…iant

erasure-code: buffer alignment (giant)

Reviewed-by: Samuel Just <[email protected]>
  • Loading branch information
Samuel Just committed Oct 23, 2014
2 parents 95a0ee1 + bc8fd49 commit f2178d6
Show file tree
Hide file tree
Showing 8 changed files with 316 additions and 66 deletions.
91 changes: 55 additions & 36 deletions src/common/buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -230,55 +230,60 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
};

class buffer::raw_posix_aligned : public buffer::raw {
unsigned align;
public:
raw_posix_aligned(unsigned l) : raw(l) {
raw_posix_aligned(unsigned l, unsigned _align) : raw(l) {
align = _align;
assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
#ifdef DARWIN
data = (char *) valloc (len);
#else
data = 0;
int r = ::posix_memalign((void**)(void*)&data, CEPH_PAGE_SIZE, len);
int r = ::posix_memalign((void**)(void*)&data, align, len);
if (r)
throw bad_alloc();
#endif /* DARWIN */
if (!data)
throw bad_alloc();
inc_total_alloc(len);
bdout << "raw_posix_aligned " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
bdout << "raw_posix_aligned " << this << " alloc " << (void *)data << " l=" << l << ", align=" << align << " total_alloc=" << buffer::get_total_alloc() << bendl;
}
~raw_posix_aligned() {
::free((void*)data);
dec_total_alloc(len);
bdout << "raw_posix_aligned " << this << " free " << (void *)data << " " << buffer::get_total_alloc() << bendl;
}
raw* clone_empty() {
return new raw_posix_aligned(len);
return new raw_posix_aligned(len, align);
}
};
#endif

#ifdef __CYGWIN__
class buffer::raw_hack_aligned : public buffer::raw {
unsigned align;
char *realdata;
public:
raw_hack_aligned(unsigned l) : raw(l) {
realdata = new char[len+CEPH_PAGE_SIZE-1];
unsigned off = ((unsigned)realdata) & ~CEPH_PAGE_MASK;
raw_hack_aligned(unsigned l, unsigned _align) : raw(l) {
align = _align;
realdata = new char[len+align-1];
unsigned off = ((unsigned)realdata) & (align-1);
if (off)
data = realdata + CEPH_PAGE_SIZE - off;
data = realdata + align - off;
else
data = realdata;
inc_total_alloc(len+CEPH_PAGE_SIZE-1);
inc_total_alloc(len+align-1);
//cout << "hack aligned " << (unsigned)data
//<< " in raw " << (unsigned)realdata
//<< " off " << off << std::endl;
assert(((unsigned)data & (CEPH_PAGE_SIZE-1)) == 0);
assert(((unsigned)data & (align-1)) == 0);
}
~raw_hack_aligned() {
delete[] realdata;
dec_total_alloc(len+CEPH_PAGE_SIZE-1);
dec_total_alloc(len+align-1);
}
raw* clone_empty() {
return new raw_hack_aligned(len);
return new raw_hack_aligned(len, align);
}
};
#endif
Expand Down Expand Up @@ -334,10 +339,6 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
return true;
}

bool is_page_aligned() {
return false;
}

int set_source(int fd, loff_t *off) {
int flags = SPLICE_F_NONBLOCK;
ssize_t r = safe_splice(fd, off, pipefds[1], NULL, len, flags);
Expand Down Expand Up @@ -520,14 +521,17 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
buffer::raw* buffer::create_static(unsigned len, char *buf) {
return new raw_static(buf, len);
}
buffer::raw* buffer::create_page_aligned(unsigned len) {
buffer::raw* buffer::create_aligned(unsigned len, unsigned align) {
#ifndef __CYGWIN__
//return new raw_mmap_pages(len);
return new raw_posix_aligned(len);
return new raw_posix_aligned(len, align);
#else
return new raw_hack_aligned(len);
return new raw_hack_aligned(len, align);
#endif
}
buffer::raw* buffer::create_page_aligned(unsigned len) {
return create_aligned(len, CEPH_PAGE_SIZE);
}

buffer::raw* buffer::create_zero_copy(unsigned len, int fd, int64_t *offset) {
#ifdef CEPH_HAVE_SPLICE
Expand Down Expand Up @@ -1013,22 +1017,22 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
return true;
}

bool buffer::list::is_page_aligned() const
bool buffer::list::is_aligned(unsigned align) const
{
for (std::list<ptr>::const_iterator it = _buffers.begin();
it != _buffers.end();
++it)
if (!it->is_page_aligned())
if (!it->is_aligned(align))
return false;
return true;
}

bool buffer::list::is_n_page_sized() const
bool buffer::list::is_n_align_sized(unsigned align) const
{
for (std::list<ptr>::const_iterator it = _buffers.begin();
it != _buffers.end();
++it)
if (!it->is_n_page_sized())
if (!it->is_n_align_sized(align))
return false;
return true;
}
Expand Down Expand Up @@ -1078,6 +1082,16 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
return &(*_buffers.begin()) == &(*_buffers.rbegin());
}

bool buffer::list::is_n_page_sized() const
{
return is_n_align_sized(CEPH_PAGE_SIZE);
}

bool buffer::list::is_page_aligned() const
{
return is_aligned(CEPH_PAGE_SIZE);
}

void buffer::list::rebuild()
{
ptr nb;
Expand All @@ -1101,16 +1115,16 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
_buffers.push_back(nb);
}

void buffer::list::rebuild_page_aligned()
void buffer::list::rebuild_aligned(unsigned align)
{
std::list<ptr>::iterator p = _buffers.begin();
while (p != _buffers.end()) {
// keep anything that's already page sized+aligned
if (p->is_page_aligned() && p->is_n_page_sized()) {
// keep anything that's already align and sized aligned
if (p->is_aligned(align) && p->is_n_align_sized(align)) {
/*cout << " segment " << (void*)p->c_str()
<< " offset " << ((unsigned long)p->c_str() & ~CEPH_PAGE_MASK)
<< " offset " << ((unsigned long)p->c_str() & (align - 1))
<< " length " << p->length()
<< " " << (p->length() & ~CEPH_PAGE_MASK) << " ok" << std::endl;
<< " " << (p->length() & (align - 1)) << " ok" << std::endl;
*/
++p;
continue;
Expand All @@ -1121,26 +1135,31 @@ void buffer::list::rebuild_page_aligned()
unsigned offset = 0;
do {
/*cout << " segment " << (void*)p->c_str()
<< " offset " << ((unsigned long)p->c_str() & ~CEPH_PAGE_MASK)
<< " length " << p->length() << " " << (p->length() & ~CEPH_PAGE_MASK)
<< " overall offset " << offset << " " << (offset & ~CEPH_PAGE_MASK)
<< " offset " << ((unsigned long)p->c_str() & (align - 1))
<< " length " << p->length() << " " << (p->length() & (align - 1))
<< " overall offset " << offset << " " << (offset & (align - 1))
<< " not ok" << std::endl;
*/
offset += p->length();
unaligned.push_back(*p);
_buffers.erase(p++);
} while (p != _buffers.end() &&
(!p->is_page_aligned() ||
!p->is_n_page_sized() ||
(offset & ~CEPH_PAGE_MASK)));
if (!(unaligned.is_contiguous() && unaligned._buffers.front().is_page_aligned())) {
ptr nb(buffer::create_page_aligned(unaligned._len));
(!p->is_aligned(align) ||
!p->is_n_align_sized(align) ||
(offset & (align-1))));
if (!(unaligned.is_contiguous() && unaligned._buffers.front().is_aligned(align))) {
ptr nb(buffer::create_aligned(unaligned._len, align));
unaligned.rebuild(nb);
}
_buffers.insert(p, unaligned._buffers.front());
}
}

void buffer::list::rebuild_page_aligned()
{
rebuild_aligned(CEPH_PAGE_SIZE);
}

// sort-of-like-assignment-op
void buffer::list::claim(list& bl)
{
Expand Down
71 changes: 47 additions & 24 deletions src/erasure-code/ErasureCode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
#include "common/strtol.h"
#include "ErasureCode.h"

const unsigned ErasureCode::SIMD_ALIGN = 32;

int ErasureCode::chunk_index(unsigned int i) const
{
return chunk_mapping.size() > i ? chunk_mapping[i] : i;
}

int ErasureCode::minimum_to_decode(const set<int> &want_to_read,
const set<int> &available_chunks,
set<int> *minimum)
Expand Down Expand Up @@ -54,22 +61,46 @@ int ErasureCode::minimum_to_decode_with_cost(const set<int> &want_to_read,
}

int ErasureCode::encode_prepare(const bufferlist &raw,
bufferlist *prepared) const
map<int, bufferlist> &encoded) const
{
unsigned int k = get_data_chunk_count();
unsigned int m = get_chunk_count() - k;
unsigned blocksize = get_chunk_size(raw.length());
unsigned padded_length = blocksize * k;
*prepared = raw;
if (padded_length - raw.length() > 0) {
bufferptr pad(padded_length - raw.length());
pad.zero();
prepared->push_back(pad);
unsigned pad_len = blocksize * k - raw.length();
unsigned padded_chunks = k - raw.length() / blocksize;
bufferlist prepared = raw;

if (!prepared.is_aligned(SIMD_ALIGN)) {
// splice padded chunks off to make the rebuild faster
if (padded_chunks)
prepared.splice((k - padded_chunks) * blocksize,
padded_chunks * blocksize - pad_len);
prepared.rebuild_aligned(SIMD_ALIGN);
}

for (unsigned int i = 0; i < k - padded_chunks; i++) {
bufferlist &chunk = encoded[chunk_index(i)];
chunk.substr_of(prepared, i * blocksize, blocksize);
}
unsigned coding_length = blocksize * m;
bufferptr coding(buffer::create_page_aligned(coding_length));
prepared->push_back(coding);
prepared->rebuild_page_aligned();
if (padded_chunks) {
unsigned remainder = raw.length() - (k - padded_chunks) * blocksize;
bufferptr buf(buffer::create_aligned(blocksize, SIMD_ALIGN));

raw.copy((k - padded_chunks) * blocksize, remainder, buf.c_str());
buf.zero(remainder, blocksize - remainder);
encoded[chunk_index(k-padded_chunks)].push_back(buf);

for (unsigned int i = k - padded_chunks + 1; i < k; i++) {
bufferptr buf(buffer::create_aligned(blocksize, SIMD_ALIGN));
buf.zero();
encoded[chunk_index(i)].push_back(buf);
}
}
for (unsigned int i = k; i < k + m; i++) {
bufferlist &chunk = encoded[chunk_index(i)];
chunk.push_back(buffer::create_aligned(blocksize, SIMD_ALIGN));
}

return 0;
}

Expand All @@ -80,15 +111,9 @@ int ErasureCode::encode(const set<int> &want_to_encode,
unsigned int k = get_data_chunk_count();
unsigned int m = get_chunk_count() - k;
bufferlist out;
int err = encode_prepare(in, &out);
int err = encode_prepare(in, *encoded);
if (err)
return err;
unsigned blocksize = get_chunk_size(in.length());
for (unsigned int i = 0; i < k + m; i++) {
int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
bufferlist &chunk = (*encoded)[chunk_index];
chunk.substr_of(out, i * blocksize, blocksize);
}
encode_chunks(want_to_encode, encoded);
for (unsigned int i = 0; i < k + m; i++) {
if (want_to_encode.count(i) == 0)
Expand Down Expand Up @@ -128,11 +153,11 @@ int ErasureCode::decode(const set<int> &want_to_read,
unsigned blocksize = (*chunks.begin()).second.length();
for (unsigned int i = 0; i < k + m; i++) {
if (chunks.find(i) == chunks.end()) {
bufferptr ptr(buffer::create_page_aligned(blocksize));
bufferptr ptr(buffer::create_aligned(blocksize, SIMD_ALIGN));
(*decoded)[i].push_front(ptr);
} else {
(*decoded)[i] = chunks.find(i)->second;
(*decoded)[i].rebuild_page_aligned();
(*decoded)[i].rebuild_aligned(SIMD_ALIGN);
}
}
return decode_chunks(want_to_read, chunks, decoded);
Expand Down Expand Up @@ -223,15 +248,13 @@ int ErasureCode::decode_concat(const map<int, bufferlist> &chunks,
set<int> want_to_read;

for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
int chunk = chunk_mapping.size() > i ? chunk_mapping[i] : i;
want_to_read.insert(chunk);
want_to_read.insert(chunk_index(i));
}
map<int, bufferlist> decoded_map;
int r = decode(want_to_read, chunks, &decoded_map);
if (r == 0) {
for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
int chunk = chunk_mapping.size() > i ? chunk_mapping[i] : i;
decoded->claim_append(decoded_map[chunk]);
decoded->claim_append(decoded_map[chunk_index(i)]);
}
}
return r;
Expand Down
8 changes: 7 additions & 1 deletion src/erasure-code/ErasureCode.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ namespace ceph {

class ErasureCode : public ErasureCodeInterface {
public:
static const unsigned SIMD_ALIGN;

vector<int> chunk_mapping;

virtual ~ErasureCode() {}
Expand All @@ -46,7 +48,8 @@ namespace ceph {
const map<int, int> &available,
set<int> *minimum);

int encode_prepare(const bufferlist &raw, bufferlist *prepared) const;
int encode_prepare(const bufferlist &raw,
map<int, bufferlist> &encoded) const;

virtual int encode(const set<int> &want_to_encode,
const bufferlist &in,
Expand Down Expand Up @@ -85,6 +88,9 @@ namespace ceph {

virtual int decode_concat(const map<int, bufferlist> &chunks,
bufferlist *decoded);

private:
int chunk_index(unsigned int i) const;
};
}

Expand Down
Loading

0 comments on commit f2178d6

Please sign in to comment.