Skip to content

Commit 53a38e5

Browse files
alkis authored and pwnall committed Jan 5, 2019
Reduce number of allocations when compressing and simplify the code.
Before, we were allocating at least once: twice with a large table and thrice when we used a scratch buffer. With this approach we always allocate once.

name                     old speed       new speed       delta
BM_UFlat/0  [html    ]   2.45GB/s ± 0%   2.45GB/s ± 0%   -0.13%  (p=0.000 n=11+11)
BM_UFlat/1  [urls    ]   1.19GB/s ± 0%   1.22GB/s ± 0%   +2.48%  (p=0.000 n=11+11)
BM_UFlat/2  [jpg     ]   17.2GB/s ± 2%   17.3GB/s ± 1%   ~       (p=0.193 n=11+11)
BM_UFlat/3  [jpg_200 ]   1.52GB/s ± 0%   1.51GB/s ± 0%   -0.78%  (p=0.000 n=10+9)
BM_UFlat/4  [pdf     ]   12.5GB/s ± 1%   12.5GB/s ± 1%   ~       (p=0.881 n=9+9)
BM_UFlat/5  [html4   ]   1.86GB/s ± 0%   1.86GB/s ± 0%   ~       (p=0.123 n=11+11)
BM_UFlat/6  [txt1    ]   793MB/s ± 0%    799MB/s ± 0%    +0.78%  (p=0.000 n=11+9)
BM_UFlat/7  [txt2    ]   739MB/s ± 0%    744MB/s ± 0%    +0.77%  (p=0.000 n=11+11)
BM_UFlat/8  [txt3    ]   839MB/s ± 0%    845MB/s ± 0%    +0.71%  (p=0.000 n=11+11)
BM_UFlat/9  [txt4    ]   678MB/s ± 0%    685MB/s ± 0%    +1.01%  (p=0.000 n=11+11)
BM_UFlat/10 [pb      ]   3.08GB/s ± 0%   3.12GB/s ± 0%   +1.21%  (p=0.000 n=11+11)
BM_UFlat/11 [gaviota ]   975MB/s ± 0%    976MB/s ± 0%    +0.11%  (p=0.000 n=11+11)
BM_UFlat/12 [cp      ]   1.73GB/s ± 1%   1.74GB/s ± 1%   +0.46%  (p=0.010 n=11+11)
BM_UFlat/13 [c       ]   1.53GB/s ± 0%   1.53GB/s ± 0%   ~       (p=0.987 n=11+10)
BM_UFlat/14 [lsp     ]   1.65GB/s ± 0%   1.63GB/s ± 1%   -1.04%  (p=0.000 n=11+11)
BM_UFlat/15 [xls     ]   1.08GB/s ± 0%   1.15GB/s ± 0%   +6.12%  (p=0.000 n=10+11)
BM_UFlat/16 [xls_200 ]   944MB/s ± 0%    920MB/s ± 3%    -2.51%  (p=0.000 n=9+11)
BM_UFlat/17 [bin     ]   1.86GB/s ± 0%   1.87GB/s ± 0%   +0.68%  (p=0.000 n=10+11)
BM_UFlat/18 [bin_200 ]   1.91GB/s ± 3%   1.92GB/s ± 5%   ~       (p=0.356 n=11+11)
BM_UFlat/19 [sum     ]   1.31GB/s ± 0%   1.40GB/s ± 0%   +6.53%  (p=0.000 n=11+11)
BM_UFlat/20 [man     ]   1.42GB/s ± 0%   1.42GB/s ± 0%   +0.33%  (p=0.000 n=10+10)
1 parent df5548c commit 53a38e5

File tree

3 files changed

+55
-62
lines changed

3 files changed

+55
-62
lines changed
 

‎snappy-internal.h

+12-5
Original file line number | Diff line number | Diff line change
@@ -36,19 +36,26 @@
3636
namespace snappy {
3737
namespace internal {
3838

39+
// Working memory performs a single allocation to hold all scratch space
40+
// required for compression.
3941
class WorkingMemory {
4042
public:
41-
WorkingMemory() : large_table_(NULL) { }
42-
~WorkingMemory() { delete[] large_table_; }
43+
explicit WorkingMemory(size_t input_size);
44+
~WorkingMemory();
4345

4446
// Allocates and clears a hash table using memory in "*this",
4547
// stores the number of buckets in "*table_size" and returns a pointer to
4648
// the base of the hash table.
47-
uint16* GetHashTable(size_t input_size, int* table_size);
49+
uint16* GetHashTable(size_t fragment_size, int* table_size) const;
50+
char* GetScratchInput() const { return input_; }
51+
char* GetScratchOutput() const { return output_; }
4852

4953
private:
50-
uint16 small_table_[1<<10]; // 2KB
51-
uint16* large_table_; // Allocated only when needed
54+
char* mem_; // the allocated memory, never nullptr
55+
size_t size_; // the size of the allocated memory, never 0
56+
uint16* table_; // the pointer to the hashtable
57+
char* input_; // the pointer to the input scratch buffer
58+
char* output_; // the pointer to the output scratch buffer
5259

5360
// No copying
5461
WorkingMemory(const WorkingMemory&);

‎snappy.cc

+42-56
Original file line number | Diff line number | Diff line change
@@ -418,31 +418,41 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
418418
}
419419
}
420420

421-
namespace internal {
422-
uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
423-
// Use smaller hash table when input.size() is smaller, since we
424-
// fill the table, incurring O(hash table size) overhead for
425-
// compression, and if the input is short, we won't need that
426-
// many hash table entries anyway.
421+
namespace {
422+
uint32 CalculateTableSize(uint32 input_size) {
427423
assert(kMaxHashTableSize >= 256);
428-
size_t htsize = 256;
429-
while (htsize < kMaxHashTableSize && htsize < input_size) {
430-
htsize <<= 1;
424+
if (input_size > kMaxHashTableSize) {
425+
return kMaxHashTableSize;
431426
}
432-
433-
uint16* table;
434-
if (htsize <= ARRAYSIZE(small_table_)) {
435-
table = small_table_;
436-
} else {
437-
if (large_table_ == NULL) {
438-
large_table_ = new uint16[kMaxHashTableSize];
439-
}
440-
table = large_table_;
427+
if (input_size < 256) {
428+
return 256;
441429
}
430+
return 1u << (32 - __builtin_clz(input_size - 1));
431+
}
432+
} // namespace
442433

434+
namespace internal {
435+
WorkingMemory::WorkingMemory(size_t input_size) {
436+
const size_t max_fragment_size = std::min(input_size, kBlockSize);
437+
const size_t table_size = CalculateTableSize(max_fragment_size);
438+
size_ = table_size * sizeof(*table_) + max_fragment_size +
439+
MaxCompressedLength(max_fragment_size);
440+
mem_ = std::allocator<char>().allocate(size_);
441+
table_ = reinterpret_cast<uint16*>(mem_);
442+
input_ = mem_ + table_size * sizeof(*table_);
443+
output_ = input_ + max_fragment_size;
444+
}
445+
446+
WorkingMemory::~WorkingMemory() {
447+
std::allocator<char>().deallocate(mem_, size_);
448+
}
449+
450+
uint16* WorkingMemory::GetHashTable(size_t fragment_size,
451+
int* table_size) const {
452+
const size_t htsize = CalculateTableSize(fragment_size);
453+
memset(table_, 0, htsize * sizeof(*table_));
443454
*table_size = htsize;
444-
memset(table, 0, htsize * sizeof(*table));
445-
return table;
455+
return table_;
446456
}
447457
} // end namespace internal
448458

@@ -942,17 +952,6 @@ bool GetUncompressedLength(Source* source, uint32* result) {
942952
return decompressor.ReadUncompressedLength(result);
943953
}
944954

945-
struct Deleter {
946-
Deleter() : size_(0) {}
947-
explicit Deleter(size_t size) : size_(size) {}
948-
949-
void operator()(char* ptr) const {
950-
std::allocator<char>().deallocate(ptr, size_);
951-
}
952-
953-
size_t size_;
954-
};
955-
956955
size_t Compress(Source* reader, Sink* writer) {
957956
size_t written = 0;
958957
size_t N = reader->Available();
@@ -962,9 +961,7 @@ size_t Compress(Source* reader, Sink* writer) {
962961
writer->Append(ulength, p-ulength);
963962
written += (p - ulength);
964963

965-
internal::WorkingMemory wmem;
966-
std::unique_ptr<char, Deleter> scratch;
967-
std::unique_ptr<char, Deleter> scratch_output;
964+
internal::WorkingMemory wmem(N);
968965

969966
while (N > 0) {
970967
// Get next block to compress (without copying if possible)
@@ -980,26 +977,19 @@ size_t Compress(Source* reader, Sink* writer) {
980977
pending_advance = num_to_read;
981978
fragment_size = num_to_read;
982979
} else {
983-
// Read into scratch buffer
984-
if (scratch == NULL) {
985-
// If this is the last iteration, we want to allocate N bytes
986-
// of space, otherwise the max possible kBlockSize space.
987-
// num_to_read contains exactly the correct value
988-
scratch = {
989-
std::allocator<char>().allocate(num_to_read), Deleter(num_to_read)};
990-
}
991-
memcpy(scratch.get(), fragment, bytes_read);
980+
char* scratch = wmem.GetScratchInput();
981+
memcpy(scratch, fragment, bytes_read);
992982
reader->Skip(bytes_read);
993983

994984
while (bytes_read < num_to_read) {
995985
fragment = reader->Peek(&fragment_size);
996986
size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
997-
memcpy(scratch.get() + bytes_read, fragment, n);
987+
memcpy(scratch + bytes_read, fragment, n);
998988
bytes_read += n;
999989
reader->Skip(n);
1000990
}
1001991
assert(bytes_read == num_to_read);
1002-
fragment = scratch.get();
992+
fragment = scratch;
1003993
fragment_size = num_to_read;
1004994
}
1005995
assert(fragment_size == num_to_read);
@@ -1013,17 +1003,13 @@ size_t Compress(Source* reader, Sink* writer) {
10131003

10141004
// Need a scratch buffer for the output, in case the byte sink doesn't
10151005
// have room for us directly.
1016-
if (scratch_output == NULL) {
1017-
scratch_output =
1018-
{std::allocator<char>().allocate(max_output), Deleter(max_output)};
1019-
} else {
1020-
// Since we encode kBlockSize regions followed by a region
1021-
// which is <= kBlockSize in length, a previously allocated
1022-
// scratch_output[] region is big enough for this iteration.
1023-
}
1024-
char* dest = writer->GetAppendBuffer(max_output, scratch_output.get());
1025-
char* end = internal::CompressFragment(fragment, fragment_size,
1026-
dest, table, table_size);
1006+
1007+
// Since we encode kBlockSize regions followed by a region
1008+
// which is <= kBlockSize in length, a previously allocated
1009+
// scratch_output[] region is big enough for this iteration.
1010+
char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
1011+
char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
1012+
table_size);
10271013
writer->Append(dest, end - dest);
10281014
written += (end - dest);
10291015

‎snappy_unittest.cc

+1-1
Original file line number | Diff line number | Diff line change
@@ -445,7 +445,7 @@ static void VerifyNonBlockedCompression(const string& input) {
445445
Varint::Append32(&prefix, input.size());
446446

447447
// Setup compression table
448-
snappy::internal::WorkingMemory wmem;
448+
snappy::internal::WorkingMemory wmem(input.size());
449449
int table_size;
450450
uint16* table = wmem.GetHashTable(input.size(), &table_size);
451451

0 commit comments

Comments
 (0)
Please sign in to comment.