Reduce number of allocations when compressing and simplify the code.

alkis · pwnall · commit 53a38e5e339d · 2019-01-04T19:07:49.000-08:00
Previously we allocated at least once, twice when the large hash table
was needed, and three times when a scratch buffer was used as well.
With this approach we always allocate exactly once.

  name                                          old speed               new speed               delta
  BM_UFlat/0      [html             ]           2.45GB/s ± 0%           2.45GB/s ± 0%   -0.13%        (p=0.000 n=11+11)
  BM_UFlat/1      [urls             ]           1.19GB/s ± 0%           1.22GB/s ± 0%   +2.48%        (p=0.000 n=11+11)
  BM_UFlat/2      [jpg              ]           17.2GB/s ± 2%           17.3GB/s ± 1%     ~           (p=0.193 n=11+11)
  BM_UFlat/3      [jpg_200          ]           1.52GB/s ± 0%           1.51GB/s ± 0%   -0.78%         (p=0.000 n=10+9)
  BM_UFlat/4      [pdf              ]           12.5GB/s ± 1%           12.5GB/s ± 1%     ~             (p=0.881 n=9+9)
  BM_UFlat/5      [html4            ]           1.86GB/s ± 0%           1.86GB/s ± 0%     ~           (p=0.123 n=11+11)
  BM_UFlat/6      [txt1             ]            793MB/s ± 0%            799MB/s ± 0%   +0.78%         (p=0.000 n=11+9)
  BM_UFlat/7      [txt2             ]            739MB/s ± 0%            744MB/s ± 0%   +0.77%        (p=0.000 n=11+11)
  BM_UFlat/8      [txt3             ]            839MB/s ± 0%            845MB/s ± 0%   +0.71%        (p=0.000 n=11+11)
  BM_UFlat/9      [txt4             ]            678MB/s ± 0%            685MB/s ± 0%   +1.01%        (p=0.000 n=11+11)
  BM_UFlat/10     [pb               ]           3.08GB/s ± 0%           3.12GB/s ± 0%   +1.21%        (p=0.000 n=11+11)
  BM_UFlat/11     [gaviota          ]            975MB/s ± 0%            976MB/s ± 0%   +0.11%        (p=0.000 n=11+11)
  BM_UFlat/12     [cp               ]           1.73GB/s ± 1%           1.74GB/s ± 1%   +0.46%        (p=0.010 n=11+11)
  BM_UFlat/13     [c                ]           1.53GB/s ± 0%           1.53GB/s ± 0%     ~           (p=0.987 n=11+10)
  BM_UFlat/14     [lsp              ]           1.65GB/s ± 0%           1.63GB/s ± 1%   -1.04%        (p=0.000 n=11+11)
  BM_UFlat/15     [xls              ]           1.08GB/s ± 0%           1.15GB/s ± 0%   +6.12%        (p=0.000 n=10+11)
  BM_UFlat/16     [xls_200          ]            944MB/s ± 0%            920MB/s ± 3%   -2.51%         (p=0.000 n=9+11)
  BM_UFlat/17     [bin              ]           1.86GB/s ± 0%           1.87GB/s ± 0%   +0.68%        (p=0.000 n=10+11)
  BM_UFlat/18     [bin_200          ]           1.91GB/s ± 3%           1.92GB/s ± 5%     ~           (p=0.356 n=11+11)
  BM_UFlat/19     [sum              ]           1.31GB/s ± 0%           1.40GB/s ± 0%   +6.53%        (p=0.000 n=11+11)
  BM_UFlat/20     [man              ]           1.42GB/s ± 0%           1.42GB/s ± 0%   +0.33%        (p=0.000 n=10+10)
diff --git a/snappy-internal.h b/snappy-internal.h
@@ -36,19 +36,26 @@
 namespace snappy {
 namespace internal {
 
+// Working memory performs a single allocation to hold all scratch space
+// required for compression.
 class WorkingMemory {
  public:
-  WorkingMemory() : large_table_(NULL) { }
-  ~WorkingMemory() { delete[] large_table_; }
+  explicit WorkingMemory(size_t input_size);
+  ~WorkingMemory();
 
   // Allocates and clears a hash table using memory in "*this",
   // stores the number of buckets in "*table_size" and returns a pointer to
   // the base of the hash table.
-  uint16* GetHashTable(size_t input_size, int* table_size);
+  uint16* GetHashTable(size_t fragment_size, int* table_size) const;
+  char* GetScratchInput() const { return input_; }
+  char* GetScratchOutput() const { return output_; }
 
  private:
-  uint16 small_table_[1<<10];    // 2KB
-  uint16* large_table_;          // Allocated only when needed
+  char* mem_;      // the allocated memory, never nullptr
+  size_t size_;    // the size of the allocated memory, never 0
+  uint16* table_;  // the pointer to the hashtable
+  char* input_;    // the pointer to the input scratch buffer
+  char* output_;   // the pointer to the output scratch buffer
 
   // No copying
   WorkingMemory(const WorkingMemory&);
diff --git a/snappy.cc b/snappy.cc
@@ -418,31 +418,41 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
   }
 }
 
-namespace internal {
-uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
-  // Use smaller hash table when input.size() is smaller, since we
-  // fill the table, incurring O(hash table size) overhead for
-  // compression, and if the input is short, we won't need that
-  // many hash table entries anyway.
+namespace {
+uint32 CalculateTableSize(uint32 input_size) {
   assert(kMaxHashTableSize >= 256);
-  size_t htsize = 256;
-  while (htsize < kMaxHashTableSize && htsize < input_size) {
-    htsize <<= 1;
+  if (input_size > kMaxHashTableSize) {
+    return kMaxHashTableSize;
   }
-
-  uint16* table;
-  if (htsize <= ARRAYSIZE(small_table_)) {
-    table = small_table_;
-  } else {
-    if (large_table_ == NULL) {
-      large_table_ = new uint16[kMaxHashTableSize];
-    }
-    table = large_table_;
+  if (input_size < 256) {
+    return 256;
   }
+  return 1u << (32 - __builtin_clz(input_size - 1));
+}
+}  // namespace
 
+namespace internal {
+WorkingMemory::WorkingMemory(size_t input_size) {
+  const size_t max_fragment_size = std::min(input_size, kBlockSize);
+  const size_t table_size = CalculateTableSize(max_fragment_size);
+  size_ = table_size * sizeof(*table_) + max_fragment_size +
+          MaxCompressedLength(max_fragment_size);
+  mem_ = std::allocator<char>().allocate(size_);
+  table_ = reinterpret_cast<uint16*>(mem_);
+  input_ = mem_ + table_size * sizeof(*table_);
+  output_ = input_ + max_fragment_size;
+}
+
+WorkingMemory::~WorkingMemory() {
+  std::allocator<char>().deallocate(mem_, size_);
+}
+
+uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+                                    int* table_size) const {
+  const size_t htsize = CalculateTableSize(fragment_size);
+  memset(table_, 0, htsize * sizeof(*table_));
   *table_size = htsize;
-  memset(table, 0, htsize * sizeof(*table));
-  return table;
+  return table_;
 }
 }  // end namespace internal
 
@@ -942,17 +952,6 @@ bool GetUncompressedLength(Source* source, uint32* result) {
   return decompressor.ReadUncompressedLength(result);
 }
 
-struct Deleter {
-  Deleter() : size_(0) {}
-  explicit Deleter(size_t size) : size_(size) {}
-
-  void operator()(char* ptr) const {
-    std::allocator<char>().deallocate(ptr, size_);
-  }
-
-  size_t size_;
-};
-
 size_t Compress(Source* reader, Sink* writer) {
   size_t written = 0;
   size_t N = reader->Available();
@@ -962,9 +961,7 @@ size_t Compress(Source* reader, Sink* writer) {
   writer->Append(ulength, p-ulength);
   written += (p - ulength);
 
-  internal::WorkingMemory wmem;
-  std::unique_ptr<char, Deleter> scratch;
-  std::unique_ptr<char, Deleter> scratch_output;
+  internal::WorkingMemory wmem(N);
 
   while (N > 0) {
     // Get next block to compress (without copying if possible)
@@ -980,26 +977,19 @@ size_t Compress(Source* reader, Sink* writer) {
       pending_advance = num_to_read;
       fragment_size = num_to_read;
     } else {
-      // Read into scratch buffer
-      if (scratch == NULL) {
-        // If this is the last iteration, we want to allocate N bytes
-        // of space, otherwise the max possible kBlockSize space.
-        // num_to_read contains exactly the correct value
-        scratch = {
-            std::allocator<char>().allocate(num_to_read), Deleter(num_to_read)};
-      }
-      memcpy(scratch.get(), fragment, bytes_read);
+      char* scratch = wmem.GetScratchInput();
+      memcpy(scratch, fragment, bytes_read);
       reader->Skip(bytes_read);
 
       while (bytes_read < num_to_read) {
         fragment = reader->Peek(&fragment_size);
         size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
-        memcpy(scratch.get() + bytes_read, fragment, n);
+        memcpy(scratch + bytes_read, fragment, n);
         bytes_read += n;
         reader->Skip(n);
       }
       assert(bytes_read == num_to_read);
-      fragment = scratch.get();
+      fragment = scratch;
       fragment_size = num_to_read;
     }
     assert(fragment_size == num_to_read);
@@ -1013,17 +1003,13 @@ size_t Compress(Source* reader, Sink* writer) {
 
     // Need a scratch buffer for the output, in case the byte sink doesn't
     // have room for us directly.
-    if (scratch_output == NULL) {
-      scratch_output =
-          {std::allocator<char>().allocate(max_output), Deleter(max_output)};
-    } else {
-      // Since we encode kBlockSize regions followed by a region
-      // which is <= kBlockSize in length, a previously allocated
-      // scratch_output[] region is big enough for this iteration.
-    }
-    char* dest = writer->GetAppendBuffer(max_output, scratch_output.get());
-    char* end = internal::CompressFragment(fragment, fragment_size,
-                                           dest, table, table_size);
+
+    // The output scratch buffer owned by wmem was sized up front for the
+    // largest fragment we can encode (at most kBlockSize bytes of input),
+    // so it is big enough for this and every other iteration.
+    char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+    char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+                                           table_size);
     writer->Append(dest, end - dest);
     written += (end - dest);
 
diff --git a/snappy_unittest.cc b/snappy_unittest.cc
@@ -445,7 +445,7 @@ static void VerifyNonBlockedCompression(const string& input) {
   Varint::Append32(&prefix, input.size());
 
   // Setup compression table
-  snappy::internal::WorkingMemory wmem;
+  snappy::internal::WorkingMemory wmem(input.size());
   int table_size;
   uint16* table = wmem.GetHashTable(input.size(), &table_size);