diff --git a/NOTICE.txt b/NOTICE.txt
index 9d1757c46b08..74637c84a77d 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -89,6 +89,15 @@ and decompression library written by Matthew J. Francis. It can be obtained at:
* HOMEPAGE:
* https://code.google.com/p/jbzip2/
+This product contains a modified portion of 'libdivsufsort', a C API library to construct
+the suffix array and the Burrows-Wheeler transformed string for any input string of
+a constant-size alphabet written by Yuta Mori. It can be obtained at:
+
+ * LICENSE:
+ * license/LICENSE.libdivsufsort.txt (MIT License)
+ * HOMEPAGE:
+ * https://code.google.com/p/libdivsufsort/
+
This product optionally depends on 'JZlib', a re-implementation of zlib in
pure Java, which can be obtained at:
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2BitWriter.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2BitWriter.java
new file mode 100644
index 000000000000..080b6aef5143
--- /dev/null
+++ b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2BitWriter.java
@@ -0,0 +1,109 @@
+/*
+ * Copyright 2014 The Netty Project
+ *
+ * The Netty Project licenses this file to you under the Apache License,
+ * version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package io.netty.handler.codec.compression;
+
+import io.netty.buffer.ByteBuf;
+
+/**
+ * A bit writer that allows the writing of single bit booleans, unary numbers, bit strings
+ * of arbitrary length (up to 24 bits), and bit aligned 32-bit integers. A single byte at a
+ * time is written to the {@link ByteBuf} when sufficient bits have been accumulated.
+ */
+final class Bzip2BitWriter {
+ /**
+ * A buffer of bits waiting to be written to the output stream.
+ */
+ private int bitBuffer;
+
+ /**
+ * The number of bits currently buffered in {@link #bitBuffer}.
+ */
+ private int bitCount;
+
+ /**
+ * Writes up to 24 bits to the output {@link ByteBuf}.
+ * @param count The number of bits to write (maximum {@code 24}, because the {@link #bitBuffer}
+ * is {@code int} and it can store up to {@code 8} bits before calling)
+ * @param value The bits to write
+ */
+ void writeBits(ByteBuf out, final int count, final int value) {
+ if (count < 0 || count > 24) {
+ throw new IllegalArgumentException("count: " + count + " (expected: 0-24)");
+ }
+ int bitCount = this.bitCount;
+ int bitBuffer = this.bitBuffer | (value << (32 - count)) >>> bitCount;
+ bitCount += count;
+
+ while (bitCount >= 8) {
+ out.writeByte(bitBuffer >>> 24);
+ bitBuffer <<= 8;
+ bitCount -= 8;
+ }
+ this.bitBuffer = bitBuffer;
+ this.bitCount = bitCount;
+ }
+
+ /**
+ * Writes a single bit to the output {@link ByteBuf}.
+ * @param value The bit to write
+ */
+ void writeBoolean(ByteBuf out, final boolean value) {
+ int bitCount = this.bitCount + 1;
+ int bitBuffer = this.bitBuffer | (value ? 1 : 0) << (32 - bitCount);
+
+ if (bitCount == 8) {
+ out.writeByte(bitBuffer >>> 24);
+ bitBuffer = 0;
+ bitCount = 0;
+ }
+ this.bitBuffer = bitBuffer;
+ this.bitCount = bitCount;
+ }
+
+ /**
+ * Writes a zero-terminated unary number to the output {@link ByteBuf}.
+ * Example of the output for value = 6: {@code 1111110}
+ * @param value The number of {@code 1} to write
+ */
+ void writeUnary(ByteBuf out, int value) {
+ if (value < 0) {
+ throw new IllegalArgumentException("value: " + value + " (expected 0 or more)");
+ }
+ while (value-- > 0) {
+ writeBoolean(out, true);
+ }
+ writeBoolean(out, false);
+ }
+
+ /**
+ * Writes an integer as 32 bits to the output {@link ByteBuf}.
+ * @param value The integer to write
+ */
+ void writeInt(ByteBuf out, final int value) {
+ writeBits(out, 16, (value >>> 16) & 0xffff);
+ writeBits(out, 16, value & 0xffff);
+ }
+
+ /**
+ * Writes any remaining bits to the output {@link ByteBuf},
+ * zero padding to a whole byte as required.
+ */
+ void flush(ByteBuf out) {
+ if (bitCount > 0) {
+ writeBits(out, 8 - bitCount, 0);
+ }
+ }
+}
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2BlockCompressor.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2BlockCompressor.java
new file mode 100644
index 000000000000..471840540309
--- /dev/null
+++ b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2BlockCompressor.java
@@ -0,0 +1,294 @@
+/*
+ * Copyright 2014 The Netty Project
+ *
+ * The Netty Project licenses this file to you under the Apache License,
+ * version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package io.netty.handler.codec.compression;
+
+import io.netty.buffer.ByteBuf;
+
+import static io.netty.handler.codec.compression.Bzip2Constants.*;
+
+/**
+ * Compresses and writes a single Bzip2 block.
+ *
+ * Block encoding consists of the following stages:
+ * 1. Run-Length Encoding[1] - {@link #write(int)}
+ * 2. Burrows Wheeler Transform - {@link #close(ByteBuf)} (through {@link Bzip2DivSufSort})
+ * 3. Write block header - {@link #close(ByteBuf)}
+ * 4. Move To Front Transform - {@link #close(ByteBuf)} (through {@link Bzip2HuffmanStageEncoder})
+ * 5. Run-Length Encoding[2] - {@link #close(ByteBuf)} (through {@link Bzip2HuffmanStageEncoder})
+ * 6. Create and write Huffman tables - {@link #close(ByteBuf)} (through {@link Bzip2HuffmanStageEncoder})
+ * 7. Huffman encode and write data - {@link #close(ByteBuf)} (through {@link Bzip2HuffmanStageEncoder})
+ */
final class Bzip2BlockCompressor {
    /**
     * A writer that provides bit-level writes.
     */
    private final Bzip2BitWriter writer;

    /**
     * CRC builder for the block.
     */
    private final Crc32 crc = new Crc32();

    /**
     * The RLE'd block data.
     */
    private final byte[] block;

    /**
     * Current length of the data within the {@link #block} array.
     */
    private int blockLength;

    /**
     * A limit beyond which new data will not be accepted into the block.
     */
    private final int blockLengthLimit;

    /**
     * The values that are present within the RLE'd block data. For each index, {@code true} if that
     * value is present within the data, otherwise {@code false}.
     */
    private final boolean[] blockValuesPresent = new boolean[256];

    /**
     * The Burrows Wheeler Transformed block data.
     */
    private final int[] bwtBlock;

    /**
     * The current RLE value being accumulated (undefined when {@link #rleLength} is 0).
     */
    private int rleCurrentValue = -1;

    /**
     * The repeat count of the current RLE value.
     */
    private int rleLength;

    /**
     * @param writer The {@link Bzip2BitWriter} which provides bit-level writes
     * @param blockSize The declared block size in bytes. Up to this many bytes will be accepted
     *                  into the block after Run-Length Encoding is applied
     */
    Bzip2BlockCompressor(final Bzip2BitWriter writer, final int blockSize) {
        this.writer = writer;

        // One extra byte is added to allow for the block wrap applied in close()
        block = new byte[blockSize + 1];
        bwtBlock = new int[blockSize + 1];
        blockLengthLimit = blockSize - 6; // 5 bytes for one RLE run plus one byte - see {@link #write(int)}
    }

    /**
     * Write the Huffman symbol to output byte map.
     * First writes 16 bits marking which of the 16 sixteen-value ranges contain any used
     * symbol, then, for each used range, a 16-bit bitmap of the individual values present.
     */
    private void writeSymbolMap(ByteBuf out) {
        Bzip2BitWriter writer = this.writer;

        final boolean[] blockValuesPresent = this.blockValuesPresent;
        final boolean[] condensedInUse = new boolean[16];

        // condensedInUse[i] is true when any of values i*16 .. i*16+15 occurs in the block.
        for (int i = 0; i < condensedInUse.length; i++) {
            for (int j = 0, k = i << 4; j < 16; j++, k++) {
                if (blockValuesPresent[k]) {
                    condensedInUse[i] = true;
                }
            }
        }

        for (int i = 0; i < condensedInUse.length; i++) {
            writer.writeBoolean(out, condensedInUse[i]);
        }

        for (int i = 0; i < condensedInUse.length; i++) {
            if (condensedInUse[i]) {
                for (int j = 0, k = i << 4; j < 16; j++, k++) {
                    writer.writeBoolean(out, blockValuesPresent[k]);
                }
            }
        }
    }

    /**
     * Writes an RLE run to the block array, updating the block CRC and present values array as required.
     * Runs of 1-3 bytes are stored literally; runs of 4 or more are stored as four literal
     * bytes followed by a repeat-count byte of {@code runLength - 4}.
     * @param value The value to write
     * @param runLength The run length of the value to write
     */
    private void writeRun(final int value, int runLength) {
        final int blockLength = this.blockLength;
        final byte[] block = this.block;

        blockValuesPresent[value] = true;
        crc.updateCRC(value, runLength);

        final byte byteValue = (byte) value;
        switch (runLength) {
            case 1:
                block[blockLength] = byteValue;
                this.blockLength = blockLength + 1;
                break;
            case 2:
                block[blockLength] = byteValue;
                block[blockLength + 1] = byteValue;
                this.blockLength = blockLength + 2;
                break;
            case 3:
                block[blockLength] = byteValue;
                block[blockLength + 1] = byteValue;
                block[blockLength + 2] = byteValue;
                this.blockLength = blockLength + 3;
                break;
            default:
                runLength -= 4;
                // The count byte itself becomes a block symbol, so mark it present too.
                blockValuesPresent[runLength] = true;
                block[blockLength] = byteValue;
                block[blockLength + 1] = byteValue;
                block[blockLength + 2] = byteValue;
                block[blockLength + 3] = byteValue;
                block[blockLength + 4] = (byte) runLength;
                this.blockLength = blockLength + 5;
                break;
        }
    }

    /**
     * Writes a byte to the block, accumulating to an RLE run where possible.
     * @param value The byte to write
     * @return {@code true} if the byte was written, or {@code false} if the block is already full
     */
    boolean write(final int value) {
        if (blockLength > blockLengthLimit) {
            return false;
        }
        final int rleCurrentValue = this.rleCurrentValue;
        final int rleLength = this.rleLength;

        if (rleLength == 0) {
            // No run in progress - start one.
            this.rleCurrentValue = value;
            this.rleLength = 1;
        } else if (rleCurrentValue != value) {
            // This path commits us to write 6 bytes - one RLE run (5 bytes) plus one extra
            writeRun(rleCurrentValue & 0xff, rleLength);
            this.rleCurrentValue = value;
            this.rleLength = 1;
        } else {
            if (rleLength == 254) {
                // Runs are capped at 255 repeats; flush and start fresh.
                writeRun(rleCurrentValue & 0xff, 255);
                this.rleLength = 0;
            } else {
                this.rleLength = rleLength + 1;
            }
        }
        return true;
    }

    /**
     * Writes an array to the block.
     * @param data The array to write
     * @param offset The offset within the input data to write from
     * @param length The number of bytes of input data to write
     * @return The actual number of input bytes written. May be less than the number requested, or
     *         zero if the block is already full
     */
    int write(final byte[] data, int offset, int length) {
        int written = 0;

        while (length-- > 0) {
            if (!write(data[offset++])) {
                break;
            }
            written++;
        }
        return written;
    }

    /**
     * Compresses and writes out the block.
     */
    void close(ByteBuf out) {
        // If an RLE run is in progress, write it out
        if (rleLength > 0) {
            writeRun(rleCurrentValue & 0xff, rleLength);
        }

        // Apply a one byte block wrap required by the BWT implementation
        block[blockLength] = block[0];

        // Perform the Burrows Wheeler Transform
        Bzip2DivSufSort divSufSort = new Bzip2DivSufSort(block, bwtBlock, blockLength);
        int bwtStartPointer = divSufSort.bwt();

        Bzip2BitWriter writer = this.writer;

        // Write out the block header
        writer.writeBits(out, 24, BLOCK_HEADER_MAGIC_1);
        writer.writeBits(out, 24, BLOCK_HEADER_MAGIC_2);
        writer.writeInt(out, crc.getCRC());
        writer.writeBoolean(out, false); // Randomised block flag. We never create randomised blocks
        writer.writeBits(out, 24, bwtStartPointer);

        // Write out the symbol map
        writeSymbolMap(out);

        // Perform the Move To Front Transform and Run-Length Encoding[2] stages
        Bzip2MTFAndRLE2StageEncoder mtfEncoder = new Bzip2MTFAndRLE2StageEncoder(bwtBlock, blockLength,
                                                                                blockValuesPresent);
        mtfEncoder.encode();

        // Perform the Huffman Encoding stage and write out the encoded data
        Bzip2HuffmanStageEncoder huffmanEncoder = new Bzip2HuffmanStageEncoder(writer,
                                                                mtfEncoder.mtfBlock(),
                                                                mtfEncoder.mtfLength(),
                                                                mtfEncoder.mtfAlphabetSize(),
                                                                mtfEncoder.mtfSymbolFrequencies());
        huffmanEncoder.encode(out);
    }

    /**
     * Gets available size of the current block.
     * NOTE(review): the +2/+1 slack appears to account for the RLE margin reserved in
     * {@link #write(int)} - confirm boundary math against the caller.
     * @return Number of available bytes which can be written
     */
    int availableSize() {
        if (blockLength == 0) {
            return blockLengthLimit + 2;
        }
        return blockLengthLimit - blockLength + 1;
    }

    /**
     * Determines if the block is full and ready for compression.
     * @return {@code true} if the block is full, otherwise {@code false}
     */
    boolean isFull() {
        return blockLength > blockLengthLimit;
    }

    /**
     * Determines if any bytes have been written to the block.
     * @return {@code true} if one or more bytes has been written to the block, otherwise {@code false}
     */
    boolean isEmpty() {
        return blockLength == 0 && rleLength == 0;
    }

    /**
     * Gets the CRC of the completed block. Only valid after calling {@link #close(ByteBuf)}.
     * @return The block's CRC
     */
    int crc() {
        return crc.getCRC();
    }
}
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Constants.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Constants.java
index cb1276d86674..69b1dd253c03 100644
--- a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Constants.java
+++ b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Constants.java
@@ -16,7 +16,7 @@
package io.netty.handler.codec.compression;
/**
- * Constants for {@link Bzip2Decoder}.
+ * Constants for both the {@link Bzip2Encoder} and the {@link Bzip2Decoder}.
*/
final class Bzip2Constants {
@@ -28,12 +28,14 @@ final class Bzip2Constants {
/**
* Block header magic number. Equals to BCD (pi).
*/
- static final long COMPRESSED_MAGIC = 0x314159265359L;
+ static final int BLOCK_HEADER_MAGIC_1 = 0x314159;
+ static final int BLOCK_HEADER_MAGIC_2 = 0x265359;
/**
* End of stream magic number. Equals to BCD sqrt(pi).
*/
- static final long END_OF_STREAM_MAGIC = 0x177245385090L;
+ static final int END_OF_STREAM_MAGIC_1 = 0x177245;
+ static final int END_OF_STREAM_MAGIC_2 = 0x385090;
/**
* Base block size.
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Decoder.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Decoder.java
index 1b6d75b78fcf..335b1f8b924f 100644
--- a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Decoder.java
+++ b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Decoder.java
@@ -108,8 +108,9 @@ protected void decode(ChannelHandlerContext ctx, ByteBuf in, List out) t
}
Bzip2BitReader reader = this.reader;
// Get the block magic bytes.
- final long magic = (long) reader.readBits(in, 24) << 24 | reader.readBits(in, 24);
- if (magic == END_OF_STREAM_MAGIC) {
+ final int magic1 = reader.readBits(in, 24);
+ final int magic2 = reader.readBits(in, 24);
+ if (magic1 == END_OF_STREAM_MAGIC_1 && magic2 == END_OF_STREAM_MAGIC_2) {
// End of stream was reached. Check the combined CRC.
final int storedCombinedCRC = reader.readInt(in);
if (storedCombinedCRC != streamCRC) {
@@ -118,7 +119,7 @@ protected void decode(ChannelHandlerContext ctx, ByteBuf in, List out) t
currentState = State.EOF;
break;
}
- if (magic != COMPRESSED_MAGIC) {
+ if (magic1 != BLOCK_HEADER_MAGIC_1 || magic2 != BLOCK_HEADER_MAGIC_2) {
throw new DecompressionException("bad block header");
}
blockCRC = reader.readInt(in);
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2DivSufSort.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2DivSufSort.java
new file mode 100644
index 000000000000..cdf92a698313
--- /dev/null
+++ b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2DivSufSort.java
@@ -0,0 +1,2115 @@
+/*
+ * Copyright 2014 The Netty Project
+ *
+ * The Netty Project licenses this file to you under the Apache License,
+ * version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package io.netty.handler.codec.compression;
+
+/**
+ * DivSufSort suffix array generator.
+ *
+ * Based on libdivsufsort 1.2.3 patched to support Bzip2.
+ * This is a simple conversion of the original C with two minor bugfixes applied (see "BUGFIX"
+ * comments within the class). Documentation within the class is largely absent.
+ */
+final class Bzip2DivSufSort {
+
+ private static final int STACK_SIZE = 64;
+ private static final int BUCKET_A_SIZE = 256;
+ private static final int BUCKET_B_SIZE = 65536;
+ private static final int SS_BLOCKSIZE = 1024;
+ private static final int INSERTIONSORT_THRESHOLD = 8;
+
+ private static final int[] LOG_2_TABLE = {
+ -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+ };
+
+ private final int[] SA;
+ private final byte[] T;
+ private final int n;
+
    /**
     * @param block The input array (referenced, not copied)
     * @param bwtBlock The output array (referenced, not copied)
     * @param blockLength The length of the input data
     */
    Bzip2DivSufSort(final byte[] block, final int[] bwtBlock, final int blockLength) {
        T = block;
        SA = bwtBlock;
        n = blockLength;
    }
+
+ private static void swapElements(final int[] array1, final int idx1, final int[] array2, final int idx2) {
+ final int temp = array1[idx1];
+ array1[idx1] = array2[idx2];
+ array2[idx2] = temp;
+ }
+
+ private int ssCompare(final int p1, final int p2, final int depth) {
+ final int[] SA = this.SA;
+ final byte[] T = this.T;
+
+ // pointers within T
+ final int U1n = SA[p1 + 1] + 2;
+ final int U2n = SA[p2 + 1] + 2;
+
+ int U1 = depth + SA[p1];
+ int U2 = depth + SA[p2];
+
+ while (U1 < U1n && U2 < U2n && T[U1] == T[U2]) {
+ ++U1;
+ ++U2;
+ }
+
+ return U1 < U1n ?
+ U2 < U2n ? (T[U1] & 0xff) - (T[U2] & 0xff) : 1
+ : U2 < U2n ? -1 : 0;
+ }
+
    /**
     * Compares the substring that runs off the end of the block against another substring;
     * the first position may wrap around to the start of the block (modulo {@code size}).
     */
    private int ssCompareLast(int pa, int p1, int p2, int depth, int size) {
        final int[] SA = this.SA;
        final byte[] T = this.T;

        int U1 = depth + SA[p1];
        int U2 = depth + SA[p2];
        int U1n = size;
        int U2n = SA[p2 + 1] + 2;

        // Compare up to the physical end of the block.
        while (U1 < U1n && U2 < U2n && T[U1] == T[U2]) {
            ++U1;
            ++U2;
        }

        if (U1 < U1n) {
            return U2 < U2n ? (T[U1] & 0xff) - (T[U2] & 0xff) : 1;
        }
        if (U2 == U2n) {
            return 1;
        }

        // Wrap the first position around to the start and keep comparing.
        U1 %= size;
        U1n = SA[pa] + 2;
        while (U1 < U1n && U2 < U2n && T[U1] == T[U2]) {
            ++U1;
            ++U2;
        }

        return U1 < U1n ?
            U2 < U2n ? (T[U1] & 0xff) - (T[U2] & 0xff) : 1
            : U2 < U2n ? -1 : 0;
    }
+
    /**
     * Insertion sort of the SA range [first, last) by substring comparison at {@code depth}.
     * Entries found equal to a neighbour are flagged by ones complement (made negative).
     */
    private void ssInsertionSort(int pa, int first, int last, int depth) {
        final int[] SA = this.SA;

        int i, j; // pointer within SA
        int t;
        int r;

        for (i = last - 2; first <= i; --i) {
            // Shift larger entries right; negative (flagged) entries move with their group.
            for (t = SA[i], j = i + 1; 0 < (r = ssCompare(pa + t, pa + SA[j], depth));) {
                do {
                    SA[j - 1] = SA[j];
                } while (++j < last && SA[j] < 0);
                if (last <= j) {
                    break;
                }
            }
            if (r == 0) {
                // Equal substrings: flag the element t is inserted before.
                SA[j] = ~SA[j];
            }
            SA[j - 1] = t;
        }
    }
+
    /**
     * Heap sift-down for {@link #ssHeapSort}: moves the element at heap index {@code i}
     * down while a child has a larger key. The key of an entry is the byte of T at offset
     * {@code td} within the suffix the entry (indirectly) references.
     */
    private void ssFixdown(int td, int pa, int sa, int i, int size) {
        final int[] SA = this.SA;
        final byte[] T = this.T;

        int j, k;
        int v;
        int c, d, e;

        for (v = SA[sa + i], c = T[td + SA[pa + v]] & 0xff; (j = 2 * i + 1) < size; SA[sa + i] = SA[sa + k], i = k) {
            // k selects the larger of the two children (ssHeapSort guarantees both exist).
            d = T[td + SA[pa + SA[sa + (k = j++)]]] & 0xff;
            if (d < (e = T[td + SA[pa + SA[sa + j]]] & 0xff)) {
                k = j;
                d = e;
            }
            if (d <= c) {
                break;
            }
        }
        SA[sa + i] = v;
    }
+
    /**
     * Heapsort of {@code size} SA entries starting at {@code sa}, keyed by the byte at
     * offset {@code td} of each referenced suffix. Used as the introsort depth-limit
     * fallback in {@link #ssMultiKeyIntroSort}.
     */
    private void ssHeapSort(int td, int pa, int sa, int size) {
        final int[] SA = this.SA;
        final byte[] T = this.T;

        int i, m;
        int t;

        // For an even size, peel off the last element so the heap proper has an odd
        // element count (every internal node then has two children - see ssFixdown).
        m = size;
        if (size % 2 == 0) {
            m--;
            if ((T[td + SA[pa + SA[sa + m / 2]]] & 0xff) < (T[td + SA[pa + SA[sa + m]]] & 0xff)) {
                swapElements(SA, sa + m, SA, sa + m / 2);
            }
        }

        // Build the max-heap.
        for (i = m / 2 - 1; 0 <= i; --i) {
            ssFixdown(td, pa, sa, i, m);
        }

        // Re-insert the peeled element.
        if (size % 2 == 0) {
            swapElements(SA, sa, SA, sa + m);
            ssFixdown(td, pa, sa, 0, m);
        }

        // Repeatedly extract the maximum to the end of the range.
        for (i = m - 1; 0 < i; --i) {
            t = SA[sa];
            SA[sa] = SA[sa + i];
            ssFixdown(td, pa, sa, 0, i);
            SA[sa + i] = t;
        }
    }
+
+ private int ssMedian3(final int td, final int pa, int v1, int v2, int v3) {
+ final int[] SA = this.SA;
+ final byte[] T = this.T;
+
+ int T_v1 = T[td + SA[pa + SA[v1]]] & 0xff;
+ int T_v2 = T[td + SA[pa + SA[v2]]] & 0xff;
+ int T_v3 = T[td + SA[pa + SA[v3]]] & 0xff;
+
+ if (T_v1 > T_v2) {
+ final int temp = v1;
+ v1 = v2;
+ v2 = temp;
+ final int T_vtemp = T_v1;
+ T_v1 = T_v2;
+ T_v2 = T_vtemp;
+ }
+ if (T_v2 > T_v3) {
+ if (T_v1 > T_v3) {
+ return v1;
+ }
+ return v3;
+ }
+ return v2;
+ }
+
    /**
     * Returns whichever of the five SA positions references the median key (same key
     * definition as {@link #ssMedian3}). Several of the exchanges below are deliberately
     * one-sided: only the variable that is still read afterwards receives the swapped
     * value, the dead counterpart is left stale.
     */
    private int ssMedian5(final int td, final int pa, int v1, int v2, int v3, int v4, int v5) {
        final int[] SA = this.SA;
        final byte[] T = this.T;

        int T_v1 = T[td + SA[pa + SA[v1]]] & 0xff;
        int T_v2 = T[td + SA[pa + SA[v2]]] & 0xff;
        int T_v3 = T[td + SA[pa + SA[v3]]] & 0xff;
        int T_v4 = T[td + SA[pa + SA[v4]]] & 0xff;
        int T_v5 = T[td + SA[pa + SA[v5]]] & 0xff;
        int temp;
        int T_vtemp;

        if (T_v2 > T_v3) {
            temp = v2;
            v2 = v3;
            v3 = temp;
            T_vtemp = T_v2;
            T_v2 = T_v3;
            T_v3 = T_vtemp;
        }
        if (T_v4 > T_v5) {
            temp = v4;
            v4 = v5;
            v5 = temp;
            T_vtemp = T_v4;
            T_v4 = T_v5;
            T_v5 = T_vtemp;
        }
        if (T_v2 > T_v4) {
            // Partial swap: v2/T_v2 are never read again, so only v4/T_v4 are updated.
            temp = v2;
            v4 = temp;
            T_vtemp = T_v2;
            T_v4 = T_vtemp;
            temp = v3;
            v3 = v5;
            v5 = temp;
            T_vtemp = T_v3;
            T_v3 = T_v5;
            T_v5 = T_vtemp;
        }
        if (T_v1 > T_v3) {
            temp = v1;
            v1 = v3;
            v3 = temp;
            T_vtemp = T_v1;
            T_v1 = T_v3;
            T_v3 = T_vtemp;
        }
        if (T_v1 > T_v4) {
            // Partial swaps again: v1/T_v1 and v5/T_v5 are dead after this block.
            temp = v1;
            v4 = temp;
            T_vtemp = T_v1;
            T_v4 = T_vtemp;
            v3 = v5;
            T_v3 = T_v5;
        }
        if (T_v3 > T_v4) {
            return v4;
        }
        return v3;
    }
+
+ private int ssPivot(final int td, final int pa, final int first, final int last) {
+ int middle;
+ int t;
+
+ t = last - first;
+ middle = first + t / 2;
+
+ if (t <= 512) {
+ if (t <= 32) {
+ return ssMedian3(td, pa, first, middle, last - 1);
+ }
+ t >>= 2;
+ return ssMedian5(td, pa, first, first + t, middle, last - 1 - t, last - 1);
+ }
+ t >>= 3;
+ return ssMedian3(
+ td, pa,
+ ssMedian3(td, pa, first, first + t, first + (t << 1)),
+ ssMedian3(td, pa, middle - t, middle, middle + t),
+ ssMedian3(td, pa, last - 1 - (t << 1), last - 1 - t, last - 1)
+ );
+ }
+
+ private static int ssLog(final int n) {
+ return (n & 0xff00) != 0 ?
+ 8 + LOG_2_TABLE[n >> 8 & 0xff]
+ : LOG_2_TABLE[n & 0xff];
+ }
+
    /**
     * Partitions SA[first, last): entries whose substring is exhausted at this depth
     * (appears to mean: too short to need deeper sorting) are moved to the front and
     * flagged by ones complement; the split point is returned.
     * NOTE(review): the flagging protocol mirrors libdivsufsort's ss_partition - confirm
     * against the upstream source before touching the comparison directions.
     */
    private int ssSubstringPartition(final int pa, final int first, final int last, final int depth) {
        final int[] SA = this.SA;

        int a, b;
        int t;

        for (a = first - 1, b = last;;) {
            // Advance a over (and flag) entries already exhausted at this depth.
            while (++a < b && (SA[pa + SA[a]] + depth >= SA[pa + SA[a] + 1] + 1)) {
                SA[a] = ~SA[a];
            }
            --b;
            // Retreat b over entries that still need deeper sorting.
            while (a < b && (SA[pa + SA[b]] + depth < SA[pa + SA[b] + 1] + 1)) {
                --b;
            }

            if (b <= a) {
                break;
            }
            // Swap, transferring the flag to the element that moves forward.
            t = ~SA[b];
            SA[b] = SA[a];
            SA[a] = t;
        }
        if (first < a) {
            SA[first] = ~SA[first];
        }
        return a;
    }
+
    /**
     * An immutable four-integer frame for the explicit work stacks used by the iterative
     * sort/merge routines. The meaning of each field depends on the routine (typically
     * range bounds plus depth/limit or check values).
     */
    private static class StackEntry {
        final int a;
        final int b;
        final int c;
        final int d;

        StackEntry(final int a, final int b, final int c, final int d) {
            this.a = a;
            this.b = b;
            this.c = c;
            this.d = d;
        }
    }
+
    /**
     * Multikey introsort of SA[first, last) by substring, iterative with an explicit
     * stack (frames: a=first, b=last, c=depth, d=limit). Small ranges fall back to
     * insertion sort; when the depth budget {@code limit} is exhausted, heapsort is used.
     */
    private void ssMultiKeyIntroSort(final int pa, int first, int last, int depth) {
        final int[] SA = this.SA;
        final byte[] T = this.T;

        final StackEntry[] stack = new StackEntry[STACK_SIZE];

        int Td;
        int a, b, c, d, e, f;
        int s, t;
        int ssize;
        int limit;
        int v, x = 0;

        for (ssize = 0, limit = ssLog(last - first);;) {
            if (last - first <= INSERTIONSORT_THRESHOLD) {
                // Small range: insertion sort, then pop the next pending range.
                if (1 < last - first) {
                    ssInsertionSort(pa, first, last, depth);
                }
                if (ssize == 0) {
                    return;
                }
                StackEntry entry = stack[--ssize];
                first = entry.a;
                last = entry.b;
                depth = entry.c;
                limit = entry.d;
                continue;
            }

            Td = depth;
            if (limit-- == 0) {
                // Depth budget exhausted: guarantee O(n log n) with heapsort.
                ssHeapSort(Td, pa, first, last - first);
            }
            if (limit < 0) {
                // After heapsort the range is ordered by this byte; split off the leading
                // group of equal keys and recurse on it one character deeper.
                for (a = first + 1, v = T[Td + SA[pa + SA[first]]] & 0xff; a < last; ++a) {
                    if ((x = T[Td + SA[pa + SA[a]]] & 0xff) != v) {
                        if (1 < a - first) {
                            break;
                        }
                        v = x;
                        first = a;
                    }
                }
                if ((T[Td + SA[pa + SA[first]] - 1] & 0xff) < v) {
                    first = ssSubstringPartition(pa, first, a, depth);
                }
                // Continue with the smaller side; push the larger side.
                if (a - first <= last - a) {
                    if (1 < a - first) {
                        stack[ssize++] = new StackEntry(a, last, depth, -1);
                        last = a;
                        depth += 1;
                        limit = ssLog(a - first);
                    } else {
                        first = a;
                        limit = -1;
                    }
                } else {
                    if (1 < last - a) {
                        stack[ssize++] = new StackEntry(first, a, depth + 1, ssLog(a - first));
                        first = a;
                        limit = -1;
                    } else {
                        last = a;
                        depth += 1;
                        limit = ssLog(a - first);
                    }
                }
                continue;
            }

            // Choose a pivot key v and move its entry to the front.
            a = ssPivot(Td, pa, first, last);
            v = T[Td + SA[pa + SA[a]]] & 0xff;
            swapElements(SA, first, SA, a);

            // Three-way (Bentley-McIlroy) partition around v:
            // equal keys are collected at both ends, then swapped into the middle.
            b = first + 1;
            while (b < last && (x = T[Td + SA[pa + SA[b]]] & 0xff) == v) {
                ++b;
            }
            if ((a = b) < last && x < v) {
                while (++b < last && (x = T[Td + SA[pa + SA[b]]] & 0xff) <= v) {
                    if (x == v) {
                        swapElements(SA, b, SA, a);
                        ++a;
                    }
                }
            }

            c = last - 1;
            while (b < c && (x = T[Td + SA[pa + SA[c]]] & 0xff) == v) {
                --c;
            }
            if (b < (d = c) && x > v) {
                while (b < --c && (x = T[Td + SA[pa + SA[c]]] & 0xff) >= v) {
                    if (x == v) {
                        swapElements(SA, c, SA, d);
                        --d;
                    }
                }
            }
            while (b < c) {
                swapElements(SA, b, SA, c);
                while (++b < c && (x = T[Td + SA[pa + SA[b]]] & 0xff) <= v) {
                    if (x == v) {
                        swapElements(SA, b, SA, a);
                        ++a;
                    }
                }
                while (b < --c && (x = T[Td + SA[pa + SA[c]]] & 0xff) >= v) {
                    if (x == v) {
                        swapElements(SA, c, SA, d);
                        --d;
                    }
                }
            }

            if (a <= d) {
                // Move the equal-to-pivot entries from the ends into the middle,
                // then push/iterate the three resulting sub-ranges.
                c = b - 1;

                if ((s = a - first) > (t = b - a)) {
                    s = t;
                }
                for (e = first, f = b - s; 0 < s; --s, ++e, ++f) {
                    swapElements(SA, e, SA, f);
                }
                if ((s = d - c) > (t = last - d - 1)) {
                    s = t;
                }
                for (e = b, f = last - s; 0 < s; --s, ++e, ++f) {
                    swapElements(SA, e, SA, f);
                }

                a = first + (b - a);
                c = last - (d - c);
                b = v <= (T[Td + SA[pa + SA[a]] - 1] & 0xff) ? a : ssSubstringPartition(pa, a, c, depth);

                // Push the two larger sub-ranges; keep iterating on the smallest.
                if (a - first <= last - c) {
                    if (last - c <= c - b) {
                        stack[ssize++] = new StackEntry(b, c, depth + 1, ssLog(c - b));
                        stack[ssize++] = new StackEntry(c, last, depth, limit);
                        last = a;
                    } else if (a - first <= c - b) {
                        stack[ssize++] = new StackEntry(c, last, depth, limit);
                        stack[ssize++] = new StackEntry(b, c, depth + 1, ssLog(c - b));
                        last = a;
                    } else {
                        stack[ssize++] = new StackEntry(c, last, depth, limit);
                        stack[ssize++] = new StackEntry(first, a, depth, limit);
                        first = b;
                        last = c;
                        depth += 1;
                        limit = ssLog(c - b);
                    }
                } else {
                    if (a - first <= c - b) {
                        stack[ssize++] = new StackEntry(b, c, depth + 1, ssLog(c - b));
                        stack[ssize++] = new StackEntry(first, a, depth, limit);
                        first = c;
                    } else if (last - c <= c - b) {
                        stack[ssize++] = new StackEntry(first, a, depth, limit);
                        stack[ssize++] = new StackEntry(b, c, depth + 1, ssLog(c - b));
                        first = c;
                    } else {
                        stack[ssize++] = new StackEntry(first, a, depth, limit);
                        stack[ssize++] = new StackEntry(c, last, depth, limit);
                        first = b;
                        last = c;
                        depth += 1;
                        limit = ssLog(c - b);
                    }
                }
            } else {
                // Whole range equal to the pivot: descend one character deeper in place.
                limit += 1;
                if ((T[Td + SA[pa + SA[first]] - 1] & 0xff) < v) {
                    first = ssSubstringPartition(pa, first, last, depth);
                    limit = ssLog(last - first);
                }
                depth += 1;
            }
        }
    }
+
+ private static void ssBlockSwap(final int[] array1, final int first1,
+ final int[] array2, final int first2, final int size) {
+ int a, b;
+ int i;
+ for (i = size, a = first1, b = first2; 0 < i; --i, ++a, ++b) {
+ swapElements(array1, a, array2, b);
+ }
+ }
+
    /**
     * Forward in-place merge of SA[first, middle) and SA[middle, last) using the external
     * buffer {@code buf} (which must hold at least {@code middle - first} elements). The
     * left half is first swapped into the buffer, then merged back. The repeated
     * "SA[i++] = x; y = SA[i];" pattern writes the merged element and back-fills the
     * displaced SA slot into the source, keeping the operation in-place. Equal pairs are
     * flagged by ones complement.
     */
    private void ssMergeForward(final int pa, int[] buf, final int bufoffset,
            final int first, final int middle, final int last, final int depth) {
        final int[] SA = this.SA;

        int bufend;
        int i, j, k;
        int t;
        int r;

        bufend = bufoffset + (middle - first) - 1;
        ssBlockSwap(buf, bufoffset, SA, first, middle - first);

        // t preserves SA[first], restored into the buffer when the merge finishes.
        for (t = SA[first], i = first, j = bufoffset, k = middle;;) {
            r = ssCompare(pa + buf[j], pa + SA[k], depth);
            if (r < 0) {
                // Take from the buffered left half.
                do {
                    SA[i++] = buf[j];
                    if (bufend <= j) {
                        buf[j] = t;
                        return;
                    }
                    buf[j++] = SA[i];
                } while (buf[j] < 0);
            } else if (r > 0) {
                // Take from the right half.
                do {
                    SA[i++] = SA[k];
                    SA[k++] = SA[i];
                    if (last <= k) {
                        while (j < bufend) { SA[i++] = buf[j]; buf[j++] = SA[i]; }
                        SA[i] = buf[j]; buf[j] = t;
                        return;
                    }
                } while (SA[k] < 0);
            } else {
                // Equal: flag the right-half element, then drain both sides.
                SA[k] = ~SA[k];
                do {
                    SA[i++] = buf[j];
                    if (bufend <= j) {
                        buf[j] = t;
                        return;
                    }
                    buf[j++] = SA[i];
                } while (buf[j] < 0);

                do {
                    SA[i++] = SA[k];
                    SA[k++] = SA[i];
                    if (last <= k) {
                        while (j < bufend) {
                            SA[i++] = buf[j];
                            buf[j++] = SA[i];
                        }
                        SA[i] = buf[j]; buf[j] = t;
                        return;
                    }
                } while (SA[k] < 0);
            }
        }
    }
+
    /**
     * Backward in-place merge of SA[first, middle) and SA[middle, last) using the external
     * buffer {@code buf} (which must hold at least {@code last - middle} elements). The
     * right half is swapped into the buffer, then the merge proceeds from the high end
     * downwards. Bits in {@code x} record whether the current head of either side carries
     * a ones-complement "equal" flag (bit 1 = buffer side, bit 2 = SA side), so that the
     * flagged group is drained as a unit when its head is consumed.
     */
    private void ssMergeBackward(final int pa, int[] buf, final int bufoffset,
            final int first, final int middle, final int last, final int depth) {
        final int[] SA = this.SA;

        int p1, p2;
        int bufend;
        int i, j, k;
        int t;
        int r;
        int x;

        bufend = bufoffset + (last - middle);
        ssBlockSwap(buf, bufoffset, SA, middle, last - middle);

        // Resolve the (possibly flagged) comparison positions of both heads.
        x = 0;
        if (buf[bufend - 1] < 0) {
            x |= 1;
            p1 = pa + ~buf[bufend - 1];
        } else {
            p1 = pa + buf[bufend - 1];
        }
        if (SA[middle - 1] < 0) {
            x |= 2;
            p2 = pa + ~SA[middle - 1];
        } else {
            p2 = pa + SA[middle - 1];
        }
        // t preserves SA[last - 1], restored into the buffer when the merge finishes.
        for (t = SA[last - 1], i = last - 1, j = bufend - 1, k = middle - 1;;) {

            r = ssCompare(p1, p2, depth);
            if (r > 0) {
                // Take from the buffered right half; first drain its flagged group.
                if ((x & 1) != 0) {
                    do {
                        SA[i--] = buf[j];
                        buf[j--] = SA[i];
                    } while (buf[j] < 0);
                    x ^= 1;
                }
                SA[i--] = buf[j];
                if (j <= bufoffset) {
                    buf[j] = t;
                    return;
                }
                buf[j--] = SA[i];

                if (buf[j] < 0) {
                    x |= 1;
                    p1 = pa + ~buf[j];
                } else {
                    p1 = pa + buf[j];
                }
            } else if (r < 0) {
                // Take from the left half; first drain its flagged group.
                if ((x & 2) != 0) {
                    do {
                        SA[i--] = SA[k];
                        SA[k--] = SA[i];
                    } while (SA[k] < 0);
                    x ^= 2;
                }
                SA[i--] = SA[k];
                SA[k--] = SA[i];
                if (k < first) {
                    while (bufoffset < j) {
                        SA[i--] = buf[j];
                        buf[j--] = SA[i];
                    }
                    SA[i] = buf[j];
                    buf[j] = t;
                    return;
                }

                if (SA[k] < 0) {
                    x |= 2;
                    p2 = pa + ~SA[k];
                } else {
                    p2 = pa + SA[k];
                }
            } else {
                // Equal heads: emit the buffer head flagged, then consume both sides.
                if ((x & 1) != 0) {
                    do {
                        SA[i--] = buf[j];
                        buf[j--] = SA[i];
                    } while (buf[j] < 0);
                    x ^= 1;
                }
                SA[i--] = ~buf[j];
                if (j <= bufoffset) {
                    buf[j] = t;
                    return;
                }
                buf[j--] = SA[i];

                if ((x & 2) != 0) {
                    do {
                        SA[i--] = SA[k];
                        SA[k--] = SA[i];
                    } while (SA[k] < 0);
                    x ^= 2;
                }
                SA[i--] = SA[k];
                SA[k--] = SA[i];
                if (k < first) {
                    while (bufoffset < j) {
                        SA[i--] = buf[j];
                        buf[j--] = SA[i];
                    }
                    SA[i] = buf[j];
                    buf[j] = t;
                    return;
                }

                if (buf[j] < 0) {
                    x |= 1;
                    p1 = pa + ~buf[j];
                } else {
                    p1 = pa + buf[j];
                }
                if (SA[k] < 0) {
                    x |= 2;
                    p2 = pa + ~SA[k];
                } else {
                    p2 = pa + SA[k];
                }
            }
        }
    }
+
+    /**
+     * Decodes a possibly-complemented suffix-array entry: returns the value itself
+     * when non-negative, otherwise its bitwise complement.
+     */
+    private static int getIDX(final int a) {
+        if (a >= 0) {
+            return a;
+        }
+        return ~a;
+    }
+
+    /**
+     * If the entry at {@code SA[a]} is not already marked and its suffix compares equal
+     * (up to {@code depth}) with the suffix recorded at {@code SA[a - 1]}, complements
+     * {@code SA[a]} to record the equality.
+     */
+    private void ssMergeCheckEqual(final int pa, final int depth, final int a) {
+        final int[] SA = this.SA;
+
+        final int current = SA[a];
+        if (current >= 0 && ssCompare(pa + getIDX(SA[a - 1]), pa + current, depth) == 0) {
+            SA[a] = ~current;
+        }
+    }
+
+    /**
+     * Iterative merge controller for the substring sort. Repeatedly merges
+     * {@code [first..middle)} with {@code [middle..last)}: when one half fits into the
+     * work buffer it delegates to {@link #ssMergeForward}/{@link #ssMergeBackward};
+     * otherwise it finds a rotation point by binary search, block-swaps, and pushes one
+     * sub-merge on an explicit stack. The {@code check} bits record whether the
+     * first (bit 1) / last (bit 2) boundary still needs an equality check afterwards.
+     */
+    private void ssMerge(final int pa, int first, int middle, int last, int[] buf,
+            final int bufoffset, final int bufsize, final int depth) {
+        final int[] SA = this.SA;
+
+        final StackEntry[] stack = new StackEntry[STACK_SIZE];
+
+        int i, j;
+        int m, len, half;
+        int ssize;
+        int check, next;
+
+        for (check = 0, ssize = 0;;) {
+
+            // Right half fits in the buffer: merge backwards and pop the next job.
+            if (last - middle <= bufsize) {
+                if (first < middle && middle < last) {
+                    ssMergeBackward(pa, buf, bufoffset, first, middle, last, depth);
+                }
+
+                if ((check & 1) != 0) {
+                    ssMergeCheckEqual(pa, depth, first);
+                }
+                if ((check & 2) != 0) {
+                    ssMergeCheckEqual(pa, depth, last);
+                }
+                if (ssize == 0) {
+                    return;
+                }
+                StackEntry entry = stack[--ssize];
+                first = entry.a;
+                middle = entry.b;
+                last = entry.c;
+                check = entry.d;
+                continue;
+            }
+
+            // Left half fits in the buffer: merge forwards and pop the next job.
+            if (middle - first <= bufsize) {
+                if (first < middle) {
+                    ssMergeForward(pa, buf, bufoffset, first, middle, last, depth);
+                }
+                if ((check & 1) != 0) {
+                    ssMergeCheckEqual(pa, depth, first);
+                }
+                if ((check & 2) != 0) {
+                    ssMergeCheckEqual(pa, depth, last);
+                }
+                if (ssize == 0) {
+                    return;
+                }
+                StackEntry entry = stack[--ssize];
+                first = entry.a;
+                middle = entry.b;
+                last = entry.c;
+                check = entry.d;
+                continue;
+            }
+
+            // Binary search for the largest m such that the m elements either side of
+            // `middle` can be exchanged while preserving order.
+            for (m = 0, len = Math.min(middle - first, last - middle), half = len >> 1;
+                    0 < len;
+                    len = half, half >>= 1) {
+
+                if (ssCompare(pa + getIDX(SA[middle + m + half]),
+                        pa + getIDX(SA[middle - m - half - 1]), depth) < 0) {
+                    m += half + 1;
+                    half -= (len & 1) ^ 1;
+                }
+            }
+
+            if (0 < m) {
+                ssBlockSwap(SA, middle - m, SA, middle, m);
+                i = j = middle;
+                next = 0;
+                // Skip over runs of complemented (equal) entries around the split point.
+                if (middle + m < last) {
+                    if (SA[middle + m] < 0) {
+                        while (SA[i - 1] < 0) {
+                            --i;
+                        }
+                        SA[middle + m] = ~SA[middle + m];
+                    }
+                    for (j = middle; SA[j] < 0;) {
+                        ++j;
+                    }
+                    next = 1;
+                }
+                // Recurse (iteratively) into the smaller side; push the larger.
+                if (i - first <= last - j) {
+                    stack[ssize++] = new StackEntry(j, middle + m, last, (check & 2) | (next & 1));
+                    middle -= m;
+                    last = i;
+                    check &= 1;
+                } else {
+                    if (i == middle && middle == j) {
+                        next <<= 1;
+                    }
+                    stack[ssize++] = new StackEntry(first, middle - m, i, (check & 1) | (next & 2));
+                    first = j;
+                    middle += m;
+                    check = (check & 2) | (next & 1);
+                }
+            } else {
+                // Nothing to rotate: the halves are already ordered relative to each other.
+                if ((check & 1) != 0) {
+                    ssMergeCheckEqual(pa, depth, first);
+                }
+                ssMergeCheckEqual(pa, depth, middle);
+                if ((check & 2) != 0) {
+                    ssMergeCheckEqual(pa, depth, last);
+                }
+                if (ssize == 0) {
+                    return;
+                }
+                StackEntry entry = stack[--ssize];
+                first = entry.a;
+                middle = entry.b;
+                last = entry.c;
+                check = entry.d;
+            }
+        }
+    }
+
+    /**
+     * Sorts the suffixes in {@code SA[first..last)} (all sharing a common prefix of
+     * length {@code depth}) by intro-sorting fixed-size blocks of {@code SS_BLOCKSIZE}
+     * entries and binary-merging them, mirroring libdivsufsort's substring sort.
+     * When {@code lastsuffix} is set, the sentinel suffix stored just before
+     * {@code first} is inserted into its final position at the end.
+     */
+    private void subStringSort(final int pa, int first, final int last,
+            final int[] buf, final int bufoffset, final int bufsize,
+            final int depth, final boolean lastsuffix, final int size) {
+        final int[] SA = this.SA;
+
+        int a, b;
+        int[] curbuf;
+        int curbufoffset;
+        int i, j, k;
+        int curbufsize;
+
+        if (lastsuffix) {
+            ++first;
+        }
+        // Sort each SS_BLOCKSIZE-sized block, merging completed pairs as the block
+        // counter i gains trailing one-bits (bottom-up mergesort).
+        for (a = first, i = 0; a + SS_BLOCKSIZE < last; a += SS_BLOCKSIZE, ++i) {
+            ssMultiKeyIntroSort(pa, a, a + SS_BLOCKSIZE, depth);
+            // Use the tail of SA beyond the current block as scratch if it is larger
+            // than the caller-provided buffer.
+            curbuf = SA;
+            curbufoffset = a + SS_BLOCKSIZE;
+            curbufsize = last - (a + SS_BLOCKSIZE);
+            if (curbufsize <= bufsize) {
+                curbufsize = bufsize;
+                curbuf = buf;
+                curbufoffset = bufoffset;
+            }
+            for (b = a, k = SS_BLOCKSIZE, j = i; (j & 1) != 0; b -= k, k <<= 1, j >>>= 1) {
+                ssMerge(pa, b - k, b, b + k, curbuf, curbufoffset, curbufsize, depth);
+            }
+        }
+
+        // Sort the final partial block, then merge all remaining levels.
+        ssMultiKeyIntroSort(pa, a, last, depth);
+
+        for (k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) {
+            if ((i & 1) != 0) {
+                ssMerge(pa, a - k, a, last, buf, bufoffset, bufsize, depth);
+                a -= k;
+            }
+        }
+
+        if (lastsuffix) {
+            // Insertion step for the sentinel suffix saved at SA[first - 1].
+            int r;
+            for (a = first, i = SA[first - 1], r = 1;
+                    a < last && (SA[a] < 0 || 0 < (r = ssCompareLast(pa, pa + i, pa + SA[a], depth, size)));
+                    ++a) {
+                SA[a - 1] = SA[a];
+            }
+            if (r == 0) {
+                SA[a] = ~SA[a];
+            }
+            SA[a - 1] = i;
+        }
+    }
+
+ /*----------------------------------------------------------------------------*/
+
+ private int trGetC(final int isa, final int isaD, final int isaN, final int p) {
+ return isaD + p < isaN ?
+ SA[isaD + p]
+ : SA[isa + ((isaD - isa + p) % (isaN - isa))];
+ }
+
+    /**
+     * Heap sift-down for {@link #trHeapSort}: restores the max-heap property for the
+     * element at heap index {@code i} of the {@code size}-element heap rooted at
+     * {@code SA[sa]}, ordering by the ranks returned by {@link #trGetC}.
+     */
+    private void trFixdown(final int isa, final int isaD, final int isaN, final int sa, int i, final int size) {
+        final int[] SA = this.SA;
+
+        int j, k;
+        int v;
+        int c, d, e;
+
+        // v/c hold the sifted element and its key; children of i are 2i+1 and 2i+2.
+        for (v = SA[sa + i], c = trGetC(isa, isaD, isaN, v); (j = 2 * i + 1) < size; SA[sa + i] = SA[sa + k], i = k) {
+            k = j++;
+            d = trGetC(isa, isaD, isaN, SA[sa + k]);
+            // Pick the larger child. NOTE(review): SA[sa + j] is read without a j < size
+            // check; trHeapSort arranges an odd heap size so the last node always has
+            // two children — confirm when changing callers.
+            if (d < (e = trGetC(isa, isaD, isaN, SA[sa + j]))) {
+                k = j;
+                d = e;
+            }
+            if (d <= c) {
+                break;
+            }
+        }
+        SA[sa + i] = v;
+    }
+
+    /**
+     * Heap sort of {@code size} entries starting at {@code SA[sa]}, keyed by
+     * {@link #trGetC}. An even-sized heap is reduced to an odd size first (with the
+     * displaced maximum handled separately) so {@link #trFixdown} can assume every
+     * internal node has two children.
+     */
+    private void trHeapSort(final int isa, final int isaD, final int isaN, final int sa, final int size) {
+        final int[] SA = this.SA;
+
+        int i, m;
+        int t;
+
+        m = size;
+        if (size % 2 == 0) {
+            m--;
+            // Pre-position the dropped last element against the midpoint.
+            if (trGetC(isa, isaD, isaN, SA[sa + m / 2]) < trGetC(isa, isaD, isaN, SA[sa + m])) {
+                swapElements(SA, sa + m, SA, sa + m / 2);
+            }
+        }
+
+        // Build the max-heap bottom-up.
+        for (i = m / 2 - 1; 0 <= i; --i) {
+            trFixdown(isa, isaD, isaN, sa, i, m);
+        }
+
+        if (size % 2 == 0) {
+            swapElements(SA, sa, SA, sa + m);
+            trFixdown(isa, isaD, isaN, sa, 0, m);
+        }
+
+        // Standard extraction phase: move the max to the end, shrink, re-heapify.
+        for (i = m - 1; 0 < i; --i) {
+            t = SA[sa];
+            SA[sa] = SA[sa + i];
+            trFixdown(isa, isaD, isaN, sa, 0, i);
+            SA[sa + i] = t;
+        }
+    }
+
+    /**
+     * Insertion sort of {@code SA[first..last)} keyed by {@link #trGetC}. Entries that
+     * compare equal to their predecessor are complemented to mark the tie; negative
+     * entries encountered while shifting are skipped (they already mark equal runs).
+     */
+    private void trInsertionSort(final int isa, final int isaD, final int isaN, int first, int last) {
+        final int[] SA = this.SA;
+
+        int a, b;
+        int t, r;
+
+        for (a = first + 1; a < last; ++a) {
+            // Shift larger predecessors right until t's position is found.
+            for (t = SA[a], b = a - 1; 0 > (r = trGetC(isa, isaD, isaN, t) - trGetC(isa, isaD, isaN, SA[b]));) {
+                do {
+                    SA[b + 1] = SA[b];
+                } while (first <= --b && SA[b] < 0);
+                if (b < first) {
+                    break;
+                }
+            }
+            if (r == 0) {
+                // Tie with the element now to the left: mark it complemented.
+                SA[b] = ~SA[b];
+            }
+            SA[b + 1] = t;
+        }
+    }
+
+    /**
+     * Integer base-2 logarithm of {@code n} using the 256-entry {@code LOG_2_TABLE},
+     * selected by the highest non-zero byte of {@code n}.
+     *
+     * <p>Fix: the 16..23-bit branch previously read
+     * {@code LOG_2_TABLE[n >> 16 & 0xff + 16]} — because {@code +} binds tighter than
+     * {@code &}, that masked with {@code 0x10f} (allowing indices up to 271, past the
+     * 256-entry table) and lost the {@code + 16} offset entirely. The offset belongs
+     * outside the table lookup, matching the other three branches.
+     */
+    private static int trLog(int n) {
+        return (n & 0xffff0000) != 0 ?
+                  (n & 0xff000000) != 0 ? 24 + LOG_2_TABLE[n >> 24 & 0xff] : 16 + LOG_2_TABLE[n >> 16 & 0xff]
+                : (n & 0x0000ff00) != 0 ? 8 + LOG_2_TABLE[n >> 8 & 0xff] : LOG_2_TABLE[n & 0xff];
+    }
+
+    /**
+     * Returns whichever of the three positions {@code v1}, {@code v2}, {@code v3}
+     * holds the median rank (per {@link #trGetC}) among the three.
+     */
+    private int trMedian3(final int isa, final int isaD, final int isaN, int v1, int v2, int v3) {
+        final int[] SA = this.SA;
+
+        final int k1 = trGetC(isa, isaD, isaN, SA[v1]);
+        final int k2 = trGetC(isa, isaD, isaN, SA[v2]);
+        final int k3 = trGetC(isa, isaD, isaN, SA[v3]);
+
+        if (k1 > k2) {
+            if (k2 > k3) {
+                return v2;              // k1 > k2 > k3
+            }
+            return k1 > k3 ? v3 : v1;   // k2 <= k3: median is min(k1, k3)'s... the middle one
+        }
+        if (k2 > k3) {
+            // k1 <= k2 and k2 > k3: median is the larger of k1, k3.
+            return k1 > k3 ? v1 : v3;
+        }
+        return v2;                      // k1 <= k2 <= k3
+    }
+
+    /**
+     * Returns the position holding the median rank (per {@link #trGetC}) of the five
+     * positions {@code v1..v5}, using the classic five-element median network.
+     *
+     * <p>Fix: two of the exchange steps were truncated. The {@code SA_v2 > SA_v4}
+     * branch assigned {@code v4 = temp} without first doing {@code v2 = v4} (and
+     * likewise for the rank values), and the {@code SA_v1 > SA_v4} branch dropped both
+     * halves of the v1/v4 exchange and half of the v3/v5 exchange. Each step must be a
+     * complete swap of both the positions and their cached ranks, as in the reference
+     * implementation, otherwise the network returns a non-median element.
+     */
+    private int trMedian5(final int isa, final int isaD, final int isaN, int v1, int v2, int v3, int v4, int v5) {
+        final int[] SA = this.SA;
+
+        int SA_v1 = trGetC(isa, isaD, isaN, SA[v1]);
+        int SA_v2 = trGetC(isa, isaD, isaN, SA[v2]);
+        int SA_v3 = trGetC(isa, isaD, isaN, SA[v3]);
+        int SA_v4 = trGetC(isa, isaD, isaN, SA[v4]);
+        int SA_v5 = trGetC(isa, isaD, isaN, SA[v5]);
+        int temp;
+        int SA_vtemp;
+
+        if (SA_v2 > SA_v3) {
+            temp = v2;
+            v2 = v3;
+            v3 = temp;
+            SA_vtemp = SA_v2;
+            SA_v2 = SA_v3;
+            SA_v3 = SA_vtemp;
+        }
+        if (SA_v4 > SA_v5) {
+            temp = v4;
+            v4 = v5;
+            v5 = temp;
+            SA_vtemp = SA_v4;
+            SA_v4 = SA_v5;
+            SA_v5 = SA_vtemp;
+        }
+        if (SA_v2 > SA_v4) {
+            // Full swap of (v2, v4) and (v3, v5), keys included.
+            temp = v2;
+            v2 = v4;
+            v4 = temp;
+            SA_vtemp = SA_v2;
+            SA_v2 = SA_v4;
+            SA_v4 = SA_vtemp;
+            temp = v3;
+            v3 = v5;
+            v5 = temp;
+            SA_vtemp = SA_v3;
+            SA_v3 = SA_v5;
+            SA_v5 = SA_vtemp;
+        }
+        if (SA_v1 > SA_v3) {
+            temp = v1;
+            v1 = v3;
+            v3 = temp;
+            SA_vtemp = SA_v1;
+            SA_v1 = SA_v3;
+            SA_v3 = SA_vtemp;
+        }
+        if (SA_v1 > SA_v4) {
+            // Full swap of (v1, v4) and (v3, v5), keys included.
+            temp = v1;
+            v1 = v4;
+            v4 = temp;
+            SA_vtemp = SA_v1;
+            SA_v1 = SA_v4;
+            SA_v4 = SA_vtemp;
+            temp = v3;
+            v3 = v5;
+            v5 = temp;
+            SA_vtemp = SA_v3;
+            SA_v3 = SA_v5;
+            SA_v5 = SA_vtemp;
+        }
+        if (SA_v3 > SA_v4) {
+            return v4;
+        }
+        return v3;
+    }
+
+    /**
+     * Chooses a pivot position in {@code [first..last)} for the trsort partition:
+     * median of three for small ranges, median of five for medium ranges, and a
+     * pseudo-median of nine (median of three medians of three) for large ranges.
+     */
+    private int trPivot(final int isa, final int isaD, final int isaN, final int first, final int last) {
+        final int length = last - first;
+        final int middle = first + length / 2;
+
+        if (length > 512) {
+            // Large range: pseudo-median of nine sample points.
+            final int eighth = length >> 3;
+            return trMedian3(
+                    isa, isaD, isaN,
+                    trMedian3(isa, isaD, isaN, first, first + eighth, first + (eighth << 1)),
+                    trMedian3(isa, isaD, isaN, middle - eighth, middle, middle + eighth),
+                    trMedian3(isa, isaD, isaN, last - 1 - (eighth << 1), last - 1 - eighth, last - 1)
+            );
+        }
+        if (length > 32) {
+            // Medium range: median of five evenly spread sample points.
+            final int quarter = length >> 2;
+            return trMedian5(
+                    isa, isaD, isaN,
+                    first, first + quarter,
+                    middle,
+                    last - 1 - quarter, last - 1
+            );
+        }
+        // Small range: plain median of three.
+        return trMedian3(isa, isaD, isaN, first, middle, last - 1);
+    }
+
+ /*---------------------------------------------------------------------------*/
+
+    /**
+     * Larsson-Sadakane group update: walks {@code SA[first..last)}, rewriting ISA ranks
+     * for sorted (non-negative) runs and collapsing runs of complemented entries into a
+     * single combined group whose members all receive the rank of the run's end.
+     */
+    private void lsUpdateGroup(final int isa, final int first, final int last) {
+        final int[] SA = this.SA;
+
+        int a, b;
+        int t;
+
+        for (a = first; a < last; ++a) {
+            if (0 <= SA[a]) {
+                // Sorted run: each suffix gets its own rank; record the (negative)
+                // run length at the run head.
+                b = a;
+                do {
+                    SA[isa + SA[a]] = a;
+                } while (++a < last && 0 <= SA[a]);
+                SA[b] = b - a;
+                if (last <= a) {
+                    break;
+                }
+            }
+            // Complemented run: restore the entries and give every member the same
+            // rank (the end position of the run).
+            b = a;
+            do {
+                SA[a] = ~SA[a];
+            } while (SA[++a] < 0);
+            t = a;
+            do {
+                SA[isa + SA[b]] = t;
+            } while (++b <= a);
+        }
+    }
+
+    /**
+     * Introspective sort of one Larsson-Sadakane group {@code SA[first..last)}, keyed by
+     * {@link #trGetC}. Uses a ternary (three-way) quicksort partition with an explicit
+     * stack; falls back to insertion sort for small ranges and to heap sort when the
+     * recursion-depth budget {@code limit} is exhausted, then rewrites ISA ranks via
+     * {@link #lsUpdateGroup}.
+     */
+    private void lsIntroSort(final int isa, final int isaD, final int isaN, int first, int last) {
+        final int[] SA = this.SA;
+
+        final StackEntry[] stack = new StackEntry[STACK_SIZE];
+
+        int a, b, c, d, e, f;
+        int s, t;
+        int limit;
+        int v, x = 0;
+        int ssize;
+
+        for (ssize = 0, limit = trLog(last - first);;) {
+            if (last - first <= INSERTIONSORT_THRESHOLD) {
+                // Small range: insertion sort, then update group ranks.
+                if (1 < last - first) {
+                    trInsertionSort(isa, isaD, isaN, first, last);
+                    lsUpdateGroup(isa, first, last);
+                } else if (last - first == 1) {
+                    SA[first] = -1;
+                }
+                if (ssize == 0) {
+                    return;
+                }
+                StackEntry entry = stack[--ssize];
+                first = entry.a;
+                last = entry.b;
+                limit = entry.c;
+                continue;
+            }
+
+            if (limit-- == 0) {
+                // Depth budget exhausted: heap sort, mark equal runs, update ranks.
+                trHeapSort(isa, isaD, isaN, first, last - first);
+                for (a = last - 1; first < a; a = b) {
+                    for (x = trGetC(isa, isaD, isaN, SA[a]), b = a - 1;
+                            first <= b && trGetC(isa, isaD, isaN, SA[b]) == x;
+                            --b) {
+                        SA[b] = ~SA[b];
+                    }
+                }
+                lsUpdateGroup(isa, first, last);
+                if (ssize == 0) {
+                    return;
+                }
+                StackEntry entry = stack[--ssize];
+                first = entry.a;
+                last = entry.b;
+                limit = entry.c;
+                continue;
+            }
+
+            // Three-way partition around the pivot rank v (Bentley-McIlroy style:
+            // equal keys are collected at both ends, then swapped into the middle).
+            a = trPivot(isa, isaD, isaN, first, last);
+            swapElements(SA, first, SA, a);
+            v = trGetC(isa, isaD, isaN, SA[first]);
+
+            b = first + 1;
+            while (b < last && (x = trGetC(isa, isaD, isaN, SA[b])) == v) {
+                ++b;
+            }
+            if ((a = b) < last && x < v) {
+                while (++b < last && (x = trGetC(isa, isaD, isaN, SA[b])) <= v) {
+                    if (x == v) {
+                        swapElements(SA, b, SA, a);
+                        ++a;
+                    }
+                }
+            }
+
+            c = last - 1;
+            while (b < c && (x = trGetC(isa, isaD, isaN, SA[c])) == v) {
+                --c;
+            }
+            if (b < (d = c) && x > v) {
+                while (b < --c && (x = trGetC(isa, isaD, isaN, SA[c])) >= v) {
+                    if (x == v) {
+                        swapElements(SA, c, SA, d);
+                        --d;
+                    }
+                }
+            }
+            while (b < c) {
+                swapElements(SA, b, SA, c);
+                while (++b < c && (x = trGetC(isa, isaD, isaN, SA[b])) <= v) {
+                    if (x == v) {
+                        swapElements(SA, b, SA, a);
+                        ++a;
+                    }
+                }
+                while (b < --c && (x = trGetC(isa, isaD, isaN, SA[c])) >= v) {
+                    if (x == v) {
+                        swapElements(SA, c, SA, d);
+                        --d;
+                    }
+                }
+            }
+
+            if (a <= d) {
+                c = b - 1;
+
+                // Move the equal-to-pivot blocks from the ends into the middle.
+                if ((s = a - first) > (t = b - a)) {
+                    s = t;
+                }
+                for (e = first, f = b - s; 0 < s; --s, ++e, ++f) {
+                    swapElements(SA, e, SA, f);
+                }
+                if ((s = d - c) > (t = last - d - 1)) {
+                    s = t;
+                }
+                for (e = b, f = last - s; 0 < s; --s, ++e, ++f) {
+                    swapElements(SA, e, SA, f);
+                }
+
+                a = first + (b - a);
+                b = last - (d - c);
+
+                // Re-rank the "<" and "=" groups that are now in final relative order.
+                for (c = first, v = a - 1; c < a; ++c) {
+                    SA[isa + SA[c]] = v;
+                }
+                if (b < last) {
+                    for (c = a, v = b - 1; c < b; ++c) {
+                        SA[isa + SA[c]] = v;
+                    }
+                }
+                if ((b - a) == 1) {
+                    SA[a] = - 1;
+                }
+
+                // Recurse into the smaller of the two outer partitions; push the larger.
+                if (a - first <= last - b) {
+                    if (first < a) {
+                        stack[ssize++] = new StackEntry(b, last, limit, 0);
+                        last = a;
+                    } else {
+                        first = b;
+                    }
+                } else {
+                    if (b < last) {
+                        stack[ssize++] = new StackEntry(first, a, limit, 0);
+                        first = b;
+                    } else {
+                        last = a;
+                    }
+                }
+            } else {
+                if (ssize == 0) {
+                    return;
+                }
+                StackEntry entry = stack[--ssize];
+                first = entry.a;
+                last = entry.b;
+                limit = entry.c;
+            }
+        }
+    }
+
+    /**
+     * Larsson-Sadakane prefix-doubling sort: each pass sorts every unsorted group by
+     * the rank {@code depth} characters further on (the comparison depth doubles each
+     * iteration via {@code isaD += isaD - isa}) until every group is a singleton
+     * (signalled by {@code SA[0] == -n}) or the doubled depth exceeds {@code n}.
+     */
+    private void lsSort(final int isa, final int n, final int depth) {
+        final int[] SA = this.SA;
+
+        int isaD;
+        int first, last, i;
+        int t, skip;
+
+        for (isaD = isa + depth; -n < SA[0]; isaD += isaD - isa) {
+            first = 0;
+            skip = 0;
+            do {
+                if ((t = SA[first]) < 0) {
+                    // Negative entry = length of an already-sorted run; accumulate
+                    // adjacent runs into one combined skip record.
+                    first -= t;
+                    skip += t;
+                } else {
+                    if (skip != 0) {
+                        SA[first + skip] = skip;
+                        skip = 0;
+                    }
+                    // Group extends to the rank stored for its first member.
+                    last = SA[isa + t] + 1;
+                    lsIntroSort(isa, isaD, isa + n, first, last);
+                    first = last;
+                }
+            } while (first < n);
+            if (skip != 0) {
+                SA[first + skip] = skip;
+            }
+            if (n < isaD - isa) {
+                // Depth exceeded n: finalize ranks for any remaining groups and stop.
+                first = 0;
+                do {
+                    if ((t = SA[first]) < 0) {
+                        first -= t;
+                    } else {
+                        last = SA[isa + t] + 1;
+                        for (i = first; i < last; ++i) {
+                            SA[isa + SA[i]] = i;
+                        }
+                        first = last;
+                    }
+                } while (first < n);
+                break;
+            }
+        }
+    }
+
+ /*---------------------------------------------------------------------------*/
+
+    /**
+     * Immutable pair returned by {@link #trPartition}: the new {@code first} and
+     * {@code last} bounds of the middle (equal-to-pivot) region after partitioning.
+     */
+    private static class PartitionResult {
+        final int first;
+        final int last;
+
+        PartitionResult(final int first, final int last) {
+            this.first = first;
+            this.last = last;
+        }
+    }
+
+    /**
+     * Three-way (Bentley-McIlroy) partition of {@code SA[first..last)} around the rank
+     * value {@code v}, keyed by {@link #trGetC}. Equal keys are gathered at both ends
+     * during the scan and swapped into the middle afterwards. Returns the bounds of the
+     * equal-to-pivot middle region.
+     */
+    private PartitionResult trPartition(final int isa, final int isaD, final int isaN,
+            int first, int last, final int v) {
+        final int[] SA = this.SA;
+
+        int a, b, c, d, e, f;
+        int t, s;
+        int x = 0;
+
+        // Forward scan: b advances over keys <= v, parking keys == v at a.
+        b = first;
+        while (b < last && (x = trGetC(isa, isaD, isaN, SA[b])) == v) {
+            ++b;
+        }
+        if ((a = b) < last && x < v) {
+            while (++b < last && (x = trGetC(isa, isaD, isaN, SA[b])) <= v) {
+                if (x == v) {
+                    swapElements(SA, b, SA, a);
+                    ++a;
+                }
+            }
+        }
+
+        // Backward scan: c retreats over keys >= v, parking keys == v at d.
+        c = last - 1;
+        while (b < c && (x = trGetC(isa, isaD, isaN, SA[c])) == v) {
+            --c;
+        }
+        if (b < (d = c) && x > v) {
+            while (b < --c && (x = trGetC(isa, isaD, isaN, SA[c])) >= v) {
+                if (x == v) {
+                    swapElements(SA, c, SA, d);
+                    --d;
+                }
+            }
+        }
+        // Exchange out-of-place pairs until the scans meet.
+        while (b < c) {
+            swapElements(SA, b, SA, c);
+            while (++b < c && (x = trGetC(isa, isaD, isaN, SA[b])) <= v) {
+                if (x == v) {
+                    swapElements(SA, b, SA, a);
+                    ++a;
+                }
+            }
+            while (b < --c && (x = trGetC(isa, isaD, isaN, SA[c])) >= v) {
+                if (x == v) {
+                    swapElements(SA, c, SA, d);
+                    --d;
+                }
+            }
+        }
+
+        if (a <= d) {
+            // Swap the equal-key blocks from the ends into the middle.
+            c = b - 1;
+            if ((s = a - first) > (t = b - a)) {
+                s = t;
+            }
+            for (e = first, f = b - s; 0 < s; --s, ++e, ++f) {
+                swapElements(SA, e, SA, f);
+            }
+            if ((s = d - c) > (t = last - d - 1)) {
+                s = t;
+            }
+            for (e = b, f = last - s; 0 < s; --s, ++e, ++f) {
+                swapElements(SA, e, SA, f);
+            }
+            first += b - a;
+            last -= d - c;
+        }
+        return new PartitionResult(first, last);
+    }
+
+    /**
+     * trsort "copy" phase: derives the order of unsorted suffixes from the already
+     * sorted ranges surrounding {@code [a..b)}. For each sorted suffix, the suffix
+     * {@code depth} positions earlier (wrapping within the ISA range) that still
+     * belongs to group {@code v} is placed next — first scanning left-to-right before
+     * the group, then right-to-left after it.
+     */
+    private void trCopy(final int isa, final int isaN, final int first,
+            final int a, final int b, final int last, final int depth) {
+        final int[] SA = this.SA;
+
+        int c, d, e;
+        int s, v;
+
+        v = b - 1;
+
+        for (c = first, d = a - 1; c <= d; ++c) {
+            if ((s = SA[c] - depth) < 0) {
+                // Wrap around the circular string.
+                s += isaN - isa;
+            }
+            if (SA[isa + s] == v) {
+                SA[++d] = s;
+                SA[isa + s] = d;
+            }
+        }
+        for (c = last - 1, e = d + 1, d = b; e < d; --c) {
+            if ((s = SA[c] - depth) < 0) {
+                s += isaN - isa;
+            }
+            if (SA[isa + s] == v) {
+                SA[--d] = s;
+                SA[isa + s] = d;
+            }
+        }
+    }
+
+    /**
+     * Core trsort introspective sort of {@code SA[first..last)} at comparison depth
+     * {@code isaD - isa}, with an explicit stack. Negative {@code limit} values encode
+     * special phases: -1 = partition by the previous depth then re-rank, -2 = run the
+     * {@link #trCopy} phase for the range popped alongside, -3 = finalize ranks of a
+     * freshly sorted range. {@code budget} bounds the total work; when it is exhausted
+     * the method breaks out and pending -3 entries are flushed via
+     * {@link #lsUpdateGroup} so the caller can fall back to {@link #lsSort}.
+     */
+    private void trIntroSort(final int isa, int isaD, int isaN, int first,
+            int last, final TRBudget budget, final int size) {
+        final int[] SA = this.SA;
+
+        final StackEntry[] stack = new StackEntry[STACK_SIZE];
+
+        int a, b, c, d, e, f;
+        int s, t;
+        int v, x = 0;
+        int limit, next;
+        int ssize;
+
+        for (ssize = 0, limit = trLog(last - first);;) {
+            if (limit < 0) {
+                if (limit == -1) {
+                    // Phase -1: partition by the previous depth, then re-rank and
+                    // schedule the copy phase (-2) for this range.
+                    if (!budget.update(size, last - first)) {
+                        break;
+                    }
+                    PartitionResult result = trPartition(isa, isaD - 1, isaN, first, last, last - 1);
+                    a = result.first;
+                    b = result.last;
+                    if (first < a || b < last) {
+                        if (a < last) {
+                            for (c = first, v = a - 1; c < a; ++c) {
+                                SA[isa + SA[c]] = v;
+                            }
+                        }
+                        if (b < last) {
+                            for (c = a, v = b - 1; c < b; ++c) {
+                                SA[isa + SA[c]] = v;
+                            }
+                        }
+
+                        stack[ssize++] = new StackEntry(0, a, b, 0);
+                        stack[ssize++] = new StackEntry(isaD - 1, first, last, -2);
+                        if (a - first <= last - b) {
+                            if (1 < a - first) {
+                                stack[ssize++] = new StackEntry(isaD, b, last, trLog(last - b));
+                                last = a; limit = trLog(a - first);
+                            } else if (1 < last - b) {
+                                first = b; limit = trLog(last - b);
+                            } else {
+                                if (ssize == 0) {
+                                    return;
+                                }
+                                StackEntry entry = stack[--ssize];
+                                isaD = entry.a;
+                                first = entry.b;
+                                last = entry.c;
+                                limit = entry.d;
+                            }
+                        } else {
+                            if (1 < last - b) {
+                                stack[ssize++] = new StackEntry(isaD, first, a, trLog(a - first));
+                                first = b;
+                                limit = trLog(last - b);
+                            } else if (1 < a - first) {
+                                last = a;
+                                limit = trLog(a - first);
+                            } else {
+                                if (ssize == 0) {
+                                    return;
+                                }
+                                StackEntry entry = stack[--ssize];
+                                isaD = entry.a;
+                                first = entry.b;
+                                last = entry.c;
+                                limit = entry.d;
+                            }
+                        }
+                    } else {
+                        // Whole range equal: every suffix gets its own rank.
+                        for (c = first; c < last; ++c) {
+                            SA[isa + SA[c]] = c;
+                        }
+                        if (ssize == 0) {
+                            return;
+                        }
+                        StackEntry entry = stack[--ssize];
+                        isaD = entry.a;
+                        first = entry.b;
+                        last = entry.c;
+                        limit = entry.d;
+                    }
+                } else if (limit == -2) {
+                    // Phase -2: run the copy phase using the range saved below this entry.
+                    a = stack[--ssize].b;
+                    b = stack[ssize].c;
+                    trCopy(isa, isaN, first, a, b, last, isaD - isa);
+                    if (ssize == 0) {
+                        return;
+                    }
+                    StackEntry entry = stack[--ssize];
+                    isaD = entry.a;
+                    first = entry.b;
+                    last = entry.c;
+                    limit = entry.d;
+                } else {
+                    // Phase -3: finalize ranks of sorted entries and re-enter the
+                    // first remaining unsorted (complemented) run.
+                    if (0 <= SA[first]) {
+                        a = first;
+                        do {
+                            SA[isa + SA[a]] = a;
+                        } while (++a < last && 0 <= SA[a]);
+                        first = a;
+                    }
+                    if (first < last) {
+                        a = first;
+                        do {
+                            SA[a] = ~SA[a];
+                        } while (SA[++a] < 0);
+                        next = SA[isa + SA[a]] != SA[isaD + SA[a]] ? trLog(a - first + 1) : -1;
+                        if (++a < last) {
+                            for (b = first, v = a - 1; b < a; ++b) {
+                                SA[isa + SA[b]] = v;
+                            }
+                        }
+
+                        if (a - first <= last - a) {
+                            stack[ssize++] = new StackEntry(isaD, a, last, -3);
+                            isaD += 1; last = a; limit = next;
+                        } else {
+                            if (1 < last - a) {
+                                stack[ssize++] = new StackEntry(isaD + 1, first, a, next);
+                                first = a; limit = -3;
+                            } else {
+                                isaD += 1; last = a; limit = next;
+                            }
+                        }
+                    } else {
+                        if (ssize == 0) {
+                            return;
+                        }
+                        StackEntry entry = stack[--ssize];
+                        isaD = entry.a;
+                        first = entry.b;
+                        last = entry.c;
+                        limit = entry.d;
+                    }
+                }
+                continue;
+            }
+
+            if (last - first <= INSERTIONSORT_THRESHOLD) {
+                if (!budget.update(size, last - first)) {
+                    break;
+                }
+                trInsertionSort(isa, isaD, isaN, first, last);
+                limit = -3;
+                continue;
+            }
+
+            if (limit-- == 0) {
+                // Depth budget exhausted: heap sort, then mark equal runs.
+                if (!budget.update(size, last - first)) {
+                    break;
+                }
+                trHeapSort(isa, isaD, isaN, first, last - first);
+                for (a = last - 1; first < a; a = b) {
+                    for (x = trGetC(isa, isaD, isaN, SA[a]), b = a - 1;
+                            first <= b && trGetC(isa, isaD, isaN, SA[b]) == x;
+                            --b) {
+                        SA[b] = ~SA[b];
+                    }
+                }
+                limit = -3;
+                continue;
+            }
+
+            // Three-way partition around the pivot rank v (same scheme as trPartition).
+            a = trPivot(isa, isaD, isaN, first, last);
+
+            swapElements(SA, first, SA, a);
+            v = trGetC(isa, isaD, isaN, SA[first]);
+
+            b = first + 1;
+            while (b < last && (x = trGetC(isa, isaD, isaN, SA[b])) == v) {
+                ++b;
+            }
+            if ((a = b) < last && x < v) {
+                while (++b < last && (x = trGetC(isa, isaD, isaN, SA[b])) <= v) {
+                    if (x == v) {
+                        swapElements(SA, b, SA, a);
+                        ++a;
+                    }
+                }
+            }
+
+            c = last - 1;
+            while (b < c && (x = trGetC(isa, isaD, isaN, SA[c])) == v) {
+                --c;
+            }
+            if (b < (d = c) && x > v) {
+                while (b < --c && (x = trGetC(isa, isaD, isaN, SA[c])) >= v) {
+                    if (x == v) {
+                        swapElements(SA, c, SA, d);
+                        --d;
+                    }
+                }
+            }
+            while (b < c) {
+                swapElements(SA, b, SA, c);
+                while (++b < c && (x = trGetC(isa, isaD, isaN, SA[b])) <= v) {
+                    if (x == v) {
+                        swapElements(SA, b, SA, a);
+                        ++a;
+                    }
+                }
+                while (b < --c && (x = trGetC(isa, isaD, isaN, SA[c])) >= v) {
+                    if (x == v) {
+                        swapElements(SA, c, SA, d);
+                        --d;
+                    }
+                }
+            }
+
+            if (a <= d) {
+                c = b - 1;
+
+                // Move equal-to-pivot blocks from the ends into the middle.
+                if ((s = a - first) > (t = b - a)) {
+                    s = t;
+                }
+                for (e = first, f = b - s; 0 < s; --s, ++e, ++f) {
+                    swapElements(SA, e, SA, f);
+                }
+                if ((s = d - c) > (t = last - d - 1)) {
+                    s = t;
+                }
+                for (e = b, f = last - s; 0 < s; --s, ++e, ++f) {
+                    swapElements(SA, e, SA, f);
+                }
+
+                a = first + (b - a);
+                b = last - (d - c);
+                next = SA[isa + SA[a]] != v ? trLog(b - a) : -1;
+
+                // Re-rank the "<" group and, if present, the "=" group.
+                for (c = first, v = a - 1; c < a; ++c) {
+                    SA[isa + SA[c]] = v;
+                }
+                if (b < last) {
+                    for (c = a, v = b - 1; c < b; ++c) {
+                        SA[isa + SA[c]] = v; }
+                }
+
+                // Push the two larger of the three partitions; continue with the smallest.
+                if (a - first <= last - b) {
+                    if (last - b <= b - a) {
+                        if (1 < a - first) {
+                            stack[ssize++] = new StackEntry(isaD + 1, a, b, next);
+                            stack[ssize++] = new StackEntry(isaD, b, last, limit);
+                            last = a;
+                        } else if (1 < last - b) {
+                            stack[ssize++] = new StackEntry(isaD + 1, a, b, next);
+                            first = b;
+                        } else if (1 < b - a) {
+                            isaD += 1;
+                            first = a;
+                            last = b;
+                            limit = next;
+                        } else {
+                            if (ssize == 0) {
+                                return;
+                            }
+                            StackEntry entry = stack[--ssize];
+                            isaD = entry.a;
+                            first = entry.b;
+                            last = entry.c;
+                            limit = entry.d;
+                        }
+                    } else if (a - first <= b - a) {
+                        if (1 < a - first) {
+                            stack[ssize++] = new StackEntry(isaD, b, last, limit);
+                            stack[ssize++] = new StackEntry(isaD + 1, a, b, next);
+                            last = a;
+                        } else if (1 < b - a) {
+                            stack[ssize++] = new StackEntry(isaD, b, last, limit);
+                            isaD += 1;
+                            first = a;
+                            last = b;
+                            limit = next;
+                        } else {
+                            first = b;
+                        }
+                    } else {
+                        if (1 < b - a) {
+                            stack[ssize++] = new StackEntry(isaD, b, last, limit);
+                            stack[ssize++] = new StackEntry(isaD, first, a, limit);
+                            isaD += 1;
+                            first = a;
+                            last = b;
+                            limit = next;
+                        } else {
+                            stack[ssize++] = new StackEntry(isaD, b, last, limit);
+                            last = a;
+                        }
+                    }
+                } else {
+                    if (a - first <= b - a) {
+                        if (1 < last - b) {
+                            stack[ssize++] = new StackEntry(isaD + 1, a, b, next);
+                            stack[ssize++] = new StackEntry(isaD, first, a, limit);
+                            first = b;
+                        } else if (1 < a - first) {
+                            stack[ssize++] = new StackEntry(isaD + 1, a, b, next);
+                            last = a;
+                        } else if (1 < b - a) {
+                            isaD += 1;
+                            first = a;
+                            last = b;
+                            limit = next;
+                        } else {
+                            stack[ssize++] = new StackEntry(isaD, first, last, limit);
+                        }
+                    } else if (last - b <= b - a) {
+                        if (1 < last - b) {
+                            stack[ssize++] = new StackEntry(isaD, first, a, limit);
+                            stack[ssize++] = new StackEntry(isaD + 1, a, b, next);
+                            first = b;
+                        } else if (1 < b - a) {
+                            stack[ssize++] = new StackEntry(isaD, first, a, limit);
+                            isaD += 1;
+                            first = a;
+                            last = b;
+                            limit = next;
+                        } else {
+                            last = a;
+                        }
+                    } else {
+                        if (1 < b - a) {
+                            stack[ssize++] = new StackEntry(isaD, first, a, limit);
+                            stack[ssize++] = new StackEntry(isaD, b, last, limit);
+                            isaD += 1;
+                            first = a;
+                            last = b;
+                            limit = next;
+                        } else {
+                            stack[ssize++] = new StackEntry(isaD, first, a, limit);
+                            first = b;
+                        }
+                    }
+                }
+            } else {
+                // All keys equal at this depth: go one character deeper.
+                if (!budget.update(size, last - first)) {
+                    break; // BUGFIX : Added to prevent an infinite loop in the original code
+                }
+                limit += 1; isaD += 1;
+            }
+        }
+
+        // Budget exhausted: flush pending rank updates so lsSort can take over.
+        for (s = 0; s < ssize; ++s) {
+            if (stack[s].d == -3) {
+                lsUpdateGroup(isa, stack[s].b, stack[s].c);
+            }
+        }
+    }
+
+    /**
+     * Work budget for {@link #trIntroSort}: tracks how much sorting effort remains.
+     * Each time the budget is spent, one "chance" is consumed and the budget is
+     * refilled; when no chances remain the sort reports failure so the caller can
+     * fall back to the Larsson-Sadakane algorithm.
+     */
+    private static class TRBudget {
+        int budget;
+        int chance;
+
+        TRBudget(final int budget, final int chance) {
+            this.budget = budget;
+            this.chance = chance;
+        }
+
+        /**
+         * Charges {@code n} units of work. Returns {@code false} once the budget has
+         * been emptied {@code chance} times; otherwise refills by {@code size} and
+         * returns {@code true}.
+         */
+        boolean update(final int size, final int n) {
+            budget -= n;
+            if (budget > 0) {
+                return true;
+            }
+            --chance;
+            if (chance == 0) {
+                return false;
+            }
+            budget += size;
+            return true;
+        }
+    }
+
+    /**
+     * Tandem-repeat sort driver: runs {@link #trIntroSort} over each unsorted group of
+     * {@code SA[0..n)} under a work budget. If the budget is exhausted mid-way, the
+     * remaining work is completed with the slower but bounded Larsson-Sadakane sort
+     * ({@link #lsSort}). {@code SA[0] == -n} means everything is already sorted.
+     */
+    private void trSort(final int isa, final int n, final int depth) {
+        final int[] SA = this.SA;
+
+        int first = 0, last;
+        int t;
+
+        if (-n < SA[0]) {
+            TRBudget budget = new TRBudget(n, trLog(n) * 2 / 3 + 1);
+            do {
+                if ((t = SA[first]) < 0) {
+                    // Negative entry = length of an already-sorted run; skip it.
+                    first -= t;
+                } else {
+                    last = SA[isa + t] + 1;
+                    if (1 < last - first) {
+                        trIntroSort(isa, isa + depth, isa + n, first, last, budget, n);
+                        if (budget.chance == 0) {
+                            /* Switch to Larsson-Sadakane sorting algorithm */
+                            if (0 < first) {
+                                SA[0] = -first;
+                            }
+                            lsSort(isa, n, depth);
+                            break;
+                        }
+                    }
+                    first = last;
+                }
+            } while (first < n);
+        }
+    }
+
+ /*---------------------------------------------------------------------------*/
+
+    /**
+     * Index into {@code bucketB} for a type-B suffix starting with characters
+     * {@code c0}, {@code c1}: second character in the high byte.
+     */
+    private static int BUCKET_B(final int c0, final int c1) {
+        return c0 | c1 << 8;
+    }
+
+    /**
+     * Index into {@code bucketB} for a type-B* suffix starting with characters
+     * {@code c0}, {@code c1}: first character in the high byte (disjoint from the
+     * {@code BUCKET_B} indices because B* always has {@code c0 < c1}).
+     */
+    private static int BUCKET_BSTAR(final int c0, final int c1) {
+        return c1 | c0 << 8;
+    }
+
+    /**
+     * First stage of the divsufsort algorithm: counts type-A/B/B* suffixes into the
+     * bucket tables, sorts all B* suffixes (first by their leading two characters,
+     * then fully via {@link #subStringSort} and {@link #trSort}), and scatters the
+     * sorted B* suffixes into their final bucket positions in {@code SA}.
+     *
+     * @return the number of B* suffixes ({@code 0} when the input is a run of a
+     *         single repeated character)
+     */
+    private int sortTypeBstar(final int[] bucketA, final int[] bucketB) {
+        final byte[] T = this.T;
+        final int[] SA = this.SA;
+        final int n = this.n;
+        final int[] tempbuf = new int[256];
+
+        int[] buf;
+        int PAb, ISAb, bufoffset;
+        int i, j, k, t, m, bufsize;
+        int c0, c1;
+        int flag;
+
+        // flag = 1 while the text looks like a non-increasing prefix (affects how the
+        // wrap-around suffix at n-1 is classified below).
+        for (i = 1, flag = 1; i < n; ++i) {
+            if (T[i - 1] != T[i]) {
+                if ((T[i - 1] & 0xff) > (T[i] & 0xff)) {
+                    flag = 0;
+                }
+                break;
+            }
+        }
+        i = n - 1;
+        m = n;
+
+        // Classify each suffix (scanning right to left) and count it into bucketA
+        // (type A), bucketB/BUCKET_B (type B) or bucketB/BUCKET_BSTAR (type B*).
+        // B* suffix positions are collected at the top of SA.
+        int ti, ti1, t0;
+        if ((ti = T[i] & 0xff) < (t0 = T[0] & 0xff) || (T[i] == T[0] && flag != 0)) {
+            if (flag == 0) {
+                ++bucketB[BUCKET_BSTAR(ti, t0)];
+                SA[--m] = i;
+            } else {
+                ++bucketB[BUCKET_B(ti, t0)];
+            }
+            for (--i; 0 <= i && (ti = T[i] & 0xff) <= (ti1 = T[i + 1] & 0xff); --i) {
+                ++bucketB[BUCKET_B(ti, ti1)];
+            }
+        }
+
+        while (0 <= i) {
+            do {
+                ++bucketA[T[i] & 0xff];
+            } while (0 <= --i && (T[i] & 0xff) >= (T[i + 1] & 0xff));
+            if (0 <= i) {
+                ++bucketB[BUCKET_BSTAR(T[i] & 0xff, T[i + 1] & 0xff)];
+                SA[--m] = i;
+                for (--i; 0 <= i && (ti = T[i] & 0xff) <= (ti1 = T[i + 1] & 0xff); --i) {
+                    ++bucketB[BUCKET_B(ti, ti1)];
+                }
+            }
+        }
+        m = n - m;
+        if (m == 0) {
+            // Degenerate input (single repeated character): SA is the identity.
+            for (i = 0; i < n; ++i) {
+                SA[i] = i;
+            }
+            return 0;
+        }
+
+        // Convert the counts into bucket start offsets (prefix sums).
+        for (c0 = 0, i = -1, j = 0; c0 < 256; ++c0) {
+            t = i + bucketA[c0];
+            bucketA[c0] = i + j;
+            i = t + bucketB[BUCKET_B(c0, c0)];
+            for (c1 = c0 + 1; c1 < 256; ++c1) {
+                j += bucketB[BUCKET_BSTAR(c0, c1)];
+                bucketB[(c0 << 8) | c1] = j;
+                i += bucketB[BUCKET_B(c0, c1)];
+            }
+        }
+
+        // Bucket-sort the B* suffixes by their first two characters; SA[PAb..n)
+        // holds the suffix positions, SA[0..m) their indices in bucket order.
+        PAb = n - m;
+        ISAb = m;
+        for (i = m - 2; 0 <= i; --i) {
+            t = SA[PAb + i];
+            c0 = T[t] & 0xff;
+            c1 = T[t + 1] & 0xff;
+            SA[--bucketB[BUCKET_BSTAR(c0, c1)]] = i;
+        }
+        t = SA[PAb + m - 1];
+        c0 = T[t] & 0xff;
+        c1 = T[t + 1] & 0xff;
+        SA[--bucketB[BUCKET_BSTAR(c0, c1)]] = m - 1;
+
+        // Pick the scratch buffer for the substring sort: the unused tail of SA if
+        // large enough, otherwise the fixed 256-entry temp buffer.
+        buf = SA;
+        bufoffset = m;
+        bufsize = n - 2 * m;
+        if (bufsize <= 256) {
+            buf = tempbuf;
+            bufoffset = 0;
+            bufsize = 256;
+        }
+
+        // Sort each two-character bucket of B* suffixes to full depth.
+        for (c0 = 255, j = m; 0 < j; --c0) {
+            for (c1 = 255; c0 < c1; j = i, --c1) {
+                i = bucketB[BUCKET_BSTAR(c0, c1)];
+                if (1 < j - i) {
+                    subStringSort(PAb, i, j, buf, bufoffset, bufsize, 2, SA[i] == m - 1, n);
+                }
+            }
+        }
+
+        // Build the inverse (rank) array ISAb from the (partially tied) order,
+        // compacting complemented runs of equal substrings.
+        for (i = m - 1; 0 <= i; --i) {
+            if (0 <= SA[i]) {
+                j = i;
+                do {
+                    SA[ISAb + SA[i]] = i;
+                } while (0 <= --i && 0 <= SA[i]);
+                SA[i + 1] = i - j;
+                if (i <= 0) {
+                    break;
+                }
+            }
+            j = i;
+            do {
+                SA[ISAb + (SA[i] = ~SA[i])] = j;
+            } while (SA[--i] < 0);
+            SA[ISAb + SA[i]] = j;
+        }
+
+        // Resolve remaining ties with the tandem-repeat sort.
+        trSort(ISAb, m, 1);
+
+        // Re-derive the B* positions (same right-to-left classification as above)
+        // and place each at its rank.
+        i = n - 1; j = m;
+        if ((T[i] & 0xff) < (T[0] & 0xff) || (T[i] == T[0] && flag != 0)) {
+            if (flag == 0) {
+                SA[SA[ISAb + --j]] = i;
+            }
+            for (--i; 0 <= i && (T[i] & 0xff) <= (T[i + 1] & 0xff);) {
+                --i;
+            }
+        }
+        while (0 <= i) {
+            for (--i; 0 <= i && (T[i] & 0xff) >= (T[i + 1] & 0xff);) {
+                --i;
+            }
+            if (0 <= i) {
+                SA[SA[ISAb + --j]] = i;
+                for (--i; 0 <= i && (T[i] & 0xff) <= (T[i + 1] & 0xff);) {
+                    --i;
+                }
+            }
+        }
+
+        // Scatter the sorted B* suffixes into the ends of their buckets and set the
+        // bucket boundaries used by constructBWT.
+        for (c0 = 255, i = n - 1, k = m - 1; 0 <= c0; --c0) {
+            for (c1 = 255; c0 < c1; --c1) {
+                t = i - bucketB[BUCKET_B(c0, c1)];
+                bucketB[BUCKET_B(c0, c1)] = i + 1;
+
+                for (i = t, j = bucketB[BUCKET_BSTAR(c0, c1)]; j <= k; --i, --k) {
+                    SA[i] = SA[k];
+                }
+            }
+            t = i - bucketB[BUCKET_B(c0, c0)];
+            bucketB[BUCKET_B(c0, c0)] = i + 1;
+            if (c0 < 255) {
+                bucketB[BUCKET_BSTAR(c0, c0 + 1)] = t + 1;
+            }
+            i = bucketA[c0];
+        }
+        return m;
+    }
+
+    /**
+     * Second stage of divsufsort: induces the order of all type-B suffixes from the
+     * sorted B* suffixes (right-to-left pass), then all type-A suffixes (left-to-right
+     * pass), writing the BWT output characters into {@code SA} as it goes.
+     *
+     * @return the index within the output of the transformed string's origin
+     *         (the row corresponding to the original text), or -1 if not found
+     */
+    private int constructBWT(final int[] bucketA, final int[] bucketB) {
+        final byte[] T = this.T;
+        final int[] SA = this.SA;
+        final int n = this.n;
+
+        int i, j, t = 0;
+        int s, s1;
+        int c0, c1, c2 = 0;
+        int orig = -1;
+
+        // Induce type-B suffixes: scan each c1 bucket right-to-left, placing the
+        // predecessor suffix s-1 when it also sorts before c1. Entries are
+        // complemented to mark them as processed.
+        for (c1 = 254; 0 <= c1; --c1) {
+            for (i = bucketB[BUCKET_BSTAR(c1, c1 + 1)], j = bucketA[c1 + 1], t = 0, c2 = -1;
+                    i <= j;
+                    --j) {
+                if (0 <= (s1 = s = SA[j])) {
+                    if (--s < 0) {
+                        s = n - 1;
+                    }
+                    if ((c0 = T[s] & 0xff) <= c1) {
+                        SA[j] = ~s1;
+                        if (0 < s && (T[s - 1] & 0xff) > c0) {
+                            s = ~s;
+                        }
+                        if (c2 == c0) {
+                            SA[--t] = s;
+                        } else {
+                            if (0 <= c2) {
+                                bucketB[BUCKET_B(c2, c1)] = t;
+                            }
+                            SA[t = bucketB[BUCKET_B(c2 = c0, c1)] - 1] = s;
+                        }
+                    }
+                } else {
+                    SA[j] = ~s;
+                }
+            }
+        }
+
+        // Induce type-A suffixes left-to-right and emit the BWT character
+        // (the character preceding each suffix) into SA.
+        for (i = 0; i < n; ++i) {
+            if (0 <= (s1 = s = SA[i])) {
+                if (--s < 0) {
+                    s = n - 1;
+                }
+                if ((c0 = T[s] & 0xff) >= (T[s + 1] & 0xff)) {
+                    if (0 < s && (T[s - 1] & 0xff) < c0) {
+                        s = ~s;
+                    }
+                    if (c0 == c2) {
+                        SA[++t] = s;
+                    } else {
+                        if (c2 != -1) {
+                            bucketA[c2] = t; // BUGFIX: Original code can write to bucketA[-1]
+                        }
+                        SA[t = bucketA[c2 = c0] + 1] = s;
+                    }
+                }
+            } else {
+                s1 = ~s1;
+            }
+
+            if (s1 == 0) {
+                // Suffix 0: its BWT character wraps to the last text byte, and this
+                // row is the origin index returned to the caller.
+                SA[i] = T[n - 1];
+                orig = i;
+            } else {
+                SA[i] = T[s1 - 1];
+            }
+        }
+        return orig;
+    }
+
+    /**
+     * Performs a Burrows Wheeler Transform on the input array.
+     * @return the index of the first character of the input array within the output array
+     */
+    public int bwt() {
+        final int length = n;
+
+        // Trivial inputs need no sorting.
+        if (length == 0) {
+            return 0;
+        }
+        if (length == 1) {
+            SA[0] = T[0];
+            return 0;
+        }
+
+        final int[] bucketA = new int[BUCKET_A_SIZE];
+        final int[] bucketB = new int[BUCKET_B_SIZE];
+
+        final int m = sortTypeBstar(bucketA, bucketB);
+        return m > 0 ? constructBWT(bucketA, bucketB) : 0;
+    }
+}
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Encoder.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Encoder.java
new file mode 100644
index 000000000000..20be51ac9d8f
--- /dev/null
+++ b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2Encoder.java
@@ -0,0 +1,263 @@
+/*
+ * Copyright 2014 The Netty Project
+ *
+ * The Netty Project licenses this file to you under the Apache License,
+ * version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package io.netty.handler.codec.compression;
+
+import io.netty.buffer.ByteBuf;
+import io.netty.channel.ChannelFuture;
+import io.netty.channel.ChannelFutureListener;
+import io.netty.channel.ChannelHandlerContext;
+import io.netty.channel.ChannelPipeline;
+import io.netty.channel.ChannelPromise;
+import io.netty.channel.ChannelPromiseNotifier;
+import io.netty.handler.codec.MessageToByteEncoder;
+import io.netty.util.concurrent.EventExecutor;
+
+import java.util.concurrent.TimeUnit;
+
+import static io.netty.handler.codec.compression.Bzip2Constants.*;
+
+/**
+ * Compresses a {@link ByteBuf} using the Bzip2 algorithm.
+ *
+ * See <a href="https://en.wikipedia.org/wiki/Bzip2">Bzip2</a>.
+ */
+public class Bzip2Encoder extends MessageToByteEncoder {
+ /**
+ * Current state of stream.
+ */
+ private enum State {
+ INIT,
+ INIT_BLOCK,
+ WRITE_DATA,
+ CLOSE_BLOCK,
+ EOF
+ }
+
+ private State currentState = State.INIT;
+
+ /**
+ * A writer that provides bit-level writes.
+ */
+ private final Bzip2BitWriter writer = new Bzip2BitWriter();
+
+ /**
+ * The declared maximum block size of the stream (before final run-length decoding).
+ */
+ private final int streamBlockSize;
+
+ /**
+ * The merged CRC of all blocks compressed so far.
+ */
+ private int streamCRC;
+
+ /**
+ * The compressor for the current block.
+ */
+ private Bzip2BlockCompressor blockCompressor;
+
+ /**
+ * {@code true} if the compressed stream has been finished, otherwise {@code false}.
+ */
+ private volatile boolean finished;
+
+ /**
+ * Used to interact with its {@link ChannelPipeline} and other handlers.
+ */
+ private volatile ChannelHandlerContext ctx;
+
+ /**
+ * Creates a new bzip2 encoder with the maximum (900,000 byte) block size.
+ */
+ public Bzip2Encoder() {
+ this(MAX_BLOCK_SIZE);
+ }
+
+ /**
+ * Creates a new bzip2 encoder with the specified {@code blockSizeMultiplier}.
+ * @param blockSizeMultiplier
+ * The Bzip2 block size as a multiple of 100,000 bytes (minimum {@code 1}, maximum {@code 9}).
+ * Larger block sizes require more memory for both compression and decompression,
+ * but give better compression ratios. {@code 9} will usually be the best value to use.
+ */
+ public Bzip2Encoder(final int blockSizeMultiplier) {
+ if (blockSizeMultiplier < MIN_BLOCK_SIZE || blockSizeMultiplier > MAX_BLOCK_SIZE) {
+ throw new IllegalArgumentException(
+ "blockSizeMultiplier: " + blockSizeMultiplier + " (expected: 1-9)");
+ }
+ streamBlockSize = blockSizeMultiplier * BASE_BLOCK_SIZE;
+ }
+
+ @Override
+ protected void encode(ChannelHandlerContext ctx, ByteBuf in, ByteBuf out) throws Exception {
+ if (finished) {
+ out.writeBytes(in);
+ return;
+ }
+
+ for (;;) {
+ switch (currentState) {
+ case INIT:
+ out.ensureWritable(4);
+ out.writeMedium(MAGIC_NUMBER);
+ out.writeByte('0' + streamBlockSize / BASE_BLOCK_SIZE);
+ currentState = State.INIT_BLOCK;
+ case INIT_BLOCK:
+ blockCompressor = new Bzip2BlockCompressor(writer, streamBlockSize);
+ currentState = State.WRITE_DATA;
+ case WRITE_DATA:
+ if (!in.isReadable()) {
+ return;
+ }
+ Bzip2BlockCompressor blockCompressor = this.blockCompressor;
+ final int length = in.readableBytes() < blockCompressor.availableSize() ?
+ in.readableBytes() : blockCompressor.availableSize();
+ final int offset;
+ final byte[] array;
+ if (in.hasArray()) {
+ array = in.array();
+ offset = in.arrayOffset() + in.readerIndex();
+ } else {
+ array = new byte[length];
+ in.getBytes(in.readerIndex(), array);
+ offset = 0;
+ }
+ final int bytesWritten = blockCompressor.write(array, offset, length);
+ in.skipBytes(bytesWritten);
+ if (!blockCompressor.isFull()) {
+ if (in.isReadable()) {
+ break;
+ } else {
+ return;
+ }
+ }
+ currentState = State.CLOSE_BLOCK;
+ case CLOSE_BLOCK:
+ closeBlock(out);
+ currentState = State.INIT_BLOCK;
+ break;
+ default:
+ throw new IllegalStateException();
+ }
+ }
+ }
+
+ /**
+ * Close current block and update {@link #streamCRC}.
+ */
+ private void closeBlock(ByteBuf out) {
+ final Bzip2BlockCompressor blockCompressor = this.blockCompressor;
+ if (!blockCompressor.isEmpty()) {
+ blockCompressor.close(out);
+ final int blockCRC = blockCompressor.crc();
+ streamCRC = (streamCRC << 1 | streamCRC >>> 31) ^ blockCRC;
+ }
+ }
+
+ /**
+ * Returns {@code true} if and only if the end of the compressed stream has been reached.
+ */
+ public boolean isClosed() {
+ return finished;
+ }
+
+ /**
+ * Close this {@link Bzip2Encoder} and so finish the encoding.
+ *
+ * The returned {@link ChannelFuture} will be notified once the operation completes.
+ */
+ public ChannelFuture close() {
+ return close(ctx().newPromise());
+ }
+
+ /**
+ * Close this {@link Bzip2Encoder} and so finish the encoding.
+ * The given {@link ChannelFuture} will be notified once the operation
+ * completes and will also be returned.
+ */
+ public ChannelFuture close(final ChannelPromise promise) {
+ ChannelHandlerContext ctx = ctx();
+ EventExecutor executor = ctx.executor();
+ if (executor.inEventLoop()) {
+ return finishEncode(ctx, promise);
+ } else {
+ executor.execute(new Runnable() {
+ @Override
+ public void run() {
+ ChannelFuture f = finishEncode(ctx(), promise);
+ f.addListener(new ChannelPromiseNotifier(promise));
+ }
+ });
+ return promise;
+ }
+ }
+
+ @Override
+ public void close(final ChannelHandlerContext ctx, final ChannelPromise promise) throws Exception {
+ ChannelFuture f = finishEncode(ctx, ctx.newPromise());
+ f.addListener(new ChannelFutureListener() {
+ @Override
+ public void operationComplete(ChannelFuture f) throws Exception {
+ ctx.close(promise);
+ }
+ });
+
+ if (!f.isDone()) {
+ // Ensure the channel is closed even if the write operation does not complete in time.
+ ctx.executor().schedule(new Runnable() {
+ @Override
+ public void run() {
+ ctx.close(promise);
+ }
+ }, 10, TimeUnit.SECONDS); // FIXME: Magic number
+ }
+ }
+
+ private ChannelFuture finishEncode(final ChannelHandlerContext ctx, ChannelPromise promise) {
+ if (finished) {
+ promise.setSuccess();
+ return promise;
+ }
+ finished = true;
+
+ final ByteBuf footer = ctx.alloc().buffer();
+ closeBlock(footer);
+
+ final int streamCRC = this.streamCRC;
+ final Bzip2BitWriter writer = this.writer;
+ try {
+ writer.writeBits(footer, 24, END_OF_STREAM_MAGIC_1);
+ writer.writeBits(footer, 24, END_OF_STREAM_MAGIC_2);
+ writer.writeInt(footer, streamCRC);
+ writer.flush(footer);
+ } finally {
+ blockCompressor = null;
+ }
+ return ctx.writeAndFlush(footer, promise);
+ }
+
+ private ChannelHandlerContext ctx() {
+ ChannelHandlerContext ctx = this.ctx;
+ if (ctx == null) {
+ throw new IllegalStateException("not added to a pipeline");
+ }
+ return ctx;
+ }
+
+ @Override
+ public void handlerAdded(ChannelHandlerContext ctx) throws Exception {
+ this.ctx = ctx;
+ }
+}
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2HuffmanAllocator.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2HuffmanAllocator.java
new file mode 100644
index 000000000000..70a3fd53e7a3
--- /dev/null
+++ b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2HuffmanAllocator.java
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2014 The Netty Project
+ *
+ * The Netty Project licenses this file to you under the Apache License,
+ * version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package io.netty.handler.codec.compression;
+
+/**
+ * An in-place, length restricted Canonical Huffman code length allocator.
+ * Based on the algorithm proposed by R. L. Milidi'u, A. A. Pessoa and E. S. Laber in
+ * In-place Length-Restricted Prefix Coding
+ * and incorporating additional ideas from the implementation of
+ * shcodec by Simakov Alexander.
+ */
+final class Bzip2HuffmanAllocator {
+ /**
+ * @param array The code length array
+ * @param i The input position
+ * @param nodesToMove The number of internal nodes to be relocated
+ * @return The smallest {@code k} such that {@code nodesToMove <= k <= i} and
+ * {@code i <= (array[k] % array.length)}
+ */
+ private static int first(final int[] array, int i, final int nodesToMove) {
+ final int length = array.length;
+ final int limit = i;
+ int k = array.length - 2;
+
+ while (i >= nodesToMove && array[i] % length > limit) {
+ k = i;
+ i -= limit - i + 1;
+ }
+ i = Math.max(nodesToMove - 1, i);
+
+ while (k > i + 1) {
+ int temp = i + k >> 1;
+ if (array[temp] % length > limit) {
+ k = temp;
+ } else {
+ i = temp;
+ }
+ }
+ return k;
+ }
+
+ /**
+ * Fills the code array with extended parent pointers.
+ * @param array The code length array
+ */
+ private static void setExtendedParentPointers(final int[] array) {
+ final int length = array.length;
+ array[0] += array[1];
+
+ for (int headNode = 0, tailNode = 1, topNode = 2; tailNode < length - 1; tailNode++) {
+ int temp;
+ if (topNode >= length || array[headNode] < array[topNode]) {
+ temp = array[headNode];
+ array[headNode++] = tailNode;
+ } else {
+ temp = array[topNode++];
+ }
+
+ if (topNode >= length || (headNode < tailNode && array[headNode] < array[topNode])) {
+ temp += array[headNode];
+ array[headNode++] = tailNode + length;
+ } else {
+ temp += array[topNode++];
+ }
+ array[tailNode] = temp;
+ }
+ }
+
+ /**
+ * Finds the number of nodes to relocate in order to achieve a given code length limit.
+ * @param array The code length array
+ * @param maximumLength The maximum bit length for the generated codes
+ * @return The number of nodes to relocate
+ */
+ private static int findNodesToRelocate(final int[] array, final int maximumLength) {
+ int currentNode = array.length - 2;
+ for (int currentDepth = 1; currentDepth < maximumLength - 1 && currentNode > 1; currentDepth++) {
+ currentNode = first(array, currentNode - 1, 0);
+ }
+ return currentNode;
+ }
+
+ /**
+ * A final allocation pass with no code length limit.
+ * @param array The code length array
+ */
+ private static void allocateNodeLengths(final int[] array) {
+ int firstNode = array.length - 2;
+ int nextNode = array.length - 1;
+
+ for (int currentDepth = 1, availableNodes = 2; availableNodes > 0; currentDepth++) {
+ final int lastNode = firstNode;
+ firstNode = first(array, lastNode - 1, 0);
+
+ for (int i = availableNodes - (lastNode - firstNode); i > 0; i--) {
+ array[nextNode--] = currentDepth;
+ }
+
+ availableNodes = (lastNode - firstNode) << 1;
+ }
+ }
+
+ /**
+ * A final allocation pass that relocates nodes in order to achieve a maximum code length limit.
+ * @param array The code length array
+ * @param nodesToMove The number of internal nodes to be relocated
+ * @param insertDepth The depth at which to insert relocated nodes
+ */
+ private static void allocateNodeLengthsWithRelocation(final int[] array,
+ final int nodesToMove, final int insertDepth) {
+ int firstNode = array.length - 2;
+ int nextNode = array.length - 1;
+ int currentDepth = insertDepth == 1 ? 2 : 1;
+ int nodesLeftToMove = insertDepth == 1 ? nodesToMove - 2 : nodesToMove;
+
+ for (int availableNodes = currentDepth << 1; availableNodes > 0; currentDepth++) {
+ final int lastNode = firstNode;
+ firstNode = firstNode <= nodesToMove ? firstNode : first(array, lastNode - 1, nodesToMove);
+
+ int offset = 0;
+ if (currentDepth >= insertDepth) {
+ offset = Math.min(nodesLeftToMove, 1 << (currentDepth - insertDepth));
+ } else if (currentDepth == insertDepth - 1) {
+ offset = 1;
+ if (array[firstNode] == lastNode) {
+ firstNode++;
+ }
+ }
+
+ for (int i = availableNodes - (lastNode - firstNode + offset); i > 0; i--) {
+ array[nextNode--] = currentDepth;
+ }
+
+ nodesLeftToMove -= offset;
+ availableNodes = (lastNode - firstNode + offset) << 1;
+ }
+ }
+
+ /**
+ * Allocates Canonical Huffman code lengths in place based on a sorted frequency array.
+ * @param array On input, a sorted array of symbol frequencies; On output, an array of Canonical
+ * Huffman code lengths
+ * @param maximumLength The maximum code length. Must be at least {@code ceil(log2(array.length))}
+ */
+ static void allocateHuffmanCodeLengths(final int[] array, final int maximumLength) {
+ switch (array.length) {
+ case 2:
+ array[1] = 1;
+ case 1:
+ array[0] = 1;
+ return;
+ }
+
+ /* Pass 1 : Set extended parent pointers */
+ setExtendedParentPointers(array);
+
+ /* Pass 2 : Find number of nodes to relocate in order to achieve maximum code length */
+ int nodesToRelocate = findNodesToRelocate(array, maximumLength);
+
+ /* Pass 3 : Generate code lengths */
+ if (array[0] % array.length >= nodesToRelocate) {
+ allocateNodeLengths(array);
+ } else {
+ int insertDepth = maximumLength - (32 - Integer.numberOfLeadingZeros(nodesToRelocate - 1));
+ allocateNodeLengthsWithRelocation(array, nodesToRelocate, insertDepth);
+ }
+ }
+
+ private Bzip2HuffmanAllocator() { }
+}
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2HuffmanStageEncoder.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2HuffmanStageEncoder.java
new file mode 100644
index 000000000000..f026ba563371
--- /dev/null
+++ b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2HuffmanStageEncoder.java
@@ -0,0 +1,374 @@
+/*
+ * Copyright 2014 The Netty Project
+ *
+ * The Netty Project licenses this file to you under the Apache License,
+ * version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package io.netty.handler.codec.compression;
+
+import io.netty.buffer.ByteBuf;
+
+import java.util.Arrays;
+
+import static io.netty.handler.codec.compression.Bzip2Constants.*;
+
+/**
+ * An encoder for the Bzip2 Huffman encoding stage.
+ */
+final class Bzip2HuffmanStageEncoder {
+ /**
+ * Used in initial Huffman table generation.
+ */
+ private static final int HUFFMAN_HIGH_SYMBOL_COST = 15;
+
+ /**
+ * The {@link Bzip2BitWriter} to which the Huffman tables and data is written.
+ */
+ private final Bzip2BitWriter writer;
+
+ /**
+ * The output of the Move To Front Transform and Run Length Encoding[2] stages.
+ */
+ private final char[] mtfBlock;
+
+ /**
+ * The actual number of values contained in the {@link #mtfBlock} array.
+ */
+ private final int mtfLength;
+
+ /**
+ * The number of unique values in the {@link #mtfBlock} array.
+ */
+ private final int mtfAlphabetSize;
+
+ /**
+ * The global frequencies of values within the {@link #mtfBlock} array.
+ */
+ private final int[] mtfSymbolFrequencies;
+
+ /**
+ * The Canonical Huffman code lengths for each table.
+ */
+ private final int[][] huffmanCodeLengths;
+
+ /**
+ * Merged code symbols for each table. The value at each position is ((code length << 24) | code).
+ */
+ private final int[][] huffmanMergedCodeSymbols;
+
+ /**
+ * The selectors for each segment.
+ */
+ private final byte[] selectors;
+
+ /**
+ * @param writer The {@link Bzip2BitWriter} which provides bit-level writes
+ * @param mtfBlock The MTF block data
+ * @param mtfLength The actual length of the MTF block
+ * @param mtfAlphabetSize The size of the MTF block's alphabet
+ * @param mtfSymbolFrequencies The frequencies of the MTF block's symbols
+ */
+ Bzip2HuffmanStageEncoder(final Bzip2BitWriter writer, final char[] mtfBlock,
+ final int mtfLength, final int mtfAlphabetSize, final int[] mtfSymbolFrequencies) {
+ this.writer = writer;
+ this.mtfBlock = mtfBlock;
+ this.mtfLength = mtfLength;
+ this.mtfAlphabetSize = mtfAlphabetSize;
+ this.mtfSymbolFrequencies = mtfSymbolFrequencies;
+
+ final int totalTables = selectTableCount(mtfLength);
+
+ huffmanCodeLengths = new int[totalTables][mtfAlphabetSize];
+ huffmanMergedCodeSymbols = new int[totalTables][mtfAlphabetSize];
+ selectors = new byte[(mtfLength + HUFFMAN_GROUP_RUN_LENGTH - 1) / HUFFMAN_GROUP_RUN_LENGTH];
+ }
+
+ /**
+ * Selects an appropriate table count for a given MTF length.
+ * @param mtfLength The length to select a table count for
+ * @return The selected table count
+ */
+ private static int selectTableCount(final int mtfLength) {
+ if (mtfLength >= 2400) {
+ return 6;
+ }
+ if (mtfLength >= 1200) {
+ return 5;
+ }
+ if (mtfLength >= 600) {
+ return 4;
+ }
+ if (mtfLength >= 200) {
+ return 3;
+ }
+ return 2;
+ }
+
+ /**
+ * Generate a Huffman code length table for a given list of symbol frequencies.
+ * @param alphabetSize The total number of symbols
+ * @param symbolFrequencies The frequencies of the symbols
+ * @param codeLengths The array to which the generated code lengths should be written
+ */
+ private static void generateHuffmanCodeLengths(final int alphabetSize,
+ final int[] symbolFrequencies, final int[] codeLengths) {
+
+ final int[] mergedFrequenciesAndIndices = new int[alphabetSize];
+ final int[] sortedFrequencies = new int[alphabetSize];
+
+ // The Huffman allocator needs its input symbol frequencies to be sorted, but we need to
+ // return code lengths in the same order as the corresponding frequencies are passed in.
+
+ // The symbol frequency and index are merged into a single array of
+ // integers - frequency in the high 23 bits, index in the low 9 bits.
+ // 2^23 = 8,388,608 which is higher than the maximum possible frequency for one symbol in a block
+ // 2^9 = 512 which is higher than the maximum possible alphabet size (== 258)
+ // Sorting this array simultaneously sorts the frequencies and
+ // leaves a lookup that can be used to cheaply invert the sort.
+ for (int i = 0; i < alphabetSize; i++) {
+ mergedFrequenciesAndIndices[i] = (symbolFrequencies[i] << 9) | i;
+ }
+ Arrays.sort(mergedFrequenciesAndIndices);
+ for (int i = 0; i < alphabetSize; i++) {
+ sortedFrequencies[i] = mergedFrequenciesAndIndices[i] >>> 9;
+ }
+
+ // Allocate code lengths - the allocation is in place,
+ // so the code lengths will be in the sortedFrequencies array afterwards
+ Bzip2HuffmanAllocator.allocateHuffmanCodeLengths(sortedFrequencies, HUFFMAN_ENCODE_MAX_CODE_LENGTH);
+
+ // Reverse the sort to place the code lengths in the same order as the symbols whose frequencies were passed in
+ for (int i = 0; i < alphabetSize; i++) {
+ codeLengths[mergedFrequenciesAndIndices[i] & 0x1ff] = sortedFrequencies[i];
+ }
+ }
+
+ /**
+ * Generate initial Huffman code length tables, giving each table a different low cost section
+ * of the alphabet that is roughly equal in overall cumulative frequency. Note that the initial
+ * tables are invalid for actual Huffman code generation, and only serve as the seed for later
+ * iterative optimisation in {@link #optimiseSelectorsAndHuffmanTables(boolean)}.
+ */
+ private void generateHuffmanOptimisationSeeds() {
+ final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
+ final int[] mtfSymbolFrequencies = this.mtfSymbolFrequencies;
+ final int mtfAlphabetSize = this.mtfAlphabetSize;
+
+ final int totalTables = huffmanCodeLengths.length;
+
+ int remainingLength = mtfLength;
+ int lowCostEnd = -1;
+
+ for (int i = 0; i < totalTables; i++) {
+
+ final int targetCumulativeFrequency = remainingLength / (totalTables - i);
+ final int lowCostStart = lowCostEnd + 1;
+ int actualCumulativeFrequency = 0;
+
+ while (actualCumulativeFrequency < targetCumulativeFrequency && lowCostEnd < mtfAlphabetSize - 1) {
+ actualCumulativeFrequency += mtfSymbolFrequencies[++lowCostEnd];
+ }
+
+ if (lowCostEnd > lowCostStart && i != 0 && i != totalTables - 1 && (totalTables - i & 1) == 0) {
+ actualCumulativeFrequency -= mtfSymbolFrequencies[lowCostEnd--];
+ }
+
+ final int[] tableCodeLengths = huffmanCodeLengths[i];
+ for (int j = 0; j < mtfAlphabetSize; j++) {
+ if (j < lowCostStart || j > lowCostEnd) {
+ tableCodeLengths[j] = HUFFMAN_HIGH_SYMBOL_COST;
+ }
+ }
+
+ remainingLength -= actualCumulativeFrequency;
+ }
+ }
+
+ /**
+ * Co-optimise the selector list and the alternative Huffman table code lengths. This method is
+ * called repeatedly in the hope that the total encoded size of the selectors, the Huffman code
+ * lengths and the block data encoded with them will converge towards a minimum.
+ * If the data is highly incompressible, it is possible that the total encoded size will
+ * instead diverge (increase) slightly.
+ * @param storeSelectors If {@code true}, write out the (final) chosen selectors
+ */
+ private void optimiseSelectorsAndHuffmanTables(final boolean storeSelectors) {
+ final char[] mtfBlock = this.mtfBlock;
+ final byte[] selectors = this.selectors;
+ final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
+ final int mtfLength = this.mtfLength;
+ final int mtfAlphabetSize = this.mtfAlphabetSize;
+
+ final int totalTables = huffmanCodeLengths.length;
+ final int[][] tableFrequencies = new int[totalTables][mtfAlphabetSize];
+
+ int selectorIndex = 0;
+
+ // Find the best table for each group of 50 block bytes based on the current Huffman code lengths
+ for (int groupStart = 0; groupStart < mtfLength;) {
+
+ final int groupEnd = Math.min(groupStart + HUFFMAN_GROUP_RUN_LENGTH, mtfLength) - 1;
+
+ // Calculate the cost of this group when encoded by each table
+ short[] cost = new short[totalTables];
+ for (int i = groupStart; i <= groupEnd; i++) {
+ final int value = mtfBlock[i];
+ for (int j = 0; j < totalTables; j++) {
+ cost[j] += huffmanCodeLengths[j][value];
+ }
+ }
+
+ // Find the table with the least cost for this group
+ byte bestTable = 0;
+ int bestCost = cost[0];
+ for (byte i = 1 ; i < totalTables; i++) {
+ final int tableCost = cost[i];
+ if (tableCost < bestCost) {
+ bestCost = tableCost;
+ bestTable = i;
+ }
+ }
+
+ // Accumulate symbol frequencies for the table chosen for this block
+ final int[] bestGroupFrequencies = tableFrequencies[bestTable];
+ for (int i = groupStart; i <= groupEnd; i++) {
+ bestGroupFrequencies[mtfBlock[i]]++;
+ }
+
+ // Store a selector indicating the table chosen for this block
+ if (storeSelectors) {
+ selectors[selectorIndex++] = bestTable;
+ }
+ groupStart = groupEnd + 1;
+ }
+
+ // Generate new Huffman code lengths based on the frequencies for each table accumulated in this iteration
+ for (int i = 0; i < totalTables; i++) {
+ generateHuffmanCodeLengths(mtfAlphabetSize, tableFrequencies[i], huffmanCodeLengths[i]);
+ }
+ }
+
+ /**
+ * Assigns Canonical Huffman codes based on the calculated lengths.
+ */
+ private void assignHuffmanCodeSymbols() {
+ final int[][] huffmanMergedCodeSymbols = this.huffmanMergedCodeSymbols;
+ final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
+ final int mtfAlphabetSize = this.mtfAlphabetSize;
+
+ final int totalTables = huffmanCodeLengths.length;
+
+ for (int i = 0; i < totalTables; i++) {
+ final int[] tableLengths = huffmanCodeLengths[i];
+
+ int minimumLength = 32;
+ int maximumLength = 0;
+ for (int j = 0; j < mtfAlphabetSize; j++) {
+ final int length = tableLengths[j];
+ if (length > maximumLength) {
+ maximumLength = length;
+ }
+ if (length < minimumLength) {
+ minimumLength = length;
+ }
+ }
+
+ int code = 0;
+ for (int j = minimumLength; j <= maximumLength; j++) {
+ for (int k = 0; k < mtfAlphabetSize; k++) {
+ if ((huffmanCodeLengths[i][k] & 0xff) == j) {
+ huffmanMergedCodeSymbols[i][k] = (j << 24) | code;
+ code++;
+ }
+ }
+ code <<= 1;
+ }
+ }
+ }
+
+ /**
+ * Write out the selector list and Huffman tables.
+ */
+ private void writeSelectorsAndHuffmanTables(ByteBuf out) {
+ final Bzip2BitWriter writer = this.writer;
+ final byte[] selectors = this.selectors;
+ final int totalSelectors = selectors.length;
+ final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
+ final int totalTables = huffmanCodeLengths.length;
+ final int mtfAlphabetSize = this.mtfAlphabetSize;
+
+ writer.writeBits(out, 3, totalTables);
+ writer.writeBits(out, 15, totalSelectors);
+
+ // Write the selectors
+ Bzip2MoveToFrontTable selectorMTF = new Bzip2MoveToFrontTable();
+ for (byte selector : selectors) {
+ writer.writeUnary(out, selectorMTF.valueToFront(selector));
+ }
+
+ // Write the Huffman tables
+ for (final int[] tableLengths : huffmanCodeLengths) {
+ int currentLength = tableLengths[0];
+
+ writer.writeBits(out, 5, currentLength);
+
+ for (int j = 0; j < mtfAlphabetSize; j++) {
+ final int codeLength = tableLengths[j];
+ final int value = currentLength < codeLength ? 2 : 3;
+ int delta = Math.abs(codeLength - currentLength);
+ while (delta-- > 0) {
+ writer.writeBits(out, 2, value);
+ }
+ writer.writeBoolean(out, false);
+ currentLength = codeLength;
+ }
+ }
+ }
+
+ /**
+ * Writes out the encoded block data.
+ */
+ private void writeBlockData(ByteBuf out) {
+ final Bzip2BitWriter writer = this.writer;
+ final int[][] huffmanMergedCodeSymbols = this.huffmanMergedCodeSymbols;
+ final byte[] selectors = this.selectors;
+ final char[] mtf = mtfBlock;
+ final int mtfLength = this.mtfLength;
+
+ int selectorIndex = 0;
+ for (int mtfIndex = 0; mtfIndex < mtfLength;) {
+ final int groupEnd = Math.min(mtfIndex + HUFFMAN_GROUP_RUN_LENGTH, mtfLength) - 1;
+ final int[] tableMergedCodeSymbols = huffmanMergedCodeSymbols[selectors[selectorIndex++]];
+
+ while (mtfIndex <= groupEnd) {
+ final int mergedCodeSymbol = tableMergedCodeSymbols[mtf[mtfIndex++]];
+ writer.writeBits(out, mergedCodeSymbol >>> 24, mergedCodeSymbol);
+ }
+ }
+ }
+
+ /**
+ * Encodes and writes the block data.
+ */
+ void encode(ByteBuf out) {
+ // Create optimised selector list and Huffman tables
+ generateHuffmanOptimisationSeeds();
+ for (int i = 3; i >= 0; i--) {
+ optimiseSelectorsAndHuffmanTables(i == 0);
+ }
+ assignHuffmanCodeSymbols();
+
+ // Write out the tables and the block data encoded with them
+ writeSelectorsAndHuffmanTables(out);
+ writeBlockData(out);
+ }
+}
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2MTFAndRLE2StageEncoder.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2MTFAndRLE2StageEncoder.java
new file mode 100644
index 000000000000..95df65db4e6b
--- /dev/null
+++ b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2MTFAndRLE2StageEncoder.java
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2014 The Netty Project
+ *
+ * The Netty Project licenses this file to you under the Apache License,
+ * version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package io.netty.handler.codec.compression;
+
+import static io.netty.handler.codec.compression.Bzip2Constants.*;
+
+/**
+ * An encoder for the Bzip2 Move To Front Transform and Run-Length Encoding[2] stages.
+ * Although conceptually these two stages are separate, it is computationally efficient to perform
+ * them in one pass.
+ */
+final class Bzip2MTFAndRLE2StageEncoder {
+ /**
+ * The Burrows-Wheeler transformed block.
+ */
+ private final int[] bwtBlock;
+
+ /**
+ * Actual length of the data in the {@link #bwtBlock} array.
+ */
+ private final int bwtLength;
+
+ /**
+ * At each position, {@code true} if the byte value with that index is present within the block,
+ * otherwise {@code false}.
+ */
+ private final boolean[] bwtValuesPresent;
+
+ /**
+ * The output of the Move To Front Transform and Run-Length Encoding[2] stages.
+ */
+ private final char[] mtfBlock;
+
+ /**
+ * The actual number of values contained in the {@link #mtfBlock} array.
+ */
+ private int mtfLength;
+
+ /**
+ * The global frequencies of values within the {@link #mtfBlock} array.
+ */
+ private final int[] mtfSymbolFrequencies = new int[HUFFMAN_MAX_ALPHABET_SIZE];
+
+ /**
+ * The encoded alphabet size.
+ */
+ private int alphabetSize;
+
+ /**
+ * @param bwtBlock The Burrows Wheeler Transformed block data
+ * @param bwtLength The actual length of the BWT data
+ * @param bwtValuesPresent The values that are present within the BWT data. For each index,
+ * {@code true} if that value is present within the data, otherwise {@code false}
+ */
+ Bzip2MTFAndRLE2StageEncoder(final int[] bwtBlock, final int bwtLength, final boolean[] bwtValuesPresent) {
+ this.bwtBlock = bwtBlock;
+ this.bwtLength = bwtLength;
+ this.bwtValuesPresent = bwtValuesPresent;
+ mtfBlock = new char[bwtLength + 1];
+ }
+
+ /**
+ * Performs the Move To Front transform and Run Length Encoding[2] stages.
+ */
+ void encode() {
+ final int bwtLength = this.bwtLength;
+ final boolean[] bwtValuesPresent = this.bwtValuesPresent;
+ final int[] bwtBlock = this.bwtBlock;
+ final char[] mtfBlock = this.mtfBlock;
+ final int[] mtfSymbolFrequencies = this.mtfSymbolFrequencies;
+ final byte[] huffmanSymbolMap = new byte[256];
+ final Bzip2MoveToFrontTable symbolMTF = new Bzip2MoveToFrontTable();
+
+ int totalUniqueValues = 0;
+ for (int i = 0; i < huffmanSymbolMap.length; i++) {
+ if (bwtValuesPresent[i]) {
+ huffmanSymbolMap[i] = (byte) totalUniqueValues++;
+ }
+ }
+ final int endOfBlockSymbol = totalUniqueValues + 1;
+
+ int mtfIndex = 0;
+ int repeatCount = 0;
+ int totalRunAs = 0;
+ int totalRunBs = 0;
+ for (int i = 0; i < bwtLength; i++) {
+ // Move To Front
+ final int mtfPosition = symbolMTF.valueToFront(huffmanSymbolMap[bwtBlock[i] & 0xff]);
+ // Run Length Encode
+ if (mtfPosition == 0) {
+ repeatCount++;
+ } else {
+ if (repeatCount > 0) {
+ repeatCount--;
+ while (true) {
+ if ((repeatCount & 1) == 0) {
+ mtfBlock[mtfIndex++] = HUFFMAN_SYMBOL_RUNA;
+ totalRunAs++;
+ } else {
+ mtfBlock[mtfIndex++] = HUFFMAN_SYMBOL_RUNB;
+ totalRunBs++;
+ }
+
+ if (repeatCount <= 1) {
+ break;
+ }
+ repeatCount = (repeatCount - 2) >>> 1;
+ }
+ repeatCount = 0;
+ }
+ mtfBlock[mtfIndex++] = (char) (mtfPosition + 1);
+ mtfSymbolFrequencies[mtfPosition + 1]++;
+ }
+ }
+
+ if (repeatCount > 0) {
+ repeatCount--;
+ while (true) {
+ if ((repeatCount & 1) == 0) {
+ mtfBlock[mtfIndex++] = HUFFMAN_SYMBOL_RUNA;
+ totalRunAs++;
+ } else {
+ mtfBlock[mtfIndex++] = HUFFMAN_SYMBOL_RUNB;
+ totalRunBs++;
+ }
+
+ if (repeatCount <= 1) {
+ break;
+ }
+ repeatCount = (repeatCount - 2) >>> 1;
+ }
+ }
+
+ mtfBlock[mtfIndex] = (char) endOfBlockSymbol;
+ mtfSymbolFrequencies[endOfBlockSymbol]++;
+ mtfSymbolFrequencies[HUFFMAN_SYMBOL_RUNA] += totalRunAs;
+ mtfSymbolFrequencies[HUFFMAN_SYMBOL_RUNB] += totalRunBs;
+
+ mtfLength = mtfIndex + 1;
+ alphabetSize = endOfBlockSymbol + 1;
+ }
+
+ /**
+ * @return The encoded MTF block
+ */
+ char[] mtfBlock() {
+ return mtfBlock;
+ }
+
+ /**
+ * @return The actual length of the MTF block
+ */
+ int mtfLength() {
+ return mtfLength;
+ }
+
+ /**
+ * @return The size of the MTF block's alphabet
+ */
+ int mtfAlphabetSize() {
+ return alphabetSize;
+ }
+
+ /**
+ * @return The frequencies of the MTF block's symbols
+ */
+ int[] mtfSymbolFrequencies() {
+ return mtfSymbolFrequencies;
+ }
+}
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2MoveToFrontTable.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2MoveToFrontTable.java
index 18e2512fcfcc..3f66027960f5 100644
--- a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2MoveToFrontTable.java
+++ b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2MoveToFrontTable.java
@@ -18,7 +18,7 @@
/**
* A 256 entry Move To Front transform.
*/
-class Bzip2MoveToFrontTable {
+final class Bzip2MoveToFrontTable {
/**
* The Move To Front list.
*/
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/package-info.java b/codec/src/main/java/io/netty/handler/codec/compression/package-info.java
index 0b709ec90fbc..faa177210004 100644
--- a/codec/src/main/java/io/netty/handler/codec/compression/package-info.java
+++ b/codec/src/main/java/io/netty/handler/codec/compression/package-info.java
@@ -21,4 +21,4 @@
* Snappy .
*/
package io.netty.handler.codec.compression;
-// TODO Implement bzip2 and lzma handlers
+// TODO Implement lzma handler
diff --git a/codec/src/test/java/io/netty/handler/codec/compression/Bzip2DecoderTest.java b/codec/src/test/java/io/netty/handler/codec/compression/Bzip2DecoderTest.java
index 58812f077abe..788b12119940 100644
--- a/codec/src/test/java/io/netty/handler/codec/compression/Bzip2DecoderTest.java
+++ b/codec/src/test/java/io/netty/handler/codec/compression/Bzip2DecoderTest.java
@@ -16,6 +16,7 @@
package io.netty.handler.codec.compression;
import io.netty.buffer.ByteBuf;
+import io.netty.buffer.CompositeByteBuf;
import io.netty.buffer.Unpooled;
import io.netty.channel.embedded.EmbeddedChannel;
import io.netty.util.internal.ThreadLocalRandom;
@@ -84,9 +85,9 @@ public void testBadBlockHeader() throws Exception {
ByteBuf in = Unpooled.buffer();
in.writeMedium(MAGIC_NUMBER);
in.writeByte('1'); //block size
- in.writeInt(11111); //random value
- in.writeShort(111); //random value
- in.writeInt(111); //block CRC
+ in.writeMedium(11); //incorrect block header
+ in.writeMedium(11); //incorrect block header
+ in.writeInt(11111); //block CRC
channel.writeInbound(in);
}
@@ -99,8 +100,8 @@ public void testStreamCrcErrorOfEmptyBlock() throws Exception {
ByteBuf in = Unpooled.buffer();
in.writeMedium(MAGIC_NUMBER);
in.writeByte('1'); //block size
- in.writeInt((int) (END_OF_STREAM_MAGIC >> 16));
- in.writeShort((int) END_OF_STREAM_MAGIC);
+ in.writeMedium(END_OF_STREAM_MAGIC_1);
+ in.writeMedium(END_OF_STREAM_MAGIC_2);
in.writeInt(1); //wrong storedCombinedCRC
channel.writeInbound(in);
@@ -181,6 +182,22 @@ public void testBlockCrcError() throws Exception {
channel.writeInbound(in);
}
+ @Test
+ public void testStartPointerInvalid() throws Exception {
+ expected.expect(DecompressionException.class);
+ expected.expectMessage("start pointer invalid");
+
+ final byte[] data = { 0x42, 0x5A, 0x68, 0x37, 0x31, 0x41, 0x59, 0x26, 0x53,
+ 0x59, 0x77, 0x7B, (byte) 0xCA, (byte) 0xC0, (byte) 0xFF, 0x00,
+ 0x00, 0x05, (byte) 0x80, 0x00, 0x01, 0x02, 0x00, 0x04,
+ 0x20, 0x20, 0x00, 0x30, (byte) 0xCD, 0x34, 0x19, (byte) 0xA6,
+ (byte) 0x89, (byte) 0x99, (byte) 0xC5, (byte) 0xDC, (byte) 0x91,
+ 0x4E, 0x14, 0x24, 0x1D, (byte) 0xDE, (byte) 0xF2, (byte) 0xB0, 0x00 };
+
+ ByteBuf in = Unpooled.wrappedBuffer(data);
+ channel.writeInbound(in);
+ }
+
private static void testDecompression(final byte[] data) throws Exception {
for (int blockSize = MIN_BLOCK_SIZE; blockSize <= MAX_BLOCK_SIZE; blockSize++) {
final EmbeddedChannel channel = new EmbeddedChannel(new Bzip2Decoder());
@@ -193,17 +210,13 @@ private static void testDecompression(final byte[] data) throws Exception {
ByteBuf compressed = Unpooled.wrappedBuffer(os.toByteArray());
channel.writeInbound(compressed);
- ByteBuf uncompressed = Unpooled.buffer();
- ByteBuf msg;
- while ((msg = channel.readInbound()) != null) {
- uncompressed.writeBytes(msg);
- msg.release();
- }
- final byte[] result = new byte[uncompressed.readableBytes()];
- uncompressed.readBytes(result);
- uncompressed.release();
+ ByteBuf uncompressed = readUncompressed(channel);
+ ByteBuf dataBuf = Unpooled.wrappedBuffer(data);
- assertArrayEquals(data, result);
+ assertEquals(dataBuf, uncompressed);
+
+ uncompressed.release();
+ dataBuf.release();
}
}
@@ -219,10 +232,12 @@ public void testDecompressionOfLargeChunkOfData() throws Exception {
@Test
public void testDecompressionOfBatchedFlowOfData() throws Exception {
+ final byte[] data = BYTES_LARGE;
+
ByteArrayOutputStream os = new ByteArrayOutputStream();
BZip2CompressorOutputStream bZip2Os = new BZip2CompressorOutputStream(os,
rand.nextInt(MIN_BLOCK_SIZE, MAX_BLOCK_SIZE + 1));
- bZip2Os.write(BYTES_LARGE);
+ bZip2Os.write(data);
bZip2Os.close();
final byte[] compressedArray = os.toByteArray();
@@ -236,16 +251,23 @@ public void testDecompressionOfBatchedFlowOfData() throws Exception {
ByteBuf compressed = Unpooled.wrappedBuffer(compressedArray, written, compressedArray.length - written);
channel.writeInbound(compressed);
- ByteBuf uncompressed = Unpooled.buffer();
+ ByteBuf uncompressed = readUncompressed(channel);
+ ByteBuf dataBuf = Unpooled.wrappedBuffer(data);
+
+ assertEquals(dataBuf, uncompressed);
+
+ uncompressed.release();
+ dataBuf.release();
+ }
+
+ private static ByteBuf readUncompressed(EmbeddedChannel channel) throws Exception {
+ CompositeByteBuf uncompressed = Unpooled.compositeBuffer();
ByteBuf msg;
while ((msg = channel.readInbound()) != null) {
- uncompressed.writeBytes(msg);
- msg.release();
+ uncompressed.addComponent(msg);
+ uncompressed.writerIndex(uncompressed.writerIndex() + msg.readableBytes());
}
- final byte[] result = new byte[uncompressed.readableBytes()];
- uncompressed.readBytes(result);
- uncompressed.release();
- assertArrayEquals(BYTES_LARGE, result);
+ return uncompressed;
}
}
diff --git a/codec/src/test/java/io/netty/handler/codec/compression/Bzip2EncoderTest.java b/codec/src/test/java/io/netty/handler/codec/compression/Bzip2EncoderTest.java
new file mode 100644
index 000000000000..466b455f7917
--- /dev/null
+++ b/codec/src/test/java/io/netty/handler/codec/compression/Bzip2EncoderTest.java
@@ -0,0 +1,134 @@
+/*
+ * Copyright 2014 The Netty Project
+ *
+ * The Netty Project licenses this file to you under the Apache License,
+ * version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package io.netty.handler.codec.compression;
+
+import io.netty.buffer.ByteBuf;
+import io.netty.buffer.CompositeByteBuf;
+import io.netty.buffer.Unpooled;
+import io.netty.channel.embedded.EmbeddedChannel;
+import io.netty.util.internal.ThreadLocalRandom;
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+
+import static io.netty.handler.codec.compression.Bzip2Constants.*;
+import static org.junit.Assert.*;
+
+public class Bzip2EncoderTest {
+
+ private static final ThreadLocalRandom rand;
+
+ private static final byte[] BYTES_SMALL = new byte[256];
+ private static final byte[] BYTES_LARGE = new byte[MAX_BLOCK_SIZE * BASE_BLOCK_SIZE * 2];
+
+ static {
+ rand = ThreadLocalRandom.current();
+ rand.nextBytes(BYTES_SMALL);
+ rand.nextBytes(BYTES_LARGE);
+ }
+
+ @Test
+ public void testStreamInitialization() throws Exception {
+ final EmbeddedChannel channel = new EmbeddedChannel(new Bzip2Encoder());
+
+ ByteBuf in = Unpooled.wrappedBuffer("test".getBytes());
+ channel.writeOutbound(in);
+
+ ByteBuf out = channel.readOutbound();
+
+ assertEquals(MAGIC_NUMBER, out.readMedium());
+ assertEquals(9 + '0', out.readByte());
+
+ out.release();
+ channel.finish();
+ }
+
+ private static void testCompression(final byte[] data) throws Exception {
+ for (int blockSize = MIN_BLOCK_SIZE; blockSize <= MAX_BLOCK_SIZE; blockSize++) {
+ final EmbeddedChannel channel = new EmbeddedChannel(new Bzip2Encoder(blockSize));
+
+ ByteBuf in = Unpooled.wrappedBuffer(data);
+ channel.writeOutbound(in);
+ channel.finish();
+
+ byte[] uncompressed = uncompress(channel, data.length);
+
+ assertArrayEquals(data, uncompressed);
+ }
+ }
+
+ @Test
+ public void testCompressionOfSmallChunkOfData() throws Exception {
+ testCompression(BYTES_SMALL);
+ }
+
+ @Test
+ public void testCompressionOfLargeChunkOfData() throws Exception {
+ testCompression(BYTES_LARGE);
+ }
+
+ @Test
+ public void testCompressionOfBatchedFlowOfData() throws Exception {
+ final EmbeddedChannel channel = new EmbeddedChannel(new Bzip2Encoder(
+ rand.nextInt(MIN_BLOCK_SIZE, MAX_BLOCK_SIZE + 1)));
+
+ int written = 0, length = rand.nextInt(100);
+ while (written + length < BYTES_LARGE.length) {
+ ByteBuf in = Unpooled.wrappedBuffer(BYTES_LARGE, written, length);
+ channel.writeOutbound(in);
+ written += length;
+ length = rand.nextInt(100);
+ }
+ ByteBuf in = Unpooled.wrappedBuffer(BYTES_LARGE, written, BYTES_LARGE.length - written);
+ channel.writeOutbound(in);
+ channel.finish();
+
+ byte[] uncompressed = uncompress(channel, BYTES_LARGE.length);
+
+ assertArrayEquals(BYTES_LARGE, uncompressed);
+ }
+
+ private static byte[] uncompress(EmbeddedChannel channel, int length) throws Exception {
+ CompositeByteBuf out = Unpooled.compositeBuffer();
+ ByteBuf msg;
+ while ((msg = channel.readOutbound()) != null) {
+ out.addComponent(msg);
+ out.writerIndex(out.writerIndex() + msg.readableBytes());
+ }
+
+ byte[] compressed = new byte[out.readableBytes()];
+ out.readBytes(compressed);
+ out.release();
+
+ ByteArrayInputStream is = new ByteArrayInputStream(compressed);
+ BZip2CompressorInputStream bZip2Is = new BZip2CompressorInputStream(is);
+ byte[] uncompressed = new byte[length];
+ int remaining = length;
+ while (remaining > 0) {
+ int read = bZip2Is.read(uncompressed, length - remaining, remaining);
+ if (read > 0) {
+ remaining -= read;
+ } else {
+ break;
+ }
+ }
+
+ assertEquals(-1, bZip2Is.read());
+
+ return uncompressed;
+ }
+}
diff --git a/codec/src/test/java/io/netty/handler/codec/compression/Bzip2IntegrationTest.java b/codec/src/test/java/io/netty/handler/codec/compression/Bzip2IntegrationTest.java
new file mode 100644
index 000000000000..83df45f450c9
--- /dev/null
+++ b/codec/src/test/java/io/netty/handler/codec/compression/Bzip2IntegrationTest.java
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2014 The Netty Project
+ *
+ * The Netty Project licenses this file to you under the Apache License,
+ * version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package io.netty.handler.codec.compression;
+
+import io.netty.buffer.ByteBuf;
+import io.netty.buffer.CompositeByteBuf;
+import io.netty.buffer.Unpooled;
+import io.netty.channel.embedded.EmbeddedChannel;
+import io.netty.util.ReferenceCountUtil;
+import io.netty.util.internal.ThreadLocalRandom;
+import org.junit.Test;
+
+import java.util.Arrays;
+
+import static org.hamcrest.Matchers.*;
+import static org.junit.Assert.*;
+
+public class Bzip2IntegrationTest {
+
+ private static final ThreadLocalRandom rand = ThreadLocalRandom.current();
+
+ public static final byte[] EMPTY = new byte[0];
+
+ @Test
+ public void testEmpty() throws Exception {
+ testIdentity(EMPTY);
+ }
+
+ @Test
+ public void testOneByte() throws Exception {
+ testIdentity(new byte[] { 'A' });
+ }
+
+ @Test
+ public void testTwoBytes() throws Exception {
+ testIdentity(new byte[] { 'B', 'A' });
+ }
+
+ @Test
+ public void testRegular() throws Exception {
+ byte[] data = ("Netty is a NIO client server framework which enables quick and easy development " +
+ "of network applications such as protocol servers and clients.").getBytes();
+ testIdentity(data);
+ }
+
+ @Test
+ public void test3Tables() throws Exception {
+ byte[] data = new byte[500];
+ rand.nextBytes(data);
+ testIdentity(data);
+ }
+
+ @Test
+ public void test4Tables() throws Exception {
+ byte[] data = new byte[1100];
+ rand.nextBytes(data);
+ testIdentity(data);
+ }
+
+ @Test
+ public void test5Tables() throws Exception {
+ byte[] data = new byte[2300];
+ rand.nextBytes(data);
+ testIdentity(data);
+ }
+
+ @Test
+ public void testLargeRandom() throws Exception {
+ byte[] data = new byte[1048576];
+ rand.nextBytes(data);
+ testIdentity(data);
+ }
+
+ @Test
+ public void testPartRandom() throws Exception {
+ byte[] data = new byte[12345];
+ rand.nextBytes(data);
+ for (int i = 0; i < 1024; i++) {
+ data[i] = 123;
+ }
+ testIdentity(data);
+ }
+
+ @Test
+ public void testCompressible() throws Exception {
+ byte[] data = new byte[10000];
+ for (int i = 0; i < data.length; i++) {
+ data[i] = i % 4 != 0 ? 0 : (byte) rand.nextInt();
+ }
+ testIdentity(data);
+ }
+
+ @Test
+ public void testLongBlank() throws Exception {
+ byte[] data = new byte[100000];
+ testIdentity(data);
+ }
+
+ @Test
+ public void testLongSame() throws Exception {
+ byte[] data = new byte[100000];
+ Arrays.fill(data, (byte) 123);
+ testIdentity(data);
+ }
+
+ @Test
+ public void testSequential() throws Exception {
+ byte[] data = new byte[49];
+ for (int i = 0; i < data.length; i++) {
+ data[i] = (byte) i;
+ }
+ testIdentity(data);
+ }
+
+ private static void testIdentity(byte[] data) {
+ ByteBuf in = Unpooled.wrappedBuffer(data);
+ EmbeddedChannel encoder = new EmbeddedChannel(new Bzip2Encoder());
+ EmbeddedChannel decoder = new EmbeddedChannel(new Bzip2Decoder());
+ try {
+ ByteBuf msg;
+
+ encoder.writeOutbound(in.copy());
+ encoder.finish();
+ CompositeByteBuf compressed = Unpooled.compositeBuffer();
+ while ((msg = encoder.readOutbound()) != null) {
+ compressed.addComponent(msg);
+ compressed.writerIndex(compressed.writerIndex() + msg.readableBytes());
+ }
+ assertThat(compressed, is(notNullValue()));
+ assertThat(compressed, is(not(in)));
+
+ decoder.writeInbound(compressed.retain());
+ assertFalse(compressed.isReadable());
+ CompositeByteBuf decompressed = Unpooled.compositeBuffer();
+ while ((msg = decoder.readInbound()) != null) {
+ decompressed.addComponent(msg);
+ decompressed.writerIndex(decompressed.writerIndex() + msg.readableBytes());
+ }
+ assertEquals(in, decompressed);
+
+ compressed.release();
+ decompressed.release();
+ in.release();
+ } finally {
+ encoder.close();
+ decoder.close();
+
+ for (;;) {
+ Object msg = encoder.readOutbound();
+ if (msg == null) {
+ break;
+ }
+ ReferenceCountUtil.release(msg);
+ }
+
+ for (;;) {
+ Object msg = decoder.readInbound();
+ if (msg == null) {
+ break;
+ }
+ ReferenceCountUtil.release(msg);
+ }
+ }
+ }
+}
diff --git a/license/LICENSE.libdivsufsort.txt b/license/LICENSE.libdivsufsort.txt
new file mode 100644
index 000000000000..3bad2dcc18c4
--- /dev/null
+++ b/license/LICENSE.libdivsufsort.txt
@@ -0,0 +1,22 @@
+Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.