Skip to content

Commit

Permalink
Avoid calling seekToRow on ORC reader
Browse files Browse the repository at this point in the history
Calling seekToRow on the ORC reader tries to recompute the offsets each
time, which in this implementation can be an expensive operation. Avoid
calling seekToRow; instead, iterate through the file sequentially.
  • Loading branch information
nileema committed Feb 10, 2016
1 parent b2ef00c commit 8c612bb
Showing 1 changed file with 13 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -91,25 +91,30 @@ private static OrcFileInfo rewrite(RecordReader reader, Writer writer, BitSet ro
long rowCount = 0;
long uncompressedSize = 0;

while (true) {
row = rowsToDelete.nextClearBit(row);
if (row < inputRowCount) {
reader.seekToRow(row);
}

while (row < inputRowCount) {
if (Thread.currentThread().isInterrupted()) {
throw new InterruptedIOException();
}

row = rowsToDelete.nextClearBit(row);
if (row >= inputRowCount) {
return new OrcFileInfo(rowCount, uncompressedSize);
// seekToRow() is extremely expensive
if (reader.getRowNumber() < row) {
reader.next(object);
continue;
}

reader.seekToRow(row);
object = reader.next(object);
writer.addRow(object);

row++;

rowCount++;
uncompressedSize += uncompressedSize(object);

row = rowsToDelete.nextClearBit(row + 1);
}
return new OrcFileInfo(rowCount, uncompressedSize);
}

private static Path path(File input)
Expand Down

0 comments on commit 8c612bb

Please sign in to comment.