Skip to content

Commit

Permalink
Opportunistically record computed dictionary size and uniqueIds
Browse files: browse the repository at this point in the history
  • Loading branch information
pettyjamesm committed Mar 9, 2022
1 parent e4f477c commit 4871cee
Showing 1 changed file with 38 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -242,15 +242,15 @@ public OptionalInt fixedSizeInBytesPerPosition()

private void calculateCompactSize()
{
int usedIds = 0;
int uniqueIds = 0;
boolean[] used = new boolean[dictionary.getPositionCount()];
for (int i = idsOffset; i < idsOffset + positionCount; i++) {
int id = ids[i];
usedIds += used[id] ? 0 : 1;
uniqueIds += used[id] ? 0 : 1;
used[id] = true;
}
this.uniqueIds = usedIds;
this.sizeInBytes = dictionary.getPositionsSizeInBytes(used, usedIds) + (Integer.BYTES * (long) positionCount);
this.uniqueIds = uniqueIds;
this.sizeInBytes = getSizeInBytesForSelectedPositions(used, uniqueIds, positionCount);
}

@Override
Expand All @@ -273,14 +273,14 @@ public long getRegionSizeInBytes(int positionOffset, int length)
return fixedSizePerPosition.getAsInt() * (long) length;
}

int usedIds = 0;
int uniqueIds = 0;
boolean[] used = new boolean[dictionary.getPositionCount()];
for (int i = idsOffset + positionOffset; i < idsOffset + positionOffset + length; i++) {
int id = ids[i];
usedIds += used[id] ? 0 : 1;
uniqueIds += used[id] ? 0 : 1;
used[id] = true;
}
return dictionary.getPositionsSizeInBytes(used, usedIds) + (Integer.BYTES * (long) length);
return getSizeInBytesForSelectedPositions(used, uniqueIds, length);
}

@Override
Expand Down Expand Up @@ -341,16 +341,27 @@ public long getPositionsSizeInBytes(boolean[] positions, int usedPositionCount)
return fixedSizePerPosition.getAsInt() * (long) usedPositionCount;
}

int usedIds = 0;
int uniqueIds = 0;
boolean[] used = new boolean[dictionary.getPositionCount()];
for (int i = 0; i < positions.length; i++) {
int id = ids[idsOffset + i];
if (positions[i]) {
int id = ids[idsOffset + i];
usedIds += used[id] ? 0 : 1;
uniqueIds += used[id] ? 0 : 1;
used[id] = true;
}
}
return dictionary.getPositionsSizeInBytes(used, usedIds) + (Integer.BYTES * (long) usedPositionCount);
return getSizeInBytesForSelectedPositions(used, uniqueIds, usedPositionCount);
}

/**
 * Computes the size in bytes of a selection of positions from this block: the size of the
 * referenced dictionary entries plus {@code Integer.BYTES} for each selected position's id.
 * <p>
 * As a side effect, when the selection references every dictionary position and
 * {@code this.uniqueIds} is still {@code -1}, the unique id count and the size of the
 * whole block are stored in the {@code uniqueIds} and {@code sizeInBytes} fields.
 *
 * @param usedIds flags indexed by dictionary position, passed through to
 *        {@code dictionary.getPositionsSizeInBytes}; callers in this class set an entry to
 *        {@code true} for each id referenced by the selection
 * @param uniqueIds number of distinct dictionary positions referenced by the selection
 * @param selectedPositions number of positions in the selection
 * @return dictionary size for the referenced entries plus the size of the selected ids
 */
private long getSizeInBytesForSelectedPositions(boolean[] usedIds, int uniqueIds, int selectedPositions)
{
long dictionarySize = dictionary.getPositionsSizeInBytes(usedIds, uniqueIds);
// NOTE(review): -1 is presumably the "not yet computed" sentinel for this.uniqueIds - confirm against the field declaration
if (uniqueIds == dictionary.getPositionCount() && this.uniqueIds == -1) {
// All positions in the dictionary are referenced, store the uniqueId count and sizeInBytes
this.uniqueIds = uniqueIds;
// The stored size uses the positionCount field rather than selectedPositions
this.sizeInBytes = dictionarySize + (Integer.BYTES * (long) positionCount);
}
// The (long) cast makes the multiplication happen in 64-bit arithmetic
return dictionarySize + (Integer.BYTES * (long) selectedPositions);
}

@Override
Expand Down Expand Up @@ -434,21 +445,26 @@ public Block getPositions(int[] positions, int offset, int length)
checkArrayRange(positions, offset, length);

int[] newIds = new int[length];
boolean isCompact = isCompact() && length >= dictionary.getPositionCount();
boolean[] seen = null;
if (isCompact) {
seen = new boolean[dictionary.getPositionCount()];
}
boolean isCompact = length >= dictionary.getPositionCount() && isCompact();
boolean[] usedIds = isCompact ? new boolean[dictionary.getPositionCount()] : null;
int uniqueIds = 0;
for (int i = 0; i < length; i++) {
newIds[i] = getId(positions[offset + i]);
if (isCompact) {
seen[newIds[i]] = true;
int id = getId(positions[offset + i]);
newIds[i] = id;
if (usedIds != null) {
uniqueIds += usedIds[id] ? 0 : 1;
usedIds[id] = true;
}
}
for (int i = 0; i < dictionary.getPositionCount() && isCompact; i++) {
isCompact &= seen[i];
// All positions must have been referenced in order to be compact
isCompact &= (usedIds != null && usedIds.length == uniqueIds);
DictionaryBlock result = new DictionaryBlock(newIds.length, dictionary, newIds, isCompact, dictionarySourceId);
if (usedIds != null && !isCompact) {
// resulting dictionary is not compact, but we know the number of unique ids and which positions are used
result.uniqueIds = uniqueIds;
result.sizeInBytes = dictionary.getPositionsSizeInBytes(usedIds, uniqueIds) + (Integer.BYTES * (long) length);
}
return new DictionaryBlock(newIds.length, getDictionary(), newIds, isCompact, getDictionarySourceId());
return result;
}

@Override
Expand Down

0 comments on commit 4871cee

Please sign in to comment.